Change attribute parsing strategy

2025-03-04 23:04:52 +00:00 · 2019-08-28 20:27:33 +09:00 · 2019-08-28 20:27:33 +09:00 · 667a2920f2
commit 667a2920f2
parent 4a770685c0
7 changed files with 109 additions and 244 deletions
--- a/_benchmark/cmark/.gitignore
+++ b/_benchmark/cmark/.gitignore
@ -1 +1,2 @@
 cmark-master
 cmark_benchmark
--- a/_benchmark/go/benchmark_test.go
+++ b/_benchmark/go/benchmark_test.go
@ -11,7 +11,7 @@ import (
 	"gitlab.com/golang-commonmark/markdown"
 	bf1 "github.com/russross/blackfriday"
-	bf2 "github.com/russross/blackfriday/v2"
+	bf2 "gopkg.in/russross/blackfriday.v2"
 )
 func BenchmarkMarkdown(b *testing.B) {
--- a/ast/ast.go
+++ b/ast/ast.go
@ -42,7 +42,7 @@ func NewNodeKind(name string) NodeKind {
 // An Attribute is an attribute of the Node
 type Attribute struct {
 	Name  []byte
-	Value []byte
+	Value interface{}
 }
 var attrNameIDS = []byte("#")
@ -143,17 +143,20 @@ type Node interface {
 	IsRaw() bool
 	// SetAttribute sets the given value to the attributes.
-	SetAttribute(name, value []byte)
+	SetAttribute(name []byte, value interface{})
 	// SetAttributeString sets the given value to the attributes.
 	SetAttributeString(name string, value interface{})
 	// Attribute returns a (attribute value, true) if an attribute
 	// associated with the given name is found, otherwise
 	// (nil, false)
-	Attribute(name []byte) ([]byte, bool)
+	Attribute(name []byte) (interface{}, bool)
 	// AttributeString returns a (attribute value, true) if an attribute
 	// associated with the given name is found, otherwise
 	// (nil, false)
-	AttributeString(name string) ([]byte, bool)
+	AttributeString(name string) (interface{}, bool)
 	// Attributes returns a list of attributes.
 	// This may be a nil if there are no attributes.
@ -327,7 +330,7 @@ func (n *BaseNode) Text(source []byte) []byte {
 }
 // SetAttribute implements Node.SetAttribute.
-func (n *BaseNode) SetAttribute(name, value []byte) {
+func (n *BaseNode) SetAttribute(name []byte, value interface{}) {
 	if n.attributes == nil {
 		n.attributes = make([]Attribute, 0, 10)
 	} else {
@ -339,20 +342,16 @@ func (n *BaseNode) SetAttribute(name, value []byte) {
 			}
 		}
 	}
 	if len(name) == 1 {
 		if name[0] == '#' {
 			n.attributes = append(n.attributes, Attribute{attrNameID, value})
 			return
 		} else if name[0] == '.' {
 			n.attributes = append(n.attributes, Attribute{attrNameClass, value})
 			return
 		}
 	}
 	n.attributes = append(n.attributes, Attribute{name, value})
 }
 // SetAttributeString implements Node.SetAttributeString.
 // It converts name to a byte slice with util.StringToReadOnlyBytes and
 // delegates to SetAttribute, so both setters share the same storage rules.
 func (n *BaseNode) SetAttributeString(name string, value interface{}) {
 	n.SetAttribute(util.StringToReadOnlyBytes(name), value)
 }
 // Attribute implements Node.Attribute.
-func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
+func (n *BaseNode) Attribute(name []byte) (interface{}, bool) {
 	if n.attributes == nil {
 		return nil, false
 	}
@ -365,7 +364,7 @@ func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
 }
 // AttributeString implements Node.AttributeString.
-func (n *BaseNode) AttributeString(s string) ([]byte, bool) {
+func (n *BaseNode) AttributeString(s string) (interface{}, bool) {
 	return n.Attribute(util.StringToReadOnlyBytes(s))
 }
--- a/parser/attribute.go
+++ b/parser/attribute.go
@ -2,21 +2,47 @@ package parser
 import (
 	"bytes"
 	"fmt"
 	"github.com/yuin/goldmark/text"
 	"github.com/yuin/goldmark/util"
 	"strconv"
 )
-type attribute struct {
+var attrNameID = []byte("id")
-	Name  string
+var attrNameClass = []byte("class")
 // Attribute is a single name/value pair attached to a markdown element.
 // Value is untyped; its concrete type depends on what the parser produced.
 type Attribute struct {
 	Name  []byte
 	Value interface{}
 }
 // Attributes is an ordered list of Attribute values.
 type Attributes []Attribute
 // Find returns (value, true) for the first attribute whose name is
 // byte-wise equal to name, otherwise (nil, false).
 func (as Attributes) Find(name []byte) (interface{}, bool) {
 	for idx := range as {
 		if entry := &as[idx]; bytes.Equal(entry.Name, name) {
 			return entry.Value, true
 		}
 	}
 	return nil, false
 }
 // findUpdate replaces the value of the first attribute named name with the
 // result of cb applied to its current value. It reports whether such an
 // attribute existed; cb is not called when there is no match.
 func (as Attributes) findUpdate(name []byte, cb func(v interface{}) interface{}) bool {
 	for idx := range as {
 		if !bytes.Equal(as[idx].Name, name) {
 			continue
 		}
 		as[idx].Value = cb(as[idx].Value)
 		return true
 	}
 	return false
 }
 // ParseAttributes parses attributes from the given reader.
-// ParseAttributes returns a parsed map and true if could parse
+// ParseAttributes returns the parsed attributes and true if it could parse
 // attributes, otherwise nil and false.
-func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
+func ParseAttributes(reader text.Reader) (Attributes, bool) {
 	savedLine, savedPosition := reader.Position()
 	reader.SkipSpaces()
 	if reader.Peek() != '{' {
@ -24,28 +50,29 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
 		return nil, false
 	}
 	reader.Advance(1)
-	m := map[string]interface{}{}
+	attrs := Attributes{}
 	for {
 		if reader.Peek() == '}' {
 			reader.Advance(1)
-			return m, true
+			return attrs, true
 		}
 		attr, ok := parseAttribute(reader)
 		if !ok {
 			reader.SetPosition(savedLine, savedPosition)
 			return nil, false
 		}
-		if attr.Name == "class" {
+		if bytes.Equal(attr.Name, attrNameClass) {
-			if v, ok := m["class"]; ok {
+			if !attrs.findUpdate(attrNameClass, func(v interface{}) interface{} {
-				if _, ok2 := v.([][]byte); !ok2 {
+				var ret interface{}
-					m["class"] = [][]byte{v.([]byte)}
+				if ret, ok = v.([][]byte); !ok {
 					ret = [][]byte{v.([]byte)}
 				}
-				m["class"] = append(m["class"].([][]byte), util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value)))
+				return append(ret.([][]byte), attr.Value.([]byte))
-			} else {
+			}) {
-				m["class"] = util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value))
+				attrs = append(attrs, attr)
 			}
 		} else {
-			m[attr.Name] = attr.Value
+			attrs = append(attrs, attr)
 		}
 		reader.SkipSpaces()
 		if reader.Peek() == ',' {
@ -55,7 +82,7 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
 	}
 }
-func parseAttribute(reader text.Reader) (attribute, bool) {
+func parseAttribute(reader text.Reader) (Attribute, bool) {
 	reader.SkipSpaces()
 	c := reader.Peek()
 	if c == '#' || c == '.' {
@ -64,18 +91,18 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
 		i := 0
 		for ; i < len(line) && !util.IsSpace(line[i]) && (!util.IsPunct(line[i]) || line[i] == '_' || line[i] == '-'); i++ {
 		}
-		name := "class"
+		name := attrNameClass
 		if c == '#' {
-			name = "id"
+			name = attrNameID
 		}
 		reader.Advance(i)
-		return attribute{Name: name, Value: line[0:i]}, true
+		return Attribute{Name: name, Value: line[0:i]}, true
 	}
 	line, _ := reader.PeekLine()
 	c = line[0]
 	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
 		c == '_' || c == ':') {
-		return attribute{}, false
+		return Attribute{}, false
 	}
 	i := 0
 	for ; i < len(line); i++ {
@ -86,20 +113,20 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
 			break
 		}
 	}
-	name := string(line[:i])
+	name := line[:i]
 	reader.Advance(i)
 	reader.SkipSpaces()
 	c = reader.Peek()
 	if c != '=' {
-		return attribute{}, false
+		return Attribute{}, false
 	}
 	reader.Advance(1)
 	reader.SkipSpaces()
 	value, ok := parseAttributeValue(reader)
 	if !ok {
-		return attribute{}, false
+		return Attribute{}, false
 	}
-	return attribute{Name: name, Value: value}, true
+	return Attribute{Name: name, Value: value}, true
 }
@ -110,7 +137,7 @@ func parseAttributeValue(reader text.Reader) (interface{}, bool) {
 	ok := false
 	switch c {
 	case text.EOF:
-		return attribute{}, false
+		return Attribute{}, false
 	case '{':
 		value, ok = ParseAttributes(reader)
 	case '[':
--- a/parser/atx_heading.go
+++ b/parser/atx_heading.go
@ -99,8 +99,8 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
 	parsed := false
 	if b.Attribute { // handles special case like ### heading ### {#id}
 		start--
 		closureOpen := -1
 		closureClose := -1
 		closureOpen := -1
 		for i := start; i < stop; {
 			c := line[i]
 			if util.IsEscapedPunctuation(line, i) {
@ -117,28 +117,14 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
 			}
 		}
 		if closureClose > 0 {
-			i := closureClose
+			reader.Advance(closureClose)
-			for ; i < stop && util.IsSpace(line[i]); i++ {
+			attrs, ok := ParseAttributes(reader)
 			parsed = ok
 			if parsed {
 				for _, attr := range attrs {
 					node.SetAttribute(attr.Name, attr.Value)
 				}
 			if i < stop-1 || line[i] == '{' {
 				as := i + 1
 				for as < stop {
 					ai, skip := util.FindAttributeIndex(line[as:], true)
 					if ai[0] < 0 {
 						break
 					}
 					node.SetAttribute(line[as+ai[0]:as+ai[1]],
 						util.UnescapePunctuations(line[as+ai[2]:as+ai[3]]))
 					as += ai[3] + skip
 				}
 				for ; as < stop && util.IsSpace(line[as]); as++ {
 				}
 				if line[as] == '}' && (as > stop-2 || util.IsBlank(line[as:])) {
 					parsed = true
 				node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen))
 				} else {
 					node.RemoveAttributes()
 				}
 			}
 		}
 	}
@ -194,7 +180,6 @@ func (b *atxHeadingParser) CanAcceptIndentedLine() bool {
 }
 var attrAutoHeadingIDPrefix = []byte("heading")
 var attrNameID = []byte("#")
 func generateAutoHeadingID(node *ast.Heading, reader text.Reader, pc Context) {
 	lastIndex := node.Lines().Len() - 1
@ -208,14 +193,37 @@ func parseLastLineAttributes(node ast.Node, reader text.Reader, pc Context) {
 	lastIndex := node.Lines().Len() - 1
 	lastLine := node.Lines().At(lastIndex)
 	line := lastLine.Value(reader.Source())
-	indicies := util.FindAttributeIndiciesReverse(line, true)
+	lr := text.NewReader(line)
-	if indicies != nil {
+	var attrs Attributes
-		for _, index := range indicies {
+	var ok bool
-			node.SetAttribute(line[index[0]:index[1]],
+	var start text.Segment
-				util.UnescapePunctuations(line[index[2]:index[3]]))
+	var sl int
 	var end text.Segment
 	for {
 		c := lr.Peek()
 		if c == text.EOF {
 			break
 		}
-		lastLine.Stop = lastLine.Start + indicies[0][0] - 1
+		if c == '\\' {
-		lastLine.TrimRightSpace(reader.Source())
+			lr.Advance(1)
 			if lr.Peek() == '{' {
 				lr.Advance(1)
 			}
 			continue
 		}
 		if c == '{' {
 			sl, start = lr.Position()
 			attrs, ok = ParseAttributes(lr)
 			_, end = lr.Position()
 			lr.SetPosition(sl, start)
 		}
 		lr.Advance(1)
 	}
 	if ok && util.IsBlank(line[end.Stop:]) {
 		for _, attr := range attrs {
 			node.SetAttribute(attr.Name, attr.Value)
 		}
 		lastLine.Stop = lastLine.Start + start.Start
 		node.Lines().Set(lastIndex, lastLine)
 	}
 }
--- a/renderer/html/html.go
+++ b/renderer/html/html.go
@ -505,11 +505,12 @@ func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node,
 // RenderAttributes renders given node's attributes.
 func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) {
 	for _, attr := range node.Attributes() {
 		_, _ = w.WriteString(" ")
 		_, _ = w.Write(attr.Name)
 		_, _ = w.WriteString(`="`)
-		_, _ = w.Write(util.EscapeHTML(attr.Value))
+		_, _ = w.Write(util.EscapeHTML(attr.Value.([]byte)))
 		_ = w.WriteByte('"')
 	}
 }
--- a/util/util.go
+++ b/util/util.go
@ -631,177 +631,6 @@ func URLEscape(v []byte, resolveReference bool) []byte {
 	return cob.Bytes()
 }
 // FindAttributeIndiciesReverse looks for a trailing attribute block
 // ("{ ... }") in the given bytes and returns the indicies of every attribute
 // inside it.
 //
 // The bytes are scanned left to right for a '{' that IsEscapedPunctuation
 // does not report as escaped. Attributes following it are collected with
 // FindAttributeIndex, and the candidate is accepted only when it is closed
 // by a '}' that ends the input. On rejection the scan resumes after the
 // last consumed position and tries the next '{'.
 //
 // Each element of the result is
 // [name_start, name_stop, value_start, value_stop], expressed as offsets
 // into b. It returns nil if no acceptable block exists; the result is also
 // nil for an accepted block that contains no attributes.
 func FindAttributeIndiciesReverse(b []byte, canEscapeQuotes bool) [][4]int {
 	i := 0
 	for {
 		// Locate the next unescaped '{'; as becomes the offset just past it.
 		as := -1
 		for i < len(b) {
 			if IsEscapedPunctuation(b, i) {
 				i += 2
 				continue
 			}
 			if b[i] == '{' {
 				i++
 				as = i
 				break
 			}
 			i++
 		}
 		if as < 0 {
 			return nil
 		}
 		// Collect consecutive attributes, translating the offsets that are
 		// relative to b[as:] into offsets relative to b.
 		var result [][4]int
 		for as < len(b) {
 			ai, skip := FindAttributeIndex(b[as:], canEscapeQuotes)
 			if ai[0] < 0 {
 				break
 			}
 			i = as + ai[3]
 			result = append(result, [4]int{as + ai[0], as + ai[1], as + ai[2], as + ai[3]})
 			as += ai[3] + skip
 		}
 		// as reaches len(b) when the input ends right after '{' or after the
 		// last attribute (an unterminated block). Guard the index so such
 		// input is rejected instead of panicking with index out of range.
 		// NOTE(review): IsBlank is applied to b[as:], which still starts with
 		// '}' — confirm whether b[as+1:] was intended.
 		if as < len(b) && b[as] == '}' && (as > len(b)-2 || IsBlank(b[as:])) {
 			return result
 		}
 	}
 }
 // FindAttributeIndex searches for one attribute at the start of the given
 // bytes, after skipping leading spaces. The recognised forms are
 //     - #id
 //     - .class
 //     - attr=value
 // FindAttributeIndex returns an int array whose elements are
 // [name_start, name_stop, value_start, value_stop] and a skip count.
 // For the #id and .class forms the name is the single marker byte, the
 // value starts right after it and the skip count is 0. Any other input is
 // handed to FindHTMLAttributeIndex unchanged.
 // If the bytes are empty or contain only spaces, it returns
 // ([4]int{-1, -1, -1, -1}, 0).
 func FindAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
 	n := len(b)
 	pos := 0
 	for pos < n && IsSpace(b[pos]) {
 		pos++
 	}
 	if pos >= n {
 		return [4]int{-1, -1, -1, -1}, 0
 	}
 	if marker := b[pos]; marker != '#' && marker != '.' {
 		return FindHTMLAttributeIndex(b, canEscapeQuotes)
 	}
 	valueStart := pos + 1
 	end := valueStart
 	// The value runs until a space or any punctuation other than '_' and '-'.
 	for end < n {
 		ch := b[end]
 		if IsSpace(ch) {
 			break
 		}
 		if IsPunct(ch) && ch != '_' && ch != '-' {
 			break
 		}
 		end++
 	}
 	return [4]int{pos, valueStart, valueStart, end}, 0
 }
 // FindHTMLAttributeIndex searches for a single HTML attribute
 // (name=value, name="value" or name='value') at the start of the given
 // bytes, after skipping leading spaces.
 // FindHTMLAttributeIndex returns an int array whose elements are
 // [name_start, name_stop, value_start, value_stop], and a skip count that
 // is 1 when the value was quoted and 0 otherwise.
 // value_start and value_stop do not include " or '.
 // If canEscapeQuotes is true, the end of a quoted value is located with
 // FindClosure; otherwise the value ends at the next quote character and an
 // empty or unterminated quoted value is rejected.
 // If no attribute is found, it returns ([4]int{-1, -1, -1, -1}, 0).
 func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
 	result := [4]int{-1, -1, -1, -1}
 	i := 0
 	l := len(b)
 	// Skip leading spaces.
 	for ; i < l && IsSpace(b[i]); i++ {
 	}
 	if i >= l {
 		return result, 0
 	}
 	// The name must start with an ASCII letter, '_' or ':'.
 	c := b[i]
 	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
 		c == '_' || c == ':') {
 		return result, 0
 	}
 	result[0] = i
 	// The name continues over ASCII letters, digits, '_', ':', '.' and '-'.
 	for ; i < l; i++ {
 		c := b[i]
 		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
 			(c >= '0' && c <= '9') ||
 			c == '_' || c == ':' || c == '.' || c == '-') {
 			break
 		}
 	}
 	result[1] = i
 	// Optional spaces, a mandatory '=', then optional spaces again.
 	for ; i < l && IsSpace(b[i]); i++ {
 	}
 	if i >= l {
 		return [4]int{-1, -1, -1, -1}, 0
 	}
 	if b[i] != '=' {
 		return [4]int{-1, -1, -1, -1}, 0
 	}
 	i++
 	for ; i < l && IsSpace(b[i]); i++ {
 	}
 	if i >= l {
 		return [4]int{-1, -1, -1, -1}, 0
 	}
 	// skip stays 0 for unquoted values and becomes 1 for quoted ones.
 	skip := 0
 	if b[i] == '"' {
 		// Double-quoted value: value_start is just past the opening quote.
 		i++
 		result[2] = i
 		if canEscapeQuotes {
 			pos := FindClosure(b[i:], '"', '"', false, false)
 			if pos < 0 {
 				return [4]int{-1, -1, -1, -1}, 0
 			}
 			result[3] = pos + i
 		} else {
 			for ; i < l && b[i] != '"'; i++ {
 			}
 			result[3] = i
 			// Reject an empty value, or input that ended before a closing quote.
 			if result[2] == result[3] || i == l && b[l-1] != '"' {
 				return [4]int{-1, -1, -1, -1}, 0
 			}
 		}
 		skip = 1
 	} else if b[i] == '\'' {
 		// Single-quoted value: same handling as above with '\'' as the quote.
 		i++
 		result[2] = i
 		if canEscapeQuotes {
 			pos := FindClosure(b[i:], '\'', '\'', false, false)
 			if pos < 0 {
 				return [4]int{-1, -1, -1, -1}, 0
 			}
 			result[3] = pos + i
 		} else {
 			for ; i < l && b[i] != '\''; i++ {
 			}
 			result[3] = i
 			// Reject an empty value, or input that ended before a closing quote.
 			if result[2] == result[3] || i == l && b[l-1] != '\'' {
 				return [4]int{-1, -1, -1, -1}, 0
 			}
 		}
 		skip = 1
 	} else {
 		// Unquoted value: runs until a backslash, quote, '=', '<', '>', '`',
 		// '{', '}' or any byte up to 0x20.
 		result[2] = i
 		for ; i < l; i++ {
 			c = b[i]
 			// c is a byte, so c >= 0 always holds; the last test is
 			// effectively c <= 0x20.
 			if c == '\\' || c == '"' || c == '\'' ||
 				c == '=' || c == '<' || c == '>' || c == '`' ||
 				c == '{' || c == '}' ||
 				(c >= 0 && c <= 0x20) {
 				break
 			}
 		}
 		result[3] = i
 		// An unquoted value must contain at least one byte.
 		if result[2] == result[3] {
 			return [4]int{-1, -1, -1, -1}, 0
 		}
 	}
 	return result, skip
 }
 // FindURLIndex returns a stop index value if the given bytes seem an URL.
 // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
 func FindURLIndex(b []byte) int {