Change attribute parsing strategy

2025-03-04 23:04:52 +00:00 · 2019-08-28 20:27:33 +09:00 · 2019-08-28 20:27:33 +09:00 · 667a2920f2
commit 667a2920f2
parent 4a770685c0
7 changed files with 109 additions and 244 deletions
--- a/_benchmark/cmark/.gitignore
+++ b/_benchmark/cmark/.gitignore
@@ -1 +1,2 @@
 cmark-master
+cmark_benchmark
--- a/_benchmark/go/benchmark_test.go
+++ b/_benchmark/go/benchmark_test.go
@@ -11,7 +11,7 @@ import (
 	"gitlab.com/golang-commonmark/markdown"

 	bf1 "github.com/russross/blackfriday"
-	bf2 "github.com/russross/blackfriday/v2"
+	bf2 "gopkg.in/russross/blackfriday.v2"
 )

 func BenchmarkMarkdown(b *testing.B) {
--- a/ast/ast.go
+++ b/ast/ast.go
@@ -42,7 +42,7 @@ func NewNodeKind(name string) NodeKind {
 // An Attribute is an attribute of the Node
 type Attribute struct {
 	Name  []byte
-	Value []byte
+	Value interface{}
 }

 var attrNameIDS = []byte("#")
@@ -143,17 +143,20 @@ type Node interface {
 	IsRaw() bool

 	// SetAttribute sets the given value to the attributes.
-	SetAttribute(name, value []byte)
+	SetAttribute(name []byte, value interface{})
+
+	// SetAttributeString sets the given value to the attributes, taking the name as a string.
+	SetAttributeString(name string, value interface{})

 	// Attribute returns a (attribute value, true) if an attribute
 	// associated with the given name is found, otherwise
 	// (nil, false)
-	Attribute(name []byte) ([]byte, bool)
+	Attribute(name []byte) (interface{}, bool)

 	// AttributeString returns a (attribute value, true) if an attribute
 	// associated with the given name is found, otherwise
 	// (nil, false)
-	AttributeString(name string) ([]byte, bool)
+	AttributeString(name string) (interface{}, bool)

 	// Attributes returns a list of attributes.
 	// This may be a nil if there are no attributes.
@@ -327,7 +330,7 @@ func (n *BaseNode) Text(source []byte) []byte {
 }

 // SetAttribute implements Node.SetAttribute.
-func (n *BaseNode) SetAttribute(name, value []byte) {
+func (n *BaseNode) SetAttribute(name []byte, value interface{}) {
 	if n.attributes == nil {
 		n.attributes = make([]Attribute, 0, 10)
 	} else {
@@ -339,20 +342,16 @@ func (n *BaseNode) SetAttribute(name, value []byte) {
 			}
 		}
 	}
-	if len(name) == 1 {
-		if name[0] == '#' {
-			n.attributes = append(n.attributes, Attribute{attrNameID, value})
-			return
-		} else if name[0] == '.' {
-			n.attributes = append(n.attributes, Attribute{attrNameClass, value})
-			return
-		}
-	}
 	n.attributes = append(n.attributes, Attribute{name, value})
 }

+// SetAttributeString implements Node.SetAttributeString.
+func (n *BaseNode) SetAttributeString(name string, value interface{}) {
+	n.SetAttribute(util.StringToReadOnlyBytes(name), value)
+}
+
 // Attribute implements Node.Attribute.
-func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
+func (n *BaseNode) Attribute(name []byte) (interface{}, bool) {
 	if n.attributes == nil {
 		return nil, false
 	}
@@ -365,7 +364,7 @@ func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
 }

 // AttributeString implements Node.AttributeString.
-func (n *BaseNode) AttributeString(s string) ([]byte, bool) {
+func (n *BaseNode) AttributeString(s string) (interface{}, bool) {
 	return n.Attribute(util.StringToReadOnlyBytes(s))
 }

--- a/parser/attribute.go
+++ b/parser/attribute.go
@@ -2,21 +2,47 @@ package parser

 import (
 	"bytes"
-	"fmt"
 	"github.com/yuin/goldmark/text"
 	"github.com/yuin/goldmark/util"
 	"strconv"
 )

-type attribute struct {
-	Name  string
+var attrNameID = []byte("id")
+var attrNameClass = []byte("class")
+
+// An Attribute is an attribute of a markdown element.
+type Attribute struct {
+	Name  []byte
 	Value interface{}
 }

+// Attributes is a collection of attributes.
+type Attributes []Attribute
+
+// Find returns (value, true) if an attribute corresponding to the given name is found, otherwise (nil, false).
+func (as Attributes) Find(name []byte) (interface{}, bool) {
+	for _, a := range as {
+		if bytes.Equal(a.Name, name) {
+			return a.Value, true
+		}
+	}
+	return nil, false
+}
+
+func (as Attributes) findUpdate(name []byte, cb func(v interface{}) interface{}) bool {
+	for i, a := range as {
+		if bytes.Equal(a.Name, name) {
+			as[i].Value = cb(a.Value)
+			return true
+		}
+	}
+	return false
+}
+
 // ParseAttributes parses attributes into a map.
-// ParseAttributes returns a parsed map and true if could parse
+// ParseAttributes returns the parsed attributes and true if it could parse
 // attributes, otherwise nil and false.
-func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
+func ParseAttributes(reader text.Reader) (Attributes, bool) {
 	savedLine, savedPosition := reader.Position()
 	reader.SkipSpaces()
 	if reader.Peek() != '{' {
@@ -24,28 +50,29 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
 		return nil, false
 	}
 	reader.Advance(1)
-	m := map[string]interface{}{}
+	attrs := Attributes{}
 	for {
 		if reader.Peek() == '}' {
 			reader.Advance(1)
-			return m, true
+			return attrs, true
 		}
 		attr, ok := parseAttribute(reader)
 		if !ok {
 			reader.SetPosition(savedLine, savedPosition)
 			return nil, false
 		}
-		if attr.Name == "class" {
-			if v, ok := m["class"]; ok {
-				if _, ok2 := v.([][]byte); !ok2 {
-					m["class"] = [][]byte{v.([]byte)}
+		if bytes.Equal(attr.Name, attrNameClass) {
+			if !attrs.findUpdate(attrNameClass, func(v interface{}) interface{} {
+				var ret interface{}
+				if ret, ok = v.([][]byte); !ok {
+					ret = [][]byte{v.([]byte)}
 				}
-				m["class"] = append(m["class"].([][]byte), util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value)))
-			} else {
-				m["class"] = util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value))
+				return append(ret.([][]byte), attr.Value.([]byte))
+			}) {
+				attrs = append(attrs, attr)
 			}
 		} else {
-			m[attr.Name] = attr.Value
+			attrs = append(attrs, attr)
 		}
 		reader.SkipSpaces()
 		if reader.Peek() == ',' {
@@ -55,7 +82,7 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
 	}
 }

-func parseAttribute(reader text.Reader) (attribute, bool) {
+func parseAttribute(reader text.Reader) (Attribute, bool) {
 	reader.SkipSpaces()
 	c := reader.Peek()
 	if c == '#' || c == '.' {
@@ -64,18 +91,18 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
 		i := 0
 		for ; i < len(line) && !util.IsSpace(line[i]) && (!util.IsPunct(line[i]) || line[i] == '_' || line[i] == '-'); i++ {
 		}
-		name := "class"
+		name := attrNameClass
 		if c == '#' {
-			name = "id"
+			name = attrNameID
 		}
 		reader.Advance(i)
-		return attribute{Name: name, Value: line[0:i]}, true
+		return Attribute{Name: name, Value: line[0:i]}, true
 	}
 	line, _ := reader.PeekLine()
 	c = line[0]
 	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
 		c == '_' || c == ':') {
-		return attribute{}, false
+		return Attribute{}, false
 	}
 	i := 0
 	for ; i < len(line); i++ {
@@ -86,20 +113,20 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
 			break
 		}
 	}
-	name := string(line[:i])
+	name := line[:i]
 	reader.Advance(i)
 	reader.SkipSpaces()
 	c = reader.Peek()
 	if c != '=' {
-		return attribute{}, false
+		return Attribute{}, false
 	}
 	reader.Advance(1)
 	reader.SkipSpaces()
 	value, ok := parseAttributeValue(reader)
 	if !ok {
-		return attribute{}, false
+		return Attribute{}, false
 	}
-	return attribute{Name: name, Value: value}, true
+	return Attribute{Name: name, Value: value}, true

 }

@@ -110,7 +137,7 @@ func parseAttributeValue(reader text.Reader) (interface{}, bool) {
 	ok := false
 	switch c {
 	case text.EOF:
-		return attribute{}, false
+		return Attribute{}, false
 	case '{':
 		value, ok = ParseAttributes(reader)
 	case '[':
--- a/parser/atx_heading.go
+++ b/parser/atx_heading.go
@@ -99,8 +99,8 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
 	parsed := false
 	if b.Attribute { // handles special case like ### heading ### {#id}
 		start--
-		closureOpen := -1
 		closureClose := -1
+		closureOpen := -1
 		for i := start; i < stop; {
 			c := line[i]
 			if util.IsEscapedPunctuation(line, i) {
@@ -117,28 +117,14 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
 			}
 		}
 		if closureClose > 0 {
-			i := closureClose
-			for ; i < stop && util.IsSpace(line[i]); i++ {
+			reader.Advance(closureClose)
+			attrs, ok := ParseAttributes(reader)
+			parsed = ok
+			if parsed {
+				for _, attr := range attrs {
+					node.SetAttribute(attr.Name, attr.Value)
 				}
-			if i < stop-1 || line[i] == '{' {
-				as := i + 1
-				for as < stop {
-					ai, skip := util.FindAttributeIndex(line[as:], true)
-					if ai[0] < 0 {
-						break
-					}
-					node.SetAttribute(line[as+ai[0]:as+ai[1]],
-						util.UnescapePunctuations(line[as+ai[2]:as+ai[3]]))
-					as += ai[3] + skip
-				}
-				for ; as < stop && util.IsSpace(line[as]); as++ {
-				}
-				if line[as] == '}' && (as > stop-2 || util.IsBlank(line[as:])) {
-					parsed = true
 				node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen))
-				} else {
-					node.RemoveAttributes()
-				}
 			}
 		}
 	}
@@ -194,7 +180,6 @@ func (b *atxHeadingParser) CanAcceptIndentedLine() bool {
 }

 var attrAutoHeadingIDPrefix = []byte("heading")
-var attrNameID = []byte("#")

 func generateAutoHeadingID(node *ast.Heading, reader text.Reader, pc Context) {
 	lastIndex := node.Lines().Len() - 1
@@ -208,14 +193,37 @@ func parseLastLineAttributes(node ast.Node, reader text.Reader, pc Context) {
 	lastIndex := node.Lines().Len() - 1
 	lastLine := node.Lines().At(lastIndex)
 	line := lastLine.Value(reader.Source())
-	indicies := util.FindAttributeIndiciesReverse(line, true)
-	if indicies != nil {
-		for _, index := range indicies {
-			node.SetAttribute(line[index[0]:index[1]],
-				util.UnescapePunctuations(line[index[2]:index[3]]))
+	lr := text.NewReader(line)
+	var attrs Attributes
+	var ok bool
+	var start text.Segment
+	var sl int
+	var end text.Segment
+	for {
+		c := lr.Peek()
+		if c == text.EOF {
+			break
 		}
-		lastLine.Stop = lastLine.Start + indicies[0][0] - 1
-		lastLine.TrimRightSpace(reader.Source())
+		if c == '\\' {
+			lr.Advance(1)
+			if lr.Peek() == '{' {
+				lr.Advance(1)
+			}
+			continue
+		}
+		if c == '{' {
+			sl, start = lr.Position()
+			attrs, ok = ParseAttributes(lr)
+			_, end = lr.Position()
+			lr.SetPosition(sl, start)
+		}
+		lr.Advance(1)
+	}
+	if ok && util.IsBlank(line[end.Stop:]) {
+		for _, attr := range attrs {
+			node.SetAttribute(attr.Name, attr.Value)
+		}
+		lastLine.Stop = lastLine.Start + start.Start
 		node.Lines().Set(lastIndex, lastLine)
 	}
 }
--- a/renderer/html/html.go
+++ b/renderer/html/html.go
@@ -505,11 +505,12 @@ func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node,

 // RenderAttributes renders given node's attributes.
 func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) {
+
 	for _, attr := range node.Attributes() {
 		_, _ = w.WriteString(" ")
 		_, _ = w.Write(attr.Name)
 		_, _ = w.WriteString(`="`)
-		_, _ = w.Write(util.EscapeHTML(attr.Value))
+		_, _ = w.Write(util.EscapeHTML(attr.Value.([]byte)))
 		_ = w.WriteByte('"')
 	}
 }
--- a/util/util.go
+++ b/util/util.go
@@ -631,177 +631,6 @@ func URLEscape(v []byte, resolveReference bool) []byte {
 	return cob.Bytes()
 }

-// FindAttributeIndiciesReverse searches attribute indicies from tail of the given
-// bytes and returns indicies.
-func FindAttributeIndiciesReverse(b []byte, canEscapeQuotes bool) [][4]int {
-	i := 0
-retry:
-	var result [][4]int
-	as := -1
-	for i < len(b) {
-		if IsEscapedPunctuation(b, i) {
-			i += 2
-			continue
-		}
-		if b[i] == '{' {
-			i++
-			as = i
-			break
-		}
-		i++
-	}
-	if as < 0 {
-		return nil
-	}
-	for as < len(b) {
-		ai, skip := FindAttributeIndex(b[as:], canEscapeQuotes)
-		if ai[0] < 0 {
-			break
-		}
-		i = as + ai[3]
-		if result == nil {
-			result = [][4]int{}
-		}
-		result = append(result, [4]int{as + ai[0], as + ai[1], as + ai[2], as + ai[3]})
-		as += ai[3] + skip
-	}
-	if b[as] == '}' && (as > len(b)-2 || IsBlank(b[as:])) {
-		return result
-	}
-	goto retry
-}
-
-// FindAttributeIndex searches
-//     - #id
-//     - .class
-//     - attr=value
-// in given bytes.
-// FindHTMLAttributeIndex returns an int array that elements are
-// [name_start, name_stop, value_start, value_stop].
-// value_start and value_stop does not include " or '.
-// If no attributes found, it returns ([4]int{-1, -1, -1, -1}, 0).
-func FindAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
-	result := [4]int{-1, -1, -1, -1}
-	i := 0
-	l := len(b)
-	for ; i < l && IsSpace(b[i]); i++ {
-	}
-	if i >= l {
-		return result, 0
-	}
-	c := b[i]
-	if c == '#' || c == '.' {
-		result[0] = i
-		i++
-		result[1] = i
-		result[2] = i
-		for ; i < l && !IsSpace(b[i]) && (!IsPunct(b[i]) || b[i] == '_' || b[i] == '-'); i++ {
-		}
-		result[3] = i
-		return result, 0
-	}
-	return FindHTMLAttributeIndex(b, canEscapeQuotes)
-}
-
-// FindHTMLAttributeIndex searches HTML attributes in given bytes.
-// FindHTMLAttributeIndex returns an int array that elements are
-// [name_start, name_stop, value_start, value_stop].
-// value_start and value_stop does not include " or '.
-// If no attributes found, it returns [4]int{-1, -1, -1, -1}.
-func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
-	result := [4]int{-1, -1, -1, -1}
-	i := 0
-	l := len(b)
-	for ; i < l && IsSpace(b[i]); i++ {
-	}
-	if i >= l {
-		return result, 0
-	}
-	c := b[i]
-	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-		c == '_' || c == ':') {
-		return result, 0
-	}
-	result[0] = i
-	for ; i < l; i++ {
-		c := b[i]
-		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
-			(c >= '0' && c <= '9') ||
-			c == '_' || c == ':' || c == '.' || c == '-') {
-			break
-		}
-	}
-	result[1] = i
-	for ; i < l && IsSpace(b[i]); i++ {
-	}
-	if i >= l {
-		return [4]int{-1, -1, -1, -1}, 0
-	}
-	if b[i] != '=' {
-		return [4]int{-1, -1, -1, -1}, 0
-	}
-	i++
-	for ; i < l && IsSpace(b[i]); i++ {
-	}
-	if i >= l {
-		return [4]int{-1, -1, -1, -1}, 0
-	}
-	skip := 0
-	if b[i] == '"' {
-		i++
-		result[2] = i
-		if canEscapeQuotes {
-			pos := FindClosure(b[i:], '"', '"', false, false)
-			if pos < 0 {
-				return [4]int{-1, -1, -1, -1}, 0
-			}
-			result[3] = pos + i
-		} else {
-			for ; i < l && b[i] != '"'; i++ {
-			}
-			result[3] = i
-			if result[2] == result[3] || i == l && b[l-1] != '"' {
-				return [4]int{-1, -1, -1, -1}, 0
-			}
-		}
-		skip = 1
-	} else if b[i] == '\'' {
-		i++
-		result[2] = i
-		if canEscapeQuotes {
-			pos := FindClosure(b[i:], '\'', '\'', false, false)
-			if pos < 0 {
-				return [4]int{-1, -1, -1, -1}, 0
-			}
-			result[3] = pos + i
-		} else {
-			for ; i < l && b[i] != '\''; i++ {
-			}
-			result[3] = i
-			if result[2] == result[3] || i == l && b[l-1] != '\'' {
-				return [4]int{-1, -1, -1, -1}, 0
-			}
-		}
-		skip = 1
-	} else {
-		result[2] = i
-		for ; i < l; i++ {
-			c = b[i]
-			if c == '\\' || c == '"' || c == '\'' ||
-				c == '=' || c == '<' || c == '>' || c == '`' ||
-				c == '{' || c == '}' ||
-				(c >= 0 && c <= 0x20) {
-				break
-			}
-		}
-		result[3] = i
-		if result[2] == result[3] {
-			return [4]int{-1, -1, -1, -1}, 0
-		}
-	}
-	return result, skip
-}
-
 // FindURLIndex returns a stop index value if the given bytes seem an URL.
 // This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
 func FindURLIndex(b []byte) int {