Change attribute parsing strategy

This commit is contained in:
Yusuke Inuzuka 2019-08-28 20:27:33 +09:00
parent 4a770685c0
commit 667a2920f2
7 changed files with 109 additions and 244 deletions

View file

@ -1 +1,2 @@
cmark-master
cmark_benchmark

View file

@ -11,7 +11,7 @@ import (
"gitlab.com/golang-commonmark/markdown"
bf1 "github.com/russross/blackfriday"
bf2 "github.com/russross/blackfriday/v2"
bf2 "gopkg.in/russross/blackfriday.v2"
)
func BenchmarkMarkdown(b *testing.B) {

View file

@ -42,7 +42,7 @@ func NewNodeKind(name string) NodeKind {
// An Attribute is an attribute of the Node
type Attribute struct {
Name []byte
Value []byte
Value interface{}
}
var attrNameIDS = []byte("#")
@ -143,17 +143,20 @@ type Node interface {
IsRaw() bool
// SetAttribute sets the given value to the attributes.
SetAttribute(name, value []byte)
SetAttribute(name []byte, value interface{})
// SetAttributeString sets the given value to the attributes.
SetAttributeString(name string, value interface{})
// Attribute returns a (attribute value, true) if an attribute
// associated with the given name is found, otherwise
// (nil, false)
Attribute(name []byte) ([]byte, bool)
Attribute(name []byte) (interface{}, bool)
// AttributeString returns a (attribute value, true) if an attribute
// associated with the given name is found, otherwise
// (nil, false)
AttributeString(name string) ([]byte, bool)
AttributeString(name string) (interface{}, bool)
// Attributes returns a list of attributes.
// This may be a nil if there are no attributes.
@ -327,7 +330,7 @@ func (n *BaseNode) Text(source []byte) []byte {
}
// SetAttribute implements Node.SetAttribute.
func (n *BaseNode) SetAttribute(name, value []byte) {
func (n *BaseNode) SetAttribute(name []byte, value interface{}) {
if n.attributes == nil {
n.attributes = make([]Attribute, 0, 10)
} else {
@ -339,20 +342,16 @@ func (n *BaseNode) SetAttribute(name, value []byte) {
}
}
}
if len(name) == 1 {
if name[0] == '#' {
n.attributes = append(n.attributes, Attribute{attrNameID, value})
return
} else if name[0] == '.' {
n.attributes = append(n.attributes, Attribute{attrNameClass, value})
return
}
}
n.attributes = append(n.attributes, Attribute{name, value})
}
// SetAttributeString implements Node.SetAttributeString.
// The string name is converted with util.StringToReadOnlyBytes and the call
// is forwarded to SetAttribute.
func (n *BaseNode) SetAttributeString(name string, value interface{}) {
	key := util.StringToReadOnlyBytes(name)
	n.SetAttribute(key, value)
}
// Attribute implements Node.Attribute.
func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
func (n *BaseNode) Attribute(name []byte) (interface{}, bool) {
if n.attributes == nil {
return nil, false
}
@ -365,7 +364,7 @@ func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
}
// AttributeString implements Node.AttributeString.
func (n *BaseNode) AttributeString(s string) ([]byte, bool) {
func (n *BaseNode) AttributeString(s string) (interface{}, bool) {
return n.Attribute(util.StringToReadOnlyBytes(s))
}

View file

@ -2,21 +2,47 @@ package parser
import (
"bytes"
"fmt"
"github.com/yuin/goldmark/text"
"github.com/yuin/goldmark/util"
"strconv"
)
type attribute struct {
Name string
var attrNameID = []byte("id")
var attrNameClass = []byte("class")
// An Attribute is a single name/value pair parsed from an attribute block
// of a markdown element.
type Attribute struct {
// Name is the attribute name.
Name []byte
// Value is the parsed attribute value. ParseAttributes stores a []byte
// for the #id and .class shorthands and merges repeated class values
// into a [][]byte; other values are whatever parseAttributeValue yields.
Value interface{}
}
// Attributes is a slice of Attribute values.
type Attributes []Attribute
// Find looks up the attribute whose name equals the given name.
// It returns (value, true) for the first match, or (nil, false) when no
// attribute carries that name.
func (as Attributes) Find(name []byte) (interface{}, bool) {
	for i := range as {
		if !bytes.Equal(as[i].Name, name) {
			continue
		}
		return as[i].Value, true
	}
	return nil, false
}
// findUpdate replaces the value of the first attribute named name with the
// result of calling cb on its current value. It reports whether such an
// attribute existed; cb is not called when there is none.
func (as Attributes) findUpdate(name []byte, cb func(v interface{}) interface{}) bool {
	for i := range as {
		if !bytes.Equal(as[i].Name, name) {
			continue
		}
		as[i].Value = cb(as[i].Value)
		return true
	}
	return false
}
// ParseAttributes parses attributes into an Attributes collection.
// ParseAttributes returns a parsed map and true if could parse
// ParseAttributes returns a parsed attributes and true if could parse
// attributes, otherwise nil and false.
func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
func ParseAttributes(reader text.Reader) (Attributes, bool) {
savedLine, savedPosition := reader.Position()
reader.SkipSpaces()
if reader.Peek() != '{' {
@ -24,28 +50,29 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
return nil, false
}
reader.Advance(1)
m := map[string]interface{}{}
attrs := Attributes{}
for {
if reader.Peek() == '}' {
reader.Advance(1)
return m, true
return attrs, true
}
attr, ok := parseAttribute(reader)
if !ok {
reader.SetPosition(savedLine, savedPosition)
return nil, false
}
if attr.Name == "class" {
if v, ok := m["class"]; ok {
if _, ok2 := v.([][]byte); !ok2 {
m["class"] = [][]byte{v.([]byte)}
if bytes.Equal(attr.Name, attrNameClass) {
if !attrs.findUpdate(attrNameClass, func(v interface{}) interface{} {
var ret interface{}
if ret, ok = v.([][]byte); !ok {
ret = [][]byte{v.([]byte)}
}
m["class"] = append(m["class"].([][]byte), util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value)))
} else {
m["class"] = util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value))
return append(ret.([][]byte), attr.Value.([]byte))
}) {
attrs = append(attrs, attr)
}
} else {
m[attr.Name] = attr.Value
attrs = append(attrs, attr)
}
reader.SkipSpaces()
if reader.Peek() == ',' {
@ -55,7 +82,7 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
}
}
func parseAttribute(reader text.Reader) (attribute, bool) {
func parseAttribute(reader text.Reader) (Attribute, bool) {
reader.SkipSpaces()
c := reader.Peek()
if c == '#' || c == '.' {
@ -64,18 +91,18 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
i := 0
for ; i < len(line) && !util.IsSpace(line[i]) && (!util.IsPunct(line[i]) || line[i] == '_' || line[i] == '-'); i++ {
}
name := "class"
name := attrNameClass
if c == '#' {
name = "id"
name = attrNameID
}
reader.Advance(i)
return attribute{Name: name, Value: line[0:i]}, true
return Attribute{Name: name, Value: line[0:i]}, true
}
line, _ := reader.PeekLine()
c = line[0]
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
c == '_' || c == ':') {
return attribute{}, false
return Attribute{}, false
}
i := 0
for ; i < len(line); i++ {
@ -86,20 +113,20 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
break
}
}
name := string(line[:i])
name := line[:i]
reader.Advance(i)
reader.SkipSpaces()
c = reader.Peek()
if c != '=' {
return attribute{}, false
return Attribute{}, false
}
reader.Advance(1)
reader.SkipSpaces()
value, ok := parseAttributeValue(reader)
if !ok {
return attribute{}, false
return Attribute{}, false
}
return attribute{Name: name, Value: value}, true
return Attribute{Name: name, Value: value}, true
}
@ -110,7 +137,7 @@ func parseAttributeValue(reader text.Reader) (interface{}, bool) {
ok := false
switch c {
case text.EOF:
return attribute{}, false
return Attribute{}, false
case '{':
value, ok = ParseAttributes(reader)
case '[':

View file

@ -99,8 +99,8 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
parsed := false
if b.Attribute { // handles special case like ### heading ### {#id}
start--
closureOpen := -1
closureClose := -1
closureOpen := -1
for i := start; i < stop; {
c := line[i]
if util.IsEscapedPunctuation(line, i) {
@ -117,28 +117,14 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
}
}
if closureClose > 0 {
i := closureClose
for ; i < stop && util.IsSpace(line[i]); i++ {
reader.Advance(closureClose)
attrs, ok := ParseAttributes(reader)
parsed = ok
if parsed {
for _, attr := range attrs {
node.SetAttribute(attr.Name, attr.Value)
}
if i < stop-1 || line[i] == '{' {
as := i + 1
for as < stop {
ai, skip := util.FindAttributeIndex(line[as:], true)
if ai[0] < 0 {
break
}
node.SetAttribute(line[as+ai[0]:as+ai[1]],
util.UnescapePunctuations(line[as+ai[2]:as+ai[3]]))
as += ai[3] + skip
}
for ; as < stop && util.IsSpace(line[as]); as++ {
}
if line[as] == '}' && (as > stop-2 || util.IsBlank(line[as:])) {
parsed = true
node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen))
} else {
node.RemoveAttributes()
}
}
}
}
@ -194,7 +180,6 @@ func (b *atxHeadingParser) CanAcceptIndentedLine() bool {
}
var attrAutoHeadingIDPrefix = []byte("heading")
var attrNameID = []byte("#")
func generateAutoHeadingID(node *ast.Heading, reader text.Reader, pc Context) {
lastIndex := node.Lines().Len() - 1
@ -208,14 +193,37 @@ func parseLastLineAttributes(node ast.Node, reader text.Reader, pc Context) {
lastIndex := node.Lines().Len() - 1
lastLine := node.Lines().At(lastIndex)
line := lastLine.Value(reader.Source())
indicies := util.FindAttributeIndiciesReverse(line, true)
if indicies != nil {
for _, index := range indicies {
node.SetAttribute(line[index[0]:index[1]],
util.UnescapePunctuations(line[index[2]:index[3]]))
lr := text.NewReader(line)
var attrs Attributes
var ok bool
var start text.Segment
var sl int
var end text.Segment
for {
c := lr.Peek()
if c == text.EOF {
break
}
lastLine.Stop = lastLine.Start + indicies[0][0] - 1
lastLine.TrimRightSpace(reader.Source())
if c == '\\' {
lr.Advance(1)
if lr.Peek() == '{' {
lr.Advance(1)
}
continue
}
if c == '{' {
sl, start = lr.Position()
attrs, ok = ParseAttributes(lr)
_, end = lr.Position()
lr.SetPosition(sl, start)
}
lr.Advance(1)
}
if ok && util.IsBlank(line[end.Stop:]) {
for _, attr := range attrs {
node.SetAttribute(attr.Name, attr.Value)
}
lastLine.Stop = lastLine.Start + start.Start
node.Lines().Set(lastIndex, lastLine)
}
}

View file

@ -505,11 +505,12 @@ func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node,
// RenderAttributes renders given node's attributes.
func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) {
for _, attr := range node.Attributes() {
_, _ = w.WriteString(" ")
_, _ = w.Write(attr.Name)
_, _ = w.WriteString(`="`)
_, _ = w.Write(util.EscapeHTML(attr.Value))
_, _ = w.Write(util.EscapeHTML(attr.Value.([]byte)))
_ = w.WriteByte('"')
}
}

View file

@ -631,177 +631,6 @@ func URLEscape(v []byte, resolveReference bool) []byte {
return cob.Bytes()
}
// FindAttributeIndiciesReverse searches b for an attribute block of the form
// {...} that closes at the tail of b and returns the indices of the
// attributes inside it.
//
// Each element of the result is [name_start, name_stop, value_start,
// value_stop], as offsets into b, in the order the attributes appear.
// Positions for which IsEscapedPunctuation reports true are skipped together
// with the following byte, so they never open a block. A candidate block is
// accepted only when the byte after its last attribute is '}' and that '}'
// is the final byte of b or IsBlank reports true for b from the '}' onward;
// a rejected candidate makes the scan resume behind it with the next '{'.
//
// It returns nil when b holds no acceptable block, including when the
// attributes of a candidate run to the very end of b without a closing '}'.
// The result is also nil for an accepted block that contains no attributes.
func FindAttributeIndiciesReverse(b []byte, canEscapeQuotes bool) [][4]int {
	i := 0
	for {
		// Locate the next unescaped '{'; as is the offset just past it.
		as := -1
		for i < len(b) {
			if IsEscapedPunctuation(b, i) {
				i += 2
				continue
			}
			if b[i] == '{' {
				i++
				as = i
				break
			}
			i++
		}
		if as < 0 {
			return nil
		}

		// Collect the consecutive attributes that follow the '{'.
		var result [][4]int
		for as < len(b) {
			ai, skip := FindAttributeIndex(b[as:], canEscapeQuotes)
			if ai[0] < 0 {
				break
			}
			// Remember the progress so a rejected candidate is not rescanned.
			i = as + ai[3]
			result = append(result, [4]int{as + ai[0], as + ai[1], as + ai[2], as + ai[3]})
			as += ai[3] + skip
		}

		// as reaches len(b) when the attributes run to the end of the input,
		// so it has to be bounds-checked before b[as] is read.
		if as < len(b) && b[as] == '}' && (as > len(b)-2 || IsBlank(b[as:])) {
			return result
		}
	}
}
// FindAttributeIndex searches the given bytes for one attribute written as
//   - #id
//   - .class
//   - attr=value
//
// It returns an array [name_start, name_stop, value_start, value_stop] of
// offsets into b together with a skip count. For the #id and .class
// shorthands the name span covers only the leading '#' or '.', the value
// span covers the identifier behind it, and the skip count is 0. Any other
// input is delegated to FindHTMLAttributeIndex.
// If b is empty or holds only spaces, it returns
// ([4]int{-1, -1, -1, -1}, 0).
func FindAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
	n := len(b)
	pos := 0
	for pos < n && IsSpace(b[pos]) {
		pos++
	}
	if pos >= n {
		return [4]int{-1, -1, -1, -1}, 0
	}
	if marker := b[pos]; marker != '#' && marker != '.' {
		return FindHTMLAttributeIndex(b, canEscapeQuotes)
	}

	// Shorthand form: the value runs until a space or any punctuation other
	// than '_' and '-'.
	valueStart := pos + 1
	end := valueStart
	for end < n {
		ch := b[end]
		if IsSpace(ch) {
			break
		}
		if IsPunct(ch) && ch != '_' && ch != '-' {
			break
		}
		end++
	}
	return [4]int{pos, valueStart, valueStart, end}, 0
}
// FindHTMLAttributeIndex searches b for a single name=value HTML attribute,
// skipping any leading bytes that IsSpace accepts.
//
// It returns the array [name_start, name_stop, value_start, value_stop] of
// offsets into b, plus a skip count: 1 when the value was enclosed in " or '
// and 0 otherwise. value_start and value_stop do not include the quotes.
// If no attribute is found, it returns ([4]int{-1, -1, -1, -1}, 0).
//
// When canEscapeQuotes is true the end of a quoted value is located with
// FindClosure; when it is false the value ends at the next quote byte, and
// an empty or unterminated quoted value is rejected.
func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
result := [4]int{-1, -1, -1, -1}
i := 0
l := len(b)
// Skip leading spaces.
for ; i < l && IsSpace(b[i]); i++ {
}
if i >= l {
return result, 0
}
// An attribute name must start with an ASCII letter, '_' or ':'.
c := b[i]
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
c == '_' || c == ':') {
return result, 0
}
result[0] = i
// The rest of the name may additionally contain digits, '.' and '-'.
for ; i < l; i++ {
c := b[i]
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
c == '_' || c == ':' || c == '.' || c == '-') {
break
}
}
result[1] = i
// A '=' must follow the name; spaces are allowed on either side of it.
for ; i < l && IsSpace(b[i]); i++ {
}
if i >= l {
return [4]int{-1, -1, -1, -1}, 0
}
if b[i] != '=' {
return [4]int{-1, -1, -1, -1}, 0
}
i++
for ; i < l && IsSpace(b[i]); i++ {
}
if i >= l {
return [4]int{-1, -1, -1, -1}, 0
}
// skip is set to 1 for quoted values and stays 0 for unquoted ones.
skip := 0
if b[i] == '"' {
// Double-quoted value: the span starts right after the opening quote.
i++
result[2] = i
if canEscapeQuotes {
pos := FindClosure(b[i:], '"', '"', false, false)
if pos < 0 {
return [4]int{-1, -1, -1, -1}, 0
}
result[3] = pos + i
} else {
for ; i < l && b[i] != '"'; i++ {
}
result[3] = i
// Reject an empty value and a value whose closing quote is missing.
if result[2] == result[3] || i == l && b[l-1] != '"' {
return [4]int{-1, -1, -1, -1}, 0
}
}
skip = 1
} else if b[i] == '\'' {
// Single-quoted value: handled the same way as the double-quoted case.
i++
result[2] = i
if canEscapeQuotes {
pos := FindClosure(b[i:], '\'', '\'', false, false)
if pos < 0 {
return [4]int{-1, -1, -1, -1}, 0
}
result[3] = pos + i
} else {
for ; i < l && b[i] != '\''; i++ {
}
result[3] = i
// Reject an empty value and a value whose closing quote is missing.
if result[2] == result[3] || i == l && b[l-1] != '\'' {
return [4]int{-1, -1, -1, -1}, 0
}
}
skip = 1
} else {
// Unquoted value: it ends at a backslash, quote, '=', '<', '>', '`',
// brace, or any byte up to 0x20.
result[2] = i
for ; i < l; i++ {
c = b[i]
// NOTE(review): c is a byte, so c >= 0 is always true; only the
// c <= 0x20 bound has an effect.
if c == '\\' || c == '"' || c == '\'' ||
c == '=' || c == '<' || c == '>' || c == '`' ||
c == '{' || c == '}' ||
(c >= 0 && c <= 0x20) {
break
}
}
result[3] = i
// An empty unquoted value is not an attribute.
if result[2] == result[3] {
return [4]int{-1, -1, -1, -1}, 0
}
}
return result, skip
}
// FindURLIndex returns a stop index value if the given bytes seem an URL.
// This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
func FindURLIndex(b []byte) int {