mirror of
https://github.com/yuin/goldmark
synced 2025-03-04 23:04:52 +00:00
Change attribute parsing strategy
This commit is contained in:
parent
4a770685c0
commit
667a2920f2
7 changed files with 109 additions and 244 deletions
1
_benchmark/cmark/.gitignore
vendored
1
_benchmark/cmark/.gitignore
vendored
|
|
@ -1 +1,2 @@
|
|||
cmark-master
|
||||
cmark_benchmark
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ import (
|
|||
"gitlab.com/golang-commonmark/markdown"
|
||||
|
||||
bf1 "github.com/russross/blackfriday"
|
||||
bf2 "github.com/russross/blackfriday/v2"
|
||||
bf2 "gopkg.in/russross/blackfriday.v2"
|
||||
)
|
||||
|
||||
func BenchmarkMarkdown(b *testing.B) {
|
||||
|
|
|
|||
31
ast/ast.go
31
ast/ast.go
|
|
@ -42,7 +42,7 @@ func NewNodeKind(name string) NodeKind {
|
|||
// An Attribute is an attribute of the Node
|
||||
type Attribute struct {
|
||||
Name []byte
|
||||
Value []byte
|
||||
Value interface{}
|
||||
}
|
||||
|
||||
var attrNameIDS = []byte("#")
|
||||
|
|
@ -143,17 +143,20 @@ type Node interface {
|
|||
IsRaw() bool
|
||||
|
||||
// SetAttribute sets the given value to the attributes.
|
||||
SetAttribute(name, value []byte)
|
||||
SetAttribute(name []byte, value interface{})
|
||||
|
||||
// SetAttributeString sets the given value to the attributes.
|
||||
SetAttributeString(name string, value interface{})
|
||||
|
||||
// Attribute returns an (attribute value, true) pair if an attribute
|
||||
// associated with the given name is found, otherwise
|
||||
// (nil, false)
|
||||
Attribute(name []byte) ([]byte, bool)
|
||||
Attribute(name []byte) (interface{}, bool)
|
||||
|
||||
// AttributeString returns an (attribute value, true) pair if an attribute
|
||||
// associated with the given name is found, otherwise
|
||||
// (nil, false)
|
||||
AttributeString(name string) ([]byte, bool)
|
||||
AttributeString(name string) (interface{}, bool)
|
||||
|
||||
// Attributes returns a list of attributes.
|
||||
// This may be nil if there are no attributes.
|
||||
|
|
@ -327,7 +330,7 @@ func (n *BaseNode) Text(source []byte) []byte {
|
|||
}
|
||||
|
||||
// SetAttribute implements Node.SetAttribute.
|
||||
func (n *BaseNode) SetAttribute(name, value []byte) {
|
||||
func (n *BaseNode) SetAttribute(name []byte, value interface{}) {
|
||||
if n.attributes == nil {
|
||||
n.attributes = make([]Attribute, 0, 10)
|
||||
} else {
|
||||
|
|
@ -339,20 +342,16 @@ func (n *BaseNode) SetAttribute(name, value []byte) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if len(name) == 1 {
|
||||
if name[0] == '#' {
|
||||
n.attributes = append(n.attributes, Attribute{attrNameID, value})
|
||||
return
|
||||
} else if name[0] == '.' {
|
||||
n.attributes = append(n.attributes, Attribute{attrNameClass, value})
|
||||
return
|
||||
}
|
||||
}
|
||||
n.attributes = append(n.attributes, Attribute{name, value})
|
||||
}
|
||||
|
||||
// SetAttributeString implements Node.SetAttributeString
|
||||
func (n *BaseNode) SetAttributeString(name string, value interface{}) {
|
||||
n.SetAttribute(util.StringToReadOnlyBytes(name), value)
|
||||
}
|
||||
|
||||
// Attribute implements Node.Attribute.
|
||||
func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
|
||||
func (n *BaseNode) Attribute(name []byte) (interface{}, bool) {
|
||||
if n.attributes == nil {
|
||||
return nil, false
|
||||
}
|
||||
|
|
@ -365,7 +364,7 @@ func (n *BaseNode) Attribute(name []byte) ([]byte, bool) {
|
|||
}
|
||||
|
||||
// AttributeString implements Node.AttributeString.
|
||||
func (n *BaseNode) AttributeString(s string) ([]byte, bool) {
|
||||
func (n *BaseNode) AttributeString(s string) (interface{}, bool) {
|
||||
return n.Attribute(util.StringToReadOnlyBytes(s))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,21 +2,47 @@ package parser
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"github.com/yuin/goldmark/text"
|
||||
"github.com/yuin/goldmark/util"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
type attribute struct {
|
||||
Name string
|
||||
var attrNameID = []byte("id")
|
||||
var attrNameClass = []byte("class")
|
||||
|
||||
// An Attribute is an attribute of the markdown elements
|
||||
type Attribute struct {
|
||||
Name []byte
|
||||
Value interface{}
|
||||
}
|
||||
|
||||
// Attributes is a collection of attributes.
|
||||
type Attributes []Attribute
|
||||
|
||||
// Find returns (value, true) if an attribute corresponding to the given name is found, otherwise (nil, false).
|
||||
func (as Attributes) Find(name []byte) (interface{}, bool) {
|
||||
for _, a := range as {
|
||||
if bytes.Equal(a.Name, name) {
|
||||
return a.Value, true
|
||||
}
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
func (as Attributes) findUpdate(name []byte, cb func(v interface{}) interface{}) bool {
|
||||
for i, a := range as {
|
||||
if bytes.Equal(a.Name, name) {
|
||||
as[i].Value = cb(a.Value)
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ParseAttributes parses attributes into an Attributes collection.
|
||||
// ParseAttributes returns a parsed map and true if could parse
|
||||
// ParseAttributes returns the parsed attributes and true if it could parse
|
||||
// attributes, otherwise nil and false.
|
||||
func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
|
||||
func ParseAttributes(reader text.Reader) (Attributes, bool) {
|
||||
savedLine, savedPosition := reader.Position()
|
||||
reader.SkipSpaces()
|
||||
if reader.Peek() != '{' {
|
||||
|
|
@ -24,28 +50,29 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
|
|||
return nil, false
|
||||
}
|
||||
reader.Advance(1)
|
||||
m := map[string]interface{}{}
|
||||
attrs := Attributes{}
|
||||
for {
|
||||
if reader.Peek() == '}' {
|
||||
reader.Advance(1)
|
||||
return m, true
|
||||
return attrs, true
|
||||
}
|
||||
attr, ok := parseAttribute(reader)
|
||||
if !ok {
|
||||
reader.SetPosition(savedLine, savedPosition)
|
||||
return nil, false
|
||||
}
|
||||
if attr.Name == "class" {
|
||||
if v, ok := m["class"]; ok {
|
||||
if _, ok2 := v.([][]byte); !ok2 {
|
||||
m["class"] = [][]byte{v.([]byte)}
|
||||
if bytes.Equal(attr.Name, attrNameClass) {
|
||||
if !attrs.findUpdate(attrNameClass, func(v interface{}) interface{} {
|
||||
var ret interface{}
|
||||
if ret, ok = v.([][]byte); !ok {
|
||||
ret = [][]byte{v.([]byte)}
|
||||
}
|
||||
m["class"] = append(m["class"].([][]byte), util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value)))
|
||||
} else {
|
||||
m["class"] = util.StringToReadOnlyBytes(fmt.Sprintf("%v", attr.Value))
|
||||
return append(ret.([][]byte), attr.Value.([]byte))
|
||||
}) {
|
||||
attrs = append(attrs, attr)
|
||||
}
|
||||
} else {
|
||||
m[attr.Name] = attr.Value
|
||||
attrs = append(attrs, attr)
|
||||
}
|
||||
reader.SkipSpaces()
|
||||
if reader.Peek() == ',' {
|
||||
|
|
@ -55,7 +82,7 @@ func ParseAttributes(reader text.Reader) (map[string]interface{}, bool) {
|
|||
}
|
||||
}
|
||||
|
||||
func parseAttribute(reader text.Reader) (attribute, bool) {
|
||||
func parseAttribute(reader text.Reader) (Attribute, bool) {
|
||||
reader.SkipSpaces()
|
||||
c := reader.Peek()
|
||||
if c == '#' || c == '.' {
|
||||
|
|
@ -64,18 +91,18 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
|
|||
i := 0
|
||||
for ; i < len(line) && !util.IsSpace(line[i]) && (!util.IsPunct(line[i]) || line[i] == '_' || line[i] == '-'); i++ {
|
||||
}
|
||||
name := "class"
|
||||
name := attrNameClass
|
||||
if c == '#' {
|
||||
name = "id"
|
||||
name = attrNameID
|
||||
}
|
||||
reader.Advance(i)
|
||||
return attribute{Name: name, Value: line[0:i]}, true
|
||||
return Attribute{Name: name, Value: line[0:i]}, true
|
||||
}
|
||||
line, _ := reader.PeekLine()
|
||||
c = line[0]
|
||||
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
c == '_' || c == ':') {
|
||||
return attribute{}, false
|
||||
return Attribute{}, false
|
||||
}
|
||||
i := 0
|
||||
for ; i < len(line); i++ {
|
||||
|
|
@ -86,20 +113,20 @@ func parseAttribute(reader text.Reader) (attribute, bool) {
|
|||
break
|
||||
}
|
||||
}
|
||||
name := string(line[:i])
|
||||
name := line[:i]
|
||||
reader.Advance(i)
|
||||
reader.SkipSpaces()
|
||||
c = reader.Peek()
|
||||
if c != '=' {
|
||||
return attribute{}, false
|
||||
return Attribute{}, false
|
||||
}
|
||||
reader.Advance(1)
|
||||
reader.SkipSpaces()
|
||||
value, ok := parseAttributeValue(reader)
|
||||
if !ok {
|
||||
return attribute{}, false
|
||||
return Attribute{}, false
|
||||
}
|
||||
return attribute{Name: name, Value: value}, true
|
||||
return Attribute{Name: name, Value: value}, true
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -110,7 +137,7 @@ func parseAttributeValue(reader text.Reader) (interface{}, bool) {
|
|||
ok := false
|
||||
switch c {
|
||||
case text.EOF:
|
||||
return attribute{}, false
|
||||
return Attribute{}, false
|
||||
case '{':
|
||||
value, ok = ParseAttributes(reader)
|
||||
case '[':
|
||||
|
|
|
|||
|
|
@ -99,8 +99,8 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
|
|||
parsed := false
|
||||
if b.Attribute { // handles special case like ### heading ### {#id}
|
||||
start--
|
||||
closureOpen := -1
|
||||
closureClose := -1
|
||||
closureOpen := -1
|
||||
for i := start; i < stop; {
|
||||
c := line[i]
|
||||
if util.IsEscapedPunctuation(line, i) {
|
||||
|
|
@ -117,28 +117,14 @@ func (b *atxHeadingParser) Open(parent ast.Node, reader text.Reader, pc Context)
|
|||
}
|
||||
}
|
||||
if closureClose > 0 {
|
||||
i := closureClose
|
||||
for ; i < stop && util.IsSpace(line[i]); i++ {
|
||||
reader.Advance(closureClose)
|
||||
attrs, ok := ParseAttributes(reader)
|
||||
parsed = ok
|
||||
if parsed {
|
||||
for _, attr := range attrs {
|
||||
node.SetAttribute(attr.Name, attr.Value)
|
||||
}
|
||||
if i < stop-1 || line[i] == '{' {
|
||||
as := i + 1
|
||||
for as < stop {
|
||||
ai, skip := util.FindAttributeIndex(line[as:], true)
|
||||
if ai[0] < 0 {
|
||||
break
|
||||
}
|
||||
node.SetAttribute(line[as+ai[0]:as+ai[1]],
|
||||
util.UnescapePunctuations(line[as+ai[2]:as+ai[3]]))
|
||||
as += ai[3] + skip
|
||||
}
|
||||
for ; as < stop && util.IsSpace(line[as]); as++ {
|
||||
}
|
||||
if line[as] == '}' && (as > stop-2 || util.IsBlank(line[as:])) {
|
||||
parsed = true
|
||||
node.Lines().Append(text.NewSegment(segment.Start+start+1, segment.Start+closureOpen))
|
||||
} else {
|
||||
node.RemoveAttributes()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -194,7 +180,6 @@ func (b *atxHeadingParser) CanAcceptIndentedLine() bool {
|
|||
}
|
||||
|
||||
var attrAutoHeadingIDPrefix = []byte("heading")
|
||||
var attrNameID = []byte("#")
|
||||
|
||||
func generateAutoHeadingID(node *ast.Heading, reader text.Reader, pc Context) {
|
||||
lastIndex := node.Lines().Len() - 1
|
||||
|
|
@ -208,14 +193,37 @@ func parseLastLineAttributes(node ast.Node, reader text.Reader, pc Context) {
|
|||
lastIndex := node.Lines().Len() - 1
|
||||
lastLine := node.Lines().At(lastIndex)
|
||||
line := lastLine.Value(reader.Source())
|
||||
indicies := util.FindAttributeIndiciesReverse(line, true)
|
||||
if indicies != nil {
|
||||
for _, index := range indicies {
|
||||
node.SetAttribute(line[index[0]:index[1]],
|
||||
util.UnescapePunctuations(line[index[2]:index[3]]))
|
||||
lr := text.NewReader(line)
|
||||
var attrs Attributes
|
||||
var ok bool
|
||||
var start text.Segment
|
||||
var sl int
|
||||
var end text.Segment
|
||||
for {
|
||||
c := lr.Peek()
|
||||
if c == text.EOF {
|
||||
break
|
||||
}
|
||||
lastLine.Stop = lastLine.Start + indicies[0][0] - 1
|
||||
lastLine.TrimRightSpace(reader.Source())
|
||||
if c == '\\' {
|
||||
lr.Advance(1)
|
||||
if lr.Peek() == '{' {
|
||||
lr.Advance(1)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if c == '{' {
|
||||
sl, start = lr.Position()
|
||||
attrs, ok = ParseAttributes(lr)
|
||||
_, end = lr.Position()
|
||||
lr.SetPosition(sl, start)
|
||||
}
|
||||
lr.Advance(1)
|
||||
}
|
||||
if ok && util.IsBlank(line[end.Stop:]) {
|
||||
for _, attr := range attrs {
|
||||
node.SetAttribute(attr.Name, attr.Value)
|
||||
}
|
||||
lastLine.Stop = lastLine.Start + start.Start
|
||||
node.Lines().Set(lastIndex, lastLine)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -505,11 +505,12 @@ func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node,
|
|||
|
||||
// RenderAttributes renders given node's attributes.
|
||||
func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) {
|
||||
|
||||
for _, attr := range node.Attributes() {
|
||||
_, _ = w.WriteString(" ")
|
||||
_, _ = w.Write(attr.Name)
|
||||
_, _ = w.WriteString(`="`)
|
||||
_, _ = w.Write(util.EscapeHTML(attr.Value))
|
||||
_, _ = w.Write(util.EscapeHTML(attr.Value.([]byte)))
|
||||
_ = w.WriteByte('"')
|
||||
}
|
||||
}
|
||||
|
|
|
|||
171
util/util.go
171
util/util.go
|
|
@ -631,177 +631,6 @@ func URLEscape(v []byte, resolveReference bool) []byte {
|
|||
return cob.Bytes()
|
||||
}
|
||||
|
||||
// FindAttributeIndiciesReverse searches attribute indicies from tail of the given
|
||||
// bytes and returns indicies.
|
||||
func FindAttributeIndiciesReverse(b []byte, canEscapeQuotes bool) [][4]int {
|
||||
i := 0
|
||||
retry:
|
||||
var result [][4]int
|
||||
as := -1
|
||||
for i < len(b) {
|
||||
if IsEscapedPunctuation(b, i) {
|
||||
i += 2
|
||||
continue
|
||||
}
|
||||
if b[i] == '{' {
|
||||
i++
|
||||
as = i
|
||||
break
|
||||
}
|
||||
i++
|
||||
}
|
||||
if as < 0 {
|
||||
return nil
|
||||
}
|
||||
for as < len(b) {
|
||||
ai, skip := FindAttributeIndex(b[as:], canEscapeQuotes)
|
||||
if ai[0] < 0 {
|
||||
break
|
||||
}
|
||||
i = as + ai[3]
|
||||
if result == nil {
|
||||
result = [][4]int{}
|
||||
}
|
||||
result = append(result, [4]int{as + ai[0], as + ai[1], as + ai[2], as + ai[3]})
|
||||
as += ai[3] + skip
|
||||
}
|
||||
if b[as] == '}' && (as > len(b)-2 || IsBlank(b[as:])) {
|
||||
return result
|
||||
}
|
||||
goto retry
|
||||
}
|
||||
|
||||
// FindAttributeIndex searches
|
||||
// - #id
|
||||
// - .class
|
||||
// - attr=value
|
||||
// in given bytes.
|
||||
// FindHTMLAttributeIndex returns an int array that elements are
|
||||
// [name_start, name_stop, value_start, value_stop].
|
||||
// value_start and value_stop does not include " or '.
|
||||
// If no attributes found, it returns ([4]int{-1, -1, -1, -1}, 0).
|
||||
func FindAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
|
||||
result := [4]int{-1, -1, -1, -1}
|
||||
i := 0
|
||||
l := len(b)
|
||||
for ; i < l && IsSpace(b[i]); i++ {
|
||||
}
|
||||
if i >= l {
|
||||
return result, 0
|
||||
}
|
||||
c := b[i]
|
||||
if c == '#' || c == '.' {
|
||||
result[0] = i
|
||||
i++
|
||||
result[1] = i
|
||||
result[2] = i
|
||||
for ; i < l && !IsSpace(b[i]) && (!IsPunct(b[i]) || b[i] == '_' || b[i] == '-'); i++ {
|
||||
}
|
||||
result[3] = i
|
||||
return result, 0
|
||||
}
|
||||
return FindHTMLAttributeIndex(b, canEscapeQuotes)
|
||||
}
|
||||
|
||||
// FindHTMLAttributeIndex searches HTML attributes in given bytes.
|
||||
// FindHTMLAttributeIndex returns an int array that elements are
|
||||
// [name_start, name_stop, value_start, value_stop].
|
||||
// value_start and value_stop does not include " or '.
|
||||
// If no attributes found, it returns [4]int{-1, -1, -1, -1}.
|
||||
func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) ([4]int, int) {
|
||||
result := [4]int{-1, -1, -1, -1}
|
||||
i := 0
|
||||
l := len(b)
|
||||
for ; i < l && IsSpace(b[i]); i++ {
|
||||
}
|
||||
if i >= l {
|
||||
return result, 0
|
||||
}
|
||||
c := b[i]
|
||||
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
c == '_' || c == ':') {
|
||||
return result, 0
|
||||
}
|
||||
result[0] = i
|
||||
for ; i < l; i++ {
|
||||
c := b[i]
|
||||
if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') ||
|
||||
c == '_' || c == ':' || c == '.' || c == '-') {
|
||||
break
|
||||
}
|
||||
}
|
||||
result[1] = i
|
||||
for ; i < l && IsSpace(b[i]); i++ {
|
||||
}
|
||||
if i >= l {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
if b[i] != '=' {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
i++
|
||||
for ; i < l && IsSpace(b[i]); i++ {
|
||||
}
|
||||
if i >= l {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
skip := 0
|
||||
if b[i] == '"' {
|
||||
i++
|
||||
result[2] = i
|
||||
if canEscapeQuotes {
|
||||
pos := FindClosure(b[i:], '"', '"', false, false)
|
||||
if pos < 0 {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
result[3] = pos + i
|
||||
} else {
|
||||
for ; i < l && b[i] != '"'; i++ {
|
||||
}
|
||||
result[3] = i
|
||||
if result[2] == result[3] || i == l && b[l-1] != '"' {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
}
|
||||
skip = 1
|
||||
} else if b[i] == '\'' {
|
||||
i++
|
||||
result[2] = i
|
||||
if canEscapeQuotes {
|
||||
pos := FindClosure(b[i:], '\'', '\'', false, false)
|
||||
if pos < 0 {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
result[3] = pos + i
|
||||
} else {
|
||||
for ; i < l && b[i] != '\''; i++ {
|
||||
}
|
||||
result[3] = i
|
||||
if result[2] == result[3] || i == l && b[l-1] != '\'' {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
}
|
||||
skip = 1
|
||||
} else {
|
||||
result[2] = i
|
||||
for ; i < l; i++ {
|
||||
c = b[i]
|
||||
if c == '\\' || c == '"' || c == '\'' ||
|
||||
c == '=' || c == '<' || c == '>' || c == '`' ||
|
||||
c == '{' || c == '}' ||
|
||||
(c >= 0 && c <= 0x20) {
|
||||
break
|
||||
}
|
||||
}
|
||||
result[3] = i
|
||||
if result[2] == result[3] {
|
||||
return [4]int{-1, -1, -1, -1}, 0
|
||||
}
|
||||
}
|
||||
return result, skip
|
||||
}
|
||||
|
||||
// FindURLIndex returns a stop index value if the given bytes seem an URL.
|
||||
// This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
|
||||
func FindURLIndex(b []byte) int {
|
||||
|
|
|
|||
Loading…
Reference in a new issue