mirror of
https://github.com/yuin/goldmark
synced 2025-03-04 23:04:52 +00:00
Improve raw html parsing performance
This commit is contained in:
parent
be2bf82af9
commit
920c3818d4
2 changed files with 143 additions and 20 deletions
|
|
@ -100,8 +100,75 @@ func TestDeepNestedLabelPerformance(t *testing.T) {
|
||||||
var b bytes.Buffer
|
var b bytes.Buffer
|
||||||
_ = markdown.Convert(source, &b)
|
_ = markdown.Convert(source, &b)
|
||||||
finished := time.Now().UnixMilli()
|
finished := time.Now().UnixMilli()
|
||||||
println(finished - started)
|
if (finished - started) > 5000 {
|
||||||
if (finished - started) > 3000 {
|
t.Error("Parsing deep nested labels took more 5 secs")
|
||||||
t.Error("Parsing deep nested labels took more 3 secs")
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManyProcessingInstructionPerformance(t *testing.T) {
|
||||||
|
markdown := New(WithRendererOptions(
|
||||||
|
html.WithXHTML(),
|
||||||
|
html.WithUnsafe(),
|
||||||
|
))
|
||||||
|
|
||||||
|
started := time.Now().UnixMilli()
|
||||||
|
n := 50000
|
||||||
|
source := []byte("a " + strings.Repeat("<?", n))
|
||||||
|
var b bytes.Buffer
|
||||||
|
_ = markdown.Convert(source, &b)
|
||||||
|
finished := time.Now().UnixMilli()
|
||||||
|
if (finished - started) > 5000 {
|
||||||
|
t.Error("Parsing processing instructions took more 5 secs")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManyCDATAPerformance(t *testing.T) {
|
||||||
|
markdown := New(WithRendererOptions(
|
||||||
|
html.WithXHTML(),
|
||||||
|
html.WithUnsafe(),
|
||||||
|
))
|
||||||
|
|
||||||
|
started := time.Now().UnixMilli()
|
||||||
|
n := 50000
|
||||||
|
source := []byte(strings.Repeat("a <![CDATA[", n))
|
||||||
|
var b bytes.Buffer
|
||||||
|
_ = markdown.Convert(source, &b)
|
||||||
|
finished := time.Now().UnixMilli()
|
||||||
|
if (finished - started) > 5000 {
|
||||||
|
t.Error("Parsing processing instructions took more 5 secs")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManyDeclPerformance(t *testing.T) {
|
||||||
|
markdown := New(WithRendererOptions(
|
||||||
|
html.WithXHTML(),
|
||||||
|
html.WithUnsafe(),
|
||||||
|
))
|
||||||
|
|
||||||
|
started := time.Now().UnixMilli()
|
||||||
|
n := 50000
|
||||||
|
source := []byte(strings.Repeat("a <!A ", n))
|
||||||
|
var b bytes.Buffer
|
||||||
|
_ = markdown.Convert(source, &b)
|
||||||
|
finished := time.Now().UnixMilli()
|
||||||
|
if (finished - started) > 5000 {
|
||||||
|
t.Error("Parsing processing instructions took more 5 secs")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestManyCommentPerformance(t *testing.T) {
|
||||||
|
markdown := New(WithRendererOptions(
|
||||||
|
html.WithXHTML(),
|
||||||
|
html.WithUnsafe(),
|
||||||
|
))
|
||||||
|
|
||||||
|
started := time.Now().UnixMilli()
|
||||||
|
n := 50000
|
||||||
|
source := []byte(strings.Repeat("a <!-- ", n))
|
||||||
|
var b bytes.Buffer
|
||||||
|
_ = markdown.Convert(source, &b)
|
||||||
|
finished := time.Now().UnixMilli()
|
||||||
|
if (finished - started) > 5000 {
|
||||||
|
t.Error("Parsing processing instructions took more 5 secs")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -32,17 +32,17 @@ func (s *rawHTMLParser) Parse(parent ast.Node, block text.Reader, pc Context) as
|
||||||
if len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]) {
|
if len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]) {
|
||||||
return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
|
return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
|
||||||
}
|
}
|
||||||
if bytes.HasPrefix(line, []byte("<!--")) {
|
if bytes.HasPrefix(line, openComment) {
|
||||||
return s.parseMultiLineRegexp(commentRegexp, block, pc)
|
return s.parseComment(block, pc)
|
||||||
}
|
}
|
||||||
if bytes.HasPrefix(line, []byte("<?")) {
|
if bytes.HasPrefix(line, openProcessingInstruction) {
|
||||||
return s.parseSingleLineRegexp(processingInstructionRegexp, block, pc)
|
return s.parseUntil(block, closeProcessingInstruction, pc)
|
||||||
}
|
}
|
||||||
if len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z' {
|
if len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z' {
|
||||||
return s.parseSingleLineRegexp(declRegexp, block, pc)
|
return s.parseUntil(block, closeDecl, pc)
|
||||||
}
|
}
|
||||||
if bytes.HasPrefix(line, []byte("<![CDATA[")) {
|
if bytes.HasPrefix(line, openCDATA) {
|
||||||
return s.parseMultiLineRegexp(cdataRegexp, block, pc)
|
return s.parseUntil(block, closeCDATA, pc)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
@ -52,21 +52,77 @@ var tagnamePattern = `([A-Za-z][A-Za-z0-9-]*)`
|
||||||
var attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)`
|
var attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)`
|
||||||
var openTagRegexp = regexp.MustCompile("^<" + tagnamePattern + attributePattern + `*[ \t]*/?>`)
|
var openTagRegexp = regexp.MustCompile("^<" + tagnamePattern + attributePattern + `*[ \t]*/?>`)
|
||||||
var closeTagRegexp = regexp.MustCompile("^</" + tagnamePattern + `\s*>`)
|
var closeTagRegexp = regexp.MustCompile("^</" + tagnamePattern + `\s*>`)
|
||||||
var commentRegexp = regexp.MustCompile(`^<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->`)
|
|
||||||
var processingInstructionRegexp = regexp.MustCompile(`^(?:<\?).*?(?:\?>)`)
|
|
||||||
var declRegexp = regexp.MustCompile(`^<![A-Z]+\s+[^>]*>`)
|
|
||||||
var cdataRegexp = regexp.MustCompile(`<!\[CDATA\[[\s\S]*?\]\]>`)
|
|
||||||
|
|
||||||
func (s *rawHTMLParser) parseSingleLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {
|
var openProcessingInstruction = []byte("<?")
|
||||||
|
var closeProcessingInstruction = []byte("?>")
|
||||||
|
var openCDATA = []byte("<![CDATA[")
|
||||||
|
var closeCDATA = []byte("]]>")
|
||||||
|
var closeDecl = []byte(">")
|
||||||
|
var emptyComment = []byte("<!---->")
|
||||||
|
var invalidComment1 = []byte("<!-->")
|
||||||
|
var invalidComment2 = []byte("<!--->")
|
||||||
|
var openComment = []byte("<!--")
|
||||||
|
var closeComment = []byte("-->")
|
||||||
|
var doubleHyphen = []byte("--")
|
||||||
|
|
||||||
|
func (s *rawHTMLParser) parseComment(block text.Reader, pc Context) ast.Node {
|
||||||
|
savedLine, savedSegment := block.Position()
|
||||||
|
node := ast.NewRawHTML()
|
||||||
line, segment := block.PeekLine()
|
line, segment := block.PeekLine()
|
||||||
match := reg.FindSubmatchIndex(line)
|
if bytes.HasPrefix(line, emptyComment) {
|
||||||
if match == nil {
|
node.Segments.Append(segment.WithStop(segment.Start + len(emptyComment)))
|
||||||
|
block.Advance(len(emptyComment))
|
||||||
|
return node
|
||||||
|
}
|
||||||
|
if bytes.HasPrefix(line, invalidComment1) || bytes.HasPrefix(line, invalidComment2) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
offset := len(openComment)
|
||||||
|
line = line[offset:]
|
||||||
|
for {
|
||||||
|
hindex := bytes.Index(line, doubleHyphen)
|
||||||
|
index := bytes.Index(line, closeComment) + offset
|
||||||
|
if index > -1 && hindex == index {
|
||||||
|
if index == 0 || line[index-1] != '-' {
|
||||||
|
node.Segments.Append(segment.WithStop(segment.Start + index + len(closeComment)))
|
||||||
|
block.Advance(index + len(closeComment))
|
||||||
|
return node
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if hindex > 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
node.Segments.Append(segment)
|
||||||
|
block.AdvanceLine()
|
||||||
|
line, segment = block.PeekLine()
|
||||||
|
offset = 0
|
||||||
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
block.SetPosition(savedLine, savedSegment)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *rawHTMLParser) parseUntil(block text.Reader, closer []byte, pc Context) ast.Node {
|
||||||
|
savedLine, savedSegment := block.Position()
|
||||||
node := ast.NewRawHTML()
|
node := ast.NewRawHTML()
|
||||||
node.Segments.Append(segment.WithStop(segment.Start + match[1]))
|
for {
|
||||||
block.Advance(match[1])
|
line, segment := block.PeekLine()
|
||||||
return node
|
if line == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
index := bytes.Index(line, closer)
|
||||||
|
if index > -1 {
|
||||||
|
node.Segments.Append(segment.WithStop(segment.Start + index + len(closer)))
|
||||||
|
block.Advance(index + len(closer))
|
||||||
|
return node
|
||||||
|
}
|
||||||
|
node.Segments.Append(segment)
|
||||||
|
block.AdvanceLine()
|
||||||
|
}
|
||||||
|
block.SetPosition(savedLine, savedSegment)
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *rawHTMLParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {
|
func (s *rawHTMLParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue