Improve raw html parsing performance

2025-03-04 23:04:52 +00:00 · 2022-03-05 18:45:57 +09:00 · 2022-03-05 18:45:57 +09:00 · 920c3818d4
commit 920c3818d4
parent be2bf82af9
2 changed files with 143 additions and 20 deletions
--- a/extra_test.go
+++ b/extra_test.go
@ -100,8 +100,75 @@ func TestDeepNestedLabelPerformance(t *testing.T) {
 	var b bytes.Buffer
 	_ = markdown.Convert(source, &b)
 	finished := time.Now().UnixMilli()
-	println(finished - started)
+	if (finished - started) > 5000 {
-	if (finished - started) > 3000 {
+		t.Error("Parsing deep nested labels took more 5 secs")
-		t.Error("Parsing deep nested labels took more 3 secs")
+	}
 }
 // TestManyProcessingInstructionPerformance converts an input made of "a "
 // followed by 50000 consecutive "<?" openers with no closing "?>" and fails
 // when the conversion takes longer than 5 seconds.
 func TestManyProcessingInstructionPerformance(t *testing.T) {
 	const limit = 5 * time.Second
 	markdown := New(WithRendererOptions(
 		html.WithXHTML(),
 		html.WithUnsafe(),
 	))
 	// time.Now carries a monotonic reading, so time.Since below is not skewed
 	// by wall-clock adjustments the way subtracting UnixMilli values is.
 	started := time.Now()
 	n := 50000
 	source := []byte("a " + strings.Repeat("<?", n))
 	var b bytes.Buffer
 	// The conversion result is deliberately discarded: only the elapsed time
 	// is under test here.
 	_ = markdown.Convert(source, &b)
 	if time.Since(started) > limit {
 		t.Error("Parsing processing instructions took more 5 secs")
 	}
 }
 // TestManyCDATAPerformance converts 50000 repetitions of "a <![CDATA[" (no
 // closing "]]>" anywhere) and fails when the conversion takes longer than
 // 5 seconds.
 func TestManyCDATAPerformance(t *testing.T) {
 	const limit = 5 * time.Second
 	markdown := New(WithRendererOptions(
 		html.WithXHTML(),
 		html.WithUnsafe(),
 	))
 	// time.Now carries a monotonic reading, so time.Since below is not skewed
 	// by wall-clock adjustments the way subtracting UnixMilli values is.
 	started := time.Now()
 	n := 50000
 	source := []byte(strings.Repeat("a <![CDATA[", n))
 	var b bytes.Buffer
 	// The conversion result is deliberately discarded: only the elapsed time
 	// is under test here.
 	_ = markdown.Convert(source, &b)
 	if time.Since(started) > limit {
 		// Name the construct actually exercised by this test.
 		t.Error("Parsing CDATA took more 5 secs")
 	}
 }
 // TestManyDeclPerformance converts 50000 repetitions of "a <!A " (no closing
 // ">" anywhere) and fails when the conversion takes longer than 5 seconds.
 func TestManyDeclPerformance(t *testing.T) {
 	const limit = 5 * time.Second
 	markdown := New(WithRendererOptions(
 		html.WithXHTML(),
 		html.WithUnsafe(),
 	))
 	// time.Now carries a monotonic reading, so time.Since below is not skewed
 	// by wall-clock adjustments the way subtracting UnixMilli values is.
 	started := time.Now()
 	n := 50000
 	source := []byte(strings.Repeat("a <!A ", n))
 	var b bytes.Buffer
 	// The conversion result is deliberately discarded: only the elapsed time
 	// is under test here.
 	_ = markdown.Convert(source, &b)
 	if time.Since(started) > limit {
 		// Name the construct actually exercised by this test.
 		t.Error("Parsing declarations took more 5 secs")
 	}
 }
 // TestManyCommentPerformance converts 50000 repetitions of "a <!-- " (no
 // closing "-->" anywhere) and fails when the conversion takes longer than
 // 5 seconds.
 func TestManyCommentPerformance(t *testing.T) {
 	const limit = 5 * time.Second
 	markdown := New(WithRendererOptions(
 		html.WithXHTML(),
 		html.WithUnsafe(),
 	))
 	// time.Now carries a monotonic reading, so time.Since below is not skewed
 	// by wall-clock adjustments the way subtracting UnixMilli values is.
 	started := time.Now()
 	n := 50000
 	source := []byte(strings.Repeat("a <!-- ", n))
 	var b bytes.Buffer
 	// The conversion result is deliberately discarded: only the elapsed time
 	// is under test here.
 	_ = markdown.Convert(source, &b)
 	if time.Since(started) > limit {
 		// Name the construct actually exercised by this test.
 		t.Error("Parsing comments took more 5 secs")
 	}
 }
--- a/parser/raw_html.go
+++ b/parser/raw_html.go
@ -32,17 +32,17 @@ func (s *rawHTMLParser) Parse(parent ast.Node, block text.Reader, pc Context) as
 	if len(line) > 2 && line[1] == '/' && util.IsAlphaNumeric(line[2]) {
 		return s.parseMultiLineRegexp(closeTagRegexp, block, pc)
 	}
-	if bytes.HasPrefix(line, []byte("<!--")) {
+	if bytes.HasPrefix(line, openComment) {
-		return s.parseMultiLineRegexp(commentRegexp, block, pc)
+		return s.parseComment(block, pc)
 	}
-	if bytes.HasPrefix(line, []byte("<?")) {
+	if bytes.HasPrefix(line, openProcessingInstruction) {
-		return s.parseSingleLineRegexp(processingInstructionRegexp, block, pc)
+		return s.parseUntil(block, closeProcessingInstruction, pc)
 	}
 	if len(line) > 2 && line[1] == '!' && line[2] >= 'A' && line[2] <= 'Z' {
-		return s.parseSingleLineRegexp(declRegexp, block, pc)
+		return s.parseUntil(block, closeDecl, pc)
 	}
-	if bytes.HasPrefix(line, []byte("<![CDATA[")) {
+	if bytes.HasPrefix(line, openCDATA) {
-		return s.parseMultiLineRegexp(cdataRegexp, block, pc)
+		return s.parseUntil(block, closeCDATA, pc)
 	}
 	return nil
 }
@ -52,21 +52,77 @@ var tagnamePattern = `([A-Za-z][A-Za-z0-9-]*)`
 // Pattern fragments and precompiled regular expressions for raw HTML
 // constructs. Each expression is compiled once at package initialization.
 var (
 	// attributePattern is a non-capturing group for one attribute: leading
 	// whitespace, an attribute name, and an optional "=" followed by an
 	// unquoted, single-quoted, or double-quoted value.
 	attributePattern = `(?:[\r\n \t]+[a-zA-Z_:][a-zA-Z0-9:._-]*(?:[\r\n \t]*=[\r\n \t]*(?:[^\"'=<>` + "`" + `\x00-\x20]+|'[^']*'|"[^"]*"))?)`
 	// openTagRegexp matches "<", a tag name, any number of attributes,
 	// optional spaces or tabs, an optional "/", and ">".
 	openTagRegexp = regexp.MustCompile("^<" + tagnamePattern + attributePattern + `*[ \t]*/?>`)
 	// closeTagRegexp matches "</", a tag name, optional whitespace, and ">".
 	closeTagRegexp = regexp.MustCompile("^</" + tagnamePattern + `\s*>`)
 	// commentRegexp matches the empty comment "<!---->" or a "<!-- ... -->"
 	// comment.
 	commentRegexp = regexp.MustCompile(`^<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->`)
 	// processingInstructionRegexp matches "<?" through the nearest "?>".
 	processingInstructionRegexp = regexp.MustCompile(`^(?:<\?).*?(?:\?>)`)
 	// declRegexp matches "<!", uppercase letters, whitespace, and everything
 	// up to the next ">".
 	declRegexp = regexp.MustCompile(`^<![A-Z]+\s+[^>]*>`)
 	// cdataRegexp matches "<![CDATA[" through the nearest "]]>", newlines
 	// included.
 	cdataRegexp = regexp.MustCompile(`<!\[CDATA\[[\s\S]*?\]\]>`)
 )
-func (s *rawHTMLParser) parseSingleLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {
 // Byte-slice delimiters for raw HTML constructs, declared once at package
 // level so the parsing code does not rebuild them from string literals.
 // openProcessingInstruction is the "<?" prefix that starts a processing
 // instruction.
+var openProcessingInstruction = []byte("<?")
 // closeProcessingInstruction is the "?>" terminator of a processing instruction.
 var closeProcessingInstruction = []byte("?>")
 // openCDATA is the prefix that starts a CDATA section.
 var openCDATA = []byte("<![CDATA[")
 // closeCDATA is the "]]>" terminator of a CDATA section.
 var closeCDATA = []byte("]]>")
 // closeDecl is the ">" terminator searched for when parsing a declaration.
 var closeDecl = []byte(">")
 // emptyComment is the complete empty comment, accepted as-is by parseComment.
 var emptyComment = []byte("<!---->")
 // invalidComment1 and invalidComment2 are prefixes that parseComment rejects.
 var invalidComment1 = []byte("<!-->")
 var invalidComment2 = []byte("<!--->")
 // openComment is the prefix that starts a comment.
 var openComment = []byte("<!--")
 // closeComment is the "-->" terminator of a comment.
 var closeComment = []byte("-->")
 // doubleHyphen is the "--" sequence that parseComment looks for inside a
 // comment body.
 var doubleHyphen = []byte("--")
 // parseComment tries to read an HTML comment starting at the reader's current
 // position. A line beginning with the empty comment "<!---->" is accepted
 // immediately; a line beginning with "<!-->" or "<!--->" is rejected. Otherwise
 // the text after "<!--" is scanned line by line for "-->". It returns a RawHTML
 // node whose segments cover the consumed text, or nil when no comment is
 // recognized; after a failed scan the reader is put back where it started.
 func (s *rawHTMLParser) parseComment(block text.Reader, pc Context) ast.Node {
 	// Remember the starting position so a failed scan can rewind the reader.
 	savedLine, savedSegment := block.Position()
 	node := ast.NewRawHTML()
 	line, segment := block.PeekLine()
-	match := reg.FindSubmatchIndex(line)
+	if bytes.HasPrefix(line, emptyComment) {
-	if match == nil {
+		node.Segments.Append(segment.WithStop(segment.Start + len(emptyComment)))
 		block.Advance(len(emptyComment))
 		return node
 	}
 	// Nothing has been consumed yet, so these rejections need no rewind.
 	if bytes.HasPrefix(line, invalidComment1) || bytes.HasPrefix(line, invalidComment2) {
 		return nil
 	}
 	// Skip the "<!--" opener on the first line; offset records how many bytes
 	// of the peeked line were dropped and is reset to 0 for later lines.
 	offset := len(openComment)
 	line = line[offset:]
 	for {
 		// hindex is relative to the (possibly sliced) line, while index has
 		// offset added to it.
 		// NOTE(review): when closeComment is absent bytes.Index returns -1, so
 		// on the first line index becomes offset-1 and still passes the
 		// `index > -1` check; hindex and index also use different origins on
 		// that line, and line[index-1] indexes the sliced line with an
 		// offset-adjusted value. Confirm this is intended.
 		hindex := bytes.Index(line, doubleHyphen)
 		index := bytes.Index(line, closeComment) + offset
 		if index > -1 && hindex == index {
 			if index == 0 || line[index-1] != '-' {
 				node.Segments.Append(segment.WithStop(segment.Start + index + len(closeComment)))
 				block.Advance(index + len(closeComment))
 				return node
 			}
 		}
 		// A "--" found past the first byte that was not accepted above ends
 		// the scan; the function then rewinds and returns nil.
 		if hindex > 0 {
 			break
 		}
 		// No terminator on this line: keep the whole line and move to the next.
 		node.Segments.Append(segment)
 		block.AdvanceLine()
 		line, segment = block.PeekLine()
 		offset = 0
 		if line == nil {
 			break
 		}
 	}
 	// Scan failed: restore the reader so the caller sees the input untouched.
 	block.SetPosition(savedLine, savedSegment)
 	return nil
 }
 // parseUntil consumes input from the reader's current position up to and
 // including the first occurrence of closer, which may lie on a later line.
 // It returns a RawHTML node whose segments cover the consumed text, or nil
 // with the reader position restored when PeekLine yields no further line
 // before closer is found. pc is not used by this function.
 func (s *rawHTMLParser) parseUntil(block text.Reader, closer []byte, pc Context) ast.Node {
 	// Remember the starting position so a failed scan can rewind the reader.
 	savedLine, savedSegment := block.Position()
 	node := ast.NewRawHTML()
-	node.Segments.Append(segment.WithStop(segment.Start + match[1]))
+	for {
-	block.Advance(match[1])
+		line, segment := block.PeekLine()
-	return node
+		if line == nil {
 			break
 		}
 		index := bytes.Index(line, closer)
 		if index > -1 {
 			// Closer found: keep the line up to the end of closer and stop.
 			node.Segments.Append(segment.WithStop(segment.Start + index + len(closer)))
 			block.Advance(index + len(closer))
 			return node
 		}
 		// Closer not on this line: keep the whole line and move to the next.
 		node.Segments.Append(segment)
 		block.AdvanceLine()
 	}
 	// Closer never appeared: restore the reader and report no match.
 	block.SetPosition(savedLine, savedSegment)
 	return nil
 }
 func (s *rawHTMLParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {