From 920c3818d40e572fc3fe19724e2435d1ff4e130c Mon Sep 17 00:00:00 2001 From: yuin Date: Sat, 5 Mar 2022 18:45:57 +0900 Subject: [PATCH] Improve raw html parsing performance --- extra_test.go | 73 +++++++++++++++++++++++++++++++++++-- parser/raw_html.go | 90 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 143 insertions(+), 20 deletions(-) diff --git a/extra_test.go b/extra_test.go index 762bee8..8378243 100644 --- a/extra_test.go +++ b/extra_test.go @@ -100,8 +100,75 @@ func TestDeepNestedLabelPerformance(t *testing.T) { var b bytes.Buffer _ = markdown.Convert(source, &b) finished := time.Now().UnixMilli() - println(finished - started) - if (finished - started) > 3000 { - t.Error("Parsing deep nested labels took more 3 secs") + if (finished - started) > 5000 { + t.Error("Parsing deep nested labels took more 5 secs") + } +} + +func TestManyProcessingInstructionPerformance(t *testing.T) { + markdown := New(WithRendererOptions( + html.WithXHTML(), + html.WithUnsafe(), + )) + + started := time.Now().UnixMilli() + n := 50000 + source := []byte("a " + strings.Repeat(" 5000 { + t.Error("Parsing processing instructions took more 5 secs") + } +} + +func TestManyCDATAPerformance(t *testing.T) { + markdown := New(WithRendererOptions( + html.WithXHTML(), + html.WithUnsafe(), + )) + + started := time.Now().UnixMilli() + n := 50000 + source := []byte(strings.Repeat("a 5000 { + t.Error("Parsing processing instructions took more 5 secs") + } +} + +func TestManyDeclPerformance(t *testing.T) { + markdown := New(WithRendererOptions( + html.WithXHTML(), + html.WithUnsafe(), + )) + + started := time.Now().UnixMilli() + n := 50000 + source := []byte(strings.Repeat("a 5000 { + t.Error("Parsing processing instructions took more 5 secs") + } +} + +func TestManyCommentPerformance(t *testing.T) { + markdown := New(WithRendererOptions( + html.WithXHTML(), + html.WithUnsafe(), + )) + + started := time.Now().UnixMilli() + n := 50000 + source := []byte(strings.Repeat("a |`) -var processingInstructionRegexp = regexp.MustCompile(`^(?:<\?).*?(?:\?>)`) -var declRegexp = regexp.MustCompile(`^]*>`) -var cdataRegexp = regexp.MustCompile(``) -func (s *rawHTMLParser) parseSingleLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node { +var openProcessingInstruction = []byte("") +var openCDATA = []byte("") +var closeDecl = []byte(">") +var emptyComment = []byte("") +var invalidComment1 = []byte("") +var invalidComment2 = []byte("") +var openComment = []byte("") +var doubleHyphen = []byte("--") + +func (s *rawHTMLParser) parseComment(block text.Reader, pc Context) ast.Node { + savedLine, savedSegment := block.Position() + node := ast.NewRawHTML() line, segment := block.PeekLine() - match := reg.FindSubmatchIndex(line) - if match == nil { + if bytes.HasPrefix(line, emptyComment) { + node.Segments.Append(segment.WithStop(segment.Start + len(emptyComment))) + block.Advance(len(emptyComment)) + return node + } + if bytes.HasPrefix(line, invalidComment1) || bytes.HasPrefix(line, invalidComment2) { return nil } + offset := len(openComment) + line = line[offset:] + for { + hindex := bytes.Index(line, doubleHyphen) + index := bytes.Index(line, closeComment) + offset + if index > -1 && hindex == index { + if index == 0 || line[index-1] != '-' { + node.Segments.Append(segment.WithStop(segment.Start + index + len(closeComment))) + block.Advance(index + len(closeComment)) + return node + } + } + if hindex > 0 { + break + } + node.Segments.Append(segment) + block.AdvanceLine() + line, segment = block.PeekLine() + offset = 0 + if line == nil { + break + } + } + block.SetPosition(savedLine, savedSegment) + return nil +} + +func (s *rawHTMLParser) parseUntil(block text.Reader, closer []byte, pc Context) ast.Node { + savedLine, savedSegment := block.Position() node := ast.NewRawHTML() - node.Segments.Append(segment.WithStop(segment.Start + match[1])) - block.Advance(match[1]) - return node + for { + line, segment := block.PeekLine() + if line == nil { + break + } + index := bytes.Index(line, closer) + if index > -1 { + node.Segments.Append(segment.WithStop(segment.Start + index + len(closer))) + block.Advance(index + len(closer)) + return node + } + node.Segments.Append(segment) + block.AdvanceLine() + } + block.SetPosition(savedLine, savedSegment) + return nil } func (s *rawHTMLParser) parseMultiLineRegexp(reg *regexp.Regexp, block text.Reader, pc Context) ast.Node {