diff --git a/extension/linkify.go b/extension/linkify.go index 62c41cc..f0544bf 100644 --- a/extension/linkify.go +++ b/extension/linkify.go @@ -14,8 +14,6 @@ var wwwURLRegxp = regexp.MustCompile(`^www\.[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z] var urlRegexp = regexp.MustCompile(`^(?:http|https|ftp):\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=\(\);]*)`) -var emailRegexp = regexp.MustCompile(`^[a-zA-Z0-9\.\-_\+]+@([a-zA-Z0-9\.\-_]+)`) - type linkifyParser struct { } @@ -32,6 +30,11 @@ func (s *linkifyParser) Trigger() []byte { return []byte{' ', '*', '_', '~', '('} } +var protoHTTP = []byte("http:") +var protoHTTPS = []byte("https:") +var protoFTP = []byte("ftp:") +var domainWWW = []byte("www.") + func (s *linkifyParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { line, segment := block.PeekLine() consumes := 0 @@ -47,8 +50,10 @@ func (s *linkifyParser) Parse(parent ast.Node, block text.Reader, pc parser.Cont var m []int typ := ast.AutoLinkType(ast.AutoLinkEmail) typ = ast.AutoLinkURL - m = urlRegexp.FindSubmatchIndex(line) - if m == nil { + if bytes.HasPrefix(line, protoHTTP) || bytes.HasPrefix(line, protoHTTPS) || bytes.HasPrefix(line, protoFTP) { + m = urlRegexp.FindSubmatchIndex(line) + } + if m == nil && bytes.HasPrefix(line, domainWWW) { m = wwwURLRegxp.FindSubmatchIndex(line) } if m != nil { @@ -84,15 +89,24 @@ func (s *linkifyParser) Parse(parent ast.Node, block text.Reader, pc parser.Cont } if m == nil { typ = ast.AutoLinkEmail - m = emailRegexp.FindSubmatchIndex(line) + stop := util.FindEmailIndex(line) + if stop < 0 { + return nil + } + at := bytes.IndexByte(line, '@') + m = []int{0, stop, at, stop - 1} if m == nil || bytes.IndexByte(line[m[2]:m[3]], '.') < 0 { return nil } lastChar := line[m[1]-1] if lastChar == '.' { m[1]-- - } else if lastChar == '-' || lastChar == '_' { - return nil + } + if m[1] < len(line) { + nextChar := line[m[1]] + if nextChar == '-' || nextChar == '_' { + return nil + } } } if m == nil { diff --git a/parser/auto_link.go b/parser/auto_link.go index 6972680..726a505 100644 --- a/parser/auto_link.go +++ b/parser/auto_link.go @@ -3,7 +3,7 @@ package parser import ( "github.com/yuin/goldmark/ast" "github.com/yuin/goldmark/text" - "regexp" + "github.com/yuin/goldmark/util" ) type autoLinkParser struct { @@ -21,22 +21,22 @@ func (s *autoLinkParser) Trigger() []byte { return []byte{'<'} } -var emailAutoLinkRegexp = regexp.MustCompile(`^<([a-zA-Z0-9.!#$%&'*+\/=?^_` + "`" + `{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>`) - -var autoLinkRegexp = regexp.MustCompile(`(?i)^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>`) - func (s *autoLinkParser) Parse(parent ast.Node, block text.Reader, pc Context) ast.Node { line, segment := block.PeekLine() - match := emailAutoLinkRegexp.FindSubmatchIndex(line) + stop := util.FindEmailIndex(line[1:]) typ := ast.AutoLinkType(ast.AutoLinkEmail) - if match == nil { - match = autoLinkRegexp.FindSubmatchIndex(line) + if stop < 0 { + stop = util.FindURLIndex(line[1:]) typ = ast.AutoLinkURL } - if match == nil { + if stop < 0 { return nil } - value := ast.NewTextSegment(text.NewSegment(segment.Start+1, segment.Start+match[1]-1)) - block.Advance(match[1]) + stop++ + if stop >= len(line) || line[stop] != '>' { + return nil + } + value := ast.NewTextSegment(text.NewSegment(segment.Start+1, segment.Start+stop)) + block.Advance(stop + 1) return ast.NewAutoLink(typ, value) } diff --git a/parser/setext_headings.go b/parser/setext_headings.go index 4cc5b87..ae12b6a 100644 --- a/parser/setext_headings.go +++ b/parser/setext_headings.go @@ -27,7 +27,9 @@ func matchesSetextHeadingBar(line []byte) (byte, bool) { level2 = util.TrimLeftLength(line[start:end], []byte{'-'}) c = '-' } - end -= util.TrimRightSpaceLength(line[start:end]) + if util.IsSpace(line[end-1]) { + end -= util.TrimRightSpaceLength(line[start:end]) + } if !((level1 > 0 && start+level1 == end) || (level2 > 0 && start+level2 == end)) { return 0, false } diff --git a/util/util.go b/util/util.go index 43a1226..db2d4aa 100644 --- a/util/util.go +++ b/util/util.go @@ -5,6 +5,7 @@ import ( "bytes" "io" "net/url" + "regexp" "sort" "strconv" "strings" @@ -291,12 +292,28 @@ func TrimRightLength(source, s []byte) int { // TrimLeftSpaceLength returns a length of leading space characters. func TrimLeftSpaceLength(source []byte) int { - return TrimLeftLength(source, spaces) + i := 0 + for ; i < len(source); i++ { + if !IsSpace(source[i]) { + break + } + } + return i } // TrimRightSpaceLength returns a length of trailing space characters. func TrimRightSpaceLength(source []byte) int { - return TrimRightLength(source, spaces) + l := len(source) + i := l - 1 + for ; i >= 0; i-- { + if !IsSpace(source[i]) { + break + } + } + if i < 0 { + return l + } + return l - 1 - i } // TrimLeftSpace returns a subslice of the given string by slicing off all leading @@ -722,6 +739,65 @@ func FindHTMLAttributeIndex(b []byte, canEscapeQuotes bool) [4]int { return result } +// FindURLIndex returns a stop index value if the given bytes seem an URL. +// This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* . +func FindURLIndex(b []byte) int { + i := 0 + if !(len(b) > 0 && urlTable[b[i]]&7 == 7) { + return -1 + } + i++ + for ; i < len(b); i++ { + c := b[i] + if urlTable[c]&4 != 4 { + break + } + } + if i == 1 || i > 33 || i >= len(b) { + return -1 + } + if b[i] != ':' { + return -1 + } + i++ + for ; i < len(b); i++ { + c := b[i] + if urlTable[c]&1 != 1 { + break + } + } + return i +} + +var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`) + +// FindEmailIndex returns a stop index value if the given bytes seem an email address. +func FindEmailIndex(b []byte) int { + // TODO: eliminate regexps + i := 0 + for ; i < len(b); i++ { + c := b[i] + if emailTable[c]&1 != 1 { + break + } + } + if i == 0 { + return -1 + } + if i >= len(b) || b[i] != '@' { + return -1 + } + i++ + if i >= len(b) { + return -1 + } + match := emailDomainRegexp.FindSubmatchIndex(b[i:]) + if match == nil { + return -1 + } + return i + match[1] +} + var spaces = []byte(" \t\n\x0b\x0c\x0d") var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} @@ -733,6 +809,10 @@ var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99} +var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + +var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} + // UTF8Len returns a byte length of the utf-8 character. func UTF8Len(b byte) int8 { return utf8lenTable[b]