From f331fc07cdb0069f3f84dc7e68aa306687d9852e Mon Sep 17 00:00:00 2001 From: Kirill Date: Fri, 6 Dec 2019 18:41:53 +0100 Subject: [PATCH] Init blocks --- README.md | 106 ++++++ renderer/blocks/blocks.go | 695 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 801 insertions(+) create mode 100644 renderer/blocks/blocks.go diff --git a/README.md b/README.md index 50426bf..4696b3b 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,112 @@ goldmark goldmark is compliant with CommonMark 0.29. +Anytype +--------- + +1. Так как сейчас скелет мидла пока устаканивается, пишу утилитки для copy/paste, но не только для обычного текстового, но и для html. Когда закончу с этой библиотечкой, смартблоки чуть более подустаканятся и можно будет все разом и подключить. +2. Написал простенький playground вокруг https://github.com/JohannesKaufmann/html-to-markdown и потестил. Работает хорошо. Однако этот конвертер работает на уровне текста и замен, там нет никакого AST, переделать в html -> blocks затруднительно. Но можно его output отдавать в парсер markdown +3. Парсер markdown. Чекал goldmark: он делает AST, потом по нодам ходит и рендерит их в html строку. Пытался придумать где именно врезаться – брать ли голые ноды и с ними что-то придумывать, или модифицировать соответствующие им рендерные функции. + +В goldmark в renderer/html есть 22 рендер-функции, которые принимают ноду и io.writer и пишут туда всякие строки. + +Нужно сделать так, чтобы туда передавалась структура состояния, а не `w util.BufWriter`, и переделать эти 22 функции из записи html строк в изменение состояния. + +Пока что вложенность будем игнорировать, то есть конвертация будет без row/column. +```js + { + textBuffer: "..." + marksBuffer: []marks + } + +``` + +Что делать с renderAttributes? Один к одному не переделаешь, надо выписать список возможных атрибутов и подумать что делать с каждым. + +```go +func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) { + + for _, attr := range node.Attributes() { + _, _ = w.WriteString(" ") + _, _ = w.Write(attr.Name) + _, _ = w.WriteString(`="`) + _, _ = w.Write(util.EscapeHTML(attr.Value.([]byte))) + _ = w.WriteByte('"') + } +} +``` + +Что делать со сложно-вложенными структурами? Например: + +```html +

+ Text + + fmt.printLn("Hello world") + + +

+``` + +Типичная функция для рендеринга paragraph, попробуем ее переделать. + +**BEFORE** + +```go +func (r *Renderer) renderParagraph(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("

") + } else { + _, _ = w.WriteString("

\n") + } + return ast.WalkContinue, nil +} +``` + +**AFTER** + +```go +func (r *Renderer) renderParagraph(s RenderState, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + if s.isCurrentBlock { // если был какой-то блок открыт, то закрываем его, так как мы не поддерживаем вложенность (да ее особо и не может быть после ковертации в markdown) + _, _ = s.closeCurrentBlock(); + s.pushLastBlockToList(); + } + + _, _ = s.openNewBlock("Paragraph"); // Открываем блок + + } else { + _, _ = s.closeCurrentBlock(); + } + return ast.WalkContinue, nil +} +``` + +Есть `RenderState`, у которого примерно такие интерфейс и структура: + +```go +type rState interface {} + +type renderState struct { + isCurrentBlock bool + blockBuffer &model.Block + textBuffer string + marksBuffer *[]model.Block.Content.Text.Mark + blocksList *[]model.Block + + closeCurrentBlock func() + openNewBlock func(blockType string) + pushLastBlockToList func() +} + +``` + +А маркап может быть вложенным, значит нам нужна очередь + + Motivation ---------------------- I need a Markdown parser for Go that meets following conditions: diff --git a/renderer/blocks/blocks.go b/renderer/blocks/blocks.go new file mode 100644 index 0000000..5af5a57 --- /dev/null +++ b/renderer/blocks/blocks.go @@ -0,0 +1,695 @@ +package blocks + +import ( + "bytes" + "fmt" + "strconv" + + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/renderer" + "github.com/yuin/goldmark/util" + "github.com/anytypeio/go-anytype-library/pb/model" +) + +// A Config struct has configurations for the HTML based renderers. +type Config struct { + Writer Writer + HardWraps bool + XHTML bool + Unsafe bool +} + +// NewConfig returns a new Config with defaults. +func NewConfig() Config { + return Config{ + Writer: DefaultWriter, + HardWraps: false, + XHTML: false, + Unsafe: false, + } +} + +// SetOption implements renderer.NodeRenderer.SetOption. +func (c *Config) SetOption(name renderer.OptionName, value interface{}) { + switch name { + case optHardWraps: + c.HardWraps = value.(bool) + case optXHTML: + c.XHTML = value.(bool) + case optUnsafe: + c.Unsafe = value.(bool) + case optTextWriter: + c.Writer = value.(Writer) + } +} + +// An Option interface sets options for HTML based renderers. +type Option interface { + SetHTMLOption(*Config) +} + +// TextWriter is an option name used in WithWriter. +const optTextWriter renderer.OptionName = "Writer" + +type withWriter struct { + value Writer +} + +func (o *withWriter) SetConfig(c *renderer.Config) { + c.Options[optTextWriter] = o.value +} + +func (o *withWriter) SetHTMLOption(c *Config) { + c.Writer = o.value +} + +// WithWriter is a functional option that allow you to set the given writer to +// the renderer. +func WithWriter(writer Writer) interface { + renderer.Option + Option +} { + return &withWriter{writer} +} + +// HardWraps is an option name used in WithHardWraps. +const optHardWraps renderer.OptionName = "HardWraps" + +type withHardWraps struct { +} + +func (o *withHardWraps) SetConfig(c *renderer.Config) { + c.Options[optHardWraps] = true +} + +func (o *withHardWraps) SetHTMLOption(c *Config) { + c.HardWraps = true +} + +// WithHardWraps is a functional option that indicates whether softline breaks +// should be rendered as '
'. +func WithHardWraps() interface { + renderer.Option + Option +} { + return &withHardWraps{} +} + +// XHTML is an option name used in WithXHTML. +const optXHTML renderer.OptionName = "XHTML" + +type withXHTML struct { +} + +func (o *withXHTML) SetConfig(c *renderer.Config) { + c.Options[optXHTML] = true +} + +func (o *withXHTML) SetHTMLOption(c *Config) { + c.XHTML = true +} + +// WithXHTML is a functional option indicates that nodes should be rendered in +// xhtml instead of HTML5. +func WithXHTML() interface { + Option + renderer.Option +} { + return &withXHTML{} +} + +// Unsafe is an option name used in WithUnsafe. +const optUnsafe renderer.OptionName = "Unsafe" + +type withUnsafe struct { +} + +func (o *withUnsafe) SetConfig(c *renderer.Config) { + c.Options[optUnsafe] = true +} + +func (o *withUnsafe) SetHTMLOption(c *Config) { + c.Unsafe = true +} + +// WithUnsafe is a functional option that renders dangerous contents +// (raw htmls and potentially dangerous links) as it is. +func WithUnsafe() interface { + renderer.Option + Option +} { + return &withUnsafe{} +} + +// A Renderer struct is an implementation of renderer.NodeRenderer that renders +// nodes as (X)HTML. +type Renderer struct { + Config +} + +// NewRenderer returns a new Renderer with given options. +func NewRenderer(opts ...Option) renderer.NodeRenderer { + r := &Renderer{ + Config: NewConfig(), + } + + for _, opt := range opts { + opt.SetHTMLOption(&r.Config) + } + return r +} + +// RegisterFuncs implements NodeRenderer.RegisterFuncs . +func (r *Renderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { + // blocks + + reg.Register(ast.KindDocument, r.renderDocument) + reg.Register(ast.KindHeading, r.renderHeading) + reg.Register(ast.KindBlockquote, r.renderBlockquote) + reg.Register(ast.KindCodeBlock, r.renderCodeBlock) + reg.Register(ast.KindFencedCodeBlock, r.renderFencedCodeBlock) + reg.Register(ast.KindHTMLBlock, r.renderHTMLBlock) + reg.Register(ast.KindList, r.renderList) + reg.Register(ast.KindListItem, r.renderListItem) + reg.Register(ast.KindParagraph, r.renderParagraph) + reg.Register(ast.KindTextBlock, r.renderTextBlock) + reg.Register(ast.KindThematicBreak, r.renderThematicBreak) + + // inlines + + reg.Register(ast.KindAutoLink, r.renderAutoLink) + reg.Register(ast.KindCodeSpan, r.renderCodeSpan) + reg.Register(ast.KindEmphasis, r.renderEmphasis) + reg.Register(ast.KindImage, r.renderImage) + reg.Register(ast.KindLink, r.renderLink) + reg.Register(ast.KindRawHTML, r.renderRawHTML) + reg.Register(ast.KindText, r.renderText) + reg.Register(ast.KindString, r.renderString) +} + +type renderState struct { + isCurrentBlock bool + blockBuffer *model.Block + textBuffer string + marksBuffer []model.BlockContentTextMark + blocksList []model.Block +} + +func (rs *renderState) closeCurrentBlock() { + rs.isCurrentBlock = false; + rs.blocksList = append(rs.blocksList, *rs.blockBuffer); + rs.blockBuffer = &model.Block{}; +} + +func (rs *renderState) openNewBlock(content model.IsBlockContent) { + if rs.isCurrentBlock { + rs.closeCurrentBlock(); + } + rs.isCurrentBlock = true; + rs.blockBuffer = &model.Block{ + //Id: "3", + Content: content, + } +} + +func (rs *renderState) addTextToBuffer(text string) { + rs.textBuffer += text; +} + + +func (rs *renderState) openMark(text string) { + rs.textBuffer += text; +} + + +func (r *Renderer) writeLines(w util.BufWriter, source []byte, n ast.Node) { + l := n.Lines().Len() + for i := 0; i < l; i++ { + line := n.Lines().At(i) + r.Writer.RawWrite(w, line.Value(source)) + } +} + +func (r *Renderer) renderDocument(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + // nothing to do + return ast.WalkContinue, nil +} + +func (r *Renderer) renderHeading(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.Heading) + if entering { + _, _ = w.WriteString("') + } else { + _, _ = w.WriteString("\n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderBlockquote(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("
\n") + } else { + _, _ = w.WriteString("
\n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderCodeBlock(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("
")
+		r.writeLines(w, source, n)
+	} else {
+		_, _ = w.WriteString("
\n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderFencedCodeBlock(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.FencedCodeBlock) + if entering { + _, _ = w.WriteString("
')
+		r.writeLines(w, source, n)
+	} else {
+		_, _ = w.WriteString("
\n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderHTMLBlock(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.HTMLBlock) + if entering { + if r.Unsafe { + l := n.Lines().Len() + for i := 0; i < l; i++ { + line := n.Lines().At(i) + _, _ = w.Write(line.Value(source)) + } + } else { + _, _ = w.WriteString("\n") + } + } else { + if n.HasClosure() { + if r.Unsafe { + closure := n.ClosureLine + _, _ = w.Write(closure.Value(source)) + } else { + _, _ = w.WriteString("\n") + } + } + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderList(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.List) + tag := "ul" + if n.IsOrdered() { + tag = "ol" + } + if entering { + _ = w.WriteByte('<') + _, _ = w.WriteString(tag) + if n.IsOrdered() && n.Start != 1 { + fmt.Fprintf(w, " start=\"%d\">\n", n.Start) + } else { + _, _ = w.WriteString(">\n") + } + } else { + _, _ = w.WriteString("\n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderListItem(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("
  • ") + fc := n.FirstChild() + if fc != nil { + if _, ok := fc.(*ast.TextBlock); !ok { + _ = w.WriteByte('\n') + } + } + } else { + _, _ = w.WriteString("
  • \n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderParagraph(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("

    ") + } else { + _, _ = w.WriteString("

    \n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderTextBlock(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + if _, ok := n.NextSibling().(ast.Node); ok && n.FirstChild() != nil { + _ = w.WriteByte('\n') + } + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderThematicBreak(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + if r.XHTML { + _, _ = w.WriteString("
    \n") + } else { + _, _ = w.WriteString("
    \n") + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderAutoLink(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.AutoLink) + if !entering { + return ast.WalkContinue, nil + } + _, _ = w.WriteString(``) + _, _ = w.Write(util.EscapeHTML(label)) + _, _ = w.WriteString(``) + return ast.WalkContinue, nil +} + +func (r *Renderer) renderCodeSpan(w util.BufWriter, source []byte, n ast.Node, entering bool) (ast.WalkStatus, error) { + if entering { + _, _ = w.WriteString("") + for c := n.FirstChild(); c != nil; c = c.NextSibling() { + segment := c.(*ast.Text).Segment + value := segment.Value(source) + if bytes.HasSuffix(value, []byte("\n")) { + r.Writer.RawWrite(w, value[:len(value)-1]) + if c != n.LastChild() { + r.Writer.RawWrite(w, []byte(" ")) + } + } else { + r.Writer.RawWrite(w, value) + } + } + return ast.WalkSkipChildren, nil + } + _, _ = w.WriteString("") + return ast.WalkContinue, nil +} + +func (r *Renderer) renderEmphasis(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.Emphasis) + tag := "em" + if n.Level == 2 { + tag = "strong" + } + if entering { + _ = w.WriteByte('<') + _, _ = w.WriteString(tag) + _ = w.WriteByte('>') + } else { + _, _ = w.WriteString("') + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderLink(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + n := node.(*ast.Link) + if entering { + _, _ = w.WriteString("') + } else { + _, _ = w.WriteString("") + } + return ast.WalkContinue, nil +} +func (r *Renderer) renderImage(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + n := node.(*ast.Image) + _, _ = w.WriteString("`)
+	_, _ = w.Write(n.Text(source))
+	_ = w.WriteByte('") + } else { + _, _ = w.WriteString(">") + } + return ast.WalkSkipChildren, nil +} + +func (r *Renderer) renderRawHTML(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkSkipChildren, nil + } + if r.Unsafe { + n := node.(*ast.RawHTML) + l := n.Segments.Len() + for i := 0; i < l; i++ { + segment := n.Segments.At(i) + _, _ = w.Write(segment.Value(source)) + } + return ast.WalkSkipChildren, nil + } + _, _ = w.WriteString("") + return ast.WalkSkipChildren, nil +} + +func (r *Renderer) renderText(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + n := node.(*ast.Text) + segment := n.Segment + if n.IsRaw() { + r.Writer.RawWrite(w, segment.Value(source)) + } else { + r.Writer.Write(w, segment.Value(source)) + if n.HardLineBreak() || (n.SoftLineBreak() && r.HardWraps) { + if r.XHTML { + _, _ = w.WriteString("
    \n") + } else { + _, _ = w.WriteString("
    \n") + } + } else if n.SoftLineBreak() { + _ = w.WriteByte('\n') + } + } + return ast.WalkContinue, nil +} + +func (r *Renderer) renderString(w util.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkContinue, nil + } + n := node.(*ast.String) + if n.IsCode() { + _, _ = w.Write(n.Value) + } else { + if n.IsRaw() { + r.Writer.RawWrite(w, n.Value) + } else { + r.Writer.Write(w, n.Value) + } + } + return ast.WalkContinue, nil +} + +// RenderAttributes renders given node's attributes. +func (r *Renderer) RenderAttributes(w util.BufWriter, node ast.Node) { + + for _, attr := range node.Attributes() { + _, _ = w.WriteString(" ") + _, _ = w.Write(attr.Name) + _, _ = w.WriteString(`="`) + _, _ = w.Write(util.EscapeHTML(attr.Value.([]byte))) + _ = w.WriteByte('"') + } +} + +// A Writer interface wirtes textual contents to a writer. +type Writer interface { + // Write writes the given source to writer with resolving references and unescaping + // backslash escaped characters. + Write(writer util.BufWriter, source []byte) + + // RawWrite wirtes the given source to writer without resolving references and + // unescaping backslash escaped characters. + RawWrite(writer util.BufWriter, source []byte) +} + +type defaultWriter struct { +} + +func escapeRune(writer util.BufWriter, r rune) { + if r < 256 { + v := util.EscapeHTMLByte(byte(r)) + if v != nil { + _, _ = writer.Write(v) + return + } + } + _, _ = writer.WriteRune(util.ToValidRune(r)) +} + +func (d *defaultWriter) RawWrite(writer util.BufWriter, source []byte) { + n := 0 + l := len(source) + for i := 0; i < l; i++ { + v := util.EscapeHTMLByte(source[i]) + if v != nil { + _, _ = writer.Write(source[i-n : i]) + n = 0 + _, _ = writer.Write(v) + continue + } + n++ + } + if n != 0 { + _, _ = writer.Write(source[l-n:]) + } +} + +func (d *defaultWriter) Write(writer util.BufWriter, source []byte) { + escaped := false + var ok bool + limit := len(source) + n := 0 + for i := 0; i < limit; i++ { + c := source[i] + if escaped { + if util.IsPunct(c) { + d.RawWrite(writer, source[n:i-1]) + n = i + escaped = false + continue + } + } + if c == '&' { + pos := i + next := i + 1 + if next < limit && source[next] == '#' { + nnext := next + 1 + if nnext < limit { + nc := source[nnext] + // code point like #x22; + if nnext < limit && nc == 'x' || nc == 'X' { + start := nnext + 1 + i, ok = util.ReadWhile(source, [2]int{start, limit}, util.IsHexDecimal) + if ok && i < limit && source[i] == ';' { + v, _ := strconv.ParseUint(util.BytesToReadOnlyString(source[start:i]), 16, 32) + d.RawWrite(writer, source[n:pos]) + n = i + 1 + escapeRune(writer, rune(v)) + continue + } + // code point like #1234; + } else if nc >= '0' && nc <= '9' { + start := nnext + i, ok = util.ReadWhile(source, [2]int{start, limit}, util.IsNumeric) + if ok && i < limit && i-start < 8 && source[i] == ';' { + v, _ := strconv.ParseUint(util.BytesToReadOnlyString(source[start:i]), 0, 32) + d.RawWrite(writer, source[n:pos]) + n = i + 1 + escapeRune(writer, rune(v)) + continue + } + } + } + } else { + start := next + i, ok = util.ReadWhile(source, [2]int{start, limit}, util.IsAlphaNumeric) + // entity reference + if ok && i < limit && source[i] == ';' { + name := util.BytesToReadOnlyString(source[start:i]) + entity, ok := util.LookUpHTML5EntityByName(name) + if ok { + d.RawWrite(writer, source[n:pos]) + n = i + 1 + d.RawWrite(writer, entity.Characters) + continue + } + } + } + i = next - 1 + } + if c == '\\' { + escaped = true + continue + } + escaped = false + } + d.RawWrite(writer, source[n:]) +} + +// DefaultWriter is a default implementation of the Writer. +var DefaultWriter = &defaultWriter{} + +var bDataImage = []byte("data:image/") +var bPng = []byte("png;") +var bGif = []byte("gif;") +var bJpeg = []byte("jpeg;") +var bWebp = []byte("webp;") +var bJs = []byte("javascript:") +var bVb = []byte("vbscript:") +var bFile = []byte("file:") +var bData = []byte("data:") + +// IsDangerousURL returns true if the given url seems a potentially dangerous url, +// otherwise false. +func IsDangerousURL(url []byte) bool { + if bytes.HasPrefix(url, bDataImage) && len(url) >= 11 { + v := url[11:] + if bytes.HasPrefix(v, bPng) || bytes.HasPrefix(v, bGif) || + bytes.HasPrefix(v, bJpeg) || bytes.HasPrefix(v, bWebp) { + return false + } + return true + } + return bytes.HasPrefix(url, bJs) || bytes.HasPrefix(url, bVb) || + bytes.HasPrefix(url, bFile) || bytes.HasPrefix(url, bData) +}