lsp/source/comment.go

package source

import (
	"bytes"
	"io"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// CommentToMarkdown converts comment text to formatted markdown.
// The comment was prepared by DocReader,
// so it is known not to have leading, trailing blank lines
// nor to have trailing spaces at the end of lines.
// The comment markers have already been removed.
//
// Each line is converted into a markdown line and empty lines are just converted to
// newlines. Heading are prefixed with `### ` to make it a markdown heading.
//
// A span of indented lines retains a 4 space prefix block, with the common indent
// prefix removed unless empty, in which case it will be converted to a newline.
//
// URLs in the comment text are converted into links.
func CommentToMarkdown(text string) string {
	buf := &bytes.Buffer{}
	commentToMarkdown(buf, text)
	return buf.String()
}

var (
	mdNewline   = []byte("\n")
	mdHeader    = []byte("### ")
	mdIndent    = []byte("&nbsp;&nbsp;&nbsp;&nbsp;")
	mdLinkStart = []byte("[")
	mdLinkDiv   = []byte("](")
	mdLinkEnd   = []byte(")")
)

func commentToMarkdown(w io.Writer, text string) {
	isFirstLine := true
	for _, b := range blocks(text) {
		switch b.op {
		case opPara:
			if !isFirstLine {
				w.Write(mdNewline)
			}

			for _, line := range b.lines {
				emphasize(w, line, true)
			}
		case opHead:
			if !isFirstLine {
				w.Write(mdNewline)
			}
			w.Write(mdNewline)

			for _, line := range b.lines {
				w.Write(mdHeader)
				commentEscape(w, line, true)
				w.Write(mdNewline)
			}
		case opPre:
			if !isFirstLine {
				w.Write(mdNewline)
			}
			w.Write(mdNewline)

			for _, line := range b.lines {
				if isBlank(line) {
					w.Write(mdNewline)
				} else {
					w.Write(mdIndent)
					w.Write([]byte(line))
					w.Write(mdNewline)
				}
			}
		}
		isFirstLine = false
	}
}

const (
	ulquo = "“"
	urquo = "”"
)

var (
	markdownEscape = regexp.MustCompile(`([\\\x60*{}[\]()#+\-.!_>~|"$%&'\/:;<=?@^])`)

	unicodeQuoteReplacer = strings.NewReplacer("``", ulquo, "''", urquo)
)

// commentEscape escapes comment text for markdown. If nice is set,
// also turn `` into “; and '' into ”;.
func commentEscape(w io.Writer, text string, nice bool) {
	if nice {
		text = convertQuotes(text)
	}
	text = escapeRegex(text)
	w.Write([]byte(text))
}

func convertQuotes(text string) string {
	return unicodeQuoteReplacer.Replace(text)
}

func escapeRegex(text string) string {
	return markdownEscape.ReplaceAllString(text, `\$1`)
}

func emphasize(w io.Writer, line string, nice bool) {
	for {
		m := matchRx.FindStringSubmatchIndex(line)
		if m == nil {
			break
		}
		// m >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx)

		// write text before match
		commentEscape(w, line[0:m[0]], nice)

		// adjust match for URLs
		match := line[m[0]:m[1]]
		if strings.Contains(match, "://") {
			m0, m1 := m[0], m[1]
			for _, s := range []string{"()", "{}", "[]"} {
				open, close := s[:1], s[1:] // E.g., "(" and ")"
				// require opening parentheses before closing parentheses (#22285)
				if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
					m1 = m0 + i
					match = line[m0:m1]
				}
				// require balanced pairs of parentheses (#5043)
				for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
					m1 = strings.LastIndexAny(line[:m1], s)
					match = line[m0:m1]
				}
			}
			if m1 != m[1] {
				// redo matching with shortened line for correct indices
				m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
			}
		}

		// Following code has been modified from go/doc since words is always
		// nil. All html formatting has also been transformed into markdown formatting

		// analyze match
		url := ""
		if m[2] >= 0 {
			url = match
		}

		// write match
		if len(url) > 0 {
			w.Write(mdLinkStart)
		}

		commentEscape(w, match, nice)

		if len(url) > 0 {
			w.Write(mdLinkDiv)
			w.Write([]byte(urlReplacer.Replace(url)))
			w.Write(mdLinkEnd)
		}

		// advance
		line = line[m[1]:]
	}
	commentEscape(w, line, nice)
}

// Everything from here on is a copy of go/doc/comment.go

const (
	// Regexp for Go identifiers
	identRx = `[\pL_][\pL_0-9]*`

	// Regexp for URLs
	// Match parens, and check later for balance - see #5043, #22285
	// Match .,:;?! within path, but not at end - see #18139, #16565
	// This excludes some rare yet valid urls ending in common punctuation
	// in order to allow sentences ending in URLs.

	// protocol (required) e.g. http
	protoPart = `(https?|ftp|file|gopher|mailto|nntp)`
	// host (required) e.g. www.example.com or [::1]:8080
	hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)`
	// path+query+fragment (optional) e.g. /path/index.html?q=foo#bar
	pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*`

	urlRx = protoPart + `://` + hostPart + pathPart
)

var (
	matchRx     = regexp.MustCompile(`(` + urlRx + `)|(` + identRx + `)`)
	urlReplacer = strings.NewReplacer(`(`, `\(`, `)`, `\)`)
)

func indentLen(s string) int {
	i := 0
	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
		i++
	}
	return i
}

func isBlank(s string) bool {
	return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
}

func commonPrefix(a, b string) string {
	i := 0
	for i < len(a) && i < len(b) && a[i] == b[i] {
		i++
	}
	return a[0:i]
}

func unindent(block []string) {
	if len(block) == 0 {
		return
	}

	// compute maximum common white prefix
	prefix := block[0][0:indentLen(block[0])]
	for _, line := range block {
		if !isBlank(line) {
			prefix = commonPrefix(prefix, line[0:indentLen(line)])
		}
	}
	n := len(prefix)

	// remove
	for i, line := range block {
		if !isBlank(line) {
			block[i] = line[n:]
		}
	}
}

// heading returns the trimmed line if it passes as a section heading;
// otherwise it returns the empty string.
func heading(line string) string {
	line = strings.TrimSpace(line)
	if len(line) == 0 {
		return ""
	}

	// a heading must start with an uppercase letter
	r, _ := utf8.DecodeRuneInString(line)
	if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
		return ""
	}

	// it must end in a letter or digit:
	r, _ = utf8.DecodeLastRuneInString(line)
	if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
		return ""
	}

	// exclude lines with illegal characters. we allow "(),"
	if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
		return ""
	}

	// allow "'" for possessive "'s" only
	for b := line; ; {
		i := strings.IndexRune(b, '\'')
		if i < 0 {
			break
		}
		if i+1 >= len(b) || b[i+1] != 's' || (i+2 < len(b) && b[i+2] != ' ') {
			return "" // not followed by "s "
		}
		b = b[i+2:]
	}

	// allow "." when followed by non-space
	for b := line; ; {
		i := strings.IndexRune(b, '.')
		if i < 0 {
			break
		}
		if i+1 >= len(b) || b[i+1] == ' ' {
			return "" // not followed by non-space
		}
		b = b[i+1:]
	}

	return line
}

type op int

const (
	opPara op = iota
	opHead
	opPre
)

type block struct {
	op    op
	lines []string
}

var nonAlphaNumRx = regexp.MustCompile(`[^a-zA-Z0-9]`)

func anchorID(line string) string {
	// Add a "hdr-" prefix to avoid conflicting with IDs used for package symbols.
	return "hdr-" + nonAlphaNumRx.ReplaceAllString(line, "_")
}

func blocks(text string) []block {
	var (
		out  []block
		para []string

		lastWasBlank   = false
		lastWasHeading = false
	)

	close := func() {
		if para != nil {
			out = append(out, block{opPara, para})
			para = nil
		}
	}

	lines := strings.SplitAfter(text, "\n")
	unindent(lines)
	for i := 0; i < len(lines); {
		line := lines[i]
		if isBlank(line) {
			// close paragraph
			close()
			i++
			lastWasBlank = true
			continue
		}
		if indentLen(line) > 0 {
			// close paragraph
			close()

			// count indented or blank lines
			j := i + 1
			for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) {
				j++
			}
			// but not trailing blank lines
			for j > i && isBlank(lines[j-1]) {
				j--
			}
			pre := lines[i:j]
			i = j

			unindent(pre)

			// put those lines in a pre block
			out = append(out, block{opPre, pre})
			lastWasHeading = false
			continue
		}

		if lastWasBlank && !lastWasHeading && i+2 < len(lines) &&
			isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 {
			// current line is non-blank, surrounded by blank lines
			// and the next non-blank line is not indented: this
			// might be a heading.
			if head := heading(line); head != "" {
				close()
				out = append(out, block{opHead, []string{head}})
				i += 2
				lastWasHeading = true
				continue
			}
		}

		// open paragraph
		lastWasBlank = false
		lastWasHeading = false
		para = append(para, lines[i])
		i++
	}
	close()

	return out
}