// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5package source
6
7import (
8	"bytes"
9	"io"
10	"regexp"
11	"strings"
12	"unicode"
13	"unicode/utf8"
14)
15
16// CommentToMarkdown converts comment text to formatted markdown.
17// The comment was prepared by DocReader,
18// so it is known not to have leading, trailing blank lines
19// nor to have trailing spaces at the end of lines.
20// The comment markers have already been removed.
21//
22// Each line is converted into a markdown line and empty lines are just converted to
23// newlines. Heading are prefixed with `### ` to make it a markdown heading.
24//
25// A span of indented lines retains a 4 space prefix block, with the common indent
26// prefix removed unless empty, in which case it will be converted to a newline.
27//
28// URLs in the comment text are converted into links.
29func CommentToMarkdown(text string) string {
30	buf := &bytes.Buffer{}
31	commentToMarkdown(buf, text)
32	return buf.String()
33}
34
// Markdown fragments that are written verbatim to the output.
var (
	mdNewline   = []byte("\n")   // line terminator and block separator
	mdHeader    = []byte("### ") // prefix of a heading line
	mdIndent    = []byte("    ") // prefix of a preformatted line
	mdLinkStart = []byte("[")    // opens the text of a link
	mdLinkDiv   = []byte("](")   // separates the link text from the link target
	mdLinkEnd   = []byte(")")    // closes the link target
)
43
44func commentToMarkdown(w io.Writer, text string) {
45	isFirstLine := true
46	for _, b := range blocks(text) {
47		switch b.op {
48		case opPara:
49			if !isFirstLine {
50				w.Write(mdNewline)
51			}
52
53			for _, line := range b.lines {
54				emphasize(w, line, true)
55			}
56		case opHead:
57			if !isFirstLine {
58				w.Write(mdNewline)
59			}
60			w.Write(mdNewline)
61
62			for _, line := range b.lines {
63				w.Write(mdHeader)
64				commentEscape(w, line, true)
65				w.Write(mdNewline)
66			}
67		case opPre:
68			if !isFirstLine {
69				w.Write(mdNewline)
70			}
71			w.Write(mdNewline)
72
73			for _, line := range b.lines {
74				if isBlank(line) {
75					w.Write(mdNewline)
76				} else {
77					w.Write(mdIndent)
78					w.Write([]byte(line))
79					w.Write(mdNewline)
80				}
81			}
82		}
83		isFirstLine = false
84	}
85}
86
const (
	// ulquo and urquo are the typographic double quotes that replace the
	// `` and '' digraphs in comment text.
	ulquo = "“"
	urquo = "”"
)

var (
	// markdownEscape matches, one character at a time, every character that
	// escapeRegex prefixes with a backslash. \x60 is the backquote.
	markdownEscape = regexp.MustCompile(`([\\\x60*{}[\]()#+\-.!_>~|"$%&'\/:;<=?@^])`)

	// unicodeQuoteReplacer turns `` into ulquo and '' into urquo.
	unicodeQuoteReplacer = strings.NewReplacer("``", ulquo, "''", urquo)
)
97
98// commentEscape escapes comment text for markdown. If nice is set,
99// also turn `` into “; and '' into ”;.
100func commentEscape(w io.Writer, text string, nice bool) {
101	if nice {
102		text = convertQuotes(text)
103	}
104	text = escapeRegex(text)
105	w.Write([]byte(text))
106}
107
108func convertQuotes(text string) string {
109	return unicodeQuoteReplacer.Replace(text)
110}
111
112func escapeRegex(text string) string {
113	return markdownEscape.ReplaceAllString(text, `\$1`)
114}
115
116func emphasize(w io.Writer, line string, nice bool) {
117	for {
118		m := matchRx.FindStringSubmatchIndex(line)
119		if m == nil {
120			break
121		}
122		// m >= 6 (two parenthesized sub-regexps in matchRx, 1st one is urlRx)
123
124		// write text before match
125		commentEscape(w, line[0:m[0]], nice)
126
127		// adjust match for URLs
128		match := line[m[0]:m[1]]
129		if strings.Contains(match, "://") {
130			m0, m1 := m[0], m[1]
131			for _, s := range []string{"()", "{}", "[]"} {
132				open, close := s[:1], s[1:] // E.g., "(" and ")"
133				// require opening parentheses before closing parentheses (#22285)
134				if i := strings.Index(match, close); i >= 0 && i < strings.Index(match, open) {
135					m1 = m0 + i
136					match = line[m0:m1]
137				}
138				// require balanced pairs of parentheses (#5043)
139				for i := 0; strings.Count(match, open) != strings.Count(match, close) && i < 10; i++ {
140					m1 = strings.LastIndexAny(line[:m1], s)
141					match = line[m0:m1]
142				}
143			}
144			if m1 != m[1] {
145				// redo matching with shortened line for correct indices
146				m = matchRx.FindStringSubmatchIndex(line[:m[0]+len(match)])
147			}
148		}
149
150		// Following code has been modified from go/doc since words is always
151		// nil. All html formatting has also been transformed into markdown formatting
152
153		// analyze match
154		url := ""
155		if m[2] >= 0 {
156			url = match
157		}
158
159		// write match
160		if len(url) > 0 {
161			w.Write(mdLinkStart)
162		}
163
164		commentEscape(w, match, nice)
165
166		if len(url) > 0 {
167			w.Write(mdLinkDiv)
168			w.Write([]byte(urlReplacer.Replace(url)))
169			w.Write(mdLinkEnd)
170		}
171
172		// advance
173		line = line[m[1]:]
174	}
175	commentEscape(w, line, nice)
176}
177
// Everything from here on is a copy of go/doc/comment.go
179
const (
	// Regexp for Go identifiers
	identRx = `[\pL_][\pL_0-9]*`

	// Regexp for URLs
	// Match parens, and check later for balance - see #5043, #22285
	// Match .,:;?! within path, but not at end - see #18139, #16565
	// This excludes some rare yet valid urls ending in common punctuation
	// in order to allow sentences ending in URLs.

	// protocol (required) e.g. http
	protoPart = `(https?|ftp|file|gopher|mailto|nntp)`
	// host (required) e.g. www.example.com or [::1]:8080
	hostPart = `([a-zA-Z0-9_@\-.\[\]:]+)`
	// path+query+fragment (optional) e.g. /path/index.html?q=foo#bar
	pathPart = `([.,:;?!]*[a-zA-Z0-9$'()*+&#=@~_/\-\[\]%])*`

	urlRx = protoPart + `://` + hostPart + pathPart
)

var (
	// matchRx matches either a URL (first parenthesized sub-regexp) or a Go
	// identifier (last parenthesized sub-regexp).
	matchRx = regexp.MustCompile(`(` + urlRx + `)|(` + identRx + `)`)

	// urlReplacer puts a backslash in front of every parenthesis.
	urlReplacer = strings.NewReplacer(`(`, `\(`, `)`, `\)`)
)
204
// indentLen returns the number of leading space and tab bytes of s.
func indentLen(s string) int {
	return len(s) - len(strings.TrimLeft(s, " \t"))
}
212
// isBlank reports whether s is empty or consists of a single newline.
func isBlank(s string) bool {
	return s == "" || s == "\n"
}
216
// commonPrefix returns the longest byte prefix shared by a and b.
func commonPrefix(a, b string) string {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for n := 0; n < limit; n++ {
		if a[n] != b[n] {
			return a[:n]
		}
	}
	return a[:limit]
}
224
225func unindent(block []string) {
226	if len(block) == 0 {
227		return
228	}
229
230	// compute maximum common white prefix
231	prefix := block[0][0:indentLen(block[0])]
232	for _, line := range block {
233		if !isBlank(line) {
234			prefix = commonPrefix(prefix, line[0:indentLen(line)])
235		}
236	}
237	n := len(prefix)
238
239	// remove
240	for i, line := range block {
241		if !isBlank(line) {
242			block[i] = line[n:]
243		}
244	}
245}
246
// heading returns the trimmed line if it passes as a section heading;
// otherwise it returns the empty string.
//
// A heading starts with an uppercase letter, ends in a letter or digit,
// contains none of the characters rejected below, uses "'" only in a
// possessive "'s" that ends the line or is followed by a space, and uses "."
// only when a non-space character follows it.
func heading(line string) string {
	trimmed := strings.TrimSpace(line)
	if trimmed == "" {
		return ""
	}

	// a heading must start with an uppercase letter
	first, _ := utf8.DecodeRuneInString(trimmed)
	if !(unicode.IsLetter(first) && unicode.IsUpper(first)) {
		return ""
	}

	// it must end in a letter or digit:
	last, _ := utf8.DecodeLastRuneInString(trimmed)
	if !(unicode.IsLetter(last) || unicode.IsDigit(last)) {
		return ""
	}

	// exclude lines with illegal characters. we allow "(),"
	if strings.ContainsAny(trimmed, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
		return ""
	}

	// allow "'" for possessive "'s" only
	for rest := trimmed; ; {
		pos := strings.IndexByte(rest, '\'')
		if pos < 0 {
			break
		}
		tail := rest[pos+1:] // text after the apostrophe
		if !strings.HasPrefix(tail, "s") || (len(tail) > 1 && tail[1] != ' ') {
			return "" // not followed by "s "
		}
		rest = tail[1:]
	}

	// allow "." when followed by non-space
	for rest := trimmed; ; {
		pos := strings.IndexByte(rest, '.')
		if pos < 0 {
			break
		}
		tail := rest[pos+1:] // text after the period
		if tail == "" || tail[0] == ' ' {
			return "" // not followed by non-space
		}
		rest = tail
	}

	return trimmed
}
298
// op identifies the kind of a block.
type op int

const (
	opPara op = iota // paragraph
	opHead           // heading
	opPre            // preformatted (indented) text
)

// block is a run of comment lines that share one op.
type block struct {
	op    op
	lines []string
}
311
312func blocks(text string) []block {
313	var (
314		out  []block
315		para []string
316
317		lastWasBlank   = false
318		lastWasHeading = false
319	)
320
321	close := func() {
322		if para != nil {
323			out = append(out, block{opPara, para})
324			para = nil
325		}
326	}
327
328	lines := strings.SplitAfter(text, "\n")
329	unindent(lines)
330	for i := 0; i < len(lines); {
331		line := lines[i]
332		if isBlank(line) {
333			// close paragraph
334			close()
335			i++
336			lastWasBlank = true
337			continue
338		}
339		if indentLen(line) > 0 {
340			// close paragraph
341			close()
342
343			// count indented or blank lines
344			j := i + 1
345			for j < len(lines) && (isBlank(lines[j]) || indentLen(lines[j]) > 0) {
346				j++
347			}
348			// but not trailing blank lines
349			for j > i && isBlank(lines[j-1]) {
350				j--
351			}
352			pre := lines[i:j]
353			i = j
354
355			unindent(pre)
356
357			// put those lines in a pre block
358			out = append(out, block{opPre, pre})
359			lastWasHeading = false
360			continue
361		}
362
363		if lastWasBlank && !lastWasHeading && i+2 < len(lines) &&
364			isBlank(lines[i+1]) && !isBlank(lines[i+2]) && indentLen(lines[i+2]) == 0 {
365			// current line is non-blank, surrounded by blank lines
366			// and the next non-blank line is not indented: this
367			// might be a heading.
368			if head := heading(line); head != "" {
369				close()
370				out = append(out, block{opHead, []string{head}})
371				i += 2
372				lastWasHeading = true
373				continue
374			}
375		}
376
377		// open paragraph
378		lastWasBlank = false
379		lastWasHeading = false
380		para = append(para, lines[i])
381		i++
382	}
383	close()
384
385	return out
386}
387