1// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
2// See LICENSE.txt for license information.
3
4package markdown
5
6import (
7	"container/list"
8	"strings"
9	"unicode"
10	"unicode/utf8"
11)
12
13type Inline interface {
14	IsInline() bool
15}
16
17type inlineBase struct{}
18
19func (inlineBase) IsInline() bool { return true }
20
21type Text struct {
22	inlineBase
23
24	Text  string
25	Range Range
26}
27
28type CodeSpan struct {
29	inlineBase
30
31	Code string
32}
33
34type HardLineBreak struct {
35	inlineBase
36}
37
38type SoftLineBreak struct {
39	inlineBase
40}
41
42type InlineLinkOrImage struct {
43	inlineBase
44
45	Children []Inline
46
47	RawDestination Range
48
49	markdown string
50	rawTitle string
51}
52
53func (i *InlineLinkOrImage) Destination() string {
54	return Unescape(i.markdown[i.RawDestination.Position:i.RawDestination.End])
55}
56
57func (i *InlineLinkOrImage) Title() string {
58	return Unescape(i.rawTitle)
59}
60
61type InlineLink struct {
62	InlineLinkOrImage
63}
64
65type InlineImage struct {
66	InlineLinkOrImage
67}
68
69type ReferenceLinkOrImage struct {
70	inlineBase
71	*ReferenceDefinition
72
73	Children []Inline
74}
75
76type ReferenceLink struct {
77	ReferenceLinkOrImage
78}
79
80type ReferenceImage struct {
81	ReferenceLinkOrImage
82}
83
84type Autolink struct {
85	inlineBase
86
87	Children []Inline
88
89	RawDestination Range
90
91	markdown string
92}
93
94func (i *Autolink) Destination() string {
95	destination := Unescape(i.markdown[i.RawDestination.Position:i.RawDestination.End])
96
97	if strings.HasPrefix(destination, "www") {
98		destination = "http://" + destination
99	}
100
101	return destination
102}
103
104type delimiterType int
105
106const (
107	linkOpeningDelimiter delimiterType = iota
108	imageOpeningDelimiter
109)
110
111type delimiter struct {
112	Type       delimiterType
113	IsInactive bool
114	TextNode   int
115	Range      Range
116}
117
118type inlineParser struct {
119	markdown             string
120	ranges               []Range
121	referenceDefinitions []*ReferenceDefinition
122
123	raw            string
124	position       int
125	inlines        []Inline
126	delimiterStack *list.List
127}
128
129func newInlineParser(markdown string, ranges []Range, referenceDefinitions []*ReferenceDefinition) *inlineParser {
130	return &inlineParser{
131		markdown:             markdown,
132		ranges:               ranges,
133		referenceDefinitions: referenceDefinitions,
134		delimiterStack:       list.New(),
135	}
136}
137
138func (p *inlineParser) parseBackticks() {
139	count := 1
140	for i := p.position + 1; i < len(p.raw) && p.raw[i] == '`'; i++ {
141		count++
142	}
143	opening := p.raw[p.position : p.position+count]
144	search := p.position + count
145	for search < len(p.raw) {
146		end := strings.Index(p.raw[search:], opening)
147		if end == -1 {
148			break
149		}
150		if search+end+count < len(p.raw) && p.raw[search+end+count] == '`' {
151			search += end + count
152			for search < len(p.raw) && p.raw[search] == '`' {
153				search++
154			}
155			continue
156		}
157		code := strings.Join(strings.Fields(p.raw[p.position+count:search+end]), " ")
158		p.position = search + end + count
159		p.inlines = append(p.inlines, &CodeSpan{
160			Code: code,
161		})
162		return
163	}
164	p.position += len(opening)
165	absPos := relativeToAbsolutePosition(p.ranges, p.position-len(opening))
166	p.inlines = append(p.inlines, &Text{
167		Text:  opening,
168		Range: Range{absPos, absPos + len(opening)},
169	})
170}
171
172func (p *inlineParser) parseLineEnding() {
173	if p.position >= 1 && p.raw[p.position-1] == '\t' {
174		p.inlines = append(p.inlines, &HardLineBreak{})
175	} else if p.position >= 2 && p.raw[p.position-1] == ' ' && (p.raw[p.position-2] == '\t' || p.raw[p.position-1] == ' ') {
176		p.inlines = append(p.inlines, &HardLineBreak{})
177	} else {
178		p.inlines = append(p.inlines, &SoftLineBreak{})
179	}
180	p.position++
181	if p.position < len(p.raw) && p.raw[p.position] == '\n' {
182		p.position++
183	}
184}
185
186func (p *inlineParser) parseEscapeCharacter() {
187	if p.position+1 < len(p.raw) && isEscapableByte(p.raw[p.position+1]) {
188		absPos := relativeToAbsolutePosition(p.ranges, p.position+1)
189		p.inlines = append(p.inlines, &Text{
190			Text:  string(p.raw[p.position+1]),
191			Range: Range{absPos, absPos + len(string(p.raw[p.position+1]))},
192		})
193		p.position += 2
194	} else {
195		absPos := relativeToAbsolutePosition(p.ranges, p.position)
196		p.inlines = append(p.inlines, &Text{
197			Text:  `\`,
198			Range: Range{absPos, absPos + 1},
199		})
200		p.position++
201	}
202}
203
204func (p *inlineParser) parseText() {
205	if next := strings.IndexAny(p.raw[p.position:], "\r\n\\`&![]wW:"); next == -1 {
206		absPos := relativeToAbsolutePosition(p.ranges, p.position)
207		p.inlines = append(p.inlines, &Text{
208			Text:  strings.TrimRightFunc(p.raw[p.position:], isWhitespace),
209			Range: Range{absPos, absPos + len(p.raw[p.position:])},
210		})
211		p.position = len(p.raw)
212	} else {
213		absPos := relativeToAbsolutePosition(p.ranges, p.position)
214		if p.raw[p.position+next] == '\r' || p.raw[p.position+next] == '\n' {
215			s := strings.TrimRightFunc(p.raw[p.position:p.position+next], isWhitespace)
216			p.inlines = append(p.inlines, &Text{
217				Text:  s,
218				Range: Range{absPos, absPos + len(s)},
219			})
220		} else {
221			if next == 0 {
222				// Always read at least one character since 'w', 'W', and ':' may not actually match another
223				// type of node
224				next = 1
225			}
226
227			p.inlines = append(p.inlines, &Text{
228				Text:  p.raw[p.position : p.position+next],
229				Range: Range{absPos, absPos + next},
230			})
231		}
232		p.position += next
233	}
234}
235
236func (p *inlineParser) parseLinkOrImageDelimiter() {
237	absPos := relativeToAbsolutePosition(p.ranges, p.position)
238	if p.raw[p.position] == '[' {
239		p.inlines = append(p.inlines, &Text{
240			Text:  "[",
241			Range: Range{absPos, absPos + 1},
242		})
243		p.delimiterStack.PushBack(&delimiter{
244			Type:     linkOpeningDelimiter,
245			TextNode: len(p.inlines) - 1,
246			Range:    Range{p.position, p.position + 1},
247		})
248		p.position++
249	} else if p.raw[p.position] == '!' && p.position+1 < len(p.raw) && p.raw[p.position+1] == '[' {
250		p.inlines = append(p.inlines, &Text{
251			Text:  "![",
252			Range: Range{absPos, absPos + 2},
253		})
254		p.delimiterStack.PushBack(&delimiter{
255			Type:     imageOpeningDelimiter,
256			TextNode: len(p.inlines) - 1,
257			Range:    Range{p.position, p.position + 2},
258		})
259		p.position += 2
260	} else {
261		p.inlines = append(p.inlines, &Text{
262			Text:  "!",
263			Range: Range{absPos, absPos + 1},
264		})
265		p.position++
266	}
267}
268
269func (p *inlineParser) peekAtInlineLinkDestinationAndTitle(position int, isImage bool) (destination, title Range, end int, ok bool) {
270	if position >= len(p.raw) || p.raw[position] != '(' {
271		return
272	}
273	position++
274
275	destinationStart := nextNonWhitespace(p.raw, position)
276	if destinationStart >= len(p.raw) {
277		return
278	} else if p.raw[destinationStart] == ')' {
279		return Range{destinationStart, destinationStart}, Range{destinationStart, destinationStart}, destinationStart + 1, true
280	}
281
282	destination, end, ok = parseLinkDestination(p.raw, destinationStart)
283	if !ok {
284		return
285	}
286	position = end
287
288	if isImage && position < len(p.raw) && isWhitespaceByte(p.raw[position]) {
289		dimensionsStart := nextNonWhitespace(p.raw, position)
290		if dimensionsStart >= len(p.raw) {
291			return
292		}
293
294		if p.raw[dimensionsStart] == '=' {
295			// Read optional image dimensions even if we don't use them
296			_, end, ok = parseImageDimensions(p.raw, dimensionsStart)
297			if !ok {
298				return
299			}
300
301			position = end
302		}
303	}
304
305	if position < len(p.raw) && isWhitespaceByte(p.raw[position]) {
306		titleStart := nextNonWhitespace(p.raw, position)
307		if titleStart >= len(p.raw) {
308			return
309		} else if p.raw[titleStart] == ')' {
310			return destination, Range{titleStart, titleStart}, titleStart + 1, true
311		}
312
313		if p.raw[titleStart] == '"' || p.raw[titleStart] == '\'' || p.raw[titleStart] == '(' {
314			title, end, ok = parseLinkTitle(p.raw, titleStart)
315			if !ok {
316				return
317			}
318			position = end
319		}
320	}
321
322	closingPosition := nextNonWhitespace(p.raw, position)
323	if closingPosition >= len(p.raw) || p.raw[closingPosition] != ')' {
324		return Range{}, Range{}, 0, false
325	}
326
327	return destination, title, closingPosition + 1, true
328}
329
330func (p *inlineParser) referenceDefinition(label string) *ReferenceDefinition {
331	clean := strings.Join(strings.Fields(label), " ")
332	for _, d := range p.referenceDefinitions {
333		if strings.EqualFold(clean, strings.Join(strings.Fields(d.Label()), " ")) {
334			return d
335		}
336	}
337	return nil
338}
339
340func (p *inlineParser) lookForLinkOrImage() {
341	for element := p.delimiterStack.Back(); element != nil; element = element.Prev() {
342		d := element.Value.(*delimiter)
343		if d.Type != imageOpeningDelimiter && d.Type != linkOpeningDelimiter {
344			continue
345		}
346		if d.IsInactive {
347			p.delimiterStack.Remove(element)
348			break
349		}
350
351		isImage := d.Type == imageOpeningDelimiter
352
353		var inline Inline
354
355		if destination, title, next, ok := p.peekAtInlineLinkDestinationAndTitle(p.position+1, isImage); ok {
356			destinationMarkdownPosition := relativeToAbsolutePosition(p.ranges, destination.Position)
357			linkOrImage := InlineLinkOrImage{
358				Children:       append([]Inline(nil), p.inlines[d.TextNode+1:]...),
359				RawDestination: Range{destinationMarkdownPosition, destinationMarkdownPosition + destination.End - destination.Position},
360				markdown:       p.markdown,
361				rawTitle:       p.raw[title.Position:title.End],
362			}
363			if d.Type == imageOpeningDelimiter {
364				inline = &InlineImage{linkOrImage}
365			} else {
366				inline = &InlineLink{linkOrImage}
367			}
368			p.position = next
369		} else {
370			referenceLabel := ""
371			label, next, hasLinkLabel := parseLinkLabel(p.raw, p.position+1)
372			if hasLinkLabel && label.End > label.Position {
373				referenceLabel = p.raw[label.Position:label.End]
374			} else {
375				referenceLabel = p.raw[d.Range.End:p.position]
376				if !hasLinkLabel {
377					next = p.position + 1
378				}
379			}
380			if referenceLabel != "" {
381				if reference := p.referenceDefinition(referenceLabel); reference != nil {
382					linkOrImage := ReferenceLinkOrImage{
383						ReferenceDefinition: reference,
384						Children:            append([]Inline(nil), p.inlines[d.TextNode+1:]...),
385					}
386					if d.Type == imageOpeningDelimiter {
387						inline = &ReferenceImage{linkOrImage}
388					} else {
389						inline = &ReferenceLink{linkOrImage}
390					}
391					p.position = next
392				}
393			}
394		}
395
396		if inline != nil {
397			if d.Type == imageOpeningDelimiter {
398				p.inlines = append(p.inlines[:d.TextNode], inline)
399			} else {
400				p.inlines = append(p.inlines[:d.TextNode], inline)
401				for inlineElement := element.Prev(); inlineElement != nil; inlineElement = inlineElement.Prev() {
402					if d := inlineElement.Value.(*delimiter); d.Type == linkOpeningDelimiter {
403						d.IsInactive = true
404					}
405				}
406			}
407			p.delimiterStack.Remove(element)
408			return
409		}
410		p.delimiterStack.Remove(element)
411		break
412	}
413	absPos := relativeToAbsolutePosition(p.ranges, p.position)
414	p.inlines = append(p.inlines, &Text{
415		Text:  "]",
416		Range: Range{absPos, absPos + 1},
417	})
418	p.position++
419}
420
421func CharacterReference(ref string) string {
422	if ref == "" {
423		return ""
424	}
425	if ref[0] == '#' {
426		if len(ref) < 2 {
427			return ""
428		}
429		n := 0
430		if ref[1] == 'X' || ref[1] == 'x' {
431			if len(ref) < 3 {
432				return ""
433			}
434			for i := 2; i < len(ref); i++ {
435				if i > 9 {
436					return ""
437				}
438				d := ref[i]
439				switch {
440				case d >= '0' && d <= '9':
441					n = n*16 + int(d-'0')
442				case d >= 'a' && d <= 'f':
443					n = n*16 + 10 + int(d-'a')
444				case d >= 'A' && d <= 'F':
445					n = n*16 + 10 + int(d-'A')
446				default:
447					return ""
448				}
449			}
450		} else {
451			for i := 1; i < len(ref); i++ {
452				if i > 8 || ref[i] < '0' || ref[i] > '9' {
453					return ""
454				}
455				n = n*10 + int(ref[i]-'0')
456			}
457		}
458		c := rune(n)
459		if c == '\u0000' || !utf8.ValidRune(c) {
460			return string(unicode.ReplacementChar)
461		}
462		return string(c)
463	}
464	if entity, ok := htmlEntities[ref]; ok {
465		return entity
466	}
467	return ""
468}
469
470func (p *inlineParser) parseCharacterReference() {
471	absPos := relativeToAbsolutePosition(p.ranges, p.position)
472	p.position++
473	if semicolon := strings.IndexByte(p.raw[p.position:], ';'); semicolon == -1 {
474		p.inlines = append(p.inlines, &Text{
475			Text:  "&",
476			Range: Range{absPos, absPos + 1},
477		})
478	} else if s := CharacterReference(p.raw[p.position : p.position+semicolon]); s != "" {
479		p.position += semicolon + 1
480		p.inlines = append(p.inlines, &Text{
481			Text:  s,
482			Range: Range{absPos, absPos + len(s)},
483		})
484	} else {
485		p.inlines = append(p.inlines, &Text{
486			Text:  "&",
487			Range: Range{absPos, absPos + 1},
488		})
489	}
490}
491
492func (p *inlineParser) parseAutolink(c rune) bool {
493	for element := p.delimiterStack.Back(); element != nil; element = element.Prev() {
494		d := element.Value.(*delimiter)
495		if !d.IsInactive {
496			return false
497		}
498	}
499
500	var link Range
501	if c == ':' {
502		var ok bool
503		link, ok = parseURLAutolink(p.raw, p.position)
504
505		if !ok {
506			return false
507		}
508
509		// Since the current position is at the colon, we have to rewind the parsing slightly so that
510		// we don't duplicate the URL scheme
511		rewind := strings.Index(p.raw[link.Position:link.End], ":")
512		if rewind != -1 {
513			lastInline := p.inlines[len(p.inlines)-1]
514			lastText, ok := lastInline.(*Text)
515
516			if !ok {
517				// This should never occur since parseURLAutolink will only return a non-empty value
518				// when the previous text ends in a valid URL protocol which would mean that the previous
519				// node is a Text node
520				return false
521			}
522
523			p.inlines = p.inlines[0 : len(p.inlines)-1]
524			p.inlines = append(p.inlines, &Text{
525				Text:  lastText.Text[:len(lastText.Text)-rewind],
526				Range: Range{lastText.Range.Position, lastText.Range.End - rewind},
527			})
528			p.position -= rewind
529		}
530	} else if c == 'w' || c == 'W' {
531		var ok bool
532		link, ok = parseWWWAutolink(p.raw, p.position)
533
534		if !ok {
535			return false
536		}
537	}
538
539	linkMarkdownPosition := relativeToAbsolutePosition(p.ranges, link.Position)
540	linkRange := Range{linkMarkdownPosition, linkMarkdownPosition + link.End - link.Position}
541
542	p.inlines = append(p.inlines, &Autolink{
543		Children: []Inline{
544			&Text{
545				Text:  p.raw[link.Position:link.End],
546				Range: linkRange,
547			},
548		},
549		RawDestination: linkRange,
550		markdown:       p.markdown,
551	})
552	p.position += (link.End - link.Position)
553
554	return true
555}
556
557func (p *inlineParser) Parse() []Inline {
558	for _, r := range p.ranges {
559		p.raw += p.markdown[r.Position:r.End]
560	}
561
562	for p.position < len(p.raw) {
563		c, _ := utf8.DecodeRuneInString(p.raw[p.position:])
564
565		switch c {
566		case '\r', '\n':
567			p.parseLineEnding()
568		case '\\':
569			p.parseEscapeCharacter()
570		case '`':
571			p.parseBackticks()
572		case '&':
573			p.parseCharacterReference()
574		case '!', '[':
575			p.parseLinkOrImageDelimiter()
576		case ']':
577			p.lookForLinkOrImage()
578		case 'w', 'W', ':':
579			matched := p.parseAutolink(c)
580
581			if !matched {
582				p.parseText()
583			}
584		default:
585			p.parseText()
586		}
587	}
588
589	return p.inlines
590}
591
592func ParseInlines(markdown string, ranges []Range, referenceDefinitions []*ReferenceDefinition) (inlines []Inline) {
593	return newInlineParser(markdown, ranges, referenceDefinitions).Parse()
594}
595
596func MergeInlineText(inlines []Inline) []Inline {
597	ret := inlines[:0]
598	for i, v := range inlines {
599		// always add first node
600		if i == 0 {
601			ret = append(ret, v)
602			continue
603		}
604		// not a text node? nothing to merge
605		text, ok := v.(*Text)
606		if !ok {
607			ret = append(ret, v)
608			continue
609		}
610		// previous node is not a text node? nothing to merge
611		prevText, ok := ret[len(ret)-1].(*Text)
612		if !ok {
613			ret = append(ret, v)
614			continue
615		}
616		// previous node is not right before this one
617		if prevText.Range.End != text.Range.Position {
618			ret = append(ret, v)
619			continue
620		}
621		// we have two consecutive text nodes
622		ret[len(ret)-1] = &Text{
623			Text:  prevText.Text + text.Text,
624			Range: Range{prevText.Range.Position, text.Range.End},
625		}
626	}
627	return ret
628}
629
630func Unescape(markdown string) string {
631	ret := ""
632
633	position := 0
634	for position < len(markdown) {
635		c, cSize := utf8.DecodeRuneInString(markdown[position:])
636
637		switch c {
638		case '\\':
639			if position+1 < len(markdown) && isEscapableByte(markdown[position+1]) {
640				ret += string(markdown[position+1])
641				position += 2
642			} else {
643				ret += `\`
644				position++
645			}
646		case '&':
647			position++
648			if semicolon := strings.IndexByte(markdown[position:], ';'); semicolon == -1 {
649				ret += "&"
650			} else if s := CharacterReference(markdown[position : position+semicolon]); s != "" {
651				position += semicolon + 1
652				ret += s
653			} else {
654				ret += "&"
655			}
656		default:
657			ret += string(c)
658			position += cSize
659		}
660	}
661
662	return ret
663}
664