// Package mmark is a package for parsing and processing markdown text.
// It translates plain text with simple formatting rules into HTML or XML.
3package mmark
4
5import (
6	"bytes"
7	"path"
8	"unicode/utf8"
9)
10
11const Version = "1.3.6"
12
13var test = false
14
// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// Each extension is a distinct bit. The first (blank) entry consumes the
// value 1, so EXTENSION_ABBREVIATIONS is 2 and every following constant is
// the next power of two. The order of the entries therefore determines the
// numeric values and must not be changed.
const (
	_                                    = 1 << iota
	EXTENSION_ABBREVIATIONS              // Render abbreviations `*[HTML]: Hyper Text Markup Language`
	EXTENSION_AUTO_HEADER_IDS            // Create the header ID from the text
	EXTENSION_AUTOLINK                   // Detect embedded URLs that are not explicitly marked
	EXTENSION_CITATION                   // Support citations via the link syntax
	EXTENSION_EXAMPLE_LISTS              // Render '(@tag)  ' example lists
	EXTENSION_FENCED_CODE                // Render fenced code blocks
	EXTENSION_FOOTNOTES                  // Pandoc-style footnotes
	EXTENSION_HARD_LINE_BREAK            // Translate newlines into line breaks
	EXTENSION_HEADER_IDS                 // Specify header IDs with {#id}
	EXTENSION_INCLUDE                    // Include file with {{ syntax
	EXTENSION_INLINE_ATTR                // Detect CommonMark's IAL syntax
	EXTENSION_LAX_HTML_BLOCKS            // Loosen up HTML block parsing rules
	EXTENSION_MATH                       // Detect $$...$$ and parse as math
	EXTENSION_MATTER                     // Use {frontmatter} {mainmatter} {backmatter} (TODO(miek): not actually used)
	EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK // No empty line is needed before a code, quote, ordered list or unordered list block
	EXTENSION_PARTS                      // Detect part headers (-#)
	EXTENSION_QUOTES                     // Allow A> as asides
	EXTENSION_SHORT_REF                  // (#id) will be a cross reference
	EXTENSION_SPACE_HEADERS              // Be strict about prefix header rules
	EXTENSION_TABLES                     // Render tables
	EXTENSION_TITLEBLOCK_TOML            // Titleblock in TOML
	EXTENSION_UNIQUE_HEADER_IDS          // When detecting identical anchors add a sequence number -1, -2 etc
	EXTENSION_BACKSLASH_LINE_BREAK       // Translate trailing backslashes into line breaks
	EXTENSION_RFC7328                    // Parse RFC 7328 markdown. Depends on FOOTNOTES extension.
	EXTENSION_DEFINITION_LISTS           // Render definition lists

	// commonHtmlFlags is a preset of HTML renderer flags (the HTML_*
	// constants are declared elsewhere in the package).
	commonHtmlFlags = 0 |
		HTML_USE_SMARTYPANTS |
		HTML_SMARTYPANTS_FRACTIONS |
		HTML_SMARTYPANTS_DASHES |
		HTML_SMARTYPANTS_LATEX_DASHES

	// commonExtensions is a preset combination of parsing extensions.
	commonExtensions = 0 |
		EXTENSION_TABLES |
		EXTENSION_FENCED_CODE |
		EXTENSION_AUTOLINK |
		EXTENSION_SPACE_HEADERS |
		EXTENSION_HEADER_IDS |
		EXTENSION_ABBREVIATIONS |
		EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK | // CommonMark
		EXTENSION_BACKSLASH_LINE_BREAK | // CommonMark
		EXTENSION_DEFINITION_LISTS

	// commonXmlExtensions is commonExtensions plus a further set of
	// extensions. EXTENSION_DEFINITION_LISTS is already part of
	// commonExtensions; ORing it in again is harmless.
	commonXmlExtensions = commonExtensions |
		EXTENSION_UNIQUE_HEADER_IDS |
		EXTENSION_AUTO_HEADER_IDS |
		EXTENSION_INLINE_ATTR |
		EXTENSION_QUOTES |
		EXTENSION_MATTER |
		EXTENSION_CITATION |
		EXTENSION_EXAMPLE_LISTS |
		EXTENSION_SHORT_REF |
		EXTENSION_DEFINITION_LISTS
)
73
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The values are consecutive integers starting at 0.
const (
	_LINK_TYPE_NOT_AUTOLINK = iota
	_LINK_TYPE_NORMAL
	_LINK_TYPE_EMAIL
)
82
// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Each constant is a distinct bit, starting at 1 for _LIST_TYPE_ORDERED.
const (
	_LIST_TYPE_ORDERED = 1 << iota
	_LIST_TYPE_ORDERED_ROMAN_UPPER
	_LIST_TYPE_ORDERED_ROMAN_LOWER
	_LIST_TYPE_ORDERED_ALPHA_UPPER
	_LIST_TYPE_ORDERED_ALPHA_LOWER
	_LIST_TYPE_ORDERED_GROUP
	_LIST_TYPE_DEFINITION
	_LIST_TYPE_TERM
	_LIST_ITEM_CONTAINS_BLOCK
	_LIST_ITEM_BEGINNING_OF_LIST
	_LIST_ITEM_END_OF_LIST
	_LIST_INSIDE_LIST
	_INSIDE_FIGURE
)
101
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Note that _TABLE_ALIGNMENT_CENTER is itself defined as the combination of
// the left and right bits.
const (
	_TABLE_ALIGNMENT_LEFT = 1 << iota
	_TABLE_ALIGNMENT_RIGHT
	_TABLE_ALIGNMENT_CENTER = (_TABLE_ALIGNMENT_LEFT | _TABLE_ALIGNMENT_RIGHT)
)
110
// _TAB_SIZE_DEFAULT is the size of a tab stop, in columns. It is the tab
// size the first pass uses when expanding tabs and detecting indentation.
const _TAB_SIZE_DEFAULT = 4
113
// Identifiers for document divisions and special headers.
// The values are consecutive integers starting at 1.
const (
	_DOC_FRONT_MATTER = iota + 1 // Different divisions of the document
	_DOC_MAIN_MATTER
	_DOC_BACK_MATTER
	_ABSTRACT // Special headers, keep track if there are open
	_NOTE     // Special Note headers, keep track if there are open
	_PREFACE
	_COLOPHON
)
123
// blockTags is a set of tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// The map is used purely as a set: keys are lower-case tag names and the
// values carry no information.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
165
// Renderer is the rendering interface.
// This is mostly of interest if you are implementing a new rendering format.
//
// When a byte slice is provided, it contains the (rendered) contents of the
// element.
//
// When a callback is provided instead, it will write the contents of the
// respective element directly to the output buffer and return true on success.
// If the callback returns false, the rendering function should reset the
// output buffer as though it had never been called.
//
// Currently Html, XML2RFC v3 and XML2RFC v2 implementations are provided.
type Renderer interface {
	// Block-level callbacks
	BlockCode(out *bytes.Buffer, text []byte, lang string, caption []byte, subfigure bool, callouts bool)
	BlockQuote(out *bytes.Buffer, text []byte, attribution []byte)
	BlockHtml(out *bytes.Buffer, text []byte)
	CommentHtml(out *bytes.Buffer, text []byte)
	// SpecialHeader is used for Abstract and Preface. The what argument contains abstract or preface.
	SpecialHeader(out *bytes.Buffer, what []byte, text func() bool, id string)
	// Note is used for typesetting notes.
	Note(out *bytes.Buffer, text func() bool, id string)
	Part(out *bytes.Buffer, text func() bool, id string)
	Header(out *bytes.Buffer, text func() bool, level int, id string)
	HRule(out *bytes.Buffer)
	List(out *bytes.Buffer, text func() bool, flags, start int, group []byte)
	ListItem(out *bytes.Buffer, text []byte, flags int)
	Paragraph(out *bytes.Buffer, text func() bool, flags int)

	Table(out *bytes.Buffer, header []byte, body []byte, footer []byte, columnData []int, caption []byte)
	TableRow(out *bytes.Buffer, text []byte)
	TableHeaderCell(out *bytes.Buffer, text []byte, flags, colspan int)
	TableCell(out *bytes.Buffer, text []byte, flags, colspan int)

	Footnotes(out *bytes.Buffer, text func() bool)
	FootnoteItem(out *bytes.Buffer, name, text []byte, flags int)
	TitleBlockTOML(out *bytes.Buffer, data *title)
	Aside(out *bytes.Buffer, text []byte)
	Figure(out *bytes.Buffer, text []byte, caption []byte)

	// Span-level callbacks
	AutoLink(out *bytes.Buffer, link []byte, kind int)
	CodeSpan(out *bytes.Buffer, text []byte)
	// CalloutText is called when a callout is seen in the text. Id is the text
	// seen between < and > and ids references the callout counter(s) in the code.
	CalloutText(out *bytes.Buffer, id string, ids []string)
	// CalloutCode is called when a callout is seen in a code block. Index is the
	// callout counter, id is the number seen between < and >.
	CalloutCode(out *bytes.Buffer, index, id string)
	DoubleEmphasis(out *bytes.Buffer, text []byte)
	Emphasis(out *bytes.Buffer, text []byte)
	Subscript(out *bytes.Buffer, text []byte)
	Superscript(out *bytes.Buffer, text []byte)
	Image(out *bytes.Buffer, link []byte, title []byte, alt []byte, subfigure bool)
	LineBreak(out *bytes.Buffer)
	Link(out *bytes.Buffer, link []byte, title []byte, content []byte)
	RawHtmlTag(out *bytes.Buffer, tag []byte)
	TripleEmphasis(out *bytes.Buffer, text []byte)
	StrikeThrough(out *bytes.Buffer, text []byte)
	FootnoteRef(out *bytes.Buffer, ref []byte, id int)
	Index(out *bytes.Buffer, primary, secondary []byte, prim bool)
	Citation(out *bytes.Buffer, link, title []byte)
	Abbreviation(out *bytes.Buffer, abbr, title []byte)
	Example(out *bytes.Buffer, index int)
	Math(out *bytes.Buffer, text []byte, display bool)

	// Low-level callbacks
	Entity(out *bytes.Buffer, entity []byte)
	NormalText(out *bytes.Buffer, text []byte)

	// Header and footer. The second pass passes start as true only for the
	// top-level document (depth 0).
	DocumentHeader(out *bytes.Buffer, start bool)
	DocumentFooter(out *bytes.Buffer, start bool)

	// Frontmatter, mainmatter or backmatter
	DocumentMatter(out *bytes.Buffer, matter int)
	References(out *bytes.Buffer, citations map[string]*citation)

	// Helper functions
	Flags() int

	// Attr returns the inline attribute.
	Attr() *inlineAttr
	// SetAttr sets the inline attribute.
	SetAttr(*inlineAttr)
	// AttrString returns the string representation of this inline attribute.
	AttrString(*inlineAttr) string
}
254
// inlineParser is the callback type for inline parsing. One such function is
// registered for each character that triggers a response when parsing inline
// data (see the inlineCallback table, indexed by that byte).
// NOTE(review): the int result is presumably the number of bytes consumed
// from data — confirm against the inline parsing loop.
type inlineParser func(p *parser, out *bytes.Buffer, data []byte, offset int) int
258
// parser holds runtime state used by the parser.
// It is constructed by the Parse function, once per call.
type parser struct {
	r                    Renderer
	refs                 map[string]*reference
	citations            map[string]*citation
	abbreviations        map[string]*abbreviation
	examples             map[string]int
	callouts             map[string][]string
	codeBlock            int // count codeblock for callout ID generation
	inlineCallback       [256]inlineParser
	flags                int
	nesting              int
	maxNesting           int
	insideLink           bool
	insideDefinitionList bool // when in def. list ... TODO(miek):doc
	insideList           int  // list in list counter
	insideFigure         bool // when inside a F> paragraph
	displayMath          bool

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference

	appendix   bool // have we seen a {backmatter}?
	titleblock bool // have we seen a titleblock
	headerLen  int  // if a header is written what is length

	partCount    int // TODO, keep track of part counts (-#)
	chapterCount int // TODO, keep track of chapter count (#)

	// Placeholder IAL that can be added to blocklevel elements.
	ial *inlineAttr

	// Prevent identical header anchors by appending -<sequence_number> starting
	// with -1, this is the same thing that pandoc does.
	anchors map[string]int
}
298
// Markdown is an io.Writer. Markdown text written to it is accumulated and
// converted to the output format of the configured renderer. Note that the
// conversion only takes place when String() or Bytes() is called.
//
// The field order is relied upon by the positional struct literal in
// NewMarkdown.
type Markdown struct {
	renderer   Renderer
	extensions int
	in         *bytes.Buffer
	out        *bytes.Buffer

	// renderedSinceLastWrite is true when out already reflects everything
	// written to in; Write clears it and render sets it.
	renderedSinceLastWrite bool
}
310
311func NewMarkdown(renderer Renderer, extensions int) *Markdown {
312	return &Markdown{renderer, extensions, &bytes.Buffer{}, &bytes.Buffer{}, false}
313}
314
315func (m *Markdown) Write(p []byte) (n int, err error) {
316	m.renderedSinceLastWrite = false
317	return m.in.Write(p)
318}
319
320func (m *Markdown) String() string { m.render(); return m.out.String() }
321func (m *Markdown) Bytes() []byte  { m.render(); return m.out.Bytes() }
322
323func (m *Markdown) render() {
324	if m.renderer == nil {
325		// default to Html renderer
326	}
327	if m.renderedSinceLastWrite {
328		return
329	}
330	m.out = Parse(m.in.Bytes(), m.renderer, m.extensions)
331	m.renderedSinceLastWrite = true
332}
333
334// Parse is the main rendering function.
335// It parses and renders a block of markdown-encoded text.
336// The supplied Renderer is used to format the output, and extensions dictates
337// which non-standard extensions are enabled.
338//
339// To use the supplied Html or XML renderers, see HtmlRenderer, XmlRenderer and
340// Xml2Renderer, respectively.
341func Parse(input []byte, renderer Renderer, extensions int) *bytes.Buffer {
342	// no point in parsing if we can't render
343	if renderer == nil {
344		return nil
345	}
346
347	// fill in the render structure
348	p := new(parser)
349	p.r = renderer
350	p.flags = extensions
351	p.refs = make(map[string]*reference)
352	p.abbreviations = make(map[string]*abbreviation)
353	p.anchors = make(map[string]int)
354	p.examples = make(map[string]int)
355	// newly created in 'callouts'
356	p.maxNesting = 16
357	p.insideLink = false
358
359	// register inline parsers
360	p.inlineCallback['*'] = emphasis
361	p.inlineCallback['_'] = emphasis
362	p.inlineCallback['~'] = emphasis
363	p.inlineCallback['`'] = codeSpan
364	p.inlineCallback['\n'] = lineBreak
365	p.inlineCallback['['] = link
366	p.inlineCallback['<'] = leftAngle
367	p.inlineCallback['\\'] = escape
368	p.inlineCallback['&'] = entity
369	p.inlineCallback['{'] = leftBrace
370	p.inlineCallback['^'] = superscript // subscript is handled in emphasis
371	p.inlineCallback['('] = index       // also find example list references and cross references
372	p.inlineCallback['$'] = math
373
374	if extensions&EXTENSION_AUTOLINK != 0 {
375		p.inlineCallback[':'] = autoLink
376	}
377
378	if extensions&EXTENSION_FOOTNOTES != 0 {
379		p.notes = make([]*reference, 0)
380	}
381
382	if extensions&EXTENSION_CITATION != 0 {
383		p.inlineCallback['@'] = citationReference // @ref, short form of citations
384		p.citations = make(map[string]*citation)
385	}
386
387	first := firstPass(p, input, 0)
388	second := secondPass(p, first.Bytes(), 0)
389	return second
390}
391
392// first pass:
393// - extract references
394// - extract abbreviations
395// - expand tabs
396// - normalize newlines
397// - copy everything else
398// - include includes
399func firstPass(p *parser, input []byte, depth int) *bytes.Buffer {
400	var out bytes.Buffer
401	if depth > 8 {
402		printf(p, "nested includes depth > 8")
403		out.WriteByte('\n')
404		return &out
405	}
406
407	tabSize := _TAB_SIZE_DEFAULT
408	beg, end := 0, 0
409	lastFencedCodeBlockEnd := 0
410	for beg < len(input) { // iterate over lines
411		if beg >= lastFencedCodeBlockEnd { // don't parse inside fenced code blocks
412			if end = isReference(p, input[beg:], tabSize); end > 0 {
413				beg += end
414				continue
415			}
416		}
417		// skip to the next line
418		end = beg
419		for end < len(input) && input[end] != '\n' && input[end] != '\r' {
420			end++
421		}
422
423		if p.flags&EXTENSION_FENCED_CODE != 0 {
424			// track fenced code block boundaries to suppress tab expansion
425			// inside them:
426			if beg >= lastFencedCodeBlockEnd {
427				if i := p.fencedCode(&out, append(input[beg:], '\n'), false); i > 0 {
428					lastFencedCodeBlockEnd = beg + i
429				}
430			}
431		}
432
433		// add the line body if present
434		if end > beg {
435			if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
436				out.Write(input[beg:end])
437			} else {
438				if p.flags&EXTENSION_INCLUDE != 0 && beg+1 < len(input) && input[beg] == '{' && input[beg+1] == '{' {
439					if beg == 0 || (beg > 0 && input[beg-1] == '\n') {
440						if j := p.include(&out, input[beg:end], depth); j > 0 {
441							beg += j
442						}
443					}
444				}
445				expandTabs(&out, input[beg:end], tabSize)
446			}
447		}
448		out.WriteByte('\n')
449
450		if end < len(input) && input[end] == '\r' {
451			end++
452		}
453		if end < len(input) && input[end] == '\n' {
454			end++
455		}
456		beg = end
457	}
458
459	// empty input?
460	if out.Len() == 0 {
461		out.WriteByte('\n')
462	}
463
464	return &out
465}
466
467// second pass: actual rendering
468func secondPass(p *parser, input []byte, depth int) *bytes.Buffer {
469	var output bytes.Buffer
470
471	p.r.DocumentHeader(&output, depth == 0)
472	p.headerLen = output.Len()
473	p.block(&output, input)
474
475	if p.flags&EXTENSION_FOOTNOTES != 0 && len(p.notes) > 0 {
476		p.r.Footnotes(&output, func() bool {
477			flags := _LIST_ITEM_BEGINNING_OF_LIST
478			for i := 0; i < len(p.notes); i += 1 {
479				var buf bytes.Buffer
480				ref := p.notes[i]
481				if ref.hasBlock {
482					flags |= _LIST_ITEM_CONTAINS_BLOCK
483					p.block(&buf, ref.title)
484				} else {
485					p.inline(&buf, ref.title)
486				}
487				p.r.FootnoteItem(&output, ref.link, buf.Bytes(), flags)
488				flags &^= _LIST_ITEM_BEGINNING_OF_LIST | _LIST_ITEM_CONTAINS_BLOCK
489			}
490
491			return true
492		})
493	}
494	if !p.appendix {
495		if len(p.citations) > 0 {
496			// appendix not started in doc, start it now and output references
497			p.r.DocumentMatter(&output, _DOC_BACK_MATTER)
498			p.r.References(&output, p.citations)
499		}
500		p.appendix = true
501	}
502	p.r.DocumentFooter(&output, depth == 0)
503
504	if p.nesting != 0 {
505		panic("Nesting level did not end at zero")
506	}
507
508	return &output
509}
510
511//
512// Link references
513//
514// This section implements support for references that (usually) appear
515// as footnotes in a document, and can be referenced anywhere in the document.
516// The basic format is:
517//
518//    [1]: http://www.google.com/ "Google"
519//    [2]: http://www.github.com/ "Github"
520//
521// Anywhere in the document, the reference can be linked by referring to its
522// label, i.e., 1 and 2 in this example, as in:
523//
524//    This library is hosted on [Github][2], a git hosting site.
525//
526// Actual footnotes as specified in Pandoc and supported by some other Markdown
527// libraries such as php-markdown are also taken care of. They look like this:
528//
529//    This sentence needs a bit of further explanation.[^note]
530//
531//    [^note]: This is the explanation.
532//
533// Footnotes should be placed at the end of the document in an ordered list.
534// Inline footnotes such as:
535//
536//    Inline footnotes^[Not supported.] also exist.
537//
538// are not yet supported.
539
// reference is a parsed link reference or footnote definition.
type reference struct {
	// link is the link target; for footnotes it holds the footnote id instead.
	link []byte
	// title is the link title; for footnotes it holds the footnote text.
	title    []byte
	noteId   int // 0 if not a footnote ref
	hasBlock bool
}
547
// abbreviation is a parsed abbreviation definition; title is the expansion
// text that follows the `*[ABBR]:` marker.
type abbreviation struct {
	title []byte
}
552
// citation is a parsed citation.
type citation struct {
	link  []byte
	title []byte
	xml   []byte // raw include of reference XML
	typ   byte   // 'i' for informal, 'n' normative (default = 'i')
	seq   int    // sequence number for I-Ds
}
561
562// Check whether or not data starts with a reference link.
563// If so, it is parsed and stored in the list of references
564// (in the render struct).
565// Returns the number of bytes to skip to move past it,
566// or zero if the first line is not a reference.
567func isReference(p *parser, data []byte, tabSize int) int {
568	// up to 3 optional leading spaces
569	if len(data) < 4 {
570		return 0
571	}
572	i := 0
573	for i < 3 && data[i] == ' ' {
574		i++
575	}
576
577	noteId := 0
578	abbrId := ""
579
580	// id part: anything but a newline between brackets
581	// abbreviations start with *[
582	if data[i] != '[' && data[i] != '*' {
583		return 0
584	}
585	if data[i] == '*' && (i < len(data)-1 && data[i+1] != '[') {
586		return 0
587	}
588	if data[i] == '*' && p.flags&EXTENSION_ABBREVIATIONS != 0 {
589		abbrId = "yes" // any non empty
590	}
591	i++
592	if p.flags&EXTENSION_FOOTNOTES != 0 {
593		if data[i] == '^' {
594			// we can set it to anything here because the proper noteIds will
595			// be assigned later during the second pass. It just has to be != 0
596			noteId = 1
597			i++
598		}
599	}
600	idOffset := i
601	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
602		if data[i] == '\\' {
603			i++
604		}
605		i++
606	}
607	if i >= len(data) || data[i] != ']' {
608		return 0
609	}
610	idEnd := i
611	if abbrId != "" {
612		abbrId = string(data[idOffset+1 : idEnd])
613	}
614
615	// spacer: colon (space | tab)* newline? (space | tab)*
616	i++
617	if i >= len(data) || data[i] != ':' {
618		return 0
619	}
620	i++
621	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
622		i++
623	}
624	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
625		i++
626		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
627			i++
628		}
629	}
630	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
631		i++
632	}
633	if i >= len(data) {
634		return 0
635	}
636
637	var (
638		linkOffset, linkEnd   int
639		titleOffset, titleEnd int
640		lineEnd               int
641		raw                   []byte
642		hasBlock              bool
643	)
644
645	if p.flags&EXTENSION_FOOTNOTES != 0 && noteId != 0 {
646		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
647		lineEnd = linkEnd
648	} else if abbrId != "" {
649		titleOffset, titleEnd, lineEnd = scanAbbreviation(p, data, idEnd)
650		p.abbreviations[abbrId] = &abbreviation{title: data[titleOffset:titleEnd]}
651		return lineEnd
652	} else {
653		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
654	}
655	if lineEnd == 0 {
656		return 0
657	}
658
659	// a valid ref has been found
660	ref := &reference{
661		noteId:   noteId,
662		hasBlock: hasBlock,
663	}
664
665	if noteId > 0 {
666		// reusing the link field for the id since footnotes don't have links
667		ref.link = data[idOffset:idEnd]
668		// if footnote, it's not really a title, it's the contained text
669		ref.title = raw
670	} else {
671		ref.link = data[linkOffset:linkEnd]
672		ref.title = data[titleOffset:titleEnd]
673	}
674
675	// id matches are case-insensitive
676	id := string(bytes.ToLower(data[idOffset:idEnd]))
677
678	// CommonMark don't overwrite newly found references
679	if _, ok := p.refs[id]; !ok {
680		p.refs[id] = ref
681	}
682
683	return lineEnd
684}
685
686func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
687	// link: whitespace-free sequence, optionally between angle brackets
688	if data[i] == '<' {
689		i++
690	}
691	linkOffset = i
692	if i == len(data) {
693		return
694	}
695	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
696		i++
697	}
698	linkEnd = i
699	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
700		linkOffset++
701		linkEnd--
702	}
703
704	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
705	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
706		i++
707	}
708	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
709		return
710	}
711
712	// compute end-of-line
713	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
714		lineEnd = i
715	}
716	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
717		lineEnd++
718	}
719
720	// optional (space|tab)* spacer after a newline
721	if lineEnd > 0 {
722		i = lineEnd + 1
723		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
724			i++
725		}
726	}
727
728	// optional title: any non-newline sequence enclosed in '"() alone on its line
729	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
730		i++
731		titleOffset = i
732
733		// look for EOL
734		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
735			i++
736		}
737		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
738			titleEnd = i + 1
739		} else {
740			titleEnd = i
741		}
742
743		// step back
744		i--
745		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
746			i--
747		}
748		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
749			lineEnd = titleEnd
750			titleEnd = i
751		}
752	}
753
754	return
755}
756
// scanFootnote extracts the text of a footnote definition starting at offset
// i in data.
//
// The first bit of this logic is the same as (*parser).listItem, but the rest
// is much simpler. This function simply finds the entire block and shifts it
// over by one tab if it is indeed a block (just returns the line if it's not).
// blockStart is the offset of the first non-space byte of the text, blockEnd
// is the end of the section in the input buffer, and contents is the
// extracted text that was shifted over one tab. It will need to be rendered at
// the end of the document. hasBlock reports whether any indented continuation
// line was found.
//
// All results are zero values when i is 0 or data is empty.
func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
	if i == 0 || len(data) == 0 {
		return
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	blockStart = i

	// find the end of the line: advance until i is just past a newline
	blockEnd = i
	for i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[blockEnd:i])
	blockEnd = i

	// process the following lines
	containsBlankLine := false

gatherLines:
	for blockEnd < len(data) {
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[blockEnd:i]) > 0 {
			containsBlankLine = true
			blockEnd = i
			continue
		}

		// n is the width of the indentation to strip from this line
		n := 0
		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
			// this is the end of the block.
			// we don't want to include this last line in the index.
			break gatherLines
		}

		// if there were blank lines before this one, insert a new one now
		if containsBlankLine {
			raw.WriteByte('\n')
			containsBlankLine = false
		}

		// get rid of that first tab, write to buffer
		raw.Write(data[blockEnd+n : i])
		hasBlock = true

		blockEnd = i
	}

	// make sure the extracted text ends with a newline
	if data[blockEnd-1] != '\n' {
		raw.WriteByte('\n')
	}

	contents = raw.Bytes()

	return
}
836
837func scanAbbreviation(p *parser, data []byte, i int) (titleOffset, titleEnd, lineEnd int) {
838	lineEnd = i
839	for lineEnd < len(data) && data[lineEnd] != '\n' {
840		lineEnd++
841	}
842
843	if len(data[i+2:lineEnd]) == 0 || p.isEmpty(data[i+2:lineEnd]) > 0 {
844		return i + 2, i + 2, lineEnd
845	}
846
847	titleOffset = i + 2
848	for data[titleOffset] == ' ' {
849		titleOffset++
850	}
851	titleEnd = lineEnd
852	for data[titleEnd-1] == ' ' {
853		titleEnd--
854	}
855
856	return
857}
858
// Miscellaneous helper functions

// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
// ispunct reports whether c is one of the 32 ASCII punctuation characters
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ which form four contiguous ranges.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/':
	case c >= ':' && c <= '@':
	case c >= '[' && c <= '`':
	case c >= '{' && c <= '~':
	default:
		return false
	}
	return true
}
872
// isupper reports whether c is an ASCII uppercase letter.
func isupper(c byte) bool {
	return 'A' <= c && c <= 'Z'
}

// isletter reports whether c is an ASCII letter of either case.
func isletter(c byte) bool {
	return isupper(c) || ('a' <= c && c <= 'z')
}

// isalnum reports whether c is an ASCII letter or digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	return isnum(c) || isletter(c)
}

// isnum reports whether c is an ASCII decimal digit.
func isnum(c byte) bool {
	return '0' <= c && c <= '9'
}
880
// isroman reports whether digit is one of the roman numeral letters i, v, x,
// c or l. If uppercase is true only the uppercase forms match, otherwise
// only the lowercase forms.
func isroman(digit byte, uppercase bool) bool {
	if uppercase {
		switch digit {
		case 'I', 'V', 'X', 'C', 'L':
			return true
		}
		return false
	}
	switch digit {
	case 'i', 'v', 'x', 'c', 'l':
		return true
	}
	return false
}
894
895// replace {{file.md}} with the contents of the file.
896func (p *parser) include(out *bytes.Buffer, data []byte, depth int) int {
897	i := 0
898	if len(data) < 3 {
899		return 0
900	}
901	if data[i] != '{' && data[i+1] != '{' {
902		return 0
903	}
904
905	// find the end delimiter
906	end, j := 0, 0
907	for end = i; end < len(data) && j < 2; end++ {
908		if data[end] == '}' {
909			j++
910		} else {
911			j = 0
912		}
913	}
914	if j < 2 && end >= len(data) {
915		return 0
916	}
917	filename := data[i+2 : end-2]
918
919	// Now a possible address in blockquotes
920	var address []byte
921	if end < len(data) && data[end] == '[' {
922		j = end
923		for j < len(data) && data[j] != ']' {
924			j++
925		}
926		if j == len(data) {
927			// assuming no address
928			address = nil
929		} else {
930			address = data[end+1 : j]
931			end = j + 1
932		}
933	}
934
935	input := parseAddress(address, filename)
936	if input == nil {
937		return end
938	}
939	if input[len(input)-1] != '\n' {
940		input = append(input, '\n')
941	}
942	first := firstPass(p, input, depth+1)
943	out.Write(first.Bytes())
944	return end
945}
946
947// replace <{{file.go}}[address] with the contents of the file. Pay attention to the indentation of the
948// include and prefix the code with that number of spaces + 4, it returns the new bytes and a boolean
949// indicating we've detected a code include.
950func (p *parser) codeInclude(out *bytes.Buffer, data []byte) int {
951	// TODO: this is not an inline element
952	i := 0
953	l := len(data)
954	if l < 3 {
955		return 0
956	}
957	if data[i] != '<' && data[i+1] != '{' && data[i+2] != '{' {
958		return 0
959	}
960
961	// find the end delimiter
962	end, j := 0, 0
963	for end = i; end < l && j < 2; end++ {
964		if data[end] == '}' {
965			j++
966		} else {
967			j = 0
968		}
969	}
970	if j < 2 && end >= l {
971		return 0
972	}
973
974	lang := ""
975	// found <{{filename}}
976	// this could be the end, or we could have an option [address] -block attached
977	filename := data[i+3 : end-2]
978	// get the extension of the filename, if it is a member of a predefined set a
979	// language we use it as the lang (and we will emit <sourcecode>)
980	if x := path.Ext(string(filename)); x != "" {
981		// x includes the dot
982		if _, ok := SourceCodeTypes[x[1:]]; ok {
983			lang = x[1:]
984		}
985	}
986
987	// Now a possible address in blockquotes
988	var address []byte
989	if end < l && data[end] == '[' {
990		j = end
991		for j < l && data[j] != ']' {
992			j++
993		}
994		if j == l {
995			// assuming no address
996			address = nil
997			end = l
998		} else {
999			address = data[end+1 : j]
1000			end = j + 1
1001		}
1002	}
1003
1004	code := parseAddress(address, filename)
1005
1006	if len(code) == 0 {
1007		code = []byte{'\n'}
1008	}
1009	if code[len(code)-1] != '\n' {
1010		code = append(code, '\n')
1011	}
1012
1013	// if the next line starts with Figure: we consider that a caption
1014	var caption bytes.Buffer
1015	if end < l-1 && bytes.HasPrefix(data[end+1:], []byte("Figure: ")) {
1016		line := end + 1
1017		j := end + 1
1018		for line < l {
1019			j++
1020			// find the end of this line
1021			for j <= l && data[j-1] != '\n' {
1022				j++
1023			}
1024			if p.isEmpty(data[line:j]) > 0 {
1025				break
1026			}
1027			line = j
1028		}
1029		p.inline(&caption, data[end+1+8:j-1]) // +8 for 'Figure: '
1030		end = j - 1
1031	}
1032
1033	co := ""
1034	if p.ial != nil {
1035		co = p.ial.Value("callout")
1036		p.ial.DropAttr("callout")
1037	}
1038
1039	p.r.SetAttr(p.ial)
1040	p.ial = nil
1041
1042	if co != "" {
1043		var callout bytes.Buffer
1044		callouts(p, &callout, code, 0, co)
1045		p.r.BlockCode(out, callout.Bytes(), lang, caption.Bytes(), p.insideFigure, true)
1046	} else {
1047		p.callouts = nil
1048		p.r.BlockCode(out, code, lang, caption.Bytes(), p.insideFigure, false)
1049	}
1050	p.r.SetAttr(nil) // reset it again. TODO(miek): double check
1051
1052	return end
1053}
1054
// expandTabs writes line to out with every tab character replaced by spaces,
// aligning to the next multiple of tabSize columns. Columns are counted in
// runes, not bytes. Nothing is appended after the line; in particular no
// newline is written.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Measure the run of tabs at the very start of the line.
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}

	// Common cases: no tabs at all, or tabs only at the beginning of the
	// line. Each leading tab is a full tab stop and no runes need decoding.
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for n := leading * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// General case: track the column while copying, so that every tab can be
	// padded up to the next tab stop.
	col := 0
	for pos := 0; pos < len(line); pos++ {
		// copy the text up to the next tab, one rune per column
		segment := pos
		for pos < len(line) && line[pos] != '\t' {
			_, width := utf8.DecodeRune(line[pos:])
			pos += width
			col++
		}
		if pos > segment {
			out.Write(line[segment:pos])
		}
		if pos >= len(line) {
			break
		}

		// a tab always yields at least one space
		out.WriteByte(' ')
		col++
		for col%tabSize != 0 {
			out.WriteByte(' ')
			col++
		}
	}
}
1112
// isIndented reports whether a line counts as indented: it starts with a tab
// or with indentSize spaces. It returns the number of bytes the indent takes
// up (1 for a tab, indentSize for spaces), or 0 if the line is not indented.
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}
	// every one of the first indentSize bytes has to be a space
	for n := indentSize - 1; n >= 0; n-- {
		if data[n] != ' ' {
			return 0
		}
	}
	return indentSize
}
1132
1133// Create a url-safe slug for fragments
1134func slugify(in []byte) []byte {
1135	if len(in) == 0 {
1136		return in
1137	}
1138	out := make([]byte, 0, len(in))
1139	sym := false
1140
1141	for _, ch := range in {
1142		if isalnum(ch) {
1143			sym = false
1144			out = append(out, ch)
1145		} else if sym {
1146			continue
1147		} else {
1148			out = append(out, '-')
1149			sym = true
1150		}
1151	}
1152	var a, b int
1153	var ch byte
1154	for a, ch = range out {
1155		if ch != '-' {
1156			break
1157		}
1158	}
1159	for b = len(out) - 1; b > 0; b-- {
1160		if out[b] != '-' {
1161			break
1162		}
1163	}
1164	return out[a : b+1]
1165}
1166