1// Blackfriday Markdown Processor
2// Available at http://github.com/russross/blackfriday
3//
4// Copyright © 2011 Russ Ross <russ@russross.com>.
5// Distributed under the Simplified BSD License.
6// See README.md for details.
7
8package blackfriday
9
10import (
11	"bytes"
12	"fmt"
13	"io"
14	"strings"
15	"unicode/utf8"
16)
17
18//
19// Markdown parsing and processing
20//
21
// Version is the version string of the package. It appears in the rendered
// document when the CompletePage flag is on.
const Version = "2.0"
25
// Extensions is a bitwise OR'ed collection of enabled Blackfriday
// extensions.
type Extensions int

// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// NoExtensions occupies the first position of the const block (iota == 0), so
// the flag values start at 1 << 1 with NoIntraEmphasis and each following
// flag takes the next higher bit.
const (
	NoExtensions           Extensions = 0
	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
	Tables                                        // Render tables
	FencedCode                                    // Render fenced code blocks
	Autolink                                      // Detect embedded URLs that are not explicitly marked
	Strikethrough                                 // Strikethrough text using ~~test~~
	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
	SpaceHeadings                                 // Be strict about prefix heading rules
	HardLineBreak                                 // Translate newlines into line breaks
	TabSizeEight                                  // Expand tabs to eight spaces instead of four
	Footnotes                                     // Pandoc-style footnotes
	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	HeadingIDs                                    // Specify heading IDs with {#id}
	Titleblock                                    // Titleblock ala pandoc
	AutoHeadingIDs                                // Create the heading ID from the text
	BackslashLineBreak                            // Translate trailing backslashes into line breaks
	DefinitionLists                               // Render definition lists

	// CommonHTMLFlags is the set of HTML renderer flags that Run passes to
	// its default HTML renderer.
	CommonHTMLFlags HTMLFlags = UseXHTML | Smartypants |
		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes

	// CommonExtensions is the set of parser extensions that Run enables by
	// default.
	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
		Autolink | Strikethrough | SpaceHeadings | HeadingIDs |
		BackslashLineBreak | DefinitionLists
)
58
// ListType contains bitwise OR'ed flags for list and list item objects.
type ListType int

// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The blank line inside the block does not reset iota: the six flags occupy
// consecutive bits, from 1 << 0 (ListTypeOrdered) to 1 << 5
// (ListItemEndOfList).
const (
	ListTypeOrdered ListType = 1 << iota
	ListTypeDefinition
	ListTypeTerm

	ListItemContainsBlock
	ListItemBeginningOfList // TODO: figure out if this is of any use now
	ListItemEndOfList
)
74
// CellAlignFlags holds a type of alignment in a table cell.
type CellAlignFlags int

// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together
// by callers. Note that TableAlignmentCenter is itself defined as the
// combination of the left and right bits (value 3).
// These are mostly of interest if you are writing a new output format.
const (
	TableAlignmentLeft CellAlignFlags = 1 << iota
	TableAlignmentRight
	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
)
86
// The size of a tab stop, in columns.
const (
	// TabSizeDefault is the regular tab stop width.
	TabSizeDefault = 4
	// TabSizeDouble is the doubled tab stop width (eight columns).
	TabSizeDouble = 8
)
92
// blockTags is a set of tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// Keys are lower-case tag names; the empty struct values carry no data, the
// map is only used for membership tests.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"address":    {},
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"main":       {},
	"nav":        {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
137
// Renderer is the rendering interface. This is mostly of interest if you are
// implementing a new rendering format.
//
// Only an HTML implementation is provided in this repository, see the README
// for external implementations.
//
// Run calls the methods in this order: RenderHeader once, RenderNode for
// every step of the tree walk, and finally RenderFooter once.
type Renderer interface {
	// RenderNode is the main rendering method. It will be called once for
	// every leaf node and twice for every non-leaf node (first with
	// entering=true, then with entering=false). The method should write its
	// rendition of the node to the supplied writer w. The returned WalkStatus
	// is handed back to the tree walk that invoked the method.
	RenderNode(w io.Writer, node *Node, entering bool) WalkStatus

	// RenderHeader is a method that allows the renderer to produce some
	// content preceding the main body of the output document. The header is
	// understood in the broad sense here. For example, the default HTML
	// renderer will write not only the HTML document preamble, but also the
	// table of contents if it was requested.
	//
	// The method will be passed an entire document tree, in case a particular
	// implementation needs to inspect it to produce output.
	//
	// The output should be written to the supplied writer w. If your
	// implementation has no header to write, supply an empty implementation.
	RenderHeader(w io.Writer, ast *Node)

	// RenderFooter is a symmetric counterpart of RenderHeader: it is called
	// after the document body has been rendered and receives the same tree.
	RenderFooter(w io.Writer, ast *Node)
}
166
// inlineParser is the callback type for inline parsing. One such function is
// registered (see New) for each byte that triggers a response when parsing
// inline data.
//
// NOTE(review): judging by the signature, data is the inline text, offset the
// position of the trigger byte within it, and the results are the number of
// bytes consumed plus an optional node to insert - confirm against the inline
// parsing loop, which is defined elsewhere.
type inlineParser func(p *Markdown, data []byte, offset int) (int, *Node)
170
// Markdown is a type that holds extensions and the runtime state used by
// Parse, and the renderer. You can not use it directly, construct it with New.
type Markdown struct {
	// renderer is set through WithRenderer and used by Run to produce output.
	renderer Renderer
	// referenceOverride, when non-nil, is consulted by getRef before refs.
	referenceOverride ReferenceOverrideFunc
	// refs maps lower-cased reference ids to their definitions.
	refs map[string]*reference
	// inlineCallback holds the inline parsers, indexed by their trigger byte.
	inlineCallback [256]inlineParser
	// extensions is the set of enabled parser extensions.
	extensions Extensions
	// NOTE(review): nesting/maxNesting presumably track and bound the inline
	// nesting depth; New sets maxNesting to 16 - confirm against their users.
	nesting    int
	maxNesting int
	// insideLink is reset to false by New; presumably set while a link's
	// content is being parsed - confirm against the inline parser.
	insideLink bool

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference

	doc                  *Node // root of the document tree, returned by Parse
	tip                  *Node // = doc; the open node new children are appended to
	oldTip               *Node // start of the chain walked by closeUnmatchedBlocks
	lastMatchedContainer *Node // = doc; where closeUnmatchedBlocks stops
	allClosed            bool  // true when closeUnmatchedBlocks has nothing to do
}
194
195func (p *Markdown) getRef(refid string) (ref *reference, found bool) {
196	if p.referenceOverride != nil {
197		r, overridden := p.referenceOverride(refid)
198		if overridden {
199			if r == nil {
200				return nil, false
201			}
202			return &reference{
203				link:     []byte(r.Link),
204				title:    []byte(r.Title),
205				noteID:   0,
206				hasBlock: false,
207				text:     []byte(r.Text)}, true
208		}
209	}
210	// refs are case insensitive
211	ref, found = p.refs[strings.ToLower(refid)]
212	return ref, found
213}
214
215func (p *Markdown) finalize(block *Node) {
216	above := block.Parent
217	block.open = false
218	p.tip = above
219}
220
221func (p *Markdown) addChild(node NodeType, offset uint32) *Node {
222	return p.addExistingChild(NewNode(node), offset)
223}
224
225func (p *Markdown) addExistingChild(node *Node, offset uint32) *Node {
226	for !p.tip.canContain(node.Type) {
227		p.finalize(p.tip)
228	}
229	p.tip.AppendChild(node)
230	p.tip = node
231	return node
232}
233
234func (p *Markdown) closeUnmatchedBlocks() {
235	if !p.allClosed {
236		for p.oldTip != p.lastMatchedContainer {
237			parent := p.oldTip.Parent
238			p.finalize(p.oldTip)
239			p.oldTip = parent
240		}
241		p.allClosed = true
242	}
243}
244
245//
246//
247// Public interface
248//
249//
250
// Reference represents the details of a link.
// It is the value a ReferenceOverrideFunc returns to resolve a reference id;
// see WithRefOverride for the use-case.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
262
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not
// found. See WithRefOverride for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
268
269// New constructs a Markdown processor. You can use the same With* functions as
270// for Run() to customize parser's behavior and the renderer.
271func New(opts ...Option) *Markdown {
272	var p Markdown
273	for _, opt := range opts {
274		opt(&p)
275	}
276	p.refs = make(map[string]*reference)
277	p.maxNesting = 16
278	p.insideLink = false
279	docNode := NewNode(Document)
280	p.doc = docNode
281	p.tip = docNode
282	p.oldTip = docNode
283	p.lastMatchedContainer = docNode
284	p.allClosed = true
285	// register inline parsers
286	p.inlineCallback[' '] = maybeLineBreak
287	p.inlineCallback['*'] = emphasis
288	p.inlineCallback['_'] = emphasis
289	if p.extensions&Strikethrough != 0 {
290		p.inlineCallback['~'] = emphasis
291	}
292	p.inlineCallback['`'] = codeSpan
293	p.inlineCallback['\n'] = lineBreak
294	p.inlineCallback['['] = link
295	p.inlineCallback['<'] = leftAngle
296	p.inlineCallback['\\'] = escape
297	p.inlineCallback['&'] = entity
298	p.inlineCallback['!'] = maybeImage
299	p.inlineCallback['^'] = maybeInlineFootnote
300	if p.extensions&Autolink != 0 {
301		p.inlineCallback['h'] = maybeAutoLink
302		p.inlineCallback['m'] = maybeAutoLink
303		p.inlineCallback['f'] = maybeAutoLink
304		p.inlineCallback['H'] = maybeAutoLink
305		p.inlineCallback['M'] = maybeAutoLink
306		p.inlineCallback['F'] = maybeAutoLink
307	}
308	if p.extensions&Footnotes != 0 {
309		p.notes = make([]*reference, 0)
310	}
311	return &p
312}
313
// Option customizes the Markdown processor's default behavior. New applies
// the options in the order they are passed, so a later option overrides an
// earlier one that sets the same field.
type Option func(*Markdown)
316
317// WithRenderer allows you to override the default renderer.
318func WithRenderer(r Renderer) Option {
319	return func(p *Markdown) {
320		p.renderer = r
321	}
322}
323
324// WithExtensions allows you to pick some of the many extensions provided by
325// Blackfriday. You can bitwise OR them.
326func WithExtensions(e Extensions) Option {
327	return func(p *Markdown) {
328		p.extensions = e
329	}
330}
331
332// WithNoExtensions turns off all extensions and custom behavior.
333func WithNoExtensions() Option {
334	return func(p *Markdown) {
335		p.extensions = NoExtensions
336		p.renderer = NewHTMLRenderer(HTMLRendererParameters{
337			Flags: HTMLFlagsNone,
338		})
339	}
340}
341
342// WithRefOverride sets an optional function callback that is called every
343// time a reference is resolved.
344//
345// In Markdown, the link reference syntax can be made to resolve a link to
346// a reference instead of an inline URL, in one of the following ways:
347//
348//  * [link text][refid]
349//  * [refid][]
350//
351// Usually, the refid is defined at the bottom of the Markdown document. If
352// this override function is provided, the refid is passed to the override
353// function first, before consulting the defined refids at the bottom. If
354// the override function indicates an override did not occur, the refids at
355// the bottom will be used to fill in the link details.
356func WithRefOverride(o ReferenceOverrideFunc) Option {
357	return func(p *Markdown) {
358		p.referenceOverride = o
359	}
360}
361
362// Run is the main entry point to Blackfriday. It parses and renders a
363// block of markdown-encoded text.
364//
365// The simplest invocation of Run takes one argument, input:
366//     output := Run(input)
367// This will parse the input with CommonExtensions enabled and render it with
368// the default HTMLRenderer (with CommonHTMLFlags).
369//
370// Variadic arguments opts can customize the default behavior. Since Markdown
371// type does not contain exported fields, you can not use it directly. Instead,
372// use the With* functions. For example, this will call the most basic
373// functionality, with no extensions:
374//     output := Run(input, WithNoExtensions())
375//
376// You can use any number of With* arguments, even contradicting ones. They
377// will be applied in order of appearance and the latter will override the
378// former:
379//     output := Run(input, WithNoExtensions(), WithExtensions(exts),
380//         WithRenderer(yourRenderer))
381func Run(input []byte, opts ...Option) []byte {
382	r := NewHTMLRenderer(HTMLRendererParameters{
383		Flags: CommonHTMLFlags,
384	})
385	optList := []Option{WithRenderer(r), WithExtensions(CommonExtensions)}
386	optList = append(optList, opts...)
387	parser := New(optList...)
388	ast := parser.Parse(input)
389	var buf bytes.Buffer
390	parser.renderer.RenderHeader(&buf, ast)
391	ast.Walk(func(node *Node, entering bool) WalkStatus {
392		return parser.renderer.RenderNode(&buf, node, entering)
393	})
394	parser.renderer.RenderFooter(&buf, ast)
395	return buf.Bytes()
396}
397
398// Parse is an entry point to the parsing part of Blackfriday. It takes an
399// input markdown document and produces a syntax tree for its contents. This
400// tree can then be rendered with a default or custom renderer, or
401// analyzed/transformed by the caller to whatever non-standard needs they have.
402// The return value is the root node of the syntax tree.
403func (p *Markdown) Parse(input []byte) *Node {
404	p.block(input)
405	// Walk the tree and finish up some of unfinished blocks
406	for p.tip != nil {
407		p.finalize(p.tip)
408	}
409	// Walk the tree again and process inline markdown in each block
410	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
411		if node.Type == Paragraph || node.Type == Heading || node.Type == TableCell {
412			p.inline(node, node.content)
413			node.content = nil
414		}
415		return GoToNext
416	})
417	p.parseRefsToAST()
418	return p.doc
419}
420
// parseRefsToAST appends the collected footnotes to the document as an
// ordered list flagged with IsFootnotesList. It does nothing unless the
// Footnotes extension is enabled and at least one note has been recorded.
//
// Every note's pre-created item node (ref.footnote) is attached to the list
// and its text (ref.title) is parsed either as block content or as inline
// content, depending on ref.hasBlock. Finally the inline content of the
// paragraphs and headings below the list is processed.
func (p *Markdown) parseRefsToAST() {
	if p.extensions&Footnotes == 0 || len(p.notes) == 0 {
		return
	}
	// attach the footnotes list directly below the document root
	p.tip = p.doc
	block := p.addBlock(List, nil)
	block.IsFootnotesList = true
	block.ListFlags = ListTypeOrdered
	flags := ListItemBeginningOfList
	// Note: this loop is intentionally explicit, not range-form. This is
	// because the body of the loop will append nested footnotes to p.notes and
	// we need to process those late additions. Range form would only walk over
	// the fixed initial set.
	for i := 0; i < len(p.notes); i++ {
		ref := p.notes[i]
		p.addExistingChild(ref.footnote, 0)
		// this inner block shadows the list node declared above: it is the
		// list item of the current footnote
		block := ref.footnote
		block.ListFlags = flags | ListTypeOrdered
		block.RefLink = ref.link
		if ref.hasBlock {
			// NOTE(review): the item's ListFlags were already assigned above
			// and the bit is cleared again at the end of this iteration, so
			// this update does not reach any node's ListFlags - confirm
			// whether the item was meant to carry ListItemContainsBlock.
			flags |= ListItemContainsBlock
			p.block(ref.title)
		} else {
			p.inline(block, ref.title)
		}
		// only the first item is flagged as the beginning of the list
		flags &^= ListItemBeginningOfList | ListItemContainsBlock
	}
	// from here on block is the footnotes list node again
	above := block.Parent
	finalizeList(block)
	p.tip = above
	// process the inline content of the blocks parsed by p.block above
	block.Walk(func(node *Node, entering bool) WalkStatus {
		if node.Type == Paragraph || node.Type == Heading {
			p.inline(node, node.content)
			node.content = nil
		}
		return GoToNext
	})
}
459
460//
461// Link references
462//
463// This section implements support for references that (usually) appear
464// as footnotes in a document, and can be referenced anywhere in the document.
465// The basic format is:
466//
467//    [1]: http://www.google.com/ "Google"
468//    [2]: http://www.github.com/ "Github"
469//
470// Anywhere in the document, the reference can be linked by referring to its
471// label, i.e., 1 and 2 in this example, as in:
472//
473//    This library is hosted on [Github][2], a git hosting site.
474//
475// Actual footnotes as specified in Pandoc and supported by some other Markdown
476// libraries such as php-markdown are also taken care of. They look like this:
477//
478//    This sentence needs a bit of further explanation.[^note]
479//
480//    [^note]: This is the explanation.
481//
482// Footnotes should be placed at the end of the document in an ordered list.
483// Finally, there are inline footnotes such as:
484//
485//    Inline footnotes^[Also supported.] provide a quick inline explanation,
486//    but are rendered at the bottom of the document.
487//
488
// reference holds all the information necessary for a reference-style link
// or a footnote.
//
// Consider this markdown with reference-style links:
//
//     [link][ref]
//
//     [ref]: /url/ "tooltip title"
//
// It will be ultimately converted to this HTML:
//
//     <p><a href="/url/" title="tooltip title">link</a></p>
//
// And a reference structure will be populated as follows:
//
//     p.refs["ref"] = &reference{
//         link: "/url/",
//         title: "tooltip title",
//     }
//
// Alternatively, reference can contain information about a footnote. Consider
// this markdown:
//
//     Text needing a footnote.[^a]
//
//     [^a]: This is the note
//
// A reference structure will be populated as follows:
//
//     p.refs["a"] = &reference{
//         link: "a",
//         title: "This is the note",
//         noteID: <some positive int>,
//     }
//
// TODO: As you can see, it begs for splitting into two dedicated structures
// for refs and for footnotes.
type reference struct {
	// link is the link destination; for footnotes it holds the footnote id
	link []byte
	// title is the link title; for footnotes it holds the footnote's text
	title    []byte
	noteID   int // 0 if not a footnote ref
	// hasBlock reports whether a footnote's text is to be parsed as block
	// content rather than as inline content
	hasBlock bool
	footnote *Node // a link to the Item node within a list of footnotes

	text []byte // only gets populated by refOverride feature with Reference.Text
}
535
536func (r *reference) String() string {
537	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
538		r.link, r.title, r.text, r.noteID, r.hasBlock)
539}
540
// isReference checks whether or not data starts with a link reference or
// footnote definition. If so, it is parsed and stored in p.refs under its
// lower-cased id.
//
// tabSize is only used for footnotes, where it is handed to scanFootnote as
// the indentation size of continuation lines.
//
// Returns the offset in data of the end of the definition, i.e. the number of
// bytes to skip to move past it, or zero if the first line is not a
// reference.
func isReference(p *Markdown, data []byte, tabSize int) int {
	// up to 3 optional leading spaces
	if len(data) < 4 {
		return 0
	}
	i := 0
	for i < 3 && data[i] == ' ' {
		i++
	}

	noteID := 0

	// id part: anything but a newline between brackets
	if data[i] != '[' {
		return 0
	}
	i++
	if p.extensions&Footnotes != 0 {
		if i < len(data) && data[i] == '^' {
			// we can set it to anything here because the proper noteIds will
			// be assigned later during the second pass. It just has to be != 0
			noteID = 1
			i++
		}
	}
	idOffset := i
	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
		i++
	}
	if i >= len(data) || data[i] != ']' {
		return 0
	}
	idEnd := i
	// footnotes can have empty ID, like this: [^], but a reference can not be
	// empty like this: []. Break early if it's not a footnote and there's no ID
	if noteID == 0 && idOffset == idEnd {
		return 0
	}
	// spacer: colon (space | tab)* newline? (space | tab)*
	i++
	if i >= len(data) || data[i] != ':' {
		return 0
	}
	i++
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
		i++
		// a "\r\n" pair counts as a single newline
		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
			i++
		}
	}
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	// nothing follows the spacer: not a definition
	if i >= len(data) {
		return 0
	}

	var (
		linkOffset, linkEnd   int
		titleOffset, titleEnd int
		lineEnd               int
		raw                   []byte
		hasBlock              bool
	)

	// the remainder is either a footnote body or a link destination with an
	// optional title
	if p.extensions&Footnotes != 0 && noteID != 0 {
		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
		lineEnd = linkEnd
	} else {
		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
	}
	// the scanners signal "not a valid definition" with a zero end offset
	if lineEnd == 0 {
		return 0
	}

	// a valid ref has been found

	ref := &reference{
		noteID:   noteID,
		hasBlock: hasBlock,
	}

	if noteID > 0 {
		// reusing the link field for the id since footnotes don't have links
		ref.link = data[idOffset:idEnd]
		// if footnote, it's not really a title, it's the contained text
		ref.title = raw
	} else {
		ref.link = data[linkOffset:linkEnd]
		ref.title = data[titleOffset:titleEnd]
	}

	// id matches are case-insensitive
	id := string(bytes.ToLower(data[idOffset:idEnd]))

	// a later definition with the same id replaces an earlier one
	p.refs[id] = ref

	return lineEnd
}
648
649func scanLinkRef(p *Markdown, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
650	// link: whitespace-free sequence, optionally between angle brackets
651	if data[i] == '<' {
652		i++
653	}
654	linkOffset = i
655	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
656		i++
657	}
658	linkEnd = i
659	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
660		linkOffset++
661		linkEnd--
662	}
663
664	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
665	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
666		i++
667	}
668	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
669		return
670	}
671
672	// compute end-of-line
673	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
674		lineEnd = i
675	}
676	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
677		lineEnd++
678	}
679
680	// optional (space|tab)* spacer after a newline
681	if lineEnd > 0 {
682		i = lineEnd + 1
683		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
684			i++
685		}
686	}
687
688	// optional title: any non-newline sequence enclosed in '"() alone on its line
689	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
690		i++
691		titleOffset = i
692
693		// look for EOL
694		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
695			i++
696		}
697		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
698			titleEnd = i + 1
699		} else {
700			titleEnd = i
701		}
702
703		// step back
704		i--
705		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
706			i--
707		}
708		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
709			lineEnd = titleEnd
710			titleEnd = i
711		}
712	}
713
714	return
715}
716
// scanFootnote extracts the text of a footnote definition.
//
// The first bit of this logic is reportedly the same as in the list item
// parser (defined elsewhere), but the rest is much simpler. This function
// simply finds the entire block and shifts it over by one indentation level
// if it is indeed a block (just returns the line if it's not).
//
// data is the input buffer, i the offset where the footnote text starts and
// indentSize the number of spaces that make up one level of indentation.
//
// blockStart is the offset of the first byte of the text after leading
// spaces, blockEnd is the end of the section in the input buffer (trailing
// blank lines included), and contents is the extracted text with the leading
// indentation of continuation lines removed. It will need to be rendered at
// the end of the document. hasBlock reports whether at least one indented
// continuation line was found. All results are zero when i is 0 or data is
// empty.
func scanFootnote(p *Markdown, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
	if i == 0 || len(data) == 0 {
		return
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	blockStart = i

	// find the end of the line
	blockEnd = i
	for i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[blockEnd:i])
	blockEnd = i

	// process the following lines
	containsBlankLine := false

gatherLines:
	for blockEnd < len(data) {
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[blockEnd:i]) > 0 {
			containsBlankLine = true
			blockEnd = i
			continue
		}

		// n is the width of the line's indentation, 0 if it is not indented
		n := 0
		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
			// this is the end of the block.
			// we don't want to include this last line in the index.
			break gatherLines
		}

		// if there were blank lines before this one, insert a new one now
		// (a run of blank lines collapses into a single one)
		if containsBlankLine {
			raw.WriteByte('\n')
			containsBlankLine = false
		}

		// get rid of that first tab, write to buffer
		raw.Write(data[blockEnd+n : i])
		hasBlock = true

		blockEnd = i
	}

	// make sure the extracted text ends with a newline
	if data[blockEnd-1] != '\n' {
		raw.WriteByte('\n')
	}

	contents = raw.Bytes()

	return
}
796
797//
798//
799// Miscellaneous helper functions
800//
801//
802
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// These are exactly the printable ASCII characters that are neither letters,
// digits nor the space, and they form four contiguous ranges.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/':
		return true
	case c >= ':' && c <= '@':
		return true
	case c >= '[' && c <= '`':
		return true
	case c >= '{' && c <= '~':
		return true
	}
	return false
}
813
814// Test if a character is a whitespace character.
815func isspace(c byte) bool {
816	return ishorizontalspace(c) || isverticalspace(c)
817}
818
// ishorizontalspace reports whether c is a horizontal whitespace character:
// a space or a tab.
func ishorizontalspace(c byte) bool {
	switch c {
	case ' ', '\t':
		return true
	}
	return false
}
823
// isverticalspace reports whether c is a vertical whitespace character:
// line feed, carriage return, form feed or vertical tab.
func isverticalspace(c byte) bool {
	switch c {
	case '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
828
// isletter reports whether c is an ASCII letter, lower or upper case.
func isletter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
833
834// Test if a character is a letter or a digit.
835// TODO: check when this is looking for ASCII alnum and when it should use unicode
836func isalnum(c byte) bool {
837	return (c >= '0' && c <= '9') || isletter(c)
838}
839
// expandTabs appends line to out with every tab character replaced by
// spaces, so that the text following a tab starts at the next multiple of
// tabSize columns. Columns are counted in runes, not bytes. Nothing besides
// the expanded line is written; in particular no newline is added.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Common cases first: no tabs at all, or tabs only at the beginning of the
	// line. Those need no rune decoding, as every leading tab is exactly
	// tabSize columns wide.
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for n := leading * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// The slow case: a tab follows other text, so the width of that text has
	// to be known in order to find the next tab stop.
	column := 0
	for rest := line; len(rest) > 0; {
		tab := bytes.IndexByte(rest, '\t')
		if tab < 0 {
			// no more tabs, copy the remainder verbatim
			out.Write(rest)
			break
		}

		// copy the text in front of the tab and account for its width
		out.Write(rest[:tab])
		column += utf8.RuneCount(rest[:tab])

		// a tab is at least one column wide and extends to the next tab stop
		out.WriteByte(' ')
		column++
		for column%tabSize != 0 {
			out.WriteByte(' ')
			column++
		}

		rest = rest[tab+1:]
	}
}
897
// isIndented finds out whether a line counts as indented. A line is indented
// if it starts with a tab or with at least indentSize spaces.
// Returns the number of bytes the indent occupies: 1 for a tab, indentSize
// for spaces, 0 if the line is not indented.
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}

	// count the leading spaces, no further than the required indent
	spaces := 0
	for spaces < indentSize && data[spaces] == ' ' {
		spaces++
	}
	if spaces < indentSize {
		return 0
	}
	return indentSize
}
917
918// Create a url-safe slug for fragments
919func slugify(in []byte) []byte {
920	if len(in) == 0 {
921		return in
922	}
923	out := make([]byte, 0, len(in))
924	sym := false
925
926	for _, ch := range in {
927		if isalnum(ch) {
928			sym = false
929			out = append(out, ch)
930		} else if sym {
931			continue
932		} else {
933			out = append(out, '-')
934			sym = true
935		}
936	}
937	var a, b int
938	var ch byte
939	for a, ch = range out {
940		if ch != '-' {
941			break
942		}
943	}
944	for b = len(out) - 1; b > 0; b-- {
945		if out[b] != '-' {
946			break
947		}
948	}
949	return out[a : b+1]
950}
951