1 // Copyright 2017 Google Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 //! Tree-based two pass parser.
22 
23 use std::cmp::{max, min};
24 use std::collections::{HashMap, VecDeque};
25 use std::ops::{Index, Range};
26 
27 use unicase::UniCase;
28 
29 use crate::linklabel::{scan_link_label_rest, LinkLabel, ReferenceLabel};
30 use crate::scanners::*;
31 use crate::strings::CowStr;
32 use crate::tree::{Tree, TreeIndex};
33 
34 // Allowing arbitrary depth nested parentheses inside link destinations
35 // can create denial of service vulnerabilities if we're not careful.
36 // The simplest countermeasure is to limit their depth, which is
37 // explicitly allowed by the spec as long as the limit is at least 3:
38 // https://spec.commonmark.org/0.29/#link-destination
39 const LINK_MAX_NESTED_PARENS: usize = 5;
40 
41 /// Codeblock kind.
42 #[derive(Clone, Debug, PartialEq)]
43 pub enum CodeBlockKind<'a> {
44     Indented,
45     /// The value contained in the tag describes the language of the code, which may be empty.
46     Fenced(CowStr<'a>),
47 }
48 
49 impl<'a> CodeBlockKind<'a> {
is_indented(&self) -> bool50     pub fn is_indented(&self) -> bool {
51         match *self {
52             CodeBlockKind::Indented => true,
53             _ => false,
54         }
55     }
56 
is_fenced(&self) -> bool57     pub fn is_fenced(&self) -> bool {
58         match *self {
59             CodeBlockKind::Fenced(_) => true,
60             _ => false,
61         }
62     }
63 }
64 
65 /// Tags for elements that can contain other elements.
66 #[derive(Clone, Debug, PartialEq)]
67 pub enum Tag<'a> {
68     /// A paragraph of text and other inline elements.
69     Paragraph,
70 
71     /// A heading. The field indicates the level of the heading.
72     Heading(u32),
73 
74     BlockQuote,
75     /// A code block.
76     CodeBlock(CodeBlockKind<'a>),
77 
78     /// A list. If the list is ordered the field indicates the number of the first item.
79     /// Contains only list items.
80     List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
81     /// A list item.
82     Item,
83     /// A footnote definition. The value contained is the footnote's label by which it can
84     /// be referred to.
85     FootnoteDefinition(CowStr<'a>),
86 
87     /// A table. Contains a vector describing the text-alignment for each of its columns.
88     Table(Vec<Alignment>),
89     /// A table header. Contains only `TableRow`s. Note that the table body starts immediately
90     /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
91     TableHead,
92     /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
93     TableRow,
94     TableCell,
95 
96     // span-level tags
97     Emphasis,
98     Strong,
99     Strikethrough,
100 
101     /// A link. The first field is the link type, the second the destination URL and the third is a title.
102     Link(LinkType, CowStr<'a>, CowStr<'a>),
103 
104     /// An image. The first field is the link type, the second the destination URL and the third is a title.
105     Image(LinkType, CowStr<'a>, CowStr<'a>),
106 }
107 
108 /// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
109 #[derive(Clone, Debug, PartialEq, Copy)]
110 pub enum LinkType {
111     /// Inline link like `[foo](bar)`
112     Inline,
113     /// Reference link like `[foo][bar]`
114     Reference,
115     /// Reference without destination in the document, but resolved by the broken_link_callback
116     ReferenceUnknown,
117     /// Collapsed link like `[foo][]`
118     Collapsed,
119     /// Collapsed link without destination in the document, but resolved by the broken_link_callback
120     CollapsedUnknown,
121     /// Shortcut link like `[foo]`
122     Shortcut,
123     /// Shortcut without destination in the document, but resolved by the broken_link_callback
124     ShortcutUnknown,
125     /// Autolink like `<http://foo.bar/baz>`
126     Autolink,
127     /// Email address in autolink like `<john@example.org>`
128     Email,
129 }
130 
131 impl LinkType {
to_unknown(self) -> Self132     fn to_unknown(self) -> Self {
133         match self {
134             LinkType::Reference => LinkType::ReferenceUnknown,
135             LinkType::Collapsed => LinkType::CollapsedUnknown,
136             LinkType::Shortcut => LinkType::ShortcutUnknown,
137             _ => unreachable!(),
138         }
139     }
140 }
141 
142 /// Markdown events that are generated in a preorder traversal of the document
143 /// tree, with additional `End` events whenever all of an inner node's children
144 /// have been visited.
145 #[derive(Clone, Debug, PartialEq)]
146 pub enum Event<'a> {
147     /// Start of a tagged element. Events that are yielded after this event
148     /// and before its corresponding `End` event are inside this element.
149     /// Start and end events are guaranteed to be balanced.
150     Start(Tag<'a>),
151     /// End of a tagged element.
152     End(Tag<'a>),
153     /// A text node.
154     Text(CowStr<'a>),
155     /// An inline code node.
156     Code(CowStr<'a>),
157     /// An HTML node.
158     Html(CowStr<'a>),
159     /// A reference to a footnote with given label, which may or may not be defined
160     /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
161     /// occur in any order.
162     FootnoteReference(CowStr<'a>),
163     /// A soft line break.
164     SoftBreak,
165     /// A hard line break.
166     HardBreak,
167     /// A horizontal ruler.
168     Rule,
169     /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
170     TaskListMarker(bool),
171 }
172 
173 /// Table column text alignment.
174 #[derive(Copy, Clone, Debug, PartialEq)]
175 pub enum Alignment {
176     /// Default text alignment.
177     None,
178     Left,
179     Center,
180     Right,
181 }
182 
183 bitflags! {
184     /// Option struct containing flags for enabling extra features
185     /// that are not part of the CommonMark spec.
186     pub struct Options: u32 {
187         const ENABLE_TABLES = 1 << 1;
188         const ENABLE_FOOTNOTES = 1 << 2;
189         const ENABLE_STRIKETHROUGH = 1 << 3;
190         const ENABLE_TASKLISTS = 1 << 4;
191         const ENABLE_SMART_PUNCTUATION = 1 << 5;
192     }
193 }
194 
195 #[derive(Debug, Default, Clone, Copy)]
196 struct Item {
197     start: usize,
198     end: usize,
199     body: ItemBody,
200 }
201 
202 #[derive(Debug, PartialEq, Clone, Copy)]
203 enum ItemBody {
204     Paragraph,
205     Text,
206     SoftBreak,
207     HardBreak,
208 
209     // These are possible inline items, need to be resolved in second pass.
210 
211     // repeats, can_open, can_close
212     MaybeEmphasis(usize, bool, bool),
213     // quote byte, can_open, can_close
214     MaybeSmartQuote(u8, bool, bool),
215     MaybeCode(usize, bool), // number of backticks, preceeded by backslash
216     MaybeHtml,
217     MaybeLinkOpen,
218     // bool indicates whether or not the preceeding section could be a reference
219     MaybeLinkClose(bool),
220     MaybeImage,
221 
222     // These are inline items after resolution.
223     Emphasis,
224     Strong,
225     Strikethrough,
226     Code(CowIndex),
227     Link(LinkIndex),
228     Image(LinkIndex),
229     FootnoteReference(CowIndex),
230     TaskListMarker(bool), // true for checked
231 
232     Rule,
233     Heading(u32), // heading level
234     FencedCodeBlock(CowIndex),
235     IndentCodeBlock,
236     Html,
237     OwnedHtml(CowIndex),
238     BlockQuote,
239     List(bool, u8, u64), // is_tight, list character, list start index
240     ListItem(usize),     // indent level
241     SynthesizeText(CowIndex),
242     SynthesizeChar(char),
243     FootnoteDefinition(CowIndex),
244 
245     // Tables
246     Table(AlignmentIndex),
247     TableHead,
248     TableRow,
249     TableCell,
250 
251     // Dummy node at the top of the tree - should not be used otherwise!
252     Root,
253 }
254 
255 impl<'a> ItemBody {
is_inline(&self) -> bool256     fn is_inline(&self) -> bool {
257         match *self {
258             ItemBody::MaybeEmphasis(..)
259             | ItemBody::MaybeSmartQuote(..)
260             | ItemBody::MaybeHtml
261             | ItemBody::MaybeCode(..)
262             | ItemBody::MaybeLinkOpen
263             | ItemBody::MaybeLinkClose(..)
264             | ItemBody::MaybeImage => true,
265             _ => false,
266         }
267     }
268 }
269 
270 impl<'a> Default for ItemBody {
default() -> Self271     fn default() -> Self {
272         ItemBody::Root
273     }
274 }
275 
276 /// Scanning modes for `Parser`'s `parse_line` method.
277 #[derive(PartialEq, Eq, Copy, Clone)]
278 enum TableParseMode {
279     /// Inside a paragraph, scanning for table headers.
280     Scan,
281     /// Inside a table.
282     Active,
283     /// Inside a paragraph, not scanning for table headers.
284     Disabled,
285 }
286 
287 pub struct BrokenLink<'a> {
288     pub span: std::ops::Range<usize>,
289     pub link_type: LinkType,
290     pub reference: &'a str,
291 }
292 
293 /// State for the first parsing pass.
294 ///
295 /// The first pass resolves all block structure, generating an AST. Within a block, items
296 /// are in a linear chain with potential inline markup identified.
297 struct FirstPass<'a, 'b> {
298     text: &'a str,
299     tree: Tree<Item>,
300     begin_list_item: bool,
301     last_line_blank: bool,
302     allocs: Allocations<'a>,
303     options: Options,
304     list_nesting: usize,
305     lookup_table: &'b LookupTable,
306 }
307 
308 impl<'a, 'b> FirstPass<'a, 'b> {
new(text: &'a str, options: Options, lookup_table: &'b LookupTable) -> FirstPass<'a, 'b>309     fn new(text: &'a str, options: Options, lookup_table: &'b LookupTable) -> FirstPass<'a, 'b> {
310         // This is a very naive heuristic for the number of nodes
311         // we'll need.
312         let start_capacity = max(128, text.len() / 32);
313         let tree = Tree::with_capacity(start_capacity);
314         FirstPass {
315             text,
316             tree,
317             begin_list_item: false,
318             last_line_blank: false,
319             allocs: Allocations::new(),
320             options,
321             list_nesting: 0,
322             lookup_table,
323         }
324     }
325 
run(mut self) -> (Tree<Item>, Allocations<'a>)326     fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
327         let mut ix = 0;
328         while ix < self.text.len() {
329             ix = self.parse_block(ix);
330         }
331         for _ in 0..self.tree.spine_len() {
332             self.pop(ix);
333         }
334         (self.tree, self.allocs)
335     }
336 
337     /// Returns offset after block.
parse_block(&mut self, mut start_ix: usize) -> usize338     fn parse_block(&mut self, mut start_ix: usize) -> usize {
339         let bytes = self.text.as_bytes();
340         let mut line_start = LineStart::new(&bytes[start_ix..]);
341 
342         let i = scan_containers(&self.tree, &mut line_start);
343         for _ in i..self.tree.spine_len() {
344             self.pop(start_ix);
345         }
346 
347         if self.options.contains(Options::ENABLE_FOOTNOTES) {
348             // finish footnote if it's still open and was preceeded by blank line
349             if let Some(node_ix) = self.tree.peek_up() {
350                 if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
351                     if self.last_line_blank {
352                         self.pop(start_ix);
353                     }
354                 }
355             }
356 
357             // Footnote definitions of the form
358             // [^bar]:
359             // * anything really
360             let container_start = start_ix + line_start.bytes_scanned();
361             if let Some(bytecount) = self.parse_footnote(container_start) {
362                 start_ix = container_start + bytecount;
363                 start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
364                 line_start = LineStart::new(&bytes[start_ix..]);
365             }
366         }
367 
368         // Process new containers
369         loop {
370             let container_start = start_ix + line_start.bytes_scanned();
371             if let Some((ch, index, indent)) = line_start.scan_list_marker() {
372                 let after_marker_index = start_ix + line_start.bytes_scanned();
373                 self.continue_list(container_start, ch, index);
374                 self.tree.append(Item {
375                     start: container_start,
376                     end: after_marker_index, // will get updated later if item not empty
377                     body: ItemBody::ListItem(indent),
378                 });
379                 self.tree.push();
380                 if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
381                     self.begin_list_item = true;
382                     return after_marker_index + n;
383                 }
384                 if self.options.contains(Options::ENABLE_TASKLISTS) {
385                     if let Some(is_checked) = line_start.scan_task_list_marker() {
386                         self.tree.append(Item {
387                             start: after_marker_index,
388                             end: start_ix + line_start.bytes_scanned(),
389                             body: ItemBody::TaskListMarker(is_checked),
390                         });
391                     }
392                 }
393             } else if line_start.scan_blockquote_marker() {
394                 self.finish_list(start_ix);
395                 self.tree.append(Item {
396                     start: container_start,
397                     end: 0, // will get set later
398                     body: ItemBody::BlockQuote,
399                 });
400                 self.tree.push();
401             } else {
402                 break;
403             }
404         }
405 
406         let ix = start_ix + line_start.bytes_scanned();
407 
408         if let Some(n) = scan_blank_line(&bytes[ix..]) {
409             if let Some(node_ix) = self.tree.peek_up() {
410                 match self.tree[node_ix].item.body {
411                     ItemBody::BlockQuote => (),
412                     _ => {
413                         if self.begin_list_item {
414                             // A list item can begin with at most one blank line.
415                             self.pop(start_ix);
416                         }
417                         self.last_line_blank = true;
418                     }
419                 }
420             }
421             return ix + n;
422         }
423 
424         self.begin_list_item = false;
425         self.finish_list(start_ix);
426 
427         // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
428         let remaining_space = line_start.remaining_space();
429 
430         let indent = line_start.scan_space_upto(4);
431         if indent == 4 {
432             let ix = start_ix + line_start.bytes_scanned();
433             let remaining_space = line_start.remaining_space();
434             return self.parse_indented_code_block(ix, remaining_space);
435         }
436 
437         let ix = start_ix + line_start.bytes_scanned();
438 
439         // HTML Blocks
440         if bytes[ix] == b'<' {
441             // Types 1-5 are all detected by one function and all end with the same
442             // pattern
443             if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
444                 return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
445             }
446 
447             // Detect type 6
448             let possible_tag = scan_html_block_tag(&bytes[(ix + 1)..]).1;
449             if is_html_tag(possible_tag) {
450                 return self.parse_html_block_type_6_or_7(ix, remaining_space);
451             }
452 
453             // Detect type 7
454             if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
455                 return self.parse_html_block_type_6_or_7(ix, remaining_space);
456             }
457         }
458 
459         if let Ok(n) = scan_hrule(&bytes[ix..]) {
460             return self.parse_hrule(n, ix);
461         }
462 
463         if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
464             return self.parse_atx_heading(ix, atx_size);
465         }
466 
467         // parse refdef
468         if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
469             self.allocs.refdefs.entry(label).or_insert(link_def);
470             let ix = ix + bytecount;
471             // try to read trailing whitespace or it will register as a completely blank line
472             // TODO: shouldn't we do this for all block level items?
473             return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
474         }
475 
476         if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
477             return self.parse_fenced_code_block(ix, indent, fence_ch, n);
478         }
479         self.parse_paragraph(ix)
480     }
481 
482     /// Returns the offset of the first line after the table.
483     /// Assumptions: current focus is a table element and the table header
484     /// matches the separator line (same number of columns).
parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize485     fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
486         // parse header. this shouldn't fail because we made sure the table header is ok
487         let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
488         self.tree[thead_ix].item.body = ItemBody::TableHead;
489 
490         // parse body
491         let mut ix = body_start;
492         while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
493             ix = next_ix;
494         }
495 
496         self.pop(ix);
497         ix
498     }
499 
500     /// Call this when containers are taken care of.
501     /// Returns bytes scanned, row_ix
parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex)502     fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
503         let bytes = self.text.as_bytes();
504         let mut cells = 0;
505         let mut final_cell_ix = None;
506 
507         let row_ix = self.tree.append(Item {
508             start: ix,
509             end: 0, // set at end of this function
510             body: ItemBody::TableRow,
511         });
512         self.tree.push();
513 
514         loop {
515             ix += scan_ch(&bytes[ix..], b'|');
516             ix += scan_whitespace_no_nl(&bytes[ix..]);
517 
518             if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
519                 ix += eol_bytes;
520                 break;
521             }
522 
523             let cell_ix = self.tree.append(Item {
524                 start: ix,
525                 end: ix,
526                 body: ItemBody::TableCell,
527             });
528             self.tree.push();
529             let (next_ix, _brk) = self.parse_line(ix, TableParseMode::Active);
530             let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
531 
532             if let Some(cur_ix) = self.tree.cur() {
533                 self.tree[cur_ix].item.end -= trailing_whitespace;
534             }
535 
536             self.tree[cell_ix].item.end = next_ix - trailing_whitespace;
537             self.tree.pop();
538 
539             ix = next_ix;
540             cells += 1;
541 
542             if cells == row_cells {
543                 final_cell_ix = Some(cell_ix);
544             }
545         }
546 
547         // fill empty cells if needed
548         // note: this is where GFM and commonmark-extra diverge. we follow
549         // GFM here
550         for _ in cells..row_cells {
551             self.tree.append(Item {
552                 start: ix,
553                 end: ix,
554                 body: ItemBody::TableCell,
555             });
556         }
557 
558         // drop excess cells
559         if let Some(cell_ix) = final_cell_ix {
560             self.tree[cell_ix].next = None;
561         }
562 
563         self.pop(ix);
564 
565         (ix, row_ix)
566     }
567 
568     /// Returns first offset after the row and the tree index of the row.
parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)>569     fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
570         let bytes = self.text.as_bytes();
571         let mut line_start = LineStart::new(&bytes[ix..]);
572         let containers = scan_containers(&self.tree, &mut line_start);
573         if containers != self.tree.spine_len() {
574             return None;
575         }
576         line_start.scan_all_space();
577         ix += line_start.bytes_scanned();
578         if scan_paragraph_interrupt(&bytes[ix..]) {
579             return None;
580         }
581 
582         let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
583         Some((ix, row_ix))
584     }
585 
586     /// Returns offset of line start after paragraph.
parse_paragraph(&mut self, start_ix: usize) -> usize587     fn parse_paragraph(&mut self, start_ix: usize) -> usize {
588         let node_ix = self.tree.append(Item {
589             start: start_ix,
590             end: 0, // will get set later
591             body: ItemBody::Paragraph,
592         });
593         self.tree.push();
594         let bytes = self.text.as_bytes();
595 
596         let mut ix = start_ix;
597         loop {
598             let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
599                 TableParseMode::Scan
600             } else {
601                 TableParseMode::Disabled
602             };
603             let (next_ix, brk) = self.parse_line(ix, scan_mode);
604 
605             // break out when we find a table
606             if let Some(Item {
607                 body: ItemBody::Table(alignment_ix),
608                 ..
609             }) = brk
610             {
611                 let table_cols = self.allocs[alignment_ix].len();
612                 self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
613                 // this clears out any stuff we may have appended - but there may
614                 // be a cleaner way
615                 self.tree[node_ix].child = None;
616                 self.tree.pop();
617                 self.tree.push();
618                 return self.parse_table(table_cols, ix, next_ix);
619             }
620 
621             ix = next_ix;
622             let mut line_start = LineStart::new(&bytes[ix..]);
623             let n_containers = scan_containers(&self.tree, &mut line_start);
624             if !line_start.scan_space(4) {
625                 let ix_new = ix + line_start.bytes_scanned();
626                 if n_containers == self.tree.spine_len() {
627                     if let Some(ix_setext) = self.parse_setext_heading(ix_new, node_ix) {
628                         if let Some(Item {
629                             start,
630                             body: ItemBody::HardBreak,
631                             ..
632                         }) = brk
633                         {
634                             if bytes[start] == b'\\' {
635                                 self.tree.append_text(start, start + 1);
636                             }
637                         }
638                         ix = ix_setext;
639                         break;
640                     }
641                 }
642                 // first check for non-empty lists, then for other interrupts
643                 let suffix = &bytes[ix_new..];
644                 if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
645                     break;
646                 }
647             }
648             line_start.scan_all_space();
649             if line_start.is_at_eol() {
650                 break;
651             }
652             ix = next_ix + line_start.bytes_scanned();
653             if let Some(item) = brk {
654                 self.tree.append(item);
655             }
656         }
657 
658         self.pop(ix);
659         ix
660     }
661 
662     /// Returns end ix of setext_heading on success.
parse_setext_heading(&mut self, ix: usize, node_ix: TreeIndex) -> Option<usize>663     fn parse_setext_heading(&mut self, ix: usize, node_ix: TreeIndex) -> Option<usize> {
664         let bytes = self.text.as_bytes();
665         let (n, level) = scan_setext_heading(&bytes[ix..])?;
666         self.tree[node_ix].item.body = ItemBody::Heading(level);
667 
668         // strip trailing whitespace
669         if let Some(cur_ix) = self.tree.cur() {
670             self.tree[cur_ix].item.end -= scan_rev_while(
671                 &bytes[..self.tree[cur_ix].item.end],
672                 is_ascii_whitespace_no_nl,
673             );
674         }
675 
676         Some(ix + n)
677     }
678 
679     /// Parse a line of input, appending text and items to tree.
680     ///
681     /// Returns: index after line and an item representing the break.
parse_line(&mut self, start: usize, mode: TableParseMode) -> (usize, Option<Item>)682     fn parse_line(&mut self, start: usize, mode: TableParseMode) -> (usize, Option<Item>) {
683         let bytes = &self.text.as_bytes();
684         let mut pipes = 0;
685         let mut last_pipe_ix = start;
686         let mut begin_text = start;
687 
688         let (final_ix, brk) =
689             iterate_special_bytes(&self.lookup_table, bytes, start, |ix, byte| {
690                 match byte {
691                     b'\n' | b'\r' => {
692                         if let TableParseMode::Active = mode {
693                             return LoopInstruction::BreakAtWith(ix, None);
694                         }
695 
696                         let mut i = ix;
697                         let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
698                         if mode == TableParseMode::Scan && pipes > 0 {
699                             // check if we may be parsing a table
700                             let next_line_ix = ix + eol_bytes;
701                             let mut line_start = LineStart::new(&bytes[next_line_ix..]);
702                             if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len()
703                             {
704                                 let table_head_ix = next_line_ix + line_start.bytes_scanned();
705                                 let (table_head_bytes, alignment) =
706                                     scan_table_head(&bytes[table_head_ix..]);
707 
708                                 if table_head_bytes > 0 {
709                                     // computing header count from number of pipes
710                                     let header_count =
711                                         count_header_cols(bytes, pipes, start, last_pipe_ix);
712 
713                                     // make sure they match the number of columns we find in separator line
714                                     if alignment.len() == header_count {
715                                         let alignment_ix =
716                                             self.allocs.allocate_alignment(alignment);
717                                         let end_ix = table_head_ix + table_head_bytes;
718                                         return LoopInstruction::BreakAtWith(
719                                             end_ix,
720                                             Some(Item {
721                                                 start: i,
722                                                 end: end_ix, // must update later
723                                                 body: ItemBody::Table(alignment_ix),
724                                             }),
725                                         );
726                                     }
727                                 }
728                             }
729                         }
730 
731                         let end_ix = ix + eol_bytes;
732                         let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
733                         if trailing_backslashes % 2 == 1 && end_ix < self.text.len() {
734                             i -= 1;
735                             self.tree.append_text(begin_text, i);
736                             return LoopInstruction::BreakAtWith(
737                                 end_ix,
738                                 Some(Item {
739                                     start: i,
740                                     end: end_ix,
741                                     body: ItemBody::HardBreak,
742                                 }),
743                             );
744                         }
745                         let trailing_whitespace =
746                             scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
747                         if trailing_whitespace >= 2 {
748                             i -= trailing_whitespace;
749                             self.tree.append_text(begin_text, i);
750                             return LoopInstruction::BreakAtWith(
751                                 end_ix,
752                                 Some(Item {
753                                     start: i,
754                                     end: end_ix,
755                                     body: ItemBody::HardBreak,
756                                 }),
757                             );
758                         }
759 
760                         self.tree.append_text(begin_text, ix);
761                         LoopInstruction::BreakAtWith(
762                             end_ix,
763                             Some(Item {
764                                 start: i,
765                                 end: end_ix,
766                                 body: ItemBody::SoftBreak,
767                             }),
768                         )
769                     }
770                     b'\\' => {
771                         if ix + 1 < self.text.len() && is_ascii_punctuation(bytes[ix + 1]) {
772                             self.tree.append_text(begin_text, ix);
773                             if bytes[ix + 1] == b'`' {
774                                 let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
775                                 self.tree.append(Item {
776                                     start: ix + 1,
777                                     end: ix + count + 1,
778                                     body: ItemBody::MaybeCode(count, true),
779                                 });
780                                 begin_text = ix + 1 + count;
781                                 LoopInstruction::ContinueAndSkip(count)
782                             } else {
783                                 begin_text = ix + 1;
784                                 LoopInstruction::ContinueAndSkip(1)
785                             }
786                         } else {
787                             LoopInstruction::ContinueAndSkip(0)
788                         }
789                     }
790                     c @ b'*' | c @ b'_' | c @ b'~' => {
791                         let string_suffix = &self.text[ix..];
792                         let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
793                         let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
794                         let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
795                         let is_valid_seq = c != b'~' || count == 2;
796 
797                         if (can_open || can_close) && is_valid_seq {
798                             self.tree.append_text(begin_text, ix);
799                             for i in 0..count {
800                                 self.tree.append(Item {
801                                     start: ix + i,
802                                     end: ix + i + 1,
803                                     body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
804                                 });
805                             }
806                             begin_text = ix + count;
807                         }
808                         LoopInstruction::ContinueAndSkip(count - 1)
809                     }
810                     b'`' => {
811                         self.tree.append_text(begin_text, ix);
812                         let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
813                         self.tree.append(Item {
814                             start: ix,
815                             end: ix + count,
816                             body: ItemBody::MaybeCode(count, false),
817                         });
818                         begin_text = ix + count;
819                         LoopInstruction::ContinueAndSkip(count - 1)
820                     }
821                     b'<' => {
822                         // Note: could detect some non-HTML cases and early escape here, but not
823                         // clear that's a win.
824                         self.tree.append_text(begin_text, ix);
825                         self.tree.append(Item {
826                             start: ix,
827                             end: ix + 1,
828                             body: ItemBody::MaybeHtml,
829                         });
830                         begin_text = ix + 1;
831                         LoopInstruction::ContinueAndSkip(0)
832                     }
833                     b'!' => {
834                         if ix + 1 < self.text.len() && bytes[ix + 1] == b'[' {
835                             self.tree.append_text(begin_text, ix);
836                             self.tree.append(Item {
837                                 start: ix,
838                                 end: ix + 2,
839                                 body: ItemBody::MaybeImage,
840                             });
841                             begin_text = ix + 2;
842                             LoopInstruction::ContinueAndSkip(1)
843                         } else {
844                             LoopInstruction::ContinueAndSkip(0)
845                         }
846                     }
847                     b'[' => {
848                         self.tree.append_text(begin_text, ix);
849                         self.tree.append(Item {
850                             start: ix,
851                             end: ix + 1,
852                             body: ItemBody::MaybeLinkOpen,
853                         });
854                         begin_text = ix + 1;
855                         LoopInstruction::ContinueAndSkip(0)
856                     }
857                     b']' => {
858                         self.tree.append_text(begin_text, ix);
859                         self.tree.append(Item {
860                             start: ix,
861                             end: ix + 1,
862                             body: ItemBody::MaybeLinkClose(true),
863                         });
864                         begin_text = ix + 1;
865                         LoopInstruction::ContinueAndSkip(0)
866                     }
867                     b'&' => match scan_entity(&bytes[ix..]) {
868                         (n, Some(value)) => {
869                             self.tree.append_text(begin_text, ix);
870                             self.tree.append(Item {
871                                 start: ix,
872                                 end: ix + n,
873                                 body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
874                             });
875                             begin_text = ix + n;
876                             LoopInstruction::ContinueAndSkip(n - 1)
877                         }
878                         _ => LoopInstruction::ContinueAndSkip(0),
879                     },
880                     b'|' => {
881                         if let TableParseMode::Active = mode {
882                             LoopInstruction::BreakAtWith(ix, None)
883                         } else {
884                             last_pipe_ix = ix;
885                             pipes += 1;
886                             LoopInstruction::ContinueAndSkip(0)
887                         }
888                     }
889                     b'.' => {
890                         if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
891                             self.tree.append_text(begin_text, ix);
892                             self.tree.append(Item {
893                                 start: ix,
894                                 end: ix + 3,
895                                 body: ItemBody::SynthesizeChar('…'),
896                             });
897                             begin_text = ix + 3;
898                             LoopInstruction::ContinueAndSkip(2)
899                         } else {
900                             LoopInstruction::ContinueAndSkip(0)
901                         }
902                     }
903                     b'-' => {
904                         let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
905                         if count == 1 {
906                             LoopInstruction::ContinueAndSkip(0)
907                         } else {
908                             let itembody = if count == 2 {
909                                 ItemBody::SynthesizeChar('–')
910                             } else if count == 3 {
911                                 ItemBody::SynthesizeChar('—')
912                             } else {
913                                 let (ems, ens) = match count % 6 {
914                                     0 | 3 => (count / 3, 0),
915                                     2 | 4 => (0, count / 2),
916                                     1 => (count / 3 - 1, 2),
917                                     _ => (count / 3, 1),
918                                 };
919                                 // – and — are 3 bytes each in utf8
920                                 let mut buf = String::with_capacity(3 * (ems + ens));
921                                 for _ in 0..ems {
922                                     buf.push('—');
923                                 }
924                                 for _ in 0..ens {
925                                     buf.push('–');
926                                 }
927                                 ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
928                             };
929 
930                             self.tree.append_text(begin_text, ix);
931                             self.tree.append(Item {
932                                 start: ix,
933                                 end: ix + count,
934                                 body: itembody,
935                             });
936                             begin_text = ix + count;
937                             LoopInstruction::ContinueAndSkip(count - 1)
938                         }
939                     }
940                     c @ b'\'' | c @ b'"' => {
941                         let string_suffix = &self.text[ix..];
942                         let can_open = delim_run_can_open(self.text, string_suffix, 1, ix);
943                         let can_close = delim_run_can_close(self.text, string_suffix, 1, ix);
944 
945                         self.tree.append_text(begin_text, ix);
946                         self.tree.append(Item {
947                             start: ix,
948                             end: ix + 1,
949                             body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
950                         });
951                         begin_text = ix + 1;
952 
953                         LoopInstruction::ContinueAndSkip(0)
954                     }
955                     _ => LoopInstruction::ContinueAndSkip(0),
956                 }
957             });
958 
959         if brk.is_none() {
960             // need to close text at eof
961             self.tree.append_text(begin_text, final_ix);
962         }
963         (final_ix, brk)
964     }
965 
966     /// Check whether we should allow a paragraph interrupt by lists. Only non-empty
967     /// lists are allowed.
interrupt_paragraph_by_list(&self, suffix: &[u8]) -> bool968     fn interrupt_paragraph_by_list(&self, suffix: &[u8]) -> bool {
969         scan_listitem(suffix).map_or(false, |(ix, delim, index, _)| {
970             self.list_nesting > 0 ||
971             // we don't allow interruption by either empty lists or
972             // numbered lists starting at an index other than 1
973             !scan_empty_list(&suffix[ix..]) && (delim == b'*' || delim == b'-' || index == 1)
974         })
975     }
976 
977     /// When start_ix is at the beginning of an HTML block of type 1 to 5,
978     /// this will find the end of the block, adding the block itself to the
979     /// tree and also keeping track of the lines of HTML within the block.
980     ///
981     /// The html_end_tag is the tag that must be found on a line to end the block.
parse_html_block_type_1_to_5( &mut self, start_ix: usize, html_end_tag: &str, mut remaining_space: usize, ) -> usize982     fn parse_html_block_type_1_to_5(
983         &mut self,
984         start_ix: usize,
985         html_end_tag: &str,
986         mut remaining_space: usize,
987     ) -> usize {
988         let bytes = self.text.as_bytes();
989         let mut ix = start_ix;
990         loop {
991             let line_start_ix = ix;
992             ix += scan_nextline(&bytes[ix..]);
993             self.append_html_line(remaining_space, line_start_ix, ix);
994 
995             let mut line_start = LineStart::new(&bytes[ix..]);
996             let n_containers = scan_containers(&self.tree, &mut line_start);
997             if n_containers < self.tree.spine_len() {
998                 break;
999             }
1000 
1001             if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
1002                 break;
1003             }
1004 
1005             let next_line_ix = ix + line_start.bytes_scanned();
1006             if next_line_ix == self.text.len() {
1007                 break;
1008             }
1009             ix = next_line_ix;
1010             remaining_space = line_start.remaining_space();
1011         }
1012         ix
1013     }
1014 
1015     /// When start_ix is at the beginning of an HTML block of type 6 or 7,
1016     /// this will consume lines until there is a blank line and keep track of
1017     /// the HTML within the block.
parse_html_block_type_6_or_7( &mut self, start_ix: usize, mut remaining_space: usize, ) -> usize1018     fn parse_html_block_type_6_or_7(
1019         &mut self,
1020         start_ix: usize,
1021         mut remaining_space: usize,
1022     ) -> usize {
1023         let bytes = self.text.as_bytes();
1024         let mut ix = start_ix;
1025         loop {
1026             let line_start_ix = ix;
1027             ix += scan_nextline(&bytes[ix..]);
1028             self.append_html_line(remaining_space, line_start_ix, ix);
1029 
1030             let mut line_start = LineStart::new(&bytes[ix..]);
1031             let n_containers = scan_containers(&self.tree, &mut line_start);
1032             if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
1033                 break;
1034             }
1035 
1036             let next_line_ix = ix + line_start.bytes_scanned();
1037             if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
1038             {
1039                 break;
1040             }
1041             ix = next_line_ix;
1042             remaining_space = line_start.remaining_space();
1043         }
1044         ix
1045     }
1046 
parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize1047     fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
1048         self.tree.append(Item {
1049             start: start_ix,
1050             end: 0, // will get set later
1051             body: ItemBody::IndentCodeBlock,
1052         });
1053         self.tree.push();
1054         let bytes = self.text.as_bytes();
1055         let mut last_nonblank_child = None;
1056         let mut last_nonblank_ix = 0;
1057         let mut end_ix = 0;
1058         let mut last_line_blank = false;
1059 
1060         let mut ix = start_ix;
1061         loop {
1062             let line_start_ix = ix;
1063             ix += scan_nextline(&bytes[ix..]);
1064             self.append_code_text(remaining_space, line_start_ix, ix);
1065             // TODO(spec clarification): should we synthesize newline at EOF?
1066 
1067             if !last_line_blank {
1068                 last_nonblank_child = self.tree.cur();
1069                 last_nonblank_ix = ix;
1070                 end_ix = ix;
1071             }
1072 
1073             let mut line_start = LineStart::new(&bytes[ix..]);
1074             let n_containers = scan_containers(&self.tree, &mut line_start);
1075             if n_containers < self.tree.spine_len()
1076                 || !(line_start.scan_space(4) || line_start.is_at_eol())
1077             {
1078                 break;
1079             }
1080             let next_line_ix = ix + line_start.bytes_scanned();
1081             if next_line_ix == self.text.len() {
1082                 break;
1083             }
1084             ix = next_line_ix;
1085             remaining_space = line_start.remaining_space();
1086             last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
1087         }
1088 
1089         // Trim trailing blank lines.
1090         if let Some(child) = last_nonblank_child {
1091             self.tree[child].next = None;
1092             self.tree[child].item.end = last_nonblank_ix;
1093         }
1094         self.pop(end_ix);
1095         ix
1096     }
1097 
parse_fenced_code_block( &mut self, start_ix: usize, indent: usize, fence_ch: u8, n_fence_char: usize, ) -> usize1098     fn parse_fenced_code_block(
1099         &mut self,
1100         start_ix: usize,
1101         indent: usize,
1102         fence_ch: u8,
1103         n_fence_char: usize,
1104     ) -> usize {
1105         let bytes = self.text.as_bytes();
1106         let mut info_start = start_ix + n_fence_char;
1107         info_start += scan_whitespace_no_nl(&bytes[info_start..]);
1108         // TODO: info strings are typically very short. wouldnt it be faster
1109         // to just do a forward scan here?
1110         let mut ix = info_start + scan_nextline(&bytes[info_start..]);
1111         let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
1112         let info_string = unescape(&self.text[info_start..info_end]);
1113         self.tree.append(Item {
1114             start: start_ix,
1115             end: 0, // will get set later
1116             body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
1117         });
1118         self.tree.push();
1119         loop {
1120             let mut line_start = LineStart::new(&bytes[ix..]);
1121             let n_containers = scan_containers(&self.tree, &mut line_start);
1122             if n_containers < self.tree.spine_len() {
1123                 break;
1124             }
1125             line_start.scan_space(indent);
1126             let mut close_line_start = line_start.clone();
1127             if !close_line_start.scan_space(4) {
1128                 let close_ix = ix + close_line_start.bytes_scanned();
1129                 if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
1130                 {
1131                     ix = close_ix + n;
1132                     break;
1133                 }
1134             }
1135             let remaining_space = line_start.remaining_space();
1136             ix += line_start.bytes_scanned();
1137             let next_ix = ix + scan_nextline(&bytes[ix..]);
1138             self.append_code_text(remaining_space, ix, next_ix);
1139             ix = next_ix;
1140         }
1141 
1142         self.pop(ix);
1143 
1144         // try to read trailing whitespace or it will register as a completely blank line
1145         ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
1146     }
1147 
append_code_text(&mut self, remaining_space: usize, start: usize, end: usize)1148     fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
1149         if remaining_space > 0 {
1150             let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
1151             self.tree.append(Item {
1152                 start,
1153                 end: start,
1154                 body: ItemBody::SynthesizeText(cow_ix),
1155             });
1156         }
1157         if self.text.as_bytes()[end - 2] == b'\r' {
1158             // Normalize CRLF to LF
1159             self.tree.append_text(start, end - 2);
1160             self.tree.append_text(end - 1, end);
1161         } else {
1162             self.tree.append_text(start, end);
1163         }
1164     }
1165 
1166     /// Appends a line of HTML to the tree.
append_html_line(&mut self, remaining_space: usize, start: usize, end: usize)1167     fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
1168         if remaining_space > 0 {
1169             let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
1170             self.tree.append(Item {
1171                 start,
1172                 end: start,
1173                 // TODO: maybe this should synthesize to html rather than text?
1174                 body: ItemBody::SynthesizeText(cow_ix),
1175             });
1176         }
1177         if self.text.as_bytes()[end - 2] == b'\r' {
1178             // Normalize CRLF to LF
1179             self.tree.append(Item {
1180                 start,
1181                 end: end - 2,
1182                 body: ItemBody::Html,
1183             });
1184             self.tree.append(Item {
1185                 start: end - 1,
1186                 end,
1187                 body: ItemBody::Html,
1188             });
1189         } else {
1190             self.tree.append(Item {
1191                 start,
1192                 end,
1193                 body: ItemBody::Html,
1194             });
1195         }
1196     }
1197 
1198     /// Pop a container, setting its end.
pop(&mut self, ix: usize)1199     fn pop(&mut self, ix: usize) {
1200         let cur_ix = self.tree.pop().unwrap();
1201         self.tree[cur_ix].item.end = ix;
1202         if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
1203             surgerize_tight_list(&mut self.tree, cur_ix);
1204         }
1205     }
1206 
1207     /// Close a list if it's open. Also set loose if last line was blank
finish_list(&mut self, ix: usize)1208     fn finish_list(&mut self, ix: usize) {
1209         if let Some(node_ix) = self.tree.peek_up() {
1210             if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
1211                 self.pop(ix);
1212                 self.list_nesting -= 1;
1213             }
1214         }
1215         if self.last_line_blank {
1216             if let Some(node_ix) = self.tree.peek_grandparent() {
1217                 if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
1218                     *is_tight = false;
1219                 }
1220             }
1221             self.last_line_blank = false;
1222         }
1223     }
1224 
1225     /// Continue an existing list or start a new one if there's not an open
1226     /// list that matches.
continue_list(&mut self, start: usize, ch: u8, index: u64)1227     fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
1228         if let Some(node_ix) = self.tree.peek_up() {
1229             if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
1230                 if existing_ch == ch {
1231                     if self.last_line_blank {
1232                         *is_tight = false;
1233                         self.last_line_blank = false;
1234                     }
1235                     return;
1236                 }
1237             }
1238             // TODO: this is not the best choice for end; maybe get end from last list item.
1239             self.finish_list(start);
1240         }
1241         self.tree.append(Item {
1242             start,
1243             end: 0, // will get set later
1244             body: ItemBody::List(true, ch, index),
1245         });
1246         self.list_nesting += 1;
1247         self.tree.push();
1248         self.last_line_blank = false;
1249     }
1250 
1251     /// Parse a thematic break.
1252     ///
1253     /// Returns index of start of next line.
parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize1254     fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1255         self.tree.append(Item {
1256             start: ix,
1257             end: ix + hrule_size,
1258             body: ItemBody::Rule,
1259         });
1260         ix + hrule_size
1261     }
1262 
1263     /// Parse an ATX heading.
1264     ///
1265     /// Returns index of start of next line.
parse_atx_heading(&mut self, mut ix: usize, atx_size: usize) -> usize1266     fn parse_atx_heading(&mut self, mut ix: usize, atx_size: usize) -> usize {
1267         let heading_ix = self.tree.append(Item {
1268             start: ix,
1269             end: 0, // set later
1270             body: ItemBody::Heading(atx_size as u32),
1271         });
1272         ix += atx_size;
1273         // next char is space or eol (guaranteed by scan_atx_heading)
1274         let bytes = self.text.as_bytes();
1275         if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
1276             self.tree[heading_ix].item.end = ix + eol_bytes;
1277             return ix + eol_bytes;
1278         }
1279         // skip leading spaces
1280         let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
1281         ix += skip_spaces;
1282 
1283         // now handle the header text
1284         let header_start = ix;
1285         let header_node_idx = self.tree.push(); // so that we can set the endpoint later
1286         ix = self.parse_line(ix, TableParseMode::Disabled).0;
1287         self.tree[header_node_idx].item.end = ix;
1288 
1289         // remove trailing matter from header text
1290         if let Some(cur_ix) = self.tree.cur() {
1291             let header_text = &bytes[header_start..ix];
1292             let mut limit = header_text
1293                 .iter()
1294                 .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
1295                 .map_or(0, |i| i + 1);
1296             let closer = header_text[..limit]
1297                 .iter()
1298                 .rposition(|&b| b != b'#')
1299                 .map_or(0, |i| i + 1);
1300             if closer == 0 {
1301                 limit = closer;
1302             } else {
1303                 let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
1304                 if spaces > 0 {
1305                     limit = closer - spaces;
1306                 }
1307             }
1308             self.tree[cur_ix].item.end = limit + header_start;
1309         }
1310 
1311         self.tree.pop();
1312         ix
1313     }
1314 
1315     /// Returns the number of bytes scanned on success.
parse_footnote(&mut self, start: usize) -> Option<usize>1316     fn parse_footnote(&mut self, start: usize) -> Option<usize> {
1317         let bytes = &self.text.as_bytes()[start..];
1318         if !bytes.starts_with(b"[^") {
1319             return None;
1320         }
1321         let (mut i, label) = self.parse_refdef_label(start + 2)?;
1322         i += 2;
1323         if scan_ch(&bytes[i..], b':') == 0 {
1324             return None;
1325         }
1326         i += 1;
1327         self.finish_list(start);
1328         self.tree.append(Item {
1329             start,
1330             end: 0, // will get set later
1331             // TODO: check whether the label here is strictly necessary
1332             body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
1333         });
1334         self.tree.push();
1335         Some(i)
1336     }
1337 
1338     /// Tries to parse a reference label, which can be interrupted by new blocks.
1339     /// On success, returns the number of bytes of the label and the label itself.
parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)>1340     fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
1341         scan_link_label_rest(&self.text[start..], &|bytes| {
1342             let mut line_start = LineStart::new(bytes);
1343             let _ = scan_containers(&self.tree, &mut line_start);
1344             let bytes_scanned = line_start.bytes_scanned();
1345 
1346             let suffix = &bytes[bytes_scanned..];
1347             if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
1348                 None
1349             } else {
1350                 Some(bytes_scanned)
1351             }
1352         })
1353     }
1354 
1355     /// Returns number of bytes scanned, label and definition on success.
parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)>1356     fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
1357         let bytes = &self.text.as_bytes()[start..];
1358         if scan_ch(bytes, b'[') == 0 {
1359             return None;
1360         }
1361         let (mut i, label) = self.parse_refdef_label(start + 1)?;
1362         i += 1;
1363         if scan_ch(&bytes[i..], b':') == 0 {
1364             return None;
1365         }
1366         i += 1;
1367         let (bytecount, link_def) = self.scan_refdef(start + i)?;
1368         Some((bytecount + i, UniCase::new(label), link_def))
1369     }
1370 
1371     /// Returns number of bytes and number of newlines
scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)>1372     fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
1373         let mut newlines = 0;
1374         loop {
1375             let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
1376             i += whitespaces;
1377             if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
1378                 i += eol_bytes;
1379                 newlines += 1;
1380                 if newlines > 1 {
1381                     return None;
1382                 }
1383             } else {
1384                 break;
1385             }
1386             let mut line_start = LineStart::new(&bytes[i..]);
1387             if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
1388                 return None;
1389             }
1390             i += line_start.bytes_scanned();
1391         }
1392         Some((i, newlines))
1393     }
1394 
1395     /// Returns # of bytes and definition.
1396     /// Assumes the label of the reference including colon has already been scanned.
scan_refdef(&self, start: usize) -> Option<(usize, LinkDef<'a>)>1397     fn scan_refdef(&self, start: usize) -> Option<(usize, LinkDef<'a>)> {
1398         let bytes = self.text.as_bytes();
1399 
1400         // whitespace between label and url (including up to one newline)
1401         let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
1402 
1403         // scan link dest
1404         let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
1405         if dest_length == 0 {
1406             return None;
1407         }
1408         let dest = unescape(dest);
1409         i += dest_length;
1410 
1411         // no title
1412         let mut backup = (i - start, LinkDef { dest, title: None });
1413 
1414         // scan whitespace between dest and label
1415         let (mut i, newlines) =
1416             if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
1417                 if i == self.text.len() {
1418                     newlines += 1;
1419                 }
1420                 if new_i == i && newlines == 0 {
1421                     return None;
1422                 }
1423                 if newlines > 1 {
1424                     return Some(backup);
1425                 };
1426                 (new_i, newlines)
1427             } else {
1428                 return Some(backup);
1429             };
1430 
1431         // scan title
1432         // if this fails but newline == 1, return also a refdef without title
1433         if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
1434             i += title_length;
1435             backup.1.title = Some(unescape(title));
1436         } else if newlines > 0 {
1437             return Some(backup);
1438         } else {
1439             return None;
1440         };
1441 
1442         // scan EOL
1443         if let Some(bytes) = scan_blank_line(&bytes[i..]) {
1444             backup.0 = i + bytes - start;
1445             Some(backup)
1446         } else if newlines > 0 {
1447             Some(backup)
1448         } else {
1449             None
1450         }
1451     }
1452 }
1453 
1454 /// Returns number of containers scanned.
scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize1455 fn scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize {
1456     let mut i = 0;
1457     for &node_ix in tree.walk_spine() {
1458         match tree[node_ix].item.body {
1459             ItemBody::BlockQuote => {
1460                 let save = line_start.clone();
1461                 if !line_start.scan_blockquote_marker() {
1462                     *line_start = save;
1463                     break;
1464                 }
1465             }
1466             ItemBody::ListItem(indent) => {
1467                 let save = line_start.clone();
1468                 if !line_start.scan_space(indent) {
1469                     if !line_start.is_at_eol() {
1470                         *line_start = save;
1471                         break;
1472                     }
1473                 }
1474             }
1475             _ => (),
1476         }
1477         i += 1;
1478     }
1479     i
1480 }
1481 
1482 /// Computes the number of header columns in a table line by computing the number of dividing pipes
1483 /// that aren't followed or preceeded by whitespace.
count_header_cols( bytes: &[u8], mut pipes: usize, mut start: usize, last_pipe_ix: usize, ) -> usize1484 fn count_header_cols(
1485     bytes: &[u8],
1486     mut pipes: usize,
1487     mut start: usize,
1488     last_pipe_ix: usize,
1489 ) -> usize {
1490     // was first pipe preceeded by whitespace? if so, subtract one
1491     start += scan_whitespace_no_nl(&bytes[start..]);
1492     if bytes[start] == b'|' {
1493         pipes -= 1;
1494     }
1495 
1496     // was last pipe followed by whitespace? if so, sub one
1497     if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
1498         pipes
1499     } else {
1500         pipes + 1
1501     }
1502 }
1503 
1504 impl<'a> Tree<Item> {
append_text(&mut self, start: usize, end: usize)1505     fn append_text(&mut self, start: usize, end: usize) {
1506         if end > start {
1507             if let Some(ix) = self.cur() {
1508                 if ItemBody::Text == self[ix].item.body && self[ix].item.end == start {
1509                     self[ix].item.end = end;
1510                     return;
1511                 }
1512             }
1513             self.append(Item {
1514                 start,
1515                 end,
1516                 body: ItemBody::Text,
1517             });
1518         }
1519     }
1520 }
1521 
1522 /// Determines whether the delimiter run starting at given index is
1523 /// left-flanking, as defined by the commonmark spec (and isn't intraword
1524 /// for _ delims).
1525 /// suffix is &s[ix..], which is passed in as an optimization, since taking
1526 /// a string subslice is O(n).
delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool1527 fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1528     let next_char = if let Some(c) = suffix.chars().nth(run_len) {
1529         c
1530     } else {
1531         return false;
1532     };
1533     if next_char.is_whitespace() {
1534         return false;
1535     }
1536     if ix == 0 {
1537         return true;
1538     }
1539     let delim = suffix.chars().next().unwrap();
1540     if delim == '*' && !is_punctuation(next_char) {
1541         return true;
1542     }
1543 
1544     let prev_char = s[..ix].chars().last().unwrap();
1545 
1546     prev_char.is_whitespace()
1547         || is_punctuation(prev_char) && (delim != '\'' || ![']', ')'].contains(&prev_char))
1548 }
1549 
1550 /// Determines whether the delimiter run starting at given index is
1551 /// left-flanking, as defined by the commonmark spec (and isn't intraword
1552 /// for _ delims)
delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool1553 fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
1554     if ix == 0 {
1555         return false;
1556     }
1557     let prev_char = s[..ix].chars().last().unwrap();
1558     if prev_char.is_whitespace() {
1559         return false;
1560     }
1561     let next_char = if let Some(c) = suffix.chars().nth(run_len) {
1562         c
1563     } else {
1564         return true;
1565     };
1566     let delim = suffix.chars().next().unwrap();
1567     if delim == '*' && !is_punctuation(prev_char) {
1568         return true;
1569     }
1570 
1571     next_char.is_whitespace() || is_punctuation(next_char)
1572 }
1573 
1574 /// Checks whether we should break a paragraph on the given input.
1575 /// Note: lists are dealt with in `interrupt_paragraph_by_list`, because determing
1576 /// whether to break on a list requires additional context.
scan_paragraph_interrupt(bytes: &[u8]) -> bool1577 fn scan_paragraph_interrupt(bytes: &[u8]) -> bool {
1578     if scan_eol(bytes).is_some()
1579         || scan_hrule(bytes).is_ok()
1580         || scan_atx_heading(bytes).is_some()
1581         || scan_code_fence(bytes).is_some()
1582         || scan_blockquote_start(bytes).is_some()
1583     {
1584         return true;
1585     }
1586     bytes.starts_with(b"<")
1587         && (get_html_end_tag(&bytes[1..]).is_some()
1588             || is_html_tag(scan_html_block_tag(&bytes[1..]).1))
1589 }
1590 
1591 /// Assumes `text_bytes` is preceded by `<`.
get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str>1592 fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
1593     static BEGIN_TAGS: &[&[u8]; 3] = &[b"pre", b"style", b"script"];
1594     static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
1595 
1596     for (beg_tag, end_tag) in BEGIN_TAGS
1597         .iter()
1598         .zip(["</pre>", "</style>", "</script>"].iter())
1599     {
1600         let tag_len = beg_tag.len();
1601 
1602         if text_bytes.len() < tag_len {
1603             // begin tags are increasing in size
1604             break;
1605         }
1606 
1607         if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
1608             continue;
1609         }
1610 
1611         // Must either be the end of the line...
1612         if text_bytes.len() == tag_len {
1613             return Some(end_tag);
1614         }
1615 
1616         // ...or be followed by whitespace, newline, or '>'.
1617         let s = text_bytes[tag_len];
1618         if is_ascii_whitespace(s) || s == b'>' {
1619             return Some(end_tag);
1620         }
1621     }
1622 
1623     for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
1624         if text_bytes.starts_with(beg_tag) {
1625             return Some(end_tag);
1626         }
1627     }
1628 
1629     if text_bytes.len() > 1
1630         && text_bytes[0] == b'!'
1631         && text_bytes[1] >= b'A'
1632         && text_bytes[1] <= b'Z'
1633     {
1634         Some(">")
1635     } else {
1636         None
1637     }
1638 }
1639 
1640 #[derive(Copy, Clone, Debug)]
1641 struct InlineEl {
1642     start: TreeIndex, // offset of tree node
1643     count: usize,
1644     c: u8,      // b'*' or b'_'
1645     both: bool, // can both open and close
1646 }
1647 
1648 #[derive(Debug, Clone, Default)]
1649 struct InlineStack {
1650     stack: Vec<InlineEl>,
1651     // Lower bounds for matching indices in the stack. For example
1652     // a strikethrough delimiter will never match with any element
1653     // in the stack with index smaller than
1654     // `lower_bounds[InlineStack::TILDES]`.
1655     lower_bounds: [usize; 7],
1656 }
1657 
1658 impl InlineStack {
1659     /// These are indices into the lower bounds array.
1660     /// Not both refers to the property that the delimiter can not both
1661     /// be opener as a closer.
1662     const UNDERSCORE_NOT_BOTH: usize = 0;
1663     const ASTERISK_NOT_BOTH: usize = 1;
1664     const ASTERISK_BASE: usize = 2;
1665     const TILDES: usize = 5;
1666     const UNDERSCORE_BOTH: usize = 6;
1667 
pop_all(&mut self, tree: &mut Tree<Item>)1668     fn pop_all(&mut self, tree: &mut Tree<Item>) {
1669         for el in self.stack.drain(..) {
1670             for i in 0..el.count {
1671                 tree[el.start + i].item.body = ItemBody::Text;
1672             }
1673         }
1674         self.lower_bounds = [0; 7];
1675     }
1676 
get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize1677     fn get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize {
1678         if c == b'_' {
1679             if both {
1680                 self.lower_bounds[InlineStack::UNDERSCORE_BOTH]
1681             } else {
1682                 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH]
1683             }
1684         } else if c == b'*' {
1685             let mod3_lower = self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3];
1686             if both {
1687                 mod3_lower
1688             } else {
1689                 min(
1690                     mod3_lower,
1691                     self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH],
1692                 )
1693             }
1694         } else {
1695             self.lower_bounds[InlineStack::TILDES]
1696         }
1697     }
1698 
set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize)1699     fn set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize) {
1700         if c == b'_' {
1701             if both {
1702                 self.lower_bounds[InlineStack::UNDERSCORE_BOTH] = new_bound;
1703             } else {
1704                 self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH] = new_bound;
1705             }
1706         } else if c == b'*' {
1707             self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3] = new_bound;
1708             if !both {
1709                 self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH] = new_bound;
1710             }
1711         } else {
1712             self.lower_bounds[InlineStack::TILDES] = new_bound;
1713         }
1714     }
1715 
find_match( &mut self, tree: &mut Tree<Item>, c: u8, count: usize, both: bool, ) -> Option<InlineEl>1716     fn find_match(
1717         &mut self,
1718         tree: &mut Tree<Item>,
1719         c: u8,
1720         count: usize,
1721         both: bool,
1722     ) -> Option<InlineEl> {
1723         let lowerbound = min(self.stack.len(), self.get_lowerbound(c, count, both));
1724         let res = self.stack[lowerbound..]
1725             .iter()
1726             .cloned()
1727             .enumerate()
1728             .rfind(|(_, el)| {
1729                 el.c == c && (!both && !el.both || (count + el.count) % 3 != 0 || count % 3 == 0)
1730             });
1731 
1732         if let Some((matching_ix, matching_el)) = res {
1733             let matching_ix = matching_ix + lowerbound;
1734             for el in &self.stack[(matching_ix + 1)..] {
1735                 for i in 0..el.count {
1736                     tree[el.start + i].item.body = ItemBody::Text;
1737                 }
1738             }
1739             self.stack.truncate(matching_ix);
1740             Some(matching_el)
1741         } else {
1742             self.set_lowerbound(c, count, both, self.stack.len());
1743             None
1744         }
1745     }
1746 
push(&mut self, el: InlineEl)1747     fn push(&mut self, el: InlineEl) {
1748         self.stack.push(el)
1749     }
1750 }
1751 
1752 #[derive(Debug, Clone)]
1753 enum RefScan<'a> {
1754     // label, source ix of label end
1755     LinkLabel(CowStr<'a>, usize),
1756     // contains next node index
1757     Collapsed(Option<TreeIndex>),
1758     Failed,
1759 }
1760 
1761 /// Skips forward within a block to a node which spans (ends inclusive) the given
1762 /// index into the source.
scan_nodes_to_ix( tree: &Tree<Item>, mut node: Option<TreeIndex>, ix: usize, ) -> Option<TreeIndex>1763 fn scan_nodes_to_ix(
1764     tree: &Tree<Item>,
1765     mut node: Option<TreeIndex>,
1766     ix: usize,
1767 ) -> Option<TreeIndex> {
1768     while let Some(node_ix) = node {
1769         if tree[node_ix].item.end <= ix {
1770             node = tree[node_ix].next;
1771         } else {
1772             break;
1773         }
1774     }
1775     node
1776 }
1777 
1778 /// Scans an inline link label, which cannot be interrupted.
1779 /// Returns number of bytes (including brackets) and label on success.
scan_link_label<'text, 'tree>( tree: &'tree Tree<Item>, text: &'text str, allow_footnote_refs: bool, ) -> Option<(usize, ReferenceLabel<'text>)>1780 fn scan_link_label<'text, 'tree>(
1781     tree: &'tree Tree<Item>,
1782     text: &'text str,
1783     allow_footnote_refs: bool,
1784 ) -> Option<(usize, ReferenceLabel<'text>)> {
1785     let bytes = &text.as_bytes();
1786     if bytes.len() < 2 || bytes[0] != b'[' {
1787         return None;
1788     }
1789     let linebreak_handler = |bytes: &[u8]| {
1790         let mut line_start = LineStart::new(bytes);
1791         let _ = scan_containers(tree, &mut line_start);
1792         Some(line_start.bytes_scanned())
1793     };
1794     let pair = if allow_footnote_refs && b'^' == bytes[1] {
1795         let (byte_index, cow) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
1796         (byte_index + 2, ReferenceLabel::Footnote(cow))
1797     } else {
1798         let (byte_index, cow) = scan_link_label_rest(&text[1..], &linebreak_handler)?;
1799         (byte_index + 1, ReferenceLabel::Link(cow))
1800     };
1801     Some(pair)
1802 }
1803 
scan_reference<'a, 'b>( tree: &'a Tree<Item>, text: &'b str, cur: Option<TreeIndex>, allow_footnote_refs: bool, ) -> RefScan<'b>1804 fn scan_reference<'a, 'b>(
1805     tree: &'a Tree<Item>,
1806     text: &'b str,
1807     cur: Option<TreeIndex>,
1808     allow_footnote_refs: bool,
1809 ) -> RefScan<'b> {
1810     let cur_ix = match cur {
1811         None => return RefScan::Failed,
1812         Some(cur_ix) => cur_ix,
1813     };
1814     let start = tree[cur_ix].item.start;
1815     let tail = &text.as_bytes()[start..];
1816 
1817     if tail.starts_with(b"[]") {
1818         let closing_node = tree[cur_ix].next.unwrap();
1819         RefScan::Collapsed(tree[closing_node].next)
1820     } else if let Some((ix, ReferenceLabel::Link(label))) =
1821         scan_link_label(tree, &text[start..], allow_footnote_refs)
1822     {
1823         RefScan::LinkLabel(label, start + ix)
1824     } else {
1825         RefScan::Failed
1826     }
1827 }
1828 
1829 #[derive(Clone, Default)]
1830 struct LinkStack {
1831     inner: Vec<LinkStackEl>,
1832     disabled_ix: usize,
1833 }
1834 
1835 impl LinkStack {
push(&mut self, el: LinkStackEl)1836     fn push(&mut self, el: LinkStackEl) {
1837         self.inner.push(el);
1838     }
1839 
pop(&mut self) -> Option<LinkStackEl>1840     fn pop(&mut self) -> Option<LinkStackEl> {
1841         let el = self.inner.pop();
1842         self.disabled_ix = std::cmp::min(self.disabled_ix, self.inner.len());
1843         el
1844     }
1845 
clear(&mut self)1846     fn clear(&mut self) {
1847         self.inner.clear();
1848         self.disabled_ix = 0;
1849     }
1850 
disable_all_links(&mut self)1851     fn disable_all_links(&mut self) {
1852         for el in &mut self.inner[self.disabled_ix..] {
1853             if el.ty == LinkStackTy::Link {
1854                 el.ty = LinkStackTy::Disabled;
1855             }
1856         }
1857         self.disabled_ix = self.inner.len();
1858     }
1859 }
1860 
1861 #[derive(Clone, Debug)]
1862 struct LinkStackEl {
1863     node: TreeIndex,
1864     ty: LinkStackTy,
1865 }
1866 
1867 #[derive(PartialEq, Clone, Debug)]
1868 enum LinkStackTy {
1869     Link,
1870     Image,
1871     Disabled,
1872 }
1873 
1874 #[derive(Clone)]
1875 struct LinkDef<'a> {
1876     dest: CowStr<'a>,
1877     title: Option<CowStr<'a>>,
1878 }
1879 
1880 /// Tracks tree indices of code span delimiters of each length. It should prevent
1881 /// quadratic scanning behaviours by providing (amortized) constant time lookups.
1882 struct CodeDelims {
1883     inner: HashMap<usize, VecDeque<TreeIndex>>,
1884     seen_first: bool,
1885 }
1886 
1887 impl CodeDelims {
new() -> Self1888     fn new() -> Self {
1889         Self {
1890             inner: Default::default(),
1891             seen_first: false,
1892         }
1893     }
1894 
insert(&mut self, count: usize, ix: TreeIndex)1895     fn insert(&mut self, count: usize, ix: TreeIndex) {
1896         if self.seen_first {
1897             self.inner
1898                 .entry(count)
1899                 .or_insert_with(Default::default)
1900                 .push_back(ix);
1901         } else {
1902             // Skip the first insert, since that delimiter will always
1903             // be an opener and not a closer.
1904             self.seen_first = true;
1905         }
1906     }
1907 
is_populated(&self) -> bool1908     fn is_populated(&self) -> bool {
1909         !self.inner.is_empty()
1910     }
1911 
find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex>1912     fn find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex> {
1913         while let Some(ix) = self.inner.get_mut(&count)?.pop_front() {
1914             if ix > open_ix {
1915                 return Some(ix);
1916             }
1917         }
1918         None
1919     }
1920 
clear(&mut self)1921     fn clear(&mut self) {
1922         self.inner.clear();
1923         self.seen_first = false;
1924     }
1925 }
1926 
1927 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1928 struct LinkIndex(usize);
1929 
1930 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1931 struct CowIndex(usize);
1932 
1933 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
1934 struct AlignmentIndex(usize);
1935 
1936 #[derive(Clone)]
1937 struct Allocations<'a> {
1938     refdefs: HashMap<LinkLabel<'a>, LinkDef<'a>>,
1939     links: Vec<(LinkType, CowStr<'a>, CowStr<'a>)>,
1940     cows: Vec<CowStr<'a>>,
1941     alignments: Vec<Vec<Alignment>>,
1942 }
1943 
1944 impl<'a> Allocations<'a> {
new() -> Self1945     fn new() -> Self {
1946         Self {
1947             refdefs: HashMap::new(),
1948             links: Vec::with_capacity(128),
1949             cows: Vec::new(),
1950             alignments: Vec::new(),
1951         }
1952     }
1953 
allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex1954     fn allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex {
1955         let ix = self.cows.len();
1956         self.cows.push(cow);
1957         CowIndex(ix)
1958     }
1959 
allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex1960     fn allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex {
1961         let ix = self.links.len();
1962         self.links.push((ty, url, title));
1963         LinkIndex(ix)
1964     }
1965 
allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex1966     fn allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex {
1967         let ix = self.alignments.len();
1968         self.alignments.push(alignment);
1969         AlignmentIndex(ix)
1970     }
1971 }
1972 
1973 impl<'a> Index<CowIndex> for Allocations<'a> {
1974     type Output = CowStr<'a>;
1975 
index(&self, ix: CowIndex) -> &Self::Output1976     fn index(&self, ix: CowIndex) -> &Self::Output {
1977         self.cows.index(ix.0)
1978     }
1979 }
1980 
1981 impl<'a> Index<LinkIndex> for Allocations<'a> {
1982     type Output = (LinkType, CowStr<'a>, CowStr<'a>);
1983 
index(&self, ix: LinkIndex) -> &Self::Output1984     fn index(&self, ix: LinkIndex) -> &Self::Output {
1985         self.links.index(ix.0)
1986     }
1987 }
1988 
1989 impl<'a> Index<AlignmentIndex> for Allocations<'a> {
1990     type Output = Vec<Alignment>;
1991 
index(&self, ix: AlignmentIndex) -> &Self::Output1992     fn index(&self, ix: AlignmentIndex) -> &Self::Output {
1993         self.alignments.index(ix.0)
1994     }
1995 }
1996 
1997 /// A struct containing information on the reachability of certain inline HTML
1998 /// elements. In particular, for cdata elements (`<![CDATA[`), processing
1999 /// elements (`<?`) and declarations (`<!DECLARATION`). The respectives usizes
2000 /// represent the indices before which a scan will always fail and can hence
2001 /// be skipped.
2002 #[derive(Clone, Default)]
2003 pub(crate) struct HtmlScanGuard {
2004     pub cdata: usize,
2005     pub processing: usize,
2006     pub declaration: usize,
2007 }
2008 
special_bytes(options: &Options) -> [bool; 256]2009 fn special_bytes(options: &Options) -> [bool; 256] {
2010     let mut bytes = [false; 256];
2011     let standard_bytes = [
2012         b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
2013     ];
2014 
2015     for &byte in &standard_bytes {
2016         bytes[byte as usize] = true;
2017     }
2018     if options.contains(Options::ENABLE_TABLES) {
2019         bytes[b'|' as usize] = true;
2020     }
2021     if options.contains(Options::ENABLE_STRIKETHROUGH) {
2022         bytes[b'~' as usize] = true;
2023     }
2024     if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
2025         for &byte in &[b'.', b'-', b'"', b'\''] {
2026             bytes[byte as usize] = true;
2027         }
2028     }
2029 
2030     bytes
2031 }
2032 
create_lut(options: &Options) -> LookupTable2033 pub(crate) fn create_lut(options: &Options) -> LookupTable {
2034     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2035     {
2036         LookupTable {
2037             simd: crate::simd::compute_lookup(options),
2038             scalar: special_bytes(options),
2039         }
2040     }
2041     #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2042     {
2043         special_bytes(options)
2044     }
2045 }
2046 
2047 pub type BrokenLinkCallback<'a> =
2048     Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>;
2049 
2050 /// Markdown event iterator.
2051 pub struct Parser<'a> {
2052     text: &'a str,
2053     options: Options,
2054     tree: Tree<Item>,
2055     allocs: Allocations<'a>,
2056     broken_link_callback: BrokenLinkCallback<'a>,
2057     html_scan_guard: HtmlScanGuard,
2058 
2059     // used by inline passes. store them here for reuse
2060     inline_stack: InlineStack,
2061     link_stack: LinkStack,
2062 }
2063 
2064 impl<'a> Parser<'a> {
2065     /// Creates a new event iterator for a markdown string without any options enabled.
new(text: &'a str) -> Parser<'a>2066     pub fn new(text: &'a str) -> Parser<'a> {
2067         Parser::new_ext(text, Options::empty())
2068     }
2069 
2070     /// Creates a new event iterator for a markdown string with given options.
new_ext(text: &'a str, options: Options) -> Parser<'a>2071     pub fn new_ext(text: &'a str, options: Options) -> Parser<'a> {
2072         Parser::new_with_broken_link_callback(text, options, None)
2073     }
2074 
2075     /// In case the parser encounters any potential links that have a broken
2076     /// reference (e.g `[foo]` when there is no `[foo]: ` entry at the bottom)
2077     /// the provided callback will be called with the reference name,
2078     /// and the returned pair will be used as the link name and title if it is not
2079     /// `None`.
new_with_broken_link_callback( text: &'a str, options: Options, broken_link_callback: BrokenLinkCallback<'a>, ) -> Parser<'a>2080     pub fn new_with_broken_link_callback(
2081         text: &'a str,
2082         options: Options,
2083         broken_link_callback: BrokenLinkCallback<'a>,
2084     ) -> Parser<'a> {
2085         let lut = create_lut(&options);
2086         let first_pass = FirstPass::new(text, options, &lut);
2087         let (mut tree, allocs) = first_pass.run();
2088         tree.reset();
2089         let inline_stack = Default::default();
2090         let link_stack = Default::default();
2091         let html_scan_guard = Default::default();
2092         Parser {
2093             text,
2094             options,
2095             tree,
2096             allocs,
2097             broken_link_callback,
2098             inline_stack,
2099             link_stack,
2100             html_scan_guard,
2101         }
2102     }
2103 
2104     /// Handle inline markup.
2105     ///
2106     /// When the parser encounters any item indicating potential inline markup, all
2107     /// inline markup passes are run on the remainder of the chain.
2108     ///
2109     /// Note: there's some potential for optimization here, but that's future work.
handle_inline(&mut self)2110     fn handle_inline(&mut self) {
2111         self.handle_inline_pass1();
2112         self.handle_emphasis();
2113     }
2114 
2115     /// Handle inline HTML, code spans, and links.
2116     ///
2117     /// This function handles both inline HTML and code spans, because they have
2118     /// the same precedence. It also handles links, even though they have lower
2119     /// precedence, because the URL of links must not be processed.
handle_inline_pass1(&mut self)2120     fn handle_inline_pass1(&mut self) {
2121         let mut code_delims = CodeDelims::new();
2122         let mut cur = self.tree.cur();
2123         let mut prev = None;
2124 
2125         let block_end = self.tree[self.tree.peek_up().unwrap()].item.end;
2126         let block_text = &self.text[..block_end];
2127 
2128         while let Some(mut cur_ix) = cur {
2129             match self.tree[cur_ix].item.body {
2130                 ItemBody::MaybeHtml => {
2131                     let next = self.tree[cur_ix].next;
2132                     let autolink = if let Some(next_ix) = next {
2133                         scan_autolink(block_text, self.tree[next_ix].item.start)
2134                     } else {
2135                         None
2136                     };
2137 
2138                     if let Some((ix, uri, link_type)) = autolink {
2139                         let node = scan_nodes_to_ix(&self.tree, next, ix);
2140                         let text_node = self.tree.create_node(Item {
2141                             start: self.tree[cur_ix].item.start + 1,
2142                             end: ix - 1,
2143                             body: ItemBody::Text,
2144                         });
2145                         let link_ix = self.allocs.allocate_link(link_type, uri, "".into());
2146                         self.tree[cur_ix].item.body = ItemBody::Link(link_ix);
2147                         self.tree[cur_ix].item.end = ix;
2148                         self.tree[cur_ix].next = node;
2149                         self.tree[cur_ix].child = Some(text_node);
2150                         prev = cur;
2151                         cur = node;
2152                         if let Some(node_ix) = cur {
2153                             self.tree[node_ix].item.start = max(self.tree[node_ix].item.start, ix);
2154                         }
2155                         continue;
2156                     } else {
2157                         let inline_html = next.and_then(|next_ix| {
2158                             self.scan_inline_html(
2159                                 block_text.as_bytes(),
2160                                 self.tree[next_ix].item.start,
2161                             )
2162                         });
2163                         if let Some((span, ix)) = inline_html {
2164                             let node = scan_nodes_to_ix(&self.tree, next, ix);
2165                             self.tree[cur_ix].item.body = if !span.is_empty() {
2166                                 let converted_string =
2167                                     String::from_utf8(span).expect("invalid utf8");
2168                                 ItemBody::OwnedHtml(
2169                                     self.allocs.allocate_cow(converted_string.into()),
2170                                 )
2171                             } else {
2172                                 ItemBody::Html
2173                             };
2174                             self.tree[cur_ix].item.end = ix;
2175                             self.tree[cur_ix].next = node;
2176                             prev = cur;
2177                             cur = node;
2178                             if let Some(node_ix) = cur {
2179                                 self.tree[node_ix].item.start =
2180                                     max(self.tree[node_ix].item.start, ix);
2181                             }
2182                             continue;
2183                         }
2184                     }
2185                     self.tree[cur_ix].item.body = ItemBody::Text;
2186                 }
2187                 ItemBody::MaybeCode(mut search_count, preceded_by_backslash) => {
2188                     if preceded_by_backslash {
2189                         search_count -= 1;
2190                         if search_count == 0 {
2191                             self.tree[cur_ix].item.body = ItemBody::Text;
2192                             prev = cur;
2193                             cur = self.tree[cur_ix].next;
2194                             continue;
2195                         }
2196                     }
2197 
2198                     if code_delims.is_populated() {
2199                         // we have previously scanned all codeblock delimiters,
2200                         // so we can reuse that work
2201                         if let Some(scan_ix) = code_delims.find(cur_ix, search_count) {
2202                             self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
2203                         } else {
2204                             self.tree[cur_ix].item.body = ItemBody::Text;
2205                         }
2206                     } else {
2207                         // we haven't previously scanned all codeblock delimiters,
2208                         // so walk the AST
2209                         let mut scan = if search_count > 0 {
2210                             self.tree[cur_ix].next
2211                         } else {
2212                             None
2213                         };
2214                         while let Some(scan_ix) = scan {
2215                             if let ItemBody::MaybeCode(delim_count, _) =
2216                                 self.tree[scan_ix].item.body
2217                             {
2218                                 if search_count == delim_count {
2219                                     self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
2220                                     code_delims.clear();
2221                                     break;
2222                                 } else {
2223                                     code_delims.insert(delim_count, scan_ix);
2224                                 }
2225                             }
2226                             scan = self.tree[scan_ix].next;
2227                         }
2228                         if scan == None {
2229                             self.tree[cur_ix].item.body = ItemBody::Text;
2230                         }
2231                     }
2232                 }
2233                 ItemBody::MaybeLinkOpen => {
2234                     self.tree[cur_ix].item.body = ItemBody::Text;
2235                     self.link_stack.push(LinkStackEl {
2236                         node: cur_ix,
2237                         ty: LinkStackTy::Link,
2238                     });
2239                 }
2240                 ItemBody::MaybeImage => {
2241                     self.tree[cur_ix].item.body = ItemBody::Text;
2242                     self.link_stack.push(LinkStackEl {
2243                         node: cur_ix,
2244                         ty: LinkStackTy::Image,
2245                     });
2246                 }
2247                 ItemBody::MaybeLinkClose(could_be_ref) => {
2248                     self.tree[cur_ix].item.body = ItemBody::Text;
2249                     if let Some(tos) = self.link_stack.pop() {
2250                         if tos.ty == LinkStackTy::Disabled {
2251                             continue;
2252                         }
2253                         let next = self.tree[cur_ix].next;
2254                         if let Some((next_ix, url, title)) =
2255                             self.scan_inline_link(block_text, self.tree[cur_ix].item.end, next)
2256                         {
2257                             let next_node = scan_nodes_to_ix(&self.tree, next, next_ix);
2258                             if let Some(prev_ix) = prev {
2259                                 self.tree[prev_ix].next = None;
2260                             }
2261                             cur = Some(tos.node);
2262                             cur_ix = tos.node;
2263                             let link_ix = self.allocs.allocate_link(LinkType::Inline, url, title);
2264                             self.tree[cur_ix].item.body = if tos.ty == LinkStackTy::Image {
2265                                 ItemBody::Image(link_ix)
2266                             } else {
2267                                 ItemBody::Link(link_ix)
2268                             };
2269                             self.tree[cur_ix].child = self.tree[cur_ix].next;
2270                             self.tree[cur_ix].next = next_node;
2271                             self.tree[cur_ix].item.end = next_ix;
2272                             if let Some(next_node_ix) = next_node {
2273                                 self.tree[next_node_ix].item.start =
2274                                     max(self.tree[next_node_ix].item.start, next_ix);
2275                             }
2276 
2277                             if tos.ty == LinkStackTy::Link {
2278                                 self.link_stack.disable_all_links();
2279                             }
2280                         } else {
2281                             // ok, so its not an inline link. maybe it is a reference
2282                             // to a defined link?
2283                             let scan_result = scan_reference(
2284                                 &self.tree,
2285                                 block_text,
2286                                 next,
2287                                 self.options.contains(Options::ENABLE_FOOTNOTES),
2288                             );
2289                             let (node_after_link, link_type) = match scan_result {
2290                                 // [label][reference]
2291                                 RefScan::LinkLabel(_, end_ix) => {
2292                                     // Toggle reference viability of the last closing bracket,
2293                                     // so that we can skip it on future iterations in case
2294                                     // it fails in this one. In particular, we won't call
2295                                     // the broken link callback twice on one reference.
2296                                     let reference_close_node =
2297                                         scan_nodes_to_ix(&self.tree, next, end_ix - 1).unwrap();
2298                                     self.tree[reference_close_node].item.body =
2299                                         ItemBody::MaybeLinkClose(false);
2300                                     let next_node = self.tree[reference_close_node].next;
2301 
2302                                     (next_node, LinkType::Reference)
2303                                 }
2304                                 // [reference][]
2305                                 RefScan::Collapsed(next_node) => {
2306                                     // This reference has already been tried, and it's not
2307                                     // valid. Skip it.
2308                                     if !could_be_ref {
2309                                         continue;
2310                                     }
2311                                     (next_node, LinkType::Collapsed)
2312                                 }
2313                                 // [shortcut]
2314                                 //
2315                                 // [shortcut]: /blah
2316                                 RefScan::Failed => {
2317                                     if !could_be_ref {
2318                                         continue;
2319                                     }
2320                                     (next, LinkType::Shortcut)
2321                                 }
2322                             };
2323 
2324                             // FIXME: references and labels are mixed in the naming of variables
2325                             // below. Disambiguate!
2326 
2327                             // (label, source_ix end)
2328                             let label: Option<(ReferenceLabel<'a>, usize)> = match scan_result {
2329                                 RefScan::LinkLabel(l, end_ix) => {
2330                                     Some((ReferenceLabel::Link(l), end_ix))
2331                                 }
2332                                 RefScan::Collapsed(..) | RefScan::Failed => {
2333                                     // No label? maybe it is a shortcut reference
2334                                     let label_start = self.tree[tos.node].item.end - 1;
2335                                     scan_link_label(
2336                                         &self.tree,
2337                                         &self.text[label_start..self.tree[cur_ix].item.end],
2338                                         self.options.contains(Options::ENABLE_FOOTNOTES),
2339                                     )
2340                                     .map(|(ix, label)| (label, label_start + ix))
2341                                 }
2342                             };
2343 
2344                             // see if it's a footnote reference
2345                             if let Some((ReferenceLabel::Footnote(l), end)) = label {
2346                                 self.tree[tos.node].next = node_after_link;
2347                                 self.tree[tos.node].child = None;
2348                                 self.tree[tos.node].item.body =
2349                                     ItemBody::FootnoteReference(self.allocs.allocate_cow(l));
2350                                 self.tree[tos.node].item.end = end;
2351                                 prev = Some(tos.node);
2352                                 cur = node_after_link;
2353                                 self.link_stack.clear();
2354                                 continue;
2355                             } else if let Some((ReferenceLabel::Link(link_label), end)) = label {
2356                                 let type_url_title = self
2357                                     .allocs
2358                                     .refdefs
2359                                     .get(&UniCase::new(link_label.as_ref().into()))
2360                                     .map(|matching_def| {
2361                                         // found a matching definition!
2362                                         let title = matching_def
2363                                             .title
2364                                             .as_ref()
2365                                             .cloned()
2366                                             .unwrap_or_else(|| "".into());
2367                                         let url = matching_def.dest.clone();
2368                                         (link_type, url, title)
2369                                     })
2370                                     .or_else(|| {
2371                                         match self.broken_link_callback.as_mut() {
2372                                             Some(callback) => {
2373                                                 // Construct a BrokenLink struct, which will be passed to the callback
2374                                                 let broken_link = BrokenLink {
2375                                                     span: (self.tree[tos.node].item.start)..end,
2376                                                     link_type: link_type,
2377                                                     reference: link_label.as_ref(),
2378                                                 };
2379 
2380                                                 callback(broken_link).map(|(url, title)| {
2381                                                     (link_type.to_unknown(), url, title)
2382                                                 })
2383                                             }
2384                                             None => None,
2385                                         }
2386                                     });
2387 
2388                                 if let Some((def_link_type, url, title)) = type_url_title {
2389                                     let link_ix =
2390                                         self.allocs.allocate_link(def_link_type, url, title);
2391                                     self.tree[tos.node].item.body = if tos.ty == LinkStackTy::Image
2392                                     {
2393                                         ItemBody::Image(link_ix)
2394                                     } else {
2395                                         ItemBody::Link(link_ix)
2396                                     };
2397                                     let label_node = self.tree[tos.node].next;
2398 
2399                                     // lets do some tree surgery to add the link to the tree
2400                                     // 1st: skip the label node and close node
2401                                     self.tree[tos.node].next = node_after_link;
2402 
2403                                     // then, if it exists, add the label node as a child to the link node
2404                                     if label_node != cur {
2405                                         self.tree[tos.node].child = label_node;
2406 
2407                                         // finally: disconnect list of children
2408                                         if let Some(prev_ix) = prev {
2409                                             self.tree[prev_ix].next = None;
2410                                         }
2411                                     }
2412 
2413                                     self.tree[tos.node].item.end = end;
2414 
2415                                     // set up cur so next node will be node_after_link
2416                                     cur = Some(tos.node);
2417                                     cur_ix = tos.node;
2418 
2419                                     if tos.ty == LinkStackTy::Link {
2420                                         self.link_stack.disable_all_links();
2421                                     }
2422                                 }
2423                             }
2424                         }
2425                     }
2426                 }
2427                 _ => (),
2428             }
2429             prev = cur;
2430             cur = self.tree[cur_ix].next;
2431         }
2432         self.link_stack.clear();
2433     }
2434 
handle_emphasis(&mut self)2435     fn handle_emphasis(&mut self) {
2436         let mut prev = None;
2437         let mut prev_ix: TreeIndex;
2438         let mut cur = self.tree.cur();
2439 
2440         let mut single_quote_open: Option<TreeIndex> = None;
2441         let mut double_quote_open: bool = false;
2442 
2443         while let Some(mut cur_ix) = cur {
2444             match self.tree[cur_ix].item.body {
2445                 ItemBody::MaybeEmphasis(mut count, can_open, can_close) => {
2446                     let c = self.text.as_bytes()[self.tree[cur_ix].item.start];
2447                     let both = can_open && can_close;
2448                     if can_close {
2449                         while let Some(el) =
2450                             self.inline_stack.find_match(&mut self.tree, c, count, both)
2451                         {
2452                             // have a match!
2453                             if let Some(prev_ix) = prev {
2454                                 self.tree[prev_ix].next = None;
2455                             }
2456                             let match_count = min(count, el.count);
2457                             // start, end are tree node indices
2458                             let mut end = cur_ix - 1;
2459                             let mut start = el.start + el.count;
2460 
2461                             // work from the inside out
2462                             while start > el.start + el.count - match_count {
2463                                 let (inc, ty) = if c == b'~' {
2464                                     (2, ItemBody::Strikethrough)
2465                                 } else if start > el.start + el.count - match_count + 1 {
2466                                     (2, ItemBody::Strong)
2467                                 } else {
2468                                     (1, ItemBody::Emphasis)
2469                                 };
2470 
2471                                 let root = start - inc;
2472                                 end = end + inc;
2473                                 self.tree[root].item.body = ty;
2474                                 self.tree[root].item.end = self.tree[end].item.end;
2475                                 self.tree[root].child = Some(start);
2476                                 self.tree[root].next = None;
2477                                 start = root;
2478                             }
2479 
2480                             // set next for top most emph level
2481                             prev_ix = el.start + el.count - match_count;
2482                             prev = Some(prev_ix);
2483                             cur = self.tree[cur_ix + match_count - 1].next;
2484                             self.tree[prev_ix].next = cur;
2485 
2486                             if el.count > match_count {
2487                                 self.inline_stack.push(InlineEl {
2488                                     start: el.start,
2489                                     count: el.count - match_count,
2490                                     c: el.c,
2491                                     both,
2492                                 })
2493                             }
2494                             count -= match_count;
2495                             if count > 0 {
2496                                 cur_ix = cur.unwrap();
2497                             } else {
2498                                 break;
2499                             }
2500                         }
2501                     }
2502                     if count > 0 {
2503                         if can_open {
2504                             self.inline_stack.push(InlineEl {
2505                                 start: cur_ix,
2506                                 count,
2507                                 c,
2508                                 both,
2509                             });
2510                         } else {
2511                             for i in 0..count {
2512                                 self.tree[cur_ix + i].item.body = ItemBody::Text;
2513                             }
2514                         }
2515                         prev_ix = cur_ix + count - 1;
2516                         prev = Some(prev_ix);
2517                         cur = self.tree[prev_ix].next;
2518                     }
2519                 }
2520                 ItemBody::MaybeSmartQuote(c, can_open, can_close) => {
2521                     self.tree[cur_ix].item.body = match c {
2522                         b'\'' => {
2523                             if let (Some(open_ix), true) = (single_quote_open, can_close) {
2524                                 self.tree[open_ix].item.body = ItemBody::SynthesizeChar('‘');
2525                                 single_quote_open = None;
2526                             } else if can_open {
2527                                 single_quote_open = Some(cur_ix);
2528                             }
2529                             ItemBody::SynthesizeChar('’')
2530                         }
2531                         _ /* double quote */ => {
2532                             if can_close && double_quote_open {
2533                                 double_quote_open = false;
2534                                 ItemBody::SynthesizeChar('”')
2535                             } else {
2536                                 if can_open && !double_quote_open {
2537                                     double_quote_open = true;
2538                                 }
2539                                 ItemBody::SynthesizeChar('“')
2540                             }
2541                         }
2542                     };
2543                     prev = cur;
2544                     cur = self.tree[cur_ix].next;
2545                 }
2546                 _ => {
2547                     prev = cur;
2548                     cur = self.tree[cur_ix].next;
2549                 }
2550             }
2551         }
2552         self.inline_stack.pop_all(&mut self.tree);
2553     }
2554 
2555     /// Returns next byte index, url and title.
scan_inline_link( &self, underlying: &'a str, mut ix: usize, node: Option<TreeIndex>, ) -> Option<(usize, CowStr<'a>, CowStr<'a>)>2556     fn scan_inline_link(
2557         &self,
2558         underlying: &'a str,
2559         mut ix: usize,
2560         node: Option<TreeIndex>,
2561     ) -> Option<(usize, CowStr<'a>, CowStr<'a>)> {
2562         if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
2563             return None;
2564         }
2565         ix += 1;
2566         ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2567 
2568         let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
2569         let dest = unescape(dest);
2570         ix += dest_length;
2571 
2572         ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2573 
2574         let title = if let Some((bytes_scanned, t)) = self.scan_link_title(underlying, ix, node) {
2575             ix += bytes_scanned;
2576             ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
2577             t
2578         } else {
2579             "".into()
2580         };
2581         if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
2582             return None;
2583         }
2584         ix += 1;
2585 
2586         Some((ix, dest, title))
2587     }
2588 
2589     // returns (bytes scanned, title cow)
scan_link_title( &self, text: &'a str, start_ix: usize, node: Option<TreeIndex>, ) -> Option<(usize, CowStr<'a>)>2590     fn scan_link_title(
2591         &self,
2592         text: &'a str,
2593         start_ix: usize,
2594         node: Option<TreeIndex>,
2595     ) -> Option<(usize, CowStr<'a>)> {
2596         let bytes = text.as_bytes();
2597         let open = match bytes.get(start_ix) {
2598             Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
2599             _ => return None,
2600         };
2601         let close = if open == b'(' { b')' } else { open };
2602 
2603         let mut title = String::new();
2604         let mut mark = start_ix + 1;
2605         let mut i = start_ix + 1;
2606 
2607         while i < bytes.len() {
2608             let c = bytes[i];
2609 
2610             if c == close {
2611                 let cow = if mark == 1 {
2612                     (i - start_ix + 1, text[mark..i].into())
2613                 } else {
2614                     title.push_str(&text[mark..i]);
2615                     (i - start_ix + 1, title.into())
2616                 };
2617 
2618                 return Some(cow);
2619             }
2620             if c == open {
2621                 return None;
2622             }
2623 
2624             if c == b'\n' || c == b'\r' {
2625                 if let Some(node_ix) = scan_nodes_to_ix(&self.tree, node, i + 1) {
2626                     if self.tree[node_ix].item.start > i {
2627                         title.push_str(&text[mark..i]);
2628                         title.push('\n');
2629                         i = self.tree[node_ix].item.start;
2630                         mark = i;
2631                         continue;
2632                     }
2633                 }
2634             }
2635             if c == b'&' {
2636                 if let (n, Some(value)) = scan_entity(&bytes[i..]) {
2637                     title.push_str(&text[mark..i]);
2638                     title.push_str(&value);
2639                     i += n;
2640                     mark = i;
2641                     continue;
2642                 }
2643             }
2644             if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
2645                 title.push_str(&text[mark..i]);
2646                 i += 1;
2647                 mark = i;
2648             }
2649 
2650             i += 1;
2651         }
2652 
2653         None
2654     }
2655 
2656     /// Make a code span.
2657     ///
2658     /// Both `open` and `close` are matching MaybeCode items.
make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool)2659     fn make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool) {
2660         let first_ix = open + 1;
2661         let last_ix = close - 1;
2662         let bytes = self.text.as_bytes();
2663         let mut span_start = self.tree[open].item.end;
2664         let mut span_end = self.tree[close].item.start;
2665         let mut buf: Option<String> = None;
2666 
2667         // detect all-space sequences, since they are kept as-is as of commonmark 0.29
2668         if !bytes[span_start..span_end].iter().all(|&b| b == b' ') {
2669             let opening = match bytes[span_start] {
2670                 b' ' | b'\r' | b'\n' => true,
2671                 _ => false,
2672             };
2673             let closing = match bytes[span_end - 1] {
2674                 b' ' | b'\r' | b'\n' => true,
2675                 _ => false,
2676             };
2677             let drop_enclosing_whitespace = opening && closing;
2678 
2679             if drop_enclosing_whitespace {
2680                 span_start += 1;
2681                 if span_start < span_end {
2682                     span_end -= 1;
2683                 }
2684             }
2685 
2686             let mut ix = first_ix;
2687 
2688             while ix < close {
2689                 if let ItemBody::HardBreak | ItemBody::SoftBreak = self.tree[ix].item.body {
2690                     if drop_enclosing_whitespace {
2691                         // check whether break should be ignored
2692                         if ix == first_ix {
2693                             ix = ix + 1;
2694                             span_start = min(span_end, self.tree[ix].item.start);
2695                             continue;
2696                         } else if ix == last_ix && last_ix > first_ix {
2697                             ix = ix + 1;
2698                             continue;
2699                         }
2700                     }
2701 
2702                     let end = bytes[self.tree[ix].item.start..]
2703                         .iter()
2704                         .position(|&b| b == b'\r' || b == b'\n')
2705                         .unwrap()
2706                         + self.tree[ix].item.start;
2707                     if let Some(ref mut buf) = buf {
2708                         buf.push_str(&self.text[self.tree[ix].item.start..end]);
2709                         buf.push(' ');
2710                     } else {
2711                         let mut new_buf = String::with_capacity(span_end - span_start);
2712                         new_buf.push_str(&self.text[span_start..end]);
2713                         new_buf.push(' ');
2714                         buf = Some(new_buf);
2715                     }
2716                 } else if let Some(ref mut buf) = buf {
2717                     let end = if ix == last_ix {
2718                         span_end
2719                     } else {
2720                         self.tree[ix].item.end
2721                     };
2722                     buf.push_str(&self.text[self.tree[ix].item.start..end]);
2723                 }
2724                 ix = ix + 1;
2725             }
2726         }
2727 
2728         let cow = if let Some(buf) = buf {
2729             buf.into()
2730         } else {
2731             self.text[span_start..span_end].into()
2732         };
2733         if preceding_backslash {
2734             self.tree[open].item.body = ItemBody::Text;
2735             self.tree[open].item.end = self.tree[open].item.start + 1;
2736             self.tree[open].next = Some(close);
2737             self.tree[close].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
2738             self.tree[close].item.start = self.tree[open].item.start + 1;
2739         } else {
2740             self.tree[open].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
2741             self.tree[open].item.end = self.tree[close].item.end;
2742             self.tree[open].next = self.tree[close].next;
2743         }
2744     }
2745 
2746     /// On success, returns a buffer containing the inline html and byte offset.
2747     /// When no bytes were skipped, the buffer will be empty and the html can be
2748     /// represented as a subslice of the input string.
scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)>2749     fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
2750         let c = *bytes.get(ix)?;
2751         if c == b'!' {
2752             Some((
2753                 vec![],
2754                 scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
2755             ))
2756         } else if c == b'?' {
2757             Some((
2758                 vec![],
2759                 scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
2760             ))
2761         } else {
2762             let (span, i) = scan_html_block_inner(
2763                 // Subtract 1 to include the < character
2764                 &bytes[(ix - 1)..],
2765                 Some(&|_bytes| {
2766                     let mut line_start = LineStart::new(bytes);
2767                     let _ = scan_containers(&self.tree, &mut line_start);
2768                     line_start.bytes_scanned()
2769                 }),
2770             )?;
2771             Some((span, i + ix - 1))
2772         }
2773     }
2774 
2775     /// Consumes the event iterator and produces an iterator that produces
2776     /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
2777     /// range in the markdown source.
into_offset_iter(self) -> OffsetIter<'a>2778     pub fn into_offset_iter(self) -> OffsetIter<'a> {
2779         OffsetIter { inner: self }
2780     }
2781 }
2782 
2783 pub(crate) enum LoopInstruction<T> {
2784     /// Continue looking for more special bytes, but skip next few bytes.
2785     ContinueAndSkip(usize),
2786     /// Break looping immediately, returning with the given index and value.
2787     BreakAtWith(usize, T),
2788 }
2789 
2790 #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2791 pub(crate) struct LookupTable {
2792     pub simd: [u8; 16],
2793     pub scalar: [bool; 256],
2794 }
2795 
2796 #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2797 type LookupTable = [bool; 256];
2798 
2799 /// This function walks the byte slices from the given index and
2800 /// calls the callback function on all bytes (and their indices) that are in the following set:
2801 /// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
2802 /// It is guaranteed not call the callback on other bytes.
2803 /// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
2804 /// will not be called with an index that is less than `ix + n + 1`.
2805 /// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be
2806 /// called and the function returns immediately with the return value `(end_ix, opt_val)`.
2807 /// If `BreakAtWith(..)` is never returned, this function will return the first
2808 /// index that is outside the byteslice bound and a `None` value.
iterate_special_bytes<F, T>( lut: &LookupTable, bytes: &[u8], ix: usize, callback: F, ) -> (usize, Option<T>) where F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,2809 fn iterate_special_bytes<F, T>(
2810     lut: &LookupTable,
2811     bytes: &[u8],
2812     ix: usize,
2813     callback: F,
2814 ) -> (usize, Option<T>)
2815 where
2816     F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2817 {
2818     #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2819     {
2820         crate::simd::iterate_special_bytes(lut, bytes, ix, callback)
2821     }
2822     #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2823     {
2824         scalar_iterate_special_bytes(lut, bytes, ix, callback)
2825     }
2826 }
2827 
scalar_iterate_special_bytes<F, T>( lut: &[bool; 256], bytes: &[u8], mut ix: usize, mut callback: F, ) -> (usize, Option<T>) where F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,2828 pub(crate) fn scalar_iterate_special_bytes<F, T>(
2829     lut: &[bool; 256],
2830     bytes: &[u8],
2831     mut ix: usize,
2832     mut callback: F,
2833 ) -> (usize, Option<T>)
2834 where
2835     F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2836 {
2837     while ix < bytes.len() {
2838         let b = bytes[ix];
2839         if lut[b as usize] {
2840             match callback(ix, b) {
2841                 LoopInstruction::ContinueAndSkip(skip) => {
2842                     ix += skip;
2843                 }
2844                 LoopInstruction::BreakAtWith(ix, val) => {
2845                     return (ix, val);
2846                 }
2847             }
2848         }
2849         ix += 1;
2850     }
2851 
2852     (ix, None)
2853 }
2854 
2855 /// Markdown event and source range iterator.
2856 ///
2857 /// Generates tuples where the first element is the markdown event and the second
2858 /// is a the corresponding range in the source string.
2859 ///
2860 /// Constructed from a `Parser` using its
2861 /// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
2862 pub struct OffsetIter<'a> {
2863     inner: Parser<'a>,
2864 }
2865 
2866 impl<'a> Iterator for OffsetIter<'a> {
2867     type Item = (Event<'a>, Range<usize>);
2868 
next(&mut self) -> Option<Self::Item>2869     fn next(&mut self) -> Option<Self::Item> {
2870         match self.inner.tree.cur() {
2871             None => {
2872                 let ix = self.inner.tree.pop()?;
2873                 let tag = item_to_tag(&self.inner.tree[ix].item, &self.inner.allocs);
2874                 self.inner.tree.next_sibling(ix);
2875                 Some((
2876                     Event::End(tag),
2877                     self.inner.tree[ix].item.start..self.inner.tree[ix].item.end,
2878                 ))
2879             }
2880             Some(cur_ix) => {
2881                 if self.inner.tree[cur_ix].item.body.is_inline() {
2882                     self.inner.handle_inline();
2883                 }
2884 
2885                 let node = self.inner.tree[cur_ix];
2886                 let item = node.item;
2887                 let event = item_to_event(item, self.inner.text, &self.inner.allocs);
2888                 if let Event::Start(..) = event {
2889                     self.inner.tree.push();
2890                 } else {
2891                     self.inner.tree.next_sibling(cur_ix);
2892                 }
2893                 Some((event, item.start..item.end))
2894             }
2895         }
2896     }
2897 }
2898 
item_to_tag<'a>(item: &Item, allocs: &Allocations<'a>) -> Tag<'a>2899 fn item_to_tag<'a>(item: &Item, allocs: &Allocations<'a>) -> Tag<'a> {
2900     match item.body {
2901         ItemBody::Paragraph => Tag::Paragraph,
2902         ItemBody::Emphasis => Tag::Emphasis,
2903         ItemBody::Strong => Tag::Strong,
2904         ItemBody::Strikethrough => Tag::Strikethrough,
2905         ItemBody::Link(link_ix) => {
2906             let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2907             Tag::Link(*link_type, url.clone(), title.clone())
2908         }
2909         ItemBody::Image(link_ix) => {
2910             let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2911             Tag::Image(*link_type, url.clone(), title.clone())
2912         }
2913         ItemBody::Heading(level) => Tag::Heading(level),
2914         ItemBody::FencedCodeBlock(cow_ix) => {
2915             Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
2916         }
2917         ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
2918         ItemBody::BlockQuote => Tag::BlockQuote,
2919         ItemBody::List(_, c, listitem_start) => {
2920             if c == b'.' || c == b')' {
2921                 Tag::List(Some(listitem_start))
2922             } else {
2923                 Tag::List(None)
2924             }
2925         }
2926         ItemBody::ListItem(_) => Tag::Item,
2927         ItemBody::TableHead => Tag::TableHead,
2928         ItemBody::TableCell => Tag::TableCell,
2929         ItemBody::TableRow => Tag::TableRow,
2930         ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
2931         ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
2932         _ => panic!("unexpected item body {:?}", item.body),
2933     }
2934 }
2935 
item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Event<'a>2936 fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Event<'a> {
2937     let tag = match item.body {
2938         ItemBody::Text => return Event::Text(text[item.start..item.end].into()),
2939         ItemBody::Code(cow_ix) => return Event::Code(allocs[cow_ix].clone()),
2940         ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
2941         ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
2942         ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
2943         ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs[cow_ix].clone()),
2944         ItemBody::SoftBreak => return Event::SoftBreak,
2945         ItemBody::HardBreak => return Event::HardBreak,
2946         ItemBody::FootnoteReference(cow_ix) => {
2947             return Event::FootnoteReference(allocs[cow_ix].clone())
2948         }
2949         ItemBody::TaskListMarker(checked) => return Event::TaskListMarker(checked),
2950         ItemBody::Rule => return Event::Rule,
2951 
2952         ItemBody::Paragraph => Tag::Paragraph,
2953         ItemBody::Emphasis => Tag::Emphasis,
2954         ItemBody::Strong => Tag::Strong,
2955         ItemBody::Strikethrough => Tag::Strikethrough,
2956         ItemBody::Link(link_ix) => {
2957             let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2958             Tag::Link(*link_type, url.clone(), title.clone())
2959         }
2960         ItemBody::Image(link_ix) => {
2961             let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
2962             Tag::Image(*link_type, url.clone(), title.clone())
2963         }
2964         ItemBody::Heading(level) => Tag::Heading(level),
2965         ItemBody::FencedCodeBlock(cow_ix) => {
2966             Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
2967         }
2968         ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
2969         ItemBody::BlockQuote => Tag::BlockQuote,
2970         ItemBody::List(_, c, listitem_start) => {
2971             if c == b'.' || c == b')' {
2972                 Tag::List(Some(listitem_start))
2973             } else {
2974                 Tag::List(None)
2975             }
2976         }
2977         ItemBody::ListItem(_) => Tag::Item,
2978         ItemBody::TableHead => Tag::TableHead,
2979         ItemBody::TableCell => Tag::TableCell,
2980         ItemBody::TableRow => Tag::TableRow,
2981         ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
2982         ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
2983         _ => panic!("unexpected item body {:?}", item.body),
2984     };
2985 
2986     Event::Start(tag)
2987 }
2988 
2989 // https://english.stackexchange.com/a/285573
surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex)2990 fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
2991     let mut list_item = tree[list_ix].child;
2992     while let Some(listitem_ix) = list_item {
2993         // first child is special, controls how we repoint list_item.child
2994         let list_item_firstborn = tree[listitem_ix].child;
2995 
2996         // Check that list item has children - this is not necessarily the case!
2997         if let Some(firstborn_ix) = list_item_firstborn {
2998             if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
2999                 tree[listitem_ix].child = tree[firstborn_ix].child;
3000             }
3001 
3002             let mut list_item_child = Some(firstborn_ix);
3003             let mut node_to_repoint = None;
3004             while let Some(child_ix) = list_item_child {
3005                 // surgerize paragraphs
3006                 let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
3007                     if let Some(child_firstborn) = tree[child_ix].child {
3008                         if let Some(repoint_ix) = node_to_repoint {
3009                             tree[repoint_ix].next = Some(child_firstborn);
3010                         }
3011                         let mut child_lastborn = child_firstborn;
3012                         while let Some(lastborn_next_ix) = tree[child_lastborn].next {
3013                             child_lastborn = lastborn_next_ix;
3014                         }
3015                         child_lastborn
3016                     } else {
3017                         child_ix
3018                     }
3019                 } else {
3020                     child_ix
3021                 };
3022 
3023                 node_to_repoint = Some(repoint_ix);
3024                 tree[repoint_ix].next = tree[child_ix].next;
3025                 list_item_child = tree[child_ix].next;
3026             }
3027         }
3028 
3029         list_item = tree[listitem_ix].next;
3030     }
3031 }
3032 
3033 impl<'a> Iterator for Parser<'a> {
3034     type Item = Event<'a>;
3035 
next(&mut self) -> Option<Event<'a>>3036     fn next(&mut self) -> Option<Event<'a>> {
3037         match self.tree.cur() {
3038             None => {
3039                 let ix = self.tree.pop()?;
3040                 let tag = item_to_tag(&self.tree[ix].item, &self.allocs);
3041                 self.tree.next_sibling(ix);
3042                 Some(Event::End(tag))
3043             }
3044             Some(cur_ix) => {
3045                 if self.tree[cur_ix].item.body.is_inline() {
3046                     self.handle_inline();
3047                 }
3048 
3049                 let node = self.tree[cur_ix];
3050                 let item = node.item;
3051                 let event = item_to_event(item, self.text, &self.allocs);
3052                 if let Event::Start(..) = event {
3053                     self.tree.push();
3054                 } else {
3055                     self.tree.next_sibling(cur_ix);
3056                 }
3057                 Some(event)
3058             }
3059         }
3060     }
3061 }
3062 
3063 #[cfg(test)]
3064 mod test {
3065     use super::*;
3066     use crate::tree::Node;
3067 
3068     // TODO: move these tests to tests/html.rs?
3069 
parser_with_extensions(text: &str) -> Parser<'_>3070     fn parser_with_extensions(text: &str) -> Parser<'_> {
3071         let mut opts = Options::empty();
3072         opts.insert(Options::ENABLE_TABLES);
3073         opts.insert(Options::ENABLE_FOOTNOTES);
3074         opts.insert(Options::ENABLE_STRIKETHROUGH);
3075         opts.insert(Options::ENABLE_TASKLISTS);
3076 
3077         Parser::new_ext(text, opts)
3078     }
3079 
3080     #[test]
3081     #[cfg(target_pointer_width = "64")]
node_size()3082     fn node_size() {
3083         let node_size = std::mem::size_of::<Node<Item>>();
3084         assert_eq!(48, node_size);
3085     }
3086 
3087     #[test]
3088     #[cfg(target_pointer_width = "64")]
body_size()3089     fn body_size() {
3090         let body_size = std::mem::size_of::<ItemBody>();
3091         assert_eq!(16, body_size);
3092     }
3093 
3094     #[test]
single_open_fish_bracket()3095     fn single_open_fish_bracket() {
3096         // dont crash
3097         assert_eq!(3, Parser::new("<").count());
3098     }
3099 
3100     #[test]
lone_hashtag()3101     fn lone_hashtag() {
3102         // dont crash
3103         assert_eq!(2, Parser::new("#").count());
3104     }
3105 
3106     #[test]
lots_of_backslashes()3107     fn lots_of_backslashes() {
3108         // dont crash
3109         Parser::new("\\\\\r\r").count();
3110         Parser::new("\\\r\r\\.\\\\\r\r\\.\\").count();
3111     }
3112 
3113     #[test]
3114     fn issue_320() {
3115         // dont crash
3116         parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
3117     }
3118 
3119     #[test]
3120     fn issue_319() {
3121         // dont crash
3122         parser_with_extensions("|\r-]([^|\r-]([^").count();
3123         parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
3124     }
3125 
3126     #[test]
3127     fn issue_303() {
3128         // dont crash
3129         parser_with_extensions("[^\r\ra]").count();
3130         parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
3131     }
3132 
3133     #[test]
3134     fn issue_313() {
3135         // dont crash
3136         parser_with_extensions("*]0[^\r\r*]0[^").count();
3137         parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
3138     }
3139 
3140     #[test]
3141     fn issue_311() {
3142         // dont crash
3143         parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
3144     }
3145 
3146     #[test]
3147     fn issue_283() {
3148         let input = std::str::from_utf8(b"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
3149         // dont crash
3150         parser_with_extensions(input).count();
3151     }
3152 
3153     #[test]
3154     fn issue_289() {
3155         // dont crash
3156         parser_with_extensions("> - \\\n> - ").count();
3157         parser_with_extensions("- \n\n").count();
3158     }
3159 
3160     #[test]
3161     fn issue_306() {
3162         // dont crash
3163         parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
3164     }
3165 
3166     #[test]
3167     fn issue_305() {
3168         // dont crash
3169         parser_with_extensions("_6**6*_*").count();
3170     }
3171 
3172     #[test]
3173     fn another_emphasis_panic() {
3174         parser_with_extensions("*__#_#__*").count();
3175     }
3176 
3177     #[test]
3178     fn offset_iter() {
3179         let event_offsets: Vec<_> = Parser::new("*hello* world")
3180             .into_offset_iter()
3181             .map(|(_ev, range)| range)
3182             .collect();
3183         let expected_offsets = vec![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
3184         assert_eq!(expected_offsets, event_offsets);
3185     }
3186 
3187     #[test]
3188     fn reference_link_offsets() {
3189         let range =
3190             Parser::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
3191                 .into_offset_iter()
3192                 .filter_map(|(ev, range)| match ev {
3193                     Event::Start(Tag::Link(LinkType::Reference, ..), ..) => Some(range),
3194                     _ => None,
3195                 })
3196                 .next()
3197                 .unwrap();
3198         assert_eq!(5..30, range);
3199     }
3200 
3201     #[test]
footnote_offsets()3202     fn footnote_offsets() {
3203         let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
3204             .into_offset_iter()
3205             .filter_map(|(ev, range)| match ev {
3206                 Event::FootnoteReference(..) => Some(range),
3207                 _ => None,
3208             })
3209             .next()
3210             .unwrap();
3211         assert_eq!(12..16, range);
3212     }
3213 
3214     #[test]
table_offset()3215     fn table_offset() {
3216         let markdown = "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
3217         let event_offset = parser_with_extensions(markdown)
3218             .into_offset_iter()
3219             .map(|(_ev, range)| range)
3220             .nth(3)
3221             .unwrap();
3222         let expected_offset = 3..59;
3223         assert_eq!(expected_offset, event_offset);
3224     }
3225 
3226     #[test]
offset_iter_issue_378()3227     fn offset_iter_issue_378() {
3228         let event_offsets: Vec<_> = Parser::new("a [b](c) d")
3229             .into_offset_iter()
3230             .map(|(_ev, range)| range)
3231             .collect();
3232         let expected_offsets = vec![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
3233         assert_eq!(expected_offsets, event_offsets);
3234     }
3235 
3236     #[test]
offset_iter_issue_404()3237     fn offset_iter_issue_404() {
3238         let event_offsets: Vec<_> = Parser::new("###\n")
3239             .into_offset_iter()
3240             .map(|(_ev, range)| range)
3241             .collect();
3242         let expected_offsets = vec![(0..4), (0..4)];
3243         assert_eq!(expected_offsets, event_offsets);
3244     }
3245 
3246     // FIXME: add this one regression suite
3247     #[test]
link_def_at_eof()3248     fn link_def_at_eof() {
3249         let test_str = "[My site][world]\n\n[world]: https://vincentprouillet.com";
3250         let expected = "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";
3251 
3252         let mut buf = String::new();
3253         crate::html::push_html(&mut buf, Parser::new(test_str));
3254         assert_eq!(expected, buf);
3255     }
3256 
3257     #[test]
no_footnote_refs_without_option()3258     fn no_footnote_refs_without_option() {
3259         let test_str = "a [^a]\n\n[^a]: yolo";
3260         let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";
3261 
3262         let mut buf = String::new();
3263         crate::html::push_html(&mut buf, Parser::new(test_str));
3264         assert_eq!(expected, buf);
3265     }
3266 
3267     #[test]
ref_def_at_eof()3268     fn ref_def_at_eof() {
3269         let test_str = "[test]:\\";
3270         let expected = "";
3271 
3272         let mut buf = String::new();
3273         crate::html::push_html(&mut buf, Parser::new(test_str));
3274         assert_eq!(expected, buf);
3275     }
3276 
3277     #[test]
3278     fn ref_def_cr_lf() {
3279         let test_str = "[a]: /u\r\n\n[a]";
3280         let expected = "<p><a href=\"/u\">a</a></p>\n";
3281 
3282         let mut buf = String::new();
3283         crate::html::push_html(&mut buf, Parser::new(test_str));
3284         assert_eq!(expected, buf);
3285     }
3286 
3287     #[test]
no_dest_refdef()3288     fn no_dest_refdef() {
3289         let test_str = "[a]:";
3290         let expected = "<p>[a]:</p>\n";
3291 
3292         let mut buf = String::new();
3293         crate::html::push_html(&mut buf, Parser::new(test_str));
3294         assert_eq!(expected, buf);
3295     }
3296 
3297     #[test]
broken_links_called_only_once()3298     fn broken_links_called_only_once() {
3299         for &(markdown, expected) in &[
3300             ("See also [`g()`][crate::g].", 1),
3301             ("See also [`g()`][crate::g][].", 1),
3302             ("[brokenlink1] some other node [brokenlink2]", 2),
3303         ] {
3304             let mut times_called = 0;
3305             let callback = &mut |_broken_link: BrokenLink| {
3306                 times_called += 1;
3307                 None
3308             };
3309             let parser =
3310                 Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
3311             for _ in parser {}
3312             assert_eq!(times_called, expected);
3313         }
3314     }
3315 
3316     #[test]
simple_broken_link_callback()3317     fn simple_broken_link_callback() {
3318         let test_str = "This is a link w/o def: [hello][world]";
3319         let mut callback = |broken_link: BrokenLink| {
3320             assert_eq!("world", broken_link.reference);
3321             assert_eq!(&test_str[broken_link.span], "[hello][world]");
3322             let url = "YOLO".into();
3323             let title = "SWAG".to_owned().into();
3324             Some((url, title))
3325         };
3326         let parser =
3327             Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
3328         let mut link_tag_count = 0;
3329         for (typ, url, title) in parser.filter_map(|event| match event {
3330             Event::Start(tag) | Event::End(tag) => match tag {
3331                 Tag::Link(typ, url, title) => Some((typ, url, title)),
3332                 _ => None,
3333             },
3334             _ => None,
3335         }) {
3336             link_tag_count += 1;
3337             assert_eq!(typ, LinkType::ReferenceUnknown);
3338             assert_eq!(url.as_ref(), "YOLO");
3339             assert_eq!(title.as_ref(), "SWAG");
3340         }
3341         assert!(link_tag_count > 0);
3342     }
3343 
3344     #[test]
code_block_kind_check_fenced()3345     fn code_block_kind_check_fenced() {
3346         let parser = Parser::new("hello\n```test\ntadam\n```");
3347         let mut found = 0;
3348         for (ev, _range) in parser.into_offset_iter() {
3349             match ev {
3350                 Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(syntax))) => {
3351                     assert_eq!(syntax.as_ref(), "test");
3352                     found += 1;
3353                 }
3354                 _ => {}
3355             }
3356         }
3357         assert_eq!(found, 1);
3358     }
3359 
3360     #[test]
code_block_kind_check_indented()3361     fn code_block_kind_check_indented() {
3362         let parser = Parser::new("hello\n\n    ```test\n    tadam\nhello");
3363         let mut found = 0;
3364         for (ev, _range) in parser.into_offset_iter() {
3365             match ev {
3366                 Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
3367                     found += 1;
3368                 }
3369                 _ => {}
3370             }
3371         }
3372         assert_eq!(found, 1);
3373     }
3374 }
3375