// Copyright 2017 Google Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
20
21 //! Tree-based two pass parser.
22
23 use std::cmp::{max, min};
24 use std::collections::{HashMap, VecDeque};
25 use std::ops::{Index, Range};
26
27 use unicase::UniCase;
28
29 use crate::linklabel::{scan_link_label_rest, LinkLabel, ReferenceLabel};
30 use crate::scanners::*;
31 use crate::strings::CowStr;
32 use crate::tree::{Tree, TreeIndex};
33
// Allowing arbitrary depth nested parentheses inside link destinations
// can create denial of service vulnerabilities if we're not careful.
// The simplest countermeasure is to limit their depth, which is
// explicitly allowed by the spec as long as the limit is at least 3:
// https://spec.commonmark.org/0.29/#link-destination
const LINK_MAX_NESTED_PARENS: usize = 5;
40
/// Codeblock kind.
#[derive(Clone, Debug, PartialEq)]
pub enum CodeBlockKind<'a> {
    /// An indented code block (one indented by four or more spaces).
    Indented,
    /// The value contained in the tag describes the language of the code, which may be empty.
    Fenced(CowStr<'a>),
}
48
49 impl<'a> CodeBlockKind<'a> {
is_indented(&self) -> bool50 pub fn is_indented(&self) -> bool {
51 match *self {
52 CodeBlockKind::Indented => true,
53 _ => false,
54 }
55 }
56
is_fenced(&self) -> bool57 pub fn is_fenced(&self) -> bool {
58 match *self {
59 CodeBlockKind::Fenced(_) => true,
60 _ => false,
61 }
62 }
63 }
64
/// Tags for elements that can contain other elements.
#[derive(Clone, Debug, PartialEq)]
pub enum Tag<'a> {
    /// A paragraph of text and other inline elements.
    Paragraph,

    /// A heading. The field indicates the level of the heading.
    Heading(u32),

    /// A block quote.
    BlockQuote,
    /// A code block.
    CodeBlock(CodeBlockKind<'a>),

    /// A list. If the list is ordered the field indicates the number of the first item.
    /// Contains only list items.
    List(Option<u64>), // TODO: add delim and tight for ast (not needed for html)
    /// A list item.
    Item,
    /// A footnote definition. The value contained is the footnote's label by which it can
    /// be referred to.
    FootnoteDefinition(CowStr<'a>),

    /// A table. Contains a vector describing the text-alignment for each of its columns.
    Table(Vec<Alignment>),
    /// A table header. Contains only `TableRow`s. Note that the table body starts immediately
    /// after the closure of the `TableHead` tag. There is no `TableBody` tag.
    TableHead,
    /// A table row. Is used both for header rows as body rows. Contains only `TableCell`s.
    TableRow,
    /// A cell inside a table row.
    TableCell,

    // span-level tags
    /// Emphasized (usually italic) inline text.
    Emphasis,
    /// Strong (usually bold) inline text.
    Strong,
    /// Struck-through inline text (extension).
    Strikethrough,

    /// A link. The first field is the link type, the second the destination URL and the third is a title.
    Link(LinkType, CowStr<'a>, CowStr<'a>),

    /// An image. The first field is the link type, the second the destination URL and the third is a title.
    Image(LinkType, CowStr<'a>, CowStr<'a>),
}
107
/// Type specifier for inline links. See [the Tag::Link](enum.Tag.html#variant.Link) for more information.
#[derive(Clone, Debug, PartialEq, Copy)]
pub enum LinkType {
    /// Inline link like `[foo](bar)`
    Inline,
    /// Reference link like `[foo][bar]`
    Reference,
    /// Reference without destination in the document, but resolved by the broken_link_callback
    ReferenceUnknown,
    /// Collapsed link like `[foo][]`
    Collapsed,
    /// Collapsed link without destination in the document, but resolved by the broken_link_callback
    CollapsedUnknown,
    /// Shortcut link like `[foo]`
    Shortcut,
    /// Shortcut without destination in the document, but resolved by the broken_link_callback
    ShortcutUnknown,
    /// Autolink like `<http://foo.bar/baz>`
    Autolink,
    /// Email address in autolink like `<john@example.org>`
    Email,
}
130
impl LinkType {
    /// Maps a resolvable reference link type to its `*Unknown` counterpart,
    /// used when the reference could not be found in the document and is
    /// handed to the broken-link callback instead.
    ///
    /// # Panics
    ///
    /// Calling this on any variant other than `Reference`, `Collapsed` or
    /// `Shortcut` hits the deliberate `unreachable!()` — callers must only
    /// invoke it for reference-style links.
    fn to_unknown(self) -> Self {
        match self {
            LinkType::Reference => LinkType::ReferenceUnknown,
            LinkType::Collapsed => LinkType::CollapsedUnknown,
            LinkType::Shortcut => LinkType::ShortcutUnknown,
            _ => unreachable!(),
        }
    }
}
141
/// Markdown events that are generated in a preorder traversal of the document
/// tree, with additional `End` events whenever all of an inner node's children
/// have been visited.
#[derive(Clone, Debug, PartialEq)]
pub enum Event<'a> {
    /// Start of a tagged element. Events that are yielded after this event
    /// and before its corresponding `End` event are inside this element.
    /// Start and end events are guaranteed to be balanced.
    Start(Tag<'a>),
    /// End of a tagged element.
    End(Tag<'a>),
    /// A text node.
    Text(CowStr<'a>),
    /// An inline code node.
    Code(CowStr<'a>),
    /// An HTML node.
    Html(CowStr<'a>),
    /// A reference to a footnote with given label, which may or may not be defined
    /// by an event with a `Tag::FootnoteDefinition` tag. Definitions and references to them may
    /// occur in any order.
    FootnoteReference(CowStr<'a>),
    /// A soft line break.
    SoftBreak,
    /// A hard line break.
    HardBreak,
    /// A horizontal ruler.
    Rule,
    /// A task list marker, rendered as a checkbox in HTML. Contains a true when it is checked.
    TaskListMarker(bool),
}
172
/// Table column text alignment.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Alignment {
    /// Default text alignment.
    None,
    /// Left-aligned column.
    Left,
    /// Center-aligned column.
    Center,
    /// Right-aligned column.
    Right,
}
182
bitflags! {
    /// Option struct containing flags for enabling extra features
    /// that are not part of the CommonMark spec.
    pub struct Options: u32 {
        /// Enables the tables extension.
        const ENABLE_TABLES = 1 << 1;
        /// Enables footnote definitions and references.
        const ENABLE_FOOTNOTES = 1 << 2;
        /// Enables `~~strikethrough~~` spans.
        const ENABLE_STRIKETHROUGH = 1 << 3;
        /// Enables `- [x]` task-list markers inside list items.
        const ENABLE_TASKLISTS = 1 << 4;
        /// Enables smart punctuation (curly quotes, en/em dashes, ellipsis).
        const ENABLE_SMART_PUNCTUATION = 1 << 5;
    }
}
194
/// A single parse-tree node: a byte span of the source text plus the
/// kind of item that span represents.
#[derive(Debug, Default, Clone, Copy)]
struct Item {
    // Start byte offset into the source text.
    start: usize,
    // End byte offset (exclusive); several constructors set 0 here and
    // patch the real value in later.
    end: usize,
    // What this span is (text, block, unresolved inline marker, ...).
    body: ItemBody,
}
201
/// The kind of an [`Item`] node. `Maybe*` variants are provisional inline
/// markers produced by the first pass; the second pass resolves them into
/// the concrete variants below.
#[derive(Debug, PartialEq, Clone, Copy)]
enum ItemBody {
    Paragraph,
    Text,
    SoftBreak,
    HardBreak,

    // These are possible inline items, need to be resolved in second pass.

    // repeats, can_open, can_close
    MaybeEmphasis(usize, bool, bool),
    // quote byte, can_open, can_close
    MaybeSmartQuote(u8, bool, bool),
    MaybeCode(usize, bool), // number of backticks, preceeded by backslash
    MaybeHtml,
    MaybeLinkOpen,
    // bool indicates whether or not the preceeding section could be a reference
    MaybeLinkClose(bool),
    MaybeImage,

    // These are inline items after resolution.
    Emphasis,
    Strong,
    Strikethrough,
    Code(CowIndex),
    Link(LinkIndex),
    Image(LinkIndex),
    FootnoteReference(CowIndex),
    TaskListMarker(bool), // true for checked

    Rule,
    Heading(u32), // heading level
    FencedCodeBlock(CowIndex),
    IndentCodeBlock,
    Html,
    OwnedHtml(CowIndex),
    BlockQuote,
    List(bool, u8, u64), // is_tight, list character, list start index
    ListItem(usize),     // indent level
    // Text synthesized by the parser (e.g. decoded entities, smart dashes)
    // rather than borrowed from the source; index into the cow allocations.
    SynthesizeText(CowIndex),
    SynthesizeChar(char),
    FootnoteDefinition(CowIndex),

    // Tables
    Table(AlignmentIndex),
    TableHead,
    TableRow,
    TableCell,

    // Dummy node at the top of the tree - should not be used otherwise!
    Root,
}
254
255 impl<'a> ItemBody {
is_inline(&self) -> bool256 fn is_inline(&self) -> bool {
257 match *self {
258 ItemBody::MaybeEmphasis(..)
259 | ItemBody::MaybeSmartQuote(..)
260 | ItemBody::MaybeHtml
261 | ItemBody::MaybeCode(..)
262 | ItemBody::MaybeLinkOpen
263 | ItemBody::MaybeLinkClose(..)
264 | ItemBody::MaybeImage => true,
265 _ => false,
266 }
267 }
268 }
269
270 impl<'a> Default for ItemBody {
default() -> Self271 fn default() -> Self {
272 ItemBody::Root
273 }
274 }
275
/// Scanning modes for `Parser`'s `parse_line` method.
#[derive(PartialEq, Eq, Copy, Clone)]
enum TableParseMode {
    /// Inside a paragraph, scanning for table headers.
    Scan,
    /// Inside a table.
    Active,
    /// Inside a paragraph, not scanning for table headers.
    Disabled,
}
286
/// Details of a reference link whose label could not be resolved,
/// handed to the user-supplied broken-link callback.
pub struct BrokenLink<'a> {
    /// Byte range of the link in the source text.
    pub span: std::ops::Range<usize>,
    /// The kind of reference link (`*Unknown` variants).
    pub link_type: LinkType,
    /// The reference label as it appears in the source.
    pub reference: &'a str,
}
292
/// State for the first parsing pass.
///
/// The first pass resolves all block structure, generating an AST. Within a block, items
/// are in a linear chain with potential inline markup identified.
struct FirstPass<'a, 'b> {
    // The source text being parsed.
    text: &'a str,
    // The block-structure tree under construction.
    tree: Tree<Item>,
    // True right after a list item marker; a list item may begin with
    // at most one blank line.
    begin_list_item: bool,
    // Whether the previously scanned line was blank (used e.g. to close
    // footnote definitions).
    last_line_blank: bool,
    // Side allocations (cows, links, alignments, refdefs) referenced by
    // index from tree nodes.
    allocs: Allocations<'a>,
    // Enabled extensions.
    options: Options,
    // Number of currently open lists; non-zero relaxes the rules for
    // interrupting a paragraph with a list.
    list_nesting: usize,
    // Special-byte lookup table driving `iterate_special_bytes`.
    lookup_table: &'b LookupTable,
}
307
308 impl<'a, 'b> FirstPass<'a, 'b> {
new(text: &'a str, options: Options, lookup_table: &'b LookupTable) -> FirstPass<'a, 'b>309 fn new(text: &'a str, options: Options, lookup_table: &'b LookupTable) -> FirstPass<'a, 'b> {
310 // This is a very naive heuristic for the number of nodes
311 // we'll need.
312 let start_capacity = max(128, text.len() / 32);
313 let tree = Tree::with_capacity(start_capacity);
314 FirstPass {
315 text,
316 tree,
317 begin_list_item: false,
318 last_line_blank: false,
319 allocs: Allocations::new(),
320 options,
321 list_nesting: 0,
322 lookup_table,
323 }
324 }
325
run(mut self) -> (Tree<Item>, Allocations<'a>)326 fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
327 let mut ix = 0;
328 while ix < self.text.len() {
329 ix = self.parse_block(ix);
330 }
331 for _ in 0..self.tree.spine_len() {
332 self.pop(ix);
333 }
334 (self.tree, self.allocs)
335 }
336
/// Parses one block starting at `start_ix`.
///
/// Returns offset after block.
fn parse_block(&mut self, mut start_ix: usize) -> usize {
    let bytes = self.text.as_bytes();
    let mut line_start = LineStart::new(&bytes[start_ix..]);

    // Match the line's container prefix (blockquote markers, list
    // indentation) against the open spine; pop containers that no
    // longer continue.
    let i = scan_containers(&self.tree, &mut line_start);
    for _ in i..self.tree.spine_len() {
        self.pop(start_ix);
    }

    if self.options.contains(Options::ENABLE_FOOTNOTES) {
        // finish footnote if it's still open and was preceeded by blank line
        if let Some(node_ix) = self.tree.peek_up() {
            if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
                if self.last_line_blank {
                    self.pop(start_ix);
                }
            }
        }

        // Footnote definitions of the form
        // [^bar]:
        // * anything really
        let container_start = start_ix + line_start.bytes_scanned();
        if let Some(bytecount) = self.parse_footnote(container_start) {
            start_ix = container_start + bytecount;
            start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
            line_start = LineStart::new(&bytes[start_ix..]);
        }
    }

    // Process new containers
    loop {
        let container_start = start_ix + line_start.bytes_scanned();
        if let Some((ch, index, indent)) = line_start.scan_list_marker() {
            let after_marker_index = start_ix + line_start.bytes_scanned();
            self.continue_list(container_start, ch, index);
            self.tree.append(Item {
                start: container_start,
                end: after_marker_index, // will get updated later if item not empty
                body: ItemBody::ListItem(indent),
            });
            self.tree.push();
            if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
                self.begin_list_item = true;
                return after_marker_index + n;
            }
            if self.options.contains(Options::ENABLE_TASKLISTS) {
                if let Some(is_checked) = line_start.scan_task_list_marker() {
                    self.tree.append(Item {
                        start: after_marker_index,
                        end: start_ix + line_start.bytes_scanned(),
                        body: ItemBody::TaskListMarker(is_checked),
                    });
                }
            }
        } else if line_start.scan_blockquote_marker() {
            self.finish_list(start_ix);
            self.tree.append(Item {
                start: container_start,
                end: 0, // will get set later
                body: ItemBody::BlockQuote,
            });
            self.tree.push();
        } else {
            break;
        }
    }

    let ix = start_ix + line_start.bytes_scanned();

    // Blank line: remember it and return without opening a leaf block.
    if let Some(n) = scan_blank_line(&bytes[ix..]) {
        if let Some(node_ix) = self.tree.peek_up() {
            match self.tree[node_ix].item.body {
                ItemBody::BlockQuote => (),
                _ => {
                    if self.begin_list_item {
                        // A list item can begin with at most one blank line.
                        self.pop(start_ix);
                    }
                    self.last_line_blank = true;
                }
            }
        }
        return ix + n;
    }

    self.begin_list_item = false;
    self.finish_list(start_ix);

    // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
    let remaining_space = line_start.remaining_space();

    // Four spaces of indentation (beyond the container prefix) starts
    // an indented code block.
    let indent = line_start.scan_space_upto(4);
    if indent == 4 {
        let ix = start_ix + line_start.bytes_scanned();
        let remaining_space = line_start.remaining_space();
        return self.parse_indented_code_block(ix, remaining_space);
    }

    let ix = start_ix + line_start.bytes_scanned();

    // HTML Blocks
    if bytes[ix] == b'<' {
        // Types 1-5 are all detected by one function and all end with the same
        // pattern
        if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
            return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
        }

        // Detect type 6
        let possible_tag = scan_html_block_tag(&bytes[(ix + 1)..]).1;
        if is_html_tag(possible_tag) {
            return self.parse_html_block_type_6_or_7(ix, remaining_space);
        }

        // Detect type 7
        if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
            return self.parse_html_block_type_6_or_7(ix, remaining_space);
        }
    }

    if let Ok(n) = scan_hrule(&bytes[ix..]) {
        return self.parse_hrule(n, ix);
    }

    if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
        return self.parse_atx_heading(ix, atx_size);
    }

    // parse refdef
    if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
        // First definition for a label wins; later ones are ignored.
        self.allocs.refdefs.entry(label).or_insert(link_def);
        let ix = ix + bytecount;
        // try to read trailing whitespace or it will register as a completely blank line
        // TODO: shouldn't we do this for all block level items?
        return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
    }

    if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
        return self.parse_fenced_code_block(ix, indent, fence_ch, n);
    }
    // Nothing else matched: the line starts a paragraph.
    self.parse_paragraph(ix)
}
481
/// Returns the offset of the first line after the table.
/// Assumptions: current focus is a table element and the table header
/// matches the separator line (same number of columns).
fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
    // parse header. this shouldn't fail because we made sure the table header is ok
    let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
    // The header row was parsed as an ordinary row; retag it.
    self.tree[thead_ix].item.body = ItemBody::TableHead;

    // parse body
    let mut ix = body_start;
    while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
        ix = next_ix;
    }

    self.pop(ix);
    ix
}
499
/// Call this when containers are taken care of.
/// Returns bytes scanned, row_ix
fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
    let bytes = self.text.as_bytes();
    let mut cells = 0;
    let mut final_cell_ix = None;

    let row_ix = self.tree.append(Item {
        start: ix,
        end: 0, // set at end of this function
        body: ItemBody::TableRow,
    });
    self.tree.push();

    // Scan cells separated by `|` until end of line.
    loop {
        ix += scan_ch(&bytes[ix..], b'|');
        ix += scan_whitespace_no_nl(&bytes[ix..]);

        if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
            ix += eol_bytes;
            break;
        }

        let cell_ix = self.tree.append(Item {
            start: ix,
            end: ix,
            body: ItemBody::TableCell,
        });
        self.tree.push();
        // Parse the cell's inline content; Active mode stops at `|` or EOL.
        let (next_ix, _brk) = self.parse_line(ix, TableParseMode::Active);
        let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);

        // Trim trailing whitespace off both the last child and the cell span.
        if let Some(cur_ix) = self.tree.cur() {
            self.tree[cur_ix].item.end -= trailing_whitespace;
        }

        self.tree[cell_ix].item.end = next_ix - trailing_whitespace;
        self.tree.pop();

        ix = next_ix;
        cells += 1;

        if cells == row_cells {
            // Remember the cell at the expected column count so excess
            // cells can be dropped below.
            final_cell_ix = Some(cell_ix);
        }
    }

    // fill empty cells if needed
    // note: this is where GFM and commonmark-extra diverge. we follow
    // GFM here
    for _ in cells..row_cells {
        self.tree.append(Item {
            start: ix,
            end: ix,
            body: ItemBody::TableCell,
        });
    }

    // drop excess cells
    if let Some(cell_ix) = final_cell_ix {
        self.tree[cell_ix].next = None;
    }

    self.pop(ix);

    (ix, row_ix)
}
567
/// Returns first offset after the row and the tree index of the row.
///
/// Returns `None` (ending the table) when the line doesn't continue all
/// open containers or would interrupt a paragraph.
fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
    let bytes = self.text.as_bytes();
    let mut line_start = LineStart::new(&bytes[ix..]);
    let containers = scan_containers(&self.tree, &mut line_start);
    if containers != self.tree.spine_len() {
        return None;
    }
    line_start.scan_all_space();
    ix += line_start.bytes_scanned();
    // A line that would interrupt a paragraph also terminates the table.
    if scan_paragraph_interrupt(&bytes[ix..]) {
        return None;
    }

    let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
    Some((ix, row_ix))
}
585
/// Returns offset of line start after paragraph.
fn parse_paragraph(&mut self, start_ix: usize) -> usize {
    let node_ix = self.tree.append(Item {
        start: start_ix,
        end: 0, // will get set later
        body: ItemBody::Paragraph,
    });
    self.tree.push();
    let bytes = self.text.as_bytes();

    let mut ix = start_ix;
    loop {
        // Only the first line of a paragraph can begin a table header.
        let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
            TableParseMode::Scan
        } else {
            TableParseMode::Disabled
        };
        let (next_ix, brk) = self.parse_line(ix, scan_mode);

        // break out when we find a table
        if let Some(Item {
            body: ItemBody::Table(alignment_ix),
            ..
        }) = brk
        {
            let table_cols = self.allocs[alignment_ix].len();
            // Retag this node: the "paragraph" was actually a table header.
            self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
            // this clears out any stuff we may have appended - but there may
            // be a cleaner way
            self.tree[node_ix].child = None;
            self.tree.pop();
            self.tree.push();
            return self.parse_table(table_cols, ix, next_ix);
        }

        ix = next_ix;
        let mut line_start = LineStart::new(&bytes[ix..]);
        let n_containers = scan_containers(&self.tree, &mut line_start);
        if !line_start.scan_space(4) {
            let ix_new = ix + line_start.bytes_scanned();
            if n_containers == self.tree.spine_len() {
                // A setext underline turns this paragraph into a heading.
                if let Some(ix_setext) = self.parse_setext_heading(ix_new, node_ix) {
                    // A backslash hard break before a setext underline
                    // renders the backslash literally instead.
                    if let Some(Item {
                        start,
                        body: ItemBody::HardBreak,
                        ..
                    }) = brk
                    {
                        if bytes[start] == b'\\' {
                            self.tree.append_text(start, start + 1);
                        }
                    }
                    ix = ix_setext;
                    break;
                }
            }
            // first check for non-empty lists, then for other interrupts
            let suffix = &bytes[ix_new..];
            if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
                break;
            }
        }
        line_start.scan_all_space();
        if line_start.is_at_eol() {
            // Blank line: paragraph ends.
            break;
        }
        ix = next_ix + line_start.bytes_scanned();
        // Paragraph continues; append the pending break item (soft/hard).
        if let Some(item) = brk {
            self.tree.append(item);
        }
    }

    self.pop(ix);
    ix
}
661
/// Returns end ix of setext_heading on success.
///
/// On success, retags the paragraph node at `node_ix` as a heading of
/// the scanned level.
fn parse_setext_heading(&mut self, ix: usize, node_ix: TreeIndex) -> Option<usize> {
    let bytes = self.text.as_bytes();
    let (n, level) = scan_setext_heading(&bytes[ix..])?;
    self.tree[node_ix].item.body = ItemBody::Heading(level);

    // strip trailing whitespace
    if let Some(cur_ix) = self.tree.cur() {
        self.tree[cur_ix].item.end -= scan_rev_while(
            &bytes[..self.tree[cur_ix].item.end],
            is_ascii_whitespace_no_nl,
        );
    }

    Some(ix + n)
}
678
/// Parse a line of input, appending text and items to tree.
///
/// Returns: index after line and an item representing the break.
///
/// Dispatches on "special" bytes via the lookup table; every other byte
/// is accumulated into plain text runs delimited by `begin_text`.
fn parse_line(&mut self, start: usize, mode: TableParseMode) -> (usize, Option<Item>) {
    let bytes = &self.text.as_bytes();
    // Pipe bookkeeping feeds the table-header detection in Scan mode.
    let mut pipes = 0;
    let mut last_pipe_ix = start;
    let mut begin_text = start;

    let (final_ix, brk) =
        iterate_special_bytes(&self.lookup_table, bytes, start, |ix, byte| {
            match byte {
                // End of line: decide between table header, hard break,
                // and soft break.
                b'\n' | b'\r' => {
                    if let TableParseMode::Active = mode {
                        // Inside a table, a cell ends at EOL with no break item.
                        return LoopInstruction::BreakAtWith(ix, None);
                    }

                    let mut i = ix;
                    let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
                    if mode == TableParseMode::Scan && pipes > 0 {
                        // check if we may be parsing a table
                        let next_line_ix = ix + eol_bytes;
                        let mut line_start = LineStart::new(&bytes[next_line_ix..]);
                        if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len()
                        {
                            let table_head_ix = next_line_ix + line_start.bytes_scanned();
                            let (table_head_bytes, alignment) =
                                scan_table_head(&bytes[table_head_ix..]);

                            if table_head_bytes > 0 {
                                // computing header count from number of pipes
                                let header_count =
                                    count_header_cols(bytes, pipes, start, last_pipe_ix);

                                // make sure they match the number of columns we find in separator line
                                if alignment.len() == header_count {
                                    let alignment_ix =
                                        self.allocs.allocate_alignment(alignment);
                                    let end_ix = table_head_ix + table_head_bytes;
                                    return LoopInstruction::BreakAtWith(
                                        end_ix,
                                        Some(Item {
                                            start: i,
                                            end: end_ix, // must update later
                                            body: ItemBody::Table(alignment_ix),
                                        }),
                                    );
                                }
                            }
                        }
                    }

                    let end_ix = ix + eol_bytes;
                    // An odd number of trailing backslashes makes a hard
                    // break (unless at end of input).
                    let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
                    if trailing_backslashes % 2 == 1 && end_ix < self.text.len() {
                        i -= 1;
                        self.tree.append_text(begin_text, i);
                        return LoopInstruction::BreakAtWith(
                            end_ix,
                            Some(Item {
                                start: i,
                                end: end_ix,
                                body: ItemBody::HardBreak,
                            }),
                        );
                    }
                    // Two or more trailing spaces also make a hard break.
                    let trailing_whitespace =
                        scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
                    if trailing_whitespace >= 2 {
                        i -= trailing_whitespace;
                        self.tree.append_text(begin_text, i);
                        return LoopInstruction::BreakAtWith(
                            end_ix,
                            Some(Item {
                                start: i,
                                end: end_ix,
                                body: ItemBody::HardBreak,
                            }),
                        );
                    }

                    // Otherwise: soft break.
                    self.tree.append_text(begin_text, ix);
                    LoopInstruction::BreakAtWith(
                        end_ix,
                        Some(Item {
                            start: i,
                            end: end_ix,
                            body: ItemBody::SoftBreak,
                        }),
                    )
                }
                // Backslash escape of an ASCII punctuation byte.
                b'\\' => {
                    if ix + 1 < self.text.len() && is_ascii_punctuation(bytes[ix + 1]) {
                        self.tree.append_text(begin_text, ix);
                        if bytes[ix + 1] == b'`' {
                            // Escaped backtick run: record as MaybeCode
                            // with the preceded-by-backslash flag set.
                            let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
                            self.tree.append(Item {
                                start: ix + 1,
                                end: ix + count + 1,
                                body: ItemBody::MaybeCode(count, true),
                            });
                            begin_text = ix + 1 + count;
                            LoopInstruction::ContinueAndSkip(count)
                        } else {
                            // Skip the backslash; text resumes at the
                            // escaped character.
                            begin_text = ix + 1;
                            LoopInstruction::ContinueAndSkip(1)
                        }
                    } else {
                        LoopInstruction::ContinueAndSkip(0)
                    }
                }
                // Potential emphasis / strikethrough delimiter runs.
                c @ b'*' | c @ b'_' | c @ b'~' => {
                    let string_suffix = &self.text[ix..];
                    let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
                    let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
                    let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
                    // `~` is only a delimiter as an exact pair (`~~`).
                    let is_valid_seq = c != b'~' || count == 2;

                    if (can_open || can_close) && is_valid_seq {
                        self.tree.append_text(begin_text, ix);
                        // One MaybeEmphasis node per delimiter character,
                        // each carrying the remaining run length.
                        for i in 0..count {
                            self.tree.append(Item {
                                start: ix + i,
                                end: ix + i + 1,
                                body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
                            });
                        }
                        begin_text = ix + count;
                    }
                    LoopInstruction::ContinueAndSkip(count - 1)
                }
                // Backtick run: potential code span delimiter.
                b'`' => {
                    self.tree.append_text(begin_text, ix);
                    let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
                    self.tree.append(Item {
                        start: ix,
                        end: ix + count,
                        body: ItemBody::MaybeCode(count, false),
                    });
                    begin_text = ix + count;
                    LoopInstruction::ContinueAndSkip(count - 1)
                }
                // Potential inline HTML or autolink.
                b'<' => {
                    // Note: could detect some non-HTML cases and early escape here, but not
                    // clear that's a win.
                    self.tree.append_text(begin_text, ix);
                    self.tree.append(Item {
                        start: ix,
                        end: ix + 1,
                        body: ItemBody::MaybeHtml,
                    });
                    begin_text = ix + 1;
                    LoopInstruction::ContinueAndSkip(0)
                }
                // `![` opens a potential image.
                b'!' => {
                    if ix + 1 < self.text.len() && bytes[ix + 1] == b'[' {
                        self.tree.append_text(begin_text, ix);
                        self.tree.append(Item {
                            start: ix,
                            end: ix + 2,
                            body: ItemBody::MaybeImage,
                        });
                        begin_text = ix + 2;
                        LoopInstruction::ContinueAndSkip(1)
                    } else {
                        LoopInstruction::ContinueAndSkip(0)
                    }
                }
                // Potential link open.
                b'[' => {
                    self.tree.append_text(begin_text, ix);
                    self.tree.append(Item {
                        start: ix,
                        end: ix + 1,
                        body: ItemBody::MaybeLinkOpen,
                    });
                    begin_text = ix + 1;
                    LoopInstruction::ContinueAndSkip(0)
                }
                // Potential link close.
                b']' => {
                    self.tree.append_text(begin_text, ix);
                    self.tree.append(Item {
                        start: ix,
                        end: ix + 1,
                        body: ItemBody::MaybeLinkClose(true),
                    });
                    begin_text = ix + 1;
                    LoopInstruction::ContinueAndSkip(0)
                }
                // HTML entity: synthesize the decoded text if it scans.
                b'&' => match scan_entity(&bytes[ix..]) {
                    (n, Some(value)) => {
                        self.tree.append_text(begin_text, ix);
                        self.tree.append(Item {
                            start: ix,
                            end: ix + n,
                            body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
                        });
                        begin_text = ix + n;
                        LoopInstruction::ContinueAndSkip(n - 1)
                    }
                    _ => LoopInstruction::ContinueAndSkip(0),
                },
                // Pipe: cell separator in Active mode, otherwise counted
                // for table-header detection.
                b'|' => {
                    if let TableParseMode::Active = mode {
                        LoopInstruction::BreakAtWith(ix, None)
                    } else {
                        last_pipe_ix = ix;
                        pipes += 1;
                        LoopInstruction::ContinueAndSkip(0)
                    }
                }
                // Smart punctuation: `...` becomes an ellipsis.
                // (These byte handlers are presumably only registered in
                // the lookup table when ENABLE_SMART_PUNCTUATION is set
                // — built elsewhere; confirm against LookupTable.)
                b'.' => {
                    if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
                        self.tree.append_text(begin_text, ix);
                        self.tree.append(Item {
                            start: ix,
                            end: ix + 3,
                            body: ItemBody::SynthesizeChar('…'),
                        });
                        begin_text = ix + 3;
                        LoopInstruction::ContinueAndSkip(2)
                    } else {
                        LoopInstruction::ContinueAndSkip(0)
                    }
                }
                // Smart punctuation: runs of `-` become en/em dashes.
                b'-' => {
                    let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
                    if count == 1 {
                        LoopInstruction::ContinueAndSkip(0)
                    } else {
                        let itembody = if count == 2 {
                            ItemBody::SynthesizeChar('–')
                        } else if count == 3 {
                            ItemBody::SynthesizeChar('—')
                        } else {
                            // Longer runs: mix of em and en dashes chosen
                            // by the run length modulo 6.
                            let (ems, ens) = match count % 6 {
                                0 | 3 => (count / 3, 0),
                                2 | 4 => (0, count / 2),
                                1 => (count / 3 - 1, 2),
                                _ => (count / 3, 1),
                            };
                            // – and — are 3 bytes each in utf8
                            let mut buf = String::with_capacity(3 * (ems + ens));
                            for _ in 0..ems {
                                buf.push('—');
                            }
                            for _ in 0..ens {
                                buf.push('–');
                            }
                            ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
                        };

                        self.tree.append_text(begin_text, ix);
                        self.tree.append(Item {
                            start: ix,
                            end: ix + count,
                            body: itembody,
                        });
                        begin_text = ix + count;
                        LoopInstruction::ContinueAndSkip(count - 1)
                    }
                }
                // Smart punctuation: straight quotes become Maybe nodes
                // resolved to curly quotes later.
                c @ b'\'' | c @ b'"' => {
                    let string_suffix = &self.text[ix..];
                    let can_open = delim_run_can_open(self.text, string_suffix, 1, ix);
                    let can_close = delim_run_can_close(self.text, string_suffix, 1, ix);

                    self.tree.append_text(begin_text, ix);
                    self.tree.append(Item {
                        start: ix,
                        end: ix + 1,
                        body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
                    });
                    begin_text = ix + 1;

                    LoopInstruction::ContinueAndSkip(0)
                }
                _ => LoopInstruction::ContinueAndSkip(0),
            }
        });

    if brk.is_none() {
        // need to close text at eof
        self.tree.append_text(begin_text, final_ix);
    }
    (final_ix, brk)
}
965
966 /// Check whether we should allow a paragraph interrupt by lists. Only non-empty
967 /// lists are allowed.
interrupt_paragraph_by_list(&self, suffix: &[u8]) -> bool968 fn interrupt_paragraph_by_list(&self, suffix: &[u8]) -> bool {
969 scan_listitem(suffix).map_or(false, |(ix, delim, index, _)| {
970 self.list_nesting > 0 ||
971 // we don't allow interruption by either empty lists or
972 // numbered lists starting at an index other than 1
973 !scan_empty_list(&suffix[ix..]) && (delim == b'*' || delim == b'-' || index == 1)
974 })
975 }
976
/// When start_ix is at the beginning of an HTML block of type 1 to 5,
/// this will find the end of the block, adding the block itself to the
/// tree and also keeping track of the lines of HTML within the block.
///
/// The html_end_tag is the tag that must be found on a line to end the block.
fn parse_html_block_type_1_to_5(
    &mut self,
    start_ix: usize,
    html_end_tag: &str,
    mut remaining_space: usize,
) -> usize {
    let bytes = self.text.as_bytes();
    let mut ix = start_ix;
    loop {
        let line_start_ix = ix;
        ix += scan_nextline(&bytes[ix..]);
        self.append_html_line(remaining_space, line_start_ix, ix);

        // Stop when a container on the spine no longer continues.
        let mut line_start = LineStart::new(&bytes[ix..]);
        let n_containers = scan_containers(&self.tree, &mut line_start);
        if n_containers < self.tree.spine_len() {
            break;
        }

        // The end tag may appear anywhere on the line, including the
        // line that opened the block.
        if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
            break;
        }

        let next_line_ix = ix + line_start.bytes_scanned();
        if next_line_ix == self.text.len() {
            break;
        }
        ix = next_line_ix;
        remaining_space = line_start.remaining_space();
    }
    ix
}
1014
/// When start_ix is at the beginning of an HTML block of type 6 or 7,
/// this will consume lines until there is a blank line and keep track of
/// the HTML within the block.
fn parse_html_block_type_6_or_7(
    &mut self,
    start_ix: usize,
    mut remaining_space: usize,
) -> usize {
    let bytes = self.text.as_bytes();
    let mut ix = start_ix;
    loop {
        let line_start_ix = ix;
        ix += scan_nextline(&bytes[ix..]);
        self.append_html_line(remaining_space, line_start_ix, ix);

        // Stop when a container no longer continues or the next line
        // is (container-relative) empty.
        let mut line_start = LineStart::new(&bytes[ix..]);
        let n_containers = scan_containers(&self.tree, &mut line_start);
        if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
            break;
        }

        // Also stop at end of input or before a blank line.
        let next_line_ix = ix + line_start.bytes_scanned();
        if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
        {
            break;
        }
        ix = next_line_ix;
        remaining_space = line_start.remaining_space();
    }
    ix
}
1046
parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize1047 fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
1048 self.tree.append(Item {
1049 start: start_ix,
1050 end: 0, // will get set later
1051 body: ItemBody::IndentCodeBlock,
1052 });
1053 self.tree.push();
1054 let bytes = self.text.as_bytes();
1055 let mut last_nonblank_child = None;
1056 let mut last_nonblank_ix = 0;
1057 let mut end_ix = 0;
1058 let mut last_line_blank = false;
1059
1060 let mut ix = start_ix;
1061 loop {
1062 let line_start_ix = ix;
1063 ix += scan_nextline(&bytes[ix..]);
1064 self.append_code_text(remaining_space, line_start_ix, ix);
1065 // TODO(spec clarification): should we synthesize newline at EOF?
1066
1067 if !last_line_blank {
1068 last_nonblank_child = self.tree.cur();
1069 last_nonblank_ix = ix;
1070 end_ix = ix;
1071 }
1072
1073 let mut line_start = LineStart::new(&bytes[ix..]);
1074 let n_containers = scan_containers(&self.tree, &mut line_start);
1075 if n_containers < self.tree.spine_len()
1076 || !(line_start.scan_space(4) || line_start.is_at_eol())
1077 {
1078 break;
1079 }
1080 let next_line_ix = ix + line_start.bytes_scanned();
1081 if next_line_ix == self.text.len() {
1082 break;
1083 }
1084 ix = next_line_ix;
1085 remaining_space = line_start.remaining_space();
1086 last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
1087 }
1088
1089 // Trim trailing blank lines.
1090 if let Some(child) = last_nonblank_child {
1091 self.tree[child].next = None;
1092 self.tree[child].item.end = last_nonblank_ix;
1093 }
1094 self.pop(end_ix);
1095 ix
1096 }
1097
parse_fenced_code_block( &mut self, start_ix: usize, indent: usize, fence_ch: u8, n_fence_char: usize, ) -> usize1098 fn parse_fenced_code_block(
1099 &mut self,
1100 start_ix: usize,
1101 indent: usize,
1102 fence_ch: u8,
1103 n_fence_char: usize,
1104 ) -> usize {
1105 let bytes = self.text.as_bytes();
1106 let mut info_start = start_ix + n_fence_char;
1107 info_start += scan_whitespace_no_nl(&bytes[info_start..]);
1108 // TODO: info strings are typically very short. wouldnt it be faster
1109 // to just do a forward scan here?
1110 let mut ix = info_start + scan_nextline(&bytes[info_start..]);
1111 let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
1112 let info_string = unescape(&self.text[info_start..info_end]);
1113 self.tree.append(Item {
1114 start: start_ix,
1115 end: 0, // will get set later
1116 body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
1117 });
1118 self.tree.push();
1119 loop {
1120 let mut line_start = LineStart::new(&bytes[ix..]);
1121 let n_containers = scan_containers(&self.tree, &mut line_start);
1122 if n_containers < self.tree.spine_len() {
1123 break;
1124 }
1125 line_start.scan_space(indent);
1126 let mut close_line_start = line_start.clone();
1127 if !close_line_start.scan_space(4) {
1128 let close_ix = ix + close_line_start.bytes_scanned();
1129 if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
1130 {
1131 ix = close_ix + n;
1132 break;
1133 }
1134 }
1135 let remaining_space = line_start.remaining_space();
1136 ix += line_start.bytes_scanned();
1137 let next_ix = ix + scan_nextline(&bytes[ix..]);
1138 self.append_code_text(remaining_space, ix, next_ix);
1139 ix = next_ix;
1140 }
1141
1142 self.pop(ix);
1143
1144 // try to read trailing whitespace or it will register as a completely blank line
1145 ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
1146 }
1147
append_code_text(&mut self, remaining_space: usize, start: usize, end: usize)1148 fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
1149 if remaining_space > 0 {
1150 let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
1151 self.tree.append(Item {
1152 start,
1153 end: start,
1154 body: ItemBody::SynthesizeText(cow_ix),
1155 });
1156 }
1157 if self.text.as_bytes()[end - 2] == b'\r' {
1158 // Normalize CRLF to LF
1159 self.tree.append_text(start, end - 2);
1160 self.tree.append_text(end - 1, end);
1161 } else {
1162 self.tree.append_text(start, end);
1163 }
1164 }
1165
    /// Appends a line of HTML to the tree.
    ///
    /// Like `append_code_text`, synthesizes `remaining_space` leading spaces
    /// and splits a CRLF line ending so only the LF part is kept inline.
    fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
        if remaining_space > 0 {
            let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
            self.tree.append(Item {
                start,
                end: start,
                // TODO: maybe this should synthesize to html rather than text?
                body: ItemBody::SynthesizeText(cow_ix),
            });
        }
        if self.text.as_bytes()[end - 2] == b'\r' {
            // Normalize CRLF to LF
            self.tree.append(Item {
                start,
                end: end - 2,
                body: ItemBody::Html,
            });
            self.tree.append(Item {
                start: end - 1,
                end,
                body: ItemBody::Html,
            });
        } else {
            self.tree.append(Item {
                start,
                end,
                body: ItemBody::Html,
            });
        }
    }
1197
1198 /// Pop a container, setting its end.
pop(&mut self, ix: usize)1199 fn pop(&mut self, ix: usize) {
1200 let cur_ix = self.tree.pop().unwrap();
1201 self.tree[cur_ix].item.end = ix;
1202 if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
1203 surgerize_tight_list(&mut self.tree, cur_ix);
1204 }
1205 }
1206
1207 /// Close a list if it's open. Also set loose if last line was blank
finish_list(&mut self, ix: usize)1208 fn finish_list(&mut self, ix: usize) {
1209 if let Some(node_ix) = self.tree.peek_up() {
1210 if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
1211 self.pop(ix);
1212 self.list_nesting -= 1;
1213 }
1214 }
1215 if self.last_line_blank {
1216 if let Some(node_ix) = self.tree.peek_grandparent() {
1217 if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
1218 *is_tight = false;
1219 }
1220 }
1221 self.last_line_blank = false;
1222 }
1223 }
1224
    /// Continue an existing list or start a new one if there's not an open
    /// list that matches.
    fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
        if let Some(node_ix) = self.tree.peek_up() {
            if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
                if existing_ch == ch {
                    // Same bullet/delimiter character: continue the open
                    // list; a blank line between items makes it loose.
                    if self.last_line_blank {
                        *is_tight = false;
                        self.last_line_blank = false;
                    }
                    return;
                }
            }
            // A different marker ends the current list before starting a new one.
            // TODO: this is not the best choice for end; maybe get end from last list item.
            self.finish_list(start);
        }
        // New lists start tight and may be marked loose later.
        self.tree.append(Item {
            start,
            end: 0, // will get set later
            body: ItemBody::List(true, ch, index),
        });
        self.list_nesting += 1;
        self.tree.push();
        self.last_line_blank = false;
    }
1250
1251 /// Parse a thematic break.
1252 ///
1253 /// Returns index of start of next line.
parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize1254 fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1255 self.tree.append(Item {
1256 start: ix,
1257 end: ix + hrule_size,
1258 body: ItemBody::Rule,
1259 });
1260 ix + hrule_size
1261 }
1262
    /// Parse an ATX heading.
    ///
    /// Returns index of start of next line.
    fn parse_atx_heading(&mut self, mut ix: usize, atx_size: usize) -> usize {
        let heading_ix = self.tree.append(Item {
            start: ix,
            end: 0, // set later
            body: ItemBody::Heading(atx_size as u32),
        });
        ix += atx_size;
        // next char is space or eol (guaranteed by scan_atx_heading)
        let bytes = self.text.as_bytes();
        if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
            // Empty heading, e.g. `##` followed directly by end of line.
            self.tree[heading_ix].item.end = ix + eol_bytes;
            return ix + eol_bytes;
        }
        // skip leading spaces
        let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
        ix += skip_spaces;

        // now handle the header text
        let header_start = ix;
        let header_node_idx = self.tree.push(); // so that we can set the endpoint later
        ix = self.parse_line(ix, TableParseMode::Disabled).0;
        self.tree[header_node_idx].item.end = ix;

        // remove trailing matter from header text
        if let Some(cur_ix) = self.tree.cur() {
            let header_text = &bytes[header_start..ix];
            // `limit`: end of text once trailing newlines/spaces are dropped.
            let mut limit = header_text
                .iter()
                .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
                .map_or(0, |i| i + 1);
            // `closer`: start of a trailing run of `#` bytes, if present
            // (index just past the last non-`#` byte).
            let closer = header_text[..limit]
                .iter()
                .rposition(|&b| b != b'#')
                .map_or(0, |i| i + 1);
            if closer == 0 {
                // The heading text consists entirely of `#`s: treat as empty.
                limit = closer;
            } else {
                // The closing run only counts when preceded by spaces;
                // otherwise the `#`s remain part of the heading text.
                let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
                if spaces > 0 {
                    limit = closer - spaces;
                }
            }
            self.tree[cur_ix].item.end = limit + header_start;
        }

        self.tree.pop();
        ix
    }
1314
    /// Tries to parse a footnote definition marker (`[^label]:`) at `start`.
    ///
    /// Returns the number of bytes scanned on success.
    fn parse_footnote(&mut self, start: usize) -> Option<usize> {
        let bytes = &self.text.as_bytes()[start..];
        if !bytes.starts_with(b"[^") {
            return None;
        }
        let (mut i, label) = self.parse_refdef_label(start + 2)?;
        // `i` was relative to the label start; account for the `[^` prefix.
        i += 2;
        if scan_ch(&bytes[i..], b':') == 0 {
            return None;
        }
        i += 1;
        // A footnote definition closes any currently open list.
        self.finish_list(start);
        self.tree.append(Item {
            start,
            end: 0, // will get set later
            // TODO: check whether the label here is strictly necessary
            body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
        });
        self.tree.push();
        Some(i)
    }
1337
    /// Tries to parse a reference label, which can be interrupted by new blocks.
    /// On success, returns the number of bytes of the label and the label itself.
    fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
        scan_link_label_rest(&self.text[start..], &|bytes| {
            // Linebreak handler: the label may continue on the next line as
            // long as that line continues all open containers and does not
            // start a new block.
            let mut line_start = LineStart::new(bytes);
            let _ = scan_containers(&self.tree, &mut line_start);
            let bytes_scanned = line_start.bytes_scanned();

            let suffix = &bytes[bytes_scanned..];
            if self.interrupt_paragraph_by_list(suffix) || scan_paragraph_interrupt(suffix) {
                None
            } else {
                Some(bytes_scanned)
            }
        })
    }
1354
    /// Tries to parse a full link reference definition (`[label]: dest "title"`).
    ///
    /// Returns number of bytes scanned, label and definition on success.
    fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
        let bytes = &self.text.as_bytes()[start..];
        if scan_ch(bytes, b'[') == 0 {
            return None;
        }
        let (mut i, label) = self.parse_refdef_label(start + 1)?;
        // Account for the opening bracket so `i` is relative to `start`.
        i += 1;
        if scan_ch(&bytes[i..], b':') == 0 {
            return None;
        }
        i += 1;
        let (bytecount, link_def) = self.scan_refdef(start + i)?;
        // Labels are wrapped in UniCase: matched case-insensitively.
        Some((bytecount + i, UniCase::new(label), link_def))
    }
1370
    /// Scans whitespace (including at most one line break) within a reference
    /// definition, requiring any continuation line to keep all containers open.
    ///
    /// Returns number of bytes and number of newlines
    fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
        let mut newlines = 0;
        loop {
            let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
            i += whitespaces;
            if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
                i += eol_bytes;
                newlines += 1;
                if newlines > 1 {
                    // A second newline means a blank line: definition over.
                    return None;
                }
            } else {
                break;
            }
            // The continuation line must continue every open container.
            let mut line_start = LineStart::new(&bytes[i..]);
            if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
                return None;
            }
            i += line_start.bytes_scanned();
        }
        Some((i, newlines))
    }
1394
    /// Returns # of bytes and definition.
    /// Assumes the label of the reference including colon has already been scanned.
    fn scan_refdef(&self, start: usize) -> Option<(usize, LinkDef<'a>)> {
        let bytes = self.text.as_bytes();

        // whitespace between label and url (including up to one newline)
        let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;

        // scan link dest
        let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
        if dest_length == 0 {
            return None;
        }
        let dest = unescape(dest);
        i += dest_length;

        // no title; this is the fallback if the title scan below fails
        let mut backup = (i - start, LinkDef { dest, title: None });

        // scan whitespace between dest and label
        let (mut i, newlines) =
            if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
                if i == self.text.len() {
                    // Treat end of input like a line ending.
                    newlines += 1;
                }
                if new_i == i && newlines == 0 {
                    // The title must be separated from the destination.
                    return None;
                }
                if newlines > 1 {
                    // Blank line: definition ends here without a title.
                    return Some(backup);
                };
                (new_i, newlines)
            } else {
                return Some(backup);
            };

        // scan title
        // if this fails but newline == 1, return also a refdef without title
        if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
            i += title_length;
            backup.1.title = Some(unescape(title));
        } else if newlines > 0 {
            return Some(backup);
        } else {
            return None;
        };

        // scan EOL
        if let Some(bytes) = scan_blank_line(&bytes[i..]) {
            backup.0 = i + bytes - start;
            Some(backup)
        } else if newlines > 0 {
            // NOTE(review): here `backup.0` still excludes the title bytes
            // even though `backup.1.title` was set — confirm the title region
            // is intentionally left to be re-scanned by the caller.
            Some(backup)
        } else {
            None
        }
    }
1452 }
1453
1454 /// Returns number of containers scanned.
scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize1455 fn scan_containers(tree: &Tree<Item>, line_start: &mut LineStart) -> usize {
1456 let mut i = 0;
1457 for &node_ix in tree.walk_spine() {
1458 match tree[node_ix].item.body {
1459 ItemBody::BlockQuote => {
1460 let save = line_start.clone();
1461 if !line_start.scan_blockquote_marker() {
1462 *line_start = save;
1463 break;
1464 }
1465 }
1466 ItemBody::ListItem(indent) => {
1467 let save = line_start.clone();
1468 if !line_start.scan_space(indent) {
1469 if !line_start.is_at_eol() {
1470 *line_start = save;
1471 break;
1472 }
1473 }
1474 }
1475 _ => (),
1476 }
1477 i += 1;
1478 }
1479 i
1480 }
1481
1482 /// Computes the number of header columns in a table line by computing the number of dividing pipes
1483 /// that aren't followed or preceeded by whitespace.
count_header_cols( bytes: &[u8], mut pipes: usize, mut start: usize, last_pipe_ix: usize, ) -> usize1484 fn count_header_cols(
1485 bytes: &[u8],
1486 mut pipes: usize,
1487 mut start: usize,
1488 last_pipe_ix: usize,
1489 ) -> usize {
1490 // was first pipe preceeded by whitespace? if so, subtract one
1491 start += scan_whitespace_no_nl(&bytes[start..]);
1492 if bytes[start] == b'|' {
1493 pipes -= 1;
1494 }
1495
1496 // was last pipe followed by whitespace? if so, sub one
1497 if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
1498 pipes
1499 } else {
1500 pipes + 1
1501 }
1502 }
1503
1504 impl<'a> Tree<Item> {
append_text(&mut self, start: usize, end: usize)1505 fn append_text(&mut self, start: usize, end: usize) {
1506 if end > start {
1507 if let Some(ix) = self.cur() {
1508 if ItemBody::Text == self[ix].item.body && self[ix].item.end == start {
1509 self[ix].item.end = end;
1510 return;
1511 }
1512 }
1513 self.append(Item {
1514 start,
1515 end,
1516 body: ItemBody::Text,
1517 });
1518 }
1519 }
1520 }
1521
/// Determines whether the delimiter run starting at given index is
/// left-flanking, as defined by the commonmark spec (and isn't intraword
/// for _ delims).
/// suffix is &s[ix..], which is passed in as an optimization, since taking
/// a string subslice is O(n).
fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
    // Character immediately after the run; a run at end of input can't open.
    let next_char = if let Some(c) = suffix.chars().nth(run_len) {
        c
    } else {
        return false;
    };
    if next_char.is_whitespace() {
        return false;
    }
    if ix == 0 {
        return true;
    }
    let delim = suffix.chars().next().unwrap();
    // `*` may open intraword emphasis; `_` may not.
    if delim == '*' && !is_punctuation(next_char) {
        return true;
    }

    let prev_char = s[..ix].chars().last().unwrap();

    // Otherwise the run must be preceded by whitespace or punctuation
    // (with a carve-out: a smart `'` after `]` or `)` doesn't open).
    prev_char.is_whitespace()
        || is_punctuation(prev_char) && (delim != '\'' || ![']', ')'].contains(&prev_char))
}
1549
/// Determines whether the delimiter run starting at given index is
/// right-flanking, as defined by the commonmark spec (and isn't intraword
/// for _ delims)
fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
    // A run at the very start of the input has nothing to close.
    if ix == 0 {
        return false;
    }
    let prev_char = s[..ix].chars().last().unwrap();
    if prev_char.is_whitespace() {
        return false;
    }
    // A run at the end of the input can always close.
    let next_char = if let Some(c) = suffix.chars().nth(run_len) {
        c
    } else {
        return true;
    };
    let delim = suffix.chars().next().unwrap();
    // `*` may close intraword emphasis; `_` may not.
    if delim == '*' && !is_punctuation(prev_char) {
        return true;
    }

    next_char.is_whitespace() || is_punctuation(next_char)
}
1573
1574 /// Checks whether we should break a paragraph on the given input.
1575 /// Note: lists are dealt with in `interrupt_paragraph_by_list`, because determing
1576 /// whether to break on a list requires additional context.
scan_paragraph_interrupt(bytes: &[u8]) -> bool1577 fn scan_paragraph_interrupt(bytes: &[u8]) -> bool {
1578 if scan_eol(bytes).is_some()
1579 || scan_hrule(bytes).is_ok()
1580 || scan_atx_heading(bytes).is_some()
1581 || scan_code_fence(bytes).is_some()
1582 || scan_blockquote_start(bytes).is_some()
1583 {
1584 return true;
1585 }
1586 bytes.starts_with(b"<")
1587 && (get_html_end_tag(&bytes[1..]).is_some()
1588 || is_html_tag(scan_html_block_tag(&bytes[1..]).1))
1589 }
1590
1591 /// Assumes `text_bytes` is preceded by `<`.
get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str>1592 fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
1593 static BEGIN_TAGS: &[&[u8]; 3] = &[b"pre", b"style", b"script"];
1594 static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
1595
1596 for (beg_tag, end_tag) in BEGIN_TAGS
1597 .iter()
1598 .zip(["</pre>", "</style>", "</script>"].iter())
1599 {
1600 let tag_len = beg_tag.len();
1601
1602 if text_bytes.len() < tag_len {
1603 // begin tags are increasing in size
1604 break;
1605 }
1606
1607 if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
1608 continue;
1609 }
1610
1611 // Must either be the end of the line...
1612 if text_bytes.len() == tag_len {
1613 return Some(end_tag);
1614 }
1615
1616 // ...or be followed by whitespace, newline, or '>'.
1617 let s = text_bytes[tag_len];
1618 if is_ascii_whitespace(s) || s == b'>' {
1619 return Some(end_tag);
1620 }
1621 }
1622
1623 for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
1624 if text_bytes.starts_with(beg_tag) {
1625 return Some(end_tag);
1626 }
1627 }
1628
1629 if text_bytes.len() > 1
1630 && text_bytes[0] == b'!'
1631 && text_bytes[1] >= b'A'
1632 && text_bytes[1] <= b'Z'
1633 {
1634 Some(">")
1635 } else {
1636 None
1637 }
1638 }
1639
/// A run of potential emphasis delimiters recorded on the inline stack.
#[derive(Copy, Clone, Debug)]
struct InlineEl {
    start: TreeIndex, // offset of tree node
    count: usize, // number of delimiters in the run (one tree node each)
    c: u8,      // b'*' or b'_'
    both: bool, // can both open and close
}
1647
/// Stack of potential emphasis/strikethrough openers used by the inline pass.
#[derive(Debug, Clone, Default)]
struct InlineStack {
    stack: Vec<InlineEl>,
    // Lower bounds for matching indices in the stack. For example
    // a strikethrough delimiter will never match with any element
    // in the stack with index smaller than
    // `lower_bounds[InlineStack::TILDES]`.
    lower_bounds: [usize; 7],
}
1657
impl InlineStack {
    /// These are indices into the lower bounds array.
    /// Not both refers to the property that the delimiter can not both
    /// be opener as a closer.
    const UNDERSCORE_NOT_BOTH: usize = 0;
    const ASTERISK_NOT_BOTH: usize = 1;
    // ASTERISK_BASE + count % 3 addresses three slots, one per run length
    // modulo 3 (needed for the commonmark "multiple of three" rule).
    const ASTERISK_BASE: usize = 2;
    const TILDES: usize = 5;
    const UNDERSCORE_BOTH: usize = 6;

    /// Demotes every remaining delimiter on the stack to plain text and
    /// resets all lower bounds.
    fn pop_all(&mut self, tree: &mut Tree<Item>) {
        for el in self.stack.drain(..) {
            for i in 0..el.count {
                tree[el.start + i].item.body = ItemBody::Text;
            }
        }
        self.lower_bounds = [0; 7];
    }

    /// Returns the smallest stack index at which a match for the given
    /// delimiter kind could possibly be found.
    fn get_lowerbound(&self, c: u8, count: usize, both: bool) -> usize {
        if c == b'_' {
            if both {
                self.lower_bounds[InlineStack::UNDERSCORE_BOTH]
            } else {
                self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH]
            }
        } else if c == b'*' {
            let mod3_lower = self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3];
            if both {
                mod3_lower
            } else {
                min(
                    mod3_lower,
                    self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH],
                )
            }
        } else {
            self.lower_bounds[InlineStack::TILDES]
        }
    }

    /// Records that no match for this delimiter kind exists below `new_bound`.
    fn set_lowerbound(&mut self, c: u8, count: usize, both: bool, new_bound: usize) {
        if c == b'_' {
            if both {
                self.lower_bounds[InlineStack::UNDERSCORE_BOTH] = new_bound;
            } else {
                self.lower_bounds[InlineStack::UNDERSCORE_NOT_BOTH] = new_bound;
            }
        } else if c == b'*' {
            self.lower_bounds[InlineStack::ASTERISK_BASE + count % 3] = new_bound;
            if !both {
                self.lower_bounds[InlineStack::ASTERISK_NOT_BOTH] = new_bound;
            }
        } else {
            self.lower_bounds[InlineStack::TILDES] = new_bound;
        }
    }

    /// Searches top-down (no lower than the cached lower bound) for an opener
    /// matching a closer of kind `c`. Elements skipped above the match are
    /// demoted to plain text; on failure the lower bound is bumped so the
    /// fruitless region is never re-scanned.
    fn find_match(
        &mut self,
        tree: &mut Tree<Item>,
        c: u8,
        count: usize,
        both: bool,
    ) -> Option<InlineEl> {
        let lowerbound = min(self.stack.len(), self.get_lowerbound(c, count, both));
        let res = self.stack[lowerbound..]
            .iter()
            .cloned()
            .enumerate()
            .rfind(|(_, el)| {
                // Second clause encodes the commonmark "multiple of three" rule.
                el.c == c && (!both && !el.both || (count + el.count) % 3 != 0 || count % 3 == 0)
            });

        if let Some((matching_ix, matching_el)) = res {
            // `matching_ix` was relative to the searched subslice.
            let matching_ix = matching_ix + lowerbound;
            for el in &self.stack[(matching_ix + 1)..] {
                for i in 0..el.count {
                    tree[el.start + i].item.body = ItemBody::Text;
                }
            }
            self.stack.truncate(matching_ix);
            Some(matching_el)
        } else {
            self.set_lowerbound(c, count, both, self.stack.len());
            None
        }
    }

    fn push(&mut self, el: InlineEl) {
        self.stack.push(el)
    }
}
1751
/// Outcome of scanning for a reference (`[label]` or collapsed `[]`) right
/// after link text.
#[derive(Debug, Clone)]
enum RefScan<'a> {
    // label, source ix of label end
    LinkLabel(CowStr<'a>, usize),
    // contains next node index
    Collapsed(Option<TreeIndex>),
    Failed,
}
1760
1761 /// Skips forward within a block to a node which spans (ends inclusive) the given
1762 /// index into the source.
scan_nodes_to_ix( tree: &Tree<Item>, mut node: Option<TreeIndex>, ix: usize, ) -> Option<TreeIndex>1763 fn scan_nodes_to_ix(
1764 tree: &Tree<Item>,
1765 mut node: Option<TreeIndex>,
1766 ix: usize,
1767 ) -> Option<TreeIndex> {
1768 while let Some(node_ix) = node {
1769 if tree[node_ix].item.end <= ix {
1770 node = tree[node_ix].next;
1771 } else {
1772 break;
1773 }
1774 }
1775 node
1776 }
1777
/// Scans an inline link label, which cannot be interrupted.
/// Returns number of bytes (including brackets) and label on success.
fn scan_link_label<'text, 'tree>(
    tree: &'tree Tree<Item>,
    text: &'text str,
    allow_footnote_refs: bool,
) -> Option<(usize, ReferenceLabel<'text>)> {
    let bytes = &text.as_bytes();
    if bytes.len() < 2 || bytes[0] != b'[' {
        return None;
    }
    // Labels may span lines; line breaks are accepted and the container
    // markers at the start of the next line are skipped.
    let linebreak_handler = |bytes: &[u8]| {
        let mut line_start = LineStart::new(bytes);
        let _ = scan_containers(tree, &mut line_start);
        Some(line_start.bytes_scanned())
    };
    // `[^...]` is a footnote reference when the extension is enabled;
    // returned byte counts include the bracket (and caret) prefix.
    let pair = if allow_footnote_refs && b'^' == bytes[1] {
        let (byte_index, cow) = scan_link_label_rest(&text[2..], &linebreak_handler)?;
        (byte_index + 2, ReferenceLabel::Footnote(cow))
    } else {
        let (byte_index, cow) = scan_link_label_rest(&text[1..], &linebreak_handler)?;
        (byte_index + 1, ReferenceLabel::Link(cow))
    };
    Some(pair)
}
1803
scan_reference<'a, 'b>( tree: &'a Tree<Item>, text: &'b str, cur: Option<TreeIndex>, allow_footnote_refs: bool, ) -> RefScan<'b>1804 fn scan_reference<'a, 'b>(
1805 tree: &'a Tree<Item>,
1806 text: &'b str,
1807 cur: Option<TreeIndex>,
1808 allow_footnote_refs: bool,
1809 ) -> RefScan<'b> {
1810 let cur_ix = match cur {
1811 None => return RefScan::Failed,
1812 Some(cur_ix) => cur_ix,
1813 };
1814 let start = tree[cur_ix].item.start;
1815 let tail = &text.as_bytes()[start..];
1816
1817 if tail.starts_with(b"[]") {
1818 let closing_node = tree[cur_ix].next.unwrap();
1819 RefScan::Collapsed(tree[closing_node].next)
1820 } else if let Some((ix, ReferenceLabel::Link(label))) =
1821 scan_link_label(tree, &text[start..], allow_footnote_refs)
1822 {
1823 RefScan::LinkLabel(label, start + ix)
1824 } else {
1825 RefScan::Failed
1826 }
1827 }
1828
/// Stack of open link/image bracket candidates during the inline pass.
#[derive(Clone, Default)]
struct LinkStack {
    inner: Vec<LinkStackEl>,
    // Elements below this index are already disabled, so
    // `disable_all_links` never re-scans the same prefix.
    disabled_ix: usize,
}
1834
1835 impl LinkStack {
push(&mut self, el: LinkStackEl)1836 fn push(&mut self, el: LinkStackEl) {
1837 self.inner.push(el);
1838 }
1839
pop(&mut self) -> Option<LinkStackEl>1840 fn pop(&mut self) -> Option<LinkStackEl> {
1841 let el = self.inner.pop();
1842 self.disabled_ix = std::cmp::min(self.disabled_ix, self.inner.len());
1843 el
1844 }
1845
clear(&mut self)1846 fn clear(&mut self) {
1847 self.inner.clear();
1848 self.disabled_ix = 0;
1849 }
1850
disable_all_links(&mut self)1851 fn disable_all_links(&mut self) {
1852 for el in &mut self.inner[self.disabled_ix..] {
1853 if el.ty == LinkStackTy::Link {
1854 el.ty = LinkStackTy::Disabled;
1855 }
1856 }
1857 self.disabled_ix = self.inner.len();
1858 }
1859 }
1860
/// A single open bracket candidate on the link stack.
#[derive(Clone, Debug)]
struct LinkStackEl {
    node: TreeIndex, // tree node of the opening bracket
    ty: LinkStackTy,
}
1866
/// Kind of an open bracket on the link stack.
#[derive(PartialEq, Clone, Debug)]
enum LinkStackTy {
    Link,
    Image,
    Disabled, // candidate invalidated by `disable_all_links`
}
1873
/// A parsed link reference definition: destination plus optional title.
#[derive(Clone)]
struct LinkDef<'a> {
    dest: CowStr<'a>,
    title: Option<CowStr<'a>>,
}
1879
/// Tracks tree indices of code span delimiters of each length. It should prevent
/// quadratic scanning behaviours by providing (amortized) constant time lookups.
struct CodeDelims {
    inner: HashMap<usize, VecDeque<TreeIndex>>, // delimiter length -> indices, in source order
    seen_first: bool, // whether the first (always-opening) delimiter was skipped
}
1886
1887 impl CodeDelims {
new() -> Self1888 fn new() -> Self {
1889 Self {
1890 inner: Default::default(),
1891 seen_first: false,
1892 }
1893 }
1894
insert(&mut self, count: usize, ix: TreeIndex)1895 fn insert(&mut self, count: usize, ix: TreeIndex) {
1896 if self.seen_first {
1897 self.inner
1898 .entry(count)
1899 .or_insert_with(Default::default)
1900 .push_back(ix);
1901 } else {
1902 // Skip the first insert, since that delimiter will always
1903 // be an opener and not a closer.
1904 self.seen_first = true;
1905 }
1906 }
1907
is_populated(&self) -> bool1908 fn is_populated(&self) -> bool {
1909 !self.inner.is_empty()
1910 }
1911
find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex>1912 fn find(&mut self, open_ix: TreeIndex, count: usize) -> Option<TreeIndex> {
1913 while let Some(ix) = self.inner.get_mut(&count)?.pop_front() {
1914 if ix > open_ix {
1915 return Some(ix);
1916 }
1917 }
1918 None
1919 }
1920
clear(&mut self)1921 fn clear(&mut self) {
1922 self.inner.clear();
1923 self.seen_first = false;
1924 }
1925 }
1926
/// Opaque index into `Allocations::links`.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
struct LinkIndex(usize);

/// Opaque index into `Allocations::cows`.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
struct CowIndex(usize);

/// Opaque index into `Allocations::alignments`.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
struct AlignmentIndex(usize);
1935
/// Owned side data (reference definitions, links, strings, table
/// alignments), accessed through the typed index newtypes above.
#[derive(Clone)]
struct Allocations<'a> {
    refdefs: HashMap<LinkLabel<'a>, LinkDef<'a>>,
    links: Vec<(LinkType, CowStr<'a>, CowStr<'a>)>,
    cows: Vec<CowStr<'a>>,
    alignments: Vec<Vec<Alignment>>,
}
1943
1944 impl<'a> Allocations<'a> {
new() -> Self1945 fn new() -> Self {
1946 Self {
1947 refdefs: HashMap::new(),
1948 links: Vec::with_capacity(128),
1949 cows: Vec::new(),
1950 alignments: Vec::new(),
1951 }
1952 }
1953
allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex1954 fn allocate_cow(&mut self, cow: CowStr<'a>) -> CowIndex {
1955 let ix = self.cows.len();
1956 self.cows.push(cow);
1957 CowIndex(ix)
1958 }
1959
allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex1960 fn allocate_link(&mut self, ty: LinkType, url: CowStr<'a>, title: CowStr<'a>) -> LinkIndex {
1961 let ix = self.links.len();
1962 self.links.push((ty, url, title));
1963 LinkIndex(ix)
1964 }
1965
allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex1966 fn allocate_alignment(&mut self, alignment: Vec<Alignment>) -> AlignmentIndex {
1967 let ix = self.alignments.len();
1968 self.alignments.push(alignment);
1969 AlignmentIndex(ix)
1970 }
1971 }
1972
// Indexing an `Allocations` with a typed index returns the allocated value.
impl<'a> Index<CowIndex> for Allocations<'a> {
    type Output = CowStr<'a>;

    fn index(&self, ix: CowIndex) -> &Self::Output {
        self.cows.index(ix.0)
    }
}

impl<'a> Index<LinkIndex> for Allocations<'a> {
    type Output = (LinkType, CowStr<'a>, CowStr<'a>);

    fn index(&self, ix: LinkIndex) -> &Self::Output {
        self.links.index(ix.0)
    }
}

impl<'a> Index<AlignmentIndex> for Allocations<'a> {
    type Output = Vec<Alignment>;

    fn index(&self, ix: AlignmentIndex) -> &Self::Output {
        self.alignments.index(ix.0)
    }
}
1996
/// A struct containing information on the reachability of certain inline HTML
/// elements. In particular, for cdata elements (`<![CDATA[`), processing
/// elements (`<?`) and declarations (`<!DECLARATION`). The respective usizes
/// represent the indices before which a scan will always fail and can hence
/// be skipped.
#[derive(Clone, Default)]
pub(crate) struct HtmlScanGuard {
    pub cdata: usize,
    pub processing: usize,
    pub declaration: usize,
}
2008
special_bytes(options: &Options) -> [bool; 256]2009 fn special_bytes(options: &Options) -> [bool; 256] {
2010 let mut bytes = [false; 256];
2011 let standard_bytes = [
2012 b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
2013 ];
2014
2015 for &byte in &standard_bytes {
2016 bytes[byte as usize] = true;
2017 }
2018 if options.contains(Options::ENABLE_TABLES) {
2019 bytes[b'|' as usize] = true;
2020 }
2021 if options.contains(Options::ENABLE_STRIKETHROUGH) {
2022 bytes[b'~' as usize] = true;
2023 }
2024 if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
2025 for &byte in &[b'.', b'-', b'"', b'\''] {
2026 bytes[byte as usize] = true;
2027 }
2028 }
2029
2030 bytes
2031 }
2032
/// Builds the special-byte lookup table for the given options.
///
/// With the `simd` feature on x86_64 this also precomputes the SIMD table,
/// keeping the scalar table alongside it for tail/fallback handling; on other
/// targets the scalar table *is* the `LookupTable`.
pub(crate) fn create_lut(options: &Options) -> LookupTable {
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    {
        LookupTable {
            simd: crate::simd::compute_lookup(options),
            scalar: special_bytes(options),
        }
    }
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
    {
        special_bytes(options)
    }
}
2046
/// Optional callback invoked when a reference link cannot be resolved.
///
/// Returning `Some((url, title))` turns the otherwise-broken reference into a
/// working link; returning `None` leaves it as plain text.
pub type BrokenLinkCallback<'a> =
    Option<&'a mut dyn FnMut(BrokenLink) -> Option<(CowStr<'a>, CowStr<'a>)>>;
2049
/// Markdown event iterator.
pub struct Parser<'a> {
    // Original markdown source; all borrowed `CowStr`s point into it.
    text: &'a str,
    // Option flags the parser was constructed with.
    options: Options,
    // Block structure produced by the first pass; inline structure is
    // spliced in lazily during iteration.
    tree: Tree<Item>,
    // Interned strings, links and alignments referenced by tree nodes.
    allocs: Allocations<'a>,
    // Callback for resolving broken reference links, if any.
    broken_link_callback: BrokenLinkCallback<'a>,
    // Skip-ahead state for repeated inline HTML scans.
    html_scan_guard: HtmlScanGuard,

    // used by inline passes. store them here for reuse
    inline_stack: InlineStack,
    link_stack: LinkStack,
}
2063
2064 impl<'a> Parser<'a> {
2065 /// Creates a new event iterator for a markdown string without any options enabled.
new(text: &'a str) -> Parser<'a>2066 pub fn new(text: &'a str) -> Parser<'a> {
2067 Parser::new_ext(text, Options::empty())
2068 }
2069
2070 /// Creates a new event iterator for a markdown string with given options.
new_ext(text: &'a str, options: Options) -> Parser<'a>2071 pub fn new_ext(text: &'a str, options: Options) -> Parser<'a> {
2072 Parser::new_with_broken_link_callback(text, options, None)
2073 }
2074
2075 /// In case the parser encounters any potential links that have a broken
2076 /// reference (e.g `[foo]` when there is no `[foo]: ` entry at the bottom)
2077 /// the provided callback will be called with the reference name,
2078 /// and the returned pair will be used as the link name and title if it is not
2079 /// `None`.
new_with_broken_link_callback( text: &'a str, options: Options, broken_link_callback: BrokenLinkCallback<'a>, ) -> Parser<'a>2080 pub fn new_with_broken_link_callback(
2081 text: &'a str,
2082 options: Options,
2083 broken_link_callback: BrokenLinkCallback<'a>,
2084 ) -> Parser<'a> {
2085 let lut = create_lut(&options);
2086 let first_pass = FirstPass::new(text, options, &lut);
2087 let (mut tree, allocs) = first_pass.run();
2088 tree.reset();
2089 let inline_stack = Default::default();
2090 let link_stack = Default::default();
2091 let html_scan_guard = Default::default();
2092 Parser {
2093 text,
2094 options,
2095 tree,
2096 allocs,
2097 broken_link_callback,
2098 inline_stack,
2099 link_stack,
2100 html_scan_guard,
2101 }
2102 }
2103
    /// Handle inline markup.
    ///
    /// When the parser encounters any item indicating potential inline markup, all
    /// inline markup passes are run on the remainder of the chain.
    ///
    /// Note: there's some potential for optimization here, but that's future work.
    fn handle_inline(&mut self) {
        // Pass 1 resolves code spans, inline HTML and links (which protect
        // their contents); pass 2 then resolves emphasis and smart quotes.
        self.handle_inline_pass1();
        self.handle_emphasis();
    }
2114
    /// Handle inline HTML, code spans, and links.
    ///
    /// This function handles both inline HTML and code spans, because they have
    /// the same precedence. It also handles links, even though they have lower
    /// precedence, because the URL of links must not be processed.
    fn handle_inline_pass1(&mut self) {
        let mut code_delims = CodeDelims::new();
        let mut cur = self.tree.cur();
        let mut prev = None;

        // Inline scanning never needs to look past the end of the enclosing block.
        let block_end = self.tree[self.tree.peek_up().unwrap()].item.end;
        let block_text = &self.text[..block_end];

        while let Some(mut cur_ix) = cur {
            match self.tree[cur_ix].item.body {
                ItemBody::MaybeHtml => {
                    let next = self.tree[cur_ix].next;
                    // First try an autolink (`<http://…>` / `<addr@host>`).
                    let autolink = if let Some(next_ix) = next {
                        scan_autolink(block_text, self.tree[next_ix].item.start)
                    } else {
                        None
                    };

                    if let Some((ix, uri, link_type)) = autolink {
                        // Turn the `<` node into a Link wrapping a single text
                        // node that covers the URI (without the angle brackets).
                        let node = scan_nodes_to_ix(&self.tree, next, ix);
                        let text_node = self.tree.create_node(Item {
                            start: self.tree[cur_ix].item.start + 1,
                            end: ix - 1,
                            body: ItemBody::Text,
                        });
                        let link_ix = self.allocs.allocate_link(link_type, uri, "".into());
                        self.tree[cur_ix].item.body = ItemBody::Link(link_ix);
                        self.tree[cur_ix].item.end = ix;
                        self.tree[cur_ix].next = node;
                        self.tree[cur_ix].child = Some(text_node);
                        prev = cur;
                        cur = node;
                        if let Some(node_ix) = cur {
                            self.tree[node_ix].item.start = max(self.tree[node_ix].item.start, ix);
                        }
                        continue;
                    } else {
                        // Not an autolink; try raw inline HTML.
                        let inline_html = next.and_then(|next_ix| {
                            self.scan_inline_html(
                                block_text.as_bytes(),
                                self.tree[next_ix].item.start,
                            )
                        });
                        if let Some((span, ix)) = inline_html {
                            let node = scan_nodes_to_ix(&self.tree, next, ix);
                            // A non-empty span means bytes were rewritten while
                            // scanning, so the HTML must be stored owned; an
                            // empty span can stay a slice of the source.
                            self.tree[cur_ix].item.body = if !span.is_empty() {
                                let converted_string =
                                    String::from_utf8(span).expect("invalid utf8");
                                ItemBody::OwnedHtml(
                                    self.allocs.allocate_cow(converted_string.into()),
                                )
                            } else {
                                ItemBody::Html
                            };
                            self.tree[cur_ix].item.end = ix;
                            self.tree[cur_ix].next = node;
                            prev = cur;
                            cur = node;
                            if let Some(node_ix) = cur {
                                self.tree[node_ix].item.start =
                                    max(self.tree[node_ix].item.start, ix);
                            }
                            continue;
                        }
                    }
                    // Neither autolink nor HTML: the `<` is plain text.
                    self.tree[cur_ix].item.body = ItemBody::Text;
                }
                ItemBody::MaybeCode(mut search_count, preceded_by_backslash) => {
                    if preceded_by_backslash {
                        // The first backtick is escaped; it is not part of the run.
                        search_count -= 1;
                        if search_count == 0 {
                            self.tree[cur_ix].item.body = ItemBody::Text;
                            prev = cur;
                            cur = self.tree[cur_ix].next;
                            continue;
                        }
                    }

                    if code_delims.is_populated() {
                        // we have previously scanned all codeblock delimiters,
                        // so we can reuse that work
                        if let Some(scan_ix) = code_delims.find(cur_ix, search_count) {
                            self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
                        } else {
                            self.tree[cur_ix].item.body = ItemBody::Text;
                        }
                    } else {
                        // we haven't previously scanned all codeblock delimiters,
                        // so walk the AST
                        let mut scan = if search_count > 0 {
                            self.tree[cur_ix].next
                        } else {
                            None
                        };
                        while let Some(scan_ix) = scan {
                            if let ItemBody::MaybeCode(delim_count, _) =
                                self.tree[scan_ix].item.body
                            {
                                if search_count == delim_count {
                                    self.make_code_span(cur_ix, scan_ix, preceded_by_backslash);
                                    code_delims.clear();
                                    break;
                                } else {
                                    // Remember mismatched delimiters for reuse
                                    // on future iterations.
                                    code_delims.insert(delim_count, scan_ix);
                                }
                            }
                            scan = self.tree[scan_ix].next;
                        }
                        if scan == None {
                            self.tree[cur_ix].item.body = ItemBody::Text;
                        }
                    }
                }
                ItemBody::MaybeLinkOpen => {
                    // Remember the opener; resolved when a `]` is found.
                    self.tree[cur_ix].item.body = ItemBody::Text;
                    self.link_stack.push(LinkStackEl {
                        node: cur_ix,
                        ty: LinkStackTy::Link,
                    });
                }
                ItemBody::MaybeImage => {
                    self.tree[cur_ix].item.body = ItemBody::Text;
                    self.link_stack.push(LinkStackEl {
                        node: cur_ix,
                        ty: LinkStackTy::Image,
                    });
                }
                ItemBody::MaybeLinkClose(could_be_ref) => {
                    self.tree[cur_ix].item.body = ItemBody::Text;
                    if let Some(tos) = self.link_stack.pop() {
                        if tos.ty == LinkStackTy::Disabled {
                            continue;
                        }
                        let next = self.tree[cur_ix].next;
                        if let Some((next_ix, url, title)) =
                            self.scan_inline_link(block_text, self.tree[cur_ix].item.end, next)
                        {
                            // Inline link `[…](url "title")`: reparent the
                            // nodes between opener and closer as children.
                            let next_node = scan_nodes_to_ix(&self.tree, next, next_ix);
                            if let Some(prev_ix) = prev {
                                self.tree[prev_ix].next = None;
                            }
                            cur = Some(tos.node);
                            cur_ix = tos.node;
                            let link_ix = self.allocs.allocate_link(LinkType::Inline, url, title);
                            self.tree[cur_ix].item.body = if tos.ty == LinkStackTy::Image {
                                ItemBody::Image(link_ix)
                            } else {
                                ItemBody::Link(link_ix)
                            };
                            self.tree[cur_ix].child = self.tree[cur_ix].next;
                            self.tree[cur_ix].next = next_node;
                            self.tree[cur_ix].item.end = next_ix;
                            if let Some(next_node_ix) = next_node {
                                self.tree[next_node_ix].item.start =
                                    max(self.tree[next_node_ix].item.start, next_ix);
                            }

                            // Links may not nest: once a link closes, pending
                            // link openers become plain text.
                            if tos.ty == LinkStackTy::Link {
                                self.link_stack.disable_all_links();
                            }
                        } else {
                            // ok, so its not an inline link. maybe it is a reference
                            // to a defined link?
                            let scan_result = scan_reference(
                                &self.tree,
                                block_text,
                                next,
                                self.options.contains(Options::ENABLE_FOOTNOTES),
                            );
                            let (node_after_link, link_type) = match scan_result {
                                // [label][reference]
                                RefScan::LinkLabel(_, end_ix) => {
                                    // Toggle reference viability of the last closing bracket,
                                    // so that we can skip it on future iterations in case
                                    // it fails in this one. In particular, we won't call
                                    // the broken link callback twice on one reference.
                                    let reference_close_node =
                                        scan_nodes_to_ix(&self.tree, next, end_ix - 1).unwrap();
                                    self.tree[reference_close_node].item.body =
                                        ItemBody::MaybeLinkClose(false);
                                    let next_node = self.tree[reference_close_node].next;

                                    (next_node, LinkType::Reference)
                                }
                                // [reference][]
                                RefScan::Collapsed(next_node) => {
                                    // This reference has already been tried, and it's not
                                    // valid. Skip it.
                                    if !could_be_ref {
                                        continue;
                                    }
                                    (next_node, LinkType::Collapsed)
                                }
                                // [shortcut]
                                //
                                // [shortcut]: /blah
                                RefScan::Failed => {
                                    if !could_be_ref {
                                        continue;
                                    }
                                    (next, LinkType::Shortcut)
                                }
                            };

                            // FIXME: references and labels are mixed in the naming of variables
                            // below. Disambiguate!

                            // (label, source_ix end)
                            let label: Option<(ReferenceLabel<'a>, usize)> = match scan_result {
                                RefScan::LinkLabel(l, end_ix) => {
                                    Some((ReferenceLabel::Link(l), end_ix))
                                }
                                RefScan::Collapsed(..) | RefScan::Failed => {
                                    // No label? maybe it is a shortcut reference
                                    let label_start = self.tree[tos.node].item.end - 1;
                                    scan_link_label(
                                        &self.tree,
                                        &self.text[label_start..self.tree[cur_ix].item.end],
                                        self.options.contains(Options::ENABLE_FOOTNOTES),
                                    )
                                    .map(|(ix, label)| (label, label_start + ix))
                                }
                            };

                            // see if it's a footnote reference
                            if let Some((ReferenceLabel::Footnote(l), end)) = label {
                                self.tree[tos.node].next = node_after_link;
                                self.tree[tos.node].child = None;
                                self.tree[tos.node].item.body =
                                    ItemBody::FootnoteReference(self.allocs.allocate_cow(l));
                                self.tree[tos.node].item.end = end;
                                prev = Some(tos.node);
                                cur = node_after_link;
                                self.link_stack.clear();
                                continue;
                            } else if let Some((ReferenceLabel::Link(link_label), end)) = label {
                                // Look the label up in the reference definitions;
                                // fall back to the broken-link callback.
                                let type_url_title = self
                                    .allocs
                                    .refdefs
                                    .get(&UniCase::new(link_label.as_ref().into()))
                                    .map(|matching_def| {
                                        // found a matching definition!
                                        let title = matching_def
                                            .title
                                            .as_ref()
                                            .cloned()
                                            .unwrap_or_else(|| "".into());
                                        let url = matching_def.dest.clone();
                                        (link_type, url, title)
                                    })
                                    .or_else(|| {
                                        match self.broken_link_callback.as_mut() {
                                            Some(callback) => {
                                                // Construct a BrokenLink struct, which will be passed to the callback
                                                let broken_link = BrokenLink {
                                                    span: (self.tree[tos.node].item.start)..end,
                                                    link_type: link_type,
                                                    reference: link_label.as_ref(),
                                                };

                                                callback(broken_link).map(|(url, title)| {
                                                    (link_type.to_unknown(), url, title)
                                                })
                                            }
                                            None => None,
                                        }
                                    });

                                if let Some((def_link_type, url, title)) = type_url_title {
                                    let link_ix =
                                        self.allocs.allocate_link(def_link_type, url, title);
                                    self.tree[tos.node].item.body = if tos.ty == LinkStackTy::Image
                                    {
                                        ItemBody::Image(link_ix)
                                    } else {
                                        ItemBody::Link(link_ix)
                                    };
                                    let label_node = self.tree[tos.node].next;

                                    // lets do some tree surgery to add the link to the tree
                                    // 1st: skip the label node and close node
                                    self.tree[tos.node].next = node_after_link;

                                    // then, if it exists, add the label node as a child to the link node
                                    if label_node != cur {
                                        self.tree[tos.node].child = label_node;

                                        // finally: disconnect list of children
                                        if let Some(prev_ix) = prev {
                                            self.tree[prev_ix].next = None;
                                        }
                                    }

                                    self.tree[tos.node].item.end = end;

                                    // set up cur so next node will be node_after_link
                                    cur = Some(tos.node);
                                    cur_ix = tos.node;

                                    if tos.ty == LinkStackTy::Link {
                                        self.link_stack.disable_all_links();
                                    }
                                }
                            }
                        }
                    }
                }
                _ => (),
            }
            prev = cur;
            cur = self.tree[cur_ix].next;
        }
        self.link_stack.clear();
    }
2434
    /// Resolve emphasis, strong emphasis, strikethrough and smart quotes.
    ///
    /// Runs after `handle_inline_pass1`, so code spans / HTML / links have
    /// already been fixed and their contents are protected.
    fn handle_emphasis(&mut self) {
        let mut prev = None;
        let mut prev_ix: TreeIndex;
        let mut cur = self.tree.cur();

        // Smart-quote pairing state for the current chain.
        let mut single_quote_open: Option<TreeIndex> = None;
        let mut double_quote_open: bool = false;

        while let Some(mut cur_ix) = cur {
            match self.tree[cur_ix].item.body {
                ItemBody::MaybeEmphasis(mut count, can_open, can_close) => {
                    let c = self.text.as_bytes()[self.tree[cur_ix].item.start];
                    let both = can_open && can_close;
                    if can_close {
                        // Try to match this closer against openers on the stack,
                        // repeatedly, since one run can close several openers.
                        while let Some(el) =
                            self.inline_stack.find_match(&mut self.tree, c, count, both)
                        {
                            // have a match!
                            if let Some(prev_ix) = prev {
                                self.tree[prev_ix].next = None;
                            }
                            let match_count = min(count, el.count);
                            // start, end are tree node indices
                            let mut end = cur_ix - 1;
                            let mut start = el.start + el.count;

                            // work from the inside out
                            while start > el.start + el.count - match_count {
                                // `~~` always yields strikethrough; otherwise two
                                // delimiters make Strong and a final single one
                                // makes Emphasis.
                                let (inc, ty) = if c == b'~' {
                                    (2, ItemBody::Strikethrough)
                                } else if start > el.start + el.count - match_count + 1 {
                                    (2, ItemBody::Strong)
                                } else {
                                    (1, ItemBody::Emphasis)
                                };

                                let root = start - inc;
                                end = end + inc;
                                self.tree[root].item.body = ty;
                                self.tree[root].item.end = self.tree[end].item.end;
                                self.tree[root].child = Some(start);
                                self.tree[root].next = None;
                                start = root;
                            }

                            // set next for top most emph level
                            prev_ix = el.start + el.count - match_count;
                            prev = Some(prev_ix);
                            cur = self.tree[cur_ix + match_count - 1].next;
                            self.tree[prev_ix].next = cur;

                            // Opener had more delimiters than we consumed:
                            // push the remainder back for later closers.
                            if el.count > match_count {
                                self.inline_stack.push(InlineEl {
                                    start: el.start,
                                    count: el.count - match_count,
                                    c: el.c,
                                    both,
                                })
                            }
                            count -= match_count;
                            if count > 0 {
                                cur_ix = cur.unwrap();
                            } else {
                                break;
                            }
                        }
                    }
                    if count > 0 {
                        if can_open {
                            // Unconsumed delimiters may still open emphasis later.
                            self.inline_stack.push(InlineEl {
                                start: cur_ix,
                                count,
                                c,
                                both,
                            });
                        } else {
                            // Can neither open nor close: demote to plain text.
                            for i in 0..count {
                                self.tree[cur_ix + i].item.body = ItemBody::Text;
                            }
                        }
                        prev_ix = cur_ix + count - 1;
                        prev = Some(prev_ix);
                        cur = self.tree[prev_ix].next;
                    }
                }
                ItemBody::MaybeSmartQuote(c, can_open, can_close) => {
                    self.tree[cur_ix].item.body = match c {
                        b'\'' => {
                            // Pair with an earlier opening single quote if one
                            // is pending; otherwise possibly start a new pair.
                            if let (Some(open_ix), true) = (single_quote_open, can_close) {
                                self.tree[open_ix].item.body = ItemBody::SynthesizeChar('‘');
                                single_quote_open = None;
                            } else if can_open {
                                single_quote_open = Some(cur_ix);
                            }
                            ItemBody::SynthesizeChar('’')
                        }
                        _ /* double quote */ => {
                            if can_close && double_quote_open {
                                double_quote_open = false;
                                ItemBody::SynthesizeChar('”')
                            } else {
                                if can_open && !double_quote_open {
                                    double_quote_open = true;
                                }
                                ItemBody::SynthesizeChar('“')
                            }
                        }
                    };
                    prev = cur;
                    cur = self.tree[cur_ix].next;
                }
                _ => {
                    prev = cur;
                    cur = self.tree[cur_ix].next;
                }
            }
        }
        // Any openers left on the stack were never matched; demote them to text.
        self.inline_stack.pop_all(&mut self.tree);
    }
2554
    /// Returns next byte index, url and title.
    ///
    /// Scans the `(destination "title")` part of an inline link starting at
    /// `ix` (which must point at the opening parenthesis); returns `None` if
    /// the syntax does not match.
    fn scan_inline_link(
        &self,
        underlying: &'a str,
        mut ix: usize,
        node: Option<TreeIndex>,
    ) -> Option<(usize, CowStr<'a>, CowStr<'a>)> {
        if scan_ch(&underlying.as_bytes()[ix..], b'(') == 0 {
            return None;
        }
        ix += 1;
        ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);

        // Destination first; parenthesis nesting is capped to bound work.
        let (dest_length, dest) = scan_link_dest(underlying, ix, LINK_MAX_NESTED_PARENS)?;
        let dest = unescape(dest);
        ix += dest_length;

        ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);

        // Title is optional; absence yields an empty title.
        let title = if let Some((bytes_scanned, t)) = self.scan_link_title(underlying, ix, node) {
            ix += bytes_scanned;
            ix += scan_while(&underlying.as_bytes()[ix..], is_ascii_whitespace);
            t
        } else {
            "".into()
        };
        if scan_ch(&underlying.as_bytes()[ix..], b')') == 0 {
            return None;
        }
        ix += 1;

        Some((ix, dest, title))
    }
2588
2589 // returns (bytes scanned, title cow)
scan_link_title( &self, text: &'a str, start_ix: usize, node: Option<TreeIndex>, ) -> Option<(usize, CowStr<'a>)>2590 fn scan_link_title(
2591 &self,
2592 text: &'a str,
2593 start_ix: usize,
2594 node: Option<TreeIndex>,
2595 ) -> Option<(usize, CowStr<'a>)> {
2596 let bytes = text.as_bytes();
2597 let open = match bytes.get(start_ix) {
2598 Some(b @ b'\'') | Some(b @ b'\"') | Some(b @ b'(') => *b,
2599 _ => return None,
2600 };
2601 let close = if open == b'(' { b')' } else { open };
2602
2603 let mut title = String::new();
2604 let mut mark = start_ix + 1;
2605 let mut i = start_ix + 1;
2606
2607 while i < bytes.len() {
2608 let c = bytes[i];
2609
2610 if c == close {
2611 let cow = if mark == 1 {
2612 (i - start_ix + 1, text[mark..i].into())
2613 } else {
2614 title.push_str(&text[mark..i]);
2615 (i - start_ix + 1, title.into())
2616 };
2617
2618 return Some(cow);
2619 }
2620 if c == open {
2621 return None;
2622 }
2623
2624 if c == b'\n' || c == b'\r' {
2625 if let Some(node_ix) = scan_nodes_to_ix(&self.tree, node, i + 1) {
2626 if self.tree[node_ix].item.start > i {
2627 title.push_str(&text[mark..i]);
2628 title.push('\n');
2629 i = self.tree[node_ix].item.start;
2630 mark = i;
2631 continue;
2632 }
2633 }
2634 }
2635 if c == b'&' {
2636 if let (n, Some(value)) = scan_entity(&bytes[i..]) {
2637 title.push_str(&text[mark..i]);
2638 title.push_str(&value);
2639 i += n;
2640 mark = i;
2641 continue;
2642 }
2643 }
2644 if c == b'\\' && i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) {
2645 title.push_str(&text[mark..i]);
2646 i += 1;
2647 mark = i;
2648 }
2649
2650 i += 1;
2651 }
2652
2653 None
2654 }
2655
    /// Make a code span.
    ///
    /// Both `open` and `close` are matching MaybeCode items.
    fn make_code_span(&mut self, open: TreeIndex, close: TreeIndex, preceding_backslash: bool) {
        let first_ix = open + 1;
        let last_ix = close - 1;
        let bytes = self.text.as_bytes();
        let mut span_start = self.tree[open].item.end;
        let mut span_end = self.tree[close].item.start;
        // Owned buffer, lazily created only when line breaks inside the span
        // must be rewritten to spaces; otherwise the span stays borrowed.
        let mut buf: Option<String> = None;

        // detect all-space sequences, since they are kept as-is as of commonmark 0.29
        if !bytes[span_start..span_end].iter().all(|&b| b == b' ') {
            let opening = match bytes[span_start] {
                b' ' | b'\r' | b'\n' => true,
                _ => false,
            };
            let closing = match bytes[span_end - 1] {
                b' ' | b'\r' | b'\n' => true,
                _ => false,
            };
            // One leading AND one trailing space/newline are stripped together.
            let drop_enclosing_whitespace = opening && closing;

            if drop_enclosing_whitespace {
                span_start += 1;
                if span_start < span_end {
                    span_end -= 1;
                }
            }

            let mut ix = first_ix;

            while ix < close {
                if let ItemBody::HardBreak | ItemBody::SoftBreak = self.tree[ix].item.body {
                    if drop_enclosing_whitespace {
                        // check whether break should be ignored
                        if ix == first_ix {
                            ix = ix + 1;
                            span_start = min(span_end, self.tree[ix].item.start);
                            continue;
                        } else if ix == last_ix && last_ix > first_ix {
                            ix = ix + 1;
                            continue;
                        }
                    }

                    // An interior line break is normalized to a single space;
                    // copy the text up to the break, then the space.
                    let end = bytes[self.tree[ix].item.start..]
                        .iter()
                        .position(|&b| b == b'\r' || b == b'\n')
                        .unwrap()
                        + self.tree[ix].item.start;
                    if let Some(ref mut buf) = buf {
                        buf.push_str(&self.text[self.tree[ix].item.start..end]);
                        buf.push(' ');
                    } else {
                        let mut new_buf = String::with_capacity(span_end - span_start);
                        new_buf.push_str(&self.text[span_start..end]);
                        new_buf.push(' ');
                        buf = Some(new_buf);
                    }
                } else if let Some(ref mut buf) = buf {
                    let end = if ix == last_ix {
                        span_end
                    } else {
                        self.tree[ix].item.end
                    };
                    buf.push_str(&self.text[self.tree[ix].item.start..end]);
                }
                ix = ix + 1;
            }
        }

        let cow = if let Some(buf) = buf {
            buf.into()
        } else {
            self.text[span_start..span_end].into()
        };
        if preceding_backslash {
            // The escaped leading backtick stays as a one-byte text node; the
            // close node becomes the Code item.
            self.tree[open].item.body = ItemBody::Text;
            self.tree[open].item.end = self.tree[open].item.start + 1;
            self.tree[open].next = Some(close);
            self.tree[close].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
            self.tree[close].item.start = self.tree[open].item.start + 1;
        } else {
            // The open node becomes the Code item and absorbs everything up to
            // and including the close delimiter.
            self.tree[open].item.body = ItemBody::Code(self.allocs.allocate_cow(cow));
            self.tree[open].item.end = self.tree[close].item.end;
            self.tree[open].next = self.tree[close].next;
        }
    }
2745
    /// On success, returns a buffer containing the inline html and byte offset.
    /// When no bytes were skipped, the buffer will be empty and the html can be
    /// represented as a subslice of the input string.
    ///
    /// `ix` points at the byte immediately after the `<`.
    fn scan_inline_html(&mut self, bytes: &[u8], ix: usize) -> Option<(Vec<u8>, usize)> {
        let c = *bytes.get(ix)?;
        if c == b'!' {
            // Comment / CDATA / declaration; the guard skips known-failing scans.
            Some((
                vec![],
                scan_inline_html_comment(bytes, ix + 1, &mut self.html_scan_guard)?,
            ))
        } else if c == b'?' {
            // Processing instruction (`<? … ?>`).
            Some((
                vec![],
                scan_inline_html_processing(bytes, ix + 1, &mut self.html_scan_guard)?,
            ))
        } else {
            // Regular open/close tag; may span lines, so supply a callback that
            // tells the scanner how many container-prefix bytes to skip at
            // each line start.
            let (span, i) = scan_html_block_inner(
                // Subtract 1 to include the < character
                &bytes[(ix - 1)..],
                Some(&|_bytes| {
                    let mut line_start = LineStart::new(bytes);
                    let _ = scan_containers(&self.tree, &mut line_start);
                    line_start.bytes_scanned()
                }),
            )?;
            Some((span, i + ix - 1))
        }
    }
2774
    /// Consumes the event iterator and produces an iterator that produces
    /// `(Event, Range)` pairs, where the `Range` value maps to the corresponding
    /// range in the markdown source.
    ///
    /// This is a zero-cost wrapper: parsing state is moved, not copied.
    pub fn into_offset_iter(self) -> OffsetIter<'a> {
        OffsetIter { inner: self }
    }
2781 }
2782
/// Instruction returned by the callback of `iterate_special_bytes`, steering
/// the scan loop.
pub(crate) enum LoopInstruction<T> {
    /// Continue looking for more special bytes, but skip next few bytes.
    ContinueAndSkip(usize),
    /// Break looping immediately, returning with the given index and value.
    BreakAtWith(usize, T),
}
2789
/// Special-byte lookup tables (SIMD build): a 16-byte table for the vectorized
/// scan plus the scalar table used for tails and fallback.
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
pub(crate) struct LookupTable {
    pub simd: [u8; 16],
    pub scalar: [bool; 256],
}
2795
/// Special-byte lookup table (non-SIMD build): just the scalar table.
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
type LookupTable = [bool; 256];
2798
/// This function walks the byte slices from the given index and
/// calls the callback function on all bytes (and their indices) that are in the following set:
/// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
/// It is guaranteed not to call the callback on other bytes.
/// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
/// will not be called with an index that is less than `ix + n + 1`.
/// When the callback returns a `BreakAtWith(end_ix, opt_val)`, no more callbacks will be
/// called and the function returns immediately with the return value `(end_ix, opt_val)`.
/// If `BreakAtWith(..)` is never returned, this function will return the first
/// index that is outside the byteslice bound and a `None` value.
fn iterate_special_bytes<F, T>(
    lut: &LookupTable,
    bytes: &[u8],
    ix: usize,
    callback: F,
) -> (usize, Option<T>)
where
    F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
    // Dispatch to the SIMD scanner when available; otherwise scalar.
    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
    {
        crate::simd::iterate_special_bytes(lut, bytes, ix, callback)
    }
    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
    {
        scalar_iterate_special_bytes(lut, bytes, ix, callback)
    }
}
2827
scalar_iterate_special_bytes<F, T>( lut: &[bool; 256], bytes: &[u8], mut ix: usize, mut callback: F, ) -> (usize, Option<T>) where F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,2828 pub(crate) fn scalar_iterate_special_bytes<F, T>(
2829 lut: &[bool; 256],
2830 bytes: &[u8],
2831 mut ix: usize,
2832 mut callback: F,
2833 ) -> (usize, Option<T>)
2834 where
2835 F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2836 {
2837 while ix < bytes.len() {
2838 let b = bytes[ix];
2839 if lut[b as usize] {
2840 match callback(ix, b) {
2841 LoopInstruction::ContinueAndSkip(skip) => {
2842 ix += skip;
2843 }
2844 LoopInstruction::BreakAtWith(ix, val) => {
2845 return (ix, val);
2846 }
2847 }
2848 }
2849 ix += 1;
2850 }
2851
2852 (ix, None)
2853 }
2854
/// Markdown event and source range iterator.
///
/// Generates tuples where the first element is the markdown event and the second
/// is a the corresponding range in the source string.
///
/// Constructed from a `Parser` using its
/// [`into_offset_iter`](struct.Parser.html#method.into_offset_iter) method.
pub struct OffsetIter<'a> {
    // The wrapped parser; this iterator delegates all parsing work to it.
    inner: Parser<'a>,
}
2865
impl<'a> Iterator for OffsetIter<'a> {
    type Item = (Event<'a>, Range<usize>);

    // Same traversal as `<Parser as Iterator>::next`, but each event is paired
    // with its source byte range.
    fn next(&mut self) -> Option<Self::Item> {
        match self.inner.tree.cur() {
            None => {
                // End of a chain of siblings: emit the End event for the
                // parent container and move past it.
                let ix = self.inner.tree.pop()?;
                let tag = item_to_tag(&self.inner.tree[ix].item, &self.inner.allocs);
                self.inner.tree.next_sibling(ix);
                Some((
                    Event::End(tag),
                    self.inner.tree[ix].item.start..self.inner.tree[ix].item.end,
                ))
            }
            Some(cur_ix) => {
                // Lazily run the inline passes when reaching inline content.
                if self.inner.tree[cur_ix].item.body.is_inline() {
                    self.inner.handle_inline();
                }

                let node = self.inner.tree[cur_ix];
                let item = node.item;
                let event = item_to_event(item, self.inner.text, &self.inner.allocs);
                // Start events descend into children; leaf events advance.
                if let Event::Start(..) = event {
                    self.inner.tree.push();
                } else {
                    self.inner.tree.next_sibling(cur_ix);
                }
                Some((event, item.start..item.end))
            }
        }
    }
}
2898
/// Maps a container `ItemBody` to its public `Tag`, resolving interned
/// links, code-fence infos and table alignments through `allocs`.
///
/// # Panics
/// Panics on leaf (non-container) bodies, which never produce tags.
fn item_to_tag<'a>(item: &Item, allocs: &Allocations<'a>) -> Tag<'a> {
    match item.body {
        ItemBody::Paragraph => Tag::Paragraph,
        ItemBody::Emphasis => Tag::Emphasis,
        ItemBody::Strong => Tag::Strong,
        ItemBody::Strikethrough => Tag::Strikethrough,
        ItemBody::Link(link_ix) => {
            let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
            Tag::Link(*link_type, url.clone(), title.clone())
        }
        ItemBody::Image(link_ix) => {
            let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
            Tag::Image(*link_type, url.clone(), title.clone())
        }
        ItemBody::Heading(level) => Tag::Heading(level),
        ItemBody::FencedCodeBlock(cow_ix) => {
            Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
        }
        ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
        ItemBody::BlockQuote => Tag::BlockQuote,
        ItemBody::List(_, c, listitem_start) => {
            // A `.` or `)` list marker means an ordered list with a start number.
            if c == b'.' || c == b')' {
                Tag::List(Some(listitem_start))
            } else {
                Tag::List(None)
            }
        }
        ItemBody::ListItem(_) => Tag::Item,
        ItemBody::TableHead => Tag::TableHead,
        ItemBody::TableCell => Tag::TableCell,
        ItemBody::TableRow => Tag::TableRow,
        ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
        ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
        _ => panic!("unexpected item body {:?}", item.body),
    }
}
2935
/// Converts an `Item` to its public `Event`: leaf bodies return directly
/// (Text, Code, Html, breaks, …); container bodies fall through to the tag
/// mapping and are wrapped in `Event::Start`.
///
/// # Panics
/// Panics on bodies that are neither a known leaf nor a known container.
fn item_to_event<'a>(item: Item, text: &'a str, allocs: &Allocations<'a>) -> Event<'a> {
    let tag = match item.body {
        // Leaf items produce an event immediately.
        ItemBody::Text => return Event::Text(text[item.start..item.end].into()),
        ItemBody::Code(cow_ix) => return Event::Code(allocs[cow_ix].clone()),
        ItemBody::SynthesizeText(cow_ix) => return Event::Text(allocs[cow_ix].clone()),
        ItemBody::SynthesizeChar(c) => return Event::Text(c.into()),
        ItemBody::Html => return Event::Html(text[item.start..item.end].into()),
        ItemBody::OwnedHtml(cow_ix) => return Event::Html(allocs[cow_ix].clone()),
        ItemBody::SoftBreak => return Event::SoftBreak,
        ItemBody::HardBreak => return Event::HardBreak,
        ItemBody::FootnoteReference(cow_ix) => {
            return Event::FootnoteReference(allocs[cow_ix].clone())
        }
        ItemBody::TaskListMarker(checked) => return Event::TaskListMarker(checked),
        ItemBody::Rule => return Event::Rule,

        // Container items produce a Start event; mapping mirrors `item_to_tag`.
        ItemBody::Paragraph => Tag::Paragraph,
        ItemBody::Emphasis => Tag::Emphasis,
        ItemBody::Strong => Tag::Strong,
        ItemBody::Strikethrough => Tag::Strikethrough,
        ItemBody::Link(link_ix) => {
            let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
            Tag::Link(*link_type, url.clone(), title.clone())
        }
        ItemBody::Image(link_ix) => {
            let &(ref link_type, ref url, ref title) = allocs.index(link_ix);
            Tag::Image(*link_type, url.clone(), title.clone())
        }
        ItemBody::Heading(level) => Tag::Heading(level),
        ItemBody::FencedCodeBlock(cow_ix) => {
            Tag::CodeBlock(CodeBlockKind::Fenced(allocs[cow_ix].clone()))
        }
        ItemBody::IndentCodeBlock => Tag::CodeBlock(CodeBlockKind::Indented),
        ItemBody::BlockQuote => Tag::BlockQuote,
        ItemBody::List(_, c, listitem_start) => {
            // A `.` or `)` list marker means an ordered list with a start number.
            if c == b'.' || c == b')' {
                Tag::List(Some(listitem_start))
            } else {
                Tag::List(None)
            }
        }
        ItemBody::ListItem(_) => Tag::Item,
        ItemBody::TableHead => Tag::TableHead,
        ItemBody::TableCell => Tag::TableCell,
        ItemBody::TableRow => Tag::TableRow,
        ItemBody::Table(alignment_ix) => Tag::Table(allocs[alignment_ix].clone()),
        ItemBody::FootnoteDefinition(cow_ix) => Tag::FootnoteDefinition(allocs[cow_ix].clone()),
        _ => panic!("unexpected item body {:?}", item.body),
    };

    Event::Start(tag)
}
2988
// https://english.stackexchange.com/a/285573
//
// Rewrites a "tight" list in place: the Paragraph wrappers inside each list
// item are removed by splicing the paragraphs' children directly into the
// item's child chain, so tight lists render without `<p>` tags.
fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
    let mut list_item = tree[list_ix].child;
    while let Some(listitem_ix) = list_item {
        // first child is special, controls how we repoint list_item.child
        let list_item_firstborn = tree[listitem_ix].child;

        // Check that list item has children - this is not necessarily the case!
        if let Some(firstborn_ix) = list_item_firstborn {
            if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
                tree[listitem_ix].child = tree[firstborn_ix].child;
            }

            let mut list_item_child = Some(firstborn_ix);
            // Last node of the already-spliced chain; its `next` gets repointed
            // as each subsequent child is processed.
            let mut node_to_repoint = None;
            while let Some(child_ix) = list_item_child {
                // surgerize paragraphs
                let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
                    if let Some(child_firstborn) = tree[child_ix].child {
                        if let Some(repoint_ix) = node_to_repoint {
                            tree[repoint_ix].next = Some(child_firstborn);
                        }
                        // Find the last sibling of the paragraph's children so
                        // the chain can be continued from there.
                        let mut child_lastborn = child_firstborn;
                        while let Some(lastborn_next_ix) = tree[child_lastborn].next {
                            child_lastborn = lastborn_next_ix;
                        }
                        child_lastborn
                    } else {
                        child_ix
                    }
                } else {
                    child_ix
                };

                node_to_repoint = Some(repoint_ix);
                tree[repoint_ix].next = tree[child_ix].next;
                list_item_child = tree[child_ix].next;
            }
        }

        list_item = tree[listitem_ix].next;
    }
}
3032
3033 impl<'a> Iterator for Parser<'a> {
3034 type Item = Event<'a>;
3035
next(&mut self) -> Option<Event<'a>>3036 fn next(&mut self) -> Option<Event<'a>> {
3037 match self.tree.cur() {
3038 None => {
3039 let ix = self.tree.pop()?;
3040 let tag = item_to_tag(&self.tree[ix].item, &self.allocs);
3041 self.tree.next_sibling(ix);
3042 Some(Event::End(tag))
3043 }
3044 Some(cur_ix) => {
3045 if self.tree[cur_ix].item.body.is_inline() {
3046 self.handle_inline();
3047 }
3048
3049 let node = self.tree[cur_ix];
3050 let item = node.item;
3051 let event = item_to_event(item, self.text, &self.allocs);
3052 if let Event::Start(..) = event {
3053 self.tree.push();
3054 } else {
3055 self.tree.next_sibling(cur_ix);
3056 }
3057 Some(event)
3058 }
3059 }
3060 }
3061 }
3062
#[cfg(test)]
mod test {
    use super::*;
    use crate::tree::Node;

    // TODO: move these tests to tests/html.rs?

    /// Builds a `Parser` with all the optional extensions (tables, footnotes,
    /// strikethrough, tasklists) enabled, for tests that exercise them.
    fn parser_with_extensions(text: &str) -> Parser<'_> {
        let mut opts = Options::empty();
        opts.insert(Options::ENABLE_TABLES);
        opts.insert(Options::ENABLE_FOOTNOTES);
        opts.insert(Options::ENABLE_STRIKETHROUGH);
        opts.insert(Options::ENABLE_TASKLISTS);

        Parser::new_ext(text, opts)
    }

    // Guard the memory layout of the hot tree node type: a size regression
    // here directly affects parser performance.
    #[test]
    #[cfg(target_pointer_width = "64")]
    fn node_size() {
        let node_size = std::mem::size_of::<Node<Item>>();
        assert_eq!(48, node_size);
    }

    // Same layout guard for the item body enum.
    #[test]
    #[cfg(target_pointer_width = "64")]
    fn body_size() {
        let body_size = std::mem::size_of::<ItemBody>();
        assert_eq!(16, body_size);
    }

    #[test]
    fn single_open_fish_bracket() {
        // don't crash
        assert_eq!(3, Parser::new("<").count());
    }

    #[test]
    fn lone_hashtag() {
        // don't crash
        assert_eq!(2, Parser::new("#").count());
    }

    #[test]
    fn lots_of_backslashes() {
        // don't crash
        Parser::new("\\\\\r\r").count();
        Parser::new("\\\r\r\\.\\\\\r\r\\.\\").count();
    }

    // The issue_* tests below are fuzzer-derived regression inputs; each one
    // previously triggered a panic. Only successful termination is checked.
    #[test]
    fn issue_320() {
        // don't crash
        parser_with_extensions(":\r\t> |\r:\r\t> |\r").count();
    }

    #[test]
    fn issue_319() {
        // don't crash
        parser_with_extensions("|\r-]([^|\r-]([^").count();
        parser_with_extensions("|\r\r=][^|\r\r=][^car").count();
    }

    #[test]
    fn issue_303() {
        // don't crash
        parser_with_extensions("[^\r\ra]").count();
        parser_with_extensions("\r\r]Z[^\x00\r\r]Z[^\x00").count();
    }

    #[test]
    fn issue_313() {
        // don't crash
        parser_with_extensions("*]0[^\r\r*]0[^").count();
        parser_with_extensions("[^\r> `][^\r> `][^\r> `][").count();
    }

    #[test]
    fn issue_311() {
        // don't crash
        parser_with_extensions("\\\u{0d}-\u{09}\\\u{0d}-\u{09}").count();
    }

    #[test]
    fn issue_283() {
        let input = std::str::from_utf8(b"\xf0\x9b\xb2\x9f<td:^\xf0\x9b\xb2\x9f").unwrap();
        // don't crash
        parser_with_extensions(input).count();
    }

    #[test]
    fn issue_289() {
        // don't crash
        parser_with_extensions("> - \\\n> - ").count();
        parser_with_extensions("- \n\n").count();
    }

    #[test]
    fn issue_306() {
        // don't crash
        parser_with_extensions("*\r_<__*\r_<__*\r_<__*\r_<__").count();
    }

    #[test]
    fn issue_305() {
        // don't crash
        parser_with_extensions("_6**6*_*").count();
    }

    #[test]
    fn another_emphasis_panic() {
        parser_with_extensions("*__#_#__*").count();
    }

    // Each event's source range must cover exactly the text it came from:
    // paragraph, emphasis start, inner text, emphasis end, trailing text,
    // paragraph end.
    #[test]
    fn offset_iter() {
        let event_offsets: Vec<_> = Parser::new("*hello* world")
            .into_offset_iter()
            .map(|(_ev, range)| range)
            .collect();
        let expected_offsets = vec![(0..13), (0..7), (1..6), (0..7), (7..13), (0..13)];
        assert_eq!(expected_offsets, event_offsets);
    }

    // The range of a reference link covers the full `[text][label]` span,
    // not the definition it resolves to.
    #[test]
    fn reference_link_offsets() {
        let range =
            Parser::new("# H1\n[testing][Some reference]\n\n[Some reference]: https://github.com")
                .into_offset_iter()
                .filter_map(|(ev, range)| match ev {
                    Event::Start(Tag::Link(LinkType::Reference, ..), ..) => Some(range),
                    _ => None,
                })
                .next()
                .unwrap();
        assert_eq!(5..30, range);
    }

    // A footnote reference's range covers the `[^1]` marker only.
    #[test]
    fn footnote_offsets() {
        let range = parser_with_extensions("Testing this[^1] out.\n\n[^1]: Footnote.")
            .into_offset_iter()
            .filter_map(|(ev, range)| match ev {
                Event::FootnoteReference(..) => Some(range),
                _ => None,
            })
            .next()
            .unwrap();
        assert_eq!(12..16, range);
    }

    // The table's range (4th event, after the leading paragraph) covers the
    // whole table, starting after "a\n\n".
    #[test]
    fn table_offset() {
        let markdown = "a\n\nTesting|This|Outtt\n--|:--:|--:\nSome Data|Other data|asdf";
        let event_offset = parser_with_extensions(markdown)
            .into_offset_iter()
            .map(|(_ev, range)| range)
            .nth(3)
            .unwrap();
        let expected_offset = 3..59;
        assert_eq!(expected_offset, event_offset);
    }

    // Regression: ranges around an inline link inside a paragraph.
    #[test]
    fn offset_iter_issue_378() {
        let event_offsets: Vec<_> = Parser::new("a [b](c) d")
            .into_offset_iter()
            .map(|(_ev, range)| range)
            .collect();
        let expected_offsets = vec![(0..10), (0..2), (2..8), (3..4), (2..8), (8..10), (0..10)];
        assert_eq!(expected_offsets, event_offsets);
    }

    // Regression: an empty ATX heading still spans its full line including
    // the trailing newline.
    #[test]
    fn offset_iter_issue_404() {
        let event_offsets: Vec<_> = Parser::new("###\n")
            .into_offset_iter()
            .map(|(_ev, range)| range)
            .collect();
        let expected_offsets = vec![(0..4), (0..4)];
        assert_eq!(expected_offsets, event_offsets);
    }

    // FIXME: move this one to the regression suite
    #[test]
    fn link_def_at_eof() {
        let test_str = "[My site][world]\n\n[world]: https://vincentprouillet.com";
        let expected = "<p><a href=\"https://vincentprouillet.com\">My site</a></p>\n";

        let mut buf = String::new();
        crate::html::push_html(&mut buf, Parser::new(test_str));
        assert_eq!(expected, buf);
    }

    // Without ENABLE_FOOTNOTES, `[^a]` must be treated as a plain reference
    // link, not a footnote.
    #[test]
    fn no_footnote_refs_without_option() {
        let test_str = "a [^a]\n\n[^a]: yolo";
        let expected = "<p>a <a href=\"yolo\">^a</a></p>\n";

        let mut buf = String::new();
        crate::html::push_html(&mut buf, Parser::new(test_str));
        assert_eq!(expected, buf);
    }

    // A reference definition whose destination is cut off by EOF produces
    // no output (and must not panic).
    #[test]
    fn ref_def_at_eof() {
        let test_str = "[test]:\\";
        let expected = "";

        let mut buf = String::new();
        crate::html::push_html(&mut buf, Parser::new(test_str));
        assert_eq!(expected, buf);
    }

    // A CRLF line ending after the destination must not be absorbed into
    // the resolved URL.
    #[test]
    fn ref_def_cr_lf() {
        let test_str = "[a]: /u\r\n\n[a]";
        let expected = "<p><a href=\"/u\">a</a></p>\n";

        let mut buf = String::new();
        crate::html::push_html(&mut buf, Parser::new(test_str));
        assert_eq!(expected, buf);
    }

    // `[a]:` with no destination is not a valid definition; it renders as
    // literal paragraph text.
    #[test]
    fn no_dest_refdef() {
        let test_str = "[a]:";
        let expected = "<p>[a]:</p>\n";

        let mut buf = String::new();
        crate::html::push_html(&mut buf, Parser::new(test_str));
        assert_eq!(expected, buf);
    }

    // The broken-link callback must fire exactly once per broken reference,
    // even when the same link could be re-scanned (e.g. trailing `[]`).
    #[test]
    fn broken_links_called_only_once() {
        for &(markdown, expected) in &[
            ("See also [`g()`][crate::g].", 1),
            ("See also [`g()`][crate::g][].", 1),
            ("[brokenlink1] some other node [brokenlink2]", 2),
        ] {
            let mut times_called = 0;
            let callback = &mut |_broken_link: BrokenLink| {
                times_called += 1;
                None
            };
            let parser =
                Parser::new_with_broken_link_callback(markdown, Options::empty(), Some(callback));
            for _ in parser {}
            assert_eq!(times_called, expected);
        }
    }

    // A callback can resolve a broken reference: the returned (url, title)
    // pair is used and the link type becomes ReferenceUnknown.
    #[test]
    fn simple_broken_link_callback() {
        let test_str = "This is a link w/o def: [hello][world]";
        let mut callback = |broken_link: BrokenLink| {
            assert_eq!("world", broken_link.reference);
            assert_eq!(&test_str[broken_link.span], "[hello][world]");
            let url = "YOLO".into();
            let title = "SWAG".to_owned().into();
            Some((url, title))
        };
        let parser =
            Parser::new_with_broken_link_callback(test_str, Options::empty(), Some(&mut callback));
        let mut link_tag_count = 0;
        for (typ, url, title) in parser.filter_map(|event| match event {
            Event::Start(tag) | Event::End(tag) => match tag {
                Tag::Link(typ, url, title) => Some((typ, url, title)),
                _ => None,
            },
            _ => None,
        }) {
            link_tag_count += 1;
            assert_eq!(typ, LinkType::ReferenceUnknown);
            assert_eq!(url.as_ref(), "YOLO");
            assert_eq!(title.as_ref(), "SWAG");
        }
        assert!(link_tag_count > 0);
    }

    // A fenced code block carries its info string ("test") in the
    // CodeBlockKind.
    #[test]
    fn code_block_kind_check_fenced() {
        let parser = Parser::new("hello\n```test\ntadam\n```");
        let mut found = 0;
        for (ev, _range) in parser.into_offset_iter() {
            match ev {
                Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(syntax))) => {
                    assert_eq!(syntax.as_ref(), "test");
                    found += 1;
                }
                _ => {}
            }
        }
        assert_eq!(found, 1);
    }

    // A 4-space-indented block is an indented code block even if its content
    // looks like a fence.
    #[test]
    fn code_block_kind_check_indented() {
        let parser = Parser::new("hello\n\n    ```test\n    tadam\nhello");
        let mut found = 0;
        for (ev, _range) in parser.into_offset_iter() {
            match ev {
                Event::Start(Tag::CodeBlock(CodeBlockKind::Indented)) => {
                    found += 1;
                }
                _ => {}
            }
        }
        assert_eq!(found, 1);
    }
}
3375