1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 //! Scanners for fragments of CommonMark syntax
22 
23 use std::char;
24 use std::convert::TryInto;
25 
26 use crate::entities;
27 use crate::parse::{Alignment, HtmlScanGuard, LinkType};
28 pub use crate::puncttable::{is_ascii_punctuation, is_punctuation};
29 use crate::strings::CowStr;
30 
31 use memchr::memchr;
32 
// Lowercase HTML tag names, kept in ascending byte order so that lookups can
// use binary search. Any tag added here must be inserted at its sorted
// position, and the array length in the type must be updated to match.
// NOTE(review): presumably the tag names that open a CommonMark HTML block —
// confirm against the code that searches this table.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98 
/// Analysis of the beginning of a line, including indentation and container
/// markers.
///
/// The scanner keeps a byte cursor into the line plus enough state to treat a
/// tab as a run of up to four columns that can be consumed one at a time.
#[derive(Clone)]
pub struct LineStart<'a> {
    // The bytes being scanned.
    bytes: &'a [u8],
    // Byte index just past the most recently consumed tab; the width of the
    // next tab is computed from the distance to this position.
    tab_start: usize,
    // Current scan position, as a byte index into `bytes`.
    ix: usize,
    // Columns left over from a tab whose byte has already been consumed but
    // whose width was only partially used by a space scan.
    spaces_remaining: usize,
    // no thematic breaks can occur before this offset.
    // this prevents scanning over and over up to a certain point
    min_hrule_offset: usize,
}
111 
112 impl<'a> LineStart<'a> {
new(bytes: &[u8]) -> LineStart113     pub(crate) fn new(bytes: &[u8]) -> LineStart {
114         LineStart {
115             bytes,
116             tab_start: 0,
117             ix: 0,
118             spaces_remaining: 0,
119             min_hrule_offset: 0,
120         }
121     }
122 
123     /// Try to scan a number of spaces.
124     ///
125     /// Returns true if all spaces were consumed.
126     ///
127     /// Note: consumes some spaces even if not successful.
scan_space(&mut self, n_space: usize) -> bool128     pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
129         self.scan_space_inner(n_space) == 0
130     }
131 
132     /// Scan a number of spaces up to a maximum.
133     ///
134     /// Returns number of spaces scanned.
scan_space_upto(&mut self, n_space: usize) -> usize135     pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
136         n_space - self.scan_space_inner(n_space)
137     }
138 
139     /// Returns unused remainder of spaces.
scan_space_inner(&mut self, mut n_space: usize) -> usize140     fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
141         let n_from_remaining = self.spaces_remaining.min(n_space);
142         self.spaces_remaining -= n_from_remaining;
143         n_space -= n_from_remaining;
144         while n_space > 0 && self.ix < self.bytes.len() {
145             match self.bytes[self.ix] {
146                 b' ' => {
147                     self.ix += 1;
148                     n_space -= 1;
149                 }
150                 b'\t' => {
151                     let spaces = 4 - (self.ix - self.tab_start) % 4;
152                     self.ix += 1;
153                     self.tab_start = self.ix;
154                     let n = spaces.min(n_space);
155                     n_space -= n;
156                     self.spaces_remaining = spaces - n;
157                 }
158                 _ => break,
159             }
160         }
161         n_space
162     }
163 
164     /// Scan all available ASCII whitespace (not including eol).
scan_all_space(&mut self)165     pub(crate) fn scan_all_space(&mut self) {
166         self.spaces_remaining = 0;
167         self.ix += self.bytes[self.ix..]
168             .iter()
169             .take_while(|&&b| b == b' ' || b == b'\t')
170             .count();
171     }
172 
173     /// Determine whether we're at end of line (includes end of file).
is_at_eol(&self) -> bool174     pub(crate) fn is_at_eol(&self) -> bool {
175         self.bytes
176             .get(self.ix)
177             .map(|&c| c == b'\r' || c == b'\n')
178             .unwrap_or(true)
179     }
180 
scan_ch(&mut self, c: u8) -> bool181     fn scan_ch(&mut self, c: u8) -> bool {
182         if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
183             self.ix += 1;
184             true
185         } else {
186             false
187         }
188     }
189 
scan_blockquote_marker(&mut self) -> bool190     pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
191         let save = self.clone();
192         let _ = self.scan_space(3);
193         if self.scan_ch(b'>') {
194             let _ = self.scan_space(1);
195             true
196         } else {
197             *self = save;
198             false
199         }
200     }
201 
202     /// Scan a list marker.
203     ///
204     /// Return value is the character, the start index, and the indent in spaces.
205     /// For ordered list markers, the character will be one of b'.' or b')'. For
206     /// bullet list markers, it will be one of b'-', b'+', or b'*'.
scan_list_marker(&mut self) -> Option<(u8, u64, usize)>207     pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
208         let save = self.clone();
209         let indent = self.scan_space_upto(3);
210         if self.ix < self.bytes.len() {
211             let c = self.bytes[self.ix];
212             if c == b'-' || c == b'+' || c == b'*' {
213                 if self.ix >= self.min_hrule_offset {
214                     // there could be an hrule here
215                     if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
216                         self.min_hrule_offset = min_offset;
217                     } else {
218                         *self = save;
219                         return None;
220                     }
221                 }
222                 self.ix += 1;
223                 if self.scan_space(1) || self.is_at_eol() {
224                     return self.finish_list_marker(c, 0, indent + 2);
225                 }
226             } else if c >= b'0' && c <= b'9' {
227                 let start_ix = self.ix;
228                 let mut ix = self.ix + 1;
229                 let mut val = u64::from(c - b'0');
230                 while ix < self.bytes.len() && ix - start_ix < 10 {
231                     let c = self.bytes[ix];
232                     ix += 1;
233                     if c >= b'0' && c <= b'9' {
234                         val = val * 10 + u64::from(c - b'0');
235                     } else if c == b')' || c == b'.' {
236                         self.ix = ix;
237                         if self.scan_space(1) || self.is_at_eol() {
238                             return self.finish_list_marker(c, val, indent + self.ix - start_ix);
239                         } else {
240                             break;
241                         }
242                     } else {
243                         break;
244                     }
245                 }
246             }
247         }
248         *self = save;
249         None
250     }
251 
finish_list_marker( &mut self, c: u8, start: u64, mut indent: usize, ) -> Option<(u8, u64, usize)>252     fn finish_list_marker(
253         &mut self,
254         c: u8,
255         start: u64,
256         mut indent: usize,
257     ) -> Option<(u8, u64, usize)> {
258         let save = self.clone();
259 
260         // skip the rest of the line if it's blank
261         if scan_blank_line(&self.bytes[self.ix..]).is_some() {
262             return Some((c, start, indent));
263         }
264 
265         let post_indent = self.scan_space_upto(4);
266         if post_indent < 4 {
267             indent += post_indent;
268         } else {
269             *self = save;
270         }
271         Some((c, start, indent))
272     }
273 
274     /// Returns Some(is_checked) when a task list marker was found. Resets itself
275     /// to original state otherwise.
scan_task_list_marker(&mut self) -> Option<bool>276     pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
277         let save = self.clone();
278         self.scan_space_upto(3);
279 
280         if !self.scan_ch(b'[') {
281             *self = save;
282             return None;
283         }
284         let is_checked = match self.bytes.get(self.ix) {
285             Some(&c) if is_ascii_whitespace_no_nl(c) => {
286                 self.ix += 1;
287                 false
288             }
289             Some(b'x') | Some(b'X') => {
290                 self.ix += 1;
291                 true
292             }
293             _ => {
294                 *self = save;
295                 return None;
296             }
297         };
298         if !self.scan_ch(b']') {
299             *self = save;
300             return None;
301         }
302         if !self
303             .bytes
304             .get(self.ix)
305             .map(|&b| is_ascii_whitespace_no_nl(b))
306             .unwrap_or(false)
307         {
308             *self = save;
309             return None;
310         }
311         Some(is_checked)
312     }
313 
bytes_scanned(&self) -> usize314     pub(crate) fn bytes_scanned(&self) -> usize {
315         self.ix
316     }
317 
remaining_space(&self) -> usize318     pub(crate) fn remaining_space(&self) -> usize {
319         self.spaces_remaining
320     }
321 }
322 
/// Returns true for the bytes 0x09 through 0x0d (tab, line feed, vertical
/// tab, form feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        0x09..=0x0d | b' ' => true,
        _ => false,
    }
}
326 
/// Returns true for tab, vertical tab, form feed and space. Line feed and
/// carriage return are not accepted.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    match c {
        b'\t' | 0x0b | 0x0c | b' ' => true,
        _ => false,
    }
}
330 
/// Returns true for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
337 
/// Returns true for ASCII digits and the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
344 
is_ascii_letterdigitdash(c: u8) -> bool345 fn is_ascii_letterdigitdash(c: u8) -> bool {
346     c == b'-' || is_ascii_alphanumeric(c)
347 }
348 
/// Returns true for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
352 
/// Returns false for the bytes that end an unquoted attribute value: quotes,
/// space, `=`, `>`, `<`, backtick, line feed and carriage return. Every other
/// byte is accepted.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const TERMINATORS: &[u8] = b"'\" =><`\n\r";
    !TERMINATORS.contains(&c)
}
359 
/// Scan a single character: returns 1 when `data` starts with the byte `c`,
/// and 0 otherwise (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
368 
/// Returns the length of the longest prefix of `data` whose bytes all satisfy
/// `f`. The predicate is not called again after it first returns false.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
375 
/// Returns the length of the longest suffix of `data` whose bytes all satisfy
/// `f`. Bytes are tested from the end of the slice backwards.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
382 
scan_ch_repeat(data: &[u8], c: u8) -> usize383 pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
384     scan_while(data, |x| x == c)
385 }
386 
387 // Note: this scans ASCII whitespace only, for Unicode whitespace use
388 // a different function.
scan_whitespace_no_nl(data: &[u8]) -> usize389 pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
390     scan_while(data, is_ascii_whitespace_no_nl)
391 }
392 
scan_attr_value_chars(data: &[u8]) -> usize393 fn scan_attr_value_chars(data: &[u8]) -> usize {
394     scan_while(data, is_valid_unquoted_attr_value_char)
395 }
396 
/// Scans a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input, `Some(1)` for `\n` or a lone `\r`,
/// `Some(2)` for `\r\n`, and `None` when `bytes` starts with anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match (bytes.first(), bytes.get(1)) {
        (None, _) => Some(0),
        (Some(b'\n'), _) => Some(1),
        (Some(b'\r'), Some(b'\n')) => Some(2),
        (Some(b'\r'), _) => Some(1),
        _ => None,
    }
}
407 
scan_blank_line(bytes: &[u8]) -> Option<usize>408 pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
409     let i = scan_whitespace_no_nl(bytes);
410     scan_eol(&bytes[i..]).map(|n| i + n)
411 }
412 
scan_nextline(bytes: &[u8]) -> usize413 pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
414     memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
415 }
416 
417 // return: end byte for closing code fence, or None
418 // if the line is not a closing code fence
scan_closing_code_fence( bytes: &[u8], fence_char: u8, n_fence_char: usize, ) -> Option<usize>419 pub(crate) fn scan_closing_code_fence(
420     bytes: &[u8],
421     fence_char: u8,
422     n_fence_char: usize,
423 ) -> Option<usize> {
424     if bytes.is_empty() {
425         return Some(0);
426     }
427     let mut i = 0;
428     let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
429     if num_fence_chars_found < n_fence_char {
430         return None;
431     }
432     i += num_fence_chars_found;
433     let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
434     i += num_trailing_spaces;
435     scan_eol(&bytes[i..]).map(|_| i)
436 }
437 
/// Measures the leading indentation of `text`.
///
/// The returned pair is (number of bytes, number of spaces): how many bytes
/// of whitespace were consumed, and how many columns they amount to. A tab
/// advances to the next multiple of four columns.
///
/// Scanning stops at the first byte that is neither a space nor a tab, as
/// soon as a space brings the column count to exactly `max`, or before a tab
/// that would take the column count past `max` (such a tab is not consumed).
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut spaces = 0;
    let mut offset = 0;

    for (i, &b) in text.iter().enumerate() {
        match b {
            b' ' => {
                spaces += 1;
                if spaces == max {
                    // This space was consumed, so it counts towards the bytes.
                    offset = i + 1;
                    break;
                }
            }
            b'\t' => {
                let new_spaces = spaces + 4 - (spaces & 3);
                if new_spaces > max {
                    break;
                }
                spaces = new_spaces;
            }
            _ => break,
        }
        // `i` is the index of the byte just consumed; the byte count is one
        // more than that. (Recording `i` itself under-reported by one.)
        offset = i + 1;
    }

    (offset, spaces)
}
465 
466 /// Scan hrule opening sequence.
467 ///
468 /// Returns Ok(x) when it finds an hrule, where x is the
469 /// size of line containing the hrule, including the trailing newline.
470 ///
471 /// Returns Err(x) when it does not find an hrule and x is
472 /// the offset in data before no hrule can appear.
scan_hrule(bytes: &[u8]) -> Result<usize, usize>473 pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
474     if bytes.len() < 3 {
475         return Err(0);
476     }
477     let c = bytes[0];
478     if !(c == b'*' || c == b'-' || c == b'_') {
479         return Err(0);
480     }
481     let mut n = 0;
482     let mut i = 0;
483 
484     while i < bytes.len() {
485         match bytes[i] {
486             b'\n' | b'\r' => {
487                 i += scan_eol(&bytes[i..]).unwrap_or(0);
488                 break;
489             }
490             c2 if c2 == c => {
491                 n += 1;
492             }
493             b' ' | b'\t' => (),
494             _ => return Err(i),
495         }
496         i += 1;
497     }
498     if n >= 3 {
499         Ok(i)
500     } else {
501         Err(i)
502     }
503 }
504 
505 /// Scan an ATX heading opening sequence.
506 ///
507 /// Returns number of bytes in prefix and level.
scan_atx_heading(data: &[u8]) -> Option<usize>508 pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
509     let level = scan_ch_repeat(data, b'#');
510     if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
511         Some(level)
512     } else {
513         None
514     }
515 }
516 
517 /// Scan a setext heading underline.
518 ///
519 /// Returns number of bytes in line (including trailing newline) and level.
scan_setext_heading(data: &[u8]) -> Option<(usize, u32)>520 pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
521     let c = *data.get(0)?;
522     if !(c == b'-' || c == b'=') {
523         return None;
524     }
525     let mut i = 1 + scan_ch_repeat(&data[1..], c);
526     i += scan_blank_line(&data[i..])?;
527     let level = if c == b'=' { 1 } else { 2 };
528     Some((i, level))
529 }
530 
531 // returns number of bytes in line (including trailing
532 // newline) and column alignments
scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>)533 pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
534     let (mut i, spaces) = calc_indent(data, 4);
535     if spaces > 3 || i == data.len() {
536         return (0, vec![]);
537     }
538     let mut cols = vec![];
539     let mut active_col = Alignment::None;
540     let mut start_col = true;
541     if data[i] == b'|' {
542         i += 1;
543     }
544     for c in &data[i..] {
545         if let Some(n) = scan_eol(&data[i..]) {
546             i += n;
547             break;
548         }
549         match *c {
550             b' ' => (),
551             b':' => {
552                 active_col = match (start_col, active_col) {
553                     (true, Alignment::None) => Alignment::Left,
554                     (false, Alignment::Left) => Alignment::Center,
555                     (false, Alignment::None) => Alignment::Right,
556                     _ => active_col,
557                 };
558                 start_col = false;
559             }
560             b'-' => {
561                 start_col = false;
562             }
563             b'|' => {
564                 start_col = true;
565                 cols.push(active_col);
566                 active_col = Alignment::None;
567             }
568             _ => {
569                 cols = vec![];
570                 start_col = true;
571                 break;
572             }
573         }
574         i += 1;
575     }
576 
577     if !start_col {
578         cols.push(active_col);
579     }
580 
581     (i, cols)
582 }
583 
584 /// Scan code fence.
585 ///
586 /// Returns number of bytes scanned and the char that is repeated to make the code fence.
scan_code_fence(data: &[u8]) -> Option<(usize, u8)>587 pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
588     let c = *data.get(0)?;
589     if !(c == b'`' || c == b'~') {
590         return None;
591     }
592     let i = 1 + scan_ch_repeat(&data[1..], c);
593     if i >= 3 {
594         if c == b'`' {
595             let suffix = &data[i..];
596             let next_line = i + scan_nextline(suffix);
597             // FIXME: make sure this is correct
598             if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
599                 return None;
600             }
601         }
602         Some((i, c))
603     } else {
604         None
605     }
606 }
607 
/// Returns `Some(2)` when `data` begins with `>` followed by a space, and
/// `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    if data.len() < 2 || data[0] != b'>' || data[1] != b' ' {
        return None;
    }
    Some(2)
}
615 
616 /// This already assumes the list item has been scanned.
scan_empty_list(data: &[u8]) -> bool617 pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
618     let mut ix = 0;
619     for _ in 0..2 {
620         if let Some(bytes) = scan_blank_line(&data[ix..]) {
621             ix += bytes;
622         } else {
623             return false;
624         }
625     }
626     true
627 }
628 
629 // return number of bytes scanned, delimiter, start index, and indent
scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)>630 pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
631     let mut c = *bytes.get(0)?;
632     let (w, start) = match c {
633         b'-' | b'+' | b'*' => (1, 0),
634         b'0'..=b'9' => {
635             let (length, start) = parse_decimal(bytes);
636             c = *bytes.get(length)?;
637             if !(c == b'.' || c == b')') {
638                 return None;
639             }
640             (length + 1, start)
641         }
642         _ => {
643             return None;
644         }
645     };
646     // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
647     let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
648     if postindent == 0 {
649         scan_eol(&bytes[w..])?;
650         postindent += 1;
651     } else if postindent > 4 {
652         postn = 1;
653         postindent = 1;
654     }
655     if scan_blank_line(&bytes[w..]).is_some() {
656         postn = 0;
657         postindent = 1;
658     }
659     Some((w + postn, c, start, w + postindent))
660 }
661 
662 // returns (number of bytes, parsed decimal)
parse_decimal(bytes: &[u8]) -> (usize, usize)663 fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
664     match bytes
665         .iter()
666         .take_while(|&&b| is_digit(b))
667         .try_fold((0, 0usize), |(count, acc), c| {
668             let digit = usize::from(c - b'0');
669             match acc
670                 .checked_mul(10)
671                 .and_then(|ten_acc| ten_acc.checked_add(digit))
672             {
673                 Some(number) => Ok((count + 1, number)),
674                 // stop early on overflow
675                 None => Err((count, acc)),
676             }
677         }) {
678         Ok(p) | Err(p) => p,
679     }
680 }
681 
/// Parses the hexadecimal digits (either case) at the start of `bytes`.
///
/// Returns (number of bytes, parsed hex). Parsing stops at the first byte
/// that is not a hex digit, or early at the digit that would overflow
/// `usize`; in the latter case the count and value accumulated before that
/// digit are returned.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &b in bytes {
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        let next = value
            .checked_mul(16)
            .and_then(|shifted| shifted.checked_add(usize::from(digit)));
        match next {
            Some(number) => {
                value = number;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (count, value)
}
709 
/// Converts a numeric code point into a `char`.
///
/// Zero is mapped to U+FFFD (the replacement character). Returns `None` when
/// `input` does not fit in a `u32` or is not a valid Unicode scalar value.
fn char_from_codepoint(input: usize) -> Option<char> {
    let codepoint: u32 = input.try_into().ok()?;
    match codepoint {
        0 => char::from_u32(0xFFFD),
        other => char::from_u32(other),
    }
}
717 
718 // doesn't bother to check data[0] == '&'
scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>)719 pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
720     let mut end = 1;
721     if scan_ch(&bytes[end..], b'#') == 1 {
722         end += 1;
723         let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
724             end += 1;
725             parse_hex(&bytes[end..])
726         } else {
727             parse_decimal(&bytes[end..])
728         };
729         end += bytecount;
730         return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
731             (0, None)
732         } else if let Some(c) = char_from_codepoint(codepoint) {
733             (end + 1, Some(c.into()))
734         } else {
735             (0, None)
736         };
737     }
738     end += scan_while(&bytes[end..], is_ascii_alphanumeric);
739     if scan_ch(&bytes[end..], b';') == 1 {
740         if let Some(value) = entities::get_entity(&bytes[1..end]) {
741             return (end + 1, Some(value.into()));
742         }
743     }
744     (0, None)
745 }
746 
// FIXME: we can most likely re-use other scanners
/// Scans the title of a link reference definition.
///
/// `text` must start with the opening delimiter: `'`, `"` or `(` (closed by
/// `)`). A backslash escapes the character that follows it. The title may
/// span several lines but must not contain a blank line.
///
/// Returns (bytelength, title_str), where the byte length includes both
/// delimiters and the title is the raw text between them. Returns `None` when
/// `text` does not start with a delimiter, the title is not closed, or a
/// blank line is found inside it.
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    // Character-level counterpart of `is_ascii_whitespace_no_nl`. Comparing
    // chars directly matters: casting a `char` to `u8` truncates it, so a
    // non-ASCII character such as U+0120 would be mistaken for a space.
    fn is_blank(c: char) -> bool {
        c == ' ' || c == '\t' || c == '\x0b' || c == '\x0c'
    }

    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                // Skip the leading whitespace of the next line; running out
                // of input here means the title is never closed.
                let mut next = *chars.peek()?;
                while is_blank(next) {
                    bytecount += chars.next()?.len_utf8();
                    next = *chars.peek()?;
                }
                if next == '\n' {
                    // blank line - not allowed
                    return None;
                }
            }
            '\\' => {
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
787 
788 // note: dest returned is raw, still needs to be unescaped
789 // TODO: check that nested parens are really not allowed for refdefs
790 // TODO(performance): this func should probably its own unescaping
scan_link_dest( data: &str, start_ix: usize, max_next: usize, ) -> Option<(usize, &str)>791 pub(crate) fn scan_link_dest(
792     data: &str,
793     start_ix: usize,
794     max_next: usize,
795 ) -> Option<(usize, &str)> {
796     let bytes = &data.as_bytes()[start_ix..];
797     let mut i = scan_ch(bytes, b'<');
798 
799     if i != 0 {
800         // pointy links
801         while i < bytes.len() {
802             match bytes[i] {
803                 b'\n' | b'\r' | b'<' => return None,
804                 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
805                 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
806                     i += 1;
807                 }
808                 _ => {}
809             }
810             i += 1;
811         }
812         None
813     } else {
814         // non-pointy links
815         let mut nest = 0;
816         while i < bytes.len() {
817             match bytes[i] {
818                 0x0..=0x20 => {
819                     break;
820                 }
821                 b'(' => {
822                     if nest > max_next {
823                         return None;
824                     }
825                     nest += 1;
826                 }
827                 b')' => {
828                     if nest == 0 {
829                         break;
830                     }
831                     nest -= 1;
832                 }
833                 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
834                     i += 1;
835                 }
836                 _ => {}
837             }
838             i += 1;
839         }
840         Some((i, &data[start_ix..(start_ix + i)]))
841     }
842 }
843 
844 /// Returns bytes scanned
scan_attribute_name(data: &[u8]) -> Option<usize>845 fn scan_attribute_name(data: &[u8]) -> Option<usize> {
846     let (&c, tail) = data.split_first()?;
847     if is_ascii_alpha(c) || c == b'_' || c == b':' {
848         Some(
849             1 + scan_while(tail, |c| {
850                 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
851             }),
852         )
853     } else {
854         None
855     }
856 }
857 
858 /// Returns the index immediately following the attribute on success.
859 /// The argument `buffer_ix` refers to the index into `data` from which we
860 /// should copy into `buffer` when we find bytes to skip.
scan_attribute( data: &[u8], mut ix: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>861 fn scan_attribute(
862     data: &[u8],
863     mut ix: usize,
864     newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
865     buffer: &mut Vec<u8>,
866     buffer_ix: &mut usize,
867 ) -> Option<usize> {
868     ix += scan_attribute_name(&data[ix..])?;
869     let n_whitespace =
870         scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)? - ix;
871     ix += n_whitespace;
872     if scan_ch(&data[ix..], b'=') == 1 {
873         ix += 1;
874         ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
875         ix = scan_attribute_value(&data, ix, newline_handler, buffer, buffer_ix)?;
876     } else if n_whitespace > 0 {
877         // Leave whitespace for next attribute.
878         ix -= 1;
879     }
880     Some(ix)
881 }
882 
883 /// Scans whitespace and possibly newlines according to the
884 /// behavior defined by the newline handler. When bytes are skipped,
885 /// all preceeding non-skipped bytes are pushed to the buffer.
scan_whitespace_with_newline_handler( data: &[u8], mut i: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>886 fn scan_whitespace_with_newline_handler(
887     data: &[u8],
888     mut i: usize,
889     newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
890     buffer: &mut Vec<u8>,
891     buffer_ix: &mut usize,
892 ) -> Option<usize> {
893     while i < data.len() {
894         if !is_ascii_whitespace(data[i]) {
895             return Some(i);
896         }
897         if let Some(eol_bytes) = scan_eol(&data[i..]) {
898             let handler = newline_handler?;
899             i += eol_bytes;
900             let skipped_bytes = handler(&data[i..]);
901 
902             if skipped_bytes > 0 {
903                 buffer.extend(&data[*buffer_ix..i]);
904                 *buffer_ix = i + skipped_bytes;
905             }
906 
907             i += skipped_bytes;
908         } else {
909             i += 1;
910         }
911     }
912 
913     Some(i)
914 }
915 
916 /// Returns the index immediately following the attribute value on success.
scan_attribute_value( data: &[u8], mut i: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>917 fn scan_attribute_value(
918     data: &[u8],
919     mut i: usize,
920     newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
921     buffer: &mut Vec<u8>,
922     buffer_ix: &mut usize,
923 ) -> Option<usize> {
924     match *data.get(i)? {
925         b @ b'"' | b @ b'\'' => {
926             i += 1;
927             while i < data.len() {
928                 if data[i] == b {
929                     return Some(i + 1);
930                 }
931                 if let Some(eol_bytes) = scan_eol(&data[i..]) {
932                     let handler = newline_handler?;
933                     i += eol_bytes;
934                     let skipped_bytes = handler(&data[i..]);
935 
936                     if skipped_bytes > 0 {
937                         buffer.extend(&data[*buffer_ix..i]);
938                         *buffer_ix = i + skipped_bytes;
939                     }
940                     i += skipped_bytes;
941                 } else {
942                     i += 1;
943                 }
944             }
945             return None;
946         }
947         b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
948             return None;
949         }
950         _ => {
951             // unquoted attribute value
952             i += scan_attr_value_chars(&data[i..]);
953         }
954     }
955 
956     Some(i)
957 }
958 
959 // Remove backslash escapes and resolve entities
unescape(input: &str) -> CowStr<'_>960 pub(crate) fn unescape(input: &str) -> CowStr<'_> {
961     let mut result = String::new();
962     let mut mark = 0;
963     let mut i = 0;
964     let bytes = input.as_bytes();
965     while i < bytes.len() {
966         match bytes[i] {
967             b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
968                 result.push_str(&input[mark..i]);
969                 mark = i + 1;
970                 i += 2;
971             }
972             b'&' => match scan_entity(&bytes[i..]) {
973                 (n, Some(value)) => {
974                     result.push_str(&input[mark..i]);
975                     result.push_str(&value);
976                     i += n;
977                     mark = i;
978                 }
979                 _ => i += 1,
980             },
981             b'\r' => {
982                 result.push_str(&input[mark..i]);
983                 i += 1;
984                 mark = i;
985             }
986             _ => i += 1,
987         }
988     }
989     if mark == 0 {
990         input.into()
991     } else {
992         result.push_str(&input[mark..]);
993         result.into()
994     }
995 }
996 
997 /// Assumes `data` is preceded by `<`.
scan_html_block_tag(data: &[u8]) -> (usize, &[u8])998 pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
999     let i = scan_ch(data, b'/');
1000     let n = scan_while(&data[i..], is_ascii_alphanumeric);
1001     // TODO: scan attributes and >
1002     (i + n, &data[i..i + n])
1003 }
1004 
is_html_tag(tag: &[u8]) -> bool1005 pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
1006     HTML_TAGS
1007         .binary_search_by(|probe| {
1008             let probe_bytes_iter = probe.as_bytes().iter();
1009             let tag_bytes_iter = tag.iter();
1010 
1011             probe_bytes_iter
1012                 .zip(tag_bytes_iter)
1013                 .find_map(|(&a, &b)| {
1014                     // We can compare case insensitively because the probes are
1015                     // all lower case alpha strings.
1016                     match a.cmp(&(b | 0x20)) {
1017                         std::cmp::Ordering::Equal => None,
1018                         inequality => Some(inequality),
1019                     }
1020                 })
1021                 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1022         })
1023         .is_ok()
1024 }
1025 
1026 /// Assumes that `data` starts with `<`.
1027 /// Returns the index into data directly after the html tag on success.
scan_html_type_7(data: &[u8]) -> Option<usize>1028 pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1029     // Block type html does not allow for newlines, so we
1030     // do not pass a newline handler.
1031     let (_span, i) = scan_html_block_inner(data, None)?;
1032     scan_blank_line(&data[i..])?;
1033     Some(i)
1034 }
1035 
1036 /// Assumes that `data` starts with `<`.
1037 /// Returns the number of bytes scanned and the html in case of
1038 /// success.
1039 /// When some bytes were skipped, because the html was split over
1040 /// multiple leafs (e.g. over multiple lines in a blockquote),
1041 /// the html is returned as a vector of bytes.
1042 /// If no bytes were skipped, the buffer will be empty.
scan_html_block_inner( data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>, ) -> Option<(Vec<u8>, usize)>1043 pub(crate) fn scan_html_block_inner(
1044     data: &[u8],
1045     newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1046 ) -> Option<(Vec<u8>, usize)> {
1047     let mut buffer = Vec::new();
1048     let mut last_buf_index = 0;
1049 
1050     let close_tag_bytes = scan_ch(&data[1..], b'/');
1051     let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
1052     if l == 0 {
1053         return None;
1054     }
1055     let mut i = 1 + close_tag_bytes + l;
1056     i += scan_while(&data[i..], is_ascii_letterdigitdash);
1057 
1058     if close_tag_bytes == 0 {
1059         loop {
1060             let old_i = i;
1061             loop {
1062                 i += scan_whitespace_no_nl(&data[i..]);
1063                 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1064                     if eol_bytes == 0 {
1065                         return None;
1066                     }
1067                     let handler = newline_handler?;
1068                     i += eol_bytes;
1069                     let skipped_bytes = handler(&data[i..]);
1070 
1071                     if skipped_bytes > 0 {
1072                         buffer.extend(&data[last_buf_index..i]);
1073                         i += skipped_bytes;
1074                         last_buf_index = i;
1075                     }
1076                 } else {
1077                     break;
1078                 }
1079             }
1080             if let Some(b'/') | Some(b'>') = data.get(i) {
1081                 break;
1082             }
1083             if old_i == i {
1084                 // No whitespace, which is mandatory.
1085                 return None;
1086             }
1087             i = scan_attribute(&data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1088         }
1089     }
1090 
1091     i += scan_whitespace_no_nl(&data[i..]);
1092 
1093     if close_tag_bytes == 0 {
1094         i += scan_ch(&data[i..], b'/');
1095     }
1096 
1097     if scan_ch(&data[i..], b'>') == 0 {
1098         None
1099     } else {
1100         i += 1;
1101         if !buffer.is_empty() {
1102             buffer.extend(&data[last_buf_index..i]);
1103         }
1104         Some((buffer, i))
1105     }
1106 }
1107 
1108 /// Returns (next_byte_offset, uri, type)
scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)>1109 pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1110     scan_uri(text, start_ix)
1111         .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1112         .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1113 }
1114 
1115 /// Returns (next_byte_offset, uri)
scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)>1116 fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1117     let bytes = &text.as_bytes()[start_ix..];
1118 
1119     // scheme's first byte must be an ascii letter
1120     if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1121         return None;
1122     }
1123 
1124     let mut i = 1;
1125 
1126     while i < bytes.len() {
1127         let c = bytes[i];
1128         i += 1;
1129         match c {
1130             c if is_ascii_alphanumeric(c) => (),
1131             b'.' | b'-' | b'+' => (),
1132             b':' => break,
1133             _ => return None,
1134         }
1135     }
1136 
1137     // scheme length must be between 2 and 32 characters long. scheme
1138     // must be followed by colon
1139     if i < 3 || i > 33 {
1140         return None;
1141     }
1142 
1143     while i < bytes.len() {
1144         match bytes[i] {
1145             b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1146             b'\0'..=b' ' | b'<' => return None,
1147             _ => (),
1148         }
1149         i += 1;
1150     }
1151 
1152     None
1153 }
1154 
1155 /// Returns (next_byte_offset, email)
scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)>1156 fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1157     // using a regex library would be convenient, but doing it by hand is not too bad
1158     let bytes = &text.as_bytes()[start_ix..];
1159     let mut i = 0;
1160 
1161     while i < bytes.len() {
1162         let c = bytes[i];
1163         i += 1;
1164         match c {
1165             c if is_ascii_alphanumeric(c) => (),
1166             b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1167             | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1168             b'@' => break,
1169             _ => return None,
1170         }
1171     }
1172 
1173     loop {
1174         let label_start_ix = i;
1175         let mut fresh_label = true;
1176 
1177         while i < bytes.len() {
1178             match bytes[i] {
1179                 c if is_ascii_alphanumeric(c) => (),
1180                 b'-' if fresh_label => {
1181                     return None;
1182                 }
1183                 b'-' => (),
1184                 _ => break,
1185             }
1186             fresh_label = false;
1187             i += 1;
1188         }
1189 
1190         if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1191             return None;
1192         }
1193 
1194         if scan_ch(&bytes[i..], b'.') == 0 {
1195             break;
1196         }
1197         i += 1;
1198     }
1199 
1200     if scan_ch(&bytes[i..], b'>') == 0 {
1201         return None;
1202     }
1203 
1204     Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1205 }
1206 
1207 /// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1208 /// Returns byte offset on match.
scan_inline_html_comment( bytes: &[u8], mut ix: usize, scan_guard: &mut HtmlScanGuard, ) -> Option<usize>1209 pub(crate) fn scan_inline_html_comment(
1210     bytes: &[u8],
1211     mut ix: usize,
1212     scan_guard: &mut HtmlScanGuard,
1213 ) -> Option<usize> {
1214     let c = *bytes.get(ix)?;
1215     ix += 1;
1216     match c {
1217         b'-' => {
1218             let dashes = scan_ch_repeat(&bytes[ix..], b'-');
1219             if dashes < 1 {
1220                 return None;
1221             }
1222             // Saw "<!--", scan comment.
1223             ix += dashes;
1224             if scan_ch(&bytes[ix..], b'>') == 1 {
1225                 return None;
1226             }
1227 
1228             while let Some(x) = memchr(b'-', &bytes[ix..]) {
1229                 ix += x + 1;
1230                 if scan_ch(&bytes[ix..], b'-') == 1 {
1231                     ix += 1;
1232                     return if scan_ch(&bytes[ix..], b'>') == 1 {
1233                         Some(ix + 1)
1234                     } else {
1235                         None
1236                     };
1237                 }
1238             }
1239             None
1240         }
1241         b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1242             ix += b"CDATA[".len();
1243             ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1244             let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1245             ix += close_brackets;
1246 
1247             if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1248                 scan_guard.cdata = ix;
1249                 None
1250             } else {
1251                 Some(ix + 1)
1252             }
1253         }
1254         b'A'..=b'Z' if ix > scan_guard.declaration => {
1255             // Scan declaration.
1256             ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
1257             let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
1258             if whitespace == 0 {
1259                 return None;
1260             }
1261             ix += whitespace;
1262             ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1263             if scan_ch(&bytes[ix..], b'>') == 0 {
1264                 scan_guard.declaration = ix;
1265                 None
1266             } else {
1267                 Some(ix + 1)
1268             }
1269         }
1270         _ => None,
1271     }
1272 }
1273 
1274 /// Scan processing directive, with initial "<?" already consumed.
1275 /// Returns the next byte offset on success.
scan_inline_html_processing( bytes: &[u8], mut ix: usize, scan_guard: &mut HtmlScanGuard, ) -> Option<usize>1276 pub(crate) fn scan_inline_html_processing(
1277     bytes: &[u8],
1278     mut ix: usize,
1279     scan_guard: &mut HtmlScanGuard,
1280 ) -> Option<usize> {
1281     if ix <= scan_guard.processing {
1282         return None;
1283     }
1284     while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1285         ix += offset + 1;
1286         if scan_ch(&bytes[ix..], b'>') == 1 {
1287             return Some(ix + 1);
1288         }
1289     }
1290     scan_guard.processing = ix;
1291     None
1292 }
1293 
#[cfg(test)]
mod test {
    use super::*;

    /// A run of digits far longer than any plausible list number must not
    /// be scanned as a list item.
    #[test]
    fn overflow_list() {
        let input: &[u8] = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(input).is_none());
    }

    /// Same as above with a shorter number; judging by the test name the
    /// overflow is presumably triggered while adding a digit - confirm
    /// against `scan_listitem`.
    #[test]
    fn overflow_by_addition() {
        let input: &[u8] = b"1844674407370955161615!";
        assert!(scan_listitem(input).is_none());
    }
}
1309