1 // Copyright 2015 Google Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20
21 //! Scanners for fragments of CommonMark syntax
22
23 use std::char;
24 use std::convert::TryInto;
25
26 use crate::entities;
27 use crate::parse::{Alignment, HtmlScanGuard, LinkType};
28 pub use crate::puncttable::{is_ascii_punctuation, is_punctuation};
29 use crate::strings::CowStr;
30
31 use memchr::memchr;
32
// Lowercase HTML tag names probed by `is_html_tag`.
//
// sorted for binary search: `is_html_tag` looks names up with
// `binary_search_by`, so entries must stay in ascending byte order or
// lookups can miss them.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98
/// Analysis of the beginning of a line, including indentation and container
/// markers.
#[derive(Clone)]
pub struct LineStart<'a> {
    // the bytes being scanned.
    bytes: &'a [u8],
    // index just past the most recently consumed tab (0 if no tab has been
    // consumed); tab widths are computed relative to this position.
    tab_start: usize,
    // index of the next unconsumed byte in `bytes`.
    ix: usize,
    // columns of an already consumed tab that have not yet been handed out
    // as spaces.
    spaces_remaining: usize,
    // no thematic breaks can occur before this offset.
    // this prevents scanning over and over up to a certain point
    min_hrule_offset: usize,
}
111
impl<'a> LineStart<'a> {
    /// Creates a scanner positioned at the first byte of `bytes`, with no
    /// pending tab columns and no known lower bound for thematic breaks.
    pub(crate) fn new(bytes: &[u8]) -> LineStart {
        LineStart {
            bytes,
            tab_start: 0,
            ix: 0,
            spaces_remaining: 0,
            min_hrule_offset: 0,
        }
    }

    /// Try to scan a number of spaces.
    ///
    /// Returns true if all spaces were consumed.
    ///
    /// Note: consumes some spaces even if not successful.
    pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
        self.scan_space_inner(n_space) == 0
    }

    /// Scan a number of spaces up to a maximum.
    ///
    /// Returns number of spaces scanned.
    pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
        n_space - self.scan_space_inner(n_space)
    }

    /// Returns unused remainder of spaces.
    ///
    /// Columns left over from a previously consumed tab are used first. A
    /// tab is always consumed as a whole byte; any of its columns that were
    /// not requested are stored in `spaces_remaining` for later calls.
    fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
        let n_from_remaining = self.spaces_remaining.min(n_space);
        self.spaces_remaining -= n_from_remaining;
        n_space -= n_from_remaining;
        while n_space > 0 && self.ix < self.bytes.len() {
            match self.bytes[self.ix] {
                b' ' => {
                    self.ix += 1;
                    n_space -= 1;
                }
                b'\t' => {
                    // width of this tab: distance to the next multiple of
                    // four, measured from the end of the previous tab.
                    let spaces = 4 - (self.ix - self.tab_start) % 4;
                    self.ix += 1;
                    self.tab_start = self.ix;
                    let n = spaces.min(n_space);
                    n_space -= n;
                    self.spaces_remaining = spaces - n;
                }
                _ => break,
            }
        }
        n_space
    }

    /// Scan all available ASCII whitespace (not including eol).
    ///
    /// Only spaces and tabs are skipped; pending tab columns are discarded.
    pub(crate) fn scan_all_space(&mut self) {
        self.spaces_remaining = 0;
        self.ix += self.bytes[self.ix..]
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count();
    }

    /// Determine whether we're at end of line (includes end of file).
    pub(crate) fn is_at_eol(&self) -> bool {
        self.bytes
            .get(self.ix)
            .map(|&c| c == b'\r' || c == b'\n')
            .unwrap_or(true)
    }

    /// Consumes the next byte if it equals `c`.
    ///
    /// Returns true if the byte was consumed.
    fn scan_ch(&mut self, c: u8) -> bool {
        if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
            self.ix += 1;
            true
        } else {
            false
        }
    }

    /// Scan a blockquote marker: up to three spaces, a `>`, and at most one
    /// following space.
    ///
    /// Returns true on success. On failure the scanner is restored to the
    /// state it had before the call.
    pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
        let save = self.clone();
        let _ = self.scan_space(3);
        if self.scan_ch(b'>') {
            let _ = self.scan_space(1);
            true
        } else {
            *self = save;
            false
        }
    }

    /// Scan a list marker.
    ///
    /// Return value is the character, the start index, and the indent in spaces.
    /// For ordered list markers, the character will be one of b'.' or b')'. For
    /// bullet list markers, it will be one of b'-', b'+', or b'*'.
    ///
    /// Returns `None` and restores the scanner when no marker is found.
    pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
        let save = self.clone();
        let indent = self.scan_space_upto(3);
        if self.ix < self.bytes.len() {
            let c = self.bytes[self.ix];
            if c == b'-' || c == b'+' || c == b'*' {
                if self.ix >= self.min_hrule_offset {
                    // there could be an hrule here
                    if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
                        self.min_hrule_offset = min_offset;
                    } else {
                        // the line is a thematic break, not a list item.
                        *self = save;
                        return None;
                    }
                }
                self.ix += 1;
                if self.scan_space(1) || self.is_at_eol() {
                    return self.finish_list_marker(c, 0, indent + 2);
                }
            } else if c >= b'0' && c <= b'9' {
                let start_ix = self.ix;
                let mut ix = self.ix + 1;
                let mut val = u64::from(c - b'0');
                // the bound stops the scan after at most nine digits plus
                // the delimiter.
                while ix < self.bytes.len() && ix - start_ix < 10 {
                    let c = self.bytes[ix];
                    ix += 1;
                    if c >= b'0' && c <= b'9' {
                        val = val * 10 + u64::from(c - b'0');
                    } else if c == b')' || c == b'.' {
                        self.ix = ix;
                        if self.scan_space(1) || self.is_at_eol() {
                            return self.finish_list_marker(c, val, indent + self.ix - start_ix);
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        *self = save;
        None
    }

    /// Completes a list marker scan by accounting for the whitespace that
    /// follows the marker.
    ///
    /// If the rest of the line is blank, nothing more is consumed. Otherwise
    /// up to four spaces are scanned: fewer than four are consumed and added
    /// to `indent`, while four put the scanner back where it was on entry.
    /// Always returns `Some((c, start, indent))`.
    fn finish_list_marker(
        &mut self,
        c: u8,
        start: u64,
        mut indent: usize,
    ) -> Option<(u8, u64, usize)> {
        let save = self.clone();

        // skip the rest of the line if it's blank
        if scan_blank_line(&self.bytes[self.ix..]).is_some() {
            return Some((c, start, indent));
        }

        let post_indent = self.scan_space_upto(4);
        if post_indent < 4 {
            indent += post_indent;
        } else {
            *self = save;
        }
        Some((c, start, indent))
    }

    /// Returns Some(is_checked) when a task list marker was found. Resets itself
    /// to original state otherwise.
    ///
    /// The accepted shape is up to three spaces, `[`, one of a non-newline
    /// ASCII whitespace byte (unchecked) or `x`/`X` (checked), `]`, and then
    /// a non-newline ASCII whitespace byte, which is not consumed.
    pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
        let save = self.clone();
        self.scan_space_upto(3);

        if !self.scan_ch(b'[') {
            *self = save;
            return None;
        }
        let is_checked = match self.bytes.get(self.ix) {
            Some(&c) if is_ascii_whitespace_no_nl(c) => {
                self.ix += 1;
                false
            }
            Some(b'x') | Some(b'X') => {
                self.ix += 1;
                true
            }
            _ => {
                *self = save;
                return None;
            }
        };
        if !self.scan_ch(b']') {
            *self = save;
            return None;
        }
        if !self
            .bytes
            .get(self.ix)
            .map(|&b| is_ascii_whitespace_no_nl(b))
            .unwrap_or(false)
        {
            *self = save;
            return None;
        }
        Some(is_checked)
    }

    /// Returns the index of the next unconsumed byte.
    pub(crate) fn bytes_scanned(&self) -> usize {
        self.ix
    }

    /// Returns the number of columns of a consumed tab that have not yet
    /// been scanned as spaces.
    pub(crate) fn remaining_space(&self) -> usize {
        self.spaces_remaining
    }
}
322
/// Returns true for tab, line feed, vertical tab, form feed, carriage return
/// and space.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ' => true,
        _ => false,
    }
}
326
/// Returns true for the ASCII whitespace bytes that do not end a line:
/// space, tab, vertical tab and form feed.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    match c {
        b' ' | b'\t' | 0x0b | 0x0c => true,
        _ => false,
    }
}
330
/// Returns true if `c` is an ASCII letter (`a`-`z` or `A`-`Z`).
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
337
/// Returns true if `c` is an ASCII letter or an ASCII decimal digit.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
344
is_ascii_letterdigitdash(c: u8) -> bool345 fn is_ascii_letterdigitdash(c: u8) -> bool {
346 c == b'-' || is_ascii_alphanumeric(c)
347 }
348
/// Returns true if `c` is an ASCII decimal digit (`0`-`9`).
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
352
/// Returns true if `c` may appear in an unquoted attribute value, i.e. it is
/// none of `'`, `"`, space, `=`, `>`, `<`, a backtick, `\n` or `\r`.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const FORBIDDEN: &[u8] = b"'\" =><`\n\r";
    !FORBIDDEN.contains(&c)
}
359
/// Scans a single byte: returns 1 if `data` starts with `c`, 0 otherwise
/// (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
368
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`.
///
/// `f` is called on each byte in order until it first returns false.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
375
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`.
///
/// `f` is called on each byte from the end backwards until it first returns
/// false.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
382
scan_ch_repeat(data: &[u8], c: u8) -> usize383 pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
384 scan_while(data, |x| x == c)
385 }
386
387 // Note: this scans ASCII whitespace only, for Unicode whitespace use
388 // a different function.
scan_whitespace_no_nl(data: &[u8]) -> usize389 pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
390 scan_while(data, is_ascii_whitespace_no_nl)
391 }
392
scan_attr_value_chars(data: &[u8]) -> usize393 fn scan_attr_value_chars(data: &[u8]) -> usize {
394 scan_while(data, is_valid_unquoted_attr_value_char)
395 }
396
/// Scans a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input (end of file), `Some(1)` for `\n` or a
/// lone `\r`, `Some(2)` for `\r\n`, and `None` for anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes.first() {
        None => Some(0),
        Some(b'\n') => Some(1),
        Some(b'\r') => {
            if bytes.get(1) == Some(&b'\n') {
                Some(2)
            } else {
                Some(1)
            }
        }
        Some(_) => None,
    }
}
407
scan_blank_line(bytes: &[u8]) -> Option<usize>408 pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
409 let i = scan_whitespace_no_nl(bytes);
410 scan_eol(&bytes[i..]).map(|n| i + n)
411 }
412
scan_nextline(bytes: &[u8]) -> usize413 pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
414 memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
415 }
416
417 // return: end byte for closing code fence, or None
418 // if the line is not a closing code fence
scan_closing_code_fence( bytes: &[u8], fence_char: u8, n_fence_char: usize, ) -> Option<usize>419 pub(crate) fn scan_closing_code_fence(
420 bytes: &[u8],
421 fence_char: u8,
422 n_fence_char: usize,
423 ) -> Option<usize> {
424 if bytes.is_empty() {
425 return Some(0);
426 }
427 let mut i = 0;
428 let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
429 if num_fence_chars_found < n_fence_char {
430 return None;
431 }
432 i += num_fence_chars_found;
433 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
434 i += num_trailing_spaces;
435 scan_eol(&bytes[i..]).map(|_| i)
436 }
437
/// Measures leading indentation made of spaces and tabs, up to `max`
/// columns.
///
/// Returned pair is (number of bytes consumed, number of columns those
/// bytes span). A tab advances to the next multiple of four columns and is
/// only consumed if that does not exceed `max`. Scanning stops at the first
/// byte that is neither a space nor a tab, or as soon as `max` columns have
/// been reached.
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut spaces = 0;
    let mut offset = 0;

    for &b in text {
        match b {
            b' ' => spaces += 1,
            b'\t' => {
                let new_spaces = spaces + 4 - (spaces & 3);
                if new_spaces > max {
                    // this tab would overshoot the limit; leave it unconsumed.
                    break;
                }
                spaces = new_spaces;
            }
            _ => break,
        }
        // the byte was consumed, so it counts towards the byte total; this
        // includes the byte that reaches `max`.
        offset += 1;
        if spaces == max {
            break;
        }
    }

    (offset, spaces)
}
465
466 /// Scan hrule opening sequence.
467 ///
468 /// Returns Ok(x) when it finds an hrule, where x is the
469 /// size of line containing the hrule, including the trailing newline.
470 ///
471 /// Returns Err(x) when it does not find an hrule and x is
472 /// the offset in data before no hrule can appear.
scan_hrule(bytes: &[u8]) -> Result<usize, usize>473 pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
474 if bytes.len() < 3 {
475 return Err(0);
476 }
477 let c = bytes[0];
478 if !(c == b'*' || c == b'-' || c == b'_') {
479 return Err(0);
480 }
481 let mut n = 0;
482 let mut i = 0;
483
484 while i < bytes.len() {
485 match bytes[i] {
486 b'\n' | b'\r' => {
487 i += scan_eol(&bytes[i..]).unwrap_or(0);
488 break;
489 }
490 c2 if c2 == c => {
491 n += 1;
492 }
493 b' ' | b'\t' => (),
494 _ => return Err(i),
495 }
496 i += 1;
497 }
498 if n >= 3 {
499 Ok(i)
500 } else {
501 Err(i)
502 }
503 }
504
505 /// Scan an ATX heading opening sequence.
506 ///
507 /// Returns number of bytes in prefix and level.
scan_atx_heading(data: &[u8]) -> Option<usize>508 pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
509 let level = scan_ch_repeat(data, b'#');
510 if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
511 Some(level)
512 } else {
513 None
514 }
515 }
516
517 /// Scan a setext heading underline.
518 ///
519 /// Returns number of bytes in line (including trailing newline) and level.
scan_setext_heading(data: &[u8]) -> Option<(usize, u32)>520 pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
521 let c = *data.get(0)?;
522 if !(c == b'-' || c == b'=') {
523 return None;
524 }
525 let mut i = 1 + scan_ch_repeat(&data[1..], c);
526 i += scan_blank_line(&data[i..])?;
527 let level = if c == b'=' { 1 } else { 2 };
528 Some((i, level))
529 }
530
531 // returns number of bytes in line (including trailing
532 // newline) and column alignments
scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>)533 pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
534 let (mut i, spaces) = calc_indent(data, 4);
535 if spaces > 3 || i == data.len() {
536 return (0, vec![]);
537 }
538 let mut cols = vec![];
539 let mut active_col = Alignment::None;
540 let mut start_col = true;
541 if data[i] == b'|' {
542 i += 1;
543 }
544 for c in &data[i..] {
545 if let Some(n) = scan_eol(&data[i..]) {
546 i += n;
547 break;
548 }
549 match *c {
550 b' ' => (),
551 b':' => {
552 active_col = match (start_col, active_col) {
553 (true, Alignment::None) => Alignment::Left,
554 (false, Alignment::Left) => Alignment::Center,
555 (false, Alignment::None) => Alignment::Right,
556 _ => active_col,
557 };
558 start_col = false;
559 }
560 b'-' => {
561 start_col = false;
562 }
563 b'|' => {
564 start_col = true;
565 cols.push(active_col);
566 active_col = Alignment::None;
567 }
568 _ => {
569 cols = vec![];
570 start_col = true;
571 break;
572 }
573 }
574 i += 1;
575 }
576
577 if !start_col {
578 cols.push(active_col);
579 }
580
581 (i, cols)
582 }
583
584 /// Scan code fence.
585 ///
586 /// Returns number of bytes scanned and the char that is repeated to make the code fence.
scan_code_fence(data: &[u8]) -> Option<(usize, u8)>587 pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
588 let c = *data.get(0)?;
589 if !(c == b'`' || c == b'~') {
590 return None;
591 }
592 let i = 1 + scan_ch_repeat(&data[1..], c);
593 if i >= 3 {
594 if c == b'`' {
595 let suffix = &data[i..];
596 let next_line = i + scan_nextline(suffix);
597 // FIXME: make sure this is correct
598 if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
599 return None;
600 }
601 }
602 Some((i, c))
603 } else {
604 None
605 }
606 }
607
/// Returns `Some(2)` if `data` begins with `>` followed by a space, and
/// `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    let has_marker = data.len() >= 2 && data[0] == b'>' && data[1] == b' ';
    if has_marker {
        Some(2)
    } else {
        None
    }
}
615
616 /// This already assumes the list item has been scanned.
scan_empty_list(data: &[u8]) -> bool617 pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
618 let mut ix = 0;
619 for _ in 0..2 {
620 if let Some(bytes) = scan_blank_line(&data[ix..]) {
621 ix += bytes;
622 } else {
623 return false;
624 }
625 }
626 true
627 }
628
629 // return number of bytes scanned, delimiter, start index, and indent
scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)>630 pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
631 let mut c = *bytes.get(0)?;
632 let (w, start) = match c {
633 b'-' | b'+' | b'*' => (1, 0),
634 b'0'..=b'9' => {
635 let (length, start) = parse_decimal(bytes);
636 c = *bytes.get(length)?;
637 if !(c == b'.' || c == b')') {
638 return None;
639 }
640 (length + 1, start)
641 }
642 _ => {
643 return None;
644 }
645 };
646 // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
647 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
648 if postindent == 0 {
649 scan_eol(&bytes[w..])?;
650 postindent += 1;
651 } else if postindent > 4 {
652 postn = 1;
653 postindent = 1;
654 }
655 if scan_blank_line(&bytes[w..]).is_some() {
656 postn = 0;
657 postindent = 1;
658 }
659 Some((w + postn, c, start, w + postindent))
660 }
661
662 // returns (number of bytes, parsed decimal)
parse_decimal(bytes: &[u8]) -> (usize, usize)663 fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
664 match bytes
665 .iter()
666 .take_while(|&&b| is_digit(b))
667 .try_fold((0, 0usize), |(count, acc), c| {
668 let digit = usize::from(c - b'0');
669 match acc
670 .checked_mul(10)
671 .and_then(|ten_acc| ten_acc.checked_add(digit))
672 {
673 Some(number) => Ok((count + 1, number)),
674 // stop early on overflow
675 None => Err((count, acc)),
676 }
677 }) {
678 Ok(p) | Err(p) => p,
679 }
680 }
681
/// Parses the run of hexadecimal digits (either case) at the start of
/// `bytes`.
///
/// Returns (number of bytes, parsed hex). Parsing stops at the first byte
/// that is not a hex digit, or early at the digit that would overflow
/// `usize`; that digit is not counted.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &b in bytes {
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        let next = value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)));
        match next {
            Some(number) => {
                value = number;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (count, value)
}
709
/// Converts a numeric code point to a `char`.
///
/// Code point 0 is replaced by U+FFFD. Returns `None` if `input` does not
/// fit in a `u32` or is not a valid `char`.
fn char_from_codepoint(input: usize) -> Option<char> {
    let codepoint: u32 = input.try_into().ok()?;
    let effective = if codepoint == 0 { 0xFFFD } else { codepoint };
    char::from_u32(effective)
}
717
718 // doesn't bother to check data[0] == '&'
scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>)719 pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
720 let mut end = 1;
721 if scan_ch(&bytes[end..], b'#') == 1 {
722 end += 1;
723 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
724 end += 1;
725 parse_hex(&bytes[end..])
726 } else {
727 parse_decimal(&bytes[end..])
728 };
729 end += bytecount;
730 return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
731 (0, None)
732 } else if let Some(c) = char_from_codepoint(codepoint) {
733 (end + 1, Some(c.into()))
734 } else {
735 (0, None)
736 };
737 }
738 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
739 if scan_ch(&bytes[end..], b';') == 1 {
740 if let Some(value) = entities::get_entity(&bytes[1..end]) {
741 return (end + 1, Some(value.into()));
742 }
743 }
744 (0, None)
745 }
746
// FIXME: we can most likely re-use other scanners
/// Scans a link reference definition title delimited by `'...'`, `"..."`
/// or `(...)`.
///
/// Returns (bytelength, title_str): the number of bytes scanned including
/// both delimiters, and the raw title between them (escapes are left
/// untouched). Returns `None` if `text` does not start with an opening
/// delimiter, if the title is not closed, or if it contains a blank line.
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                let mut next = *chars.peek()?;
                // Compare as `char`: narrowing to `u8` would make non-ASCII
                // characters such as U+0120 look like whitespace and could
                // misreport the following line break as a blank line.
                while next == ' ' || next == '\t' || next == '\x0b' || next == '\x0c' {
                    bytecount += chars.next()?.len_utf8();
                    next = *chars.peek()?;
                }
                if next == '\n' {
                    // blank line - not allowed
                    return None;
                }
            }
            '\\' => {
                // an escaped character never closes the title.
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
787
788 // note: dest returned is raw, still needs to be unescaped
789 // TODO: check that nested parens are really not allowed for refdefs
790 // TODO(performance): this func should probably its own unescaping
scan_link_dest( data: &str, start_ix: usize, max_next: usize, ) -> Option<(usize, &str)>791 pub(crate) fn scan_link_dest(
792 data: &str,
793 start_ix: usize,
794 max_next: usize,
795 ) -> Option<(usize, &str)> {
796 let bytes = &data.as_bytes()[start_ix..];
797 let mut i = scan_ch(bytes, b'<');
798
799 if i != 0 {
800 // pointy links
801 while i < bytes.len() {
802 match bytes[i] {
803 b'\n' | b'\r' | b'<' => return None,
804 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
805 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
806 i += 1;
807 }
808 _ => {}
809 }
810 i += 1;
811 }
812 None
813 } else {
814 // non-pointy links
815 let mut nest = 0;
816 while i < bytes.len() {
817 match bytes[i] {
818 0x0..=0x20 => {
819 break;
820 }
821 b'(' => {
822 if nest > max_next {
823 return None;
824 }
825 nest += 1;
826 }
827 b')' => {
828 if nest == 0 {
829 break;
830 }
831 nest -= 1;
832 }
833 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
834 i += 1;
835 }
836 _ => {}
837 }
838 i += 1;
839 }
840 Some((i, &data[start_ix..(start_ix + i)]))
841 }
842 }
843
844 /// Returns bytes scanned
scan_attribute_name(data: &[u8]) -> Option<usize>845 fn scan_attribute_name(data: &[u8]) -> Option<usize> {
846 let (&c, tail) = data.split_first()?;
847 if is_ascii_alpha(c) || c == b'_' || c == b':' {
848 Some(
849 1 + scan_while(tail, |c| {
850 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
851 }),
852 )
853 } else {
854 None
855 }
856 }
857
858 /// Returns the index immediately following the attribute on success.
859 /// The argument `buffer_ix` refers to the index into `data` from which we
860 /// should copy into `buffer` when we find bytes to skip.
scan_attribute( data: &[u8], mut ix: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>861 fn scan_attribute(
862 data: &[u8],
863 mut ix: usize,
864 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
865 buffer: &mut Vec<u8>,
866 buffer_ix: &mut usize,
867 ) -> Option<usize> {
868 ix += scan_attribute_name(&data[ix..])?;
869 let n_whitespace =
870 scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)? - ix;
871 ix += n_whitespace;
872 if scan_ch(&data[ix..], b'=') == 1 {
873 ix += 1;
874 ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
875 ix = scan_attribute_value(&data, ix, newline_handler, buffer, buffer_ix)?;
876 } else if n_whitespace > 0 {
877 // Leave whitespace for next attribute.
878 ix -= 1;
879 }
880 Some(ix)
881 }
882
883 /// Scans whitespace and possibly newlines according to the
884 /// behavior defined by the newline handler. When bytes are skipped,
885 /// all preceeding non-skipped bytes are pushed to the buffer.
scan_whitespace_with_newline_handler( data: &[u8], mut i: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>886 fn scan_whitespace_with_newline_handler(
887 data: &[u8],
888 mut i: usize,
889 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
890 buffer: &mut Vec<u8>,
891 buffer_ix: &mut usize,
892 ) -> Option<usize> {
893 while i < data.len() {
894 if !is_ascii_whitespace(data[i]) {
895 return Some(i);
896 }
897 if let Some(eol_bytes) = scan_eol(&data[i..]) {
898 let handler = newline_handler?;
899 i += eol_bytes;
900 let skipped_bytes = handler(&data[i..]);
901
902 if skipped_bytes > 0 {
903 buffer.extend(&data[*buffer_ix..i]);
904 *buffer_ix = i + skipped_bytes;
905 }
906
907 i += skipped_bytes;
908 } else {
909 i += 1;
910 }
911 }
912
913 Some(i)
914 }
915
916 /// Returns the index immediately following the attribute value on success.
scan_attribute_value( data: &[u8], mut i: usize, newline_handler: Option<&dyn Fn(&[u8]) -> usize>, buffer: &mut Vec<u8>, buffer_ix: &mut usize, ) -> Option<usize>917 fn scan_attribute_value(
918 data: &[u8],
919 mut i: usize,
920 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
921 buffer: &mut Vec<u8>,
922 buffer_ix: &mut usize,
923 ) -> Option<usize> {
924 match *data.get(i)? {
925 b @ b'"' | b @ b'\'' => {
926 i += 1;
927 while i < data.len() {
928 if data[i] == b {
929 return Some(i + 1);
930 }
931 if let Some(eol_bytes) = scan_eol(&data[i..]) {
932 let handler = newline_handler?;
933 i += eol_bytes;
934 let skipped_bytes = handler(&data[i..]);
935
936 if skipped_bytes > 0 {
937 buffer.extend(&data[*buffer_ix..i]);
938 *buffer_ix = i + skipped_bytes;
939 }
940 i += skipped_bytes;
941 } else {
942 i += 1;
943 }
944 }
945 return None;
946 }
947 b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
948 return None;
949 }
950 _ => {
951 // unquoted attribute value
952 i += scan_attr_value_chars(&data[i..]);
953 }
954 }
955
956 Some(i)
957 }
958
959 // Remove backslash escapes and resolve entities
unescape(input: &str) -> CowStr<'_>960 pub(crate) fn unescape(input: &str) -> CowStr<'_> {
961 let mut result = String::new();
962 let mut mark = 0;
963 let mut i = 0;
964 let bytes = input.as_bytes();
965 while i < bytes.len() {
966 match bytes[i] {
967 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
968 result.push_str(&input[mark..i]);
969 mark = i + 1;
970 i += 2;
971 }
972 b'&' => match scan_entity(&bytes[i..]) {
973 (n, Some(value)) => {
974 result.push_str(&input[mark..i]);
975 result.push_str(&value);
976 i += n;
977 mark = i;
978 }
979 _ => i += 1,
980 },
981 b'\r' => {
982 result.push_str(&input[mark..i]);
983 i += 1;
984 mark = i;
985 }
986 _ => i += 1,
987 }
988 }
989 if mark == 0 {
990 input.into()
991 } else {
992 result.push_str(&input[mark..]);
993 result.into()
994 }
995 }
996
997 /// Assumes `data` is preceded by `<`.
scan_html_block_tag(data: &[u8]) -> (usize, &[u8])998 pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
999 let i = scan_ch(data, b'/');
1000 let n = scan_while(&data[i..], is_ascii_alphanumeric);
1001 // TODO: scan attributes and >
1002 (i + n, &data[i..i + n])
1003 }
1004
is_html_tag(tag: &[u8]) -> bool1005 pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
1006 HTML_TAGS
1007 .binary_search_by(|probe| {
1008 let probe_bytes_iter = probe.as_bytes().iter();
1009 let tag_bytes_iter = tag.iter();
1010
1011 probe_bytes_iter
1012 .zip(tag_bytes_iter)
1013 .find_map(|(&a, &b)| {
1014 // We can compare case insensitively because the probes are
1015 // all lower case alpha strings.
1016 match a.cmp(&(b | 0x20)) {
1017 std::cmp::Ordering::Equal => None,
1018 inequality => Some(inequality),
1019 }
1020 })
1021 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1022 })
1023 .is_ok()
1024 }
1025
1026 /// Assumes that `data` starts with `<`.
1027 /// Returns the index into data directly after the html tag on success.
scan_html_type_7(data: &[u8]) -> Option<usize>1028 pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1029 // Block type html does not allow for newlines, so we
1030 // do not pass a newline handler.
1031 let (_span, i) = scan_html_block_inner(data, None)?;
1032 scan_blank_line(&data[i..])?;
1033 Some(i)
1034 }
1035
/// Scans an opening or closing HTML tag, including the attributes of an
/// opening tag.
///
/// Assumes that `data` starts with `<`.
/// Returns the number of bytes scanned and the html in case of
/// success.
/// When some bytes were skipped, because the html was split over
/// multiple leafs (e.g. over multiple lines in a blockquote),
/// the html is returned as a vector of bytes.
/// If no bytes were skipped, the buffer will be empty.
///
/// `newline_handler`, when given, is called with the data following each
/// line ending and returns how many bytes to skip there. Without a handler
/// any line ending inside the tag makes the scan fail.
pub(crate) fn scan_html_block_inner(
    data: &[u8],
    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<(Vec<u8>, usize)> {
    let mut buffer = Vec::new();
    // start of the span of `data` not yet copied into `buffer`.
    let mut last_buf_index = 0;

    // 1 if this is a closing tag (`</...`), 0 otherwise.
    let close_tag_bytes = scan_ch(&data[1..], b'/');
    // the tag name must start with at least one ASCII letter.
    let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
    if l == 0 {
        return None;
    }
    let mut i = 1 + close_tag_bytes + l;
    i += scan_while(&data[i..], is_ascii_letterdigitdash);

    // only opening tags carry attributes.
    if close_tag_bytes == 0 {
        loop {
            let old_i = i;
            // skip whitespace and, if a handler is present, line endings.
            loop {
                i += scan_whitespace_no_nl(&data[i..]);
                if let Some(eol_bytes) = scan_eol(&data[i..]) {
                    // zero bytes means end of input: the tag is unterminated.
                    if eol_bytes == 0 {
                        return None;
                    }
                    let handler = newline_handler?;
                    i += eol_bytes;
                    let skipped_bytes = handler(&data[i..]);

                    if skipped_bytes > 0 {
                        buffer.extend(&data[last_buf_index..i]);
                        i += skipped_bytes;
                        last_buf_index = i;
                    }
                } else {
                    break;
                }
            }
            if let Some(b'/') | Some(b'>') = data.get(i) {
                break;
            }
            if old_i == i {
                // No whitespace, which is mandatory.
                return None;
            }
            i = scan_attribute(&data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
        }
    }

    i += scan_whitespace_no_nl(&data[i..]);

    // an opening tag may be self-closing (`/>`).
    if close_tag_bytes == 0 {
        i += scan_ch(&data[i..], b'/');
    }

    if scan_ch(&data[i..], b'>') == 0 {
        None
    } else {
        i += 1;
        // flush the tail only if something was skipped; otherwise the
        // buffer stays empty.
        if !buffer.is_empty() {
            buffer.extend(&data[last_buf_index..i]);
        }
        Some((buffer, i))
    }
}
1107
1108 /// Returns (next_byte_offset, uri, type)
scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)>1109 pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1110 scan_uri(text, start_ix)
1111 .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1112 .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1113 }
1114
1115 /// Returns (next_byte_offset, uri)
scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)>1116 fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1117 let bytes = &text.as_bytes()[start_ix..];
1118
1119 // scheme's first byte must be an ascii letter
1120 if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1121 return None;
1122 }
1123
1124 let mut i = 1;
1125
1126 while i < bytes.len() {
1127 let c = bytes[i];
1128 i += 1;
1129 match c {
1130 c if is_ascii_alphanumeric(c) => (),
1131 b'.' | b'-' | b'+' => (),
1132 b':' => break,
1133 _ => return None,
1134 }
1135 }
1136
1137 // scheme length must be between 2 and 32 characters long. scheme
1138 // must be followed by colon
1139 if i < 3 || i > 33 {
1140 return None;
1141 }
1142
1143 while i < bytes.len() {
1144 match bytes[i] {
1145 b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1146 b'\0'..=b' ' | b'<' => return None,
1147 _ => (),
1148 }
1149 i += 1;
1150 }
1151
1152 None
1153 }
1154
1155 /// Returns (next_byte_offset, email)
scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)>1156 fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1157 // using a regex library would be convenient, but doing it by hand is not too bad
1158 let bytes = &text.as_bytes()[start_ix..];
1159 let mut i = 0;
1160
1161 while i < bytes.len() {
1162 let c = bytes[i];
1163 i += 1;
1164 match c {
1165 c if is_ascii_alphanumeric(c) => (),
1166 b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1167 | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1168 b'@' => break,
1169 _ => return None,
1170 }
1171 }
1172
1173 loop {
1174 let label_start_ix = i;
1175 let mut fresh_label = true;
1176
1177 while i < bytes.len() {
1178 match bytes[i] {
1179 c if is_ascii_alphanumeric(c) => (),
1180 b'-' if fresh_label => {
1181 return None;
1182 }
1183 b'-' => (),
1184 _ => break,
1185 }
1186 fresh_label = false;
1187 i += 1;
1188 }
1189
1190 if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1191 return None;
1192 }
1193
1194 if scan_ch(&bytes[i..], b'.') == 0 {
1195 break;
1196 }
1197 i += 1;
1198 }
1199
1200 if scan_ch(&bytes[i..], b'>') == 0 {
1201 return None;
1202 }
1203
1204 Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1205 }
1206
/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
/// Returns byte offset on match.
///
/// `ix` is the offset in `bytes` of the first byte after "<!". The byte found
/// there selects which construct is attempted:
///
/// * `-` starts a comment,
/// * `[` followed by "CDATA[" starts a CDATA section,
/// * an uppercase ASCII letter starts a declaration.
///
/// On success the returned offset is one past the terminating `>`. Returns
/// `None` when `ix` is out of range, when no construct matches, or when the
/// relevant `scan_guard` field says the attempt should not be made.
///
/// When a CDATA or declaration scan fails to find its terminator, the offset
/// at which scanning stopped is stored in `scan_guard.cdata` /
/// `scan_guard.declaration`; later calls whose `ix` (after the dispatch byte)
/// is not greater than the stored value return `None` without scanning.
/// The comment branch neither reads nor writes `scan_guard`.
pub(crate) fn scan_inline_html_comment(
    bytes: &[u8],
    mut ix: usize,
    scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
    // Dispatch byte; bail out if "<!" was the end of the input.
    let c = *bytes.get(ix)?;
    ix += 1;
    match c {
        b'-' => {
            // A second dash is required; the whole run of dashes found here
            // is skipped (presumably `scan_ch_repeat` counts consecutive
            // occurrences of the given byte; helper defined elsewhere).
            let dashes = scan_ch_repeat(&bytes[ix..], b'-');
            if dashes < 1 {
                return None;
            }
            // Saw "<!--", scan comment.
            ix += dashes;
            // The opening dashes must not be followed directly by `>`.
            if scan_ch(&bytes[ix..], b'>') == 1 {
                return None;
            }

            // Jump from dash to dash. The first "--" pair encountered decides
            // the outcome: it must be followed by `>`, otherwise the comment
            // is rejected outright.
            while let Some(x) = memchr(b'-', &bytes[ix..]) {
                ix += x + 1;
                if scan_ch(&bytes[ix..], b'-') == 1 {
                    ix += 1;
                    return if scan_ch(&bytes[ix..], b'>') == 1 {
                        Some(ix + 1)
                    } else {
                        None
                    };
                }
            }
            // Ran out of dashes without seeing "--".
            None
        }
        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
            ix += b"CDATA[".len();
            // Move to the first `]` after the opener, or to the end of input.
            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
            ix += close_brackets;

            // NOTE(review): only the first run of `]` is examined, and a run of
            // length one followed by `>` is accepted. Confirm this is intended,
            // given that a CDATA section is normally terminated by "]]>".
            if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
                // Remember where this attempt gave up.
                scan_guard.cdata = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        b'A'..=b'Z' if ix > scan_guard.declaration => {
            // Scan declaration.
            // Skip the remaining uppercase letters of the declaration name.
            ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
            // The name must be followed by at least one whitespace byte.
            let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
            if whitespace == 0 {
                return None;
            }
            ix += whitespace;
            // Move to the first `>` after the whitespace, or to the end of input.
            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            if scan_ch(&bytes[ix..], b'>') == 0 {
                // No terminator found; remember where this attempt gave up.
                scan_guard.declaration = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        // Any other byte, or a guarded-out CDATA/declaration attempt.
        _ => None,
    }
}
1273
1274 /// Scan processing directive, with initial "<?" already consumed.
1275 /// Returns the next byte offset on success.
scan_inline_html_processing( bytes: &[u8], mut ix: usize, scan_guard: &mut HtmlScanGuard, ) -> Option<usize>1276 pub(crate) fn scan_inline_html_processing(
1277 bytes: &[u8],
1278 mut ix: usize,
1279 scan_guard: &mut HtmlScanGuard,
1280 ) -> Option<usize> {
1281 if ix <= scan_guard.processing {
1282 return None;
1283 }
1284 while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1285 ix += offset + 1;
1286 if scan_ch(&bytes[ix..], b'>') == 1 {
1287 return Some(ix + 1);
1288 }
1289 }
1290 scan_guard.processing = ix;
1291 None
1292 }
1293
#[cfg(test)]
mod test {
    use super::*;

    /// A very long run of digits followed by `!` must not be scanned as a
    /// list item.
    #[test]
    fn overflow_list() {
        let long_digit_run = b"4444444444444444444444444444444444444444444444444444444444!";
        let scanned = scan_listitem(long_digit_run);
        assert!(scanned.is_none());
    }

    /// Digit run that begins like 2^64 (18446744073709551616).
    /// NOTE(review): judging by the test name, this targets overflow in the
    /// addition step of number accumulation inside `scan_listitem` — confirm
    /// against that function.
    #[test]
    fn overflow_by_addition() {
        let near_limit_digits = b"1844674407370955161615!";
        let scanned = scan_listitem(near_limit_digits);
        assert!(scanned.is_none());
    }
}
1309