1 // Copyright 2018 Google LLC
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 //! Link label parsing and matching.
22 
23 use unicase::UniCase;
24 
25 use crate::scanners::{is_ascii_whitespace, scan_eol};
26 use crate::strings::CowStr;
27 
/// A reference label tagged with the kind of reference it names.
///
/// Both variants carry the label text as a [`CowStr`].
pub enum ReferenceLabel<'a> {
    /// Label of a link reference (meaning inferred from the variant name).
    Link(CowStr<'a>),
    /// Label of a footnote reference (meaning inferred from the variant name).
    Footnote(CowStr<'a>),
}
32 
/// A link label wrapped in [`UniCase`], which compares and hashes its contents
/// case-insensitively — presumably so that labels can be matched without
/// regard to case.
pub type LinkLabel<'a> = UniCase<CowStr<'a>>;
34 
35 /// Assumes the opening bracket has already been scanned.
36 /// The line break handler determines what happens when a linebreak
37 /// is found. It is passed the bytes following the line break and
38 /// either returns `Some(k)`, where `k` is the number of bytes to skip,
39 /// or `None` to abort parsing the label.
40 /// Returns the number of bytes read (including closing bracket) and label on success.
scan_link_label_rest<'t>( text: &'t str, linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>, ) -> Option<(usize, CowStr<'t>)>41 pub(crate) fn scan_link_label_rest<'t>(
42     text: &'t str,
43     linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
44 ) -> Option<(usize, CowStr<'t>)> {
45     let bytes = text.as_bytes();
46     let mut ix = 0;
47     let mut only_white_space = true;
48     let mut codepoints = 0;
49     // no worries, doesnt allocate until we push things onto it
50     let mut label = String::new();
51     let mut mark = 0;
52 
53     loop {
54         if codepoints >= 1000 {
55             return None;
56         }
57         match *bytes.get(ix)? {
58             b'[' => return None,
59             b']' => break,
60             b'\\' => {
61                 ix += 2;
62                 codepoints += 2;
63                 only_white_space = false;
64             }
65             b if is_ascii_whitespace(b) => {
66                 // normalize labels by collapsing whitespaces, including linebreaks
67                 let mut whitespaces = 0;
68                 let mut linebreaks = 0;
69                 let whitespace_start = ix;
70 
71                 while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
72                     if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
73                         linebreaks += 1;
74                         if linebreaks > 1 {
75                             return None;
76                         }
77                         ix += eol_bytes;
78                         ix += linebreak_handler(&bytes[ix..])?;
79                         whitespaces += 2; // indicate that we need to replace
80                     } else {
81                         whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
82                         ix += 1;
83                     }
84                 }
85                 if whitespaces > 1 {
86                     label.push_str(&text[mark..whitespace_start]);
87                     label.push(' ');
88                     mark = ix;
89                     codepoints += ix - whitespace_start;
90                 } else {
91                     codepoints += 1;
92                 }
93             }
94             b => {
95                 only_white_space = false;
96                 ix += 1;
97                 if b & 0b1000_0000 != 0 {
98                     codepoints += 1;
99                 }
100             }
101         }
102     }
103 
104     if only_white_space {
105         None
106     } else {
107         let cow = if mark == 0 {
108             text[..ix].into()
109         } else {
110             label.push_str(&text[mark..ix]);
111             label.into()
112         };
113         Some((ix + 1, cow))
114     }
115 }
116 
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside a label come back as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        // The handler aborts on any line break; this input contains none.
        let scanned = scan_link_label_rest("«\t\tBlurry Eyes\t\t»][blurry_eyes]", &|_| None);
        let (_consumed, label) = scanned.unwrap();

        // regular spaces!
        assert_eq!("« Blurry Eyes »", label.as_ref());
    }

    /// CRLF line breaks inside a label are accepted when the handler allows them.
    #[test]
    fn return_carriage_linefeed_ok() {
        // The handler skips no extra bytes after each line break.
        let outcome = scan_link_label_rest("hello\r\nworld\r\n]", &|_| Some(0));
        assert!(outcome.is_some());
    }
}
136