// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{TokenSink, Tokenizer};
use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;
use crate::util::str::is_ascii_alnum;

use log::debug;
use mac::format_if;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

use self::State::*;
pub use self::Status::*;

//§ tokenizing-character-references
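/// The result of tokenizing a character reference.
///
/// Most references resolve to a single character (e.g. `&gt;` is `>`), but a
/// few named references expand to two characters (e.g. `&fjlig;` is `fj`),
/// which is why two slots are provided.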
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}

pub enum Status {
    /// Not enough input is available; try again once more is buffered.
    Stuck,
    /// Input was consumed and the tokenizer should be stepped again.
    Progress,
    /// Tokenizing is finished; the result can be taken with `get_result`.
    Done,
}

#[derive(Debug)]
enum State {
    /// Just after the initial `&`.
    Begin,
    /// Just after `&#`.
    Octothorpe,
    /// Accumulating digits in the given base (10 or 16).
    Numeric(u32), // base
    /// Digits are done; expecting a terminating semicolon.
    NumericSemicolon,
    /// Accumulating a named reference in `name_buf`.
    Named,
    /// A name that can no longer match; keep consuming so we can decide
    /// whether to emit a parse error, then unconsume everything.
    BogusName,
}

pub struct CharRefTokenizer {
    state: State,
    addnl_allowed: Option<char>,
    result: Option<CharRef>,

    // Numeric character reference state.
    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    /// The `x` or `X` after `&#`, if any, kept so it can be unconsumed.
    hex_marker: Option<char>,

    // Named character reference state.
    name_buf_opt: Option<StrTendril>,
    /// Code points of the longest full match seen so far; the second is
    /// zero for single-character references.
    name_match: Option<(u32, u32)>,
    /// Length of that match within `name_buf`.
    name_len: usize,
}

impl CharRefTokenizer {
    // NB: We assume that we have an additional allowed character iff we're
    // tokenizing in an attribute value.
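    // In that case the additional allowed character is supplied by the caller
    // for the surrounding attribute context (typically the attribute's quote
    // character, or `>` for an unquoted value); seeing it aborts the character
    // reference without consuming anything.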
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            state: Begin,
            addnl_allowed,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf(&self) -> &StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

impl CharRefTokenizer {
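    /// Drive the tokenizer one step. The caller is expected to call this
    /// repeatedly: `Stuck` means more input is needed, `Progress` means input
    /// was consumed and another step should follow, and `Done` means the
    /// result is ready to be taken with `get_result`.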
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
            c if Some(c) == self.addnl_allowed => self.finish_none(),

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },

            _ => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
        }
    }

    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
        base: u32,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c);
        }

        input.push_front(unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

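    /// Validate the accumulated code point and produce the final character,
    /// emitting a parse error for out-of-range, surrogate, control, and
    /// noncharacter values. Values in the C1 control range are remapped via
    /// the spec's replacement table (for example, `&#x80;` yields
    /// U+20AC EURO SIGN).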
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of range: substitute U+FFFD and flag an error.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogates: substitute U+FFFD and flag an error.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: remap where the spec provides a replacement.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and noncharacters: keep the value but
            // still flag an error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Code points ending in FFFE or FFFF are noncharacters.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

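    /// Consume one character of a named reference. `data::NAMED_ENTITIES`
    /// maps every entity name *and* every prefix of one to a pair of code
    /// points, with zero standing in for "no character"; a hit with a nonzero
    /// first code point records the longest complete match seen so far, and a
    /// miss hands off to `finish_named`.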
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    fn unconsume_name(&mut self, input: &mut BufferQueue) {
        input.push_front(self.name_buf_opt.take().unwrap());
    }

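    /// Decide what to do once a named reference can no longer be extended:
    /// either emit the code points of the longest full match (unconsuming any
    /// extra characters), or unconsume everything and emit nothing, following
    /// the spec's special cases for references inside attribute values.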
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
        end_char: Option<char>,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if is_ascii_alnum(c) => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

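    /// Consume the rest of an unrecognized name so we can tell whether it was
    /// terminated by a semicolon (a parse error) or just trailed off; either
    /// way the whole name is unconsumed and no character is emitted.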
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match c {
            _ if is_ascii_alnum(c) => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(input);
        self.finish_none()
    }

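    /// Called when the input is exhausted. Drives the state machine to a
    /// conclusion so that `result` is always populated, emitting the
    /// appropriate EOF parse errors and unconsuming any partial input.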
    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut Tokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                },

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}