1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::{TokenSink, XmlTokenizer};
11 use crate::data;
12 use crate::tendril::StrTendril;
13 use log::debug;
14 use mac::{format_if, unwrap_or_return};
15 use markup5ever::buffer_queue::BufferQueue;
16 use std::borrow::Cow::Borrowed;
17 use std::char::from_u32;
18 
19 use self::State::*;
20 pub use self::Status::*;
21 
22 //§ tokenizing-character-references
/// The outcome of tokenizing a single character reference.
///
/// A reference expands to zero, one, or two characters. Slots of `chars`
/// at index `num_chars` and beyond are filled with `'\0'` and carry no
/// meaning.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CharRef {
    /// The resulting character(s); only the first `num_chars` entries are
    /// meaningful.
    pub chars: [char; 2],

    /// How many leading slots in `chars` are valid (0, 1, or 2).
    pub num_chars: u8,
}
30 
/// What a single call to `CharRefTokenizer::step` accomplished.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Status {
    /// No input character was available, so nothing could be decided.
    Stuck,
    /// The tokenizer advanced but has not produced a result yet.
    Progress,
    /// A result has been recorded and can be fetched with `get_result`.
    Done,
}
36 
/// Internal states of the character reference state machine; `step`
/// dispatches on the current one.
#[derive(Debug)]
enum State {
    /// Initial state: nothing of the reference has been examined yet.
    Begin,
    /// A `#` has been consumed; the next character selects the numeric base.
    Octothorpe,
    /// Reading the digits of a numeric reference. The payload is the base:
    /// 16 after an `x`/`X` marker, 10 otherwise.
    Numeric(u32), // base
    /// At least one digit was read and a non-digit followed; an optional
    /// `;` is expected here.
    NumericSemicolon,
    /// Accumulating the characters of a named reference and looking them up
    /// in the named-entity table.
    Named,
    /// The name matched nothing; input is still read while it is ASCII
    /// alphanumeric, to decide whether an error should be reported.
    BogusName,
}
46 
/// State machine that tokenizes exactly one character reference.
///
/// Drive it with `step` until `Done` is returned (or call `end_of_file`
/// when the input is exhausted), then take the outcome with `get_result`.
pub struct CharRefTokenizer {
    // Current state of the machine.
    state: State,
    // Extra character that, when it is the first character seen, ends the
    // reference with no result. Named-reference handling also treats
    // `Some(_)` as "inside an attribute value".
    addnl_allowed: Option<char>,
    // The final outcome; `None` until tokenization has finished.
    result: Option<CharRef>,

    // Value accumulated from the digits read so far (wrapping arithmetic).
    num: u32,
    // Set once the accumulated value has gone past 0x10FFFF; the value in
    // `num` is not used for the result after that.
    num_too_big: bool,
    // Whether at least one digit has been consumed.
    seen_digit: bool,
    // The `x` / `X` that introduced a hexadecimal reference, remembered so
    // that it can be put back into the input.
    hex_marker: Option<char>,

    // Characters consumed while in the named states. Populated on entry to
    // `Named`, and taken out again when the name is unconsumed.
    name_buf_opt: Option<StrTendril>,
    // Code point pair of the most recent full match in the entity table; a
    // second value of 0 means the entity expands to a single character.
    name_match: Option<(u32, u32)>,
    // Length of the name buffer at the time `name_match` was recorded; used
    // as an offset into the buffer.
    name_len: usize,
}
61 
62 impl CharRefTokenizer {
63     // NB: We assume that we have an additional allowed character iff we're
64     // tokenizing in an attribute value.
new(addnl_allowed: Option<char>) -> CharRefTokenizer65     pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
66         CharRefTokenizer {
67             state: Begin,
68             addnl_allowed,
69             result: None,
70             num: 0,
71             num_too_big: false,
72             seen_digit: false,
73             hex_marker: None,
74             name_buf_opt: None,
75             name_match: None,
76             name_len: 0,
77         }
78     }
79 
80     // A CharRefTokenizer can only tokenize one character reference,
81     // so this method consumes the tokenizer.
get_result(self) -> CharRef82     pub fn get_result(self) -> CharRef {
83         self.result.expect("get_result called before done")
84     }
85 
name_buf(&self) -> &StrTendril86     fn name_buf(&self) -> &StrTendril {
87         self.name_buf_opt
88             .as_ref()
89             .expect("name_buf missing in named character reference")
90     }
91 
name_buf_mut(&mut self) -> &mut StrTendril92     fn name_buf_mut(&mut self) -> &mut StrTendril {
93         self.name_buf_opt
94             .as_mut()
95             .expect("name_buf missing in named character reference")
96     }
97 
finish_none(&mut self) -> Status98     fn finish_none(&mut self) -> Status {
99         self.result = Some(CharRef {
100             chars: ['\0', '\0'],
101             num_chars: 0,
102         });
103         Done
104     }
105 
finish_one(&mut self, c: char) -> Status106     fn finish_one(&mut self, c: char) -> Status {
107         self.result = Some(CharRef {
108             chars: [c, '\0'],
109             num_chars: 1,
110         });
111         Done
112     }
113 }
114 
115 impl CharRefTokenizer {
step<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status116     pub fn step<Sink: TokenSink>(
117         &mut self,
118         tokenizer: &mut XmlTokenizer<Sink>,
119         input: &mut BufferQueue,
120     ) -> Status {
121         if self.result.is_some() {
122             return Done;
123         }
124 
125         debug!("char ref tokenizer stepping in state {:?}", self.state);
126         match self.state {
127             Begin => self.do_begin(tokenizer, input),
128             Octothorpe => self.do_octothorpe(tokenizer, input),
129             Numeric(base) => self.do_numeric(tokenizer, base, input),
130             NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
131             Named => self.do_named(tokenizer, input),
132             BogusName => self.do_bogus_name(tokenizer, input),
133         }
134     }
135 
do_begin<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status136     fn do_begin<Sink: TokenSink>(
137         &mut self,
138         tokenizer: &mut XmlTokenizer<Sink>,
139         input: &mut BufferQueue,
140     ) -> Status {
141         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
142             '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
143             c if Some(c) == self.addnl_allowed => self.finish_none(),
144 
145             '#' => {
146                 tokenizer.discard_char(input);
147                 self.state = Octothorpe;
148                 Progress
149             },
150 
151             _ => {
152                 self.state = Named;
153                 self.name_buf_opt = Some(StrTendril::new());
154                 Progress
155             },
156         }
157     }
158 
do_octothorpe<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status159     fn do_octothorpe<Sink: TokenSink>(
160         &mut self,
161         tokenizer: &mut XmlTokenizer<Sink>,
162         input: &mut BufferQueue,
163     ) -> Status {
164         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
165         match c {
166             'x' | 'X' => {
167                 tokenizer.discard_char(input);
168                 self.hex_marker = Some(c);
169                 self.state = Numeric(16);
170             },
171 
172             _ => {
173                 self.hex_marker = None;
174                 self.state = Numeric(10);
175             },
176         }
177         Progress
178     }
179 
do_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, base: u32, input: &mut BufferQueue, ) -> Status180     fn do_numeric<Sink: TokenSink>(
181         &mut self,
182         tokenizer: &mut XmlTokenizer<Sink>,
183         base: u32,
184         input: &mut BufferQueue,
185     ) -> Status {
186         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
187         match c.to_digit(base) {
188             Some(n) => {
189                 tokenizer.discard_char(input);
190                 self.num = self.num.wrapping_mul(base);
191                 if self.num > 0x10FFFF {
192                     // We might overflow, and the character is definitely invalid.
193                     // We still parse digits and semicolon, but don't use the result.
194                     self.num_too_big = true;
195                 }
196                 self.num = self.num.wrapping_add(n);
197                 self.seen_digit = true;
198                 Progress
199             },
200 
201             None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
202 
203             None => {
204                 self.state = NumericSemicolon;
205                 Progress
206             },
207         }
208     }
209 
do_numeric_semicolon<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status210     fn do_numeric_semicolon<Sink: TokenSink>(
211         &mut self,
212         tokenizer: &mut XmlTokenizer<Sink>,
213         input: &mut BufferQueue,
214     ) -> Status {
215         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
216             ';' => tokenizer.discard_char(input),
217             _ => tokenizer.emit_error(Borrowed(
218                 "Semicolon missing after numeric character reference",
219             )),
220         };
221         self.finish_numeric(tokenizer)
222     }
223 
unconsume_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status224     fn unconsume_numeric<Sink: TokenSink>(
225         &mut self,
226         tokenizer: &mut XmlTokenizer<Sink>,
227         input: &mut BufferQueue,
228     ) -> Status {
229         let mut unconsume = StrTendril::from_char('#');
230         match self.hex_marker {
231             Some(c) => unconsume.push_char(c),
232             None => (),
233         }
234 
235         tokenizer.unconsume(input, unconsume);
236         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
237         self.finish_none()
238     }
239 
finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status240     fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
241         fn conv(n: u32) -> char {
242             from_u32(n).expect("invalid char missed by error handling cases")
243         }
244 
245         let (c, error) = match self.num {
246             n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
247             0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),
248 
249             0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
250                 Some(c) => (c, true),
251                 None => (conv(self.num), true),
252             },
253 
254             0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),
255 
256             n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),
257 
258             n => (conv(n), false),
259         };
260 
261         if error {
262             let msg = format_if!(
263                 tokenizer.opts.exact_errors,
264                 "Invalid numeric character reference",
265                 "Invalid numeric character reference value 0x{:06X}",
266                 self.num
267             );
268             tokenizer.emit_error(msg);
269         }
270 
271         self.finish_one(c)
272     }
273 
do_named<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status274     fn do_named<Sink: TokenSink>(
275         &mut self,
276         tokenizer: &mut XmlTokenizer<Sink>,
277         input: &mut BufferQueue,
278     ) -> Status {
279         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
280         self.name_buf_mut().push_char(c);
281         match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
282             // We have either a full match or a prefix of one.
283             Some(&m) => {
284                 if m.0 != 0 {
285                     // We have a full match, but there might be a longer one to come.
286                     self.name_match = Some(m);
287                     self.name_len = self.name_buf().len();
288                 }
289                 // Otherwise we just have a prefix match.
290                 Progress
291             },
292 
293             // Can't continue the match.
294             None => self.finish_named(tokenizer, Some(c), input),
295         }
296     }
297 
emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>)298     fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
299         let msg = format_if!(
300             tokenizer.opts.exact_errors,
301             "Invalid character reference",
302             "Invalid character reference &{}",
303             self.name_buf()
304         );
305         tokenizer.emit_error(msg);
306     }
307 
unconsume_name<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, )308     fn unconsume_name<Sink: TokenSink>(
309         &mut self,
310         tokenizer: &mut XmlTokenizer<Sink>,
311         input: &mut BufferQueue,
312     ) {
313         tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
314     }
315 
finish_named<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, end_char: Option<char>, input: &mut BufferQueue, ) -> Status316     fn finish_named<Sink: TokenSink>(
317         &mut self,
318         tokenizer: &mut XmlTokenizer<Sink>,
319         end_char: Option<char>,
320         input: &mut BufferQueue,
321     ) -> Status {
322         match self.name_match {
323             None => {
324                 match end_char {
325                     Some(c) if c.is_ascii_alphanumeric() => {
326                         // Keep looking for a semicolon, to determine whether
327                         // we emit a parse error.
328                         self.state = BogusName;
329                         return Progress;
330                     },
331 
332                     // Check length because &; is not a parse error.
333                     Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),
334 
335                     _ => (),
336                 }
337                 self.unconsume_name(tokenizer, input);
338                 self.finish_none()
339             },
340 
341             Some((c1, c2)) => {
342                 // We have a complete match, but we may have consumed
343                 // additional characters into self.name_buf.  Usually
344                 // at least one, but several in cases like
345                 //
346                 //     &not    => match for U+00AC
347                 //     &noti   => valid prefix for &notin
348                 //     &notit  => can't continue match
349 
350                 let name_len = self.name_len;
351                 assert!(name_len > 0);
352                 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();
353 
354                 // There might not be a next character after the match, if
355                 // we had a full match and then hit EOF.
356                 let next_after = if name_len == self.name_buf().len() {
357                     None
358                 } else {
359                     Some(self.name_buf()[name_len..].chars().next().unwrap())
360                 };
361 
362                 // "If the character reference is being consumed as part of an
363                 // attribute, and the last character matched is not a U+003B
364                 // SEMICOLON character (;), and the next character is either a
365                 // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
366                 // character, then, for historical reasons, all the characters
367                 // that were matched after the U+0026 AMPERSAND character (&)
368                 // must be unconsumed, and nothing is returned. However, if
369                 // this next character is in fact a U+003D EQUALS SIGN
370                 // character (=), then this is a parse error"
371 
372                 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
373                     (_, ';', _) => false,
374                     (Some(_), _, Some('=')) => {
375                         tokenizer.emit_error(Borrowed(
376                             "Equals sign after character reference in attribute",
377                         ));
378                         true
379                     },
380                     (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
381                     _ => {
382                         tokenizer.emit_error(Borrowed(
383                             "Character reference does not end with semicolon",
384                         ));
385                         false
386                     },
387                 };
388 
389                 if unconsume_all {
390                     self.unconsume_name(tokenizer, input);
391                     self.finish_none()
392                 } else {
393                     tokenizer
394                         .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
395                     self.result = Some(CharRef {
396                         chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
397                         num_chars: if c2 == 0 { 1 } else { 2 },
398                     });
399                     Done
400                 }
401             },
402         }
403     }
404 
do_bogus_name<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status405     fn do_bogus_name<Sink: TokenSink>(
406         &mut self,
407         tokenizer: &mut XmlTokenizer<Sink>,
408         input: &mut BufferQueue,
409     ) -> Status {
410         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
411         self.name_buf_mut().push_char(c);
412         match c {
413             _ if c.is_ascii_alphanumeric() => return Progress,
414             ';' => self.emit_name_error(tokenizer),
415             _ => (),
416         }
417         self.unconsume_name(tokenizer, input);
418         self.finish_none()
419     }
420 
end_of_file<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, )421     pub fn end_of_file<Sink: TokenSink>(
422         &mut self,
423         tokenizer: &mut XmlTokenizer<Sink>,
424         input: &mut BufferQueue,
425     ) {
426         while self.result.is_none() {
427             match self.state {
428                 Begin => drop(self.finish_none()),
429 
430                 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),
431 
432                 Numeric(_) | NumericSemicolon => {
433                     tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
434                     self.finish_numeric(tokenizer);
435                 },
436 
437                 Named => drop(self.finish_named(tokenizer, None, input)),
438 
439                 BogusName => {
440                     self.unconsume_name(tokenizer, input);
441                     self.finish_none();
442                 },
443 
444                 Octothorpe => {
445                     tokenizer.unconsume(input, StrTendril::from_slice("#"));
446                     tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
447                     self.finish_none();
448                 },
449             }
450         }
451     }
452 }
453