// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::{TokenSink, XmlTokenizer};
use crate::data;
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;
use crate::tendril::StrTendril;
use crate::util::is_ascii_alnum;

use self::State::*;
pub use self::Status::*;

//§ tokenizing-character-references
pub struct CharRef {
    /// The resulting character(s)
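    /// Some named references expand to two code points, which is why two
    /// slots are provided.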
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}

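/// Outcome of a single `step` of the character-reference tokenizer.
///
/// `Stuck` means the input queue ran dry and the caller must provide more
/// input before stepping again; `Progress` means the tokenizer advanced and
/// should be stepped again; `Done` means a result is ready to be taken with
/// `get_result`.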
pub enum Status {
    Stuck,
    Progress,
    Done,
}

#[derive(Debug)]
enum State {
    /// Just after the initial `&`.
    Begin,
    /// Seen `&#`; decide between decimal and hexadecimal.
    Octothorpe,
    /// Accumulating digits in the given base (10 or 16).
    Numeric(u32), // base
    /// Digits are done; expecting the terminating `;`.
    NumericSemicolon,
    /// Matching a named reference against `data::NAMED_ENTITIES`.
    Named,
    /// The name cannot match any entity; keep consuming alphanumerics to
    /// decide whether a parse error should be reported.
    BogusName,
}

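/// Tokenizes a single character reference on behalf of the outer
/// `XmlTokenizer`. A rough sketch of the intended driving loop; the real
/// driver lives in the parent tokenizer and the variable names below are
/// illustrative only:
///
/// ```ignore
/// let mut char_ref = CharRefTokenizer::new(None);
/// loop {
///     match char_ref.step(tokenizer, input) {
///         Done => break,
///         Stuck => return, // come back once more input is available
///         Progress => (),
///     }
/// }
/// let CharRef { chars, num_chars } = char_ref.get_result();
/// ```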
pub struct CharRefTokenizer {
    state: State,
    addnl_allowed: Option<char>,
    result: Option<CharRef>,

    num: u32,
    num_too_big: bool,
    seen_digit: bool,
    hex_marker: Option<char>,

    name_buf_opt: Option<StrTendril>,
    name_match: Option<(u32, u32)>,
    name_len: usize,
}

impl CharRefTokenizer {
    // NB: We assume that we have an additional allowed character iff we're
    // tokenizing in an attribute value.
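    // For example, a caller tokenizing inside a double-quoted attribute value
    // would presumably pass `Some('"')`; elsewhere it passes `None`.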
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            state: Begin,
            addnl_allowed,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    fn name_buf<'t>(&'t self) -> &'t StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

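// Stepping state machine: `step` dispatches to one handler per `State`
// variant. Each handler consumes input and returns `Progress`, reports
// `Stuck` when the input queue runs dry, or records a result and returns
// `Done`.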
impl CharRefTokenizer {
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, base, input),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
            c if Some(c) == self.addnl_allowed => self.finish_none(),

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },

            _ => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
        }
    }

    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        base: u32,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
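                // Illustrative case: for `&#x110000;` the multiplication above
                // pushes `num` past 0x10FFFF on the final digit, so `num_too_big`
                // is set and `finish_numeric` will later emit U+FFFD along with
                // a parse error.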
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        if let Some(c) = self.hex_marker {
            unconsume.push_char(c);
        }

        tokenizer.unconsume(input, unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Above the Unicode range, or overflowed during parsing:
            // substitute the replacement character.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and the surrogate range also become the replacement character.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: use the replacement table where an entry exists,
            // otherwise keep the code point; either way it's a parse error.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and the U+FDD0..U+FDEF noncharacters:
            // keep the code point but flag a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // Code points ending in FFFE or FFFF are noncharacters as well.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
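        // NAMED_ENTITIES also contains an entry for every proper prefix of an
        // entity name; those prefix-only entries carry a zero first code point,
        // which is what the `m.0 != 0` test below distinguishes from a complete
        // match.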
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, Some(c), input),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    fn unconsume_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
    }

    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        end_char: Option<char>,
        input: &mut BufferQueue,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if is_ascii_alnum(c) => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(tokenizer, input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none()
                } else {
                    tokenizer
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match c {
            _ if is_ascii_alnum(c) => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(tokenizer, input);
        self.finish_none()
    }

    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, None, input)),

                BogusName => {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none();
                },

                Octothorpe => {
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}