1 // Copyright 2014 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::{Tokenizer, TokenSink};
11 use super::buffer_queue::BufferQueue;
12 
13 use util::str::{is_ascii_alnum};
14 
15 use tendril::StrTendril;
16 
17 use std::char::from_u32;
18 use std::borrow::Cow::Borrowed;
19 
20 pub use self::Status::*;
21 use self::State::*;
22 
23 mod data;
24 
25 //§ tokenizing-character-references
/// The outcome of tokenizing one character reference: up to two
/// characters (some named references expand to a pair), plus a count
/// of how many slots in `chars` are meaningful.  A `num_chars` of 0
/// means no character reference was consumed (see `finish_none`).
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}
33 
/// Result of a single `step` of the character-reference tokenizer.
pub enum Status {
    /// No more input is available right now; call `step` again when
    /// there is (returned via `unwrap_or_return!` when a peek/get fails).
    Stuck,
    /// Made progress; call `step` again.
    Progress,
    /// Finished; retrieve the outcome with `get_result`.
    Done,
}
39 
/// Internal state machine for character-reference tokenization.
#[derive(Debug)]
enum State {
    /// Just after the initial `&`.
    Begin,
    /// Seen `&#`; deciding between decimal and hexadecimal.
    Octothorpe,
    /// Accumulating digits; the payload is the numeric base (10 or 16).
    Numeric(u32), // base
    /// Digits consumed; expecting the terminating `;`.
    NumericSemicolon,
    /// Matching a named entity against the entity table.
    Named,
    /// Inside an invalid name; scanning ahead for `;` to decide
    /// whether to emit a parse error.
    BogusName,
}
49 
pub struct CharRefTokenizer {
    // Current position in the state machine.
    state: State,
    // Extra character that terminates the reference; `Some` iff we are
    // tokenizing inside an attribute value (see the NB on `new`).
    addnl_allowed: Option<char>,
    // Final answer once computed; `Some` means tokenization is done.
    result: Option<CharRef>,

    // Accumulated numeric value (uses wrapping arithmetic; see
    // `num_too_big`).
    num: u32,
    // Set once `num` exceeded 0x10FFFF, so the (possibly wrapped)
    // value must be treated as invalid.
    num_too_big: bool,
    // Whether at least one digit has been consumed after `&#`.
    seen_digit: bool,
    // The exact `x`/`X` consumed after `#`, kept so it can be
    // unconsumed verbatim on error.
    hex_marker: Option<char>,

    // Name characters consumed so far (named references only).
    name_buf_opt: Option<StrTendril>,
    // Longest full entity match found so far, as a pair of code points
    // (second is 0 for single-character entities).
    name_match: Option<(u32, u32)>,
    // Length of `name_buf` at the point of that longest full match.
    name_len: usize,
}
64 
65 impl CharRefTokenizer {
66     // NB: We assume that we have an additional allowed character iff we're
67     // tokenizing in an attribute value.
new(addnl_allowed: Option<char>) -> CharRefTokenizer68     pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
69         CharRefTokenizer {
70             state: Begin,
71             addnl_allowed: addnl_allowed,
72             result: None,
73             num: 0,
74             num_too_big: false,
75             seen_digit: false,
76             hex_marker: None,
77             name_buf_opt: None,
78             name_match: None,
79             name_len: 0,
80         }
81     }
82 
83     // A CharRefTokenizer can only tokenize one character reference,
84     // so this method consumes the tokenizer.
get_result(self) -> CharRef85     pub fn get_result(self) -> CharRef {
86         self.result.expect("get_result called before done")
87     }
88 
name_buf<'t>(&'t self) -> &'t StrTendril89     fn name_buf<'t>(&'t self) -> &'t StrTendril {
90         self.name_buf_opt.as_ref()
91             .expect("name_buf missing in named character reference")
92     }
93 
name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril94     fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril {
95         self.name_buf_opt.as_mut()
96             .expect("name_buf missing in named character reference")
97     }
98 
finish_none(&mut self) -> Status99     fn finish_none(&mut self) -> Status {
100         self.result = Some(CharRef {
101             chars: ['\0', '\0'],
102             num_chars: 0,
103         });
104         Done
105     }
106 
finish_one(&mut self, c: char) -> Status107     fn finish_one(&mut self, c: char) -> Status {
108         self.result = Some(CharRef {
109             chars: [c, '\0'],
110             num_chars: 1,
111         });
112         Done
113     }
114 }
115 
116 impl CharRefTokenizer {
    /// Advance the state machine by at most one input character.
    ///
    /// Returns `Done` immediately if a result has already been
    /// recorded; otherwise dispatches to the handler for the current
    /// state, which yields `Stuck` when input runs dry.
    pub fn step<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, input, base),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }
136 
do_begin<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status137     fn do_begin<Sink: TokenSink>(
138             &mut self,
139             tokenizer: &mut Tokenizer<Sink>,
140             input: &mut BufferQueue)
141             -> Status {
142         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
143             '\t' | '\n' | '\x0C' | ' ' | '<' | '&'
144                 => self.finish_none(),
145             c if Some(c) == self.addnl_allowed
146                 => self.finish_none(),
147 
148             '#' => {
149                 tokenizer.discard_char(input);
150                 self.state = Octothorpe;
151                 Progress
152             }
153 
154             _ => {
155                 self.state = Named;
156                 self.name_buf_opt = Some(StrTendril::new());
157                 Progress
158             }
159         }
160     }
161 
do_octothorpe<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status162     fn do_octothorpe<Sink: TokenSink>(
163             &mut self,
164             tokenizer: &mut Tokenizer<Sink>,
165             input: &mut BufferQueue)
166             -> Status {
167         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
168         match c {
169             'x' | 'X' => {
170                 tokenizer.discard_char(input);
171                 self.hex_marker = Some(c);
172                 self.state = Numeric(16);
173             }
174 
175             _ => {
176                 self.hex_marker = None;
177                 self.state = Numeric(10);
178             }
179         }
180         Progress
181     }
182 
do_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, base: u32) -> Status183     fn do_numeric<Sink: TokenSink>(
184             &mut self,
185             tokenizer: &mut Tokenizer<Sink>,
186             input: &mut BufferQueue,
187             base: u32)
188             -> Status {
189         let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
190         match c.to_digit(base) {
191             Some(n) => {
192                 tokenizer.discard_char(input);
193                 self.num = self.num.wrapping_mul(base);
194                 if self.num > 0x10FFFF {
195                     // We might overflow, and the character is definitely invalid.
196                     // We still parse digits and semicolon, but don't use the result.
197                     self.num_too_big = true;
198                 }
199                 self.num = self.num.wrapping_add(n);
200                 self.seen_digit = true;
201                 Progress
202             }
203 
204             None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),
205 
206             None => {
207                 self.state = NumericSemicolon;
208                 Progress
209             }
210         }
211     }
212 
do_numeric_semicolon<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status213     fn do_numeric_semicolon<Sink: TokenSink>(
214             &mut self,
215             tokenizer: &mut Tokenizer<Sink>,
216             input: &mut BufferQueue)
217             -> Status {
218         match unwrap_or_return!(tokenizer.peek(input), Stuck) {
219             ';' => tokenizer.discard_char(input),
220             _   => tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference")),
221         };
222         self.finish_numeric(tokenizer)
223     }
224 
unconsume_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status225     fn unconsume_numeric<Sink: TokenSink>(
226             &mut self,
227             tokenizer: &mut Tokenizer<Sink>,
228             input: &mut BufferQueue)
229             -> Status {
230         let mut unconsume = StrTendril::from_char('#');
231         match self.hex_marker {
232             Some(c) => unconsume.push_char(c),
233             None => (),
234         }
235 
236         input.push_front(unconsume);
237         tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
238         self.finish_none()
239     }
240 
    /// Map the accumulated numeric value to a character, applying the
    /// error cases for numeric references, record it as the result,
    /// and emit a parse error if required.  The match arms are
    /// ordered: more specific invalid ranges are checked first.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status {
        // All invalid code points are filtered out by the arms below,
        // so this conversion cannot fail.
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (including wrapped overflow):
            // substitute U+FFFD REPLACEMENT CHARACTER.
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogate code points: also replaced.
            0x00 | 0xD800...0xDFFF => ('\u{fffd}', true),

            // C1 control range: remap via the table in `data`
            // (presumably the spec's Windows-1252 substitutions —
            // confirm against `data::C1_REPLACEMENTS`); either way
            // it is a parse error.
            0x80...0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and the U+FDD0..U+FDEF
            // noncharacters: keep the character but flag an error.
            0x01...0x08 | 0x0B | 0x0D...0x1F | 0x7F | 0xFDD0...0xFDEF
                => (conv(self.num), true),

            // The U+xxFFFE / U+xxFFFF noncharacters in every plane.
            n if (n & 0xFFFE) == 0xFFFE
                => (conv(n), true),

            // Anything else is a valid character.
            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}", self.num);
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }
273 
    /// Consume one character of a named reference, append it to the
    /// name buffer, and look the buffer up in the entity table.
    fn do_named<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue)
            -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            // An entry whose first code point is 0 marks a key that is
            // only a prefix of some longer entity name.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            }

            // Can't continue the match.
            None => self.finish_named(tokenizer, input, Some(c)),
        }
    }
297 
    /// Report an invalid named character reference, including the
    /// offending name in the message when `exact_errors` is set.
    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) {
        let msg = format_if!(tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}", self.name_buf());
        tokenizer.emit_error(msg);
    }
304 
unconsume_name(&mut self, input: &mut BufferQueue)305     fn unconsume_name(&mut self, input: &mut BufferQueue) {
306         input.push_front(self.name_buf_opt.take().unwrap());
307     }
308 
    /// Resolve a named reference once matching can no longer continue
    /// (or at EOF, signalled by `end_char == None`).
    ///
    /// `end_char` is the character that ended the match; it has
    /// already been appended to `name_buf`.
    fn finish_named<Sink: TokenSink>(&mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue,
            end_char: Option<char>) -> Status {
        match self.name_match {
            // No full entity name was ever matched.
            None => {
                match end_char {
                    Some(c) if is_ascii_alnum(c) => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    }

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1
                        => self.emit_name_error(tokenizer),

                    _ => (),
                }
                // Put everything back and return no reference.
                self.unconsume_name(input);
                self.finish_none()
            }

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf.  Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len-1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed("Equals sign after character reference in attribute"));
                        true
                    }
                    (Some(_), _, Some(c)) if is_ascii_alnum(c) => true,
                    _ => {
                        tokenizer.emit_error(Borrowed("Character reference does not end with semicolon"));
                        false
                    }
                };

                if unconsume_all {
                    self.unconsume_name(input);
                    self.finish_none()
                } else {
                    // Unconsume whatever trailed the matched name, then
                    // emit the matched character(s).
                    input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            }
        }
    }
391 
do_bogus_name<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status392     fn do_bogus_name<Sink: TokenSink>(
393             &mut self,
394             tokenizer: &mut Tokenizer<Sink>,
395             input: &mut BufferQueue)
396             -> Status {
397         let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
398         self.name_buf_mut().push_char(c);
399         match c {
400             _ if is_ascii_alnum(c) => return Progress,
401             ';' => self.emit_name_error(tokenizer),
402             _ => ()
403         }
404         self.unconsume_name(input);
405         self.finish_none()
406     }
407 
    /// Handle end of input: drive the state machine to completion,
    /// emitting errors and unconsuming input as each state requires.
    /// Loops because some states (e.g. `Named`) may transition before
    /// a result is recorded.
    pub fn end_of_file<Sink: TokenSink>(
            &mut self,
            tokenizer: &mut Tokenizer<Sink>,
            input: &mut BufferQueue) {
        while self.result.is_none() {
            match self.state {
                // Bare `&` at EOF: no reference.
                Begin => drop(self.finish_none()),

                // `&#` / `&#x` with no digits: unconsume and give up.
                // NB: this guard arm must precede the general Numeric arm.
                Numeric(_) if !self.seen_digit
                    => drop(self.unconsume_numeric(tokenizer, input)),

                // Digits were seen but EOF hit before `;`: report,
                // then still use the accumulated value.
                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                }

                // Resolve whatever (partial) name we have, with no end char.
                Named => drop(self.finish_named(tokenizer, input, None)),

                BogusName => {
                    self.unconsume_name(input);
                    self.finish_none();
                }

                Octothorpe => {
                    input.push_front(StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                }
            }
        }
    }
439 }
440