1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! The HTML5 tokenizer.
11 
12 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14 pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15 pub use self::interface::{TokenSink, TokenSinkResult};
16 
17 use self::states::{DoctypeIdKind, Public, System};
18 use self::states::{DoubleEscaped, Escaped};
19 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20 use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21 
22 use self::char_ref::{CharRef, CharRefTokenizer};
23 
24 use crate::util::str::lower_ascii_letter;
25 
26 use log::debug;
27 use mac::{format_if, matches, _tt_as_expr_hack};
28 use markup5ever::{namespace_url, ns, small_char_set};
29 use std::borrow::Cow::{self, Borrowed};
30 use std::collections::BTreeMap;
31 use std::default::Default;
32 use std::mem::replace;
33 
34 pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
35 use crate::tendril::StrTendril;
36 use crate::{Attribute, LocalName, QualName, SmallCharSet};
37 
38 mod char_ref;
39 mod interface;
40 pub mod states;
41 
/// Outcome of a single tokenizer step, consumed by the driver loop in `run`.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now; `run` leaves its loop and reports
    /// `TokenizerResult::Done`.
    Suspend,
    /// Stop stepping and pass the handle back to the caller as
    /// `TokenizerResult::Script`.
    Script(Handle),
}
47 
/// Result of feeding input to the tokenizer (see `Tokenizer::feed`).
#[must_use]
pub enum TokenizerResult<Handle> {
    /// The tokenizer had nothing to do or ran until a step asked to suspend.
    Done,
    /// A step produced a script handle (originating from the token sink);
    /// tokenizing stopped so the caller can deal with it.
    Script(Handle),
}
53 
option_push(opt_str: &mut Option<StrTendril>, c: char)54 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
55     match *opt_str {
56         Some(ref mut s) => s.push_char(c),
57         None => *opt_str = Some(StrTendril::from_char(c)),
58     }
59 }
60 
/// Tokenizer options, with an impl for `Default`.
///
/// The options are passed by value to `Tokenizer::new`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  When `None`, the tokenizer starts in the
    /// `Data` state.
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
87 
88 impl Default for TokenizerOpts {
default() -> TokenizerOpts89     fn default() -> TokenizerOpts {
90         TokenizerOpts {
91             exact_errors: false,
92             discard_bom: true,
93             profile: false,
94             initial_state: None,
95             last_start_tag_name: None,
96         }
97     }
98 }
99 
/// The HTML tokenizer.
///
/// Holds the state-machine state plus the partially built token pieces
/// (tag, attribute, comment, doctype) and forwards finished tokens to `sink`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.  Empty when no attribute is in progress.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Current line number.  Starts at 1 and is bumped for every newline
    /// seen by input preprocessing; passed to the sink with each token.
    current_line: u64,
}
172 
173 impl<Sink: TokenSink> Tokenizer<Sink> {
174     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink>175     pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
176         let start_tag_name = opts
177             .last_start_tag_name
178             .take()
179             .map(|s| LocalName::from(&*s));
180         let state = opts.initial_state.unwrap_or(states::Data);
181         let discard_bom = opts.discard_bom;
182         Tokenizer {
183             opts: opts,
184             sink: sink,
185             state: state,
186             char_ref_tokenizer: None,
187             at_eof: false,
188             current_char: '\0',
189             reconsume: false,
190             ignore_lf: false,
191             discard_bom: discard_bom,
192             current_tag_kind: StartTag,
193             current_tag_name: StrTendril::new(),
194             current_tag_self_closing: false,
195             current_tag_attrs: vec![],
196             current_attr_name: StrTendril::new(),
197             current_attr_value: StrTendril::new(),
198             current_comment: StrTendril::new(),
199             current_doctype: Doctype::new(),
200             last_start_tag_name: start_tag_name,
201             temp_buf: StrTendril::new(),
202             state_profile: BTreeMap::new(),
203             time_in_sink: 0,
204             current_line: 1,
205         }
206     }
207 
208     /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>209     pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
210         if input.is_empty() {
211             return TokenizerResult::Done;
212         }
213 
214         if self.discard_bom {
215             if let Some(c) = input.peek() {
216                 if c == '\u{feff}' {
217                     input.next();
218                 }
219             } else {
220                 return TokenizerResult::Done;
221             }
222         };
223 
224         self.run(input)
225     }
226 
    /// Switch the tokenizer to the `Plaintext` state.
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }
230 
process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle>231     fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
232         if self.opts.profile {
233             let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
234             self.time_in_sink += dt;
235             ret
236         } else {
237             self.sink.process_token(token, self.current_line)
238         }
239     }
240 
    /// Send `token` to the sink and assert that the sink answers
    /// `TokenSinkResult::Continue`; any other answer panics.
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
247 
248     //§ preprocessing-the-input-stream
249     // Get the next input character, which might be the character
250     // 'c' that we already consumed from the buffers.
get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char>251     fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
252         if self.ignore_lf {
253             self.ignore_lf = false;
254             if c == '\n' {
255                 c = unwrap_or_return!(input.next(), None);
256             }
257         }
258 
259         if c == '\r' {
260             self.ignore_lf = true;
261             c = '\n';
262         }
263 
264         if c == '\n' {
265             self.current_line += 1;
266         }
267 
268         if self.opts.exact_errors &&
269             match c as u32 {
270                 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
271                 n if (n & 0xFFFE) == 0xFFFE => true,
272                 _ => false,
273             }
274         {
275             let msg = format!("Bad character {}", c);
276             self.emit_error(Cow::Owned(msg));
277         }
278 
279         debug!("got character {}", c);
280         self.current_char = c;
281         Some(c)
282     }
283 
284     //§ tokenization
285     // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>286     fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
287         if self.reconsume {
288             self.reconsume = false;
289             Some(self.current_char)
290         } else {
291             input
292                 .next()
293                 .and_then(|c| self.get_preprocessed_char(c, input))
294         }
295     }
296 
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>297     fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
298         // Bail to the slow path for various corner cases.
299         // This means that `FromSet` can contain characters not in the set!
300         // It shouldn't matter because the fallback `FromSet` case should
301         // always do the same thing as the `NotFromSet` case.
302         if self.opts.exact_errors || self.reconsume || self.ignore_lf {
303             return self.get_char(input).map(|x| FromSet(x));
304         }
305 
306         let d = input.pop_except_from(set);
307         debug!("got characters {:?}", d);
308         match d {
309             Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
310 
311             // NB: We don't set self.current_char for a run of characters not
312             // in the set.  It shouldn't matter for the codepaths that use
313             // this.
314             _ => d,
315         }
316     }
317 
318     // Check if the next characters are an ASCII case-insensitive match.  See
319     // BufferQueue::eat.
320     //
321     // NB: this doesn't do input stream preprocessing or set the current input
322     // character.
eat( &mut self, input: &mut BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool, ) -> Option<bool>323     fn eat(
324         &mut self,
325         input: &mut BufferQueue,
326         pat: &str,
327         eq: fn(&u8, &u8) -> bool,
328     ) -> Option<bool> {
329         input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
330         match input.eat(pat, eq) {
331             None if self.at_eof => Some(false),
332             None => {
333                 while let Some(c) = input.next() {
334                     self.temp_buf.push_char(c);
335                 }
336                 None
337             },
338             Some(matched) => Some(matched),
339         }
340     }
341 
342     /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>343     fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
344         if self.opts.profile {
345             loop {
346                 let state = self.state;
347                 let old_sink = self.time_in_sink;
348                 let (run, mut dt) = time!(self.step(input));
349                 dt -= (self.time_in_sink - old_sink);
350                 let new = match self.state_profile.get_mut(&state) {
351                     Some(x) => {
352                         *x += dt;
353                         false
354                     },
355                     None => true,
356                 };
357                 if new {
358                     // do this here because of borrow shenanigans
359                     self.state_profile.insert(state, dt);
360                 }
361                 match run {
362                     ProcessResult::Continue => (),
363                     ProcessResult::Suspend => break,
364                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
365                 }
366             }
367         } else {
368             loop {
369                 match self.step(input) {
370                     ProcessResult::Continue => (),
371                     ProcessResult::Suspend => break,
372                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
373                 }
374             }
375         }
376         TokenizerResult::Done
377     }
378 
bad_char_error(&mut self)379     fn bad_char_error(&mut self) {
380         let msg = format_if!(
381             self.opts.exact_errors,
382             "Bad character",
383             "Saw {} in state {:?}",
384             self.current_char,
385             self.state
386         );
387         self.emit_error(msg);
388     }
389 
bad_eof_error(&mut self)390     fn bad_eof_error(&mut self) {
391         let msg = format_if!(
392             self.opts.exact_errors,
393             "Unexpected EOF",
394             "Saw EOF in state {:?}",
395             self.state
396         );
397         self.emit_error(msg);
398     }
399 
emit_char(&mut self, c: char)400     fn emit_char(&mut self, c: char) {
401         self.process_token_and_continue(match c {
402             '\0' => NullCharacterToken,
403             _ => CharacterTokens(StrTendril::from_char(c)),
404         });
405     }
406 
407     // The string must not contain '\0'!
emit_chars(&mut self, b: StrTendril)408     fn emit_chars(&mut self, b: StrTendril) {
409         self.process_token_and_continue(CharacterTokens(b));
410     }
411 
emit_current_tag(&mut self) -> ProcessResult<Sink::Handle>412     fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
413         self.finish_attribute();
414 
415         let name = LocalName::from(&*self.current_tag_name);
416         self.current_tag_name.clear();
417 
418         match self.current_tag_kind {
419             StartTag => {
420                 self.last_start_tag_name = Some(name.clone());
421             },
422             EndTag => {
423                 if !self.current_tag_attrs.is_empty() {
424                     self.emit_error(Borrowed("Attributes on an end tag"));
425                 }
426                 if self.current_tag_self_closing {
427                     self.emit_error(Borrowed("Self-closing end tag"));
428                 }
429             },
430         }
431 
432         let token = TagToken(Tag {
433             kind: self.current_tag_kind,
434             name: name,
435             self_closing: self.current_tag_self_closing,
436             attrs: replace(&mut self.current_tag_attrs, vec![]),
437         });
438 
439         match self.process_token(token) {
440             TokenSinkResult::Continue => ProcessResult::Continue,
441             TokenSinkResult::Plaintext => {
442                 self.state = states::Plaintext;
443                 ProcessResult::Continue
444             },
445             TokenSinkResult::Script(node) => {
446                 self.state = states::Data;
447                 ProcessResult::Script(node)
448             },
449             TokenSinkResult::RawData(kind) => {
450                 self.state = states::RawData(kind);
451                 ProcessResult::Continue
452             },
453         }
454     }
455 
emit_temp_buf(&mut self)456     fn emit_temp_buf(&mut self) {
457         // FIXME: Make sure that clearing on emit is spec-compatible.
458         let buf = replace(&mut self.temp_buf, StrTendril::new());
459         self.emit_chars(buf);
460     }
461 
    /// Empty the temporary buffer.
    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }
466 
emit_current_comment(&mut self)467     fn emit_current_comment(&mut self) {
468         let comment = replace(&mut self.current_comment, StrTendril::new());
469         self.process_token_and_continue(CommentToken(comment));
470     }
471 
discard_tag(&mut self)472     fn discard_tag(&mut self) {
473         self.current_tag_name.clear();
474         self.current_tag_self_closing = false;
475         self.current_tag_attrs = vec![];
476     }
477 
create_tag(&mut self, kind: TagKind, c: char)478     fn create_tag(&mut self, kind: TagKind, c: char) {
479         self.discard_tag();
480         self.current_tag_name.push_char(c);
481         self.current_tag_kind = kind;
482     }
483 
have_appropriate_end_tag(&self) -> bool484     fn have_appropriate_end_tag(&self) -> bool {
485         match self.last_start_tag_name.as_ref() {
486             Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
487             None => false,
488         }
489     }
490 
    /// Start a new attribute whose name begins with `c`.
    fn create_attribute(&mut self, c: char) {
        // Commit (or drop, if duplicate) the attribute that was in progress.
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
496 
finish_attribute(&mut self)497     fn finish_attribute(&mut self) {
498         if self.current_attr_name.len() == 0 {
499             return;
500         }
501 
502         // Check for a duplicate attribute.
503         // FIXME: the spec says we should error as soon as the name is finished.
504         // FIXME: linear time search, do we care?
505         let dup = {
506             let name = &*self.current_attr_name;
507             self.current_tag_attrs
508                 .iter()
509                 .any(|a| &*a.name.local == name)
510         };
511 
512         if dup {
513             self.emit_error(Borrowed("Duplicate attribute"));
514             self.current_attr_name.clear();
515             self.current_attr_value.clear();
516         } else {
517             let name = LocalName::from(&*self.current_attr_name);
518             self.current_attr_name.clear();
519             self.current_tag_attrs.push(Attribute {
520                 // The tree builder will adjust the namespace if necessary.
521                 // This only happens in foreign elements.
522                 name: QualName::new(None, ns!(), name),
523                 value: replace(&mut self.current_attr_value, StrTendril::new()),
524             });
525         }
526     }
527 
emit_current_doctype(&mut self)528     fn emit_current_doctype(&mut self) {
529         let doctype = replace(&mut self.current_doctype, Doctype::new());
530         self.process_token_and_continue(DoctypeToken(doctype));
531     }
532 
doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril>533     fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
534         match kind {
535             Public => &mut self.current_doctype.public_id,
536             System => &mut self.current_doctype.system_id,
537         }
538     }
539 
clear_doctype_id(&mut self, kind: DoctypeIdKind)540     fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
541         let id = self.doctype_id(kind);
542         match *id {
543             Some(ref mut s) => s.clear(),
544             None => *id = Some(StrTendril::new()),
545         }
546     }
547 
consume_char_ref(&mut self, addnl_allowed: Option<char>)548     fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
549         // NB: The char ref tokenizer assumes we have an additional allowed
550         // character iff we're tokenizing in an attribute value.
551         self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
552     }
553 
emit_eof(&mut self)554     fn emit_eof(&mut self) {
555         self.process_token_and_continue(EOFToken);
556     }
557 
peek(&mut self, input: &BufferQueue) -> Option<char>558     fn peek(&mut self, input: &BufferQueue) -> Option<char> {
559         if self.reconsume {
560             Some(self.current_char)
561         } else {
562             input.peek()
563         }
564     }
565 
discard_char(&mut self, input: &mut BufferQueue)566     fn discard_char(&mut self, input: &mut BufferQueue) {
567         self.get_char(input);
568     }
569 
emit_error(&mut self, error: Cow<'static, str>)570     fn emit_error(&mut self, error: Cow<'static, str>) {
571         self.process_token_and_continue(ParseError(error));
572     }
573 }
574 //§ END
575 
// Shorthand for common state machine behaviors.
//
// Each arm maps a short command word (plus its arguments) to one method call
// or field update on the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.force_quirks = true;             );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
);
603 
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Logs the stringified shorthand command at debug level, then runs it via
// `shorthand!`.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    // `stringify!` yields a `&str`, so plain `{}` is the right format spec.
    debug!("  {}", stringify!($($cmds)*));
    // Forward `$me` as-is; a fragment specifier here would emit stray
    // `: expr` tokens that no `shorthand!` arm accepts.
    shorthand!($me: $($cmds)*);
}));
611 
// Without tracing, `sh_trace!` simply forwards to `shorthand!`.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
614 
// A little DSL for sequencing shorthand actions.
//
// `go!(self: cmd; cmd; ...; last)` runs each `;`-terminated command through
// `sh_trace!` and then handles `last`.  The terminal forms below (`to`,
// `reconsume`, `consume_char_ref`, `emit_tag`, `eof`) return from the
// enclosing function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    // One arm per command length (one to four token trees before the `;`).
    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to state `$s` (optionally with one or two nested arguments).
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    // Same as `to`, but the current character is reconsumed in the new state.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Start tokenizing a character reference.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    // Emit the EOF token and suspend the state machine.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
652 
// Run the `go!` commands if `$x` matches any of the listed patterns;
// otherwise do nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
659 
// Get the next character via `Tokenizer::get_char`, or suspend if none is
// available.
//
// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
665 
// Peek at the next character via `Tokenizer::peek`; like `get_char!`, the
// enclosing function returns `ProcessResult::Suspend` if there is none.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
669 
// Call `Tokenizer::pop_except_from`; the enclosing function returns
// `ProcessResult::Suspend` if it yields nothing.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
673 
// Try to consume `$pat` from the input, ignoring ASCII case; the enclosing
// function returns `ProcessResult::Suspend` if `Tokenizer::eat` cannot
// decide yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
677 
// Like `eat!`, but bytes are compared exactly (case-sensitively).
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
681 
682 impl<Sink: TokenSink> Tokenizer<Sink> {
    // Run the state machine for a while.
    // Return `ProcessResult::Continue` if we should be immediately
    // re-invoked (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle>686     fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
687         if self.char_ref_tokenizer.is_some() {
688             return self.step_char_ref_tokenizer(input);
689         }
690 
691         debug!("processing in state {:?}", self.state);
692         match self.state {
693             //§ data-state
694             states::Data => loop {
695                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
696                     FromSet('\0') => go!(self: error; emit '\0'),
697                     FromSet('&') => go!(self: consume_char_ref),
698                     FromSet('<') => go!(self: to TagOpen),
699                     FromSet(c) => go!(self: emit c),
700                     NotFromSet(b) => self.emit_chars(b),
701                 }
702             },
703 
704             //§ rcdata-state
705             states::RawData(Rcdata) => loop {
706                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
707                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
708                     FromSet('&') => go!(self: consume_char_ref),
709                     FromSet('<') => go!(self: to RawLessThanSign Rcdata),
710                     FromSet(c) => go!(self: emit c),
711                     NotFromSet(b) => self.emit_chars(b),
712                 }
713             },
714 
715             //§ rawtext-state
716             states::RawData(Rawtext) => loop {
717                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
718                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
719                     FromSet('<') => go!(self: to RawLessThanSign Rawtext),
720                     FromSet(c) => go!(self: emit c),
721                     NotFromSet(b) => self.emit_chars(b),
722                 }
723             },
724 
725             //§ script-data-state
726             states::RawData(ScriptData) => loop {
727                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
728                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
729                     FromSet('<') => go!(self: to RawLessThanSign ScriptData),
730                     FromSet(c) => go!(self: emit c),
731                     NotFromSet(b) => self.emit_chars(b),
732                 }
733             },
734 
735             //§ script-data-escaped-state
736             states::RawData(ScriptDataEscaped(Escaped)) => loop {
737                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
738                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
739                     FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
740                     FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
741                     FromSet(c) => go!(self: emit c),
742                     NotFromSet(b) => self.emit_chars(b),
743                 }
744             },
745 
746             //§ script-data-double-escaped-state
747             states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
748                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
749                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
750                     FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
751                     FromSet('<') => {
752                         go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
753                     },
754                     FromSet(c) => go!(self: emit c),
755                     NotFromSet(b) => self.emit_chars(b),
756                 }
757             },
758 
759             //§ plaintext-state
760             states::Plaintext => loop {
761                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
762                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
763                     FromSet(c) => go!(self: emit c),
764                     NotFromSet(b) => self.emit_chars(b),
765                 }
766             },
767 
768             //§ tag-open-state
769             states::TagOpen => loop {
770                 match get_char!(self, input) {
771                     '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
772                     '/' => go!(self: to EndTagOpen),
773                     '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
774                     c => match lower_ascii_letter(c) {
775                         Some(cl) => go!(self: create_tag StartTag cl; to TagName),
776                         None => go!(self: error; emit '<'; reconsume Data),
777                     },
778                 }
779             },
780 
781             //§ end-tag-open-state
782             states::EndTagOpen => loop {
783                 match get_char!(self, input) {
784                     '>' => go!(self: error; to Data),
785                     '\0' => {
786                         go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
787                     },
788                     c => match lower_ascii_letter(c) {
789                         Some(cl) => go!(self: create_tag EndTag cl; to TagName),
790                         None => go!(self: error; clear_comment; push_comment c; to BogusComment),
791                     },
792                 }
793             },
794 
795             //§ tag-name-state
796             states::TagName => loop {
797                 match get_char!(self, input) {
798                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
799                     '/' => go!(self: to SelfClosingStartTag),
800                     '>' => go!(self: emit_tag Data),
801                     '\0' => go!(self: error; push_tag '\u{fffd}'),
802                     c => go!(self: push_tag (c.to_ascii_lowercase())),
803                 }
804             },
805 
806             //§ script-data-escaped-less-than-sign-state
807             states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
808                 match get_char!(self, input) {
809                     '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
810                     c => match lower_ascii_letter(c) {
811                         Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
812                                     to ScriptDataEscapeStart DoubleEscaped),
813                         None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
814                     },
815                 }
816             },
817 
818             //§ script-data-double-escaped-less-than-sign-state
819             states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
820                 match get_char!(self, input) {
821                     '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
822                     _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
823                 }
824             },
825 
826             //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
827             // otherwise
828             states::RawLessThanSign(kind) => loop {
829                 match get_char!(self, input) {
830                     '/' => go!(self: clear_temp; to RawEndTagOpen kind),
831                     '!' if kind == ScriptData => {
832                         go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
833                     },
834                     _ => go!(self: emit '<'; reconsume RawData kind),
835                 }
836             },
837 
838             //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
839             states::RawEndTagOpen(kind) => loop {
840                 let c = get_char!(self, input);
841                 match lower_ascii_letter(c) {
842                     Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
843                     None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
844                 }
845             },
846 
847             //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
848             states::RawEndTagName(kind) => loop {
849                 let c = get_char!(self, input);
850                 if self.have_appropriate_end_tag() {
851                     match c {
852                         '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
853                         '/' => go!(self: to SelfClosingStartTag),
854                         '>' => go!(self: emit_tag Data),
855                         _ => (),
856                     }
857                 }
858 
859                 match lower_ascii_letter(c) {
860                     Some(cl) => go!(self: push_tag cl; push_temp c),
861                     None => {
862                         go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
863                     },
864                 }
865             },
866 
867             //§ script-data-double-escape-start-state
868             states::ScriptDataEscapeStart(DoubleEscaped) => loop {
869                 let c = get_char!(self, input);
870                 match c {
871                     '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
872                         let esc = if &*self.temp_buf == "script" {
873                             DoubleEscaped
874                         } else {
875                             Escaped
876                         };
877                         go!(self: emit c; to RawData ScriptDataEscaped esc);
878                     },
879                     _ => match lower_ascii_letter(c) {
880                         Some(cl) => go!(self: push_temp cl; emit c),
881                         None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
882                     },
883                 }
884             },
885 
886             //§ script-data-escape-start-state
887             states::ScriptDataEscapeStart(Escaped) => loop {
888                 match get_char!(self, input) {
889                     '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
890                     _ => go!(self: reconsume RawData ScriptData),
891                 }
892             },
893 
894             //§ script-data-escape-start-dash-state
895             states::ScriptDataEscapeStartDash => loop {
896                 match get_char!(self, input) {
897                     '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
898                     _ => go!(self: reconsume RawData ScriptData),
899                 }
900             },
901 
902             //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
903             states::ScriptDataEscapedDash(kind) => loop {
904                 match get_char!(self, input) {
905                     '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
906                     '<' => {
907                         if kind == DoubleEscaped {
908                             go!(self: emit '<');
909                         }
910                         go!(self: to RawLessThanSign ScriptDataEscaped kind);
911                     },
912                     '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
913                     c => go!(self: emit c; to RawData ScriptDataEscaped kind),
914                 }
915             },
916 
917             //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
918             states::ScriptDataEscapedDashDash(kind) => loop {
919                 match get_char!(self, input) {
920                     '-' => go!(self: emit '-'),
921                     '<' => {
922                         if kind == DoubleEscaped {
923                             go!(self: emit '<');
924                         }
925                         go!(self: to RawLessThanSign ScriptDataEscaped kind);
926                     },
927                     '>' => go!(self: emit '>'; to RawData ScriptData),
928                     '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
929                     c => go!(self: emit c; to RawData ScriptDataEscaped kind),
930                 }
931             },
932 
933             //§ script-data-double-escape-end-state
934             states::ScriptDataDoubleEscapeEnd => loop {
935                 let c = get_char!(self, input);
936                 match c {
937                     '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
938                         let esc = if &*self.temp_buf == "script" {
939                             Escaped
940                         } else {
941                             DoubleEscaped
942                         };
943                         go!(self: emit c; to RawData ScriptDataEscaped esc);
944                     },
945                     _ => match lower_ascii_letter(c) {
946                         Some(cl) => go!(self: push_temp cl; emit c),
947                         None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
948                     },
949                 }
950             },
951 
952             //§ before-attribute-name-state
953             states::BeforeAttributeName => loop {
954                 match get_char!(self, input) {
955                     '\t' | '\n' | '\x0C' | ' ' => (),
956                     '/' => go!(self: to SelfClosingStartTag),
957                     '>' => go!(self: emit_tag Data),
958                     '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
959                     c => match lower_ascii_letter(c) {
960                         Some(cl) => go!(self: create_attr cl; to AttributeName),
961                         None => {
962                             go_match!(self: c,
963                             '"' , '\'' , '<' , '=' => error);
964                             go!(self: create_attr c; to AttributeName);
965                         },
966                     },
967                 }
968             },
969 
970             //§ attribute-name-state
971             states::AttributeName => loop {
972                 match get_char!(self, input) {
973                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
974                     '/' => go!(self: to SelfClosingStartTag),
975                     '=' => go!(self: to BeforeAttributeValue),
976                     '>' => go!(self: emit_tag Data),
977                     '\0' => go!(self: error; push_name '\u{fffd}'),
978                     c => match lower_ascii_letter(c) {
979                         Some(cl) => go!(self: push_name cl),
980                         None => {
981                             go_match!(self: c,
982                             '"' , '\'' , '<' => error);
983                             go!(self: push_name c);
984                         },
985                     },
986                 }
987             },
988 
989             //§ after-attribute-name-state
990             states::AfterAttributeName => loop {
991                 match get_char!(self, input) {
992                     '\t' | '\n' | '\x0C' | ' ' => (),
993                     '/' => go!(self: to SelfClosingStartTag),
994                     '=' => go!(self: to BeforeAttributeValue),
995                     '>' => go!(self: emit_tag Data),
996                     '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
997                     c => match lower_ascii_letter(c) {
998                         Some(cl) => go!(self: create_attr cl; to AttributeName),
999                         None => {
1000                             go_match!(self: c,
1001                             '"' , '\'' , '<' => error);
1002                             go!(self: create_attr c; to AttributeName);
1003                         },
1004                     },
1005                 }
1006             },
1007 
1008             //§ before-attribute-value-state
1009             // Use peek so we can handle the first attr character along with the rest,
1010             // hopefully in the same zero-copy buffer.
1011             states::BeforeAttributeValue => loop {
1012                 match peek!(self, input) {
1013                     '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1014                     '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1015                     '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1016                     '\0' => {
1017                         go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
1018                     },
1019                     '>' => go!(self: discard_char input; error; emit_tag Data),
1020                     _ => go!(self: to AttributeValue Unquoted),
1021                 }
1022             },
1023 
1024             //§ attribute-value-(double-quoted)-state
1025             states::AttributeValue(DoubleQuoted) => loop {
1026                 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1027                     FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1028                     FromSet('&') => go!(self: consume_char_ref '"'),
1029                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1030                     FromSet(c) => go!(self: push_value c),
1031                     NotFromSet(ref b) => go!(self: append_value b),
1032                 }
1033             },
1034 
1035             //§ attribute-value-(single-quoted)-state
1036             states::AttributeValue(SingleQuoted) => loop {
1037                 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1038                     FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1039                     FromSet('&') => go!(self: consume_char_ref '\''),
1040                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1041                     FromSet(c) => go!(self: push_value c),
1042                     NotFromSet(ref b) => go!(self: append_value b),
1043                 }
1044             },
1045 
1046             //§ attribute-value-(unquoted)-state
1047             states::AttributeValue(Unquoted) => loop {
1048                 match pop_except_from!(
1049                     self,
1050                     input,
1051                     small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1052                 ) {
1053                     FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1054                         go!(self: to BeforeAttributeName)
1055                     },
1056                     FromSet('&') => go!(self: consume_char_ref '>'),
1057                     FromSet('>') => go!(self: emit_tag Data),
1058                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1059                     FromSet(c) => {
1060                         go_match!(self: c,
1061                             '"' , '\'' , '<' , '=' , '`' => error);
1062                         go!(self: push_value c);
1063                     },
1064                     NotFromSet(ref b) => go!(self: append_value b),
1065                 }
1066             },
1067 
1068             //§ after-attribute-value-(quoted)-state
1069             states::AfterAttributeValueQuoted => loop {
1070                 match get_char!(self, input) {
1071                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1072                     '/' => go!(self: to SelfClosingStartTag),
1073                     '>' => go!(self: emit_tag Data),
1074                     _ => go!(self: error; reconsume BeforeAttributeName),
1075                 }
1076             },
1077 
1078             //§ self-closing-start-tag-state
1079             states::SelfClosingStartTag => loop {
1080                 match get_char!(self, input) {
1081                     '>' => {
1082                         self.current_tag_self_closing = true;
1083                         go!(self: emit_tag Data);
1084                     },
1085                     _ => go!(self: error; reconsume BeforeAttributeName),
1086                 }
1087             },
1088 
1089             //§ comment-start-state
1090             states::CommentStart => loop {
1091                 match get_char!(self, input) {
1092                     '-' => go!(self: to CommentStartDash),
1093                     '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1094                     '>' => go!(self: error; emit_comment; to Data),
1095                     c => go!(self: push_comment c; to Comment),
1096                 }
1097             },
1098 
1099             //§ comment-start-dash-state
1100             states::CommentStartDash => loop {
1101                 match get_char!(self, input) {
1102                     '-' => go!(self: to CommentEnd),
1103                     '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1104                     '>' => go!(self: error; emit_comment; to Data),
1105                     c => go!(self: push_comment '-'; push_comment c; to Comment),
1106                 }
1107             },
1108 
1109             //§ comment-state
1110             states::Comment => loop {
1111                 match get_char!(self, input) {
1112                     '-' => go!(self: to CommentEndDash),
1113                     '\0' => go!(self: error; push_comment '\u{fffd}'),
1114                     c => go!(self: push_comment c),
1115                 }
1116             },
1117 
1118             //§ comment-end-dash-state
1119             states::CommentEndDash => loop {
1120                 match get_char!(self, input) {
1121                     '-' => go!(self: to CommentEnd),
1122                     '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1123                     c => go!(self: push_comment '-'; push_comment c; to Comment),
1124                 }
1125             },
1126 
1127             //§ comment-end-state
1128             states::CommentEnd => loop {
1129                 match get_char!(self, input) {
1130                     '>' => go!(self: emit_comment; to Data),
1131                     '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1132                     '!' => go!(self: error; to CommentEndBang),
1133                     '-' => go!(self: error; push_comment '-'),
1134                     c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1135                 }
1136             },
1137 
1138             //§ comment-end-bang-state
1139             states::CommentEndBang => loop {
1140                 match get_char!(self, input) {
1141                     '-' => go!(self: append_comment "--!"; to CommentEndDash),
1142                     '>' => go!(self: emit_comment; to Data),
1143                     '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1144                     c => go!(self: append_comment "--!"; push_comment c; to Comment),
1145                 }
1146             },
1147 
1148             //§ doctype-state
1149             states::Doctype => loop {
1150                 match get_char!(self, input) {
1151                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1152                     _ => go!(self: error; reconsume BeforeDoctypeName),
1153                 }
1154             },
1155 
1156             //§ before-doctype-name-state
1157             states::BeforeDoctypeName => loop {
1158                 match get_char!(self, input) {
1159                     '\t' | '\n' | '\x0C' | ' ' => (),
1160                     '\0' => {
1161                         go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1162                     },
1163                     '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1164                     c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1165                                   to DoctypeName),
1166                 }
1167             },
1168 
1169             //§ doctype-name-state
1170             states::DoctypeName => loop {
1171                 match get_char!(self, input) {
1172                     '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1173                     '>' => go!(self: emit_doctype; to Data),
1174                     '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1175                     c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1176                 }
1177             },
1178 
1179             //§ after-doctype-name-state
1180             states::AfterDoctypeName => loop {
1181                 if eat!(self, input, "public") {
1182                     go!(self: to AfterDoctypeKeyword Public);
1183                 } else if eat!(self, input, "system") {
1184                     go!(self: to AfterDoctypeKeyword System);
1185                 } else {
1186                     match get_char!(self, input) {
1187                         '\t' | '\n' | '\x0C' | ' ' => (),
1188                         '>' => go!(self: emit_doctype; to Data),
1189                         _ => go!(self: error; force_quirks; to BogusDoctype),
1190                     }
1191                 }
1192             },
1193 
1194             //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1195             states::AfterDoctypeKeyword(kind) => loop {
1196                 match get_char!(self, input) {
1197                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1198                     '"' => {
1199                         go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1200                     },
1201                     '\'' => {
1202                         go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1203                     },
1204                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1205                     _ => go!(self: error; force_quirks; to BogusDoctype),
1206                 }
1207             },
1208 
1209             //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1210             states::BeforeDoctypeIdentifier(kind) => loop {
1211                 match get_char!(self, input) {
1212                     '\t' | '\n' | '\x0C' | ' ' => (),
1213                     '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1214                     '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1215                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1216                     _ => go!(self: error; force_quirks; to BogusDoctype),
1217                 }
1218             },
1219 
1220             //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1221             states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1222                 match get_char!(self, input) {
1223                     '"' => go!(self: to AfterDoctypeIdentifier kind),
1224                     '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1225                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1226                     c => go!(self: push_doctype_id kind c),
1227                 }
1228             },
1229 
1230             //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1231             states::DoctypeIdentifierSingleQuoted(kind) => loop {
1232                 match get_char!(self, input) {
1233                     '\'' => go!(self: to AfterDoctypeIdentifier kind),
1234                     '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1235                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1236                     c => go!(self: push_doctype_id kind c),
1237                 }
1238             },
1239 
1240             //§ after-doctype-public-identifier-state
1241             states::AfterDoctypeIdentifier(Public) => loop {
1242                 match get_char!(self, input) {
1243                     '\t' | '\n' | '\x0C' | ' ' => {
1244                         go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1245                     },
1246                     '>' => go!(self: emit_doctype; to Data),
1247                     '"' => {
1248                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1249                     },
1250                     '\'' => {
1251                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1252                     },
1253                     _ => go!(self: error; force_quirks; to BogusDoctype),
1254                 }
1255             },
1256 
1257             //§ after-doctype-system-identifier-state
1258             states::AfterDoctypeIdentifier(System) => loop {
1259                 match get_char!(self, input) {
1260                     '\t' | '\n' | '\x0C' | ' ' => (),
1261                     '>' => go!(self: emit_doctype; to Data),
1262                     _ => go!(self: error; to BogusDoctype),
1263                 }
1264             },
1265 
1266             //§ between-doctype-public-and-system-identifiers-state
1267             states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1268                 match get_char!(self, input) {
1269                     '\t' | '\n' | '\x0C' | ' ' => (),
1270                     '>' => go!(self: emit_doctype; to Data),
1271                     '"' => {
1272                         go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1273                     },
1274                     '\'' => {
1275                         go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1276                     },
1277                     _ => go!(self: error; force_quirks; to BogusDoctype),
1278                 }
1279             },
1280 
1281             //§ bogus-doctype-state
1282             states::BogusDoctype => loop {
1283                 match get_char!(self, input) {
1284                     '>' => go!(self: emit_doctype; to Data),
1285                     _ => (),
1286                 }
1287             },
1288 
1289             //§ bogus-comment-state
1290             states::BogusComment => loop {
1291                 match get_char!(self, input) {
1292                     '>' => go!(self: emit_comment; to Data),
1293                     '\0' => go!(self: push_comment '\u{fffd}'),
1294                     c => go!(self: push_comment c),
1295                 }
1296             },
1297 
1298             //§ markup-declaration-open-state
1299             states::MarkupDeclarationOpen => loop {
1300                 if eat_exact!(self, input, "--") {
1301                     go!(self: clear_comment; to CommentStart);
1302                 } else if eat!(self, input, "doctype") {
1303                     go!(self: to Doctype);
1304                 } else {
1305                     if self
1306                         .sink
1307                         .adjusted_current_node_present_but_not_in_html_namespace()
1308                     {
1309                         if eat_exact!(self, input, "[CDATA[") {
1310                             go!(self: clear_temp; to CdataSection);
1311                         }
1312                     }
1313                     go!(self: error; to BogusComment);
1314                 }
1315             },
1316 
1317             //§ cdata-section-state
1318             states::CdataSection => loop {
1319                 match get_char!(self, input) {
1320                     ']' => go!(self: to CdataSectionBracket),
1321                     '\0' => go!(self: emit_temp; emit '\0'),
1322                     c => go!(self: push_temp c),
1323                 }
1324             },
1325 
1326             //§ cdata-section-bracket
1327             states::CdataSectionBracket => match get_char!(self, input) {
1328                 ']' => go!(self: to CdataSectionEnd),
1329                 _ => go!(self: push_temp ']'; reconsume CdataSection),
1330             },
1331 
1332             //§ cdata-section-end
1333             states::CdataSectionEnd => loop {
1334                 match get_char!(self, input) {
1335                     ']' => go!(self: push_temp ']'),
1336                     '>' => go!(self: emit_temp; to Data),
1337                     _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1338                 }
1339             },
1340             //§ END
1341         }
1342     }
1343 
1344     fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1345         // FIXME HACK: Take and replace the tokenizer so we don't
1346         // double-mut-borrow self.  This is why it's boxed.
1347         let mut tok = self.char_ref_tokenizer.take().unwrap();
1348         let outcome = tok.step(self, input);
1349 
1350         let progress = match outcome {
1351             char_ref::Done => {
1352                 self.process_char_ref(tok.get_result());
1353                 return ProcessResult::Continue;
1354             },
1355 
1356             char_ref::Stuck => ProcessResult::Suspend,
1357             char_ref::Progress => ProcessResult::Continue,
1358         };
1359 
1360         self.char_ref_tokenizer = Some(tok);
1361         progress
1362     }
1363 
1364     fn process_char_ref(&mut self, char_ref: CharRef) {
1365         let CharRef {
1366             mut chars,
1367             mut num_chars,
1368         } = char_ref;
1369 
1370         if num_chars == 0 {
1371             chars[0] = '&';
1372             num_chars = 1;
1373         }
1374 
1375         for i in 0..num_chars {
1376             let c = chars[i as usize];
1377             match self.state {
1378                 states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
1379 
1380                 states::AttributeValue(_) => go!(self: push_value c),
1381 
1382                 _ => panic!(
1383                     "state {:?} should not be reachable in process_char_ref",
1384                     self.state
1385                 ),
1386             }
1387         }
1388     }
1389 
1390     /// Indicate that we have reached the end of the input.
1391     pub fn end(&mut self) {
1392         // Handle EOF in the char ref sub-tokenizer, if there is one.
1393         // Do this first because it might un-consume stuff.
1394         let mut input = BufferQueue::new();
1395         match self.char_ref_tokenizer.take() {
1396             None => (),
1397             Some(mut tok) => {
1398                 tok.end_of_file(self, &mut input);
1399                 self.process_char_ref(tok.get_result());
1400             },
1401         }
1402 
1403         // Process all remaining buffered input.
1404         // If we're waiting for lookahead, we're not gonna get it.
1405         self.at_eof = true;
1406         assert!(matches!(self.run(&mut input), TokenizerResult::Done));
1407         assert!(input.is_empty());
1408 
1409         loop {
1410             match self.eof_step() {
1411                 ProcessResult::Continue => (),
1412                 ProcessResult::Suspend => break,
1413                 ProcessResult::Script(_) => unreachable!(),
1414             }
1415         }
1416 
1417         self.sink.end();
1418 
1419         if self.opts.profile {
1420             self.dump_profile();
1421         }
1422     }
1423 
1424     fn dump_profile(&self) {
1425         let mut results: Vec<(states::State, u64)> =
1426             self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1427         results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1428 
1429         let total: u64 = results
1430             .iter()
1431             .map(|&(_, t)| t)
1432             .fold(0, ::std::ops::Add::add);
1433         println!("\nTokenizer profile, in nanoseconds");
1434         println!("\n{:12}         total in token sink", self.time_in_sink);
1435         println!("\n{:12}         total in tokenizer", total);
1436 
1437         for (k, v) in results.into_iter() {
1438             let pct = 100.0 * (v as f64) / (total as f64);
1439             println!("{:12}  {:4.1}%  {:?}", v, pct, k);
1440         }
1441     }
1442 
    /// Perform one step of end-of-input handling for the current state.
    ///
    /// `end` calls this repeatedly for as long as it returns
    /// `ProcessResult::Continue` and stops once it returns `Suspend`.
    ///
    /// Every arm is written with the `go!` action macro, which is defined
    /// outside this function.  NOTE(review): the arm comments below describe
    /// the actions by name (`error_eof`, `emit`, `to`, ...); their exact
    /// effects should be confirmed against the macro definition.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // The only states that run the `eof` action directly, with no
            // `error_eof` beforehand.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // Input ended inside a tag, an attribute, or escaped script
            // data: run `error_eof` and go to Data.  This arm emits nothing
            // itself.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // Presumably the '<' (and '/') had already been consumed when
            // input ran out, so they are emitted as characters -- confirm.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            // This arm must stay ahead of the generic `RawLessThanSign(kind)`
            // arm, which would otherwise match the double-escaped case too.
            // Unlike that arm, it does not emit '<'.
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // The next three arms emit the characters of a would-be raw-text
            // end tag and return to the raw data state of the same kind.
            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            // Script-data escape transitions only switch state; they emit
            // nothing and report no error here.
            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Input ended inside a comment: `error_eof`, then
            // `emit_comment`, then Data.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // These two doctype states additionally run `create_doctype`
            // before `force_quirks`; the later doctype states below do not.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            // The bogus states emit what they hold without `error_eof`.
            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            // Uses the `error` action rather than `error_eof`, and moves to
            // BogusComment, whose own EOF arm then runs on the next step.
            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // CDATA: `emit_temp` runs before `error_eof` in this arm.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            // Pending ']' characters are pushed via `push_temp` before
            // falling back to CdataSection, whose EOF arm runs `emit_temp`.
            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1523 }
1524 
#[cfg(test)]
// Some test names embed `None` verbatim (e.g. `push_to_None_gives_singleton`).
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector pairs each recorded token with the line number it was reported on.
    struct LinesMatch {
        // Flushed runs of character data. Collected, but the tests in this
        // module only inspect `lines`.
        tokens: Vec<Token>,
        // Character data accumulated since the last recorded token.
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record `token` together with the line it was reported on, flushing
        // any pending character data first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Move pending character data, if any, into `tokens` as a single
        // CharacterTokens entry.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        // Sort so comparisons do not depend on source order.
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Feed the chunks to a fresh tokenizer one at a time and return every
    // recorded token paired with the line number it was reported on.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Build an attribute-less, non-self-closing tag token of the given kind.
    fn create_tag(name: &str, kind: TagKind) -> Token {
        TagToken(Tag {
            kind,
            name: LocalName::from(name),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Tokenize `chunks` (one line of markup each: <a>, <b>, </b>, </a>) and
    // check that the four tags are reported on lines 1 through 4.
    fn assert_nested_tag_lines(chunks: &[&str]) {
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        let input: Vec<StrTendril> = chunks
            .iter()
            .map(|&chunk| StrTendril::from(chunk))
            .collect();
        let expected = vec![
            (create_tag("a", StartTag), 1),
            (create_tag("b", StartTag), 2),
            (create_tag("b", EndTag), 3),
            (create_tag("a", EndTag), 4),
        ];
        let results = tokenize(input, opts);
        assert_eq!(results, expected);
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    // Lines terminated by "\n".
    #[test]
    fn check_lines() {
        assert_nested_tag_lines(&["<a>\n", "<b>\n", "</b>\n", "</a>\n"]);
    }

    // Lines terminated by "\r\n" must yield the same line numbers.
    #[test]
    fn check_lines_with_new_line() {
        assert_nested_tag_lines(&["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]);
    }
}
1713