1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! The HTML5 tokenizer.
11 
12 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14 pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15 pub use self::interface::{TokenSink, TokenSinkResult};
16 
17 use self::states::{DoctypeIdKind, Public, System};
18 use self::states::{DoubleEscaped, Escaped};
19 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20 use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21 
22 use self::char_ref::{CharRef, CharRefTokenizer};
23 
24 use util::str::lower_ascii_letter;
25 
26 use std::borrow::Cow::{self, Borrowed};
27 use std::collections::BTreeMap;
28 use std::default::Default;
29 use std::mem::replace;
30 
31 pub use buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use tendril::StrTendril;
33 use {Attribute, LocalName, QualName, SmallCharSet};
34 
35 mod char_ref;
36 mod interface;
37 pub mod states;
38 
/// Outcome of a single state-machine step.
///
/// `Tokenizer::run` matches on this value to decide whether to step again,
/// stop looping, or return a script handle to the caller.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now; `run` breaks out of its loop and reports `Done`.
    Suspend,
    /// Stop stepping and pass this handle out as `TokenizerResult::Script`.
    Script(Handle),
}
44 
/// Result of feeding input to the tokenizer.
///
/// Marked `#[must_use]` so that a returned `Script` handle is not silently
/// dropped by the caller.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// The state machine ran until it suspended (or there was nothing to do).
    Done,
    /// The sink returned a script handle while a tag token was being emitted.
    Script(Handle),
}
50 
option_push(opt_str: &mut Option<StrTendril>, c: char)51 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
52     match *opt_str {
53         Some(ref mut s) => s.push_char(c),
54         None => *opt_str = Some(StrTendril::from_char(c)),
55     }
56 }
57 
/// Tokenizer options, with an impl for `Default`.
///
/// `Tokenizer::new` takes one of these by value; it consumes
/// `last_start_tag_name` and reads `initial_state` and `discard_bom` to set
/// up the tokenizer's starting state.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  When `None`, the tokenizer starts in the
    /// `Data` state.
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
84 
85 impl Default for TokenizerOpts {
default() -> TokenizerOpts86     fn default() -> TokenizerOpts {
87         TokenizerOpts {
88             exact_errors: false,
89             discard_bom: true,
90             profile: false,
91             initial_state: None,
92             last_start_tag_name: None,
93         }
94     }
95 }
96 
/// The HTML tokenizer.
///
/// Holds the state-machine state plus the partially built tag, attribute,
/// comment and doctype, and forwards finished tokens to `sink`.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.  Empty when no attribute is in progress.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Current line number.  Starts at 1 and is incremented for every
    /// newline produced by input preprocessing; passed to the sink with
    /// each token.
    current_line: u64,
}
169 
170 impl<Sink: TokenSink> Tokenizer<Sink> {
171     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink>172     pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
173         let start_tag_name = opts
174             .last_start_tag_name
175             .take()
176             .map(|s| LocalName::from(&*s));
177         let state = opts.initial_state.unwrap_or(states::Data);
178         let discard_bom = opts.discard_bom;
179         Tokenizer {
180             opts: opts,
181             sink: sink,
182             state: state,
183             char_ref_tokenizer: None,
184             at_eof: false,
185             current_char: '\0',
186             reconsume: false,
187             ignore_lf: false,
188             discard_bom: discard_bom,
189             current_tag_kind: StartTag,
190             current_tag_name: StrTendril::new(),
191             current_tag_self_closing: false,
192             current_tag_attrs: vec![],
193             current_attr_name: StrTendril::new(),
194             current_attr_value: StrTendril::new(),
195             current_comment: StrTendril::new(),
196             current_doctype: Doctype::new(),
197             last_start_tag_name: start_tag_name,
198             temp_buf: StrTendril::new(),
199             state_profile: BTreeMap::new(),
200             time_in_sink: 0,
201             current_line: 1,
202         }
203     }
204 
205     /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>206     pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
207         if input.is_empty() {
208             return TokenizerResult::Done;
209         }
210 
211         if self.discard_bom {
212             if let Some(c) = input.peek() {
213                 if c == '\u{feff}' {
214                     input.next();
215                 }
216             } else {
217                 return TokenizerResult::Done;
218             }
219         };
220 
221         self.run(input)
222     }
223 
    /// Force the state machine into the `Plaintext` state.
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }
227 
process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle>228     fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
229         if self.opts.profile {
230             let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
231             self.time_in_sink += dt;
232             ret
233         } else {
234             self.sink.process_token(token, self.current_line)
235         }
236     }
237 
    /// Send `token` to the sink for token kinds where the only acceptable
    /// answer is `TokenSinkResult::Continue`.
    ///
    /// # Panics
    ///
    /// Panics (via `assert!`) if the sink returns anything else.
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
244 
    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    //
    // Normalizes CR and CRLF to a single LF, bumps `current_line` on every
    // resulting LF, reports disallowed code points when `exact_errors` is
    // set, and records the result in `current_char`.  Can yield `None` only
    // on the path where a pending LF is skipped and another character has to
    // be pulled from `input`.
    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
        if self.ignore_lf {
            // The previous character was a CR that we already reported as LF.
            // If this one is the LF of a CRLF pair, drop it and take the
            // following character instead.
            self.ignore_lf = false;
            if c == '\n' {
                // NOTE(review): `unwrap_or_return!` presumably returns `None`
                // from this function when the queue is exhausted — confirm
                // against the macro definition.
                c = unwrap_or_return!(input.next(), None);
            }
        }

        if c == '\r' {
            // Report CR as LF and remember to swallow an immediately
            // following LF.
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        // Only pay for the code point classification when exact error
        // reporting was requested.
        if self.opts.exact_errors &&
            match c as u32 {
                0x01...0x08 | 0x0B | 0x0E...0x1F | 0x7F...0x9F | 0xFDD0...0xFDEF => true,
                // Any code point whose low 16 bits are FFFE or FFFF.
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        debug!("got character {}", c);
        self.current_char = c;
        Some(c)
    }
280 
281     //§ tokenization
282     // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>283     fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
284         if self.reconsume {
285             self.reconsume = false;
286             Some(self.current_char)
287         } else {
288             input
289                 .next()
290                 .and_then(|c| self.get_preprocessed_char(c, input))
291         }
292     }
293 
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>294     fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
295         // Bail to the slow path for various corner cases.
296         // This means that `FromSet` can contain characters not in the set!
297         // It shouldn't matter because the fallback `FromSet` case should
298         // always do the same thing as the `NotFromSet` case.
299         if self.opts.exact_errors || self.reconsume || self.ignore_lf {
300             return self.get_char(input).map(|x| FromSet(x));
301         }
302 
303         let d = input.pop_except_from(set);
304         debug!("got characters {:?}", d);
305         match d {
306             Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
307 
308             // NB: We don't set self.current_char for a run of characters not
309             // in the set.  It shouldn't matter for the codepaths that use
310             // this.
311             _ => d,
312         }
313     }
314 
315     // Check if the next characters are an ASCII case-insensitive match.  See
316     // BufferQueue::eat.
317     //
318     // NB: this doesn't do input stream preprocessing or set the current input
319     // character.
eat( &mut self, input: &mut BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool, ) -> Option<bool>320     fn eat(
321         &mut self,
322         input: &mut BufferQueue,
323         pat: &str,
324         eq: fn(&u8, &u8) -> bool,
325     ) -> Option<bool> {
326         input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
327         match input.eat(pat, eq) {
328             None if self.at_eof => Some(false),
329             None => {
330                 while let Some(c) = input.next() {
331                     self.temp_buf.push_char(c);
332                 }
333                 None
334             },
335             Some(matched) => Some(matched),
336         }
337     }
338 
339     /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>340     fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
341         if self.opts.profile {
342             loop {
343                 let state = self.state;
344                 let old_sink = self.time_in_sink;
345                 let (run, mut dt) = time!(self.step(input));
346                 dt -= (self.time_in_sink - old_sink);
347                 let new = match self.state_profile.get_mut(&state) {
348                     Some(x) => {
349                         *x += dt;
350                         false
351                     },
352                     None => true,
353                 };
354                 if new {
355                     // do this here because of borrow shenanigans
356                     self.state_profile.insert(state, dt);
357                 }
358                 match run {
359                     ProcessResult::Continue => (),
360                     ProcessResult::Suspend => break,
361                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
362                 }
363             }
364         } else {
365             loop {
366                 match self.step(input) {
367                     ProcessResult::Continue => (),
368                     ProcessResult::Suspend => break,
369                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
370                 }
371             }
372         }
373         TokenizerResult::Done
374     }
375 
    /// Emit a parse error about the current input character.
    ///
    /// NOTE(review): `format_if!` presumably yields the formatted
    /// "Saw {} in state {:?}" message when `exact_errors` is set and the
    /// static "Bad character" string otherwise — confirm against the macro.
    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char,
            self.state
        );
        self.emit_error(msg);
    }
386 
    /// Emit a parse error about an unexpected end of input.
    ///
    /// NOTE(review): `format_if!` presumably yields the formatted
    /// "Saw EOF in state {:?}" message when `exact_errors` is set and the
    /// static "Unexpected EOF" string otherwise — confirm against the macro.
    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state
        );
        self.emit_error(msg);
    }
396 
emit_char(&mut self, c: char)397     fn emit_char(&mut self, c: char) {
398         self.process_token_and_continue(match c {
399             '\0' => NullCharacterToken,
400             _ => CharacterTokens(StrTendril::from_char(c)),
401         });
402     }
403 
    // Emit a run of characters to the sink as a single `CharacterTokens`.
    // The string must not contain '\0'!  (Nothing here checks that.)
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
408 
    /// Finish the tag under construction, send it to the sink, and apply the
    /// sink's answer to the tokenizer state.
    ///
    /// Returns `ProcessResult::Script` when the sink hands back a script
    /// handle and `ProcessResult::Continue` otherwise.
    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        // Commit (or drop, if duplicate) the attribute still in progress.
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                // Remembered for the "appropriate end tag" check.
                self.last_start_tag_name = Some(name.clone());
            },
            EndTag => {
                // Reported as parse errors; the token below still carries
                // the attributes and the self-closing flag.
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind,
            name: name,
            self_closing: self.current_tag_self_closing,
            // Move the attributes out, leaving an empty list behind.
            attrs: replace(&mut self.current_tag_attrs, vec![]),
        });

        // The sink may override the state chosen by the caller.
        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            },
        }
    }
452 
    /// Emit the contents of the temporary buffer as character tokens,
    /// leaving a fresh empty buffer in its place.
    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }
458 
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }
463 
    /// Send the comment under construction to the sink as a `CommentToken`,
    /// leaving an empty comment buffer behind.
    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }
468 
discard_tag(&mut self)469     fn discard_tag(&mut self) {
470         self.current_tag_name.clear();
471         self.current_tag_self_closing = false;
472         self.current_tag_attrs = vec![];
473     }
474 
create_tag(&mut self, kind: TagKind, c: char)475     fn create_tag(&mut self, kind: TagKind, c: char) {
476         self.discard_tag();
477         self.current_tag_name.push_char(c);
478         self.current_tag_kind = kind;
479     }
480 
have_appropriate_end_tag(&self) -> bool481     fn have_appropriate_end_tag(&self) -> bool {
482         match self.last_start_tag_name.as_ref() {
483             Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
484             None => false,
485         }
486     }
487 
    /// Start a new attribute whose name begins with `c`.
    fn create_attribute(&mut self, c: char) {
        // Commit (or drop, if duplicate) any attribute still in progress so
        // the name buffer is empty before we start the new one.
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
493 
finish_attribute(&mut self)494     fn finish_attribute(&mut self) {
495         if self.current_attr_name.len() == 0 {
496             return;
497         }
498 
499         // Check for a duplicate attribute.
500         // FIXME: the spec says we should error as soon as the name is finished.
501         // FIXME: linear time search, do we care?
502         let dup = {
503             let name = &*self.current_attr_name;
504             self.current_tag_attrs
505                 .iter()
506                 .any(|a| &*a.name.local == name)
507         };
508 
509         if dup {
510             self.emit_error(Borrowed("Duplicate attribute"));
511             self.current_attr_name.clear();
512             self.current_attr_value.clear();
513         } else {
514             let name = LocalName::from(&*self.current_attr_name);
515             self.current_attr_name.clear();
516             self.current_tag_attrs.push(Attribute {
517                 // The tree builder will adjust the namespace if necessary.
518                 // This only happens in foreign elements.
519                 name: QualName::new(None, ns!(), name),
520                 value: replace(&mut self.current_attr_value, StrTendril::new()),
521             });
522         }
523     }
524 
    /// Send the doctype under construction to the sink as a `DoctypeToken`,
    /// leaving a fresh `Doctype::new()` behind.
    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }
529 
doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril>530     fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
531         match kind {
532             Public => &mut self.current_doctype.public_id,
533             System => &mut self.current_doctype.system_id,
534         }
535     }
536 
clear_doctype_id(&mut self, kind: DoctypeIdKind)537     fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
538         let id = self.doctype_id(kind);
539         match *id {
540             Some(ref mut s) => s.clear(),
541             None => *id = Some(StrTendril::new()),
542         }
543     }
544 
    /// Install a fresh character-reference sub-tokenizer.  While one is
    /// installed, `step` delegates to it instead of running the main state
    /// machine.
    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }
550 
    /// Send an `EOFToken` to the sink.
    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }
554 
peek(&mut self, input: &BufferQueue) -> Option<char>555     fn peek(&mut self, input: &BufferQueue) -> Option<char> {
556         if self.reconsume {
557             Some(self.current_char)
558         } else {
559             input.peek()
560         }
561     }
562 
    /// Consume the next input character (with the usual preprocessing and
    /// reconsume handling of `get_char`) and throw it away.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        self.get_char(input);
    }
566 
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
570 }
571 //§ END
572 
// Shorthand for common state machine behaviors.
//
// Each arm maps one action keyword (plus its arguments) to a single
// statement on the tokenizer `$me`.  These are the non-terminal actions of
// the `go!` DSL; none of them changes state or returns.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.force_quirks = true;             );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
);
600 
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Logs the stringified action at debug level, then performs it through
// `shorthand!`.  The tokenizer identifier is forwarded as-is so the
// `$me:ident` arms of `shorthand!` can match it.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));
608 
// Without tracing, `sh_trace!` is a plain pass-through to `shorthand!`.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
611 
// A little DSL for sequencing shorthand actions.
//
// `go!(me: a; b; terminal)` runs the `;`-separated actions in order.  Plain
// actions are forwarded to `shorthand!` (via `sh_trace!`); the terminal
// forms below additionally `return` from the enclosing function, so they
// only make sense as the last action.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one arm per action length, from one to four token trees.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch state (optionally with one or two levels of state payload)
    // and return `Continue`.
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    // Same as `to`, but first flag the current character for reconsumption.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Install the character reference sub-tokenizer and return `Continue`.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    // Emit the EOF token and return `Suspend`.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
649 
// Run a `go!` command sequence only if `$x` matches one of the given
// patterns; any other value does nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
656 
// This is a macro because it can cause early return
// from the function where it is used.
//
// Evaluates to the next input character.
// NOTE(review): presumably returns `ProcessResult::Suspend` from the
// enclosing function when `get_char` yields `None` — confirm against
// `unwrap_or_return!`.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
662 
// Like `get_char!`, but wraps `Tokenizer::peek`, so the character is not
// consumed.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
666 
// Like `get_char!`, but wraps `Tokenizer::pop_except_from` and evaluates to
// its `SetResult`.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
670 
// Try to consume `$pat` from the input, comparing bytes ASCII
// case-insensitively.  Evaluates to the `bool` inside `Tokenizer::eat`'s
// result; the `None` case goes through `unwrap_or_return!` with
// `ProcessResult::Suspend`.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
674 
// Same as `eat!`, but bytes must match exactly (case-sensitive).
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
678 
679 impl<Sink: TokenSink> Tokenizer<Sink> {
680     // Run the state machine for a while.
681     // Return true if we should be immediately re-invoked
682     // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle>683     fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
684         if self.char_ref_tokenizer.is_some() {
685             return self.step_char_ref_tokenizer(input);
686         }
687 
688         debug!("processing in state {:?}", self.state);
689         match self.state {
690             //§ data-state
691             states::Data => loop {
692                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
693                     FromSet('\0') => go!(self: error; emit '\0'),
694                     FromSet('&') => go!(self: consume_char_ref),
695                     FromSet('<') => go!(self: to TagOpen),
696                     FromSet(c) => go!(self: emit c),
697                     NotFromSet(b) => self.emit_chars(b),
698                 }
699             },
700 
701             //§ rcdata-state
702             states::RawData(Rcdata) => loop {
703                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
704                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
705                     FromSet('&') => go!(self: consume_char_ref),
706                     FromSet('<') => go!(self: to RawLessThanSign Rcdata),
707                     FromSet(c) => go!(self: emit c),
708                     NotFromSet(b) => self.emit_chars(b),
709                 }
710             },
711 
712             //§ rawtext-state
713             states::RawData(Rawtext) => loop {
714                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
715                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
716                     FromSet('<') => go!(self: to RawLessThanSign Rawtext),
717                     FromSet(c) => go!(self: emit c),
718                     NotFromSet(b) => self.emit_chars(b),
719                 }
720             },
721 
722             //§ script-data-state
723             states::RawData(ScriptData) => loop {
724                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
725                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
726                     FromSet('<') => go!(self: to RawLessThanSign ScriptData),
727                     FromSet(c) => go!(self: emit c),
728                     NotFromSet(b) => self.emit_chars(b),
729                 }
730             },
731 
732             //§ script-data-escaped-state
733             states::RawData(ScriptDataEscaped(Escaped)) => loop {
734                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
735                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
736                     FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
737                     FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
738                     FromSet(c) => go!(self: emit c),
739                     NotFromSet(b) => self.emit_chars(b),
740                 }
741             },
742 
743             //§ script-data-double-escaped-state
744             states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
745                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
746                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
747                     FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
748                     FromSet('<') => {
749                         go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
750                     },
751                     FromSet(c) => go!(self: emit c),
752                     NotFromSet(b) => self.emit_chars(b),
753                 }
754             },
755 
756             //§ plaintext-state
757             states::Plaintext => loop {
758                 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
759                     FromSet('\0') => go!(self: error; emit '\u{fffd}'),
760                     FromSet(c) => go!(self: emit c),
761                     NotFromSet(b) => self.emit_chars(b),
762                 }
763             },
764 
765             //§ tag-open-state
766             states::TagOpen => loop {
767                 match get_char!(self, input) {
768                     '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
769                     '/' => go!(self: to EndTagOpen),
770                     '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
771                     c => match lower_ascii_letter(c) {
772                         Some(cl) => go!(self: create_tag StartTag cl; to TagName),
773                         None => go!(self: error; emit '<'; reconsume Data),
774                     },
775                 }
776             },
777 
778             //§ end-tag-open-state
779             states::EndTagOpen => loop {
780                 match get_char!(self, input) {
781                     '>' => go!(self: error; to Data),
782                     '\0' => {
783                         go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
784                     },
785                     c => match lower_ascii_letter(c) {
786                         Some(cl) => go!(self: create_tag EndTag cl; to TagName),
787                         None => go!(self: error; clear_comment; push_comment c; to BogusComment),
788                     },
789                 }
790             },
791 
792             //§ tag-name-state
793             states::TagName => loop {
794                 match get_char!(self, input) {
795                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
796                     '/' => go!(self: to SelfClosingStartTag),
797                     '>' => go!(self: emit_tag Data),
798                     '\0' => go!(self: error; push_tag '\u{fffd}'),
799                     c => go!(self: push_tag (c.to_ascii_lowercase())),
800                 }
801             },
802 
803             //§ script-data-escaped-less-than-sign-state
804             states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
805                 match get_char!(self, input) {
806                     '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
807                     c => match lower_ascii_letter(c) {
808                         Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
809                                     to ScriptDataEscapeStart DoubleEscaped),
810                         None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
811                     },
812                 }
813             },
814 
815             //§ script-data-double-escaped-less-than-sign-state
816             states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
817                 match get_char!(self, input) {
818                     '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
819                     _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
820                 }
821             },
822 
823             //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
824             // otherwise
825             states::RawLessThanSign(kind) => loop {
826                 match get_char!(self, input) {
827                     '/' => go!(self: clear_temp; to RawEndTagOpen kind),
828                     '!' if kind == ScriptData => {
829                         go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
830                     },
831                     _ => go!(self: emit '<'; reconsume RawData kind),
832                 }
833             },
834 
835             //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
836             states::RawEndTagOpen(kind) => loop {
837                 let c = get_char!(self, input);
838                 match lower_ascii_letter(c) {
839                     Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
840                     None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
841                 }
842             },
843 
844             //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
845             states::RawEndTagName(kind) => loop {
846                 let c = get_char!(self, input);
847                 if self.have_appropriate_end_tag() {
848                     match c {
849                         '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
850                         '/' => go!(self: to SelfClosingStartTag),
851                         '>' => go!(self: emit_tag Data),
852                         _ => (),
853                     }
854                 }
855 
856                 match lower_ascii_letter(c) {
857                     Some(cl) => go!(self: push_tag cl; push_temp c),
858                     None => {
859                         go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
860                     },
861                 }
862             },
863 
864             //§ script-data-double-escape-start-state
865             states::ScriptDataEscapeStart(DoubleEscaped) => loop {
866                 let c = get_char!(self, input);
867                 match c {
868                     '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
869                         let esc = if &*self.temp_buf == "script" {
870                             DoubleEscaped
871                         } else {
872                             Escaped
873                         };
874                         go!(self: emit c; to RawData ScriptDataEscaped esc);
875                     },
876                     _ => match lower_ascii_letter(c) {
877                         Some(cl) => go!(self: push_temp cl; emit c),
878                         None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
879                     },
880                 }
881             },
882 
883             //§ script-data-escape-start-state
884             states::ScriptDataEscapeStart(Escaped) => loop {
885                 match get_char!(self, input) {
886                     '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
887                     _ => go!(self: reconsume RawData ScriptData),
888                 }
889             },
890 
891             //§ script-data-escape-start-dash-state
892             states::ScriptDataEscapeStartDash => loop {
893                 match get_char!(self, input) {
894                     '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
895                     _ => go!(self: reconsume RawData ScriptData),
896                 }
897             },
898 
899             //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
900             states::ScriptDataEscapedDash(kind) => loop {
901                 match get_char!(self, input) {
902                     '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
903                     '<' => {
904                         if kind == DoubleEscaped {
905                             go!(self: emit '<');
906                         }
907                         go!(self: to RawLessThanSign ScriptDataEscaped kind);
908                     },
909                     '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
910                     c => go!(self: emit c; to RawData ScriptDataEscaped kind),
911                 }
912             },
913 
914             //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
915             states::ScriptDataEscapedDashDash(kind) => loop {
916                 match get_char!(self, input) {
917                     '-' => go!(self: emit '-'),
918                     '<' => {
919                         if kind == DoubleEscaped {
920                             go!(self: emit '<');
921                         }
922                         go!(self: to RawLessThanSign ScriptDataEscaped kind);
923                     },
924                     '>' => go!(self: emit '>'; to RawData ScriptData),
925                     '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
926                     c => go!(self: emit c; to RawData ScriptDataEscaped kind),
927                 }
928             },
929 
930             //§ script-data-double-escape-end-state
931             states::ScriptDataDoubleEscapeEnd => loop {
932                 let c = get_char!(self, input);
933                 match c {
934                     '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
935                         let esc = if &*self.temp_buf == "script" {
936                             Escaped
937                         } else {
938                             DoubleEscaped
939                         };
940                         go!(self: emit c; to RawData ScriptDataEscaped esc);
941                     },
942                     _ => match lower_ascii_letter(c) {
943                         Some(cl) => go!(self: push_temp cl; emit c),
944                         None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
945                     },
946                 }
947             },
948 
949             //§ before-attribute-name-state
950             states::BeforeAttributeName => loop {
951                 match get_char!(self, input) {
952                     '\t' | '\n' | '\x0C' | ' ' => (),
953                     '/' => go!(self: to SelfClosingStartTag),
954                     '>' => go!(self: emit_tag Data),
955                     '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
956                     c => match lower_ascii_letter(c) {
957                         Some(cl) => go!(self: create_attr cl; to AttributeName),
958                         None => {
959                             go_match!(self: c,
960                             '"' , '\'' , '<' , '=' => error);
961                             go!(self: create_attr c; to AttributeName);
962                         },
963                     },
964                 }
965             },
966 
967             //§ attribute-name-state
968             states::AttributeName => loop {
969                 match get_char!(self, input) {
970                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
971                     '/' => go!(self: to SelfClosingStartTag),
972                     '=' => go!(self: to BeforeAttributeValue),
973                     '>' => go!(self: emit_tag Data),
974                     '\0' => go!(self: error; push_name '\u{fffd}'),
975                     c => match lower_ascii_letter(c) {
976                         Some(cl) => go!(self: push_name cl),
977                         None => {
978                             go_match!(self: c,
979                             '"' , '\'' , '<' => error);
980                             go!(self: push_name c);
981                         },
982                     },
983                 }
984             },
985 
986             //§ after-attribute-name-state
987             states::AfterAttributeName => loop {
988                 match get_char!(self, input) {
989                     '\t' | '\n' | '\x0C' | ' ' => (),
990                     '/' => go!(self: to SelfClosingStartTag),
991                     '=' => go!(self: to BeforeAttributeValue),
992                     '>' => go!(self: emit_tag Data),
993                     '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
994                     c => match lower_ascii_letter(c) {
995                         Some(cl) => go!(self: create_attr cl; to AttributeName),
996                         None => {
997                             go_match!(self: c,
998                             '"' , '\'' , '<' => error);
999                             go!(self: create_attr c; to AttributeName);
1000                         },
1001                     },
1002                 }
1003             },
1004 
1005             //§ before-attribute-value-state
1006             // Use peek so we can handle the first attr character along with the rest,
1007             // hopefully in the same zero-copy buffer.
1008             states::BeforeAttributeValue => loop {
1009                 match peek!(self, input) {
1010                     '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1011                     '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1012                     '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1013                     '\0' => {
1014                         go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
1015                     },
1016                     '>' => go!(self: discard_char input; error; emit_tag Data),
1017                     _ => go!(self: to AttributeValue Unquoted),
1018                 }
1019             },
1020 
1021             //§ attribute-value-(double-quoted)-state
1022             states::AttributeValue(DoubleQuoted) => loop {
1023                 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1024                     FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1025                     FromSet('&') => go!(self: consume_char_ref '"'),
1026                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1027                     FromSet(c) => go!(self: push_value c),
1028                     NotFromSet(ref b) => go!(self: append_value b),
1029                 }
1030             },
1031 
1032             //§ attribute-value-(single-quoted)-state
1033             states::AttributeValue(SingleQuoted) => loop {
1034                 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1035                     FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1036                     FromSet('&') => go!(self: consume_char_ref '\''),
1037                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1038                     FromSet(c) => go!(self: push_value c),
1039                     NotFromSet(ref b) => go!(self: append_value b),
1040                 }
1041             },
1042 
1043             //§ attribute-value-(unquoted)-state
1044             states::AttributeValue(Unquoted) => loop {
1045                 match pop_except_from!(
1046                     self,
1047                     input,
1048                     small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1049                 ) {
1050                     FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1051                         go!(self: to BeforeAttributeName)
1052                     },
1053                     FromSet('&') => go!(self: consume_char_ref '>'),
1054                     FromSet('>') => go!(self: emit_tag Data),
1055                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1056                     FromSet(c) => {
1057                         go_match!(self: c,
1058                             '"' , '\'' , '<' , '=' , '`' => error);
1059                         go!(self: push_value c);
1060                     },
1061                     NotFromSet(ref b) => go!(self: append_value b),
1062                 }
1063             },
1064 
1065             //§ after-attribute-value-(quoted)-state
1066             states::AfterAttributeValueQuoted => loop {
1067                 match get_char!(self, input) {
1068                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1069                     '/' => go!(self: to SelfClosingStartTag),
1070                     '>' => go!(self: emit_tag Data),
1071                     _ => go!(self: error; reconsume BeforeAttributeName),
1072                 }
1073             },
1074 
1075             //§ self-closing-start-tag-state
1076             states::SelfClosingStartTag => loop {
1077                 match get_char!(self, input) {
1078                     '>' => {
1079                         self.current_tag_self_closing = true;
1080                         go!(self: emit_tag Data);
1081                     },
1082                     _ => go!(self: error; reconsume BeforeAttributeName),
1083                 }
1084             },
1085 
1086             //§ comment-start-state
1087             states::CommentStart => loop {
1088                 match get_char!(self, input) {
1089                     '-' => go!(self: to CommentStartDash),
1090                     '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1091                     '>' => go!(self: error; emit_comment; to Data),
1092                     c => go!(self: push_comment c; to Comment),
1093                 }
1094             },
1095 
1096             //§ comment-start-dash-state
1097             states::CommentStartDash => loop {
1098                 match get_char!(self, input) {
1099                     '-' => go!(self: to CommentEnd),
1100                     '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1101                     '>' => go!(self: error; emit_comment; to Data),
1102                     c => go!(self: push_comment '-'; push_comment c; to Comment),
1103                 }
1104             },
1105 
1106             //§ comment-state
1107             states::Comment => loop {
1108                 match get_char!(self, input) {
1109                     '-' => go!(self: to CommentEndDash),
1110                     '\0' => go!(self: error; push_comment '\u{fffd}'),
1111                     c => go!(self: push_comment c),
1112                 }
1113             },
1114 
1115             //§ comment-end-dash-state
1116             states::CommentEndDash => loop {
1117                 match get_char!(self, input) {
1118                     '-' => go!(self: to CommentEnd),
1119                     '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1120                     c => go!(self: push_comment '-'; push_comment c; to Comment),
1121                 }
1122             },
1123 
1124             //§ comment-end-state
1125             states::CommentEnd => loop {
1126                 match get_char!(self, input) {
1127                     '>' => go!(self: emit_comment; to Data),
1128                     '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1129                     '!' => go!(self: error; to CommentEndBang),
1130                     '-' => go!(self: error; push_comment '-'),
1131                     c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1132                 }
1133             },
1134 
1135             //§ comment-end-bang-state
1136             states::CommentEndBang => loop {
1137                 match get_char!(self, input) {
1138                     '-' => go!(self: append_comment "--!"; to CommentEndDash),
1139                     '>' => go!(self: emit_comment; to Data),
1140                     '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1141                     c => go!(self: append_comment "--!"; push_comment c; to Comment),
1142                 }
1143             },
1144 
1145             //§ doctype-state
1146             states::Doctype => loop {
1147                 match get_char!(self, input) {
1148                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1149                     _ => go!(self: error; reconsume BeforeDoctypeName),
1150                 }
1151             },
1152 
1153             //§ before-doctype-name-state
1154             states::BeforeDoctypeName => loop {
1155                 match get_char!(self, input) {
1156                     '\t' | '\n' | '\x0C' | ' ' => (),
1157                     '\0' => {
1158                         go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1159                     },
1160                     '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1161                     c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1162                                   to DoctypeName),
1163                 }
1164             },
1165 
1166             //§ doctype-name-state
1167             states::DoctypeName => loop {
1168                 match get_char!(self, input) {
1169                     '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1170                     '>' => go!(self: emit_doctype; to Data),
1171                     '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1172                     c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1173                 }
1174             },
1175 
1176             //§ after-doctype-name-state
1177             states::AfterDoctypeName => loop {
1178                 if eat!(self, input, "public") {
1179                     go!(self: to AfterDoctypeKeyword Public);
1180                 } else if eat!(self, input, "system") {
1181                     go!(self: to AfterDoctypeKeyword System);
1182                 } else {
1183                     match get_char!(self, input) {
1184                         '\t' | '\n' | '\x0C' | ' ' => (),
1185                         '>' => go!(self: emit_doctype; to Data),
1186                         _ => go!(self: error; force_quirks; to BogusDoctype),
1187                     }
1188                 }
1189             },
1190 
1191             //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1192             states::AfterDoctypeKeyword(kind) => loop {
1193                 match get_char!(self, input) {
1194                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1195                     '"' => {
1196                         go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1197                     },
1198                     '\'' => {
1199                         go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1200                     },
1201                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1202                     _ => go!(self: error; force_quirks; to BogusDoctype),
1203                 }
1204             },
1205 
1206             //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1207             states::BeforeDoctypeIdentifier(kind) => loop {
1208                 match get_char!(self, input) {
1209                     '\t' | '\n' | '\x0C' | ' ' => (),
1210                     '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1211                     '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1212                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1213                     _ => go!(self: error; force_quirks; to BogusDoctype),
1214                 }
1215             },
1216 
1217             //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1218             states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1219                 match get_char!(self, input) {
1220                     '"' => go!(self: to AfterDoctypeIdentifier kind),
1221                     '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1222                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1223                     c => go!(self: push_doctype_id kind c),
1224                 }
1225             },
1226 
1227             //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1228             states::DoctypeIdentifierSingleQuoted(kind) => loop {
1229                 match get_char!(self, input) {
1230                     '\'' => go!(self: to AfterDoctypeIdentifier kind),
1231                     '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1232                     '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1233                     c => go!(self: push_doctype_id kind c),
1234                 }
1235             },
1236 
1237             //§ after-doctype-public-identifier-state
1238             states::AfterDoctypeIdentifier(Public) => loop {
1239                 match get_char!(self, input) {
1240                     '\t' | '\n' | '\x0C' | ' ' => {
1241                         go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1242                     },
1243                     '>' => go!(self: emit_doctype; to Data),
1244                     '"' => {
1245                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1246                     },
1247                     '\'' => {
1248                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1249                     },
1250                     _ => go!(self: error; force_quirks; to BogusDoctype),
1251                 }
1252             },
1253 
1254             //§ after-doctype-system-identifier-state
1255             states::AfterDoctypeIdentifier(System) => loop {
1256                 match get_char!(self, input) {
1257                     '\t' | '\n' | '\x0C' | ' ' => (),
1258                     '>' => go!(self: emit_doctype; to Data),
1259                     _ => go!(self: error; to BogusDoctype),
1260                 }
1261             },
1262 
1263             //§ between-doctype-public-and-system-identifiers-state
1264             states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1265                 match get_char!(self, input) {
1266                     '\t' | '\n' | '\x0C' | ' ' => (),
1267                     '>' => go!(self: emit_doctype; to Data),
1268                     '"' => {
1269                         go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1270                     },
1271                     '\'' => {
1272                         go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1273                     },
1274                     _ => go!(self: error; force_quirks; to BogusDoctype),
1275                 }
1276             },
1277 
1278             //§ bogus-doctype-state
1279             states::BogusDoctype => loop {
1280                 match get_char!(self, input) {
1281                     '>' => go!(self: emit_doctype; to Data),
1282                     _ => (),
1283                 }
1284             },
1285 
1286             //§ bogus-comment-state
1287             states::BogusComment => loop {
1288                 match get_char!(self, input) {
1289                     '>' => go!(self: emit_comment; to Data),
1290                     '\0' => go!(self: push_comment '\u{fffd}'),
1291                     c => go!(self: push_comment c),
1292                 }
1293             },
1294 
1295             //§ markup-declaration-open-state
1296             states::MarkupDeclarationOpen => loop {
1297                 if eat_exact!(self, input, "--") {
1298                     go!(self: clear_comment; to CommentStart);
1299                 } else if eat!(self, input, "doctype") {
1300                     go!(self: to Doctype);
1301                 } else {
1302                     if self
1303                         .sink
1304                         .adjusted_current_node_present_but_not_in_html_namespace()
1305                     {
1306                         if eat_exact!(self, input, "[CDATA[") {
1307                             go!(self: clear_temp; to CdataSection);
1308                         }
1309                     }
1310                     go!(self: error; to BogusComment);
1311                 }
1312             },
1313 
1314             //§ cdata-section-state
1315             states::CdataSection => loop {
1316                 match get_char!(self, input) {
1317                     ']' => go!(self: to CdataSectionBracket),
1318                     '\0' => go!(self: emit_temp; emit '\0'),
1319                     c => go!(self: push_temp c),
1320                 }
1321             },
1322 
1323             //§ cdata-section-bracket
1324             states::CdataSectionBracket => match get_char!(self, input) {
1325                 ']' => go!(self: to CdataSectionEnd),
1326                 _ => go!(self: push_temp ']'; reconsume CdataSection),
1327             },
1328 
1329             //§ cdata-section-end
1330             states::CdataSectionEnd => loop {
1331                 match get_char!(self, input) {
1332                     ']' => go!(self: push_temp ']'),
1333                     '>' => go!(self: emit_temp; to Data),
1334                     _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1335                 }
1336             },
1337             //§ END
1338         }
1339     }
1340 
1341     fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1342         // FIXME HACK: Take and replace the tokenizer so we don't
1343         // double-mut-borrow self.  This is why it's boxed.
1344         let mut tok = self.char_ref_tokenizer.take().unwrap();
1345         let outcome = tok.step(self, input);
1346 
1347         let progress = match outcome {
1348             char_ref::Done => {
1349                 self.process_char_ref(tok.get_result());
1350                 return ProcessResult::Continue;
1351             },
1352 
1353             char_ref::Stuck => ProcessResult::Suspend,
1354             char_ref::Progress => ProcessResult::Continue,
1355         };
1356 
1357         self.char_ref_tokenizer = Some(tok);
1358         progress
1359     }
1360 
1361     fn process_char_ref(&mut self, char_ref: CharRef) {
1362         let CharRef {
1363             mut chars,
1364             mut num_chars,
1365         } = char_ref;
1366 
1367         if num_chars == 0 {
1368             chars[0] = '&';
1369             num_chars = 1;
1370         }
1371 
1372         for i in 0..num_chars {
1373             let c = chars[i as usize];
1374             match self.state {
1375                 states::Data | states::RawData(states::Rcdata) => go!(self: emit c),
1376 
1377                 states::AttributeValue(_) => go!(self: push_value c),
1378 
1379                 _ => panic!(
1380                     "state {:?} should not be reachable in process_char_ref",
1381                     self.state
1382                 ),
1383             }
1384         }
1385     }
1386 
1387     /// Indicate that we have reached the end of the input.
1388     pub fn end(&mut self) {
1389         // Handle EOF in the char ref sub-tokenizer, if there is one.
1390         // Do this first because it might un-consume stuff.
1391         let mut input = BufferQueue::new();
1392         match self.char_ref_tokenizer.take() {
1393             None => (),
1394             Some(mut tok) => {
1395                 tok.end_of_file(self, &mut input);
1396                 self.process_char_ref(tok.get_result());
1397             },
1398         }
1399 
1400         // Process all remaining buffered input.
1401         // If we're waiting for lookahead, we're not gonna get it.
1402         self.at_eof = true;
1403         assert!(matches!(self.run(&mut input), TokenizerResult::Done));
1404         assert!(input.is_empty());
1405 
1406         loop {
1407             match self.eof_step() {
1408                 ProcessResult::Continue => (),
1409                 ProcessResult::Suspend => break,
1410                 ProcessResult::Script(_) => unreachable!(),
1411             }
1412         }
1413 
1414         self.sink.end();
1415 
1416         if self.opts.profile {
1417             self.dump_profile();
1418         }
1419     }
1420 
1421     fn dump_profile(&self) {
1422         let mut results: Vec<(states::State, u64)> =
1423             self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1424         results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1425 
1426         let total: u64 = results
1427             .iter()
1428             .map(|&(_, t)| t)
1429             .fold(0, ::std::ops::Add::add);
1430         println!("\nTokenizer profile, in nanoseconds");
1431         println!("\n{:12}         total in token sink", self.time_in_sink);
1432         println!("\n{:12}         total in tokenizer", total);
1433 
1434         for (k, v) in results.into_iter() {
1435             let pct = 100.0 * (v as f64) / (total as f64);
1436             println!("{:12}  {:4.1}%  {:?}", v, pct, k);
1437         }
1438     }
1439 
1440     fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
1441         debug!("processing EOF in state {:?}", self.state);
1442         match self.state {
1443             states::Data |
1444             states::RawData(Rcdata) |
1445             states::RawData(Rawtext) |
1446             states::RawData(ScriptData) |
1447             states::Plaintext => go!(self: eof),
1448 
1449             states::TagName |
1450             states::RawData(ScriptDataEscaped(_)) |
1451             states::BeforeAttributeName |
1452             states::AttributeName |
1453             states::AfterAttributeName |
1454             states::BeforeAttributeValue |
1455             states::AttributeValue(_) |
1456             states::AfterAttributeValueQuoted |
1457             states::SelfClosingStartTag |
1458             states::ScriptDataEscapedDash(_) |
1459             states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),
1460 
1461             states::TagOpen => go!(self: error_eof; emit '<'; to Data),
1462 
1463             states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),
1464 
1465             states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
1466                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1467             },
1468 
1469             states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),
1470 
1471             states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),
1472 
1473             states::RawEndTagName(kind) => {
1474                 go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
1475             },
1476 
1477             states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),
1478 
1479             states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),
1480 
1481             states::ScriptDataDoubleEscapeEnd => {
1482                 go!(self: to RawData ScriptDataEscaped DoubleEscaped)
1483             },
1484 
1485             states::CommentStart |
1486             states::CommentStartDash |
1487             states::Comment |
1488             states::CommentEndDash |
1489             states::CommentEnd |
1490             states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),
1491 
1492             states::Doctype | states::BeforeDoctypeName => {
1493                 go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
1494             },
1495 
1496             states::DoctypeName |
1497             states::AfterDoctypeName |
1498             states::AfterDoctypeKeyword(_) |
1499             states::BeforeDoctypeIdentifier(_) |
1500             states::DoctypeIdentifierDoubleQuoted(_) |
1501             states::DoctypeIdentifierSingleQuoted(_) |
1502             states::AfterDoctypeIdentifier(_) |
1503             states::BetweenDoctypePublicAndSystemIdentifiers => {
1504                 go!(self: error_eof; force_quirks; emit_doctype; to Data)
1505             },
1506 
1507             states::BogusDoctype => go!(self: emit_doctype; to Data),
1508 
1509             states::BogusComment => go!(self: emit_comment; to Data),
1510 
1511             states::MarkupDeclarationOpen => go!(self: error; to BogusComment),
1512 
1513             states::CdataSection => go!(self: emit_temp; error_eof; to Data),
1514 
1515             states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),
1516 
1517             states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
1518         }
1519     }
1520 }
1521 
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use LocalName;

    // A `TokenSink` used to check that the line number passed to
    // `process_token` advances as input is consumed. `lines` pairs each
    // recorded token with the line number it was reported on; character
    // data is accumulated in `current_str` and flushed into `tokens`.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: Vec::new(),
                current_str: StrTendril::new(),
                lines: Vec::new(),
            }
        }

        // Record `token` together with its line, flushing pending text first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Move any accumulated character data into `tokens`.
        fn finish_str(&mut self) {
            if self.current_str.is_empty() {
                return;
            }
            let pending = replace(&mut self.current_str, StrTendril::new());
            self.tokens.push(CharacterTokens(pending));
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                EOFToken => {},

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                CharacterTokens(text) => {
                    self.current_str.push_slice(&text);
                },

                TagToken(mut tag) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    if let EndTag = tag.kind {
                        tag.self_closing = false;
                        tag.attrs.clear();
                    } else {
                        tag.attrs.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
                    }
                    self.push(TagToken(tag), line_number);
                },

                other => self.push(other, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Feed the chunks through a tokenizer one at a time and return each
    // recorded token paired with the line number it was reported on.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let mut tok = Tokenizer::new(LinesMatch::new(), opts);
        let mut buffer = BufferQueue::new();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Build a tag token with the given name and kind, no attributes, and
    // `self_closing` unset.
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Options shared by the line-number tests.
    fn line_test_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    // Tokenize `chunks` (one per input line) and check that the nested
    // `<a><b></b></a>` tags are reported on lines 1 through 4.
    fn assert_nested_tag_lines(chunks: &[&str]) {
        let input: Vec<StrTendril> = chunks
            .iter()
            .map(|&chunk| StrTendril::from(chunk))
            .collect();
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(input, line_test_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut target: Option<StrTendril> = None;
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut target, 'x');
        assert_eq!(target, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut target: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut target, 'x');
        assert_eq!(target, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        assert_nested_tag_lines(&["<a>\n", "<b>\n", "</b>\n", "</a>\n"]);
    }

    #[test]
    fn check_lines_with_new_line() {
        assert_nested_tag_lines(&["<a>\r\n", "<b>\r\n", "</b>\r\n", "</a>\r\n"]);
    }
}
1710