// Copyright 2014 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! The HTML5 tokenizer.
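//!
//! A minimal sketch of driving the tokenizer; `MySink` stands in for any
//! `TokenSink` implementation (it is not defined here), and the conversion
//! into `StrTendril` is abbreviated:
//!
//! ```ignore
//! let mut queue = BufferQueue::new();
//! queue.push_back(StrTendril::from("<p>Hello</p>"));
//! let mut tok = Tokenizer::new(MySink::default(), TokenizerOpts::default());
//! // `feed` returns Script(handle) if the sink pauses at a script element.
//! let _ = tok.feed(&mut queue);
//! ```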

pub use self::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag};
pub use self::interface::{Token, DoctypeToken, TagToken, CommentToken};
pub use self::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
pub use self::interface::{TokenSink, TokenSinkResult};

use self::states::{Rcdata, Rawtext, ScriptData, ScriptDataEscaped};
use self::states::{Escaped, DoubleEscaped};
use self::states::{Unquoted, SingleQuoted, DoubleQuoted};
use self::states::{DoctypeIdKind, Public, System};

use self::char_ref::{CharRef, CharRefTokenizer};

use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet};

use util::str::lower_ascii_letter;
use util::smallcharset::SmallCharSet;

use std::ascii::AsciiExt;
use std::mem::replace;
use std::default::Default;
use std::borrow::Cow::{self, Borrowed};
use std::collections::BTreeMap;

use {LocalName, QualName};
use tendril::StrTendril;

pub mod buffer_queue;
pub mod states;
mod interface;
mod char_ref;

pub enum ProcessResult<Handle> {
    Continue,
    Suspend,
    Script(Handle)
}

#[must_use]
pub enum TokenizerResult<Handle> {
    Done,
    Script(Handle)
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
    match *opt_str {
        Some(ref mut s) => s.push_char(c),
        None => *opt_str = Some(StrTendril::from_char(c)),
    }
}

/// Tokenizer options, with an impl for `Default`.
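///
/// For example, a caller that wants spec-exact error reporting might build
/// the options like this (a sketch; the remaining fields keep their defaults):
///
/// ```ignore
/// let opts = TokenizerOpts {
///     exact_errors: true,
///     .. Default::default()
/// };
/// ```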
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag.  Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}

impl Default for TokenizerOpts {
    fn default() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }
}

/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Track current line
    current_line: u64,
}

impl<Sink: TokenSink> Tokenizer<Sink> {
    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
        let start_tag_name = opts.last_start_tag_name.take()
            .map(|s| LocalName::from(&*s));
        let state = opts.initial_state.unwrap_or(states::Data);
        let discard_bom = opts.discard_bom;
        Tokenizer {
            opts: opts,
            sink: sink,
            state: state,
            char_ref_tokenizer: None,
            at_eof: false,
            current_char: '\0',
            reconsume: false,
            ignore_lf: false,
            discard_bom: discard_bom,
            current_tag_kind: StartTag,
            current_tag_name: StrTendril::new(),
            current_tag_self_closing: false,
            current_tag_attrs: vec!(),
            current_attr_name: StrTendril::new(),
            current_attr_value: StrTendril::new(),
            current_comment: StrTendril::new(),
            current_doctype: Doctype::new(),
            last_start_tag_name: start_tag_name,
            temp_buf: StrTendril::new(),
            state_profile: BTreeMap::new(),
            time_in_sink: 0,
            current_line: 1,
        }
    }

    pub fn unwrap(self) -> Sink {
        self.sink
    }

    pub fn sink<'a>(&'a self) -> &'a Sink {
        &self.sink
    }

    pub fn sink_mut<'a>(&'a mut self) -> &'a mut Sink {
        &mut self.sink
    }

    /// Feed an input string into the tokenizer.
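    ///
    /// Returns `TokenizerResult::Script(handle)` when the sink suspends
    /// tokenization at a script. A sketch of a typical driver loop follows;
    /// the `run_script` helper is hypothetical and may push more text onto
    /// the queue before the next call:
    ///
    /// ```ignore
    /// while let TokenizerResult::Script(handle) = tok.feed(&mut queue) {
    ///     run_script(handle);
    /// }
    /// ```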
    pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
        if input.is_empty() {
            return TokenizerResult::Done;
        }

        if self.discard_bom {
            if let Some(c) = input.peek() {
                if c == '\u{feff}' {
                    input.next();
                }
            } else {
                return TokenizerResult::Done;
            }
        };

        self.run(input)
    }

    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }

    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
            self.time_in_sink += dt;
            ret
        } else {
            self.sink.process_token(token, self.current_line)
        }
    }

    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(self.process_token(token), TokenSinkResult::Continue));
    }

    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    fn get_preprocessed_char(
            &mut self,
            mut c: char,
            input: &mut BufferQueue)
            -> Option<char> {
        if self.ignore_lf {
            self.ignore_lf = false;
            if c == '\n' {
                c = unwrap_or_return!(input.next(), None);
            }
        }

        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        if self.opts.exact_errors && match c as u32 {
            0x01...0x08 | 0x0B | 0x0E...0x1F | 0x7F...0x9F | 0xFDD0...0xFDEF => true,
            n if (n & 0xFFFE) == 0xFFFE => true,
            _ => false,
        } {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        debug!("got character {}", c);
        self.current_char = c;
        Some(c)
    }

    //§ tokenization
    // Get the next input character, if one is available.
    fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
        if self.reconsume {
            self.reconsume = false;
            Some(self.current_char)
        } else {
            input.next().and_then(|c| self.get_preprocessed_char(c, input))
        }
    }

    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume || self.ignore_lf {
            return self.get_char(input).map(|x| FromSet(x));
        }

        let d = input.pop_except_from(set);
        debug!("got characters {:?}", d);
        match d {
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),

            // NB: We don't set self.current_char for a run of characters not
            // in the set.  It shouldn't matter for the codepaths that use
            // this.
            _ => d
        }
    }

    // Check if the next characters are an ASCII case-insensitive match.  See
    // BufferQueue::eat.
    //
    // NB: this doesn't do input stream preprocessing or set the current input
    // character.
    fn eat(
            &mut self,
            input: &mut BufferQueue,
            pat: &str,
            eq: fn(&u8, &u8) -> bool)
            -> Option<bool> {
        input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
        match input.eat(pat, eq) {
            None if self.at_eof => Some(false),
            None => {
                while let Some(c) = input.next() {
                    self.temp_buf.push_char(c);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }

    /// Run the state machine for as long as we can.
    fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
        if self.opts.profile {
            loop {
                let state = self.state;
                let old_sink = self.time_in_sink;
                let (run, mut dt) = time!(self.step(input));
                dt -= (self.time_in_sink - old_sink);
                let new = match self.state_profile.get_mut(&state) {
                    Some(x) => {
                        *x += dt;
                        false
                    }
                    None => true,
                };
                if new {
                    // do this here because of borrow shenanigans
                    self.state_profile.insert(state, dt);
                }
                match run {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                }
            }
        } else {
            loop {
                match self.step(input) {
                    ProcessResult::Continue => (),
                    ProcessResult::Suspend => break,
                    ProcessResult::Script(node) => return TokenizerResult::Script(node),
                }
            }
        }
        TokenizerResult::Done
    }

    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}", self.current_char, self.state);
        self.emit_error(msg);
    }

    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}", self.state);
        self.emit_error(msg);
    }

    fn emit_char(&mut self, c: char) {
        self.process_token_and_continue(match c {
            '\0' => NullCharacterToken,
            _ => CharacterTokens(StrTendril::from_char(c)),
        });
    }

    // The string must not contain '\0'!
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }

    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                self.last_start_tag_name = Some(name.clone());
            }
            EndTag => {
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            }
        }

        let token = TagToken(Tag { kind: self.current_tag_kind,
            name: name,
            self_closing: self.current_tag_self_closing,
            attrs: replace(&mut self.current_tag_attrs, vec!()),
        });

        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            }
        }
    }

    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }

    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }

    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }

    fn discard_tag(&mut self) {
        self.current_tag_name.clear();
        self.current_tag_self_closing = false;
        self.current_tag_attrs = vec!();
    }

    fn create_tag(&mut self, kind: TagKind, c: char) {
        self.discard_tag();
        self.current_tag_name.push_char(c);
        self.current_tag_kind = kind;
    }

    fn have_appropriate_end_tag(&self) -> bool {
        match self.last_start_tag_name.as_ref() {
            Some(last) =>
                (self.current_tag_kind == EndTag)
                && (*self.current_tag_name == **last),
            None => false,
        }
    }

    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }

    fn finish_attribute(&mut self) {
        if self.current_attr_name.len() == 0 {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        // FIXME: linear time search, do we care?
        let dup = {
            let name = &*self.current_attr_name;
            self.current_tag_attrs.iter().any(|a| &*a.name.local == name)
        };

        if dup {
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.clear();
            self.current_attr_value.clear();
        } else {
            let name = LocalName::from(&*self.current_attr_name);
            self.current_attr_name.clear();
            self.current_tag_attrs.push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(ns!(), name),
                value: replace(&mut self.current_attr_value, StrTendril::new()),
            });
        }
    }

    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }

    fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
        match kind {
            Public => &mut self.current_doctype.public_id,
            System => &mut self.current_doctype.system_id,
        }
    }

    fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
        let id = self.doctype_id(kind);
        match *id {
            Some(ref mut s) => s.clear(),
            None => *id = Some(StrTendril::new()),
        }
    }

    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }

    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }

    fn peek(&mut self, input: &BufferQueue) -> Option<char> {
        if self.reconsume {
            Some(self.current_char)
        } else {
            input.peek()
        }
    }

    fn discard_char(&mut self, input: &mut BufferQueue) {
        let c = self.get_char(input);
        assert!(c.is_some());
    }

    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
}
//§ END

// Shorthand for common state machine behaviors.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : force_quirks                     ) => ( $me.current_doctype.force_quirks = true;             );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
);

// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );

// A little DSL for sequencing shorthand actions.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return ProcessResult::Continue;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);

macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));

// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));

macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));

macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));

macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));

macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));

impl<Sink: TokenSink> Tokenizer<Sink> {
    // Run the state machine for a while.
    // Return ProcessResult::Continue if we should be immediately re-invoked
    // (this just simplifies control flow vs. break / continue).
    fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
        if self.char_ref_tokenizer.is_some() {
            return self.step_char_ref_tokenizer(input);
        }

        debug!("processing in state {:?}", self.state);
        match self.state {
            //§ data-state
            states::Data => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\0'),
                    FromSet('&')  => go!(self: consume_char_ref),
                    FromSet('<')  => go!(self: to TagOpen),
                    FromSet(c)    => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ rcdata-state
            states::RawData(Rcdata) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet('&') => go!(self: consume_char_ref),
                    FromSet('<') => go!(self: to RawLessThanSign Rcdata),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ rawtext-state
            states::RawData(Rawtext) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet('<') => go!(self: to RawLessThanSign Rawtext),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ script-data-state
            states::RawData(ScriptData) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet('<') => go!(self: to RawLessThanSign ScriptData),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ script-data-escaped-state
            states::RawData(ScriptDataEscaped(Escaped)) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
                    FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ script-data-double-escaped-state
            states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
                    FromSet('<') => go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped),
                    FromSet(c) => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ plaintext-state
            states::Plaintext => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
                    FromSet('\0') => go!(self: error; emit '\u{fffd}'),
                    FromSet(c)    => go!(self: emit c),
                    NotFromSet(b) => self.emit_chars(b),
                }
            },

            //§ tag-open-state
            states::TagOpen => loop { match get_char!(self, input) {
                '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
                '/' => go!(self: to EndTagOpen),
                '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
                c => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: create_tag StartTag cl; to TagName),
                    None     => go!(self: error; emit '<'; reconsume Data),
                }
            }},

            //§ end-tag-open-state
            states::EndTagOpen => loop { match get_char!(self, input) {
                '>'  => go!(self: error; to Data),
                '\0' => go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment),
                c => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: create_tag EndTag cl; to TagName),
                    None     => go!(self: error; clear_comment; push_comment c; to BogusComment),
                }
            }},

            //§ tag-name-state
            states::TagName => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' '
                     => go!(self: to BeforeAttributeName),
                '/'  => go!(self: to SelfClosingStartTag),
                '>'  => go!(self: emit_tag Data),
                '\0' => go!(self: error; push_tag '\u{fffd}'),
                c    => go!(self: push_tag (c.to_ascii_lowercase())),
            }},

            //§ script-data-escaped-less-than-sign-state
            states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self, input) {
                '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
                c => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
                                    to ScriptDataEscapeStart DoubleEscaped),
                    None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
                }
            }},

            //§ script-data-double-escaped-less-than-sign-state
            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self, input) {
                '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
                _   => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
            }},

            //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
            // otherwise
            states::RawLessThanSign(kind) => loop { match get_char!(self, input) {
                '/' => go!(self: clear_temp; to RawEndTagOpen kind),
                '!' if kind == ScriptData => go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped),
                _   => go!(self: emit '<'; reconsume RawData kind),
            }},

            //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
            states::RawEndTagOpen(kind) => loop {
                let c = get_char!(self, input);
                match lower_ascii_letter(c) {
                    Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
                    None     => go!(self: emit '<'; emit '/'; reconsume RawData kind),
                }
            },

            //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
            states::RawEndTagName(kind) => loop {
                let c = get_char!(self, input);
                if self.have_appropriate_end_tag() {
                    match c {
                        '\t' | '\n' | '\x0C' | ' '
                            => go!(self: to BeforeAttributeName),
                        '/' => go!(self: to SelfClosingStartTag),
                        '>' => go!(self: emit_tag Data),
                        _ => (),
                    }
                }

                match lower_ascii_letter(c) {
                    Some(cl) => go!(self: push_tag cl; push_temp c),
                    None     => go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind),
                }
            },

            //§ script-data-double-escape-start-state
            states::ScriptDataEscapeStart(DoubleEscaped) => loop {
                let c = get_char!(self, input);
                match c {
                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
                        let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped };
                        go!(self: emit c; to RawData ScriptDataEscaped esc);
                    }
                    _ => match lower_ascii_letter(c) {
                        Some(cl) => go!(self: push_temp cl; emit c),
                        None     => go!(self: reconsume RawData ScriptDataEscaped Escaped),
                    }
                }
            },

            //§ script-data-escape-start-state
            states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self, input) {
                '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
                _   => go!(self: reconsume RawData ScriptData),
            }},

            //§ script-data-escape-start-dash-state
            states::ScriptDataEscapeStartDash => loop { match get_char!(self, input) {
                '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
                _   => go!(self: reconsume RawData ScriptData),
            }},

            //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
            states::ScriptDataEscapedDash(kind) => loop { match get_char!(self, input) {
                '-'  => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
                '<'  => {
                    if kind == DoubleEscaped { go!(self: emit '<'); }
                    go!(self: to RawLessThanSign ScriptDataEscaped kind);
                }
                '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                c    => go!(self: emit c; to RawData ScriptDataEscaped kind),
            }},

            //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
            states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self, input) {
                '-'  => go!(self: emit '-'),
                '<'  => {
                    if kind == DoubleEscaped { go!(self: emit '<'); }
                    go!(self: to RawLessThanSign ScriptDataEscaped kind);
                }
                '>'  => go!(self: emit '>'; to RawData ScriptData),
                '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
                c    => go!(self: emit c; to RawData ScriptDataEscaped kind),
            }},

            //§ script-data-double-escape-end-state
            states::ScriptDataDoubleEscapeEnd => loop {
                let c = get_char!(self, input);
                match c {
                    '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
                        let esc = if &*self.temp_buf == "script" { Escaped } else { DoubleEscaped };
                        go!(self: emit c; to RawData ScriptDataEscaped esc);
                    }
                    _ => match lower_ascii_letter(c) {
                        Some(cl) => go!(self: push_temp cl; emit c),
                        None     => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
                    }
                }
            },

            //§ before-attribute-name-state
            states::BeforeAttributeName => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '/'  => go!(self: to SelfClosingStartTag),
                '>'  => go!(self: emit_tag Data),
                '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
                c    => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: create_attr cl; to AttributeName),
                    None => {
                        go_match!(self: c,
                            '"' , '\'' , '<' , '=' => error);
                        go!(self: create_attr c; to AttributeName);
                    }
                }
            }},

            //§ attribute-name-state
            states::AttributeName => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' '
                     => go!(self: to AfterAttributeName),
                '/'  => go!(self: to SelfClosingStartTag),
                '='  => go!(self: to BeforeAttributeValue),
                '>'  => go!(self: emit_tag Data),
                '\0' => go!(self: error; push_name '\u{fffd}'),
                c    => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: push_name cl),
                    None => {
                        go_match!(self: c,
                            '"' , '\'' , '<' => error);
                        go!(self: push_name c);
                    }
                }
            }},

            //§ after-attribute-name-state
            states::AfterAttributeName => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '/'  => go!(self: to SelfClosingStartTag),
                '='  => go!(self: to BeforeAttributeValue),
                '>'  => go!(self: emit_tag Data),
                '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
                c    => match lower_ascii_letter(c) {
                    Some(cl) => go!(self: create_attr cl; to AttributeName),
                    None => {
                        go_match!(self: c,
                            '"' , '\'' , '<' => error);
                        go!(self: create_attr c; to AttributeName);
                    }
                }
            }},

            //§ before-attribute-value-state
            // Use peek so we can handle the first attr character along with the rest,
            // hopefully in the same zero-copy buffer.
            states::BeforeAttributeValue => loop { match peek!(self, input) {
                '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
                '"'  => go!(self: discard_char input; to AttributeValue DoubleQuoted),
                '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
                '\0' => go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted),
                '>'  => go!(self: discard_char input; error; emit_tag Data),
                _    => go!(self: to AttributeValue Unquoted),
            }},

            //§ attribute-value-(double-quoted)-state
            states::AttributeValue(DoubleQuoted) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
                    FromSet('"')  => go!(self: to AfterAttributeValueQuoted),
                    FromSet('&')  => go!(self: consume_char_ref '"'),
                    FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
                    FromSet(c)    => go!(self: push_value c),
                    NotFromSet(ref b) => go!(self: append_value b),
                }
            },

            //§ attribute-value-(single-quoted)-state
            states::AttributeValue(SingleQuoted) => loop {
                match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
                    FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
                    FromSet('&')  => go!(self: consume_char_ref '\''),
                    FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
                    FromSet(c)    => go!(self: push_value c),
                    NotFromSet(ref b) => go!(self: append_value b),
                }
            },
1001 
1002             //§ attribute-value-(unquoted)-state
1003             states::AttributeValue(Unquoted) => loop {
1004                 match pop_except_from!(self, input, small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')) {
1005                     FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ')
1006                      => go!(self: to BeforeAttributeName),
1007                     FromSet('&')  => go!(self: consume_char_ref '>'),
1008                     FromSet('>')  => go!(self: emit_tag Data),
1009                     FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1010                     FromSet(c) => {
1011                         go_match!(self: c,
1012                             '"' , '\'' , '<' , '=' , '`' => error);
1013                         go!(self: push_value c);
1014                     }
1015                     NotFromSet(ref b) => go!(self: append_value b),
1016                 }
1017             },
1018 
1019             //§ after-attribute-value-(quoted)-state
1020             states::AfterAttributeValueQuoted => loop { match get_char!(self, input) {
1021                 '\t' | '\n' | '\x0C' | ' '
1022                      => go!(self: to BeforeAttributeName),
1023                 '/'  => go!(self: to SelfClosingStartTag),
1024                 '>'  => go!(self: emit_tag Data),
1025                 _    => go!(self: error; reconsume BeforeAttributeName),
1026             }},
1027 
1028             //§ self-closing-start-tag-state
1029             states::SelfClosingStartTag => loop { match get_char!(self, input) {
1030                 '>' => {
1031                     self.current_tag_self_closing = true;
1032                     go!(self: emit_tag Data);
1033                 }
1034                 _ => go!(self: error; reconsume BeforeAttributeName),
1035             }},
1036 
1037             //§ comment-start-state
1038             states::CommentStart => loop { match get_char!(self, input) {
1039                 '-'  => go!(self: to CommentStartDash),
1040                 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1041                 '>'  => go!(self: error; emit_comment; to Data),
1042                 c    => go!(self: push_comment c; to Comment),
1043             }},
1044 
1045             //§ comment-start-dash-state
1046             states::CommentStartDash => loop { match get_char!(self, input) {
1047                 '-'  => go!(self: to CommentEnd),
1048                 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1049                 '>'  => go!(self: error; emit_comment; to Data),
1050                 c    => go!(self: push_comment '-'; push_comment c; to Comment),
1051             }},
1052 
1053             //§ comment-state
1054             states::Comment => loop { match get_char!(self, input) {
1055                 '-'  => go!(self: to CommentEndDash),
1056                 '\0' => go!(self: error; push_comment '\u{fffd}'),
1057                 c    => go!(self: push_comment c),
1058             }},
1059 
1060             //§ comment-end-dash-state
1061             states::CommentEndDash => loop { match get_char!(self, input) {
1062                 '-'  => go!(self: to CommentEnd),
1063                 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1064                 c    => go!(self: push_comment '-'; push_comment c; to Comment),
1065             }},
1066 
1067             //§ comment-end-state
1068             states::CommentEnd => loop { match get_char!(self, input) {
1069                 '>'  => go!(self: emit_comment; to Data),
1070                 '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1071                 '!'  => go!(self: error; to CommentEndBang),
1072                 '-'  => go!(self: error; push_comment '-'),
1073                 c    => go!(self: error; append_comment "--"; push_comment c; to Comment),
1074             }},
1075 
1076             //§ comment-end-bang-state
1077             states::CommentEndBang => loop { match get_char!(self, input) {
1078                 '-'  => go!(self: append_comment "--!"; to CommentEndDash),
1079                 '>'  => go!(self: emit_comment; to Data),
1080                 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1081                 c    => go!(self: append_comment "--!"; push_comment c; to Comment),
1082             }},
1083 
1084             //§ doctype-state
1085             states::Doctype => loop { match get_char!(self, input) {
1086                 '\t' | '\n' | '\x0C' | ' '
1087                     => go!(self: to BeforeDoctypeName),
1088                 _   => go!(self: error; reconsume BeforeDoctypeName),
1089             }},
1090 
1091             //§ before-doctype-name-state
1092             states::BeforeDoctypeName => loop { match get_char!(self, input) {
1093                 '\t' | '\n' | '\x0C' | ' ' => (),
1094                 '\0' => go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName),
1095                 '>'  => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1096                 c    => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1097                                   to DoctypeName),
1098             }},
1099 
1100             //§ doctype-name-state
1101             states::DoctypeName => loop { match get_char!(self, input) {
1102                 '\t' | '\n' | '\x0C' | ' '
1103                      => go!(self: clear_temp; to AfterDoctypeName),
1104                 '>'  => go!(self: emit_doctype; to Data),
1105                 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1106                 c    => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1107             }},
1108 
1109             //§ after-doctype-name-state
1110             states::AfterDoctypeName => loop {
1111                 if eat!(self, input, "public") {
1112                     go!(self: to AfterDoctypeKeyword Public);
1113                 } else if eat!(self, input, "system") {
1114                     go!(self: to AfterDoctypeKeyword System);
1115                 } else {
1116                     match get_char!(self, input) {
1117                         '\t' | '\n' | '\x0C' | ' ' => (),
1118                         '>' => go!(self: emit_doctype; to Data),
1119                         _   => go!(self: error; force_quirks; to BogusDoctype),
1120                     }
1121                 }
1122             },
1123 
1124             //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1125             states::AfterDoctypeKeyword(kind) => loop { match get_char!(self, input) {
1126                 '\t' | '\n' | '\x0C' | ' '
1127                      => go!(self: to BeforeDoctypeIdentifier kind),
1128                 '"'  => go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1129                 '\'' => go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                '>'  => go!(self: error; force_quirks; emit_doctype; to Data),
                _    => go!(self: error; force_quirks; to BogusDoctype),
            }},

            //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
            states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '"'  => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
                '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
                '>'  => go!(self: error; force_quirks; emit_doctype; to Data),
                _    => go!(self: error; force_quirks; to BogusDoctype),
            }},

            //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
            states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self, input) {
                '"'  => go!(self: to AfterDoctypeIdentifier kind),
                '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
                '>'  => go!(self: error; force_quirks; emit_doctype; to Data),
                c    => go!(self: push_doctype_id kind c),
            }},

            //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
            states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self, input) {
                '\'' => go!(self: to AfterDoctypeIdentifier kind),
                '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
                '>'  => go!(self: error; force_quirks; emit_doctype; to Data),
                c    => go!(self: push_doctype_id kind c),
            }},

            //§ after-doctype-public-identifier-state
            states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' '
                     => go!(self: to BetweenDoctypePublicAndSystemIdentifiers),
                '>'  => go!(self: emit_doctype; to Data),
                '"'  => go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System),
                '\'' => go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System),
                _    => go!(self: error; force_quirks; to BogusDoctype),
            }},

            //§ after-doctype-system-identifier-state
            states::AfterDoctypeIdentifier(System) => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>' => go!(self: emit_doctype; to Data),
                _   => go!(self: error; to BogusDoctype),
            }},

            //§ between-doctype-public-and-system-identifiers-state
            states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self, input) {
                '\t' | '\n' | '\x0C' | ' ' => (),
                '>'  => go!(self: emit_doctype; to Data),
                '"'  => go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System),
                '\'' => go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System),
                _    => go!(self: error; force_quirks; to BogusDoctype),
            }},

            //§ bogus-doctype-state
            states::BogusDoctype => loop { match get_char!(self, input) {
                '>'  => go!(self: emit_doctype; to Data),
                _    => (),
            }},

            //§ bogus-comment-state
            states::BogusComment => loop { match get_char!(self, input) {
                '>'  => go!(self: emit_comment; to Data),
                '\0' => go!(self: push_comment '\u{fffd}'),
                c    => go!(self: push_comment c),
            }},

            //§ markup-declaration-open-state
            states::MarkupDeclarationOpen => loop {
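                // Per the spec: "--" must match exactly, "doctype" is matched
                // ASCII case-insensitively, and "[CDATA[" is honoured only when
                // the adjusted current node is in a foreign (non-HTML) namespace.
                // Anything else is a parse error handled as a bogus comment.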
                if eat_exact!(self, input, "--") {
                    go!(self: clear_comment; to CommentStart);
                } else if eat!(self, input, "doctype") {
                    go!(self: to Doctype);
                } else {
                    if self.sink.adjusted_current_node_present_but_not_in_html_namespace() {
                        if eat_exact!(self, input, "[CDATA[") {
                            go!(self: clear_temp; to CdataSection);
                        }
                    }
                    go!(self: error; to BogusComment);
                }
            },

            //§ cdata-section-state
            states::CdataSection => loop { match get_char!(self, input) {
                ']' => go!(self: to CdataSectionBracket),
                '\0' => go!(self: emit_temp; emit '\0'),
                c => go!(self: push_temp c),
            }},

            //§ cdata-section-bracket
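            // We have seen one ']'. If the next character does not continue a
            // "]]>" sequence, the ']' is ordinary data: push it to the temp
            // buffer and reprocess the character in the CDATA section state.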
            states::CdataSectionBracket => match get_char!(self, input) {
                ']' => go!(self: to CdataSectionEnd),
                _ => go!(self: push_temp ']'; reconsume CdataSection),
            },

            //§ cdata-section-end
            states::CdataSectionEnd => loop { match get_char!(self, input) {
                ']' => go!(self: push_temp ']'),
                '>' => go!(self: emit_temp; to Data),
                _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
            }},

            //§ END
        }
    }

    fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
        // FIXME HACK: Take and replace the tokenizer so we don't
        // double-mut-borrow self.  This is why it's boxed.
        let mut tok = self.char_ref_tokenizer.take().unwrap();
        let outcome = tok.step(self, input);

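        // Map the sub-tokenizer's outcome onto our own ProcessResult: a finished
        // reference is handled right away, Stuck suspends us until more input
        // arrives, and Progress means it consumed something and can keep going.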
        let progress = match outcome {
            char_ref::Done => {
                self.process_char_ref(tok.get_result());
                return ProcessResult::Continue;
            }

            char_ref::Stuck => ProcessResult::Suspend,
            char_ref::Progress => ProcessResult::Continue,
        };

        self.char_ref_tokenizer = Some(tok);
        progress
    }

    fn process_char_ref(&mut self, char_ref: CharRef) {
        let CharRef { mut chars, mut num_chars } = char_ref;

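        // If the sub-tokenizer produced no characters, the '&' did not begin a
        // character reference after all; emit it as a literal ampersand.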
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

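        // Route the resulting character(s) according to the state that started
        // the reference: data-like states emit them as character tokens, while
        // attribute-value states append them to the current attribute's value.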
        for i in 0 .. num_chars {
            let c = chars[i as usize];
            match self.state {
                states::Data | states::RawData(states::Rcdata)
                    => go!(self: emit c),

                states::AttributeValue(_)
                    => go!(self: push_value c),

                _ => panic!("state {:?} should not be reachable in process_char_ref", self.state),
            }
        }
    }

    /// Indicate that we have reached the end of the input.
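    ///
    /// A minimal sketch of a typical driver, mirroring the usage in the tests
    /// below; `sink` and `chunk` stand in for whatever the caller supplies:
    ///
    /// ```ignore
    /// let mut tok = Tokenizer::new(sink, TokenizerOpts::default());
    /// let mut buffer = BufferQueue::new();
    /// buffer.push_back(chunk);
    /// let _ = tok.feed(&mut buffer);
    /// tok.end(); // no further input may be fed after this point
    /// ```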
    pub fn end(&mut self) {
        // Handle EOF in the char ref sub-tokenizer, if there is one.
        // Do this first because it might un-consume stuff.
        let mut input = BufferQueue::new();
        match self.char_ref_tokenizer.take() {
            None => (),
            Some(mut tok) => {
                tok.end_of_file(self, &mut input);
                self.process_char_ref(tok.get_result());
            }
        }

        // Process all remaining buffered input.
        // If we're waiting for lookahead, we're not gonna get it.
        self.at_eof = true;
        assert!(matches!(self.run(&mut input), TokenizerResult::Done));
        assert!(input.is_empty());

        loop {
            match self.eof_step() {
                ProcessResult::Continue => (),
                ProcessResult::Suspend => break,
                ProcessResult::Script(_) => unreachable!(),
            }
        }

        self.sink.end();

        if self.opts.profile {
            self.dump_profile();
        }
    }

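    /// Print the time spent in each tokenizer state (collected when
    /// `opts.profile` is set), sorted by time descending, along with the
    /// total time spent in the token sink. Times are in nanoseconds.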
    fn dump_profile(&self) {
        let mut results: Vec<(states::State, u64)>
            = self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
        results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));

        let total: u64 = results.iter().map(|&(_, t)| t).fold(0, ::std::ops::Add::add);
        println!("\nTokenizer profile, in nanoseconds");
        println!("\n{:12}         total in token sink", self.time_in_sink);
        println!("\n{:12}         total in tokenizer", total);

        for (k, v) in results.into_iter() {
            let pct = 100.0 * (v as f64) / (total as f64);
            println!("{:12}  {:4.1}%  {:?}", v, pct, k);
        }
    }

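    /// Take one step of end-of-file processing: apply the current state's EOF
    /// behaviour, expressed with the same `go!` action shorthand used
    /// throughout the tokenizer.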
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            states::Data | states::RawData(Rcdata) | states::RawData(Rawtext)
            | states::RawData(ScriptData) | states::Plaintext
                => go!(self: eof),

            states::TagName | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName | states::AttributeName
            | states::AfterAttributeName | states::BeforeAttributeValue
            | states::AttributeValue(_) | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_)
                => go!(self: error_eof; to Data),

            states::TagOpen
                => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen
                => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped))
                => go!(self: to RawData ScriptDataEscaped DoubleEscaped),

            states::RawLessThanSign(kind)
                => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind)
                => go!(self: emit '<'; emit '/'; to RawData kind),

            states::RawEndTagName(kind)
                => go!(self: emit '<'; emit '/'; emit_temp; to RawData kind),

            states::ScriptDataEscapeStart(kind)
                => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash
                => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd
                => go!(self: to RawData ScriptDataEscaped DoubleEscaped),

            states::CommentStart | states::CommentStartDash
            | states::Comment | states::CommentEndDash
            | states::CommentEnd | states::CommentEndBang
                => go!(self: error_eof; emit_comment; to Data),

            states::Doctype | states::BeforeDoctypeName
                => go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data),

            states::DoctypeName | states::AfterDoctypeName | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_) | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_) | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers
                => go!(self: error_eof; force_quirks; emit_doctype; to Data),

            states::BogusDoctype
                => go!(self: emit_doctype; to Data),

            states::BogusComment
                => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen
                => go!(self: error; to BogusComment),

            states::CdataSection
                => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket
                => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd
                => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
}

#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use tendril::{StrTendril, SliceExt};

    use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult};

    use super::interface::{Token, TagToken};
    use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
    use super::interface::{TagKind, StartTag, EndTag, Tag};

    use super::buffer_queue::{BufferQueue};
    use std::mem::replace;

    use {LocalName};

    // LinesMatch implements the TokenSink trait. It is used in tests to check
    // that current_line is updated as process_token is called. The lines vector
    // records each completed token together with the line number it was emitted on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec!(),
                current_str: StrTendril::new(),
                lines: vec!(),
            }
        }

        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        fn finish_str(&mut self) {
            if self.current_str.len() > 0 {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }

    }

    impl TokenSink for LinesMatch {

        type Handle = ();

        fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {

            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                }

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                }

                ParseError(_) => {
                    panic!("unexpected parse error");
                }

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec!();
                        }
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                }

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Feed the input chunks through a tokenizer and return the resulting
    // (token, line number) pairs collected by the LinesMatch sink.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        let name = LocalName::from(&*token);
        let token = TagToken(Tag { kind: tagkind,
            name: name,
            self_closing: false,
            attrs: vec!(),
        });
        token
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        let vector = vec![StrTendril::from("<a>\n"), StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"), StrTendril::from("</a>\n")];
        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4)];
        let results = tokenize(vector, opts);
        assert_eq!(results, expected);
    }

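    // Same as check_lines, but with CRLF line endings: a "\r\n" sequence must
    // not be counted as two line breaks, so the expected line numbers match
    // the plain-"\n" case above.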
    #[test]
    fn check_lines_with_new_line() {
        let opts = TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        };
        let vector = vec![StrTendril::from("<a>\r\n"), StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"), StrTendril::from("</a>\r\n")];
        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4)];
        let results = tokenize(vector, opts);
        assert_eq!(results, expected);
    }
}