1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 mod char_ref;
11 mod interface;
12 mod qname;
13 pub mod states;
14 
15 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken};
16 pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken};
17 pub use self::interface::{Doctype, Pi};
18 pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag};
19 pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink};
20 pub use crate::{LocalName, Namespace, Prefix};
21 
22 use log::debug;
23 use mac::{format_if, unwrap_or_return};
24 use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set};
25 use std::borrow::Cow::{self, Borrowed};
26 use std::collections::BTreeMap;
27 use std::mem::replace;
28 use crate::tendril::StrTendril;
29 use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
30 
31 use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use self::char_ref::{CharRef, CharRefTokenizer};
33 use self::qname::QualNameTokenizer;
34 use self::states::XmlState;
35 use self::states::{DoctypeKind, Public, System};
36 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
37 
/// Options that configure an `XmlTokenizer`.
///
/// The struct is `Copy`, and its `Default` impl supplies the default noted
/// on each field.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  When `None`, the tokenizer starts in
    /// `states::Data`.
    pub initial_state: Option<states::XmlState>,
}
57 
process_qname(tag_name: StrTendril) -> QualName58 fn process_qname(tag_name: StrTendril) -> QualName {
59     // If tag name can't possibly contain full namespace, skip qualified name
60     // parsing altogether. For a tag to have namespace it must look like:
61     //     a:b
62     // Since StrTendril are UTF-8, we know that minimal size in bytes must be
63     // three bytes minimum.
64     let split = if (&*tag_name).as_bytes().len() < 3 {
65         None
66     } else {
67         QualNameTokenizer::new((&*tag_name).as_bytes()).run()
68     };
69 
70     match split {
71         None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
72         Some(col) => {
73             let len = (&*tag_name).as_bytes().len() as u32;
74             let prefix = tag_name.subtendril(0, col);
75             let local = tag_name.subtendril(col + 1, len - col - 1);
76             let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
77             QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
78         },
79     }
80 }
81 
option_push(opt_str: &mut Option<StrTendril>, c: char)82 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
83     match *opt_str {
84         Some(ref mut s) => s.push_char(c),
85         None => *opt_str = Some(StrTendril::from_char(c)),
86     }
87 }
88 
89 impl Default for XmlTokenizerOpts {
default() -> XmlTokenizerOpts90     fn default() -> XmlTokenizerOpts {
91         XmlTokenizerOpts {
92             exact_errors: false,
93             discard_bom: true,
94             profile: false,
95             initial_state: None,
96         }
97     }
98 }
/// The Xml tokenizer.
///
/// Consumes characters from a `BufferQueue` and hands the resulting tokens
/// (and parse errors) to `sink`.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::XmlState,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Temporary buffer.  `eat` stores input here while it waits for more
    /// lookahead and pushes it back onto the input on its next call.
    temp_buf: StrTendril,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current doctype; emitted as a `DoctypeToken` and then reset.
    current_doctype: Doctype,

    /// Current comment.
    current_comment: StrTendril,

    /// Current processing instruction target.
    current_pi_target: StrTendril,

    /// Current processing instruction value.
    current_pi_data: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::XmlState, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,
}
167 
168 impl<Sink: TokenSink> XmlTokenizer<Sink> {
169     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink>170     pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
171         if opts.profile && cfg!(for_c) {
172             panic!("Can't profile tokenizer when built as a C library");
173         }
174 
175         let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
176         let discard_bom = opts.discard_bom;
177         XmlTokenizer {
178             opts: opts,
179             sink: sink,
180             state: state,
181             char_ref_tokenizer: None,
182             at_eof: false,
183             current_char: '\0',
184             reconsume: false,
185             ignore_lf: false,
186             temp_buf: StrTendril::new(),
187             discard_bom: discard_bom,
188             current_tag_kind: StartTag,
189             current_tag_name: StrTendril::new(),
190             current_tag_attrs: vec![],
191             current_attr_name: StrTendril::new(),
192             current_attr_value: StrTendril::new(),
193             current_comment: StrTendril::new(),
194             current_pi_data: StrTendril::new(),
195             current_pi_target: StrTendril::new(),
196             current_doctype: Doctype::new(),
197             state_profile: BTreeMap::new(),
198             time_in_sink: 0,
199         }
200     }
201 
202     /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue)203     pub fn feed(&mut self, input: &mut BufferQueue) {
204         if input.is_empty() {
205             return;
206         }
207 
208         if self.discard_bom {
209             if let Some(c) = input.peek() {
210                 if c == '\u{feff}' {
211                     input.next();
212                 }
213             } else {
214                 return;
215             }
216         };
217 
218         self.run(input);
219     }
220 
process_token(&mut self, token: Token)221     fn process_token(&mut self, token: Token) {
222         if self.opts.profile {
223             let (_, dt) = time!(self.sink.process_token(token));
224             self.time_in_sink += dt;
225         } else {
226             self.sink.process_token(token);
227         }
228     }
229 
230     // Get the next input character, which might be the character
231     // 'c' that we already consumed from the buffers.
get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char>232     fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
233         if self.ignore_lf {
234             self.ignore_lf = false;
235             if c == '\n' {
236                 c = unwrap_or_return!(input.next(), None);
237             }
238         }
239 
240         if c == '\r' {
241             self.ignore_lf = true;
242             c = '\n';
243         }
244 
245         // Normalize \x00 into \uFFFD
246         if c == '\x00' {
247             c = '\u{FFFD}'
248         }
249 
250         // Exclude forbidden Unicode characters
251         if self.opts.exact_errors &&
252             match c as u32 {
253                 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254                 n if (n & 0xFFFE) == 0xFFFE => true,
255                 _ => false,
256             }
257         {
258             let msg = format!("Bad character {}", c);
259             self.emit_error(Cow::Owned(msg));
260         }
261 
262         debug!("got character {}", c);
263         self.current_char = c;
264         Some(c)
265     }
266 
bad_eof_error(&mut self)267     fn bad_eof_error(&mut self) {
268         let msg = format_if!(
269             self.opts.exact_errors,
270             "Unexpected EOF",
271             "Saw EOF in state {:?}",
272             self.state
273         );
274         self.emit_error(msg);
275     }
276 
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>277     fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
278         // Bail to the slow path for various corner cases.
279         // This means that `FromSet` can contain characters not in the set!
280         // It shouldn't matter because the fallback `FromSet` case should
281         // always do the same thing as the `NotFromSet` case.
282         if self.opts.exact_errors || self.reconsume || self.ignore_lf {
283             return self.get_char(input).map(|x| FromSet(x));
284         }
285 
286         let d = input.pop_except_from(set);
287         debug!("got characters {:?}", d);
288         match d {
289             Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
290 
291             // NB: We don't set self.current_char for a run of characters not
292             // in the set.  It shouldn't matter for the codepaths that use
293             // this.
294             _ => d,
295         }
296     }
297 
298     // Check if the next characters are an ASCII case-insensitive match.  See
299     // BufferQueue::eat.
300     //
301     // NB: this doesn't do input stream preprocessing or set the current input
302     // character.
eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool>303     fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool> {
304         input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
305         match input.eat(pat, u8::eq_ignore_ascii_case) {
306             None if self.at_eof => Some(false),
307             None => {
308                 while let Some(c) = input.next() {
309                     self.temp_buf.push_char(c);
310                 }
311                 None
312             },
313             Some(matched) => Some(matched),
314         }
315     }
316 
317     /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue)318     pub fn run(&mut self, input: &mut BufferQueue) {
319         if self.opts.profile {
320             loop {
321                 let state = self.state;
322                 let old_sink = self.time_in_sink;
323                 let (run, mut dt) = time!(self.step(input));
324                 dt -= self.time_in_sink - old_sink;
325                 let new = match self.state_profile.get_mut(&state) {
326                     Some(x) => {
327                         *x += dt;
328                         false
329                     },
330                     None => true,
331                 };
332                 if new {
333                     // do this here because of borrow shenanigans
334                     self.state_profile.insert(state, dt);
335                 }
336                 if !run {
337                     break;
338                 }
339             }
340         } else {
341             while self.step(input) {}
342         }
343     }
344 
345     //§ tokenization
346     // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>347     fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
348         if self.reconsume {
349             self.reconsume = false;
350             Some(self.current_char)
351         } else {
352             input
353                 .next()
354                 .and_then(|c| self.get_preprocessed_char(c, input))
355         }
356     }
357 
bad_char_error(&mut self)358     fn bad_char_error(&mut self) {
359         let msg = format_if!(
360             self.opts.exact_errors,
361             "Bad character",
362             "Saw {} in state {:?}",
363             self.current_char,
364             self.state
365         );
366         self.emit_error(msg);
367     }
368 
discard_tag(&mut self)369     fn discard_tag(&mut self) {
370         self.current_tag_name = StrTendril::new();
371         self.current_tag_attrs = Vec::new();
372     }
373 
create_tag(&mut self, kind: TagKind, c: char)374     fn create_tag(&mut self, kind: TagKind, c: char) {
375         self.discard_tag();
376         self.current_tag_name.push_char(c);
377         self.current_tag_kind = kind;
378     }
379 
380     // This method creates a PI token and
381     // sets its target to given char
create_pi(&mut self, c: char)382     fn create_pi(&mut self, c: char) {
383         self.current_pi_target = StrTendril::new();
384         self.current_pi_data = StrTendril::new();
385         self.current_pi_target.push_char(c);
386     }
387 
emit_char(&mut self, c: char)388     fn emit_char(&mut self, c: char) {
389         self.process_token(CharacterTokens(StrTendril::from_char(match c {
390             '\0' => '\u{FFFD}',
391             c => c,
392         })));
393     }
394 
emit_short_tag(&mut self)395     fn emit_short_tag(&mut self) {
396         self.current_tag_kind = ShortTag;
397         self.current_tag_name = StrTendril::new();
398         self.emit_current_tag();
399     }
400 
emit_empty_tag(&mut self)401     fn emit_empty_tag(&mut self) {
402         self.current_tag_kind = EmptyTag;
403         self.emit_current_tag();
404     }
405 
set_empty_tag(&mut self)406     fn set_empty_tag(&mut self) {
407         self.current_tag_kind = EmptyTag;
408     }
409 
emit_start_tag(&mut self)410     fn emit_start_tag(&mut self) {
411         self.current_tag_kind = StartTag;
412         self.emit_current_tag();
413     }
414 
emit_current_tag(&mut self)415     fn emit_current_tag(&mut self) {
416         self.finish_attribute();
417 
418         let qname = process_qname(replace(&mut self.current_tag_name, StrTendril::new()));
419 
420         match self.current_tag_kind {
421             StartTag | EmptyTag => {},
422             EndTag => {
423                 if !self.current_tag_attrs.is_empty() {
424                     self.emit_error(Borrowed("Attributes on an end tag"));
425                 }
426             },
427             ShortTag => {
428                 if !self.current_tag_attrs.is_empty() {
429                     self.emit_error(Borrowed("Attributes on a short tag"));
430                 }
431             },
432         }
433 
434         let token = TagToken(Tag {
435             kind: self.current_tag_kind,
436             name: qname,
437             attrs: replace(&mut self.current_tag_attrs, vec![]),
438         });
439         self.process_token(token);
440 
441         match self.sink.query_state_change() {
442             None => (),
443             Some(s) => self.state = s,
444         }
445     }
446 
447     // The string must not contain '\0'!
emit_chars(&mut self, b: StrTendril)448     fn emit_chars(&mut self, b: StrTendril) {
449         self.process_token(CharacterTokens(b));
450     }
451 
452     // Emits the current Processing Instruction
emit_pi(&mut self)453     fn emit_pi(&mut self) {
454         let token = PIToken(Pi {
455             target: replace(&mut self.current_pi_target, StrTendril::new()),
456             data: replace(&mut self.current_pi_data, StrTendril::new()),
457         });
458         self.process_token(token);
459     }
460 
consume_char_ref(&mut self, addnl_allowed: Option<char>)461     fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
462         // NB: The char ref tokenizer assumes we have an additional allowed
463         // character iff we're tokenizing in an attribute value.
464         self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
465     }
466 
emit_eof(&mut self)467     fn emit_eof(&mut self) {
468         self.process_token(EOFToken);
469     }
470 
emit_error(&mut self, error: Cow<'static, str>)471     fn emit_error(&mut self, error: Cow<'static, str>) {
472         self.process_token(ParseError(error));
473     }
474 
emit_current_comment(&mut self)475     fn emit_current_comment(&mut self) {
476         let comment = replace(&mut self.current_comment, StrTendril::new());
477         self.process_token(CommentToken(comment));
478     }
479 
emit_current_doctype(&mut self)480     fn emit_current_doctype(&mut self) {
481         let doctype = replace(&mut self.current_doctype, Doctype::new());
482         self.process_token(DoctypeToken(doctype));
483     }
484 
doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option<StrTendril>485     fn doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option<StrTendril> {
486         match kind {
487             Public => &mut self.current_doctype.public_id,
488             System => &mut self.current_doctype.system_id,
489         }
490     }
491 
clear_doctype_id(&mut self, kind: DoctypeKind)492     fn clear_doctype_id(&mut self, kind: DoctypeKind) {
493         let id = self.doctype_id(kind);
494         match *id {
495             Some(ref mut s) => s.clear(),
496             None => *id = Some(StrTendril::new()),
497         }
498     }
499 
peek(&mut self, input: &mut BufferQueue) -> Option<char>500     fn peek(&mut self, input: &mut BufferQueue) -> Option<char> {
501         if self.reconsume {
502             Some(self.current_char)
503         } else {
504             input.peek()
505         }
506     }
507 
    // Consume the next input character and throw it away.
    //
    // Panics (through the assertion) when `get_char` yields no character.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        let c = self.get_char(input);
        assert!(c.is_some());
    }
512 
unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril)513     fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
514         input.push_front(buf);
515     }
516 }
517 
// Shorthand for common state machine behaviors.
//
// Each arm maps a short command, written as `me: command args...`, onto a
// single tokenizer method call, buffer operation or helper call.  An arm is
// only type-checked when it is expanded, so each one has to mirror the
// signature of the method it forwards to: `discard_tag` takes no argument,
// while `discard_char` needs the input queue.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c);                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.push_char($c);                 );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.push_char($c);                   );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag();                                 );
);
548 
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With `cfg(trace_tokenizer)` every command is logged through `debug!`
// before being forwarded to `shorthand!`; without it the command is
// forwarded directly.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
559 
// A little DSL for sequencing shorthand actions.
//
// A `go!` invocation is a `;`-separated list of commands.  Every command
// except the last is forwarded to `sh_trace!`.  The last command may be one
// of the terminal forms below, all of which return from the enclosing
// function; any other final command is forwarded to `sh_trace!` as well.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one arm per command length, from one to four token trees.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to another state and return `true`.
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return true;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return true;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    // Like `to`, but the current character is reconsumed in the new state.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Start tokenizing a character reference, optionally with an additional
    // allowed character, and return `true`.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return true;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // Short, empty and start tags each get their own emit command in Xml;
    // as with `emit_tag`, the default next state is set before emitting.
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    // Emit the current processing instruction and move to state `$s`.
    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    // Emit the EOF token and return `false`.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
623 
// Evaluates to the next input character.  When none is available, the
// function in which the macro is used returns `false` instead, which is why
// this has to be a macro.
macro_rules! get_char {
    ($me:expr, $input:expr) => {
        match $me.get_char($input) {
            Some(c) => c,
            None => return false,
        }
    };
}
629 
// Evaluates to the result of `pop_except_from`.  When it yields nothing, the
// function in which the macro is used returns `false` instead.
macro_rules! pop_except_from {
    ($me:expr, $input:expr, $set:expr) => {
        match $me.pop_except_from($input, $set) {
            Some(popped) => popped,
            None => return false,
        }
    };
}
633 
// Evaluates to the boolean result of `eat`.  When `eat` cannot decide yet,
// the function in which the macro is used returns `false` instead.
macro_rules! eat {
    ($me:expr, $input:expr, $pat:expr) => {
        match $me.eat($input, $pat) {
            Some(matched) => matched,
            None => return false,
        }
    };
}
637 
638 impl<Sink: TokenSink> XmlTokenizer<Sink> {
639     // Run the state machine for a while.
640     // Return true if we should be immediately re-invoked
641     // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> bool642     fn step(&mut self, input: &mut BufferQueue) -> bool {
643         if self.char_ref_tokenizer.is_some() {
644             return self.step_char_ref_tokenizer(input);
645         }
646 
647         debug!("processing in state {:?}", self.state);
648         match self.state {
649             XmlState::Quiescent => {
650                 self.state = XmlState::Data;
651                 return false;
652             },
653             //§ data-state
654             XmlState::Data => loop {
655                 match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
656                     FromSet('&') => go!(self: consume_char_ref),
657                     FromSet('<') => go!(self: to TagState),
658                     FromSet(c) => go!(self: emit c),
659                     NotFromSet(b) => self.emit_chars(b),
660                 }
661             },
662             //§ tag-state
663             XmlState::TagState => loop {
664                 match get_char!(self, input) {
665                     '!' => go!(self: to MarkupDecl),
666                     '/' => go!(self: to EndTagState),
667                     '?' => go!(self: to Pi),
668                     '\t' | '\n' | ' ' | ':' | '<' | '>' => {
669                         go!(self: error; emit '<'; reconsume Data)
670                     },
671                     cl => go!(self: create_tag StartTag cl; to TagName),
672                 }
673             },
674             //§ end-tag-state
675             XmlState::EndTagState => loop {
676                 match get_char!(self, input) {
677                     '>' => go!(self:  emit_short_tag Data),
678                     '\t' | '\n' | ' ' | '<' | ':' => {
679                         go!(self: error; emit '<'; emit '/'; reconsume Data)
680                     },
681                     cl => go!(self: create_tag EndTag cl; to EndTagName),
682                 }
683             },
684             //§ end-tag-name-state
685             XmlState::EndTagName => loop {
686                 match get_char!(self, input) {
687                     '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
688                     '/' => go!(self: error; to EndTagNameAfter),
689                     '>' => go!(self: emit_tag Data),
690                     cl => go!(self: push_tag cl),
691                 }
692             },
693             //§ end-tag-name-after-state
694             XmlState::EndTagNameAfter => loop {
695                 match get_char!(self, input) {
696                     '>' => go!(self: emit_tag Data),
697                     '\t' | '\n' | ' ' => (),
698                     _ => self.emit_error(Borrowed("Unexpected element in tag name")),
699                 }
700             },
701             //§ pi-state
702             XmlState::Pi => loop {
703                 match get_char!(self, input) {
704                     '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
705                     cl => go!(self: create_pi cl; to PiTarget),
706                 }
707             },
708             //§ pi-target-state
709             XmlState::PiTarget => loop {
710                 match get_char!(self, input) {
711                     '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
712                     '?' => go!(self: to PiAfter),
713                     cl => go!(self: push_pi_target cl),
714                 }
715             },
716             //§ pi-target-after-state
717             XmlState::PiTargetAfter => loop {
718                 match get_char!(self, input) {
719                     '\t' | '\n' | ' ' => (),
720                     _ => go!(self: reconsume PiData),
721                 }
722             },
723             //§ pi-data-state
724             XmlState::PiData => loop {
725                 match get_char!(self, input) {
726                     '?' => go!(self: to PiAfter),
727                     cl => go!(self: push_pi_data cl),
728                 }
729             },
730             //§ pi-after-state
731             XmlState::PiAfter => loop {
732                 match get_char!(self, input) {
733                     '>' => go!(self: emit_pi Data),
734                     '?' => go!(self: to PiAfter),
735                     cl => go!(self: push_pi_data cl),
736                 }
737             },
738             //§ markup-declaration-state
739             XmlState::MarkupDecl => loop {
740                 if eat!(self, input, "--") {
741                     go!(self: clear_comment; to CommentStart);
742                 } else if eat!(self, input, "[CDATA[") {
743                     go!(self: to Cdata);
744                 } else if eat!(self, input, "DOCTYPE") {
745                     go!(self: to Doctype);
746                 } else {
747                     // FIXME: 'error' gives wrong message
748                     go!(self: error; to BogusComment);
749                 }
750             },
751             //§ comment-start-state
752             XmlState::CommentStart => loop {
753                 match get_char!(self, input) {
754                     '-' => go!(self: to CommentStartDash),
755                     '>' => go!(self: error; emit_comment; to Data),
756                     _ => go!(self: reconsume Comment),
757                 }
758             },
759             //§ comment-start-dash-state
760             XmlState::CommentStartDash => loop {
761                 match get_char!(self, input) {
762                     '-' => go!(self: to CommentEnd),
763                     '>' => go!(self: error; emit_comment; to Data),
764                     _ => go!(self: push_comment '-'; reconsume Comment),
765                 }
766             },
767             //§ comment-state
768             XmlState::Comment => loop {
769                 match get_char!(self, input) {
770                     '<' => go!(self: push_comment '<'; to CommentLessThan),
771                     '-' => go!(self: to CommentEndDash),
772                     c => go!(self: push_comment c),
773                 }
774             },
775             //§ comment-less-than-sign-state
776             XmlState::CommentLessThan => loop {
777                 match get_char!(self, input) {
778                     '!' => go!(self: push_comment '!';to CommentLessThanBang),
779                     '<' => go!(self: push_comment '<'),
780                     _ => go!(self: reconsume Comment),
781                 }
782             },
783             //§ comment-less-than-sign-bang-state
784             XmlState::CommentLessThanBang => loop {
785                 match get_char!(self, input) {
786                     '-' => go!(self: to CommentLessThanBangDash),
787                     _ => go!(self: reconsume Comment),
788                 }
789             },
790             //§ comment-less-than-sign-bang-dash-state
791             XmlState::CommentLessThanBangDash => loop {
792                 match get_char!(self, input) {
793                     '-' => go!(self: to CommentLessThanBangDashDash),
794                     _ => go!(self: reconsume CommentEndDash),
795                 }
796             },
797             //§ comment-less-than-sign-bang-dash-dash-state
798             XmlState::CommentLessThanBangDashDash => loop {
799                 match get_char!(self, input) {
800                     '>' => go!(self: reconsume CommentEnd),
801                     _ => go!(self: error; reconsume CommentEnd),
802                 }
803             },
804             //§ comment-end-dash-state
805             XmlState::CommentEndDash => loop {
806                 match get_char!(self, input) {
807                     '-' => go!(self: to CommentEnd),
808                     _ => go!(self: push_comment '-'; reconsume Comment),
809                 }
810             },
811             //§ comment-end-state
812             XmlState::CommentEnd => loop {
813                 match get_char!(self, input) {
814                     '>' => go!(self: emit_comment; to Data),
815                     '!' => go!(self: to CommentEndBang),
816                     '-' => go!(self: push_comment '-'),
817                     _ => go!(self: append_comment "--"; reconsume Comment),
818                 }
819             },
820             //§ comment-end-bang-state
821             XmlState::CommentEndBang => loop {
822                 match get_char!(self, input) {
823                     '-' => go!(self: append_comment "--!"; to CommentEndDash),
824                     '>' => go!(self: error; emit_comment; to Data),
825                     _ => go!(self: append_comment "--!"; reconsume Comment),
826                 }
827             },
828             //§ bogus-comment-state
829             XmlState::BogusComment => loop {
830                 match get_char!(self, input) {
831                     '>' => go!(self: emit_comment; to Data),
832                     c => go!(self: push_comment c),
833                 }
834             },
835             //§ cdata-state
836             XmlState::Cdata => loop {
837                 match get_char!(self, input) {
838                     ']' => go!(self: to CdataBracket),
839                     cl => go!(self: emit cl),
840                 }
841             },
842             //§ cdata-bracket-state
843             XmlState::CdataBracket => loop {
844                 match get_char!(self, input) {
845                     ']' => go!(self: to CdataEnd),
846                     cl => go!(self: emit ']'; emit cl; to Cdata),
847                 }
848             },
849             //§ cdata-end-state
850             XmlState::CdataEnd => loop {
851                 match get_char!(self, input) {
852                     '>' => go!(self: to Data),
853                     ']' => go!(self: emit ']'),
854                     cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
855                 }
856             },
857             //§ tag-name-state
858             XmlState::TagName => loop {
859                 match get_char!(self, input) {
860                     '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
861                     '>' => go!(self: emit_tag Data),
862                     '/' => go!(self: set_empty_tag; to TagEmpty),
863                     cl => go!(self: push_tag cl),
864                 }
865             },
866             //§ empty-tag-state
867             XmlState::TagEmpty => loop {
868                 match get_char!(self, input) {
869                     '>' => go!(self: emit_empty_tag Data),
870                     _ => go!(self: reconsume TagAttrValueBefore),
871                 }
872             },
873             //§ tag-attribute-name-before-state
874             XmlState::TagAttrNameBefore => loop {
875                 match get_char!(self, input) {
876                     '\t' | '\n' | ' ' => (),
877                     '>' => go!(self: emit_tag Data),
878                     '/' => go!(self: set_empty_tag; to TagEmpty),
879                     ':' => go!(self: error),
880                     cl => go!(self: create_attr cl; to TagAttrName),
881                 }
882             },
883             //§ tag-attribute-name-state
884             XmlState::TagAttrName => loop {
885                 match get_char!(self, input) {
886                     '=' => go!(self: to TagAttrValueBefore),
887                     '>' => go!(self: emit_tag Data),
888                     '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
889                     '/' => go!(self: set_empty_tag; to TagEmpty),
890                     cl => go!(self: push_name cl),
891                 }
892             },
893             //§ tag-attribute-name-after-state
894             XmlState::TagAttrNameAfter => loop {
895                 match get_char!(self, input) {
896                     '\t' | '\n' | ' ' => (),
897                     '=' => go!(self: to TagAttrValueBefore),
898                     '>' => go!(self: emit_tag Data),
899                     '/' => go!(self: set_empty_tag; to TagEmpty),
900                     cl => go!(self: create_attr cl; to TagAttrName),
901                 }
902             },
903             //§ tag-attribute-value-before-state
904             XmlState::TagAttrValueBefore => loop {
905                 match get_char!(self, input) {
906                     '\t' | '\n' | ' ' => (),
907                     '"' => go!(self: to TagAttrValue DoubleQuoted),
908                     '\'' => go!(self: to TagAttrValue SingleQuoted),
909                     '&' => go!(self: reconsume TagAttrValue(Unquoted)),
910                     '>' => go!(self: emit_tag Data),
911                     cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
912                 }
913             },
914             //§ tag-attribute-value-double-quoted-state
915             XmlState::TagAttrValue(DoubleQuoted) => loop {
916                 match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
917                     FromSet('"') => go!(self: to TagAttrNameBefore),
918                     FromSet('&') => go!(self: consume_char_ref '"' ),
919                     FromSet(c) => go!(self: push_value c),
920                     NotFromSet(ref b) => go!(self: append_value b),
921                 }
922             },
923             //§ tag-attribute-value-single-quoted-state
924             XmlState::TagAttrValue(SingleQuoted) => loop {
925                 match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
926                     FromSet('\'') => go!(self: to TagAttrNameBefore),
927                     FromSet('&') => go!(self: consume_char_ref '\''),
928                     FromSet(c) => go!(self: push_value c),
929                     NotFromSet(ref b) => go!(self: append_value b),
930                 }
931             },
932             //§ tag-attribute-value-unquoted-state
933             XmlState::TagAttrValue(Unquoted) => loop {
934                 match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
935                     FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
936                     FromSet('&') => go!(self: consume_char_ref),
937                     FromSet('>') => go!(self: emit_tag Data),
938                     FromSet(c) => go!(self: push_value c),
939                     NotFromSet(ref b) => go!(self: append_value b),
940                 }
941             },
942 
943             //§ doctype-state
944             XmlState::Doctype => loop {
945                 match get_char!(self, input) {
946                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
947                     _ => go!(self: error; reconsume BeforeDoctypeName),
948                 }
949             },
950             //§ before-doctype-name-state
951             XmlState::BeforeDoctypeName => loop {
952                 match get_char!(self, input) {
953                     '\t' | '\n' | '\x0C' | ' ' => (),
954                     '>' => go!(self: error; emit_doctype; to Data),
955                     c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
956                                   to DoctypeName),
957                 }
958             },
959             //§ doctype-name-state
960             XmlState::DoctypeName => loop {
961                 match get_char!(self, input) {
962                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
963                     '>' => go!(self: emit_doctype; to Data),
964                     c => go!(self: push_doctype_name (c.to_ascii_lowercase());
965                                   to DoctypeName),
966                 }
967             },
968             //§ after-doctype-name-state
969             XmlState::AfterDoctypeName => loop {
970                 if eat!(self, input, "public") {
971                     go!(self: to AfterDoctypeKeyword Public);
972                 } else if eat!(self, input, "system") {
973                     go!(self: to AfterDoctypeKeyword System);
974                 } else {
975                     match get_char!(self, input) {
976                         '\t' | '\n' | '\x0C' | ' ' => (),
977                         '>' => go!(self: emit_doctype; to Data),
978                         _ => go!(self: error; to BogusDoctype),
979                     }
980                 }
981             },
982             //§ after-doctype-public-keyword-state
983             XmlState::AfterDoctypeKeyword(Public) => loop {
984                 match get_char!(self, input) {
985                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
986                     '"' => {
987                         go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
988                     },
989                     '\'' => {
990                         go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
991                     },
992                     '>' => go!(self: error; emit_doctype; to Data),
993                     _ => go!(self: error; to BogusDoctype),
994                 }
995             },
996             //§ after-doctype-system-keyword-state
997             XmlState::AfterDoctypeKeyword(System) => loop {
998                 match get_char!(self, input) {
999                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1000                     '"' => {
1001                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1002                     },
1003                     '\'' => {
1004                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1005                     },
1006                     '>' => go!(self: error; emit_doctype; to Data),
1007                     _ => go!(self: error; to BogusDoctype),
1008                 }
1009             },
1010             //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1011             XmlState::BeforeDoctypeIdentifier(kind) => loop {
1012                 match get_char!(self, input) {
1013                     '\t' | '\n' | '\x0C' | ' ' => (),
1014                     '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1015                     '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1016                     '>' => go!(self: error; emit_doctype; to Data),
1017                     _ => go!(self: error; to BogusDoctype),
1018                 }
1019             },
1020             //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1021             XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1022                 match get_char!(self, input) {
1023                     '"' => go!(self: to AfterDoctypeIdentifier kind),
1024                     '>' => go!(self: error; emit_doctype; to Data),
1025                     c => go!(self: push_doctype_id kind c),
1026                 }
1027             },
1028             //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1029             XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1030                 match get_char!(self, input) {
1031                     '\'' => go!(self: to AfterDoctypeIdentifier kind),
1032                     '>' => go!(self: error; emit_doctype; to Data),
1033                     c => go!(self: push_doctype_id kind c),
1034                 }
1035             },
1036             //§ after_doctype_public_identifier_state
1037             XmlState::AfterDoctypeIdentifier(Public) => loop {
1038                 match get_char!(self, input) {
1039                     '\t' | '\n' | '\x0C' | ' ' => {
1040                         go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1041                     },
1042                     '\'' => {
1043                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1044                     },
1045                     '"' => {
1046                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1047                     },
1048                     '>' => go!(self: emit_doctype; to Data),
1049                     _ => go!(self: error; to BogusDoctype),
1050                 }
1051             },
1052             //§ after_doctype_system_identifier_state
1053             XmlState::AfterDoctypeIdentifier(System) => loop {
1054                 match get_char!(self, input) {
1055                     '\t' | '\n' | '\x0C' | ' ' => (),
1056                     '>' => go!(self: emit_doctype; to Data),
1057                     _ => go!(self: error; to BogusDoctype),
1058                 }
1059             },
1060             //§ between_doctype_public_and_system_identifier_state
1061             XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1062                 match get_char!(self, input) {
1063                     '\t' | '\n' | '\x0C' | ' ' => (),
1064                     '>' => go!(self: emit_doctype; to Data),
1065                     '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1066                     '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1067                     _ => go!(self: error; to BogusDoctype),
1068                 }
1069             },
1070             //§ bogus_doctype_state
1071             XmlState::BogusDoctype => loop {
1072                 match get_char!(self, input) {
1073                     '>' => go!(self: emit_doctype; to Data),
1074                     _ => (),
1075                 }
1076             },
1077         }
1078     }
1079 
1080     /// Indicate that we have reached the end of the input.
1081     pub fn end(&mut self) {
1082         // Handle EOF in the char ref sub-tokenizer, if there is one.
1083         // Do this first because it might un-consume stuff.
1084         let mut input = BufferQueue::new();
1085         match self.char_ref_tokenizer.take() {
1086             None => (),
1087             Some(mut tok) => {
1088                 tok.end_of_file(self, &mut input);
1089                 self.process_char_ref(tok.get_result());
1090             },
1091         }
1092 
1093         // Process all remaining buffered input.
1094         // If we're waiting for lookahead, we're not gonna get it.
1095         self.at_eof = true;
1096         self.run(&mut input);
1097 
1098         while self.eof_step() {
1099             // loop
1100         }
1101 
1102         self.sink.end();
1103 
1104         if self.opts.profile {
1105             self.dump_profile();
1106         }
1107     }
1108 
    /// Variant compiled only under the `for_c` cfg: it never produces a
    /// profile and panics through `unreachable!` if it is ever called.
    #[cfg(for_c)]
    fn dump_profile(&self) {
        unreachable!();
    }
1113 
1114     #[cfg(not(for_c))]
1115     fn dump_profile(&self) {
1116         let mut results: Vec<(states::XmlState, u64)> =
1117             self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1118         results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1119 
1120         let total: u64 = results
1121             .iter()
1122             .map(|&(_, t)| t)
1123             .fold(0, ::std::ops::Add::add);
1124         debug!("\nTokenizer profile, in nanoseconds");
1125         debug!("\n{:12}         total in token sink", self.time_in_sink);
1126         debug!("\n{:12}         total in tokenizer", total);
1127 
1128         for (k, v) in results.into_iter() {
1129             let pct = 100.0 * (v as f64) / (total as f64);
1130             debug!("{:12}  {:4.1}%  {:?}", v, pct, k);
1131         }
1132     }
1133 
1134     fn eof_step(&mut self) -> bool {
1135         debug!("processing EOF in state {:?}", self.state);
1136         match self.state {
1137             XmlState::Data | XmlState::Quiescent => go!(self: eof),
1138             XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1139                 go!(self: reconsume Comment)
1140             },
1141             XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1142             XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1143             XmlState::CommentStartDash |
1144             XmlState::Comment |
1145             XmlState::CommentEndDash |
1146             XmlState::CommentEnd |
1147             XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1148             XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1149             XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1150             XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1151             XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1152                 go!(self: error_eof; to Data)
1153             },
1154             XmlState::Pi => go!(self: error_eof; to BogusComment),
1155             XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1156             XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1157             XmlState::TagName |
1158             XmlState::TagAttrNameBefore |
1159             XmlState::EndTagName |
1160             XmlState::TagAttrNameAfter |
1161             XmlState::EndTagNameAfter |
1162             XmlState::TagAttrValueBefore |
1163             XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1164             XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1165             XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1166             XmlState::BeforeDoctypeName |
1167             XmlState::Doctype |
1168             XmlState::DoctypeName |
1169             XmlState::AfterDoctypeName |
1170             XmlState::AfterDoctypeKeyword(_) |
1171             XmlState::BeforeDoctypeIdentifier(_) |
1172             XmlState::AfterDoctypeIdentifier(_) |
1173             XmlState::DoctypeIdentifierSingleQuoted(_) |
1174             XmlState::DoctypeIdentifierDoubleQuoted(_) |
1175             XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1176                 go!(self: error_eof; emit_doctype; to Data)
1177             },
1178             XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1179             XmlState::BogusComment => go!(self: emit_comment; to Data),
1180         }
1181     }
1182 
1183     fn process_char_ref(&mut self, char_ref: CharRef) {
1184         let CharRef {
1185             mut chars,
1186             mut num_chars,
1187         } = char_ref;
1188 
1189         if num_chars == 0 {
1190             chars[0] = '&';
1191             num_chars = 1;
1192         }
1193 
1194         for i in 0..num_chars {
1195             let c = chars[i as usize];
1196             match self.state {
1197                 states::Data | states::Cdata => go!(self: emit c),
1198 
1199                 states::TagAttrValue(_) => go!(self: push_value c),
1200 
1201                 _ => panic!(
1202                     "state {:?} should not be reachable in process_char_ref",
1203                     self.state
1204                 ),
1205             }
1206         }
1207     }
1208 
1209     fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
1210         let mut tok = self.char_ref_tokenizer.take().unwrap();
1211         let outcome = tok.step(self, input);
1212 
1213         let progress = match outcome {
1214             char_ref::Done => {
1215                 self.process_char_ref(tok.get_result());
1216                 return true;
1217             },
1218 
1219             char_ref::Stuck => false,
1220             char_ref::Progress => true,
1221         };
1222 
1223         self.char_ref_tokenizer = Some(tok);
1224         progress
1225     }
1226 
1227     fn finish_attribute(&mut self) {
1228         if self.current_attr_name.len() == 0 {
1229             return;
1230         }
1231 
1232         // Check for a duplicate attribute.
1233         // FIXME: the spec says we should error as soon as the name is finished.
1234         // FIXME: linear time search, do we care?
1235         let dup = {
1236             let name = &self.current_attr_name[..];
1237             self.current_tag_attrs
1238                 .iter()
1239                 .any(|a| &*a.name.local == name)
1240         };
1241 
1242         if dup {
1243             self.emit_error(Borrowed("Duplicate attribute"));
1244             self.current_attr_name.clear();
1245             self.current_attr_value.clear();
1246         } else {
1247             let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
1248             let attr = Attribute {
1249                 name: qname.clone(),
1250                 value: replace(&mut self.current_attr_value, StrTendril::new()),
1251             };
1252 
1253             if qname.local == local_name!("xmlns") ||
1254                 qname.prefix == Some(namespace_prefix!("xmlns"))
1255             {
1256                 self.current_tag_attrs.insert(0, attr);
1257             } else {
1258                 self.current_tag_attrs.push(attr);
1259             }
1260         }
1261     }
1262 
    /// Begin a new attribute whose name starts with `c`.
    ///
    /// Whatever attribute was being accumulated is handed to
    /// `finish_attribute` first, which leaves the name buffer empty, so `c`
    /// becomes the first character of the new name.
    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
1268 }
1269 
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// Run `process_qname` on `input` and check the resulting prefix and
    /// local name.
    fn assert_qname(input: &str, prefix: Option<&str>, local: &str) {
        let qname = process_qname(input.to_tendril());
        assert_eq!(qname.prefix, prefix.map(|p| Prefix::from(p)));
        assert_eq!(qname.local, LocalName::from(local));
    }

    /// A single colon between two non-empty parts splits into prefix and
    /// local name.
    #[test]
    fn simple_namespace() {
        assert_qname("prefix:local", Some("prefix"), "local");
        assert_qname("a:b", Some("a"), "b");
    }

    /// Names with misplaced or repeated colons get no prefix and keep the
    /// whole text as their local name.
    #[test]
    fn wrong_namespaces() {
        let malformed = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for name in malformed.iter() {
            assert_qname(name, None, name);
        }
    }
}
1315