1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 mod char_ref;
11 mod interface;
12 mod qname;
13 pub mod states;
14 
15 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken};
16 pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken};
17 pub use self::interface::{Doctype, Pi};
18 pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag};
19 pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink};
20 pub use crate::{LocalName, Namespace, Prefix};
21 
22 use crate::tendril::StrTendril;
23 use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
24 use log::debug;
25 use mac::{format_if, unwrap_or_return};
26 use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set};
27 use std::borrow::Cow::{self, Borrowed};
28 use std::collections::BTreeMap;
29 use std::mem::replace;
30 
31 use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use self::char_ref::{CharRef, CharRefTokenizer};
33 use self::qname::QualNameTokenizer;
34 use self::states::XmlState;
35 use self::states::{DoctypeKind, Public, System};
36 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
37 
/// Tokenizer options, with an impl for `Default`.
///
/// The struct is `Copy`, so it is cheap to pass by value.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty?  Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream?  Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state?  Printed
    /// when `end()` is called.  Default: false
    pub profile: bool,

    /// Initial state override.  Only the test runner should use
    /// a non-`None` value!  Default: `None`
    pub initial_state: Option<states::XmlState>,
}
57 
process_qname(tag_name: StrTendril) -> QualName58 fn process_qname(tag_name: StrTendril) -> QualName {
59     // If tag name can't possibly contain full namespace, skip qualified name
60     // parsing altogether. For a tag to have namespace it must look like:
61     //     a:b
62     // Since StrTendril are UTF-8, we know that minimal size in bytes must be
63     // three bytes minimum.
64     let split = if (&*tag_name).as_bytes().len() < 3 {
65         None
66     } else {
67         QualNameTokenizer::new((&*tag_name).as_bytes()).run()
68     };
69 
70     match split {
71         None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
72         Some(col) => {
73             let len = (&*tag_name).as_bytes().len() as u32;
74             let prefix = tag_name.subtendril(0, col);
75             let local = tag_name.subtendril(col + 1, len - col - 1);
76             let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
77             QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
78         },
79     }
80 }
81 
option_push(opt_str: &mut Option<StrTendril>, c: char)82 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
83     match *opt_str {
84         Some(ref mut s) => s.push_char(c),
85         None => *opt_str = Some(StrTendril::from_char(c)),
86     }
87 }
88 
89 impl Default for XmlTokenizerOpts {
default() -> XmlTokenizerOpts90     fn default() -> XmlTokenizerOpts {
91         XmlTokenizerOpts {
92             exact_errors: false,
93             discard_bom: true,
94             profile: false,
95             initial_state: None,
96         }
97     }
98 }
/// The Xml tokenizer.
///
/// Holds the state-machine state plus the partially built token pieces
/// (tag, attribute, doctype, comment, processing instruction) and forwards
/// finished tokens to `sink`.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::XmlState,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character.  Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n?  In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one?  Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Temporary buffer.  `eat` pushes it back onto the front of the input
    /// and refills it with the unconsumed characters when it has to wait
    /// for more lookahead.
    temp_buf: StrTendril,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Current comment.
    current_comment: StrTendril,

    /// Current processing instruction target.
    current_pi_target: StrTendril,

    /// Current processing instruction value.
    current_pi_data: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::XmlState, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,
}
167 
168 impl<Sink: TokenSink> XmlTokenizer<Sink> {
169     /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink>170     pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
171         if opts.profile && cfg!(for_c) {
172             panic!("Can't profile tokenizer when built as a C library");
173         }
174 
175         let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
176         let discard_bom = opts.discard_bom;
177         XmlTokenizer {
178             opts,
179             sink,
180             state,
181             char_ref_tokenizer: None,
182             at_eof: false,
183             current_char: '\0',
184             reconsume: false,
185             ignore_lf: false,
186             temp_buf: StrTendril::new(),
187             discard_bom,
188             current_tag_kind: StartTag,
189             current_tag_name: StrTendril::new(),
190             current_tag_attrs: vec![],
191             current_attr_name: StrTendril::new(),
192             current_attr_value: StrTendril::new(),
193             current_comment: StrTendril::new(),
194             current_pi_data: StrTendril::new(),
195             current_pi_target: StrTendril::new(),
196             current_doctype: Doctype::new(),
197             state_profile: BTreeMap::new(),
198             time_in_sink: 0,
199         }
200     }
201 
202     /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue)203     pub fn feed(&mut self, input: &mut BufferQueue) {
204         if input.is_empty() {
205             return;
206         }
207 
208         if self.discard_bom {
209             if let Some(c) = input.peek() {
210                 if c == '\u{feff}' {
211                     input.next();
212                 }
213             } else {
214                 return;
215             }
216         };
217 
218         self.run(input);
219     }
220 
process_token(&mut self, token: Token)221     fn process_token(&mut self, token: Token) {
222         if self.opts.profile {
223             let (_, dt) = time!(self.sink.process_token(token));
224             self.time_in_sink += dt;
225         } else {
226             self.sink.process_token(token);
227         }
228     }
229 
230     // Get the next input character, which might be the character
231     // 'c' that we already consumed from the buffers.
get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char>232     fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
233         if self.ignore_lf {
234             self.ignore_lf = false;
235             if c == '\n' {
236                 c = unwrap_or_return!(input.next(), None);
237             }
238         }
239 
240         if c == '\r' {
241             self.ignore_lf = true;
242             c = '\n';
243         }
244 
245         // Normalize \x00 into \uFFFD
246         if c == '\x00' {
247             c = '\u{FFFD}'
248         }
249 
250         // Exclude forbidden Unicode characters
251         if self.opts.exact_errors &&
252             match c as u32 {
253                 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254                 n if (n & 0xFFFE) == 0xFFFE => true,
255                 _ => false,
256             }
257         {
258             let msg = format!("Bad character {}", c);
259             self.emit_error(Cow::Owned(msg));
260         }
261 
262         debug!("got character {}", c);
263         self.current_char = c;
264         Some(c)
265     }
266 
bad_eof_error(&mut self)267     fn bad_eof_error(&mut self) {
268         let msg = format_if!(
269             self.opts.exact_errors,
270             "Unexpected EOF",
271             "Saw EOF in state {:?}",
272             self.state
273         );
274         self.emit_error(msg);
275     }
276 
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>277     fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
278         // Bail to the slow path for various corner cases.
279         // This means that `FromSet` can contain characters not in the set!
280         // It shouldn't matter because the fallback `FromSet` case should
281         // always do the same thing as the `NotFromSet` case.
282         if self.opts.exact_errors || self.reconsume || self.ignore_lf {
283             return self.get_char(input).map(FromSet);
284         }
285 
286         let d = input.pop_except_from(set);
287         debug!("got characters {:?}", d);
288         match d {
289             Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
290 
291             // NB: We don't set self.current_char for a run of characters not
292             // in the set.  It shouldn't matter for the codepaths that use
293             // this.
294             _ => d,
295         }
296     }
297 
298     // Check if the next characters are an ASCII case-insensitive match.  See
299     // BufferQueue::eat.
300     //
301     // NB: this doesn't do input stream preprocessing or set the current input
302     // character.
eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool>303     fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool> {
304         input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
305         match input.eat(pat, u8::eq_ignore_ascii_case) {
306             None if self.at_eof => Some(false),
307             None => {
308                 while let Some(c) = input.next() {
309                     self.temp_buf.push_char(c);
310                 }
311                 None
312             },
313             Some(matched) => Some(matched),
314         }
315     }
316 
317     /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue)318     pub fn run(&mut self, input: &mut BufferQueue) {
319         if self.opts.profile {
320             loop {
321                 let state = self.state;
322                 let old_sink = self.time_in_sink;
323                 let (run, mut dt) = time!(self.step(input));
324                 dt -= self.time_in_sink - old_sink;
325                 let new = match self.state_profile.get_mut(&state) {
326                     Some(x) => {
327                         *x += dt;
328                         false
329                     },
330                     None => true,
331                 };
332                 if new {
333                     // do this here because of borrow shenanigans
334                     self.state_profile.insert(state, dt);
335                 }
336                 if !run {
337                     break;
338                 }
339             }
340         } else {
341             while self.step(input) {}
342         }
343     }
344 
345     //§ tokenization
346     // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>347     fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
348         if self.reconsume {
349             self.reconsume = false;
350             Some(self.current_char)
351         } else {
352             input
353                 .next()
354                 .and_then(|c| self.get_preprocessed_char(c, input))
355         }
356     }
357 
bad_char_error(&mut self)358     fn bad_char_error(&mut self) {
359         let msg = format_if!(
360             self.opts.exact_errors,
361             "Bad character",
362             "Saw {} in state {:?}",
363             self.current_char,
364             self.state
365         );
366         self.emit_error(msg);
367     }
368 
discard_tag(&mut self)369     fn discard_tag(&mut self) {
370         self.current_tag_name = StrTendril::new();
371         self.current_tag_attrs = Vec::new();
372     }
373 
create_tag(&mut self, kind: TagKind, c: char)374     fn create_tag(&mut self, kind: TagKind, c: char) {
375         self.discard_tag();
376         self.current_tag_name.push_char(c);
377         self.current_tag_kind = kind;
378     }
379 
380     // This method creates a PI token and
381     // sets its target to given char
create_pi(&mut self, c: char)382     fn create_pi(&mut self, c: char) {
383         self.current_pi_target = StrTendril::new();
384         self.current_pi_data = StrTendril::new();
385         self.current_pi_target.push_char(c);
386     }
387 
emit_char(&mut self, c: char)388     fn emit_char(&mut self, c: char) {
389         self.process_token(CharacterTokens(StrTendril::from_char(match c {
390             '\0' => '\u{FFFD}',
391             c => c,
392         })));
393     }
394 
emit_short_tag(&mut self)395     fn emit_short_tag(&mut self) {
396         self.current_tag_kind = ShortTag;
397         self.current_tag_name = StrTendril::new();
398         self.emit_current_tag();
399     }
400 
emit_empty_tag(&mut self)401     fn emit_empty_tag(&mut self) {
402         self.current_tag_kind = EmptyTag;
403         self.emit_current_tag();
404     }
405 
set_empty_tag(&mut self)406     fn set_empty_tag(&mut self) {
407         self.current_tag_kind = EmptyTag;
408     }
409 
emit_start_tag(&mut self)410     fn emit_start_tag(&mut self) {
411         self.current_tag_kind = StartTag;
412         self.emit_current_tag();
413     }
414 
emit_current_tag(&mut self)415     fn emit_current_tag(&mut self) {
416         self.finish_attribute();
417 
418         let qname = process_qname(replace(&mut self.current_tag_name, StrTendril::new()));
419 
420         match self.current_tag_kind {
421             StartTag | EmptyTag => {},
422             EndTag => {
423                 if !self.current_tag_attrs.is_empty() {
424                     self.emit_error(Borrowed("Attributes on an end tag"));
425                 }
426             },
427             ShortTag => {
428                 if !self.current_tag_attrs.is_empty() {
429                     self.emit_error(Borrowed("Attributes on a short tag"));
430                 }
431             },
432         }
433 
434         let token = TagToken(Tag {
435             kind: self.current_tag_kind,
436             name: qname,
437             attrs: replace(&mut self.current_tag_attrs, vec![]),
438         });
439         self.process_token(token);
440 
441         match self.sink.query_state_change() {
442             None => (),
443             Some(s) => self.state = s,
444         }
445     }
446 
447     // The string must not contain '\0'!
emit_chars(&mut self, b: StrTendril)448     fn emit_chars(&mut self, b: StrTendril) {
449         self.process_token(CharacterTokens(b));
450     }
451 
452     // Emits the current Processing Instruction
emit_pi(&mut self)453     fn emit_pi(&mut self) {
454         let token = PIToken(Pi {
455             target: replace(&mut self.current_pi_target, StrTendril::new()),
456             data: replace(&mut self.current_pi_data, StrTendril::new()),
457         });
458         self.process_token(token);
459     }
460 
consume_char_ref(&mut self, addnl_allowed: Option<char>)461     fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
462         // NB: The char ref tokenizer assumes we have an additional allowed
463         // character iff we're tokenizing in an attribute value.
464         self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
465     }
466 
emit_eof(&mut self)467     fn emit_eof(&mut self) {
468         self.process_token(EOFToken);
469     }
470 
emit_error(&mut self, error: Cow<'static, str>)471     fn emit_error(&mut self, error: Cow<'static, str>) {
472         self.process_token(ParseError(error));
473     }
474 
emit_current_comment(&mut self)475     fn emit_current_comment(&mut self) {
476         let comment = replace(&mut self.current_comment, StrTendril::new());
477         self.process_token(CommentToken(comment));
478     }
479 
emit_current_doctype(&mut self)480     fn emit_current_doctype(&mut self) {
481         let doctype = replace(&mut self.current_doctype, Doctype::new());
482         self.process_token(DoctypeToken(doctype));
483     }
484 
doctype_id(&mut self, kind: DoctypeKind) -> &mut Option<StrTendril>485     fn doctype_id(&mut self, kind: DoctypeKind) -> &mut Option<StrTendril> {
486         match kind {
487             Public => &mut self.current_doctype.public_id,
488             System => &mut self.current_doctype.system_id,
489         }
490     }
491 
clear_doctype_id(&mut self, kind: DoctypeKind)492     fn clear_doctype_id(&mut self, kind: DoctypeKind) {
493         let id = self.doctype_id(kind);
494         match *id {
495             Some(ref mut s) => s.clear(),
496             None => *id = Some(StrTendril::new()),
497         }
498     }
499 
peek(&mut self, input: &mut BufferQueue) -> Option<char>500     fn peek(&mut self, input: &mut BufferQueue) -> Option<char> {
501         if self.reconsume {
502             Some(self.current_char)
503         } else {
504             input.peek()
505         }
506     }
507 
    /// Consumes one character via `get_char` and throws it away.
    ///
    /// # Panics
    ///
    /// Panics if `get_char` returns `None`, i.e. no character was available.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        let c = self.get_char(input);
        assert!(c.is_some());
    }
512 
    /// Puts `buf` back at the front of `input` so it is read again next.
    fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
        input.push_front(buf);
    }
516 }
517 
// Shorthand for common state machine behaviors.
//
// Each arm maps one DSL command (as used inside `go!`) onto a method call or
// field update on the tokenizer `$me`.  `discard_tag` takes no argument and
// `discard_char` takes the input queue, matching the signatures of
// `XmlTokenizer::discard_tag` and `XmlTokenizer::discard_char`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c);                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.push_char($c);                 );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.push_char($c);                   );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag();                                 );
);
548 
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// Logs the command text at debug level, then runs it through `shorthand!`.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));
556 
// Untraced variant: forwards the command straight to `shorthand!`.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace {
    ( $me:ident : $($cmds:tt)* ) => {
        shorthand!($me: $($cmds)*)
    };
}
559 
// A little DSL for sequencing shorthand actions.
//
// Commands are separated by `;`.  Every command but the last is run through
// `sh_trace!`; the last one may be a terminal command listed below, which
// returns from the enclosing function (`true` for all of them except `eof`,
// which returns `false`).  Arms are tried in order, so their order matters.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Hence one arm per command length, from one to four token trees.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // `to`: switch to the named state (optionally with arguments).
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return true;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return true;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    // `reconsume`: like `to`, but the current character is read again.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // `consume_char_ref`: start a character reference, optionally with an
    // additional allowed character.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return true;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    // The state is therefore assigned before the tag is emitted.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // XML has dedicated emit commands for short, empty and start tags.
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    // Emit the current processing instruction, then continue in state `$s`.
    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    // Emit the EOF token and stop the state machine.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
623 
// This is a macro because it can cause early return
// from the function where it is used: when no character is
// available, the enclosing function returns `false`.
macro_rules! get_char {
    ($me:expr, $input:expr) => {
        match $me.get_char($input) {
            Some(c) => c,
            None => return false,
        }
    };
}
629 
// Yields the result of `pop_except_from`, or returns `false` from the
// enclosing function when no input is available.
macro_rules! pop_except_from {
    ($me:expr, $input:expr, $set:expr) => {
        match $me.pop_except_from($input, $set) {
            Some(result) => result,
            None => return false,
        }
    };
}
633 
// Yields whether `$pat` was matched by `eat`, or returns `false` from the
// enclosing function when more input is needed to decide.
macro_rules! eat {
    ($me:expr, $input:expr, $pat:expr) => {
        match $me.eat($input, $pat) {
            Some(matched) => matched,
            None => return false,
        }
    };
}
637 
638 impl<Sink: TokenSink> XmlTokenizer<Sink> {
639     // Run the state machine for a while.
640     // Return true if we should be immediately re-invoked
641     // (this just simplifies control flow vs. break / continue).
642     #[allow(clippy::never_loop)]
step(&mut self, input: &mut BufferQueue) -> bool643     fn step(&mut self, input: &mut BufferQueue) -> bool {
644         if self.char_ref_tokenizer.is_some() {
645             return self.step_char_ref_tokenizer(input);
646         }
647 
648         debug!("processing in state {:?}", self.state);
649         match self.state {
650             XmlState::Quiescent => {
651                 self.state = XmlState::Data;
652                 false
653             },
654             //§ data-state
655             XmlState::Data => loop {
656                 match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
657                     FromSet('&') => go!(self: consume_char_ref),
658                     FromSet('<') => go!(self: to TagState),
659                     FromSet(c) => go!(self: emit c),
660                     NotFromSet(b) => self.emit_chars(b),
661                 }
662             },
663             //§ tag-state
664             XmlState::TagState => loop {
665                 match get_char!(self, input) {
666                     '!' => go!(self: to MarkupDecl),
667                     '/' => go!(self: to EndTagState),
668                     '?' => go!(self: to Pi),
669                     '\t' | '\n' | ' ' | ':' | '<' | '>' => {
670                         go!(self: error; emit '<'; reconsume Data)
671                     },
672                     cl => go!(self: create_tag StartTag cl; to TagName),
673                 }
674             },
675             //§ end-tag-state
676             XmlState::EndTagState => loop {
677                 match get_char!(self, input) {
678                     '>' => go!(self:  emit_short_tag Data),
679                     '\t' | '\n' | ' ' | '<' | ':' => {
680                         go!(self: error; emit '<'; emit '/'; reconsume Data)
681                     },
682                     cl => go!(self: create_tag EndTag cl; to EndTagName),
683                 }
684             },
685             //§ end-tag-name-state
686             XmlState::EndTagName => loop {
687                 match get_char!(self, input) {
688                     '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
689                     '/' => go!(self: error; to EndTagNameAfter),
690                     '>' => go!(self: emit_tag Data),
691                     cl => go!(self: push_tag cl),
692                 }
693             },
694             //§ end-tag-name-after-state
695             XmlState::EndTagNameAfter => loop {
696                 match get_char!(self, input) {
697                     '>' => go!(self: emit_tag Data),
698                     '\t' | '\n' | ' ' => (),
699                     _ => self.emit_error(Borrowed("Unexpected element in tag name")),
700                 }
701             },
702             //§ pi-state
703             XmlState::Pi => loop {
704                 match get_char!(self, input) {
705                     '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
706                     cl => go!(self: create_pi cl; to PiTarget),
707                 }
708             },
709             //§ pi-target-state
710             XmlState::PiTarget => loop {
711                 match get_char!(self, input) {
712                     '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
713                     '?' => go!(self: to PiAfter),
714                     cl => go!(self: push_pi_target cl),
715                 }
716             },
717             //§ pi-target-after-state
718             XmlState::PiTargetAfter => loop {
719                 match get_char!(self, input) {
720                     '\t' | '\n' | ' ' => (),
721                     _ => go!(self: reconsume PiData),
722                 }
723             },
724             //§ pi-data-state
725             XmlState::PiData => loop {
726                 match get_char!(self, input) {
727                     '?' => go!(self: to PiAfter),
728                     cl => go!(self: push_pi_data cl),
729                 }
730             },
731             //§ pi-after-state
732             XmlState::PiAfter => loop {
733                 match get_char!(self, input) {
734                     '>' => go!(self: emit_pi Data),
735                     '?' => go!(self: to PiAfter),
736                     cl => go!(self: push_pi_data cl),
737                 }
738             },
739             //§ markup-declaration-state
740             XmlState::MarkupDecl => loop {
741                 if eat!(self, input, "--") {
742                     go!(self: clear_comment; to CommentStart);
743                 } else if eat!(self, input, "[CDATA[") {
744                     go!(self: to Cdata);
745                 } else if eat!(self, input, "DOCTYPE") {
746                     go!(self: to Doctype);
747                 } else {
748                     // FIXME: 'error' gives wrong message
749                     go!(self: error; to BogusComment);
750                 }
751             },
752             //§ comment-start-state
753             XmlState::CommentStart => loop {
754                 match get_char!(self, input) {
755                     '-' => go!(self: to CommentStartDash),
756                     '>' => go!(self: error; emit_comment; to Data),
757                     _ => go!(self: reconsume Comment),
758                 }
759             },
760             //§ comment-start-dash-state
761             XmlState::CommentStartDash => loop {
762                 match get_char!(self, input) {
763                     '-' => go!(self: to CommentEnd),
764                     '>' => go!(self: error; emit_comment; to Data),
765                     _ => go!(self: push_comment '-'; reconsume Comment),
766                 }
767             },
768             //§ comment-state
769             XmlState::Comment => loop {
770                 match get_char!(self, input) {
771                     '<' => go!(self: push_comment '<'; to CommentLessThan),
772                     '-' => go!(self: to CommentEndDash),
773                     c => go!(self: push_comment c),
774                 }
775             },
776             //§ comment-less-than-sign-state
777             XmlState::CommentLessThan => loop {
778                 match get_char!(self, input) {
779                     '!' => go!(self: push_comment '!';to CommentLessThanBang),
780                     '<' => go!(self: push_comment '<'),
781                     _ => go!(self: reconsume Comment),
782                 }
783             },
784             //§ comment-less-than-sign-bang-state
785             XmlState::CommentLessThanBang => loop {
786                 match get_char!(self, input) {
787                     '-' => go!(self: to CommentLessThanBangDash),
788                     _ => go!(self: reconsume Comment),
789                 }
790             },
791             //§ comment-less-than-sign-bang-dash-state
792             XmlState::CommentLessThanBangDash => loop {
793                 match get_char!(self, input) {
794                     '-' => go!(self: to CommentLessThanBangDashDash),
795                     _ => go!(self: reconsume CommentEndDash),
796                 }
797             },
798             //§ comment-less-than-sign-bang-dash-dash-state
799             XmlState::CommentLessThanBangDashDash => loop {
800                 match get_char!(self, input) {
801                     '>' => go!(self: reconsume CommentEnd),
802                     _ => go!(self: error; reconsume CommentEnd),
803                 }
804             },
805             //§ comment-end-dash-state
806             XmlState::CommentEndDash => loop {
807                 match get_char!(self, input) {
808                     '-' => go!(self: to CommentEnd),
809                     _ => go!(self: push_comment '-'; reconsume Comment),
810                 }
811             },
812             //§ comment-end-state
813             XmlState::CommentEnd => loop {
814                 match get_char!(self, input) {
815                     '>' => go!(self: emit_comment; to Data),
816                     '!' => go!(self: to CommentEndBang),
817                     '-' => go!(self: push_comment '-'),
818                     _ => go!(self: append_comment "--"; reconsume Comment),
819                 }
820             },
821             //§ comment-end-bang-state
822             XmlState::CommentEndBang => loop {
823                 match get_char!(self, input) {
824                     '-' => go!(self: append_comment "--!"; to CommentEndDash),
825                     '>' => go!(self: error; emit_comment; to Data),
826                     _ => go!(self: append_comment "--!"; reconsume Comment),
827                 }
828             },
829             //§ bogus-comment-state
830             XmlState::BogusComment => loop {
831                 match get_char!(self, input) {
832                     '>' => go!(self: emit_comment; to Data),
833                     c => go!(self: push_comment c),
834                 }
835             },
836             //§ cdata-state
837             XmlState::Cdata => loop {
838                 match get_char!(self, input) {
839                     ']' => go!(self: to CdataBracket),
840                     cl => go!(self: emit cl),
841                 }
842             },
843             //§ cdata-bracket-state
844             XmlState::CdataBracket => loop {
845                 match get_char!(self, input) {
846                     ']' => go!(self: to CdataEnd),
847                     cl => go!(self: emit ']'; emit cl; to Cdata),
848                 }
849             },
850             //§ cdata-end-state
851             XmlState::CdataEnd => loop {
852                 match get_char!(self, input) {
853                     '>' => go!(self: to Data),
854                     ']' => go!(self: emit ']'),
855                     cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
856                 }
857             },
858             //§ tag-name-state
859             XmlState::TagName => loop {
860                 match get_char!(self, input) {
861                     '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
862                     '>' => go!(self: emit_tag Data),
863                     '/' => go!(self: set_empty_tag; to TagEmpty),
864                     cl => go!(self: push_tag cl),
865                 }
866             },
867             //§ empty-tag-state
868             XmlState::TagEmpty => loop {
869                 match get_char!(self, input) {
870                     '>' => go!(self: emit_empty_tag Data),
871                     _ => go!(self: reconsume TagAttrValueBefore),
872                 }
873             },
874             //§ tag-attribute-name-before-state
875             XmlState::TagAttrNameBefore => loop {
876                 match get_char!(self, input) {
877                     '\t' | '\n' | ' ' => (),
878                     '>' => go!(self: emit_tag Data),
879                     '/' => go!(self: set_empty_tag; to TagEmpty),
880                     ':' => go!(self: error),
881                     cl => go!(self: create_attr cl; to TagAttrName),
882                 }
883             },
884             //§ tag-attribute-name-state
885             XmlState::TagAttrName => loop {
886                 match get_char!(self, input) {
887                     '=' => go!(self: to TagAttrValueBefore),
888                     '>' => go!(self: emit_tag Data),
889                     '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
890                     '/' => go!(self: set_empty_tag; to TagEmpty),
891                     cl => go!(self: push_name cl),
892                 }
893             },
894             //§ tag-attribute-name-after-state
895             XmlState::TagAttrNameAfter => loop {
896                 match get_char!(self, input) {
897                     '\t' | '\n' | ' ' => (),
898                     '=' => go!(self: to TagAttrValueBefore),
899                     '>' => go!(self: emit_tag Data),
900                     '/' => go!(self: set_empty_tag; to TagEmpty),
901                     cl => go!(self: create_attr cl; to TagAttrName),
902                 }
903             },
904             //§ tag-attribute-value-before-state
905             XmlState::TagAttrValueBefore => loop {
906                 match get_char!(self, input) {
907                     '\t' | '\n' | ' ' => (),
908                     '"' => go!(self: to TagAttrValue DoubleQuoted),
909                     '\'' => go!(self: to TagAttrValue SingleQuoted),
910                     '&' => go!(self: reconsume TagAttrValue(Unquoted)),
911                     '>' => go!(self: emit_tag Data),
912                     cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
913                 }
914             },
915             //§ tag-attribute-value-double-quoted-state
916             XmlState::TagAttrValue(DoubleQuoted) => loop {
917                 match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
918                     FromSet('"') => go!(self: to TagAttrNameBefore),
919                     FromSet('&') => go!(self: consume_char_ref '"' ),
920                     FromSet(c) => go!(self: push_value c),
921                     NotFromSet(ref b) => go!(self: append_value b),
922                 }
923             },
924             //§ tag-attribute-value-single-quoted-state
925             XmlState::TagAttrValue(SingleQuoted) => loop {
926                 match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
927                     FromSet('\'') => go!(self: to TagAttrNameBefore),
928                     FromSet('&') => go!(self: consume_char_ref '\''),
929                     FromSet(c) => go!(self: push_value c),
930                     NotFromSet(ref b) => go!(self: append_value b),
931                 }
932             },
            //§ tag-attribute-value-unquoted-state
934             XmlState::TagAttrValue(Unquoted) => loop {
935                 match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
936                     FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
937                     FromSet('&') => go!(self: consume_char_ref),
938                     FromSet('>') => go!(self: emit_tag Data),
939                     FromSet(c) => go!(self: push_value c),
940                     NotFromSet(ref b) => go!(self: append_value b),
941                 }
942             },
943 
944             //§ doctype-state
945             XmlState::Doctype => loop {
946                 match get_char!(self, input) {
947                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
948                     _ => go!(self: error; reconsume BeforeDoctypeName),
949                 }
950             },
951             //§ before-doctype-name-state
952             XmlState::BeforeDoctypeName => loop {
953                 match get_char!(self, input) {
954                     '\t' | '\n' | '\x0C' | ' ' => (),
955                     '>' => go!(self: error; emit_doctype; to Data),
956                     c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
957                                   to DoctypeName),
958                 }
959             },
960             //§ doctype-name-state
961             XmlState::DoctypeName => loop {
962                 match get_char!(self, input) {
963                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
964                     '>' => go!(self: emit_doctype; to Data),
965                     c => go!(self: push_doctype_name (c.to_ascii_lowercase());
966                                   to DoctypeName),
967                 }
968             },
969             //§ after-doctype-name-state
970             XmlState::AfterDoctypeName => loop {
971                 if eat!(self, input, "public") {
972                     go!(self: to AfterDoctypeKeyword Public);
973                 } else if eat!(self, input, "system") {
974                     go!(self: to AfterDoctypeKeyword System);
975                 } else {
976                     match get_char!(self, input) {
977                         '\t' | '\n' | '\x0C' | ' ' => (),
978                         '>' => go!(self: emit_doctype; to Data),
979                         _ => go!(self: error; to BogusDoctype),
980                     }
981                 }
982             },
983             //§ after-doctype-public-keyword-state
984             XmlState::AfterDoctypeKeyword(Public) => loop {
985                 match get_char!(self, input) {
986                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
987                     '"' => {
988                         go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
989                     },
990                     '\'' => {
991                         go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
992                     },
993                     '>' => go!(self: error; emit_doctype; to Data),
994                     _ => go!(self: error; to BogusDoctype),
995                 }
996             },
997             //§ after-doctype-system-keyword-state
998             XmlState::AfterDoctypeKeyword(System) => loop {
999                 match get_char!(self, input) {
1000                     '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1001                     '"' => {
1002                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1003                     },
1004                     '\'' => {
1005                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1006                     },
1007                     '>' => go!(self: error; emit_doctype; to Data),
1008                     _ => go!(self: error; to BogusDoctype),
1009                 }
1010             },
1011             //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1012             XmlState::BeforeDoctypeIdentifier(kind) => loop {
1013                 match get_char!(self, input) {
1014                     '\t' | '\n' | '\x0C' | ' ' => (),
1015                     '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1016                     '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1017                     '>' => go!(self: error; emit_doctype; to Data),
1018                     _ => go!(self: error; to BogusDoctype),
1019                 }
1020             },
1021             //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1022             XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1023                 match get_char!(self, input) {
1024                     '"' => go!(self: to AfterDoctypeIdentifier kind),
1025                     '>' => go!(self: error; emit_doctype; to Data),
1026                     c => go!(self: push_doctype_id kind c),
1027                 }
1028             },
1029             //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1030             XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1031                 match get_char!(self, input) {
1032                     '\'' => go!(self: to AfterDoctypeIdentifier kind),
1033                     '>' => go!(self: error; emit_doctype; to Data),
1034                     c => go!(self: push_doctype_id kind c),
1035                 }
1036             },
            //§ after_doctype_public_identifier_state
1038             XmlState::AfterDoctypeIdentifier(Public) => loop {
1039                 match get_char!(self, input) {
1040                     '\t' | '\n' | '\x0C' | ' ' => {
1041                         go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1042                     },
1043                     '\'' => {
1044                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1045                     },
1046                     '"' => {
1047                         go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1048                     },
1049                     '>' => go!(self: emit_doctype; to Data),
1050                     _ => go!(self: error; to BogusDoctype),
1051                 }
1052             },
            //§ after_doctype_system_identifier_state
1054             XmlState::AfterDoctypeIdentifier(System) => loop {
1055                 match get_char!(self, input) {
1056                     '\t' | '\n' | '\x0C' | ' ' => (),
1057                     '>' => go!(self: emit_doctype; to Data),
1058                     _ => go!(self: error; to BogusDoctype),
1059                 }
1060             },
1061             //§ between_doctype_public_and_system_identifier_state
1062             XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1063                 match get_char!(self, input) {
1064                     '\t' | '\n' | '\x0C' | ' ' => (),
1065                     '>' => go!(self: emit_doctype; to Data),
1066                     '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1067                     '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1068                     _ => go!(self: error; to BogusDoctype),
1069                 }
1070             },
1071             //§ bogus_doctype_state
1072             XmlState::BogusDoctype => loop {
1073                 match get_char!(self, input) {
1074                     '>' => go!(self: emit_doctype; to Data),
1075                     _ => (),
1076                 }
1077             },
1078         }
1079     }
1080 
1081     /// Indicate that we have reached the end of the input.
1082     pub fn end(&mut self) {
1083         // Handle EOF in the char ref sub-tokenizer, if there is one.
1084         // Do this first because it might un-consume stuff.
1085         let mut input = BufferQueue::new();
1086         match self.char_ref_tokenizer.take() {
1087             None => (),
1088             Some(mut tok) => {
1089                 tok.end_of_file(self, &mut input);
1090                 self.process_char_ref(tok.get_result());
1091             },
1092         }
1093 
1094         // Process all remaining buffered input.
1095         // If we're waiting for lookahead, we're not gonna get it.
1096         self.at_eof = true;
1097         self.run(&mut input);
1098 
1099         while self.eof_step() {
1100             // loop
1101         }
1102 
1103         self.sink.end();
1104 
1105         if self.opts.profile {
1106             self.dump_profile();
1107         }
1108     }
1109 
1110     #[cfg(for_c)]
1111     fn dump_profile(&self) {
1112         unreachable!();
1113     }
1114 
1115     #[cfg(not(for_c))]
1116     fn dump_profile(&self) {
1117         let mut results: Vec<(states::XmlState, u64)> =
1118             self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1119         results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1120 
1121         let total: u64 = results
1122             .iter()
1123             .map(|&(_, t)| t)
1124             .fold(0, ::std::ops::Add::add);
1125         debug!("\nTokenizer profile, in nanoseconds");
1126         debug!("\n{:12}         total in token sink", self.time_in_sink);
1127         debug!("\n{:12}         total in tokenizer", total);
1128 
1129         for (k, v) in results.into_iter() {
1130             let pct = 100.0 * (v as f64) / (total as f64);
1131             debug!("{:12}  {:4.1}%  {:?}", v, pct, k);
1132         }
1133     }
1134 
    /// Perform one step of end-of-input handling for the current state.
    ///
    /// `end` calls this in a loop and keeps going for as long as it returns
    /// `true`, so a state may hand over to another state (for example by
    /// reconsuming in `Comment` or switching to `Data`) and have that state's
    /// EOF handling run on the next call.
    ///
    /// NOTE(review): the return value of every arm comes from the expansion
    /// of the `go!` macro, which is defined outside this section — confirm
    /// there which commands continue the loop and which one ends it.
    fn eof_step(&mut self) -> bool {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // Nothing is pending in these states.
            XmlState::Data | XmlState::Quiescent => go!(self: eof),
            // Intermediate comment states defer to the state they would
            // have reconsumed in on ordinary input.
            XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
                go!(self: reconsume Comment)
            },
            XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
            XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
            // Input ended inside a comment.
            XmlState::CommentStartDash |
            XmlState::Comment |
            XmlState::CommentEndDash |
            XmlState::CommentEnd |
            XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
            // Input ended right after `<` or `</`: the characters are emitted.
            XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
            XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
            XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
            // Input ended inside a CDATA section.
            XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
                go!(self: error_eof; to Data)
            },
            XmlState::Pi => go!(self: error_eof; to BogusComment),
            XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
            XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
            // Input ended inside a tag.
            XmlState::TagName |
            XmlState::TagAttrNameBefore |
            XmlState::EndTagName |
            XmlState::TagAttrNameAfter |
            XmlState::EndTagNameAfter |
            XmlState::TagAttrValueBefore |
            XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
            // Input ended inside a processing instruction.
            XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
            // NOTE(review): this arm uses `emit_start_tag` where the other
            // tag states use `emit_tag` — confirm the difference is intended.
            XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
            // Input ended inside a DOCTYPE declaration.
            XmlState::BeforeDoctypeName |
            XmlState::Doctype |
            XmlState::DoctypeName |
            XmlState::AfterDoctypeName |
            XmlState::AfterDoctypeKeyword(_) |
            XmlState::BeforeDoctypeIdentifier(_) |
            XmlState::AfterDoctypeIdentifier(_) |
            XmlState::DoctypeIdentifierSingleQuoted(_) |
            XmlState::DoctypeIdentifierDoubleQuoted(_) |
            XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; emit_doctype; to Data)
            },
            // The bogus states emit what they have without an EOF error.
            XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
            XmlState::BogusComment => go!(self: emit_comment; to Data),
        }
    }
1183 
1184     fn process_char_ref(&mut self, char_ref: CharRef) {
1185         let CharRef {
1186             mut chars,
1187             mut num_chars,
1188         } = char_ref;
1189 
1190         if num_chars == 0 {
1191             chars[0] = '&';
1192             num_chars = 1;
1193         }
1194 
1195         for i in 0..num_chars {
1196             let c = chars[i as usize];
1197             match self.state {
1198                 states::Data | states::Cdata => go!(self: emit c),
1199 
1200                 states::TagAttrValue(_) => go!(self: push_value c),
1201 
1202                 _ => panic!(
1203                     "state {:?} should not be reachable in process_char_ref",
1204                     self.state
1205                 ),
1206             }
1207         }
1208     }
1209 
1210     fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
1211         let mut tok = self.char_ref_tokenizer.take().unwrap();
1212         let outcome = tok.step(self, input);
1213 
1214         let progress = match outcome {
1215             char_ref::Done => {
1216                 self.process_char_ref(tok.get_result());
1217                 return true;
1218             },
1219 
1220             char_ref::Stuck => false,
1221             char_ref::Progress => true,
1222         };
1223 
1224         self.char_ref_tokenizer = Some(tok);
1225         progress
1226     }
1227 
1228     fn finish_attribute(&mut self) {
1229         if self.current_attr_name.is_empty() {
1230             return;
1231         }
1232 
1233         // Check for a duplicate attribute.
1234         // FIXME: the spec says we should error as soon as the name is finished.
1235         // FIXME: linear time search, do we care?
1236         let dup = {
1237             let name = &self.current_attr_name[..];
1238             self.current_tag_attrs
1239                 .iter()
1240                 .any(|a| &*a.name.local == name)
1241         };
1242 
1243         if dup {
1244             self.emit_error(Borrowed("Duplicate attribute"));
1245             self.current_attr_name.clear();
1246             self.current_attr_value.clear();
1247         } else {
1248             let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
1249             let attr = Attribute {
1250                 name: qname.clone(),
1251                 value: replace(&mut self.current_attr_value, StrTendril::new()),
1252             };
1253 
1254             if qname.local == local_name!("xmlns") ||
1255                 qname.prefix == Some(namespace_prefix!("xmlns"))
1256             {
1257                 self.current_tag_attrs.insert(0, attr);
1258             } else {
1259                 self.current_tag_attrs.push(attr);
1260             }
1261         }
1262     }
1263 
    /// Begin a new attribute whose name starts with `c`.
    ///
    /// Any attribute that was still being accumulated is finalized first via
    /// `finish_attribute`, then `c` is pushed onto the attribute-name buffer.
    fn create_attribute(&mut self, c: char) {
        // Finalize the previous attribute before starting the next name.
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
1269 }
1270 
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// Names of the form `prefix:local` are split at the colon.
    #[test]
    fn simple_namespace() {
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for &(raw, prefix, local) in cases.iter() {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
            assert_eq!(qname.local, LocalName::from(local));
        }
    }

    /// Names with misplaced or repeated colons get no prefix and keep the
    /// whole input as their local name.
    #[test]
    fn wrong_namespaces() {
        let cases = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for &raw in cases.iter() {
            let qname = process_qname(raw.to_tendril());
            assert_eq!(qname.prefix, None);
            assert_eq!(qname.local, LocalName::from(raw));
        }
    }
}
1316