1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 mod char_ref;
11 mod interface;
12 mod qname;
13 pub mod states;
14
15 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken};
16 pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken};
17 pub use self::interface::{Doctype, Pi};
18 pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag};
19 pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink};
20 pub use crate::{LocalName, Namespace, Prefix};
21
22 use crate::tendril::StrTendril;
23 use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
24 use log::debug;
25 use mac::{format_if, unwrap_or_return};
26 use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set};
27 use std::borrow::Cow::{self, Borrowed};
28 use std::collections::BTreeMap;
29 use std::mem::replace;
30
31 use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use self::char_ref::{CharRef, CharRefTokenizer};
33 use self::qname::QualNameTokenizer;
34 use self::states::XmlState;
35 use self::states::{DoctypeKind, Public, System};
36 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
37
/// Options controlling the behavior of an `XmlTokenizer`.
///
/// The type is `Copy`, and its `Default` impl supplies the default noted
/// on each field.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value! Default: `None`
    pub initial_state: Option<states::XmlState>,
}
57
process_qname(tag_name: StrTendril) -> QualName58 fn process_qname(tag_name: StrTendril) -> QualName {
59 // If tag name can't possibly contain full namespace, skip qualified name
60 // parsing altogether. For a tag to have namespace it must look like:
61 // a:b
62 // Since StrTendril are UTF-8, we know that minimal size in bytes must be
63 // three bytes minimum.
64 let split = if (&*tag_name).as_bytes().len() < 3 {
65 None
66 } else {
67 QualNameTokenizer::new((&*tag_name).as_bytes()).run()
68 };
69
70 match split {
71 None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
72 Some(col) => {
73 let len = (&*tag_name).as_bytes().len() as u32;
74 let prefix = tag_name.subtendril(0, col);
75 let local = tag_name.subtendril(col + 1, len - col - 1);
76 let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
77 QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
78 },
79 }
80 }
81
option_push(opt_str: &mut Option<StrTendril>, c: char)82 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
83 match *opt_str {
84 Some(ref mut s) => s.push_char(c),
85 None => *opt_str = Some(StrTendril::from_char(c)),
86 }
87 }
88
89 impl Default for XmlTokenizerOpts {
default() -> XmlTokenizerOpts90 fn default() -> XmlTokenizerOpts {
91 XmlTokenizerOpts {
92 exact_errors: false,
93 discard_bom: true,
94 profile: false,
95 initial_state: None,
96 }
97 }
98 }
/// The Xml tokenizer.
///
/// Holds the state-machine state plus the partially built tag, attribute,
/// doctype, comment and processing-instruction data that is turned into
/// tokens for the sink.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::XmlState,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Temporary buffer
    temp_buf: StrTendril,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Current comment.
    current_comment: StrTendril,

    /// Current processing instruction target.
    current_pi_target: StrTendril,

    /// Current processing instruction value.
    current_pi_data: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::XmlState, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,
}
167
168 impl<Sink: TokenSink> XmlTokenizer<Sink> {
169 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink>170 pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
171 if opts.profile && cfg!(for_c) {
172 panic!("Can't profile tokenizer when built as a C library");
173 }
174
175 let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
176 let discard_bom = opts.discard_bom;
177 XmlTokenizer {
178 opts,
179 sink,
180 state,
181 char_ref_tokenizer: None,
182 at_eof: false,
183 current_char: '\0',
184 reconsume: false,
185 ignore_lf: false,
186 temp_buf: StrTendril::new(),
187 discard_bom,
188 current_tag_kind: StartTag,
189 current_tag_name: StrTendril::new(),
190 current_tag_attrs: vec![],
191 current_attr_name: StrTendril::new(),
192 current_attr_value: StrTendril::new(),
193 current_comment: StrTendril::new(),
194 current_pi_data: StrTendril::new(),
195 current_pi_target: StrTendril::new(),
196 current_doctype: Doctype::new(),
197 state_profile: BTreeMap::new(),
198 time_in_sink: 0,
199 }
200 }
201
202 /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue)203 pub fn feed(&mut self, input: &mut BufferQueue) {
204 if input.is_empty() {
205 return;
206 }
207
208 if self.discard_bom {
209 if let Some(c) = input.peek() {
210 if c == '\u{feff}' {
211 input.next();
212 }
213 } else {
214 return;
215 }
216 };
217
218 self.run(input);
219 }
220
process_token(&mut self, token: Token)221 fn process_token(&mut self, token: Token) {
222 if self.opts.profile {
223 let (_, dt) = time!(self.sink.process_token(token));
224 self.time_in_sink += dt;
225 } else {
226 self.sink.process_token(token);
227 }
228 }
229
230 // Get the next input character, which might be the character
231 // 'c' that we already consumed from the buffers.
get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char>232 fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
233 if self.ignore_lf {
234 self.ignore_lf = false;
235 if c == '\n' {
236 c = unwrap_or_return!(input.next(), None);
237 }
238 }
239
240 if c == '\r' {
241 self.ignore_lf = true;
242 c = '\n';
243 }
244
245 // Normalize \x00 into \uFFFD
246 if c == '\x00' {
247 c = '\u{FFFD}'
248 }
249
250 // Exclude forbidden Unicode characters
251 if self.opts.exact_errors &&
252 match c as u32 {
253 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254 n if (n & 0xFFFE) == 0xFFFE => true,
255 _ => false,
256 }
257 {
258 let msg = format!("Bad character {}", c);
259 self.emit_error(Cow::Owned(msg));
260 }
261
262 debug!("got character {}", c);
263 self.current_char = c;
264 Some(c)
265 }
266
bad_eof_error(&mut self)267 fn bad_eof_error(&mut self) {
268 let msg = format_if!(
269 self.opts.exact_errors,
270 "Unexpected EOF",
271 "Saw EOF in state {:?}",
272 self.state
273 );
274 self.emit_error(msg);
275 }
276
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>277 fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
278 // Bail to the slow path for various corner cases.
279 // This means that `FromSet` can contain characters not in the set!
280 // It shouldn't matter because the fallback `FromSet` case should
281 // always do the same thing as the `NotFromSet` case.
282 if self.opts.exact_errors || self.reconsume || self.ignore_lf {
283 return self.get_char(input).map(FromSet);
284 }
285
286 let d = input.pop_except_from(set);
287 debug!("got characters {:?}", d);
288 match d {
289 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(FromSet),
290
291 // NB: We don't set self.current_char for a run of characters not
292 // in the set. It shouldn't matter for the codepaths that use
293 // this.
294 _ => d,
295 }
296 }
297
298 // Check if the next characters are an ASCII case-insensitive match. See
299 // BufferQueue::eat.
300 //
301 // NB: this doesn't do input stream preprocessing or set the current input
302 // character.
eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool>303 fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool> {
304 input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
305 match input.eat(pat, u8::eq_ignore_ascii_case) {
306 None if self.at_eof => Some(false),
307 None => {
308 while let Some(c) = input.next() {
309 self.temp_buf.push_char(c);
310 }
311 None
312 },
313 Some(matched) => Some(matched),
314 }
315 }
316
317 /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue)318 pub fn run(&mut self, input: &mut BufferQueue) {
319 if self.opts.profile {
320 loop {
321 let state = self.state;
322 let old_sink = self.time_in_sink;
323 let (run, mut dt) = time!(self.step(input));
324 dt -= self.time_in_sink - old_sink;
325 let new = match self.state_profile.get_mut(&state) {
326 Some(x) => {
327 *x += dt;
328 false
329 },
330 None => true,
331 };
332 if new {
333 // do this here because of borrow shenanigans
334 self.state_profile.insert(state, dt);
335 }
336 if !run {
337 break;
338 }
339 }
340 } else {
341 while self.step(input) {}
342 }
343 }
344
345 //§ tokenization
346 // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>347 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
348 if self.reconsume {
349 self.reconsume = false;
350 Some(self.current_char)
351 } else {
352 input
353 .next()
354 .and_then(|c| self.get_preprocessed_char(c, input))
355 }
356 }
357
bad_char_error(&mut self)358 fn bad_char_error(&mut self) {
359 let msg = format_if!(
360 self.opts.exact_errors,
361 "Bad character",
362 "Saw {} in state {:?}",
363 self.current_char,
364 self.state
365 );
366 self.emit_error(msg);
367 }
368
discard_tag(&mut self)369 fn discard_tag(&mut self) {
370 self.current_tag_name = StrTendril::new();
371 self.current_tag_attrs = Vec::new();
372 }
373
create_tag(&mut self, kind: TagKind, c: char)374 fn create_tag(&mut self, kind: TagKind, c: char) {
375 self.discard_tag();
376 self.current_tag_name.push_char(c);
377 self.current_tag_kind = kind;
378 }
379
380 // This method creates a PI token and
381 // sets its target to given char
create_pi(&mut self, c: char)382 fn create_pi(&mut self, c: char) {
383 self.current_pi_target = StrTendril::new();
384 self.current_pi_data = StrTendril::new();
385 self.current_pi_target.push_char(c);
386 }
387
emit_char(&mut self, c: char)388 fn emit_char(&mut self, c: char) {
389 self.process_token(CharacterTokens(StrTendril::from_char(match c {
390 '\0' => '\u{FFFD}',
391 c => c,
392 })));
393 }
394
emit_short_tag(&mut self)395 fn emit_short_tag(&mut self) {
396 self.current_tag_kind = ShortTag;
397 self.current_tag_name = StrTendril::new();
398 self.emit_current_tag();
399 }
400
emit_empty_tag(&mut self)401 fn emit_empty_tag(&mut self) {
402 self.current_tag_kind = EmptyTag;
403 self.emit_current_tag();
404 }
405
set_empty_tag(&mut self)406 fn set_empty_tag(&mut self) {
407 self.current_tag_kind = EmptyTag;
408 }
409
emit_start_tag(&mut self)410 fn emit_start_tag(&mut self) {
411 self.current_tag_kind = StartTag;
412 self.emit_current_tag();
413 }
414
emit_current_tag(&mut self)415 fn emit_current_tag(&mut self) {
416 self.finish_attribute();
417
418 let qname = process_qname(replace(&mut self.current_tag_name, StrTendril::new()));
419
420 match self.current_tag_kind {
421 StartTag | EmptyTag => {},
422 EndTag => {
423 if !self.current_tag_attrs.is_empty() {
424 self.emit_error(Borrowed("Attributes on an end tag"));
425 }
426 },
427 ShortTag => {
428 if !self.current_tag_attrs.is_empty() {
429 self.emit_error(Borrowed("Attributes on a short tag"));
430 }
431 },
432 }
433
434 let token = TagToken(Tag {
435 kind: self.current_tag_kind,
436 name: qname,
437 attrs: replace(&mut self.current_tag_attrs, vec![]),
438 });
439 self.process_token(token);
440
441 match self.sink.query_state_change() {
442 None => (),
443 Some(s) => self.state = s,
444 }
445 }
446
447 // The string must not contain '\0'!
emit_chars(&mut self, b: StrTendril)448 fn emit_chars(&mut self, b: StrTendril) {
449 self.process_token(CharacterTokens(b));
450 }
451
452 // Emits the current Processing Instruction
emit_pi(&mut self)453 fn emit_pi(&mut self) {
454 let token = PIToken(Pi {
455 target: replace(&mut self.current_pi_target, StrTendril::new()),
456 data: replace(&mut self.current_pi_data, StrTendril::new()),
457 });
458 self.process_token(token);
459 }
460
consume_char_ref(&mut self, addnl_allowed: Option<char>)461 fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
462 // NB: The char ref tokenizer assumes we have an additional allowed
463 // character iff we're tokenizing in an attribute value.
464 self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
465 }
466
emit_eof(&mut self)467 fn emit_eof(&mut self) {
468 self.process_token(EOFToken);
469 }
470
emit_error(&mut self, error: Cow<'static, str>)471 fn emit_error(&mut self, error: Cow<'static, str>) {
472 self.process_token(ParseError(error));
473 }
474
emit_current_comment(&mut self)475 fn emit_current_comment(&mut self) {
476 let comment = replace(&mut self.current_comment, StrTendril::new());
477 self.process_token(CommentToken(comment));
478 }
479
emit_current_doctype(&mut self)480 fn emit_current_doctype(&mut self) {
481 let doctype = replace(&mut self.current_doctype, Doctype::new());
482 self.process_token(DoctypeToken(doctype));
483 }
484
doctype_id(&mut self, kind: DoctypeKind) -> &mut Option<StrTendril>485 fn doctype_id(&mut self, kind: DoctypeKind) -> &mut Option<StrTendril> {
486 match kind {
487 Public => &mut self.current_doctype.public_id,
488 System => &mut self.current_doctype.system_id,
489 }
490 }
491
clear_doctype_id(&mut self, kind: DoctypeKind)492 fn clear_doctype_id(&mut self, kind: DoctypeKind) {
493 let id = self.doctype_id(kind);
494 match *id {
495 Some(ref mut s) => s.clear(),
496 None => *id = Some(StrTendril::new()),
497 }
498 }
499
peek(&mut self, input: &mut BufferQueue) -> Option<char>500 fn peek(&mut self, input: &mut BufferQueue) -> Option<char> {
501 if self.reconsume {
502 Some(self.current_char)
503 } else {
504 input.peek()
505 }
506 }
507
discard_char(&mut self, input: &mut BufferQueue)508 fn discard_char(&mut self, input: &mut BufferQueue) {
509 let c = self.get_char(input);
510 assert!(c.is_some());
511 }
512
unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril)513 fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
514 input.push_front(buf);
515 }
516 }
517
// Shorthand for common state machine behaviors.
//
// Each arm maps a short command, as written inside `go!`, onto one call on
// the tokenizer `$me`.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    // `discard_tag` takes no argument and `discard_char` needs the input
    // queue; the arms mirror those signatures.
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    // NOTE(review): `emit_temp_buf` / `clear_temp_buf` are not defined in the
    // part of the file visible here -- confirm they exist before using these.
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c);                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.push_char($c);                 );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.push_char($c);                   );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag();                                 );
);
548
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With `trace_tokenizer` set, every shorthand command is logged before it is
// executed; otherwise `sh_trace!` simply forwards to `shorthand!`.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!(" {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
559
// A little DSL for sequencing shorthand actions.
//
// `go!(me: cmd1; cmd2; ...; last)` runs each `;`-separated command through
// `sh_trace!`. The terminal forms below (`to`, `reconsume`,
// `consume_char_ref`, `emit_*`, `eof`) additionally `return` from the
// enclosing function, so they must be the last command of a sequence.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // Commands of one to four token trees followed by `;` are supported.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // Switch to another state and return `true`.
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return true;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return true;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    // Same as `to`, but the current character is processed again in the new state.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Start tokenizing a character reference, optionally with an additional allowed character.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return true;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // We have a special case when dealing with empty and short tags in Xml
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    // Set the next state, then emit the current processing instruction.
    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    // Emit the EOF token and return `false`.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
623
// Fetch the next character from the tokenizer, or make the *calling*
// function return `false` when none is available. It has to be a macro
// precisely because of that early return.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    match $me.get_char($input) {
        Some(next) => next,
        None => return false,
    }
));
629
// Pop the next `SetResult` from the tokenizer, or make the calling function
// return `false` when nothing is available.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    match $me.pop_except_from($input, $set) {
        Some(popped) => popped,
        None => return false,
    }
));
633
// Ask the tokenizer whether the input starts with `$pat`, or make the calling
// function return `false` when that cannot be decided yet.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    match $me.eat($input, $pat) {
        Some(matched) => matched,
        None => return false,
    }
));
637
638 impl<Sink: TokenSink> XmlTokenizer<Sink> {
639 // Run the state machine for a while.
640 // Return true if we should be immediately re-invoked
641 // (this just simplifies control flow vs. break / continue).
642 #[allow(clippy::never_loop)]
step(&mut self, input: &mut BufferQueue) -> bool643 fn step(&mut self, input: &mut BufferQueue) -> bool {
644 if self.char_ref_tokenizer.is_some() {
645 return self.step_char_ref_tokenizer(input);
646 }
647
648 debug!("processing in state {:?}", self.state);
649 match self.state {
650 XmlState::Quiescent => {
651 self.state = XmlState::Data;
652 false
653 },
654 //§ data-state
655 XmlState::Data => loop {
656 match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
657 FromSet('&') => go!(self: consume_char_ref),
658 FromSet('<') => go!(self: to TagState),
659 FromSet(c) => go!(self: emit c),
660 NotFromSet(b) => self.emit_chars(b),
661 }
662 },
663 //§ tag-state
664 XmlState::TagState => loop {
665 match get_char!(self, input) {
666 '!' => go!(self: to MarkupDecl),
667 '/' => go!(self: to EndTagState),
668 '?' => go!(self: to Pi),
669 '\t' | '\n' | ' ' | ':' | '<' | '>' => {
670 go!(self: error; emit '<'; reconsume Data)
671 },
672 cl => go!(self: create_tag StartTag cl; to TagName),
673 }
674 },
675 //§ end-tag-state
676 XmlState::EndTagState => loop {
677 match get_char!(self, input) {
678 '>' => go!(self: emit_short_tag Data),
679 '\t' | '\n' | ' ' | '<' | ':' => {
680 go!(self: error; emit '<'; emit '/'; reconsume Data)
681 },
682 cl => go!(self: create_tag EndTag cl; to EndTagName),
683 }
684 },
685 //§ end-tag-name-state
686 XmlState::EndTagName => loop {
687 match get_char!(self, input) {
688 '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
689 '/' => go!(self: error; to EndTagNameAfter),
690 '>' => go!(self: emit_tag Data),
691 cl => go!(self: push_tag cl),
692 }
693 },
694 //§ end-tag-name-after-state
695 XmlState::EndTagNameAfter => loop {
696 match get_char!(self, input) {
697 '>' => go!(self: emit_tag Data),
698 '\t' | '\n' | ' ' => (),
699 _ => self.emit_error(Borrowed("Unexpected element in tag name")),
700 }
701 },
702 //§ pi-state
703 XmlState::Pi => loop {
704 match get_char!(self, input) {
705 '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
706 cl => go!(self: create_pi cl; to PiTarget),
707 }
708 },
709 //§ pi-target-state
710 XmlState::PiTarget => loop {
711 match get_char!(self, input) {
712 '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
713 '?' => go!(self: to PiAfter),
714 cl => go!(self: push_pi_target cl),
715 }
716 },
717 //§ pi-target-after-state
718 XmlState::PiTargetAfter => loop {
719 match get_char!(self, input) {
720 '\t' | '\n' | ' ' => (),
721 _ => go!(self: reconsume PiData),
722 }
723 },
724 //§ pi-data-state
725 XmlState::PiData => loop {
726 match get_char!(self, input) {
727 '?' => go!(self: to PiAfter),
728 cl => go!(self: push_pi_data cl),
729 }
730 },
731 //§ pi-after-state
732 XmlState::PiAfter => loop {
733 match get_char!(self, input) {
734 '>' => go!(self: emit_pi Data),
735 '?' => go!(self: to PiAfter),
736 cl => go!(self: push_pi_data cl),
737 }
738 },
739 //§ markup-declaration-state
740 XmlState::MarkupDecl => loop {
741 if eat!(self, input, "--") {
742 go!(self: clear_comment; to CommentStart);
743 } else if eat!(self, input, "[CDATA[") {
744 go!(self: to Cdata);
745 } else if eat!(self, input, "DOCTYPE") {
746 go!(self: to Doctype);
747 } else {
748 // FIXME: 'error' gives wrong message
749 go!(self: error; to BogusComment);
750 }
751 },
752 //§ comment-start-state
753 XmlState::CommentStart => loop {
754 match get_char!(self, input) {
755 '-' => go!(self: to CommentStartDash),
756 '>' => go!(self: error; emit_comment; to Data),
757 _ => go!(self: reconsume Comment),
758 }
759 },
760 //§ comment-start-dash-state
761 XmlState::CommentStartDash => loop {
762 match get_char!(self, input) {
763 '-' => go!(self: to CommentEnd),
764 '>' => go!(self: error; emit_comment; to Data),
765 _ => go!(self: push_comment '-'; reconsume Comment),
766 }
767 },
768 //§ comment-state
769 XmlState::Comment => loop {
770 match get_char!(self, input) {
771 '<' => go!(self: push_comment '<'; to CommentLessThan),
772 '-' => go!(self: to CommentEndDash),
773 c => go!(self: push_comment c),
774 }
775 },
776 //§ comment-less-than-sign-state
777 XmlState::CommentLessThan => loop {
778 match get_char!(self, input) {
779 '!' => go!(self: push_comment '!';to CommentLessThanBang),
780 '<' => go!(self: push_comment '<'),
781 _ => go!(self: reconsume Comment),
782 }
783 },
784 //§ comment-less-than-sign-bang-state
785 XmlState::CommentLessThanBang => loop {
786 match get_char!(self, input) {
787 '-' => go!(self: to CommentLessThanBangDash),
788 _ => go!(self: reconsume Comment),
789 }
790 },
791 //§ comment-less-than-sign-bang-dash-state
792 XmlState::CommentLessThanBangDash => loop {
793 match get_char!(self, input) {
794 '-' => go!(self: to CommentLessThanBangDashDash),
795 _ => go!(self: reconsume CommentEndDash),
796 }
797 },
798 //§ comment-less-than-sign-bang-dash-dash-state
799 XmlState::CommentLessThanBangDashDash => loop {
800 match get_char!(self, input) {
801 '>' => go!(self: reconsume CommentEnd),
802 _ => go!(self: error; reconsume CommentEnd),
803 }
804 },
805 //§ comment-end-dash-state
806 XmlState::CommentEndDash => loop {
807 match get_char!(self, input) {
808 '-' => go!(self: to CommentEnd),
809 _ => go!(self: push_comment '-'; reconsume Comment),
810 }
811 },
812 //§ comment-end-state
813 XmlState::CommentEnd => loop {
814 match get_char!(self, input) {
815 '>' => go!(self: emit_comment; to Data),
816 '!' => go!(self: to CommentEndBang),
817 '-' => go!(self: push_comment '-'),
818 _ => go!(self: append_comment "--"; reconsume Comment),
819 }
820 },
821 //§ comment-end-bang-state
822 XmlState::CommentEndBang => loop {
823 match get_char!(self, input) {
824 '-' => go!(self: append_comment "--!"; to CommentEndDash),
825 '>' => go!(self: error; emit_comment; to Data),
826 _ => go!(self: append_comment "--!"; reconsume Comment),
827 }
828 },
829 //§ bogus-comment-state
830 XmlState::BogusComment => loop {
831 match get_char!(self, input) {
832 '>' => go!(self: emit_comment; to Data),
833 c => go!(self: push_comment c),
834 }
835 },
836 //§ cdata-state
837 XmlState::Cdata => loop {
838 match get_char!(self, input) {
839 ']' => go!(self: to CdataBracket),
840 cl => go!(self: emit cl),
841 }
842 },
843 //§ cdata-bracket-state
844 XmlState::CdataBracket => loop {
845 match get_char!(self, input) {
846 ']' => go!(self: to CdataEnd),
847 cl => go!(self: emit ']'; emit cl; to Cdata),
848 }
849 },
850 //§ cdata-end-state
851 XmlState::CdataEnd => loop {
852 match get_char!(self, input) {
853 '>' => go!(self: to Data),
854 ']' => go!(self: emit ']'),
855 cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
856 }
857 },
858 //§ tag-name-state
859 XmlState::TagName => loop {
860 match get_char!(self, input) {
861 '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
862 '>' => go!(self: emit_tag Data),
863 '/' => go!(self: set_empty_tag; to TagEmpty),
864 cl => go!(self: push_tag cl),
865 }
866 },
867 //§ empty-tag-state
868 XmlState::TagEmpty => loop {
869 match get_char!(self, input) {
870 '>' => go!(self: emit_empty_tag Data),
871 _ => go!(self: reconsume TagAttrValueBefore),
872 }
873 },
874 //§ tag-attribute-name-before-state
875 XmlState::TagAttrNameBefore => loop {
876 match get_char!(self, input) {
877 '\t' | '\n' | ' ' => (),
878 '>' => go!(self: emit_tag Data),
879 '/' => go!(self: set_empty_tag; to TagEmpty),
880 ':' => go!(self: error),
881 cl => go!(self: create_attr cl; to TagAttrName),
882 }
883 },
884 //§ tag-attribute-name-state
885 XmlState::TagAttrName => loop {
886 match get_char!(self, input) {
887 '=' => go!(self: to TagAttrValueBefore),
888 '>' => go!(self: emit_tag Data),
889 '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
890 '/' => go!(self: set_empty_tag; to TagEmpty),
891 cl => go!(self: push_name cl),
892 }
893 },
894 //§ tag-attribute-name-after-state
895 XmlState::TagAttrNameAfter => loop {
896 match get_char!(self, input) {
897 '\t' | '\n' | ' ' => (),
898 '=' => go!(self: to TagAttrValueBefore),
899 '>' => go!(self: emit_tag Data),
900 '/' => go!(self: set_empty_tag; to TagEmpty),
901 cl => go!(self: create_attr cl; to TagAttrName),
902 }
903 },
904 //§ tag-attribute-value-before-state
905 XmlState::TagAttrValueBefore => loop {
906 match get_char!(self, input) {
907 '\t' | '\n' | ' ' => (),
908 '"' => go!(self: to TagAttrValue DoubleQuoted),
909 '\'' => go!(self: to TagAttrValue SingleQuoted),
910 '&' => go!(self: reconsume TagAttrValue(Unquoted)),
911 '>' => go!(self: emit_tag Data),
912 cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
913 }
914 },
915 //§ tag-attribute-value-double-quoted-state
916 XmlState::TagAttrValue(DoubleQuoted) => loop {
917 match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
918 FromSet('"') => go!(self: to TagAttrNameBefore),
919 FromSet('&') => go!(self: consume_char_ref '"' ),
920 FromSet(c) => go!(self: push_value c),
921 NotFromSet(ref b) => go!(self: append_value b),
922 }
923 },
924 //§ tag-attribute-value-single-quoted-state
925 XmlState::TagAttrValue(SingleQuoted) => loop {
926 match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
927 FromSet('\'') => go!(self: to TagAttrNameBefore),
928 FromSet('&') => go!(self: consume_char_ref '\''),
929 FromSet(c) => go!(self: push_value c),
930 NotFromSet(ref b) => go!(self: append_value b),
931 }
932 },
933 //§ tag-attribute-value-double-quoted-state
934 XmlState::TagAttrValue(Unquoted) => loop {
935 match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
936 FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
937 FromSet('&') => go!(self: consume_char_ref),
938 FromSet('>') => go!(self: emit_tag Data),
939 FromSet(c) => go!(self: push_value c),
940 NotFromSet(ref b) => go!(self: append_value b),
941 }
942 },
943
944 //§ doctype-state
945 XmlState::Doctype => loop {
946 match get_char!(self, input) {
947 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
948 _ => go!(self: error; reconsume BeforeDoctypeName),
949 }
950 },
951 //§ before-doctype-name-state
952 XmlState::BeforeDoctypeName => loop {
953 match get_char!(self, input) {
954 '\t' | '\n' | '\x0C' | ' ' => (),
955 '>' => go!(self: error; emit_doctype; to Data),
956 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
957 to DoctypeName),
958 }
959 },
960 //§ doctype-name-state
961 XmlState::DoctypeName => loop {
962 match get_char!(self, input) {
963 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
964 '>' => go!(self: emit_doctype; to Data),
965 c => go!(self: push_doctype_name (c.to_ascii_lowercase());
966 to DoctypeName),
967 }
968 },
969 //§ after-doctype-name-state
970 XmlState::AfterDoctypeName => loop {
971 if eat!(self, input, "public") {
972 go!(self: to AfterDoctypeKeyword Public);
973 } else if eat!(self, input, "system") {
974 go!(self: to AfterDoctypeKeyword System);
975 } else {
976 match get_char!(self, input) {
977 '\t' | '\n' | '\x0C' | ' ' => (),
978 '>' => go!(self: emit_doctype; to Data),
979 _ => go!(self: error; to BogusDoctype),
980 }
981 }
982 },
983 //§ after-doctype-public-keyword-state
984 XmlState::AfterDoctypeKeyword(Public) => loop {
985 match get_char!(self, input) {
986 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
987 '"' => {
988 go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
989 },
990 '\'' => {
991 go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
992 },
993 '>' => go!(self: error; emit_doctype; to Data),
994 _ => go!(self: error; to BogusDoctype),
995 }
996 },
997 //§ after-doctype-system-keyword-state
998 XmlState::AfterDoctypeKeyword(System) => loop {
999 match get_char!(self, input) {
1000 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1001 '"' => {
1002 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1003 },
1004 '\'' => {
1005 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1006 },
1007 '>' => go!(self: error; emit_doctype; to Data),
1008 _ => go!(self: error; to BogusDoctype),
1009 }
1010 },
1011 //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1012 XmlState::BeforeDoctypeIdentifier(kind) => loop {
1013 match get_char!(self, input) {
1014 '\t' | '\n' | '\x0C' | ' ' => (),
1015 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1016 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1017 '>' => go!(self: error; emit_doctype; to Data),
1018 _ => go!(self: error; to BogusDoctype),
1019 }
1020 },
1021 //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1022 XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1023 match get_char!(self, input) {
1024 '"' => go!(self: to AfterDoctypeIdentifier kind),
1025 '>' => go!(self: error; emit_doctype; to Data),
1026 c => go!(self: push_doctype_id kind c),
1027 }
1028 },
1029 //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1030 XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1031 match get_char!(self, input) {
1032 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1033 '>' => go!(self: error; emit_doctype; to Data),
1034 c => go!(self: push_doctype_id kind c),
1035 }
1036 },
//§ after_doctype_public_identifier_state
1038 XmlState::AfterDoctypeIdentifier(Public) => loop {
1039 match get_char!(self, input) {
1040 '\t' | '\n' | '\x0C' | ' ' => {
1041 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1042 },
1043 '\'' => {
1044 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1045 },
1046 '"' => {
1047 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1048 },
1049 '>' => go!(self: emit_doctype; to Data),
1050 _ => go!(self: error; to BogusDoctype),
1051 }
1052 },
//§ after_doctype_system_identifier_state
1054 XmlState::AfterDoctypeIdentifier(System) => loop {
1055 match get_char!(self, input) {
1056 '\t' | '\n' | '\x0C' | ' ' => (),
1057 '>' => go!(self: emit_doctype; to Data),
1058 _ => go!(self: error; to BogusDoctype),
1059 }
1060 },
1061 //§ between_doctype_public_and_system_identifier_state
1062 XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1063 match get_char!(self, input) {
1064 '\t' | '\n' | '\x0C' | ' ' => (),
1065 '>' => go!(self: emit_doctype; to Data),
1066 '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1067 '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1068 _ => go!(self: error; to BogusDoctype),
1069 }
1070 },
1071 //§ bogus_doctype_state
1072 XmlState::BogusDoctype => loop {
1073 match get_char!(self, input) {
1074 '>' => go!(self: emit_doctype; to Data),
1075 _ => (),
1076 }
1077 },
1078 }
1079 }
1080
1081 /// Indicate that we have reached the end of the input.
1082 pub fn end(&mut self) {
1083 // Handle EOF in the char ref sub-tokenizer, if there is one.
1084 // Do this first because it might un-consume stuff.
1085 let mut input = BufferQueue::new();
1086 match self.char_ref_tokenizer.take() {
1087 None => (),
1088 Some(mut tok) => {
1089 tok.end_of_file(self, &mut input);
1090 self.process_char_ref(tok.get_result());
1091 },
1092 }
1093
1094 // Process all remaining buffered input.
1095 // If we're waiting for lookahead, we're not gonna get it.
1096 self.at_eof = true;
1097 self.run(&mut input);
1098
1099 while self.eof_step() {
1100 // loop
1101 }
1102
1103 self.sink.end();
1104
1105 if self.opts.profile {
1106 self.dump_profile();
1107 }
1108 }
1109
/// Variant of `dump_profile` compiled only under the `for_c` cfg.
///
/// It never produces output: calling it panics via `unreachable!()`, i.e.
/// the call is treated as a bug in this configuration.
#[cfg(for_c)]
fn dump_profile(&self) {
    unreachable!();
}
1114
1115 #[cfg(not(for_c))]
1116 fn dump_profile(&self) {
1117 let mut results: Vec<(states::XmlState, u64)> =
1118 self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1119 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1120
1121 let total: u64 = results
1122 .iter()
1123 .map(|&(_, t)| t)
1124 .fold(0, ::std::ops::Add::add);
1125 debug!("\nTokenizer profile, in nanoseconds");
1126 debug!("\n{:12} total in token sink", self.time_in_sink);
1127 debug!("\n{:12} total in tokenizer", total);
1128
1129 for (k, v) in results.into_iter() {
1130 let pct = 100.0 * (v as f64) / (total as f64);
1131 debug!("{:12} {:4.1}% {:?}", v, pct, k);
1132 }
1133 }
1134
/// Perform one step of end-of-input handling for the current state.
///
/// `end` calls this in a loop for as long as it returns `true`.
/// Every arm is a `go!` invocation, so the returned value comes from that
/// macro, which is defined outside this function.
/// NOTE(review): presumably `eof` yields `false` and the state-changing
/// commands (`to`, `reconsume`, `emit_tag`, ...) yield `true` — confirm
/// against the `go!` macro definition.
fn eof_step(&mut self) -> bool {
    debug!("processing EOF in state {:?}", self.state);
    match self.state {
        // Nothing is pending: finish with the `eof` command.
        XmlState::Data | XmlState::Quiescent => go!(self: eof),
        // Partially recognised comment markup: hand over to the matching
        // comment state, which has its own EOF arm below.
        XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
            go!(self: reconsume Comment)
        },
        XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
        XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
        // Unterminated comment: report the error, emit the comment, then EOF.
        XmlState::CommentStartDash |
        XmlState::Comment |
        XmlState::CommentEndDash |
        XmlState::CommentEnd |
        XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
        // A lone "<" or "</" at the end of input is emitted as characters.
        XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
        XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
        // Moves to TagAttrNameBefore, whose own EOF arm below emits the tag.
        XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
        XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
            go!(self: error_eof; to Data)
        },
        XmlState::Pi => go!(self: error_eof; to BogusComment),
        XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
        XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
        // Unterminated tag: report the error and emit the tag as it stands.
        XmlState::TagName |
        XmlState::TagAttrNameBefore |
        XmlState::EndTagName |
        XmlState::TagAttrNameAfter |
        XmlState::EndTagNameAfter |
        XmlState::TagAttrValueBefore |
        XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
        XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
        XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
        // Unterminated DOCTYPE: report the error and emit what was collected.
        XmlState::BeforeDoctypeName |
        XmlState::Doctype |
        XmlState::DoctypeName |
        XmlState::AfterDoctypeName |
        XmlState::AfterDoctypeKeyword(_) |
        XmlState::BeforeDoctypeIdentifier(_) |
        XmlState::AfterDoctypeIdentifier(_) |
        XmlState::DoctypeIdentifierSingleQuoted(_) |
        XmlState::DoctypeIdentifierDoubleQuoted(_) |
        XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
            go!(self: error_eof; emit_doctype; to Data)
        },
        // The bogus states emit what they collected without an extra EOF error.
        XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
        XmlState::BogusComment => go!(self: emit_comment; to Data),
    }
}
1183
1184 fn process_char_ref(&mut self, char_ref: CharRef) {
1185 let CharRef {
1186 mut chars,
1187 mut num_chars,
1188 } = char_ref;
1189
1190 if num_chars == 0 {
1191 chars[0] = '&';
1192 num_chars = 1;
1193 }
1194
1195 for i in 0..num_chars {
1196 let c = chars[i as usize];
1197 match self.state {
1198 states::Data | states::Cdata => go!(self: emit c),
1199
1200 states::TagAttrValue(_) => go!(self: push_value c),
1201
1202 _ => panic!(
1203 "state {:?} should not be reachable in process_char_ref",
1204 self.state
1205 ),
1206 }
1207 }
1208 }
1209
1210 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
1211 let mut tok = self.char_ref_tokenizer.take().unwrap();
1212 let outcome = tok.step(self, input);
1213
1214 let progress = match outcome {
1215 char_ref::Done => {
1216 self.process_char_ref(tok.get_result());
1217 return true;
1218 },
1219
1220 char_ref::Stuck => false,
1221 char_ref::Progress => true,
1222 };
1223
1224 self.char_ref_tokenizer = Some(tok);
1225 progress
1226 }
1227
1228 fn finish_attribute(&mut self) {
1229 if self.current_attr_name.is_empty() {
1230 return;
1231 }
1232
1233 // Check for a duplicate attribute.
1234 // FIXME: the spec says we should error as soon as the name is finished.
1235 // FIXME: linear time search, do we care?
1236 let dup = {
1237 let name = &self.current_attr_name[..];
1238 self.current_tag_attrs
1239 .iter()
1240 .any(|a| &*a.name.local == name)
1241 };
1242
1243 if dup {
1244 self.emit_error(Borrowed("Duplicate attribute"));
1245 self.current_attr_name.clear();
1246 self.current_attr_value.clear();
1247 } else {
1248 let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
1249 let attr = Attribute {
1250 name: qname.clone(),
1251 value: replace(&mut self.current_attr_value, StrTendril::new()),
1252 };
1253
1254 if qname.local == local_name!("xmlns") ||
1255 qname.prefix == Some(namespace_prefix!("xmlns"))
1256 {
1257 self.current_tag_attrs.insert(0, attr);
1258 } else {
1259 self.current_tag_attrs.push(attr);
1260 }
1261 }
1262 }
1263
/// Start a new attribute whose name begins with `c`.
///
/// Any attribute still being accumulated is committed first via
/// `finish_attribute`; `c` is then pushed onto the attribute-name buffer.
fn create_attribute(&mut self, c: char) {
    // Flush the previous attribute (if any) before reusing the name buffer.
    self.finish_attribute();

    self.current_attr_name.push_char(c);
}
1269 }
1270
#[cfg(test)]
mod test {

    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// A name with exactly one interior colon splits into prefix and local part.
    #[test]
    fn simple_namespace() {
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for &(input, prefix, local) in cases.iter() {
            let qname = process_qname(input.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)));
            assert_eq!(qname.local, LocalName::from(local));
        }
    }

    /// Malformed qualified names get no prefix and keep the whole input as
    /// their local name.
    #[test]
    fn wrong_namespaces() {
        let cases = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for &input in cases.iter() {
            let qname = process_qname(input.to_tendril());
            assert_eq!(qname.prefix, None);
            assert_eq!(qname.local, LocalName::from(input));
        }
    }
}
1316