1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 mod char_ref;
11 mod interface;
12 mod qname;
13 pub mod states;
14
15 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken};
16 pub use self::interface::{CommentToken, DoctypeToken, PIToken, TagToken};
17 pub use self::interface::{Doctype, Pi};
18 pub use self::interface::{EmptyTag, EndTag, ShortTag, StartTag};
19 pub use self::interface::{ParseError, Tag, TagKind, Token, TokenSink};
20 pub use crate::{LocalName, Namespace, Prefix};
21
22 use log::debug;
23 use mac::{format_if, unwrap_or_return};
24 use markup5ever::{local_name, namespace_prefix, namespace_url, ns, small_char_set};
25 use std::borrow::Cow::{self, Borrowed};
26 use std::collections::BTreeMap;
27 use std::mem::replace;
28 use crate::tendril::StrTendril;
29 use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
30
31 use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use self::char_ref::{CharRef, CharRefTokenizer};
33 use self::qname::QualNameTokenizer;
34 use self::states::XmlState;
35 use self::states::{DoctypeKind, Public, System};
36 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
37
/// Copy of Tokenizer options, with an impl for `Default`.
///
/// The struct is `Copy`, so it can be passed around by value. Each field
/// documents the value produced by the `Default` impl.
#[derive(Copy, Clone)]
pub struct XmlTokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value! Default: `None`
    pub initial_state: Option<states::XmlState>,
}
57
process_qname(tag_name: StrTendril) -> QualName58 fn process_qname(tag_name: StrTendril) -> QualName {
59 // If tag name can't possibly contain full namespace, skip qualified name
60 // parsing altogether. For a tag to have namespace it must look like:
61 // a:b
62 // Since StrTendril are UTF-8, we know that minimal size in bytes must be
63 // three bytes minimum.
64 let split = if (&*tag_name).as_bytes().len() < 3 {
65 None
66 } else {
67 QualNameTokenizer::new((&*tag_name).as_bytes()).run()
68 };
69
70 match split {
71 None => QualName::new(None, ns!(), LocalName::from(&*tag_name)),
72 Some(col) => {
73 let len = (&*tag_name).as_bytes().len() as u32;
74 let prefix = tag_name.subtendril(0, col);
75 let local = tag_name.subtendril(col + 1, len - col - 1);
76 let ns = ns!(); // Actual namespace URL set in XmlTreeBuilder::bind_qname
77 QualName::new(Some(Prefix::from(&*prefix)), ns, LocalName::from(&*local))
78 },
79 }
80 }
81
option_push(opt_str: &mut Option<StrTendril>, c: char)82 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
83 match *opt_str {
84 Some(ref mut s) => s.push_char(c),
85 None => *opt_str = Some(StrTendril::from_char(c)),
86 }
87 }
88
89 impl Default for XmlTokenizerOpts {
default() -> XmlTokenizerOpts90 fn default() -> XmlTokenizerOpts {
91 XmlTokenizerOpts {
92 exact_errors: false,
93 discard_bom: true,
94 profile: false,
95 initial_state: None,
96 }
97 }
98 }
/// The Xml tokenizer.
///
/// Holds the state machine state, the partially built token pieces (tag,
/// attribute, doctype, comment, processing instruction) and the sink that
/// receives finished tokens.
pub struct XmlTokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: XmlTokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::XmlState,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Temporary buffer
    temp_buf: StrTendril,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Current comment.
    current_comment: StrTendril,

    /// Current processing instruction target.
    current_pi_target: StrTendril,

    /// Current processing instruction value.
    current_pi_data: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::XmlState, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,
}
167
168 impl<Sink: TokenSink> XmlTokenizer<Sink> {
169 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink>170 pub fn new(sink: Sink, opts: XmlTokenizerOpts) -> XmlTokenizer<Sink> {
171 if opts.profile && cfg!(for_c) {
172 panic!("Can't profile tokenizer when built as a C library");
173 }
174
175 let state = *opts.initial_state.as_ref().unwrap_or(&states::Data);
176 let discard_bom = opts.discard_bom;
177 XmlTokenizer {
178 opts: opts,
179 sink: sink,
180 state: state,
181 char_ref_tokenizer: None,
182 at_eof: false,
183 current_char: '\0',
184 reconsume: false,
185 ignore_lf: false,
186 temp_buf: StrTendril::new(),
187 discard_bom: discard_bom,
188 current_tag_kind: StartTag,
189 current_tag_name: StrTendril::new(),
190 current_tag_attrs: vec![],
191 current_attr_name: StrTendril::new(),
192 current_attr_value: StrTendril::new(),
193 current_comment: StrTendril::new(),
194 current_pi_data: StrTendril::new(),
195 current_pi_target: StrTendril::new(),
196 current_doctype: Doctype::new(),
197 state_profile: BTreeMap::new(),
198 time_in_sink: 0,
199 }
200 }
201
202 /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue)203 pub fn feed(&mut self, input: &mut BufferQueue) {
204 if input.is_empty() {
205 return;
206 }
207
208 if self.discard_bom {
209 if let Some(c) = input.peek() {
210 if c == '\u{feff}' {
211 input.next();
212 }
213 } else {
214 return;
215 }
216 };
217
218 self.run(input);
219 }
220
process_token(&mut self, token: Token)221 fn process_token(&mut self, token: Token) {
222 if self.opts.profile {
223 let (_, dt) = time!(self.sink.process_token(token));
224 self.time_in_sink += dt;
225 } else {
226 self.sink.process_token(token);
227 }
228 }
229
230 // Get the next input character, which might be the character
231 // 'c' that we already consumed from the buffers.
get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char>232 fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
233 if self.ignore_lf {
234 self.ignore_lf = false;
235 if c == '\n' {
236 c = unwrap_or_return!(input.next(), None);
237 }
238 }
239
240 if c == '\r' {
241 self.ignore_lf = true;
242 c = '\n';
243 }
244
245 // Normalize \x00 into \uFFFD
246 if c == '\x00' {
247 c = '\u{FFFD}'
248 }
249
250 // Exclude forbidden Unicode characters
251 if self.opts.exact_errors &&
252 match c as u32 {
253 0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
254 n if (n & 0xFFFE) == 0xFFFE => true,
255 _ => false,
256 }
257 {
258 let msg = format!("Bad character {}", c);
259 self.emit_error(Cow::Owned(msg));
260 }
261
262 debug!("got character {}", c);
263 self.current_char = c;
264 Some(c)
265 }
266
bad_eof_error(&mut self)267 fn bad_eof_error(&mut self) {
268 let msg = format_if!(
269 self.opts.exact_errors,
270 "Unexpected EOF",
271 "Saw EOF in state {:?}",
272 self.state
273 );
274 self.emit_error(msg);
275 }
276
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>277 fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
278 // Bail to the slow path for various corner cases.
279 // This means that `FromSet` can contain characters not in the set!
280 // It shouldn't matter because the fallback `FromSet` case should
281 // always do the same thing as the `NotFromSet` case.
282 if self.opts.exact_errors || self.reconsume || self.ignore_lf {
283 return self.get_char(input).map(|x| FromSet(x));
284 }
285
286 let d = input.pop_except_from(set);
287 debug!("got characters {:?}", d);
288 match d {
289 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
290
291 // NB: We don't set self.current_char for a run of characters not
292 // in the set. It shouldn't matter for the codepaths that use
293 // this.
294 _ => d,
295 }
296 }
297
298 // Check if the next characters are an ASCII case-insensitive match. See
299 // BufferQueue::eat.
300 //
301 // NB: this doesn't do input stream preprocessing or set the current input
302 // character.
eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool>303 fn eat(&mut self, input: &mut BufferQueue, pat: &str) -> Option<bool> {
304 input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
305 match input.eat(pat, u8::eq_ignore_ascii_case) {
306 None if self.at_eof => Some(false),
307 None => {
308 while let Some(c) = input.next() {
309 self.temp_buf.push_char(c);
310 }
311 None
312 },
313 Some(matched) => Some(matched),
314 }
315 }
316
317 /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue)318 pub fn run(&mut self, input: &mut BufferQueue) {
319 if self.opts.profile {
320 loop {
321 let state = self.state;
322 let old_sink = self.time_in_sink;
323 let (run, mut dt) = time!(self.step(input));
324 dt -= self.time_in_sink - old_sink;
325 let new = match self.state_profile.get_mut(&state) {
326 Some(x) => {
327 *x += dt;
328 false
329 },
330 None => true,
331 };
332 if new {
333 // do this here because of borrow shenanigans
334 self.state_profile.insert(state, dt);
335 }
336 if !run {
337 break;
338 }
339 }
340 } else {
341 while self.step(input) {}
342 }
343 }
344
345 //§ tokenization
346 // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>347 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
348 if self.reconsume {
349 self.reconsume = false;
350 Some(self.current_char)
351 } else {
352 input
353 .next()
354 .and_then(|c| self.get_preprocessed_char(c, input))
355 }
356 }
357
bad_char_error(&mut self)358 fn bad_char_error(&mut self) {
359 let msg = format_if!(
360 self.opts.exact_errors,
361 "Bad character",
362 "Saw {} in state {:?}",
363 self.current_char,
364 self.state
365 );
366 self.emit_error(msg);
367 }
368
discard_tag(&mut self)369 fn discard_tag(&mut self) {
370 self.current_tag_name = StrTendril::new();
371 self.current_tag_attrs = Vec::new();
372 }
373
create_tag(&mut self, kind: TagKind, c: char)374 fn create_tag(&mut self, kind: TagKind, c: char) {
375 self.discard_tag();
376 self.current_tag_name.push_char(c);
377 self.current_tag_kind = kind;
378 }
379
380 // This method creates a PI token and
381 // sets its target to given char
create_pi(&mut self, c: char)382 fn create_pi(&mut self, c: char) {
383 self.current_pi_target = StrTendril::new();
384 self.current_pi_data = StrTendril::new();
385 self.current_pi_target.push_char(c);
386 }
387
emit_char(&mut self, c: char)388 fn emit_char(&mut self, c: char) {
389 self.process_token(CharacterTokens(StrTendril::from_char(match c {
390 '\0' => '\u{FFFD}',
391 c => c,
392 })));
393 }
394
emit_short_tag(&mut self)395 fn emit_short_tag(&mut self) {
396 self.current_tag_kind = ShortTag;
397 self.current_tag_name = StrTendril::new();
398 self.emit_current_tag();
399 }
400
emit_empty_tag(&mut self)401 fn emit_empty_tag(&mut self) {
402 self.current_tag_kind = EmptyTag;
403 self.emit_current_tag();
404 }
405
/// Marks the current tag as an `EmptyTag` without emitting it.
fn set_empty_tag(&mut self) {
    self.current_tag_kind = EmptyTag;
}
409
/// Marks the current tag as a `StartTag` and emits it.
fn emit_start_tag(&mut self) {
    self.current_tag_kind = StartTag;
    self.emit_current_tag();
}
414
emit_current_tag(&mut self)415 fn emit_current_tag(&mut self) {
416 self.finish_attribute();
417
418 let qname = process_qname(replace(&mut self.current_tag_name, StrTendril::new()));
419
420 match self.current_tag_kind {
421 StartTag | EmptyTag => {},
422 EndTag => {
423 if !self.current_tag_attrs.is_empty() {
424 self.emit_error(Borrowed("Attributes on an end tag"));
425 }
426 },
427 ShortTag => {
428 if !self.current_tag_attrs.is_empty() {
429 self.emit_error(Borrowed("Attributes on a short tag"));
430 }
431 },
432 }
433
434 let token = TagToken(Tag {
435 kind: self.current_tag_kind,
436 name: qname,
437 attrs: replace(&mut self.current_tag_attrs, vec![]),
438 });
439 self.process_token(token);
440
441 match self.sink.query_state_change() {
442 None => (),
443 Some(s) => self.state = s,
444 }
445 }
446
/// Emits `b` as a single `CharacterTokens` token.
///
/// The string must not contain '\0'!
fn emit_chars(&mut self, b: StrTendril) {
    self.process_token(CharacterTokens(b));
}
451
452 // Emits the current Processing Instruction
emit_pi(&mut self)453 fn emit_pi(&mut self) {
454 let token = PIToken(Pi {
455 target: replace(&mut self.current_pi_target, StrTendril::new()),
456 data: replace(&mut self.current_pi_data, StrTendril::new()),
457 });
458 self.process_token(token);
459 }
460
consume_char_ref(&mut self, addnl_allowed: Option<char>)461 fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
462 // NB: The char ref tokenizer assumes we have an additional allowed
463 // character iff we're tokenizing in an attribute value.
464 self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
465 }
466
/// Sends an `EOFToken` to the sink.
fn emit_eof(&mut self) {
    self.process_token(EOFToken);
}
470
/// Sends `error` to the sink wrapped in a `ParseError` token.
fn emit_error(&mut self, error: Cow<'static, str>) {
    self.process_token(ParseError(error));
}
474
emit_current_comment(&mut self)475 fn emit_current_comment(&mut self) {
476 let comment = replace(&mut self.current_comment, StrTendril::new());
477 self.process_token(CommentToken(comment));
478 }
479
emit_current_doctype(&mut self)480 fn emit_current_doctype(&mut self) {
481 let doctype = replace(&mut self.current_doctype, Doctype::new());
482 self.process_token(DoctypeToken(doctype));
483 }
484
doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option<StrTendril>485 fn doctype_id<'a>(&'a mut self, kind: DoctypeKind) -> &'a mut Option<StrTendril> {
486 match kind {
487 Public => &mut self.current_doctype.public_id,
488 System => &mut self.current_doctype.system_id,
489 }
490 }
491
clear_doctype_id(&mut self, kind: DoctypeKind)492 fn clear_doctype_id(&mut self, kind: DoctypeKind) {
493 let id = self.doctype_id(kind);
494 match *id {
495 Some(ref mut s) => s.clear(),
496 None => *id = Some(StrTendril::new()),
497 }
498 }
499
peek(&mut self, input: &mut BufferQueue) -> Option<char>500 fn peek(&mut self, input: &mut BufferQueue) -> Option<char> {
501 if self.reconsume {
502 Some(self.current_char)
503 } else {
504 input.peek()
505 }
506 }
507
/// Consumes one character via `get_char` and throws it away.
///
/// # Panics
///
/// Panics if no character is available.
fn discard_char(&mut self, input: &mut BufferQueue) {
    let c = self.get_char(input);
    assert!(c.is_some());
}
512
/// Puts `buf` back at the front of `input`.
fn unconsume(&mut self, input: &mut BufferQueue, buf: StrTendril) {
    input.push_front(buf);
}
516 }
517
// Shorthand for common state machine behaviors.
//
// Each arm maps a short command (as written inside `go!`) to one statement
// on the tokenizer `$me`. Arms are only type-checked when they are used.
//
// NOTE(review): `emit_temp` and `clear_temp` expand to `emit_temp_buf` /
// `clear_temp_buf`, which are not defined in the part of the file visible
// here — confirm they exist before using those commands.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr                     ) => ( $me.emit_char($c);                                   );
    ( $me:ident : create_tag $kind:ident $c:expr   ) => ( $me.create_tag($kind, $c);                           );
    ( $me:ident : push_tag $c:expr                 ) => ( $me.current_tag_name.push_char($c);                  );
    // `discard_tag` takes no arguments, so the command takes none either.
    ( $me:ident : discard_tag                      ) => ( $me.discard_tag();                                   );
    // `discard_char` needs the input queue it should consume from.
    ( $me:ident : discard_char $input:expr         ) => ( $me.discard_char($input);                            );
    ( $me:ident : push_temp $c:expr                ) => ( $me.temp_buf.push_char($c);                          );
    ( $me:ident : emit_temp                        ) => ( $me.emit_temp_buf();                                 );
    ( $me:ident : clear_temp                       ) => ( $me.clear_temp_buf();                                );
    ( $me:ident : create_attr $c:expr              ) => ( $me.create_attribute($c);                            );
    ( $me:ident : push_name $c:expr                ) => ( $me.current_attr_name.push_char($c);                 );
    ( $me:ident : push_value $c:expr               ) => ( $me.current_attr_value.push_char($c);                );
    ( $me:ident : append_value $c:expr             ) => ( $me.current_attr_value.push_tendril($c);             );
    ( $me:ident : push_comment $c:expr             ) => ( $me.current_comment.push_char($c);                   );
    ( $me:ident : append_comment $c:expr           ) => ( $me.current_comment.push_slice($c);                  );
    ( $me:ident : emit_comment                     ) => ( $me.emit_current_comment();                          );
    ( $me:ident : clear_comment                    ) => ( $me.current_comment.clear();                         );
    ( $me:ident : create_doctype                   ) => ( $me.current_doctype = Doctype::new();                );
    ( $me:ident : push_doctype_name $c:expr        ) => ( option_push(&mut $me.current_doctype.name, $c);      );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c);                 );
    ( $me:ident : clear_doctype_id $k:ident        ) => ( $me.clear_doctype_id($k);                            );
    ( $me:ident : emit_doctype                     ) => ( $me.emit_current_doctype();                          );
    ( $me:ident : error                            ) => ( $me.bad_char_error();                                );
    ( $me:ident : error_eof                        ) => ( $me.bad_eof_error();                                 );
    ( $me:ident : create_pi $c:expr                ) => ( $me.create_pi($c);                                   );
    ( $me:ident : push_pi_target $c:expr           ) => ( $me.current_pi_target.push_char($c);                 );
    ( $me:ident : push_pi_data $c:expr             ) => ( $me.current_pi_data.push_char($c);                   );
    ( $me:ident : set_empty_tag                    ) => ( $me.set_empty_tag();                                 );
);
548
// Tracing of tokenizer actions.  This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// With `cfg(trace_tokenizer)` every shorthand command is logged via `debug!`
// before it is executed; without it the command is forwarded unchanged.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    // `{}` prints the stringified command; `{:s}` is not a valid format spec.
    debug!(" {}", stringify!($($cmds)*));
    // Forward the tokenizer identifier itself, not the literal `$me:expr`.
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
559
// A little DSL for sequencing shorthand actions.
//
// A `go!` invocation is a `;`-separated list of commands. Every command but
// the last is run through `sh_trace!`. The last command may be one of the
// terminating forms below, which set the next state and `return` from the
// enclosing function.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.
    // These arms peel off one leading command of one to four tokens and
    // recurse on the rest.

    ( $me:ident : $a:tt                   ; $($rest:tt)* ) => ({ sh_trace!($me: $a);          go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt             ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b);       go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt       ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c);    go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // `to`: switch to the given state (optionally with one or two nested
    // arguments) and ask to be re-invoked.
    ( $me:ident : to $s:ident                    ) => ({ $me.state = states::$s; return true;           });
    ( $me:ident : to $s:ident $k1:expr           ) => ({ $me.state = states::$s($k1); return true;      });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return true; });

    // `reconsume`: like `to`, but the current character is handed out again
    // by the next `get_char`.
    ( $me:ident : reconsume $s:ident                    ) => ({ $me.reconsume = true; go!($me: to $s);         });
    ( $me:ident : reconsume $s:ident $k1:expr           ) => ({ $me.reconsume = true; go!($me: to $s $k1);     });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // `consume_char_ref`: start a character reference tokenizer, optionally
    // with an additional allowed character.
    ( $me:ident : consume_char_ref             ) => ({ $me.consume_char_ref(None); return true;         });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return true; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_current_tag();
        return true;
    });

    // XML has dedicated emit commands for short, empty and start tags.
    // Each one sets the default next state before emitting.
    ( $me:ident : emit_short_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_short_tag();
        return true;
    });

    ( $me:ident : emit_empty_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_empty_tag();
        return true;
    });

    ( $me:ident : emit_start_tag $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_start_tag();
        return true;
    });

    // `emit_pi`: set the next state, then emit the current processing
    // instruction.
    ( $me:ident : emit_pi $s:ident ) => ({
        $me.state = states::$s;
        $me.emit_pi();
        return true;
    });

    // `eof`: emit the EOF token and stop the state machine.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return false; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
623
// Yields the next preprocessed character from `$me.get_char($input)`.
// This is a macro because it can cause early return from the function
// where it is used: when no character is available, that function
// returns `false`.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    match $me.get_char($input) {
        Some(c) => c,
        None => return false,
    }
));
629
// Yields the result of `pop_except_from` on the tokenizer, or makes the
// enclosing function return `false` when nothing is available.
macro_rules! pop_except_from (
    ($tokenizer:expr, $queue:expr, $chars:expr) => (
        unwrap_or_return!($tokenizer.pop_except_from($queue, $chars), false)
    )
);
633
// Yields the `bool` result of `eat` on the tokenizer, or makes the enclosing
// function return `false` when `eat` returns `None`.
macro_rules! eat (
    ($tokenizer:expr, $queue:expr, $pattern:expr) => (
        unwrap_or_return!($tokenizer.eat($queue, $pattern), false)
    )
);
637
638 impl<Sink: TokenSink> XmlTokenizer<Sink> {
639 // Run the state machine for a while.
640 // Return true if we should be immediately re-invoked
641 // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> bool642 fn step(&mut self, input: &mut BufferQueue) -> bool {
643 if self.char_ref_tokenizer.is_some() {
644 return self.step_char_ref_tokenizer(input);
645 }
646
647 debug!("processing in state {:?}", self.state);
648 match self.state {
649 XmlState::Quiescent => {
650 self.state = XmlState::Data;
651 return false;
652 },
653 //§ data-state
654 XmlState::Data => loop {
655 match pop_except_from!(self, input, small_char_set!('\r' '&' '<')) {
656 FromSet('&') => go!(self: consume_char_ref),
657 FromSet('<') => go!(self: to TagState),
658 FromSet(c) => go!(self: emit c),
659 NotFromSet(b) => self.emit_chars(b),
660 }
661 },
662 //§ tag-state
663 XmlState::TagState => loop {
664 match get_char!(self, input) {
665 '!' => go!(self: to MarkupDecl),
666 '/' => go!(self: to EndTagState),
667 '?' => go!(self: to Pi),
668 '\t' | '\n' | ' ' | ':' | '<' | '>' => {
669 go!(self: error; emit '<'; reconsume Data)
670 },
671 cl => go!(self: create_tag StartTag cl; to TagName),
672 }
673 },
674 //§ end-tag-state
675 XmlState::EndTagState => loop {
676 match get_char!(self, input) {
677 '>' => go!(self: emit_short_tag Data),
678 '\t' | '\n' | ' ' | '<' | ':' => {
679 go!(self: error; emit '<'; emit '/'; reconsume Data)
680 },
681 cl => go!(self: create_tag EndTag cl; to EndTagName),
682 }
683 },
684 //§ end-tag-name-state
685 XmlState::EndTagName => loop {
686 match get_char!(self, input) {
687 '\t' | '\n' | ' ' => go!(self: to EndTagNameAfter),
688 '/' => go!(self: error; to EndTagNameAfter),
689 '>' => go!(self: emit_tag Data),
690 cl => go!(self: push_tag cl),
691 }
692 },
693 //§ end-tag-name-after-state
694 XmlState::EndTagNameAfter => loop {
695 match get_char!(self, input) {
696 '>' => go!(self: emit_tag Data),
697 '\t' | '\n' | ' ' => (),
698 _ => self.emit_error(Borrowed("Unexpected element in tag name")),
699 }
700 },
701 //§ pi-state
702 XmlState::Pi => loop {
703 match get_char!(self, input) {
704 '\t' | '\n' | ' ' => go!(self: error; reconsume BogusComment),
705 cl => go!(self: create_pi cl; to PiTarget),
706 }
707 },
708 //§ pi-target-state
709 XmlState::PiTarget => loop {
710 match get_char!(self, input) {
711 '\t' | '\n' | ' ' => go!(self: to PiTargetAfter),
712 '?' => go!(self: to PiAfter),
713 cl => go!(self: push_pi_target cl),
714 }
715 },
716 //§ pi-target-after-state
717 XmlState::PiTargetAfter => loop {
718 match get_char!(self, input) {
719 '\t' | '\n' | ' ' => (),
720 _ => go!(self: reconsume PiData),
721 }
722 },
723 //§ pi-data-state
724 XmlState::PiData => loop {
725 match get_char!(self, input) {
726 '?' => go!(self: to PiAfter),
727 cl => go!(self: push_pi_data cl),
728 }
729 },
730 //§ pi-after-state
731 XmlState::PiAfter => loop {
732 match get_char!(self, input) {
733 '>' => go!(self: emit_pi Data),
734 '?' => go!(self: to PiAfter),
735 cl => go!(self: push_pi_data cl),
736 }
737 },
738 //§ markup-declaration-state
739 XmlState::MarkupDecl => loop {
740 if eat!(self, input, "--") {
741 go!(self: clear_comment; to CommentStart);
742 } else if eat!(self, input, "[CDATA[") {
743 go!(self: to Cdata);
744 } else if eat!(self, input, "DOCTYPE") {
745 go!(self: to Doctype);
746 } else {
747 // FIXME: 'error' gives wrong message
748 go!(self: error; to BogusComment);
749 }
750 },
751 //§ comment-start-state
752 XmlState::CommentStart => loop {
753 match get_char!(self, input) {
754 '-' => go!(self: to CommentStartDash),
755 '>' => go!(self: error; emit_comment; to Data),
756 _ => go!(self: reconsume Comment),
757 }
758 },
759 //§ comment-start-dash-state
760 XmlState::CommentStartDash => loop {
761 match get_char!(self, input) {
762 '-' => go!(self: to CommentEnd),
763 '>' => go!(self: error; emit_comment; to Data),
764 _ => go!(self: push_comment '-'; reconsume Comment),
765 }
766 },
767 //§ comment-state
768 XmlState::Comment => loop {
769 match get_char!(self, input) {
770 '<' => go!(self: push_comment '<'; to CommentLessThan),
771 '-' => go!(self: to CommentEndDash),
772 c => go!(self: push_comment c),
773 }
774 },
775 //§ comment-less-than-sign-state
776 XmlState::CommentLessThan => loop {
777 match get_char!(self, input) {
778 '!' => go!(self: push_comment '!';to CommentLessThanBang),
779 '<' => go!(self: push_comment '<'),
780 _ => go!(self: reconsume Comment),
781 }
782 },
783 //§ comment-less-than-sign-bang-state
784 XmlState::CommentLessThanBang => loop {
785 match get_char!(self, input) {
786 '-' => go!(self: to CommentLessThanBangDash),
787 _ => go!(self: reconsume Comment),
788 }
789 },
790 //§ comment-less-than-sign-bang-dash-state
791 XmlState::CommentLessThanBangDash => loop {
792 match get_char!(self, input) {
793 '-' => go!(self: to CommentLessThanBangDashDash),
794 _ => go!(self: reconsume CommentEndDash),
795 }
796 },
797 //§ comment-less-than-sign-bang-dash-dash-state
798 XmlState::CommentLessThanBangDashDash => loop {
799 match get_char!(self, input) {
800 '>' => go!(self: reconsume CommentEnd),
801 _ => go!(self: error; reconsume CommentEnd),
802 }
803 },
804 //§ comment-end-dash-state
805 XmlState::CommentEndDash => loop {
806 match get_char!(self, input) {
807 '-' => go!(self: to CommentEnd),
808 _ => go!(self: push_comment '-'; reconsume Comment),
809 }
810 },
811 //§ comment-end-state
812 XmlState::CommentEnd => loop {
813 match get_char!(self, input) {
814 '>' => go!(self: emit_comment; to Data),
815 '!' => go!(self: to CommentEndBang),
816 '-' => go!(self: push_comment '-'),
817 _ => go!(self: append_comment "--"; reconsume Comment),
818 }
819 },
820 //§ comment-end-bang-state
821 XmlState::CommentEndBang => loop {
822 match get_char!(self, input) {
823 '-' => go!(self: append_comment "--!"; to CommentEndDash),
824 '>' => go!(self: error; emit_comment; to Data),
825 _ => go!(self: append_comment "--!"; reconsume Comment),
826 }
827 },
828 //§ bogus-comment-state
829 XmlState::BogusComment => loop {
830 match get_char!(self, input) {
831 '>' => go!(self: emit_comment; to Data),
832 c => go!(self: push_comment c),
833 }
834 },
835 //§ cdata-state
836 XmlState::Cdata => loop {
837 match get_char!(self, input) {
838 ']' => go!(self: to CdataBracket),
839 cl => go!(self: emit cl),
840 }
841 },
842 //§ cdata-bracket-state
843 XmlState::CdataBracket => loop {
844 match get_char!(self, input) {
845 ']' => go!(self: to CdataEnd),
846 cl => go!(self: emit ']'; emit cl; to Cdata),
847 }
848 },
849 //§ cdata-end-state
850 XmlState::CdataEnd => loop {
851 match get_char!(self, input) {
852 '>' => go!(self: to Data),
853 ']' => go!(self: emit ']'),
854 cl => go!(self: emit ']'; emit ']'; emit cl; to Cdata),
855 }
856 },
857 //§ tag-name-state
858 XmlState::TagName => loop {
859 match get_char!(self, input) {
860 '\t' | '\n' | ' ' => go!(self: to TagAttrNameBefore),
861 '>' => go!(self: emit_tag Data),
862 '/' => go!(self: set_empty_tag; to TagEmpty),
863 cl => go!(self: push_tag cl),
864 }
865 },
866 //§ empty-tag-state
867 XmlState::TagEmpty => loop {
868 match get_char!(self, input) {
869 '>' => go!(self: emit_empty_tag Data),
870 _ => go!(self: reconsume TagAttrValueBefore),
871 }
872 },
873 //§ tag-attribute-name-before-state
874 XmlState::TagAttrNameBefore => loop {
875 match get_char!(self, input) {
876 '\t' | '\n' | ' ' => (),
877 '>' => go!(self: emit_tag Data),
878 '/' => go!(self: set_empty_tag; to TagEmpty),
879 ':' => go!(self: error),
880 cl => go!(self: create_attr cl; to TagAttrName),
881 }
882 },
883 //§ tag-attribute-name-state
884 XmlState::TagAttrName => loop {
885 match get_char!(self, input) {
886 '=' => go!(self: to TagAttrValueBefore),
887 '>' => go!(self: emit_tag Data),
888 '\t' | '\n' | ' ' => go!(self: to TagAttrNameAfter),
889 '/' => go!(self: set_empty_tag; to TagEmpty),
890 cl => go!(self: push_name cl),
891 }
892 },
893 //§ tag-attribute-name-after-state
894 XmlState::TagAttrNameAfter => loop {
895 match get_char!(self, input) {
896 '\t' | '\n' | ' ' => (),
897 '=' => go!(self: to TagAttrValueBefore),
898 '>' => go!(self: emit_tag Data),
899 '/' => go!(self: set_empty_tag; to TagEmpty),
900 cl => go!(self: create_attr cl; to TagAttrName),
901 }
902 },
903 //§ tag-attribute-value-before-state
904 XmlState::TagAttrValueBefore => loop {
905 match get_char!(self, input) {
906 '\t' | '\n' | ' ' => (),
907 '"' => go!(self: to TagAttrValue DoubleQuoted),
908 '\'' => go!(self: to TagAttrValue SingleQuoted),
909 '&' => go!(self: reconsume TagAttrValue(Unquoted)),
910 '>' => go!(self: emit_tag Data),
911 cl => go!(self: push_value cl; to TagAttrValue(Unquoted)),
912 }
913 },
914 //§ tag-attribute-value-double-quoted-state
915 XmlState::TagAttrValue(DoubleQuoted) => loop {
916 match pop_except_from!(self, input, small_char_set!('\n' '"' '&')) {
917 FromSet('"') => go!(self: to TagAttrNameBefore),
918 FromSet('&') => go!(self: consume_char_ref '"' ),
919 FromSet(c) => go!(self: push_value c),
920 NotFromSet(ref b) => go!(self: append_value b),
921 }
922 },
923 //§ tag-attribute-value-single-quoted-state
924 XmlState::TagAttrValue(SingleQuoted) => loop {
925 match pop_except_from!(self, input, small_char_set!('\n' '\'' '&')) {
926 FromSet('\'') => go!(self: to TagAttrNameBefore),
927 FromSet('&') => go!(self: consume_char_ref '\''),
928 FromSet(c) => go!(self: push_value c),
929 NotFromSet(ref b) => go!(self: append_value b),
930 }
931 },
//§ tag-attribute-value-unquoted-state
933 XmlState::TagAttrValue(Unquoted) => loop {
934 match pop_except_from!(self, input, small_char_set!('\n' '\t' ' ' '&' '>')) {
935 FromSet('\t') | FromSet('\n') | FromSet(' ') => go!(self: to TagAttrNameBefore),
936 FromSet('&') => go!(self: consume_char_ref),
937 FromSet('>') => go!(self: emit_tag Data),
938 FromSet(c) => go!(self: push_value c),
939 NotFromSet(ref b) => go!(self: append_value b),
940 }
941 },
942
943 //§ doctype-state
944 XmlState::Doctype => loop {
945 match get_char!(self, input) {
946 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
947 _ => go!(self: error; reconsume BeforeDoctypeName),
948 }
949 },
950 //§ before-doctype-name-state
951 XmlState::BeforeDoctypeName => loop {
952 match get_char!(self, input) {
953 '\t' | '\n' | '\x0C' | ' ' => (),
954 '>' => go!(self: error; emit_doctype; to Data),
955 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
956 to DoctypeName),
957 }
958 },
959 //§ doctype-name-state
960 XmlState::DoctypeName => loop {
961 match get_char!(self, input) {
962 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterDoctypeName),
963 '>' => go!(self: emit_doctype; to Data),
964 c => go!(self: push_doctype_name (c.to_ascii_lowercase());
965 to DoctypeName),
966 }
967 },
968 //§ after-doctype-name-state
969 XmlState::AfterDoctypeName => loop {
970 if eat!(self, input, "public") {
971 go!(self: to AfterDoctypeKeyword Public);
972 } else if eat!(self, input, "system") {
973 go!(self: to AfterDoctypeKeyword System);
974 } else {
975 match get_char!(self, input) {
976 '\t' | '\n' | '\x0C' | ' ' => (),
977 '>' => go!(self: emit_doctype; to Data),
978 _ => go!(self: error; to BogusDoctype),
979 }
980 }
981 },
982 //§ after-doctype-public-keyword-state
983 XmlState::AfterDoctypeKeyword(Public) => loop {
984 match get_char!(self, input) {
985 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier Public),
986 '"' => {
987 go!(self: error; clear_doctype_id Public; to DoctypeIdentifierDoubleQuoted Public)
988 },
989 '\'' => {
990 go!(self: error; clear_doctype_id Public; to DoctypeIdentifierSingleQuoted Public)
991 },
992 '>' => go!(self: error; emit_doctype; to Data),
993 _ => go!(self: error; to BogusDoctype),
994 }
995 },
996 //§ after-doctype-system-keyword-state
997 XmlState::AfterDoctypeKeyword(System) => loop {
998 match get_char!(self, input) {
999 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier System),
1000 '"' => {
1001 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1002 },
1003 '\'' => {
1004 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1005 },
1006 '>' => go!(self: error; emit_doctype; to Data),
1007 _ => go!(self: error; to BogusDoctype),
1008 }
1009 },
1010 //§ before_doctype_public_identifier_state before_doctype_system_identifier_state
1011 XmlState::BeforeDoctypeIdentifier(kind) => loop {
1012 match get_char!(self, input) {
1013 '\t' | '\n' | '\x0C' | ' ' => (),
1014 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1015 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1016 '>' => go!(self: error; emit_doctype; to Data),
1017 _ => go!(self: error; to BogusDoctype),
1018 }
1019 },
1020 //§ doctype_public_identifier_double_quoted_state doctype_system_identifier_double_quoted_state
1021 XmlState::DoctypeIdentifierDoubleQuoted(kind) => loop {
1022 match get_char!(self, input) {
1023 '"' => go!(self: to AfterDoctypeIdentifier kind),
1024 '>' => go!(self: error; emit_doctype; to Data),
1025 c => go!(self: push_doctype_id kind c),
1026 }
1027 },
1028 //§ doctype_public_identifier_single_quoted_state doctype_system_identifier_single_quoted_state
1029 XmlState::DoctypeIdentifierSingleQuoted(kind) => loop {
1030 match get_char!(self, input) {
1031 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1032 '>' => go!(self: error; emit_doctype; to Data),
1033 c => go!(self: push_doctype_id kind c),
1034 }
1035 },
//§ after_doctype_public_identifier_state
1037 XmlState::AfterDoctypeIdentifier(Public) => loop {
1038 match get_char!(self, input) {
1039 '\t' | '\n' | '\x0C' | ' ' => {
1040 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1041 },
1042 '\'' => {
1043 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted(System))
1044 },
1045 '"' => {
1046 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted(System))
1047 },
1048 '>' => go!(self: emit_doctype; to Data),
1049 _ => go!(self: error; to BogusDoctype),
1050 }
1051 },
//§ after_doctype_system_identifier_state
1053 XmlState::AfterDoctypeIdentifier(System) => loop {
1054 match get_char!(self, input) {
1055 '\t' | '\n' | '\x0C' | ' ' => (),
1056 '>' => go!(self: emit_doctype; to Data),
1057 _ => go!(self: error; to BogusDoctype),
1058 }
1059 },
1060 //§ between_doctype_public_and_system_identifier_state
1061 XmlState::BetweenDoctypePublicAndSystemIdentifiers => loop {
1062 match get_char!(self, input) {
1063 '\t' | '\n' | '\x0C' | ' ' => (),
1064 '>' => go!(self: emit_doctype; to Data),
1065 '\'' => go!(self: to DoctypeIdentifierSingleQuoted System),
1066 '"' => go!(self: to DoctypeIdentifierDoubleQuoted System),
1067 _ => go!(self: error; to BogusDoctype),
1068 }
1069 },
1070 //§ bogus_doctype_state
1071 XmlState::BogusDoctype => loop {
1072 match get_char!(self, input) {
1073 '>' => go!(self: emit_doctype; to Data),
1074 _ => (),
1075 }
1076 },
1077 }
1078 }
1079
1080 /// Indicate that we have reached the end of the input.
1081 pub fn end(&mut self) {
1082 // Handle EOF in the char ref sub-tokenizer, if there is one.
1083 // Do this first because it might un-consume stuff.
1084 let mut input = BufferQueue::new();
1085 match self.char_ref_tokenizer.take() {
1086 None => (),
1087 Some(mut tok) => {
1088 tok.end_of_file(self, &mut input);
1089 self.process_char_ref(tok.get_result());
1090 },
1091 }
1092
1093 // Process all remaining buffered input.
1094 // If we're waiting for lookahead, we're not gonna get it.
1095 self.at_eof = true;
1096 self.run(&mut input);
1097
1098 while self.eof_step() {
1099 // loop
1100 }
1101
1102 self.sink.end();
1103
1104 if self.opts.profile {
1105 self.dump_profile();
1106 }
1107 }
1108
/// Profile dump for `for_c` builds.
///
/// This variant has no implementation and panics via `unreachable!()` if it
/// is ever called (`end` only calls it when `opts.profile` is set).
#[cfg(for_c)]
fn dump_profile(&self) {
    unreachable!()
}
1113
1114 #[cfg(not(for_c))]
1115 fn dump_profile(&self) {
1116 let mut results: Vec<(states::XmlState, u64)> =
1117 self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1118 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1119
1120 let total: u64 = results
1121 .iter()
1122 .map(|&(_, t)| t)
1123 .fold(0, ::std::ops::Add::add);
1124 debug!("\nTokenizer profile, in nanoseconds");
1125 debug!("\n{:12} total in token sink", self.time_in_sink);
1126 debug!("\n{:12} total in tokenizer", total);
1127
1128 for (k, v) in results.into_iter() {
1129 let pct = 100.0 * (v as f64) / (total as f64);
1130 debug!("{:12} {:4.1}% {:?}", v, pct, k);
1131 }
1132 }
1133
1134 fn eof_step(&mut self) -> bool {
1135 debug!("processing EOF in state {:?}", self.state);
1136 match self.state {
1137 XmlState::Data | XmlState::Quiescent => go!(self: eof),
1138 XmlState::CommentStart | XmlState::CommentLessThan | XmlState::CommentLessThanBang => {
1139 go!(self: reconsume Comment)
1140 },
1141 XmlState::CommentLessThanBangDash => go!(self: reconsume CommentEndDash),
1142 XmlState::CommentLessThanBangDashDash => go!(self: reconsume CommentEnd),
1143 XmlState::CommentStartDash |
1144 XmlState::Comment |
1145 XmlState::CommentEndDash |
1146 XmlState::CommentEnd |
1147 XmlState::CommentEndBang => go!(self: error_eof; emit_comment; eof),
1148 XmlState::TagState => go!(self: error_eof; emit '<'; to Data),
1149 XmlState::EndTagState => go!(self: error_eof; emit '<'; emit '/'; to Data),
1150 XmlState::TagEmpty => go!(self: error_eof; to TagAttrNameBefore),
1151 XmlState::Cdata | XmlState::CdataBracket | XmlState::CdataEnd => {
1152 go!(self: error_eof; to Data)
1153 },
1154 XmlState::Pi => go!(self: error_eof; to BogusComment),
1155 XmlState::PiTargetAfter | XmlState::PiAfter => go!(self: reconsume PiData),
1156 XmlState::MarkupDecl => go!(self: error_eof; to BogusComment),
1157 XmlState::TagName |
1158 XmlState::TagAttrNameBefore |
1159 XmlState::EndTagName |
1160 XmlState::TagAttrNameAfter |
1161 XmlState::EndTagNameAfter |
1162 XmlState::TagAttrValueBefore |
1163 XmlState::TagAttrValue(_) => go!(self: error_eof; emit_tag Data),
1164 XmlState::PiData | XmlState::PiTarget => go!(self: error_eof; emit_pi Data),
1165 XmlState::TagAttrName => go!(self: error_eof; emit_start_tag Data),
1166 XmlState::BeforeDoctypeName |
1167 XmlState::Doctype |
1168 XmlState::DoctypeName |
1169 XmlState::AfterDoctypeName |
1170 XmlState::AfterDoctypeKeyword(_) |
1171 XmlState::BeforeDoctypeIdentifier(_) |
1172 XmlState::AfterDoctypeIdentifier(_) |
1173 XmlState::DoctypeIdentifierSingleQuoted(_) |
1174 XmlState::DoctypeIdentifierDoubleQuoted(_) |
1175 XmlState::BetweenDoctypePublicAndSystemIdentifiers => {
1176 go!(self: error_eof; emit_doctype; to Data)
1177 },
1178 XmlState::BogusDoctype => go!(self: emit_doctype; to Data),
1179 XmlState::BogusComment => go!(self: emit_comment; to Data),
1180 }
1181 }
1182
1183 fn process_char_ref(&mut self, char_ref: CharRef) {
1184 let CharRef {
1185 mut chars,
1186 mut num_chars,
1187 } = char_ref;
1188
1189 if num_chars == 0 {
1190 chars[0] = '&';
1191 num_chars = 1;
1192 }
1193
1194 for i in 0..num_chars {
1195 let c = chars[i as usize];
1196 match self.state {
1197 states::Data | states::Cdata => go!(self: emit c),
1198
1199 states::TagAttrValue(_) => go!(self: push_value c),
1200
1201 _ => panic!(
1202 "state {:?} should not be reachable in process_char_ref",
1203 self.state
1204 ),
1205 }
1206 }
1207 }
1208
1209 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> bool {
1210 let mut tok = self.char_ref_tokenizer.take().unwrap();
1211 let outcome = tok.step(self, input);
1212
1213 let progress = match outcome {
1214 char_ref::Done => {
1215 self.process_char_ref(tok.get_result());
1216 return true;
1217 },
1218
1219 char_ref::Stuck => false,
1220 char_ref::Progress => true,
1221 };
1222
1223 self.char_ref_tokenizer = Some(tok);
1224 progress
1225 }
1226
1227 fn finish_attribute(&mut self) {
1228 if self.current_attr_name.len() == 0 {
1229 return;
1230 }
1231
1232 // Check for a duplicate attribute.
1233 // FIXME: the spec says we should error as soon as the name is finished.
1234 // FIXME: linear time search, do we care?
1235 let dup = {
1236 let name = &self.current_attr_name[..];
1237 self.current_tag_attrs
1238 .iter()
1239 .any(|a| &*a.name.local == name)
1240 };
1241
1242 if dup {
1243 self.emit_error(Borrowed("Duplicate attribute"));
1244 self.current_attr_name.clear();
1245 self.current_attr_value.clear();
1246 } else {
1247 let qname = process_qname(replace(&mut self.current_attr_name, StrTendril::new()));
1248 let attr = Attribute {
1249 name: qname.clone(),
1250 value: replace(&mut self.current_attr_value, StrTendril::new()),
1251 };
1252
1253 if qname.local == local_name!("xmlns") ||
1254 qname.prefix == Some(namespace_prefix!("xmlns"))
1255 {
1256 self.current_tag_attrs.insert(0, attr);
1257 } else {
1258 self.current_tag_attrs.push(attr);
1259 }
1260 }
1261 }
1262
1263 fn create_attribute(&mut self, c: char) {
1264 self.finish_attribute();
1265
1266 self.current_attr_name.push_char(c);
1267 }
1268 }
1269
#[cfg(test)]
mod test {
    use super::process_qname;
    use crate::tendril::SliceExt;
    use crate::{LocalName, Prefix};

    /// A name with exactly one interior colon splits into prefix and local part.
    #[test]
    fn simple_namespace() {
        let cases = [("prefix:local", "prefix", "local"), ("a:b", "a", "b")];

        for &(input, prefix, local) in cases.iter() {
            let qname = process_qname(input.to_tendril());
            assert_eq!(qname.prefix, Some(Prefix::from(prefix)), "input: {:?}", input);
            assert_eq!(qname.local, LocalName::from(local), "input: {:?}", input);
        }
    }

    /// Names with misplaced or repeated colons get no prefix and keep the
    /// whole input as their local name.
    #[test]
    fn wrong_namespaces() {
        let inputs = [":local", "::local", "a::local", "fake::", ":::", ":a:b:"];

        for &input in inputs.iter() {
            let qname = process_qname(input.to_tendril());
            assert_eq!(qname.prefix, None, "input: {:?}", input);
            assert_eq!(qname.local, LocalName::from(input), "input: {:?}", input);
        }
    }
}
1315