// Copyright 2014 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! The HTML5 tokenizer.
11
12 pub use self::interface::{Doctype, Attribute, TagKind, StartTag, EndTag, Tag};
13 pub use self::interface::{Token, DoctypeToken, TagToken, CommentToken};
14 pub use self::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
15 pub use self::interface::{TokenSink, TokenSinkResult};
16
17 use self::states::{Rcdata, Rawtext, ScriptData, ScriptDataEscaped};
18 use self::states::{Escaped, DoubleEscaped};
19 use self::states::{Unquoted, SingleQuoted, DoubleQuoted};
20 use self::states::{DoctypeIdKind, Public, System};
21
22 use self::char_ref::{CharRef, CharRefTokenizer};
23
24 use self::buffer_queue::{BufferQueue, SetResult, FromSet, NotFromSet};
25
26 use util::str::lower_ascii_letter;
27 use util::smallcharset::SmallCharSet;
28
29 use std::ascii::AsciiExt;
30 use std::mem::replace;
31 use std::default::Default;
32 use std::borrow::Cow::{self, Borrowed};
33 use std::collections::BTreeMap;
34
35 use {LocalName, QualName};
36 use tendril::StrTendril;
37
38 pub mod buffer_queue;
39 pub mod states;
40 mod interface;
41 mod char_ref;
42
/// Outcome of a single `step()` of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now (we need more input, or EOF was emitted).
    Suspend,
    /// Stop so the caller can execute the script represented by `Handle`.
    Script(Handle)
}
48
/// Result of feeding input to the tokenizer: either all available input
/// was consumed, or tokenization stopped at a script that must run
/// before we can continue.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// All available input was processed (or none was available).
    Done,
    /// Tokenization is paused at a pending script.
    Script(Handle)
}
54
option_push(opt_str: &mut Option<StrTendril>, c: char)55 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
56 match *opt_str {
57 Some(ref mut s) => s.push_char(c),
58 None => *opt_str = Some(StrTendril::from_char(c)),
59 }
60 }
61
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value! Seeds the "appropriate end tag" check.
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
88
89 impl Default for TokenizerOpts {
default() -> TokenizerOpts90 fn default() -> TokenizerOpts {
91 TokenizerOpts {
92 exact_errors: false,
93 discard_bom: true,
94 profile: false,
95 initial_state: None,
96 last_start_tag_name: None,
97 }
98 }
99 }
100
/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind.
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Current line number of the input, starting at 1; incremented on
    /// each newline and passed to the sink with every token.
    current_line: u64,
}
173
174 impl<Sink: TokenSink> Tokenizer<Sink> {
175 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink>176 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
177 let start_tag_name = opts.last_start_tag_name.take()
178 .map(|s| LocalName::from(&*s));
179 let state = opts.initial_state.unwrap_or(states::Data);
180 let discard_bom = opts.discard_bom;
181 Tokenizer {
182 opts: opts,
183 sink: sink,
184 state: state,
185 char_ref_tokenizer: None,
186 at_eof: false,
187 current_char: '\0',
188 reconsume: false,
189 ignore_lf: false,
190 discard_bom: discard_bom,
191 current_tag_kind: StartTag,
192 current_tag_name: StrTendril::new(),
193 current_tag_self_closing: false,
194 current_tag_attrs: vec!(),
195 current_attr_name: StrTendril::new(),
196 current_attr_value: StrTendril::new(),
197 current_comment: StrTendril::new(),
198 current_doctype: Doctype::new(),
199 last_start_tag_name: start_tag_name,
200 temp_buf: StrTendril::new(),
201 state_profile: BTreeMap::new(),
202 time_in_sink: 0,
203 current_line: 1,
204 }
205 }
206
unwrap(self) -> Sink207 pub fn unwrap(self) -> Sink {
208 self.sink
209 }
210
sink<'a>(&'a self) -> &'a Sink211 pub fn sink<'a>(&'a self) -> &'a Sink {
212 &self.sink
213 }
214
sink_mut<'a>(&'a mut self) -> &'a mut Sink215 pub fn sink_mut<'a>(&'a mut self) -> &'a mut Sink {
216 &mut self.sink
217 }
218
219 /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>220 pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
221 if input.is_empty() {
222 return TokenizerResult::Done;
223 }
224
225 if self.discard_bom {
226 if let Some(c) = input.peek() {
227 if c == '\u{feff}' {
228 input.next();
229 }
230 } else {
231 return TokenizerResult::Done;
232 }
233 };
234
235 self.run(input)
236 }
237
set_plaintext_state(&mut self)238 pub fn set_plaintext_state(&mut self) {
239 self.state = states::Plaintext;
240 }
241
process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle>242 fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
243 if self.opts.profile {
244 let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
245 self.time_in_sink += dt;
246 ret
247 } else {
248 self.sink.process_token(token, self.current_line)
249 }
250 }
251
process_token_and_continue(&mut self, token: Token)252 fn process_token_and_continue(&mut self, token: Token) {
253 assert!(matches!(self.process_token(token), TokenSinkResult::Continue));
254 }
255
256 //§ preprocessing-the-input-stream
257 // Get the next input character, which might be the character
258 // 'c' that we already consumed from the buffers.
get_preprocessed_char( &mut self, mut c: char, input: &mut BufferQueue) -> Option<char>259 fn get_preprocessed_char(
260 &mut self,
261 mut c: char,
262 input: &mut BufferQueue)
263 -> Option<char> {
264 if self.ignore_lf {
265 self.ignore_lf = false;
266 if c == '\n' {
267 c = unwrap_or_return!(input.next(), None);
268 }
269 }
270
271 if c == '\r' {
272 self.ignore_lf = true;
273 c = '\n';
274 }
275
276 if c == '\n' {
277 self.current_line += 1;
278 }
279
280 if self.opts.exact_errors && match c as u32 {
281 0x01...0x08 | 0x0B | 0x0E...0x1F | 0x7F...0x9F | 0xFDD0...0xFDEF => true,
282 n if (n & 0xFFFE) == 0xFFFE => true,
283 _ => false,
284 } {
285 let msg = format!("Bad character {}", c);
286 self.emit_error(Cow::Owned(msg));
287 }
288
289 debug!("got character {}", c);
290 self.current_char = c;
291 Some(c)
292 }
293
294 //§ tokenization
295 // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>296 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
297 if self.reconsume {
298 self.reconsume = false;
299 Some(self.current_char)
300 } else {
301 input.next().and_then(|c| self.get_preprocessed_char(c, input))
302 }
303 }
304
pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult>305 fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
306 // Bail to the slow path for various corner cases.
307 // This means that `FromSet` can contain characters not in the set!
308 // It shouldn't matter because the fallback `FromSet` case should
309 // always do the same thing as the `NotFromSet` case.
310 if self.opts.exact_errors || self.reconsume || self.ignore_lf {
311 return self.get_char(input).map(|x| FromSet(x));
312 }
313
314 let d = input.pop_except_from(set);
315 debug!("got characters {:?}", d);
316 match d {
317 Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),
318
319 // NB: We don't set self.current_char for a run of characters not
320 // in the set. It shouldn't matter for the codepaths that use
321 // this.
322 _ => d
323 }
324 }
325
326 // Check if the next characters are an ASCII case-insensitive match. See
327 // BufferQueue::eat.
328 //
329 // NB: this doesn't do input stream preprocessing or set the current input
330 // character.
eat( &mut self, input: &mut BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool) -> Option<bool>331 fn eat(
332 &mut self,
333 input: &mut BufferQueue,
334 pat: &str,
335 eq: fn(&u8, &u8) -> bool)
336 -> Option<bool> {
337 input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
338 match input.eat(pat, eq) {
339 None if self.at_eof => Some(false),
340 None => {
341 while let Some(c) = input.next() {
342 self.temp_buf.push_char(c);
343 }
344 None
345 },
346 Some(matched) => Some(matched),
347 }
348 }
349
350 /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>351 fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
352 if self.opts.profile {
353 loop {
354 let state = self.state;
355 let old_sink = self.time_in_sink;
356 let (run, mut dt) = time!(self.step(input));
357 dt -= (self.time_in_sink - old_sink);
358 let new = match self.state_profile.get_mut(&state) {
359 Some(x) => {
360 *x += dt;
361 false
362 }
363 None => true,
364 };
365 if new {
366 // do this here because of borrow shenanigans
367 self.state_profile.insert(state, dt);
368 }
369 match run {
370 ProcessResult::Continue => (),
371 ProcessResult::Suspend => break,
372 ProcessResult::Script(node) => return TokenizerResult::Script(node),
373 }
374 }
375 } else {
376 loop {
377 match self.step(input) {
378 ProcessResult::Continue => (),
379 ProcessResult::Suspend => break,
380 ProcessResult::Script(node) => return TokenizerResult::Script(node),
381 }
382 }
383 }
384 TokenizerResult::Done
385 }
386
bad_char_error(&mut self)387 fn bad_char_error(&mut self) {
388 let msg = format_if!(
389 self.opts.exact_errors,
390 "Bad character",
391 "Saw {} in state {:?}", self.current_char, self.state);
392 self.emit_error(msg);
393 }
394
bad_eof_error(&mut self)395 fn bad_eof_error(&mut self) {
396 let msg = format_if!(
397 self.opts.exact_errors,
398 "Unexpected EOF",
399 "Saw EOF in state {:?}", self.state);
400 self.emit_error(msg);
401 }
402
emit_char(&mut self, c: char)403 fn emit_char(&mut self, c: char) {
404 self.process_token_and_continue(match c {
405 '\0' => NullCharacterToken,
406 _ => CharacterTokens(StrTendril::from_char(c)),
407 });
408 }
409
410 // The string must not contain '\0'!
emit_chars(&mut self, b: StrTendril)411 fn emit_chars(&mut self, b: StrTendril) {
412 self.process_token_and_continue(CharacterTokens(b));
413 }
414
emit_current_tag(&mut self) -> ProcessResult<Sink::Handle>415 fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
416 self.finish_attribute();
417
418 let name = LocalName::from(&*self.current_tag_name);
419 self.current_tag_name.clear();
420
421 match self.current_tag_kind {
422 StartTag => {
423 self.last_start_tag_name = Some(name.clone());
424 }
425 EndTag => {
426 if !self.current_tag_attrs.is_empty() {
427 self.emit_error(Borrowed("Attributes on an end tag"));
428 }
429 if self.current_tag_self_closing {
430 self.emit_error(Borrowed("Self-closing end tag"));
431 }
432 }
433 }
434
435 let token = TagToken(Tag { kind: self.current_tag_kind,
436 name: name,
437 self_closing: self.current_tag_self_closing,
438 attrs: replace(&mut self.current_tag_attrs, vec!()),
439 });
440
441 match self.process_token(token) {
442 TokenSinkResult::Continue => ProcessResult::Continue,
443 TokenSinkResult::Plaintext => {
444 self.state = states::Plaintext;
445 ProcessResult::Continue
446 },
447 TokenSinkResult::Script(node) => {
448 self.state = states::Data;
449 ProcessResult::Script(node)
450 },
451 TokenSinkResult::RawData(kind) => {
452 self.state = states::RawData(kind);
453 ProcessResult::Continue
454 }
455 }
456 }
457
emit_temp_buf(&mut self)458 fn emit_temp_buf(&mut self) {
459 // FIXME: Make sure that clearing on emit is spec-compatible.
460 let buf = replace(&mut self.temp_buf, StrTendril::new());
461 self.emit_chars(buf);
462 }
463
clear_temp_buf(&mut self)464 fn clear_temp_buf(&mut self) {
465 // Do this without a new allocation.
466 self.temp_buf.clear();
467 }
468
emit_current_comment(&mut self)469 fn emit_current_comment(&mut self) {
470 let comment = replace(&mut self.current_comment, StrTendril::new());
471 self.process_token_and_continue(CommentToken(comment));
472 }
473
discard_tag(&mut self)474 fn discard_tag(&mut self) {
475 self.current_tag_name.clear();
476 self.current_tag_self_closing = false;
477 self.current_tag_attrs = vec!();
478 }
479
create_tag(&mut self, kind: TagKind, c: char)480 fn create_tag(&mut self, kind: TagKind, c: char) {
481 self.discard_tag();
482 self.current_tag_name.push_char(c);
483 self.current_tag_kind = kind;
484 }
485
have_appropriate_end_tag(&self) -> bool486 fn have_appropriate_end_tag(&self) -> bool {
487 match self.last_start_tag_name.as_ref() {
488 Some(last) =>
489 (self.current_tag_kind == EndTag)
490 && (*self.current_tag_name == **last),
491 None => false,
492 }
493 }
494
create_attribute(&mut self, c: char)495 fn create_attribute(&mut self, c: char) {
496 self.finish_attribute();
497
498 self.current_attr_name.push_char(c);
499 }
500
finish_attribute(&mut self)501 fn finish_attribute(&mut self) {
502 if self.current_attr_name.len() == 0 {
503 return;
504 }
505
506 // Check for a duplicate attribute.
507 // FIXME: the spec says we should error as soon as the name is finished.
508 // FIXME: linear time search, do we care?
509 let dup = {
510 let name = &*self.current_attr_name;
511 self.current_tag_attrs.iter().any(|a| &*a.name.local == name)
512 };
513
514 if dup {
515 self.emit_error(Borrowed("Duplicate attribute"));
516 self.current_attr_name.clear();
517 self.current_attr_value.clear();
518 } else {
519 let name = LocalName::from(&*self.current_attr_name);
520 self.current_attr_name.clear();
521 self.current_tag_attrs.push(Attribute {
522 // The tree builder will adjust the namespace if necessary.
523 // This only happens in foreign elements.
524 name: QualName::new(ns!(), name),
525 value: replace(&mut self.current_attr_value, StrTendril::new()),
526 });
527 }
528 }
529
emit_current_doctype(&mut self)530 fn emit_current_doctype(&mut self) {
531 let doctype = replace(&mut self.current_doctype, Doctype::new());
532 self.process_token_and_continue(DoctypeToken(doctype));
533 }
534
doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril>535 fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
536 match kind {
537 Public => &mut self.current_doctype.public_id,
538 System => &mut self.current_doctype.system_id,
539 }
540 }
541
clear_doctype_id(&mut self, kind: DoctypeIdKind)542 fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
543 let id = self.doctype_id(kind);
544 match *id {
545 Some(ref mut s) => s.clear(),
546 None => *id = Some(StrTendril::new()),
547 }
548 }
549
consume_char_ref(&mut self, addnl_allowed: Option<char>)550 fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
551 // NB: The char ref tokenizer assumes we have an additional allowed
552 // character iff we're tokenizing in an attribute value.
553 self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
554 }
555
emit_eof(&mut self)556 fn emit_eof(&mut self) {
557 self.process_token_and_continue(EOFToken);
558 }
559
peek(&mut self, input: &BufferQueue) -> Option<char>560 fn peek(&mut self, input: &BufferQueue) -> Option<char> {
561 if self.reconsume {
562 Some(self.current_char)
563 } else {
564 input.peek()
565 }
566 }
567
discard_char(&mut self, input: &mut BufferQueue)568 fn discard_char(&mut self, input: &mut BufferQueue) {
569 let c = self.get_char(input);
570 assert!(c.is_some());
571 }
572
emit_error(&mut self, error: Cow<'static, str>)573 fn emit_error(&mut self, error: Cow<'static, str>) {
574 self.process_token_and_continue(ParseError(error));
575 }
576 }
577 //§ END
578
579 // Shorthand for common state machine behaviors.
macro_rules! shorthand (
    // Character emission.
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
    // Tag construction.
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
    ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
    // The spec's "temporary buffer".
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
    // Attribute construction.
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
    // Comment construction.
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
    // Doctype construction.
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true; );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
    // Parse errors.
    ( $me:ident : error ) => ( $me.bad_char_error(); );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
);
606
607 // Tracing of tokenizer actions. This adds significant bloat and compile time,
608 // so it's behind a cfg flag.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    // Log the shorthand commands before executing them.
    // NOTE: `{:s}` was a pre-1.0 format specifier and no longer compiles;
    // use the plain `{}` Display specifier instead.
    debug!("  {}", stringify!($($cmds)*));
    // Forward to shorthand! with just the matched identifier. Writing
    // `$me:expr` here would expand `$me` and then pass the literal tokens
    // `: expr`, which shorthand!'s `$me:ident :` patterns cannot match.
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
617
618 // A little DSL for sequencing shorthand actions.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    // One to four tokens of shorthand, then the rest of the sequence.
    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // State transition: plain, one-argument, and nested-argument states.
    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    // Transition while re-processing the current input character.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    // Hand control to the character-reference sub-tokenizer, optionally
    // with an additional allowed character.
    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    // Emit EOF and suspend the state machine.
    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
655
// Run `cmds` only when `x` matches one of the given patterns;
// otherwise do nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
662
663 // This is a macro because it can cause early return
664 // from the function where it is used.
// Consume the next input character, suspending the state machine
// (early return) if none is available yet.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
668
// Like get_char!, but does not consume the character.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
672
// Bulk-consume a run of characters not in `$set` (or one character from
// it), suspending if the input is exhausted.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
676
// ASCII case-insensitive lookahead match against `$pat`, suspending if
// more input is needed to decide.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
680
// Case-sensitive variant of eat!.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
684
685 impl<Sink: TokenSink> Tokenizer<Sink> {
686 // Run the state machine for a while.
687 // Return true if we should be immediately re-invoked
688 // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle>689 fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
690 if self.char_ref_tokenizer.is_some() {
691 return self.step_char_ref_tokenizer(input);
692 }
693
694 debug!("processing in state {:?}", self.state);
695 match self.state {
696 //§ data-state
697 states::Data => loop {
698 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
699 FromSet('\0') => go!(self: error; emit '\0'),
700 FromSet('&') => go!(self: consume_char_ref),
701 FromSet('<') => go!(self: to TagOpen),
702 FromSet(c) => go!(self: emit c),
703 NotFromSet(b) => self.emit_chars(b),
704 }
705 },
706
707 //§ rcdata-state
708 states::RawData(Rcdata) => loop {
709 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
710 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
711 FromSet('&') => go!(self: consume_char_ref),
712 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
713 FromSet(c) => go!(self: emit c),
714 NotFromSet(b) => self.emit_chars(b),
715 }
716 },
717
718 //§ rawtext-state
719 states::RawData(Rawtext) => loop {
720 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
721 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
722 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
723 FromSet(c) => go!(self: emit c),
724 NotFromSet(b) => self.emit_chars(b),
725 }
726 },
727
728 //§ script-data-state
729 states::RawData(ScriptData) => loop {
730 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
731 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
732 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
733 FromSet(c) => go!(self: emit c),
734 NotFromSet(b) => self.emit_chars(b),
735 }
736 },
737
738 //§ script-data-escaped-state
739 states::RawData(ScriptDataEscaped(Escaped)) => loop {
740 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
741 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
742 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
743 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
744 FromSet(c) => go!(self: emit c),
745 NotFromSet(b) => self.emit_chars(b),
746 }
747 },
748
749 //§ script-data-double-escaped-state
750 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
751 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
752 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
753 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
754 FromSet('<') => go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped),
755 FromSet(c) => go!(self: emit c),
756 NotFromSet(b) => self.emit_chars(b),
757 }
758 },
759
760 //§ plaintext-state
761 states::Plaintext => loop {
762 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
763 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
764 FromSet(c) => go!(self: emit c),
765 NotFromSet(b) => self.emit_chars(b),
766 }
767 },
768
769 //§ tag-open-state
770 states::TagOpen => loop { match get_char!(self, input) {
771 '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
772 '/' => go!(self: to EndTagOpen),
773 '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
774 c => match lower_ascii_letter(c) {
775 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
776 None => go!(self: error; emit '<'; reconsume Data),
777 }
778 }},
779
780 //§ end-tag-open-state
781 states::EndTagOpen => loop { match get_char!(self, input) {
782 '>' => go!(self: error; to Data),
783 '\0' => go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment),
784 c => match lower_ascii_letter(c) {
785 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
786 None => go!(self: error; clear_comment; push_comment c; to BogusComment),
787 }
788 }},
789
790 //§ tag-name-state
791 states::TagName => loop { match get_char!(self, input) {
792 '\t' | '\n' | '\x0C' | ' '
793 => go!(self: to BeforeAttributeName),
794 '/' => go!(self: to SelfClosingStartTag),
795 '>' => go!(self: emit_tag Data),
796 '\0' => go!(self: error; push_tag '\u{fffd}'),
797 c => go!(self: push_tag (c.to_ascii_lowercase())),
798 }},
799
800 //§ script-data-escaped-less-than-sign-state
801 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop { match get_char!(self, input) {
802 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
803 c => match lower_ascii_letter(c) {
804 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
805 to ScriptDataEscapeStart DoubleEscaped),
806 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
807 }
808 }},
809
810 //§ script-data-double-escaped-less-than-sign-state
811 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop { match get_char!(self, input) {
812 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
813 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
814 }},
815
816 //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
817 // otherwise
818 states::RawLessThanSign(kind) => loop { match get_char!(self, input) {
819 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
820 '!' if kind == ScriptData => go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped),
821 _ => go!(self: emit '<'; reconsume RawData kind),
822 }},
823
824 //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
825 states::RawEndTagOpen(kind) => loop {
826 let c = get_char!(self, input);
827 match lower_ascii_letter(c) {
828 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
829 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
830 }
831 },
832
833 //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
834 states::RawEndTagName(kind) => loop {
835 let c = get_char!(self, input);
836 if self.have_appropriate_end_tag() {
837 match c {
838 '\t' | '\n' | '\x0C' | ' '
839 => go!(self: to BeforeAttributeName),
840 '/' => go!(self: to SelfClosingStartTag),
841 '>' => go!(self: emit_tag Data),
842 _ => (),
843 }
844 }
845
846 match lower_ascii_letter(c) {
847 Some(cl) => go!(self: push_tag cl; push_temp c),
848 None => go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind),
849 }
850 },
851
852 //§ script-data-double-escape-start-state
853 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
854 let c = get_char!(self, input);
855 match c {
856 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
857 let esc = if &*self.temp_buf == "script" { DoubleEscaped } else { Escaped };
858 go!(self: emit c; to RawData ScriptDataEscaped esc);
859 }
860 _ => match lower_ascii_letter(c) {
861 Some(cl) => go!(self: push_temp cl; emit c),
862 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
863 }
864 }
865 },
866
867 //§ script-data-escape-start-state
868 states::ScriptDataEscapeStart(Escaped) => loop { match get_char!(self, input) {
869 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
870 _ => go!(self: reconsume RawData ScriptData),
871 }},
872
873 //§ script-data-escape-start-dash-state
874 states::ScriptDataEscapeStartDash => loop { match get_char!(self, input) {
875 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
876 _ => go!(self: reconsume RawData ScriptData),
877 }},
878
879 //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
880 states::ScriptDataEscapedDash(kind) => loop { match get_char!(self, input) {
881 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
882 '<' => {
883 if kind == DoubleEscaped { go!(self: emit '<'); }
884 go!(self: to RawLessThanSign ScriptDataEscaped kind);
885 }
886 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
887 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
888 }},
889
890 //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
891 states::ScriptDataEscapedDashDash(kind) => loop { match get_char!(self, input) {
892 '-' => go!(self: emit '-'),
893 '<' => {
894 if kind == DoubleEscaped { go!(self: emit '<'); }
895 go!(self: to RawLessThanSign ScriptDataEscaped kind);
896 }
897 '>' => go!(self: emit '>'; to RawData ScriptData),
898 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
899 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
900 }},
901
902 //§ script-data-double-escape-end-state
903 states::ScriptDataDoubleEscapeEnd => loop {
904 let c = get_char!(self, input);
905 match c {
906 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
907 let esc = if &*self.temp_buf == "script" { Escaped } else { DoubleEscaped };
908 go!(self: emit c; to RawData ScriptDataEscaped esc);
909 }
910 _ => match lower_ascii_letter(c) {
911 Some(cl) => go!(self: push_temp cl; emit c),
912 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
913 }
914 }
915 },
916
917 //§ before-attribute-name-state
918 states::BeforeAttributeName => loop { match get_char!(self, input) {
919 '\t' | '\n' | '\x0C' | ' ' => (),
920 '/' => go!(self: to SelfClosingStartTag),
921 '>' => go!(self: emit_tag Data),
922 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
923 c => match lower_ascii_letter(c) {
924 Some(cl) => go!(self: create_attr cl; to AttributeName),
925 None => {
926 go_match!(self: c,
927 '"' , '\'' , '<' , '=' => error);
928 go!(self: create_attr c; to AttributeName);
929 }
930 }
931 }},
932
933 //§ attribute-name-state
934 states::AttributeName => loop { match get_char!(self, input) {
935 '\t' | '\n' | '\x0C' | ' '
936 => go!(self: to AfterAttributeName),
937 '/' => go!(self: to SelfClosingStartTag),
938 '=' => go!(self: to BeforeAttributeValue),
939 '>' => go!(self: emit_tag Data),
940 '\0' => go!(self: error; push_name '\u{fffd}'),
941 c => match lower_ascii_letter(c) {
942 Some(cl) => go!(self: push_name cl),
943 None => {
944 go_match!(self: c,
945 '"' , '\'' , '<' => error);
946 go!(self: push_name c);
947 }
948 }
949 }},
950
951 //§ after-attribute-name-state
952 states::AfterAttributeName => loop { match get_char!(self, input) {
953 '\t' | '\n' | '\x0C' | ' ' => (),
954 '/' => go!(self: to SelfClosingStartTag),
955 '=' => go!(self: to BeforeAttributeValue),
956 '>' => go!(self: emit_tag Data),
957 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
958 c => match lower_ascii_letter(c) {
959 Some(cl) => go!(self: create_attr cl; to AttributeName),
960 None => {
961 go_match!(self: c,
962 '"' , '\'' , '<' => error);
963 go!(self: create_attr c; to AttributeName);
964 }
965 }
966 }},
967
968 //§ before-attribute-value-state
969 // Use peek so we can handle the first attr character along with the rest,
970 // hopefully in the same zero-copy buffer.
971 states::BeforeAttributeValue => loop { match peek!(self, input) {
972 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
973 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
974 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
975 '\0' => go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted),
976 '>' => go!(self: discard_char input; error; emit_tag Data),
977 _ => go!(self: to AttributeValue Unquoted),
978 }},
979
980 //§ attribute-value-(double-quoted)-state
981 states::AttributeValue(DoubleQuoted) => loop {
982 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
983 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
984 FromSet('&') => go!(self: consume_char_ref '"'),
985 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
986 FromSet(c) => go!(self: push_value c),
987 NotFromSet(ref b) => go!(self: append_value b),
988 }
989 },
990
991 //§ attribute-value-(single-quoted)-state
992 states::AttributeValue(SingleQuoted) => loop {
993 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
994 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
995 FromSet('&') => go!(self: consume_char_ref '\''),
996 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
997 FromSet(c) => go!(self: push_value c),
998 NotFromSet(ref b) => go!(self: append_value b),
999 }
1000 },
1001
1002 //§ attribute-value-(unquoted)-state
1003 states::AttributeValue(Unquoted) => loop {
1004 match pop_except_from!(self, input, small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')) {
1005 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ')
1006 => go!(self: to BeforeAttributeName),
1007 FromSet('&') => go!(self: consume_char_ref '>'),
1008 FromSet('>') => go!(self: emit_tag Data),
1009 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1010 FromSet(c) => {
1011 go_match!(self: c,
1012 '"' , '\'' , '<' , '=' , '`' => error);
1013 go!(self: push_value c);
1014 }
1015 NotFromSet(ref b) => go!(self: append_value b),
1016 }
1017 },
1018
1019 //§ after-attribute-value-(quoted)-state
1020 states::AfterAttributeValueQuoted => loop { match get_char!(self, input) {
1021 '\t' | '\n' | '\x0C' | ' '
1022 => go!(self: to BeforeAttributeName),
1023 '/' => go!(self: to SelfClosingStartTag),
1024 '>' => go!(self: emit_tag Data),
1025 _ => go!(self: error; reconsume BeforeAttributeName),
1026 }},
1027
1028 //§ self-closing-start-tag-state
1029 states::SelfClosingStartTag => loop { match get_char!(self, input) {
1030 '>' => {
1031 self.current_tag_self_closing = true;
1032 go!(self: emit_tag Data);
1033 }
1034 _ => go!(self: error; reconsume BeforeAttributeName),
1035 }},
1036
1037 //§ comment-start-state
1038 states::CommentStart => loop { match get_char!(self, input) {
1039 '-' => go!(self: to CommentStartDash),
1040 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1041 '>' => go!(self: error; emit_comment; to Data),
1042 c => go!(self: push_comment c; to Comment),
1043 }},
1044
1045 //§ comment-start-dash-state
1046 states::CommentStartDash => loop { match get_char!(self, input) {
1047 '-' => go!(self: to CommentEnd),
1048 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1049 '>' => go!(self: error; emit_comment; to Data),
1050 c => go!(self: push_comment '-'; push_comment c; to Comment),
1051 }},
1052
1053 //§ comment-state
1054 states::Comment => loop { match get_char!(self, input) {
1055 '-' => go!(self: to CommentEndDash),
1056 '\0' => go!(self: error; push_comment '\u{fffd}'),
1057 c => go!(self: push_comment c),
1058 }},
1059
1060 //§ comment-end-dash-state
1061 states::CommentEndDash => loop { match get_char!(self, input) {
1062 '-' => go!(self: to CommentEnd),
1063 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1064 c => go!(self: push_comment '-'; push_comment c; to Comment),
1065 }},
1066
1067 //§ comment-end-state
1068 states::CommentEnd => loop { match get_char!(self, input) {
1069 '>' => go!(self: emit_comment; to Data),
1070 '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1071 '!' => go!(self: error; to CommentEndBang),
1072 '-' => go!(self: error; push_comment '-'),
1073 c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1074 }},
1075
1076 //§ comment-end-bang-state
1077 states::CommentEndBang => loop { match get_char!(self, input) {
1078 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1079 '>' => go!(self: emit_comment; to Data),
1080 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1081 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1082 }},
1083
1084 //§ doctype-state
1085 states::Doctype => loop { match get_char!(self, input) {
1086 '\t' | '\n' | '\x0C' | ' '
1087 => go!(self: to BeforeDoctypeName),
1088 _ => go!(self: error; reconsume BeforeDoctypeName),
1089 }},
1090
1091 //§ before-doctype-name-state
1092 states::BeforeDoctypeName => loop { match get_char!(self, input) {
1093 '\t' | '\n' | '\x0C' | ' ' => (),
1094 '\0' => go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName),
1095 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1096 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1097 to DoctypeName),
1098 }},
1099
1100 //§ doctype-name-state
1101 states::DoctypeName => loop { match get_char!(self, input) {
1102 '\t' | '\n' | '\x0C' | ' '
1103 => go!(self: clear_temp; to AfterDoctypeName),
1104 '>' => go!(self: emit_doctype; to Data),
1105 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1106 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1107 }},
1108
1109 //§ after-doctype-name-state
1110 states::AfterDoctypeName => loop {
1111 if eat!(self, input, "public") {
1112 go!(self: to AfterDoctypeKeyword Public);
1113 } else if eat!(self, input, "system") {
1114 go!(self: to AfterDoctypeKeyword System);
1115 } else {
1116 match get_char!(self, input) {
1117 '\t' | '\n' | '\x0C' | ' ' => (),
1118 '>' => go!(self: emit_doctype; to Data),
1119 _ => go!(self: error; force_quirks; to BogusDoctype),
1120 }
1121 }
1122 },
1123
1124 //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1125 states::AfterDoctypeKeyword(kind) => loop { match get_char!(self, input) {
1126 '\t' | '\n' | '\x0C' | ' '
1127 => go!(self: to BeforeDoctypeIdentifier kind),
1128 '"' => go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1129 '\'' => go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1130 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1131 _ => go!(self: error; force_quirks; to BogusDoctype),
1132 }},
1133
1134 //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1135 states::BeforeDoctypeIdentifier(kind) => loop { match get_char!(self, input) {
1136 '\t' | '\n' | '\x0C' | ' ' => (),
1137 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1138 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1139 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1140 _ => go!(self: error; force_quirks; to BogusDoctype),
1141 }},
1142
1143 //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1144 states::DoctypeIdentifierDoubleQuoted(kind) => loop { match get_char!(self, input) {
1145 '"' => go!(self: to AfterDoctypeIdentifier kind),
1146 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1147 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1148 c => go!(self: push_doctype_id kind c),
1149 }},
1150
1151 //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1152 states::DoctypeIdentifierSingleQuoted(kind) => loop { match get_char!(self, input) {
1153 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1154 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1155 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1156 c => go!(self: push_doctype_id kind c),
1157 }},
1158
1159 //§ after-doctype-public-identifier-state
1160 states::AfterDoctypeIdentifier(Public) => loop { match get_char!(self, input) {
1161 '\t' | '\n' | '\x0C' | ' '
1162 => go!(self: to BetweenDoctypePublicAndSystemIdentifiers),
1163 '>' => go!(self: emit_doctype; to Data),
1164 '"' => go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System),
1165 '\'' => go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System),
1166 _ => go!(self: error; force_quirks; to BogusDoctype),
1167 }},
1168
1169 //§ after-doctype-system-identifier-state
1170 states::AfterDoctypeIdentifier(System) => loop { match get_char!(self, input) {
1171 '\t' | '\n' | '\x0C' | ' ' => (),
1172 '>' => go!(self: emit_doctype; to Data),
1173 _ => go!(self: error; to BogusDoctype),
1174 }},
1175
1176 //§ between-doctype-public-and-system-identifiers-state
1177 states::BetweenDoctypePublicAndSystemIdentifiers => loop { match get_char!(self, input) {
1178 '\t' | '\n' | '\x0C' | ' ' => (),
1179 '>' => go!(self: emit_doctype; to Data),
1180 '"' => go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System),
1181 '\'' => go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System),
1182 _ => go!(self: error; force_quirks; to BogusDoctype),
1183 }},
1184
1185 //§ bogus-doctype-state
1186 states::BogusDoctype => loop { match get_char!(self, input) {
1187 '>' => go!(self: emit_doctype; to Data),
1188 _ => (),
1189 }},
1190
1191 //§ bogus-comment-state
1192 states::BogusComment => loop { match get_char!(self, input) {
1193 '>' => go!(self: emit_comment; to Data),
1194 '\0' => go!(self: push_comment '\u{fffd}'),
1195 c => go!(self: push_comment c),
1196 }},
1197
1198 //§ markup-declaration-open-state
1199 states::MarkupDeclarationOpen => loop {
1200 if eat_exact!(self, input, "--") {
1201 go!(self: clear_comment; to CommentStart);
1202 } else if eat!(self, input, "doctype") {
1203 go!(self: to Doctype);
1204 } else {
1205 if self.sink.adjusted_current_node_present_but_not_in_html_namespace() {
1206 if eat_exact!(self, input, "[CDATA[") {
1207 go!(self: clear_temp; to CdataSection);
1208 }
1209 }
1210 go!(self: error; to BogusComment);
1211 }
1212 },
1213
1214 //§ cdata-section-state
1215 states::CdataSection => loop { match get_char!(self, input) {
1216 ']' => go!(self: to CdataSectionBracket),
1217 '\0' => go!(self: emit_temp; emit '\0'),
1218 c => go!(self: push_temp c),
1219 }},
1220
1221 //§ cdata-section-bracket
1222 states::CdataSectionBracket => match get_char!(self, input) {
1223 ']' => go!(self: to CdataSectionEnd),
1224 _ => go!(self: push_temp ']'; reconsume CdataSection),
1225 },
1226
1227 //§ cdata-section-end
1228 states::CdataSectionEnd => loop { match get_char!(self, input) {
1229 ']' => go!(self: push_temp ']'),
1230 '>' => go!(self: emit_temp; to Data),
1231 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1232 }},
1233
1234 //§ END
1235 }
1236 }
1237
1238 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1239 // FIXME HACK: Take and replace the tokenizer so we don't
1240 // double-mut-borrow self. This is why it's boxed.
1241 let mut tok = self.char_ref_tokenizer.take().unwrap();
1242 let outcome = tok.step(self, input);
1243
1244 let progress = match outcome {
1245 char_ref::Done => {
1246 self.process_char_ref(tok.get_result());
1247 return ProcessResult::Continue;
1248 }
1249
1250 char_ref::Stuck => ProcessResult::Suspend,
1251 char_ref::Progress => ProcessResult::Continue,
1252 };
1253
1254 self.char_ref_tokenizer = Some(tok);
1255 progress
1256 }
1257
1258 fn process_char_ref(&mut self, char_ref: CharRef) {
1259 let CharRef { mut chars, mut num_chars } = char_ref;
1260
1261 if num_chars == 0 {
1262 chars[0] = '&';
1263 num_chars = 1;
1264 }
1265
1266 for i in 0 .. num_chars {
1267 let c = chars[i as usize];
1268 match self.state {
1269 states::Data | states::RawData(states::Rcdata)
1270 => go!(self: emit c),
1271
1272 states::AttributeValue(_)
1273 => go!(self: push_value c),
1274
1275 _ => panic!("state {:?} should not be reachable in process_char_ref", self.state),
1276 }
1277 }
1278 }
1279
1280 /// Indicate that we have reached the end of the input.
1281 pub fn end(&mut self) {
1282 // Handle EOF in the char ref sub-tokenizer, if there is one.
1283 // Do this first because it might un-consume stuff.
1284 let mut input = BufferQueue::new();
1285 match self.char_ref_tokenizer.take() {
1286 None => (),
1287 Some(mut tok) => {
1288 tok.end_of_file(self, &mut input);
1289 self.process_char_ref(tok.get_result());
1290 }
1291 }
1292
1293 // Process all remaining buffered input.
1294 // If we're waiting for lookahead, we're not gonna get it.
1295 self.at_eof = true;
1296 assert!(matches!(self.run(&mut input), TokenizerResult::Done));
1297 assert!(input.is_empty());
1298
1299 loop {
1300 match self.eof_step() {
1301 ProcessResult::Continue => (),
1302 ProcessResult::Suspend => break,
1303 ProcessResult::Script(_) => unreachable!(),
1304 }
1305 }
1306
1307 self.sink.end();
1308
1309 if self.opts.profile {
1310 self.dump_profile();
1311 }
1312 }
1313
1314 fn dump_profile(&self) {
1315 let mut results: Vec<(states::State, u64)>
1316 = self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1317 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1318
1319 let total: u64 = results.iter().map(|&(_, t)| t).fold(0, ::std::ops::Add::add);
1320 println!("\nTokenizer profile, in nanoseconds");
1321 println!("\n{:12} total in token sink", self.time_in_sink);
1322 println!("\n{:12} total in tokenizer", total);
1323
1324 for (k, v) in results.into_iter() {
1325 let pct = 100.0 * (v as f64) / (total as f64);
1326 println!("{:12} {:4.1}% {:?}", v, pct, k);
1327 }
1328 }
1329
    /// Drive one step of the tokenizer after end-of-input. Each state either
    /// flushes whatever it has buffered (partial tag, comment, doctype, CDATA
    /// temp buffer) and moves toward `Data`, or — in the plain-text states —
    /// signals end-of-stream via the `eof` action.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // Nothing buffered in these states: just signal EOF.
            states::Data | states::RawData(Rcdata) | states::RawData(Rawtext)
            | states::RawData(ScriptData) | states::Plaintext
                => go!(self: eof),

            // A partially-parsed tag/attribute is abandoned: report a
            // parse error and return to the data state.
            states::TagName | states::RawData(ScriptDataEscaped(_))
            | states::BeforeAttributeName | states::AttributeName
            | states::AfterAttributeName | states::BeforeAttributeValue
            | states::AttributeValue(_) | states::AfterAttributeValueQuoted
            | states::SelfClosingStartTag | states::ScriptDataEscapedDash(_)
            | states::ScriptDataEscapedDashDash(_)
                => go!(self: error_eof; to Data),

            // '<' was consumed but no tag followed: reinterpret it as data.
            states::TagOpen
                => go!(self: error_eof; emit '<'; to Data),

            // '</' was consumed: both characters become data.
            states::EndTagOpen
                => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped))
                => go!(self: to RawData ScriptDataEscaped DoubleEscaped),

            // '<' inside raw text is literal character data.
            states::RawLessThanSign(kind)
                => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind)
                => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the partially-matched end tag name held in temp_buf.
            states::RawEndTagName(kind)
                => go!(self: emit '<'; emit '/'; emit_temp; to RawData kind),

            states::ScriptDataEscapeStart(kind)
                => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash
                => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd
                => go!(self: to RawData ScriptDataEscaped DoubleEscaped),

            // An unterminated comment is emitted as-is, with a parse error.
            states::CommentStart | states::CommentStartDash
            | states::Comment | states::CommentEndDash
            | states::CommentEnd | states::CommentEndBang
                => go!(self: error_eof; emit_comment; to Data),

            // EOF before any doctype name: synthesize an empty doctype
            // with force-quirks set.
            states::Doctype | states::BeforeDoctypeName
                => go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data),

            // EOF inside a doctype: emit what was collected, forcing quirks.
            states::DoctypeName | states::AfterDoctypeName | states::AfterDoctypeKeyword(_)
            | states::BeforeDoctypeIdentifier(_) | states::DoctypeIdentifierDoubleQuoted(_)
            | states::DoctypeIdentifierSingleQuoted(_) | states::AfterDoctypeIdentifier(_)
            | states::BetweenDoctypePublicAndSystemIdentifiers
                => go!(self: error_eof; force_quirks; emit_doctype; to Data),

            // Bogus doctype/comment: emit without an additional EOF error.
            states::BogusDoctype
                => go!(self: emit_doctype; to Data),

            states::BogusComment
                => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen
                => go!(self: error; to BogusComment),

            // Unterminated CDATA: flush the buffered text, then report EOF.
            states::CdataSection
                => go!(self: emit_temp; error_eof; to Data),

            // Pending ']' characters become part of the CDATA text.
            states::CdataSectionBracket
                => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd
                => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1405 }
1406
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use tendril::{StrTendril, SliceExt};

    use super::{TokenSink, Tokenizer, TokenizerOpts, TokenSinkResult};

    use super::interface::{Token, TagToken};
    use super::interface::{CharacterTokens, NullCharacterToken, EOFToken, ParseError};
    use super::interface::{TagKind, StartTag, EndTag, Tag};

    use super::buffer_queue::BufferQueue;
    use std::mem::replace;

    use {LocalName};

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        /// Record `token` as seen on `line_number`, flushing any buffered
        /// character data first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        /// Collapse the accumulated character data into one CharacterTokens.
        fn finish_str(&mut self) {
            if self.current_str.len() > 0 {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => self.current_str.push_slice(&b),

                NullCharacterToken => self.current_str.push_char('\0'),

                ParseError(_) => panic!("unexpected parse error"),

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        }
                        // Normalize attr order for comparison.
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                }

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Options shared by the line-counting tests below.
    fn line_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let vector = vec![StrTendril::from("<a>\n"), StrTendril::from("<b>\n"),
                          StrTendril::from("</b>\n"), StrTendril::from("</a>\n")];
        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
                            (create_tag(StrTendril::from("b"), StartTag), 2),
                            (create_tag(StrTendril::from("b"), EndTag), 3),
                            (create_tag(StrTendril::from("a"), EndTag), 4)];
        assert_eq!(tokenize(vector, line_opts()), expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![StrTendril::from("<a>\r\n"), StrTendril::from("<b>\r\n"),
                          StrTendril::from("</b>\r\n"), StrTendril::from("</a>\r\n")];
        let expected = vec![(create_tag(StrTendril::from("a"), StartTag), 1),
                            (create_tag(StrTendril::from("b"), StartTag), 2),
                            (create_tag(StrTendril::from("b"), EndTag), 3),
                            (create_tag(StrTendril::from("a"), EndTag), 4)];
        assert_eq!(tokenize(vector, line_opts()), expected);
    }
}
1581