1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! The HTML5 tokenizer.
11
12 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14 pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15 pub use self::interface::{TokenSink, TokenSinkResult};
16
17 use self::states::{DoctypeIdKind, Public, System};
18 use self::states::{DoubleEscaped, Escaped};
19 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20 use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22 use self::char_ref::{CharRef, CharRefTokenizer};
23
24 use util::str::lower_ascii_letter;
25
26 use std::borrow::Cow::{self, Borrowed};
27 use std::collections::BTreeMap;
28 use std::default::Default;
29 use std::mem::replace;
30
31 pub use buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
32 use tendril::StrTendril;
33 use {Attribute, LocalName, QualName, SmallCharSet};
34
35 mod char_ref;
36 mod interface;
37 pub mod states;
38
/// Result of a single step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping; more input is needed (or we reached EOF).
    Suspend,
    /// Tokenization paused at a script element; the caller handles it
    /// before resuming.
    Script(Handle),
}
44
/// Result of feeding a batch of input to the tokenizer.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// Input was consumed as far as possible.
    Done,
    /// Tokenization paused at a script element; handle it, then feed again.
    Script(Handle),
}
50
option_push(opt_str: &mut Option<StrTendril>, c: char)51 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
52 match *opt_str {
53 Some(ref mut s) => s.push_char(c),
54 None => *opt_str = Some(StrTendril::from_char(c)),
55 }
56 }
57
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
84
85 impl Default for TokenizerOpts {
default() -> TokenizerOpts86 fn default() -> TokenizerOpts {
87 TokenizerOpts {
88 exact_errors: false,
89 discard_bom: true,
90 profile: false,
91 initial_state: None,
92 last_start_tag_name: None,
93 }
94 }
95 }
96
/// The HTML tokenizer.
///
/// Holds the full state of the spec's tokenization state machine plus
/// the partially-built token (tag, comment, or doctype) in progress.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind (start or end).
    current_tag_kind: TagKind,

    /// Current tag name.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name.
    current_attr_name: StrTendril,

    /// Current attribute value.
    current_attr_value: StrTendril,

    /// Current comment.
    current_comment: StrTendril,

    /// Current doctype token.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Track current line, for error reporting (1-based).
    current_line: u64,
}
169
170 impl<Sink: TokenSink> Tokenizer<Sink> {
    /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
    pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
        // Move the test-only last-start-tag override out of the options.
        let start_tag_name = opts
            .last_start_tag_name
            .take()
            .map(|s| LocalName::from(&*s));
        // Test-only initial state override; normal parsing starts in Data.
        let state = opts.initial_state.unwrap_or(states::Data);
        let discard_bom = opts.discard_bom;
        Tokenizer {
            opts: opts,
            sink: sink,
            state: state,
            char_ref_tokenizer: None,
            at_eof: false,
            current_char: '\0',
            reconsume: false,
            ignore_lf: false,
            discard_bom: discard_bom,
            current_tag_kind: StartTag,
            current_tag_name: StrTendril::new(),
            current_tag_self_closing: false,
            current_tag_attrs: vec![],
            current_attr_name: StrTendril::new(),
            current_attr_value: StrTendril::new(),
            current_comment: StrTendril::new(),
            current_doctype: Doctype::new(),
            last_start_tag_name: start_tag_name,
            temp_buf: StrTendril::new(),
            state_profile: BTreeMap::new(),
            time_in_sink: 0,
            current_line: 1,
        }
    }
204
205 /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>206 pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
207 if input.is_empty() {
208 return TokenizerResult::Done;
209 }
210
211 if self.discard_bom {
212 if let Some(c) = input.peek() {
213 if c == '\u{feff}' {
214 input.next();
215 }
216 } else {
217 return TokenizerResult::Done;
218 }
219 };
220
221 self.run(input)
222 }
223
    /// Switch the machine into the PLAINTEXT state, in which remaining
    /// input is emitted as character data.
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }
227
    /// Pass a token to the sink, tracking time spent inside the sink
    /// separately when profiling is enabled.
    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
            self.time_in_sink += dt;
            ret
        } else {
            self.sink.process_token(token, self.current_line)
        }
    }
237
    /// Pass a token to the sink, asserting that the sink does not request
    /// a state change or suspension (callers here can't handle one).
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
244
    //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    //
    // Performs the spec's input-stream preprocessing: CR and CRLF are
    // normalized to LF, the line counter is advanced, and (when
    // `exact_errors` is on) control/noncharacter code points are reported.
    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
        // A previous '\r' was already emitted as '\n'; drop a following '\n'.
        if self.ignore_lf {
            self.ignore_lf = false;
            if c == '\n' {
                c = unwrap_or_return!(input.next(), None);
            }
        }

        if c == '\r' {
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        // Only checked in exact-errors mode because it costs a test per char.
        if self.opts.exact_errors &&
            match c as u32 {
                0x01...0x08 | 0x0B | 0x0E...0x1F | 0x7F...0x9F | 0xFDD0...0xFDEF => true,
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        debug!("got character {}", c);
        self.current_char = c;
        Some(c)
    }
280
281 //§ tokenization
282 // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>283 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
284 if self.reconsume {
285 self.reconsume = false;
286 Some(self.current_char)
287 } else {
288 input
289 .next()
290 .and_then(|c| self.get_preprocessed_char(c, input))
291 }
292 }
293
    /// Pop either one character from `set`, or a whole run of characters
    /// not in `set`, whichever comes first (see `BufferQueue::pop_except_from`).
    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume || self.ignore_lf {
            return self.get_char(input).map(|x| FromSet(x));
        }

        let d = input.pop_except_from(set);
        debug!("got characters {:?}", d);
        match d {
            // Single set-member characters still get stream preprocessing.
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
314
    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // Returns Some(true) on a match, Some(false) on a definite mismatch
    // (including EOF), and None when more input is needed to decide.
    //
    // NB: this doesn't do input stream preprocessing or set the current input
    // character.
    fn eat(
        &mut self,
        input: &mut BufferQueue,
        pat: &str,
        eq: fn(&u8, &u8) -> bool,
    ) -> Option<bool> {
        // Re-prepend anything stashed from an earlier partial match.
        input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
        match input.eat(pat, eq) {
            None if self.at_eof => Some(false),
            None => {
                // Not enough input to decide: stash what's left so the next
                // call can retry the match from the beginning.
                while let Some(c) = input.next() {
                    self.temp_buf.push_char(c);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }
338
339 /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>340 fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
341 if self.opts.profile {
342 loop {
343 let state = self.state;
344 let old_sink = self.time_in_sink;
345 let (run, mut dt) = time!(self.step(input));
346 dt -= (self.time_in_sink - old_sink);
347 let new = match self.state_profile.get_mut(&state) {
348 Some(x) => {
349 *x += dt;
350 false
351 },
352 None => true,
353 };
354 if new {
355 // do this here because of borrow shenanigans
356 self.state_profile.insert(state, dt);
357 }
358 match run {
359 ProcessResult::Continue => (),
360 ProcessResult::Suspend => break,
361 ProcessResult::Script(node) => return TokenizerResult::Script(node),
362 }
363 }
364 } else {
365 loop {
366 match self.step(input) {
367 ProcessResult::Continue => (),
368 ProcessResult::Suspend => break,
369 ProcessResult::Script(node) => return TokenizerResult::Script(node),
370 }
371 }
372 }
373 TokenizerResult::Done
374 }
375
    /// Report an unexpected character for the current state. The detailed
    /// message is only built in exact-errors mode.
    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char,
            self.state
        );
        self.emit_error(msg);
    }
386
    /// Report an unexpected end of input for the current state. The detailed
    /// message is only built in exact-errors mode.
    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state
        );
        self.emit_error(msg);
    }
396
emit_char(&mut self, c: char)397 fn emit_char(&mut self, c: char) {
398 self.process_token_and_continue(match c {
399 '\0' => NullCharacterToken,
400 _ => CharacterTokens(StrTendril::from_char(c)),
401 });
402 }
403
    // Emit a run of characters as one token.
    // The string must not contain '\0'!
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
408
    /// Emit the tag built so far and reset the per-tag state.
    ///
    /// Records the name of a start tag for later "appropriate end tag"
    /// checks, reports spec errors on malformed end tags, and translates
    /// the sink's response into the next machine state.
    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                self.last_start_tag_name = Some(name.clone());
            },
            EndTag => {
                // Per spec, end tags may carry neither attributes nor a
                // self-closing flag.
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind,
            name: name,
            self_closing: self.current_tag_self_closing,
            attrs: replace(&mut self.current_tag_attrs, vec![]),
        });

        // The sink may override the default next state (e.g. for <script>,
        // <plaintext>, or raw-text elements).
        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            },
        }
    }
452
    /// Emit the contents of the temporary buffer as character tokens,
    /// leaving the buffer empty.
    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }
458
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }
463
    /// Emit the comment built so far, resetting the buffer for reuse.
    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }
468
discard_tag(&mut self)469 fn discard_tag(&mut self) {
470 self.current_tag_name.clear();
471 self.current_tag_self_closing = false;
472 self.current_tag_attrs = vec![];
473 }
474
create_tag(&mut self, kind: TagKind, c: char)475 fn create_tag(&mut self, kind: TagKind, c: char) {
476 self.discard_tag();
477 self.current_tag_name.push_char(c);
478 self.current_tag_kind = kind;
479 }
480
have_appropriate_end_tag(&self) -> bool481 fn have_appropriate_end_tag(&self) -> bool {
482 match self.last_start_tag_name.as_ref() {
483 Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
484 None => false,
485 }
486 }
487
    /// Finish any attribute in progress, then start a new one whose
    /// name begins with `c`.
    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
493
    /// Commit the attribute in progress to the current tag, dropping it
    /// (with a parse error) if an attribute of the same name already exists.
    /// A no-op when no attribute name has been started.
    fn finish_attribute(&mut self) {
        if self.current_attr_name.len() == 0 {
            return;
        }

        // Check for a duplicate attribute.
        // FIXME: the spec says we should error as soon as the name is finished.
        // FIXME: linear time search, do we care?
        let dup = {
            let name = &*self.current_attr_name;
            self.current_tag_attrs
                .iter()
                .any(|a| &*a.name.local == name)
        };

        if dup {
            // Per spec, the duplicate is discarded; the first wins.
            self.emit_error(Borrowed("Duplicate attribute"));
            self.current_attr_name.clear();
            self.current_attr_value.clear();
        } else {
            let name = LocalName::from(&*self.current_attr_name);
            self.current_attr_name.clear();
            self.current_tag_attrs.push(Attribute {
                // The tree builder will adjust the namespace if necessary.
                // This only happens in foreign elements.
                name: QualName::new(None, ns!(), name),
                value: replace(&mut self.current_attr_value, StrTendril::new()),
            });
        }
    }
524
    /// Emit the doctype built so far, resetting it for reuse.
    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }
529
    /// Borrow the public or system identifier field of the doctype
    /// being built, selected by `kind`.
    fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
        match kind {
            Public => &mut self.current_doctype.public_id,
            System => &mut self.current_doctype.system_id,
        }
    }
536
clear_doctype_id(&mut self, kind: DoctypeIdKind)537 fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
538 let id = self.doctype_id(kind);
539 match *id {
540 Some(ref mut s) => s.clear(),
541 None => *id = Some(StrTendril::new()),
542 }
543 }
544
    /// Begin tokenizing a character reference; subsequent steps are
    /// delegated to the sub-tokenizer until it finishes.
    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }
550
    /// Emit the end-of-file token.
    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }
554
peek(&mut self, input: &BufferQueue) -> Option<char>555 fn peek(&mut self, input: &BufferQueue) -> Option<char> {
556 if self.reconsume {
557 Some(self.current_char)
558 } else {
559 input.peek()
560 }
561 }
562
    /// Consume and throw away the next input character.
    fn discard_char(&mut self, input: &mut BufferQueue) {
        self.get_char(input);
    }
566
    /// Report a parse error to the sink.
    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
570 }
571 //§ END
572
// Shorthand for common state machine behaviors.
// Each arm maps a short command name (used inside go!) to the method or
// field mutation on the tokenizer that implements it.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
    ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true; );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
    ( $me:ident : error ) => ( $me.bad_char_error(); );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
);
600
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
//
// NOTE: the forward to `shorthand!` must pass the bare identifier; passing
// `$me:expr` (a fragment specifier) into the invocation is a syntax error,
// and `{:s}` is not a valid format specifier — this branch previously
// failed to compile whenever `trace_tokenizer` was enabled.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));
608
// Without tracing, sh_trace! is just shorthand! itself.
#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
611
// A little DSL for sequencing shorthand actions.
// A go! invocation is a ';'-separated list of shorthand commands, optionally
// ending in a control action (to / reconsume / consume_char_ref / emit_tag /
// eof) which changes state and returns early from step().
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
649
// Run a go! command sequence only when $x matches one of the given patterns;
// do nothing otherwise.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
656
// This is a macro because it can cause early return
// from the function where it is used: suspend when no input is available.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));
662
// Peek at the next character, suspending if none is available.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));
666
// Pop a set character or a run of non-set characters, suspending if empty.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));
670
// ASCII case-insensitive lookahead match, suspending when undecidable.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));
674
// Case-sensitive lookahead match, suspending when undecidable.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
678
679 impl<Sink: TokenSink> Tokenizer<Sink> {
680 // Run the state machine for a while.
681 // Return true if we should be immediately re-invoked
682 // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle>683 fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
684 if self.char_ref_tokenizer.is_some() {
685 return self.step_char_ref_tokenizer(input);
686 }
687
688 debug!("processing in state {:?}", self.state);
689 match self.state {
690 //§ data-state
691 states::Data => loop {
692 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
693 FromSet('\0') => go!(self: error; emit '\0'),
694 FromSet('&') => go!(self: consume_char_ref),
695 FromSet('<') => go!(self: to TagOpen),
696 FromSet(c) => go!(self: emit c),
697 NotFromSet(b) => self.emit_chars(b),
698 }
699 },
700
701 //§ rcdata-state
702 states::RawData(Rcdata) => loop {
703 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
704 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
705 FromSet('&') => go!(self: consume_char_ref),
706 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
707 FromSet(c) => go!(self: emit c),
708 NotFromSet(b) => self.emit_chars(b),
709 }
710 },
711
712 //§ rawtext-state
713 states::RawData(Rawtext) => loop {
714 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
715 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
716 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
717 FromSet(c) => go!(self: emit c),
718 NotFromSet(b) => self.emit_chars(b),
719 }
720 },
721
722 //§ script-data-state
723 states::RawData(ScriptData) => loop {
724 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
725 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
726 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
727 FromSet(c) => go!(self: emit c),
728 NotFromSet(b) => self.emit_chars(b),
729 }
730 },
731
732 //§ script-data-escaped-state
733 states::RawData(ScriptDataEscaped(Escaped)) => loop {
734 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
735 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
736 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
737 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
738 FromSet(c) => go!(self: emit c),
739 NotFromSet(b) => self.emit_chars(b),
740 }
741 },
742
743 //§ script-data-double-escaped-state
744 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
745 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
746 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
747 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
748 FromSet('<') => {
749 go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
750 },
751 FromSet(c) => go!(self: emit c),
752 NotFromSet(b) => self.emit_chars(b),
753 }
754 },
755
756 //§ plaintext-state
757 states::Plaintext => loop {
758 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
759 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
760 FromSet(c) => go!(self: emit c),
761 NotFromSet(b) => self.emit_chars(b),
762 }
763 },
764
765 //§ tag-open-state
766 states::TagOpen => loop {
767 match get_char!(self, input) {
768 '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
769 '/' => go!(self: to EndTagOpen),
770 '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
771 c => match lower_ascii_letter(c) {
772 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
773 None => go!(self: error; emit '<'; reconsume Data),
774 },
775 }
776 },
777
778 //§ end-tag-open-state
779 states::EndTagOpen => loop {
780 match get_char!(self, input) {
781 '>' => go!(self: error; to Data),
782 '\0' => {
783 go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
784 },
785 c => match lower_ascii_letter(c) {
786 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
787 None => go!(self: error; clear_comment; push_comment c; to BogusComment),
788 },
789 }
790 },
791
792 //§ tag-name-state
793 states::TagName => loop {
794 match get_char!(self, input) {
795 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
796 '/' => go!(self: to SelfClosingStartTag),
797 '>' => go!(self: emit_tag Data),
798 '\0' => go!(self: error; push_tag '\u{fffd}'),
799 c => go!(self: push_tag (c.to_ascii_lowercase())),
800 }
801 },
802
803 //§ script-data-escaped-less-than-sign-state
804 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
805 match get_char!(self, input) {
806 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
807 c => match lower_ascii_letter(c) {
808 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
809 to ScriptDataEscapeStart DoubleEscaped),
810 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
811 },
812 }
813 },
814
815 //§ script-data-double-escaped-less-than-sign-state
816 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
817 match get_char!(self, input) {
818 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
819 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
820 }
821 },
822
823 //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
824 // otherwise
825 states::RawLessThanSign(kind) => loop {
826 match get_char!(self, input) {
827 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
828 '!' if kind == ScriptData => {
829 go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
830 },
831 _ => go!(self: emit '<'; reconsume RawData kind),
832 }
833 },
834
835 //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
836 states::RawEndTagOpen(kind) => loop {
837 let c = get_char!(self, input);
838 match lower_ascii_letter(c) {
839 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
840 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
841 }
842 },
843
844 //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
845 states::RawEndTagName(kind) => loop {
846 let c = get_char!(self, input);
847 if self.have_appropriate_end_tag() {
848 match c {
849 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
850 '/' => go!(self: to SelfClosingStartTag),
851 '>' => go!(self: emit_tag Data),
852 _ => (),
853 }
854 }
855
856 match lower_ascii_letter(c) {
857 Some(cl) => go!(self: push_tag cl; push_temp c),
858 None => {
859 go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
860 },
861 }
862 },
863
864 //§ script-data-double-escape-start-state
865 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
866 let c = get_char!(self, input);
867 match c {
868 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
869 let esc = if &*self.temp_buf == "script" {
870 DoubleEscaped
871 } else {
872 Escaped
873 };
874 go!(self: emit c; to RawData ScriptDataEscaped esc);
875 },
876 _ => match lower_ascii_letter(c) {
877 Some(cl) => go!(self: push_temp cl; emit c),
878 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
879 },
880 }
881 },
882
883 //§ script-data-escape-start-state
884 states::ScriptDataEscapeStart(Escaped) => loop {
885 match get_char!(self, input) {
886 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
887 _ => go!(self: reconsume RawData ScriptData),
888 }
889 },
890
891 //§ script-data-escape-start-dash-state
892 states::ScriptDataEscapeStartDash => loop {
893 match get_char!(self, input) {
894 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
895 _ => go!(self: reconsume RawData ScriptData),
896 }
897 },
898
899 //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
900 states::ScriptDataEscapedDash(kind) => loop {
901 match get_char!(self, input) {
902 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
903 '<' => {
904 if kind == DoubleEscaped {
905 go!(self: emit '<');
906 }
907 go!(self: to RawLessThanSign ScriptDataEscaped kind);
908 },
909 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
910 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
911 }
912 },
913
914 //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
915 states::ScriptDataEscapedDashDash(kind) => loop {
916 match get_char!(self, input) {
917 '-' => go!(self: emit '-'),
918 '<' => {
919 if kind == DoubleEscaped {
920 go!(self: emit '<');
921 }
922 go!(self: to RawLessThanSign ScriptDataEscaped kind);
923 },
924 '>' => go!(self: emit '>'; to RawData ScriptData),
925 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
926 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
927 }
928 },
929
930 //§ script-data-double-escape-end-state
931 states::ScriptDataDoubleEscapeEnd => loop {
932 let c = get_char!(self, input);
933 match c {
934 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
935 let esc = if &*self.temp_buf == "script" {
936 Escaped
937 } else {
938 DoubleEscaped
939 };
940 go!(self: emit c; to RawData ScriptDataEscaped esc);
941 },
942 _ => match lower_ascii_letter(c) {
943 Some(cl) => go!(self: push_temp cl; emit c),
944 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
945 },
946 }
947 },
948
949 //§ before-attribute-name-state
950 states::BeforeAttributeName => loop {
951 match get_char!(self, input) {
952 '\t' | '\n' | '\x0C' | ' ' => (),
953 '/' => go!(self: to SelfClosingStartTag),
954 '>' => go!(self: emit_tag Data),
955 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
956 c => match lower_ascii_letter(c) {
957 Some(cl) => go!(self: create_attr cl; to AttributeName),
958 None => {
959 go_match!(self: c,
960 '"' , '\'' , '<' , '=' => error);
961 go!(self: create_attr c; to AttributeName);
962 },
963 },
964 }
965 },
966
967 //§ attribute-name-state
968 states::AttributeName => loop {
969 match get_char!(self, input) {
970 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
971 '/' => go!(self: to SelfClosingStartTag),
972 '=' => go!(self: to BeforeAttributeValue),
973 '>' => go!(self: emit_tag Data),
974 '\0' => go!(self: error; push_name '\u{fffd}'),
975 c => match lower_ascii_letter(c) {
976 Some(cl) => go!(self: push_name cl),
977 None => {
978 go_match!(self: c,
979 '"' , '\'' , '<' => error);
980 go!(self: push_name c);
981 },
982 },
983 }
984 },
985
986 //§ after-attribute-name-state
987 states::AfterAttributeName => loop {
988 match get_char!(self, input) {
989 '\t' | '\n' | '\x0C' | ' ' => (),
990 '/' => go!(self: to SelfClosingStartTag),
991 '=' => go!(self: to BeforeAttributeValue),
992 '>' => go!(self: emit_tag Data),
993 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
994 c => match lower_ascii_letter(c) {
995 Some(cl) => go!(self: create_attr cl; to AttributeName),
996 None => {
997 go_match!(self: c,
998 '"' , '\'' , '<' => error);
999 go!(self: create_attr c; to AttributeName);
1000 },
1001 },
1002 }
1003 },
1004
1005 //§ before-attribute-value-state
1006 // Use peek so we can handle the first attr character along with the rest,
1007 // hopefully in the same zero-copy buffer.
1008 states::BeforeAttributeValue => loop {
1009 match peek!(self, input) {
1010 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1011 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1012 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1013 '\0' => {
1014 go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
1015 },
1016 '>' => go!(self: discard_char input; error; emit_tag Data),
1017 _ => go!(self: to AttributeValue Unquoted),
1018 }
1019 },
1020
1021 //§ attribute-value-(double-quoted)-state
1022 states::AttributeValue(DoubleQuoted) => loop {
1023 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1024 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1025 FromSet('&') => go!(self: consume_char_ref '"'),
1026 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1027 FromSet(c) => go!(self: push_value c),
1028 NotFromSet(ref b) => go!(self: append_value b),
1029 }
1030 },
1031
1032 //§ attribute-value-(single-quoted)-state
1033 states::AttributeValue(SingleQuoted) => loop {
1034 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1035 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1036 FromSet('&') => go!(self: consume_char_ref '\''),
1037 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1038 FromSet(c) => go!(self: push_value c),
1039 NotFromSet(ref b) => go!(self: append_value b),
1040 }
1041 },
1042
1043 //§ attribute-value-(unquoted)-state
1044 states::AttributeValue(Unquoted) => loop {
1045 match pop_except_from!(
1046 self,
1047 input,
1048 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1049 ) {
1050 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1051 go!(self: to BeforeAttributeName)
1052 },
1053 FromSet('&') => go!(self: consume_char_ref '>'),
1054 FromSet('>') => go!(self: emit_tag Data),
1055 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1056 FromSet(c) => {
1057 go_match!(self: c,
1058 '"' , '\'' , '<' , '=' , '`' => error);
1059 go!(self: push_value c);
1060 },
1061 NotFromSet(ref b) => go!(self: append_value b),
1062 }
1063 },
1064
1065 //§ after-attribute-value-(quoted)-state
1066 states::AfterAttributeValueQuoted => loop {
1067 match get_char!(self, input) {
1068 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1069 '/' => go!(self: to SelfClosingStartTag),
1070 '>' => go!(self: emit_tag Data),
1071 _ => go!(self: error; reconsume BeforeAttributeName),
1072 }
1073 },
1074
1075 //§ self-closing-start-tag-state
1076 states::SelfClosingStartTag => loop {
1077 match get_char!(self, input) {
1078 '>' => {
1079 self.current_tag_self_closing = true;
1080 go!(self: emit_tag Data);
1081 },
1082 _ => go!(self: error; reconsume BeforeAttributeName),
1083 }
1084 },
1085
1086 //§ comment-start-state
1087 states::CommentStart => loop {
1088 match get_char!(self, input) {
1089 '-' => go!(self: to CommentStartDash),
1090 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1091 '>' => go!(self: error; emit_comment; to Data),
1092 c => go!(self: push_comment c; to Comment),
1093 }
1094 },
1095
1096 //§ comment-start-dash-state
1097 states::CommentStartDash => loop {
1098 match get_char!(self, input) {
1099 '-' => go!(self: to CommentEnd),
1100 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1101 '>' => go!(self: error; emit_comment; to Data),
1102 c => go!(self: push_comment '-'; push_comment c; to Comment),
1103 }
1104 },
1105
1106 //§ comment-state
1107 states::Comment => loop {
1108 match get_char!(self, input) {
1109 '-' => go!(self: to CommentEndDash),
1110 '\0' => go!(self: error; push_comment '\u{fffd}'),
1111 c => go!(self: push_comment c),
1112 }
1113 },
1114
1115 //§ comment-end-dash-state
1116 states::CommentEndDash => loop {
1117 match get_char!(self, input) {
1118 '-' => go!(self: to CommentEnd),
1119 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1120 c => go!(self: push_comment '-'; push_comment c; to Comment),
1121 }
1122 },
1123
1124 //§ comment-end-state
1125 states::CommentEnd => loop {
1126 match get_char!(self, input) {
1127 '>' => go!(self: emit_comment; to Data),
1128 '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1129 '!' => go!(self: error; to CommentEndBang),
1130 '-' => go!(self: error; push_comment '-'),
1131 c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1132 }
1133 },
1134
1135 //§ comment-end-bang-state
1136 states::CommentEndBang => loop {
1137 match get_char!(self, input) {
1138 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1139 '>' => go!(self: emit_comment; to Data),
1140 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1141 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1142 }
1143 },
1144
1145 //§ doctype-state
1146 states::Doctype => loop {
1147 match get_char!(self, input) {
1148 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1149 _ => go!(self: error; reconsume BeforeDoctypeName),
1150 }
1151 },
1152
1153 //§ before-doctype-name-state
1154 states::BeforeDoctypeName => loop {
1155 match get_char!(self, input) {
1156 '\t' | '\n' | '\x0C' | ' ' => (),
1157 '\0' => {
1158 go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1159 },
1160 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1161 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1162 to DoctypeName),
1163 }
1164 },
1165
1166 //§ doctype-name-state
1167 states::DoctypeName => loop {
1168 match get_char!(self, input) {
1169 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1170 '>' => go!(self: emit_doctype; to Data),
1171 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1172 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1173 }
1174 },
1175
1176 //§ after-doctype-name-state
1177 states::AfterDoctypeName => loop {
1178 if eat!(self, input, "public") {
1179 go!(self: to AfterDoctypeKeyword Public);
1180 } else if eat!(self, input, "system") {
1181 go!(self: to AfterDoctypeKeyword System);
1182 } else {
1183 match get_char!(self, input) {
1184 '\t' | '\n' | '\x0C' | ' ' => (),
1185 '>' => go!(self: emit_doctype; to Data),
1186 _ => go!(self: error; force_quirks; to BogusDoctype),
1187 }
1188 }
1189 },
1190
1191 //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1192 states::AfterDoctypeKeyword(kind) => loop {
1193 match get_char!(self, input) {
1194 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1195 '"' => {
1196 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1197 },
1198 '\'' => {
1199 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1200 },
1201 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1202 _ => go!(self: error; force_quirks; to BogusDoctype),
1203 }
1204 },
1205
1206 //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1207 states::BeforeDoctypeIdentifier(kind) => loop {
1208 match get_char!(self, input) {
1209 '\t' | '\n' | '\x0C' | ' ' => (),
1210 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1211 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1212 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1213 _ => go!(self: error; force_quirks; to BogusDoctype),
1214 }
1215 },
1216
1217 //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1218 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1219 match get_char!(self, input) {
1220 '"' => go!(self: to AfterDoctypeIdentifier kind),
1221 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1222 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1223 c => go!(self: push_doctype_id kind c),
1224 }
1225 },
1226
1227 //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1228 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1229 match get_char!(self, input) {
1230 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1231 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1232 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1233 c => go!(self: push_doctype_id kind c),
1234 }
1235 },
1236
1237 //§ after-doctype-public-identifier-state
1238 states::AfterDoctypeIdentifier(Public) => loop {
1239 match get_char!(self, input) {
1240 '\t' | '\n' | '\x0C' | ' ' => {
1241 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1242 },
1243 '>' => go!(self: emit_doctype; to Data),
1244 '"' => {
1245 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1246 },
1247 '\'' => {
1248 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1249 },
1250 _ => go!(self: error; force_quirks; to BogusDoctype),
1251 }
1252 },
1253
1254 //§ after-doctype-system-identifier-state
1255 states::AfterDoctypeIdentifier(System) => loop {
1256 match get_char!(self, input) {
1257 '\t' | '\n' | '\x0C' | ' ' => (),
1258 '>' => go!(self: emit_doctype; to Data),
1259 _ => go!(self: error; to BogusDoctype),
1260 }
1261 },
1262
1263 //§ between-doctype-public-and-system-identifiers-state
1264 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1265 match get_char!(self, input) {
1266 '\t' | '\n' | '\x0C' | ' ' => (),
1267 '>' => go!(self: emit_doctype; to Data),
1268 '"' => {
1269 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1270 },
1271 '\'' => {
1272 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1273 },
1274 _ => go!(self: error; force_quirks; to BogusDoctype),
1275 }
1276 },
1277
1278 //§ bogus-doctype-state
1279 states::BogusDoctype => loop {
1280 match get_char!(self, input) {
1281 '>' => go!(self: emit_doctype; to Data),
1282 _ => (),
1283 }
1284 },
1285
1286 //§ bogus-comment-state
1287 states::BogusComment => loop {
1288 match get_char!(self, input) {
1289 '>' => go!(self: emit_comment; to Data),
1290 '\0' => go!(self: push_comment '\u{fffd}'),
1291 c => go!(self: push_comment c),
1292 }
1293 },
1294
1295 //§ markup-declaration-open-state
1296 states::MarkupDeclarationOpen => loop {
1297 if eat_exact!(self, input, "--") {
1298 go!(self: clear_comment; to CommentStart);
1299 } else if eat!(self, input, "doctype") {
1300 go!(self: to Doctype);
1301 } else {
1302 if self
1303 .sink
1304 .adjusted_current_node_present_but_not_in_html_namespace()
1305 {
1306 if eat_exact!(self, input, "[CDATA[") {
1307 go!(self: clear_temp; to CdataSection);
1308 }
1309 }
1310 go!(self: error; to BogusComment);
1311 }
1312 },
1313
1314 //§ cdata-section-state
1315 states::CdataSection => loop {
1316 match get_char!(self, input) {
1317 ']' => go!(self: to CdataSectionBracket),
1318 '\0' => go!(self: emit_temp; emit '\0'),
1319 c => go!(self: push_temp c),
1320 }
1321 },
1322
1323 //§ cdata-section-bracket
1324 states::CdataSectionBracket => match get_char!(self, input) {
1325 ']' => go!(self: to CdataSectionEnd),
1326 _ => go!(self: push_temp ']'; reconsume CdataSection),
1327 },
1328
1329 //§ cdata-section-end
1330 states::CdataSectionEnd => loop {
1331 match get_char!(self, input) {
1332 ']' => go!(self: push_temp ']'),
1333 '>' => go!(self: emit_temp; to Data),
1334 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1335 }
1336 },
1337 //§ END
1338 }
1339 }
1340
1341 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1342 // FIXME HACK: Take and replace the tokenizer so we don't
1343 // double-mut-borrow self. This is why it's boxed.
1344 let mut tok = self.char_ref_tokenizer.take().unwrap();
1345 let outcome = tok.step(self, input);
1346
1347 let progress = match outcome {
1348 char_ref::Done => {
1349 self.process_char_ref(tok.get_result());
1350 return ProcessResult::Continue;
1351 },
1352
1353 char_ref::Stuck => ProcessResult::Suspend,
1354 char_ref::Progress => ProcessResult::Continue,
1355 };
1356
1357 self.char_ref_tokenizer = Some(tok);
1358 progress
1359 }
1360
    /// Feed the characters produced by the char-ref sub-tokenizer back into
    /// the state that started the reference (data, RCDATA, or an attribute
    /// value).
    fn process_char_ref(&mut self, char_ref: CharRef) {
        let CharRef {
            mut chars,
            mut num_chars,
        } = char_ref;

        // Zero characters means the '&' did not start a valid character
        // reference; emit the '&' itself as a literal.
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

        for i in 0..num_chars {
            let c = chars[i as usize];
            match self.state {
                // In data/RCDATA contexts the characters become character tokens.
                states::Data | states::RawData(states::Rcdata) => go!(self: emit c),

                // Inside an attribute value they are appended to the value.
                states::AttributeValue(_) => go!(self: push_value c),

                // Char refs are only ever started from the states above.
                _ => panic!(
                    "state {:?} should not be reachable in process_char_ref",
                    self.state
                ),
            }
        }
    }
1386
1387 /// Indicate that we have reached the end of the input.
1388 pub fn end(&mut self) {
1389 // Handle EOF in the char ref sub-tokenizer, if there is one.
1390 // Do this first because it might un-consume stuff.
1391 let mut input = BufferQueue::new();
1392 match self.char_ref_tokenizer.take() {
1393 None => (),
1394 Some(mut tok) => {
1395 tok.end_of_file(self, &mut input);
1396 self.process_char_ref(tok.get_result());
1397 },
1398 }
1399
1400 // Process all remaining buffered input.
1401 // If we're waiting for lookahead, we're not gonna get it.
1402 self.at_eof = true;
1403 assert!(matches!(self.run(&mut input), TokenizerResult::Done));
1404 assert!(input.is_empty());
1405
1406 loop {
1407 match self.eof_step() {
1408 ProcessResult::Continue => (),
1409 ProcessResult::Suspend => break,
1410 ProcessResult::Script(_) => unreachable!(),
1411 }
1412 }
1413
1414 self.sink.end();
1415
1416 if self.opts.profile {
1417 self.dump_profile();
1418 }
1419 }
1420
1421 fn dump_profile(&self) {
1422 let mut results: Vec<(states::State, u64)> =
1423 self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1424 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1425
1426 let total: u64 = results
1427 .iter()
1428 .map(|&(_, t)| t)
1429 .fold(0, ::std::ops::Add::add);
1430 println!("\nTokenizer profile, in nanoseconds");
1431 println!("\n{:12} total in token sink", self.time_in_sink);
1432 println!("\n{:12} total in tokenizer", total);
1433
1434 for (k, v) in results.into_iter() {
1435 let pct = 100.0 * (v as f64) / (total as f64);
1436 println!("{:12} {:4.1}% {:?}", v, pct, k);
1437 }
1438 }
1439
    /// Handle end-of-input for the current state, following the "EOF"
    /// clauses of the HTML tokenization spec. Called in a loop by `end`
    /// until it returns `Suspend`, signalling that wind-down is complete.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // Text-emitting states: emit the EOF token directly.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // EOF in the middle of a tag is a parse error; the partial tag
            // is not emitted, and we return to Data.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // A lone '<' (or '</') at EOF is emitted as literal text.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the buffered (partial) end tag name as text.
            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // EOF inside a comment: error, but still emit what was collected.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // EOF inside a doctype: emit it with force-quirks set.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // EOF in a CDATA section: flush buffered text, then report.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            // Pending ']' characters are restored before reprocessing.
            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1520 }
1521
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use LocalName;

    // LinesMatch is a TokenSink used to check that `current_line` is kept
    // up to date while tokens are processed: `lines` records the line
    // number each (non-character, non-EOF) token was seen on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record a token with its line number, flushing buffered text first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Collapse the accumulated character buffer into one token.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => self.current_str.push_slice(&b),

                NullCharacterToken => self.current_str.push_char('\0'),

                ParseError(_) => panic!("unexpected parse error"),

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        _ => t.attrs.sort_by_key(|a| a.name.clone()),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Feed the chunks through a fresh tokenizer and return the recorded
    // (token, line number) pairs.
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let mut tok = Tokenizer::new(LinesMatch::new(), opts);
        let mut buffer = BufferQueue::new();
        for chunk in input {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Build a TagToken with the given name and kind, and no attributes.
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Options shared by the line-counting tests.
    fn default_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let input = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        assert_eq!(tokenize(input, default_opts()), expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let input = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        assert_eq!(tokenize(input, default_opts()), expected);
    }
}
1710