1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! The HTML5 tokenizer.
11
12 pub use self::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
13 pub use self::interface::{CommentToken, DoctypeToken, TagToken, Token};
14 pub use self::interface::{Doctype, EndTag, StartTag, Tag, TagKind};
15 pub use self::interface::{TokenSink, TokenSinkResult};
16
17 use self::states::{DoctypeIdKind, Public, System};
18 use self::states::{DoubleEscaped, Escaped};
19 use self::states::{DoubleQuoted, SingleQuoted, Unquoted};
20 use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
21
22 use self::char_ref::{CharRef, CharRefTokenizer};
23
24 use crate::util::str::lower_ascii_letter;
25
26 use log::debug;
27 use mac::{format_if, matches, _tt_as_expr_hack};
28 use markup5ever::{namespace_url, ns, small_char_set};
29 use std::borrow::Cow::{self, Borrowed};
30 use std::collections::BTreeMap;
31 use std::default::Default;
32 use std::mem::replace;
33
34 pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
35 use crate::tendril::StrTendril;
36 use crate::{Attribute, LocalName, QualName, SmallCharSet};
37
38 mod char_ref;
39 mod interface;
40 pub mod states;
41
/// Outcome of a single step of the tokenizer state machine.
pub enum ProcessResult<Handle> {
    /// Keep stepping the state machine.
    Continue,
    /// Stop stepping for now (e.g. no more input is available).
    Suspend,
    /// The sink requested script execution; `Handle` identifies the script
    /// node (see `TokenSinkResult::Script`).
    Script(Handle),
}
47
/// Result returned to the caller of `Tokenizer::feed`.
#[must_use]
pub enum TokenizerResult<Handle> {
    /// All currently-available input was processed.
    Done,
    /// Tokenizing paused because a script must be executed before more
    /// input can be consumed.
    Script(Handle),
}
53
option_push(opt_str: &mut Option<StrTendril>, c: char)54 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
55 match *opt_str {
56 Some(ref mut s) => s.push_char(c),
57 None => *opt_str = Some(StrTendril::from_char(c)),
58 }
59 }
60
/// Tokenizer options, with an impl for `Default`.
#[derive(Clone)]
pub struct TokenizerOpts {
    /// Report all parse errors described in the spec, at some
    /// performance penalty? Default: false
    pub exact_errors: bool,

    /// Discard a `U+FEFF BYTE ORDER MARK` if we see one at the beginning
    /// of the stream? Default: true
    pub discard_bom: bool,

    /// Keep a record of how long we spent in each state? Printed
    /// when `end()` is called. Default: false
    pub profile: bool,

    /// Initial state override. Only the test runner should use
    /// a non-`None` value!
    pub initial_state: Option<states::State>,

    /// Last start tag. Only the test runner should use a
    /// non-`None` value!
    ///
    /// FIXME: Can't use Tendril because we want TokenizerOpts
    /// to be Send.
    pub last_start_tag_name: Option<String>,
}
87
88 impl Default for TokenizerOpts {
default() -> TokenizerOpts89 fn default() -> TokenizerOpts {
90 TokenizerOpts {
91 exact_errors: false,
92 discard_bom: true,
93 profile: false,
94 initial_state: None,
95 last_start_tag_name: None,
96 }
97 }
98 }
99
/// The HTML tokenizer.
pub struct Tokenizer<Sink> {
    /// Options controlling the behavior of the tokenizer.
    opts: TokenizerOpts,

    /// Destination for tokens we emit.
    pub sink: Sink,

    /// The abstract machine state as described in the spec.
    state: states::State,

    /// Are we at the end of the file, once buffers have been processed
    /// completely? This affects whether we will wait for lookahead or not.
    at_eof: bool,

    /// Tokenizer for character references, if we're tokenizing
    /// one at the moment.
    char_ref_tokenizer: Option<Box<CharRefTokenizer>>,

    /// Current input character. Just consumed, may reconsume.
    current_char: char,

    /// Should we reconsume the current input character?
    reconsume: bool,

    /// Did we just consume \r, translating it to \n? In that case we need
    /// to ignore the next character if it's \n.
    ignore_lf: bool,

    /// Discard a U+FEFF BYTE ORDER MARK if we see one? Only done at the
    /// beginning of the stream.
    discard_bom: bool,

    /// Current tag kind (start or end tag).
    current_tag_kind: TagKind,

    /// Current tag name, accumulated while in the tag-name states.
    current_tag_name: StrTendril,

    /// Current tag is self-closing?
    current_tag_self_closing: bool,

    /// Current tag attributes, committed one at a time by `finish_attribute`.
    current_tag_attrs: Vec<Attribute>,

    /// Current attribute name being accumulated.
    current_attr_name: StrTendril,

    /// Current attribute value being accumulated.
    current_attr_value: StrTendril,

    /// Current comment being accumulated.
    current_comment: StrTendril,

    /// Current doctype token being accumulated.
    current_doctype: Doctype,

    /// Last start tag name, for use in checking "appropriate end tag".
    last_start_tag_name: Option<LocalName>,

    /// The "temporary buffer" mentioned in the spec.
    temp_buf: StrTendril,

    /// Record of how many ns we spent in each state, if profiling is enabled.
    state_profile: BTreeMap<states::State, u64>,

    /// Record of how many ns we spent in the token sink.
    time_in_sink: u64,

    /// Track current line, for error reporting and token attribution.
    current_line: u64,
}
172
173 impl<Sink: TokenSink> Tokenizer<Sink> {
174 /// Create a new tokenizer which feeds tokens to a particular `TokenSink`.
new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink>175 pub fn new(sink: Sink, mut opts: TokenizerOpts) -> Tokenizer<Sink> {
176 let start_tag_name = opts
177 .last_start_tag_name
178 .take()
179 .map(|s| LocalName::from(&*s));
180 let state = opts.initial_state.unwrap_or(states::Data);
181 let discard_bom = opts.discard_bom;
182 Tokenizer {
183 opts: opts,
184 sink: sink,
185 state: state,
186 char_ref_tokenizer: None,
187 at_eof: false,
188 current_char: '\0',
189 reconsume: false,
190 ignore_lf: false,
191 discard_bom: discard_bom,
192 current_tag_kind: StartTag,
193 current_tag_name: StrTendril::new(),
194 current_tag_self_closing: false,
195 current_tag_attrs: vec![],
196 current_attr_name: StrTendril::new(),
197 current_attr_value: StrTendril::new(),
198 current_comment: StrTendril::new(),
199 current_doctype: Doctype::new(),
200 last_start_tag_name: start_tag_name,
201 temp_buf: StrTendril::new(),
202 state_profile: BTreeMap::new(),
203 time_in_sink: 0,
204 current_line: 1,
205 }
206 }
207
208 /// Feed an input string into the tokenizer.
feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>209 pub fn feed(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
210 if input.is_empty() {
211 return TokenizerResult::Done;
212 }
213
214 if self.discard_bom {
215 if let Some(c) = input.peek() {
216 if c == '\u{feff}' {
217 input.next();
218 }
219 } else {
220 return TokenizerResult::Done;
221 }
222 };
223
224 self.run(input)
225 }
226
    /// Switch the tokenizer into the PLAINTEXT state, in which all remaining
    /// input is emitted as character data (no further markup is recognized).
    pub fn set_plaintext_state(&mut self) {
        self.state = states::Plaintext;
    }
230
    /// Pass a token to the sink, tracking time spent in the sink when
    /// profiling is enabled so `run` can subtract it from per-state totals.
    fn process_token(&mut self, token: Token) -> TokenSinkResult<Sink::Handle> {
        if self.opts.profile {
            // `time!` yields the call's return value plus elapsed ns.
            let (ret, dt) = time!(self.sink.process_token(token, self.current_line));
            self.time_in_sink += dt;
            ret
        } else {
            self.sink.process_token(token, self.current_line)
        }
    }
240
    /// Pass a token to the sink, asserting that the sink replies `Continue`.
    /// Used for tokens that must not trigger a tokenizer state change.
    fn process_token_and_continue(&mut self, token: Token) {
        assert!(matches!(
            self.process_token(token),
            TokenSinkResult::Continue
        ));
    }
247
248 //§ preprocessing-the-input-stream
    // Get the next input character, which might be the character
    // 'c' that we already consumed from the buffers.
    //
    // Performs input-stream preprocessing: CR and CRLF are normalized to LF,
    // the line counter is advanced, and (with `exact_errors`) characters the
    // spec flags as parse errors are reported.
    fn get_preprocessed_char(&mut self, mut c: char, input: &mut BufferQueue) -> Option<char> {
        if self.ignore_lf {
            // The previous char was \r (already delivered as \n); swallow a
            // directly-following \n.
            self.ignore_lf = false;
            if c == '\n' {
                c = unwrap_or_return!(input.next(), None);
            }
        }

        if c == '\r' {
            // Deliver \n now; remember to suppress a following \n.
            self.ignore_lf = true;
            c = '\n';
        }

        if c == '\n' {
            self.current_line += 1;
        }

        if self.opts.exact_errors &&
            match c as u32 {
                // Control characters and the U+FDD0..U+FDEF noncharacters.
                0x01..=0x08 | 0x0B | 0x0E..=0x1F | 0x7F..=0x9F | 0xFDD0..=0xFDEF => true,
                // The U+xxFFFE / U+xxFFFF noncharacters in every plane.
                n if (n & 0xFFFE) == 0xFFFE => true,
                _ => false,
            }
        {
            let msg = format!("Bad character {}", c);
            self.emit_error(Cow::Owned(msg));
        }

        debug!("got character {}", c);
        self.current_char = c;
        Some(c)
    }
283
284 //§ tokenization
285 // Get the next input character, if one is available.
get_char(&mut self, input: &mut BufferQueue) -> Option<char>286 fn get_char(&mut self, input: &mut BufferQueue) -> Option<char> {
287 if self.reconsume {
288 self.reconsume = false;
289 Some(self.current_char)
290 } else {
291 input
292 .next()
293 .and_then(|c| self.get_preprocessed_char(c, input))
294 }
295 }
296
    // Pop either a single character from `set`, or a run of characters not in
    // `set`, whichever comes first; None means the input is exhausted.
    fn pop_except_from(&mut self, input: &mut BufferQueue, set: SmallCharSet) -> Option<SetResult> {
        // Bail to the slow path for various corner cases.
        // This means that `FromSet` can contain characters not in the set!
        // It shouldn't matter because the fallback `FromSet` case should
        // always do the same thing as the `NotFromSet` case.
        if self.opts.exact_errors || self.reconsume || self.ignore_lf {
            return self.get_char(input).map(|x| FromSet(x));
        }

        let d = input.pop_except_from(set);
        debug!("got characters {:?}", d);
        match d {
            // Single set characters still go through preprocessing.
            Some(FromSet(c)) => self.get_preprocessed_char(c, input).map(|x| FromSet(x)),

            // NB: We don't set self.current_char for a run of characters not
            // in the set. It shouldn't matter for the codepaths that use
            // this.
            _ => d,
        }
    }
317
    // Check if the next characters are an ASCII case-insensitive match. See
    // BufferQueue::eat.
    //
    // NB: this doesn't do input stream preprocessing or set the current input
    // character.
    //
    // Returns Some(matched) once there is enough input to decide, or None
    // (after stashing the pending input in temp_buf) when more input is
    // needed; a later call re-queues the stash and retries.
    fn eat(
        &mut self,
        input: &mut BufferQueue,
        pat: &str,
        eq: fn(&u8, &u8) -> bool,
    ) -> Option<bool> {
        // Put back anything stashed by a previous partial attempt.
        input.push_front(replace(&mut self.temp_buf, StrTendril::new()));
        match input.eat(pat, eq) {
            // At EOF there will never be more input: the pattern can't match.
            None if self.at_eof => Some(false),
            None => {
                // Undecidable yet: stash what we have and wait for more.
                while let Some(c) = input.next() {
                    self.temp_buf.push_char(c);
                }
                None
            },
            Some(matched) => Some(matched),
        }
    }
341
342 /// Run the state machine for as long as we can.
run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle>343 fn run(&mut self, input: &mut BufferQueue) -> TokenizerResult<Sink::Handle> {
344 if self.opts.profile {
345 loop {
346 let state = self.state;
347 let old_sink = self.time_in_sink;
348 let (run, mut dt) = time!(self.step(input));
349 dt -= (self.time_in_sink - old_sink);
350 let new = match self.state_profile.get_mut(&state) {
351 Some(x) => {
352 *x += dt;
353 false
354 },
355 None => true,
356 };
357 if new {
358 // do this here because of borrow shenanigans
359 self.state_profile.insert(state, dt);
360 }
361 match run {
362 ProcessResult::Continue => (),
363 ProcessResult::Suspend => break,
364 ProcessResult::Script(node) => return TokenizerResult::Script(node),
365 }
366 }
367 } else {
368 loop {
369 match self.step(input) {
370 ProcessResult::Continue => (),
371 ProcessResult::Suspend => break,
372 ProcessResult::Script(node) => return TokenizerResult::Script(node),
373 }
374 }
375 }
376 TokenizerResult::Done
377 }
378
    /// Report a parse error for the current input character. With
    /// `exact_errors` the message includes the character and state.
    fn bad_char_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Bad character",
            "Saw {} in state {:?}",
            self.current_char,
            self.state
        );
        self.emit_error(msg);
    }
389
    /// Report a parse error for end-of-file in the current state. With
    /// `exact_errors` the message includes the state.
    fn bad_eof_error(&mut self) {
        let msg = format_if!(
            self.opts.exact_errors,
            "Unexpected EOF",
            "Saw EOF in state {:?}",
            self.state
        );
        self.emit_error(msg);
    }
399
emit_char(&mut self, c: char)400 fn emit_char(&mut self, c: char) {
401 self.process_token_and_continue(match c {
402 '\0' => NullCharacterToken,
403 _ => CharacterTokens(StrTendril::from_char(c)),
404 });
405 }
406
    // Emit a run of characters at once. The string must not contain '\0'!
    // (NUL must go through emit_char to become a NullCharacterToken.)
    fn emit_chars(&mut self, b: StrTendril) {
        self.process_token_and_continue(CharacterTokens(b));
    }
411
    /// Finish and emit the tag currently being built, clearing the per-tag
    /// accumulation state. Returns how the state machine should proceed,
    /// since the sink may redirect the tokenizer.
    fn emit_current_tag(&mut self) -> ProcessResult<Sink::Handle> {
        // Commit any attribute still being accumulated.
        self.finish_attribute();

        let name = LocalName::from(&*self.current_tag_name);
        self.current_tag_name.clear();

        match self.current_tag_kind {
            StartTag => {
                // Remember the name for "appropriate end tag" checks.
                self.last_start_tag_name = Some(name.clone());
            },
            EndTag => {
                // Attributes and self-closing flags are parse errors on
                // end tags; the token is still emitted.
                if !self.current_tag_attrs.is_empty() {
                    self.emit_error(Borrowed("Attributes on an end tag"));
                }
                if self.current_tag_self_closing {
                    self.emit_error(Borrowed("Self-closing end tag"));
                }
            },
        }

        let token = TagToken(Tag {
            kind: self.current_tag_kind,
            name: name,
            self_closing: self.current_tag_self_closing,
            attrs: replace(&mut self.current_tag_attrs, vec![]),
        });

        // The sink can override the default next state (e.g. switch to
        // PLAINTEXT or a raw-text state, or hand back a script to run).
        match self.process_token(token) {
            TokenSinkResult::Continue => ProcessResult::Continue,
            TokenSinkResult::Plaintext => {
                self.state = states::Plaintext;
                ProcessResult::Continue
            },
            TokenSinkResult::Script(node) => {
                self.state = states::Data;
                ProcessResult::Script(node)
            },
            TokenSinkResult::RawData(kind) => {
                self.state = states::RawData(kind);
                ProcessResult::Continue
            },
        }
    }
455
    /// Emit the temporary buffer as character tokens and leave it empty.
    fn emit_temp_buf(&mut self) {
        // FIXME: Make sure that clearing on emit is spec-compatible.
        let buf = replace(&mut self.temp_buf, StrTendril::new());
        self.emit_chars(buf);
    }
461
    /// Empty the temporary buffer in place.
    fn clear_temp_buf(&mut self) {
        // Do this without a new allocation.
        self.temp_buf.clear();
    }
466
    /// Emit the comment being accumulated and reset the accumulator.
    fn emit_current_comment(&mut self) {
        let comment = replace(&mut self.current_comment, StrTendril::new());
        self.process_token_and_continue(CommentToken(comment));
    }
471
discard_tag(&mut self)472 fn discard_tag(&mut self) {
473 self.current_tag_name.clear();
474 self.current_tag_self_closing = false;
475 self.current_tag_attrs = vec![];
476 }
477
    /// Begin a new tag of the given kind whose name starts with `c`,
    /// discarding any partially-built tag first.
    fn create_tag(&mut self, kind: TagKind, c: char) {
        self.discard_tag();
        self.current_tag_name.push_char(c);
        self.current_tag_kind = kind;
    }
483
have_appropriate_end_tag(&self) -> bool484 fn have_appropriate_end_tag(&self) -> bool {
485 match self.last_start_tag_name.as_ref() {
486 Some(last) => (self.current_tag_kind == EndTag) && (*self.current_tag_name == **last),
487 None => false,
488 }
489 }
490
    /// Start a new attribute whose name begins with `c`, committing any
    /// attribute already in progress first.
    fn create_attribute(&mut self, c: char) {
        self.finish_attribute();

        self.current_attr_name.push_char(c);
    }
496
finish_attribute(&mut self)497 fn finish_attribute(&mut self) {
498 if self.current_attr_name.len() == 0 {
499 return;
500 }
501
502 // Check for a duplicate attribute.
503 // FIXME: the spec says we should error as soon as the name is finished.
504 // FIXME: linear time search, do we care?
505 let dup = {
506 let name = &*self.current_attr_name;
507 self.current_tag_attrs
508 .iter()
509 .any(|a| &*a.name.local == name)
510 };
511
512 if dup {
513 self.emit_error(Borrowed("Duplicate attribute"));
514 self.current_attr_name.clear();
515 self.current_attr_value.clear();
516 } else {
517 let name = LocalName::from(&*self.current_attr_name);
518 self.current_attr_name.clear();
519 self.current_tag_attrs.push(Attribute {
520 // The tree builder will adjust the namespace if necessary.
521 // This only happens in foreign elements.
522 name: QualName::new(None, ns!(), name),
523 value: replace(&mut self.current_attr_value, StrTendril::new()),
524 });
525 }
526 }
527
    /// Emit the doctype being accumulated and reset the accumulator.
    fn emit_current_doctype(&mut self) {
        let doctype = replace(&mut self.current_doctype, Doctype::new());
        self.process_token_and_continue(DoctypeToken(doctype));
    }
532
doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril>533 fn doctype_id<'a>(&'a mut self, kind: DoctypeIdKind) -> &'a mut Option<StrTendril> {
534 match kind {
535 Public => &mut self.current_doctype.public_id,
536 System => &mut self.current_doctype.system_id,
537 }
538 }
539
clear_doctype_id(&mut self, kind: DoctypeIdKind)540 fn clear_doctype_id(&mut self, kind: DoctypeIdKind) {
541 let id = self.doctype_id(kind);
542 match *id {
543 Some(ref mut s) => s.clear(),
544 None => *id = Some(StrTendril::new()),
545 }
546 }
547
    /// Start tokenizing a character reference; `step` will delegate to the
    /// sub-tokenizer until it finishes.
    fn consume_char_ref(&mut self, addnl_allowed: Option<char>) {
        // NB: The char ref tokenizer assumes we have an additional allowed
        // character iff we're tokenizing in an attribute value.
        self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed)));
    }
553
    /// Emit the end-of-file token to the sink.
    fn emit_eof(&mut self) {
        self.process_token_and_continue(EOFToken);
    }
557
peek(&mut self, input: &BufferQueue) -> Option<char>558 fn peek(&mut self, input: &BufferQueue) -> Option<char> {
559 if self.reconsume {
560 Some(self.current_char)
561 } else {
562 input.peek()
563 }
564 }
565
    /// Consume and discard the next input character (the return value is
    /// deliberately ignored; preprocessing side effects still occur).
    fn discard_char(&mut self, input: &mut BufferQueue) {
        self.get_char(input);
    }
569
    /// Report a parse error to the sink as a `ParseError` token.
    fn emit_error(&mut self, error: Cow<'static, str>) {
        self.process_token_and_continue(ParseError(error));
    }
573 }
574 //§ END
575
// Shorthand for common state machine behaviors. Each arm maps a short
// command (invoked as `shorthand!(self: cmd args)`) onto the corresponding
// Tokenizer method call or field mutation.
macro_rules! shorthand (
    ( $me:ident : emit $c:expr ) => ( $me.emit_char($c); );
    ( $me:ident : create_tag $kind:ident $c:expr ) => ( $me.create_tag($kind, $c); );
    ( $me:ident : push_tag $c:expr ) => ( $me.current_tag_name.push_char($c); );
    ( $me:ident : discard_tag ) => ( $me.discard_tag(); );
    ( $me:ident : discard_char $input:expr ) => ( $me.discard_char($input); );
    ( $me:ident : push_temp $c:expr ) => ( $me.temp_buf.push_char($c); );
    ( $me:ident : emit_temp ) => ( $me.emit_temp_buf(); );
    ( $me:ident : clear_temp ) => ( $me.clear_temp_buf(); );
    ( $me:ident : create_attr $c:expr ) => ( $me.create_attribute($c); );
    ( $me:ident : push_name $c:expr ) => ( $me.current_attr_name.push_char($c); );
    ( $me:ident : push_value $c:expr ) => ( $me.current_attr_value.push_char($c); );
    ( $me:ident : append_value $c:expr ) => ( $me.current_attr_value.push_tendril($c); );
    ( $me:ident : push_comment $c:expr ) => ( $me.current_comment.push_char($c); );
    ( $me:ident : append_comment $c:expr ) => ( $me.current_comment.push_slice($c); );
    ( $me:ident : emit_comment ) => ( $me.emit_current_comment(); );
    ( $me:ident : clear_comment ) => ( $me.current_comment.clear(); );
    ( $me:ident : create_doctype ) => ( $me.current_doctype = Doctype::new(); );
    ( $me:ident : push_doctype_name $c:expr ) => ( option_push(&mut $me.current_doctype.name, $c); );
    ( $me:ident : push_doctype_id $k:ident $c:expr ) => ( option_push($me.doctype_id($k), $c); );
    ( $me:ident : clear_doctype_id $k:ident ) => ( $me.clear_doctype_id($k); );
    ( $me:ident : force_quirks ) => ( $me.current_doctype.force_quirks = true; );
    ( $me:ident : emit_doctype ) => ( $me.emit_current_doctype(); );
    ( $me:ident : error ) => ( $me.bad_char_error(); );
    ( $me:ident : error_eof ) => ( $me.bad_eof_error(); );
);
603
// Tracing of tokenizer actions. This adds significant bloat and compile time,
// so it's behind a cfg flag.
#[cfg(trace_tokenizer)]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ({
    // Log the shorthand command sequence, then execute it.
    // (Fragment specifiers like `:expr` are only legal in a macro matcher,
    // not in an invocation, and `{:s}` is a pre-1.0 format specifier; the
    // previous body used both and could not compile when this cfg was on.)
    debug!("  {}", stringify!($($cmds)*));
    shorthand!($me: $($cmds)*);
}));

#[cfg(not(trace_tokenizer))]
macro_rules! sh_trace ( ( $me:ident : $($cmds:tt)* ) => ( shorthand!($me: $($cmds)*) ) );
614
// A little DSL for sequencing shorthand actions. Commands separated by `;`
// run in order; the trailing command may be a control-flow form (`to`,
// `reconsume`, `emit_tag`, `eof`, ...) that early-returns from `step`.
macro_rules! go (
    // A pattern like $($cmd:tt)* ; $($rest:tt)* causes parse ambiguity.
    // We have to tell the parser how much lookahead we need.

    ( $me:ident : $a:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c); go!($me: $($rest)*); });
    ( $me:ident : $a:tt $b:tt $c:tt $d:tt ; $($rest:tt)* ) => ({ sh_trace!($me: $a $b $c $d); go!($me: $($rest)*); });

    // These can only come at the end.

    // State transitions: plain state, one-argument state, nested-kind state.
    ( $me:ident : to $s:ident ) => ({ $me.state = states::$s; return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:expr ) => ({ $me.state = states::$s($k1); return ProcessResult::Continue; });
    ( $me:ident : to $s:ident $k1:ident $k2:expr ) => ({ $me.state = states::$s($k1($k2)); return ProcessResult::Continue; });

    // Same, but marking the current character for reconsumption.
    ( $me:ident : reconsume $s:ident ) => ({ $me.reconsume = true; go!($me: to $s); });
    ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); });
    ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); });

    ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; });
    ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; });

    // We have a default next state after emitting a tag, but the sink can override.
    ( $me:ident : emit_tag $s:ident ) => ({
        $me.state = states::$s;
        return $me.emit_current_tag();
    });

    ( $me:ident : eof ) => ({ $me.emit_eof(); return ProcessResult::Suspend; });

    // If nothing else matched, it's a single command
    ( $me:ident : $($cmd:tt)+ ) => ( sh_trace!($me: $($cmd)+); );

    // or nothing.
    ( $me:ident : ) => (());
);
652
// Run the `go!` commands only when `$x` matches one of the given patterns;
// otherwise do nothing.
macro_rules! go_match ( ( $me:ident : $x:expr, $($pats:pat),+ => $($cmds:tt)* ) => (
    match $x {
        $($pats)|+ => go!($me: $($cmds)*),
        _ => (),
    }
));
659
// This is a macro because it can cause early return
// from the function where it is used.
macro_rules! get_char ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.get_char($input), ProcessResult::Suspend)
));

// Like get_char, but without consuming; suspends on empty input.
macro_rules! peek ( ($me:expr, $input:expr) => (
    unwrap_or_return!($me.peek($input), ProcessResult::Suspend)
));

// Fast-path character fetch (see Tokenizer::pop_except_from); suspends
// on empty input.
macro_rules! pop_except_from ( ($me:expr, $input:expr, $set:expr) => (
    unwrap_or_return!($me.pop_except_from($input, $set), ProcessResult::Suspend)
));

// ASCII case-insensitive lookahead match; suspends until enough input
// is available to decide.
macro_rules! eat ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq_ignore_ascii_case), ProcessResult::Suspend)
));

// Exact (case-sensitive) lookahead match; suspends until decidable.
macro_rules! eat_exact ( ($me:expr, $input:expr, $pat:expr) => (
    unwrap_or_return!($me.eat($input, $pat, u8::eq), ProcessResult::Suspend)
));
681
682 impl<Sink: TokenSink> Tokenizer<Sink> {
683 // Run the state machine for a while.
684 // Return true if we should be immediately re-invoked
685 // (this just simplifies control flow vs. break / continue).
step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle>686 fn step(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
687 if self.char_ref_tokenizer.is_some() {
688 return self.step_char_ref_tokenizer(input);
689 }
690
691 debug!("processing in state {:?}", self.state);
692 match self.state {
693 //§ data-state
694 states::Data => loop {
695 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
696 FromSet('\0') => go!(self: error; emit '\0'),
697 FromSet('&') => go!(self: consume_char_ref),
698 FromSet('<') => go!(self: to TagOpen),
699 FromSet(c) => go!(self: emit c),
700 NotFromSet(b) => self.emit_chars(b),
701 }
702 },
703
704 //§ rcdata-state
705 states::RawData(Rcdata) => loop {
706 match pop_except_from!(self, input, small_char_set!('\r' '\0' '&' '<' '\n')) {
707 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
708 FromSet('&') => go!(self: consume_char_ref),
709 FromSet('<') => go!(self: to RawLessThanSign Rcdata),
710 FromSet(c) => go!(self: emit c),
711 NotFromSet(b) => self.emit_chars(b),
712 }
713 },
714
715 //§ rawtext-state
716 states::RawData(Rawtext) => loop {
717 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
718 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
719 FromSet('<') => go!(self: to RawLessThanSign Rawtext),
720 FromSet(c) => go!(self: emit c),
721 NotFromSet(b) => self.emit_chars(b),
722 }
723 },
724
725 //§ script-data-state
726 states::RawData(ScriptData) => loop {
727 match pop_except_from!(self, input, small_char_set!('\r' '\0' '<' '\n')) {
728 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
729 FromSet('<') => go!(self: to RawLessThanSign ScriptData),
730 FromSet(c) => go!(self: emit c),
731 NotFromSet(b) => self.emit_chars(b),
732 }
733 },
734
735 //§ script-data-escaped-state
736 states::RawData(ScriptDataEscaped(Escaped)) => loop {
737 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
738 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
739 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash Escaped),
740 FromSet('<') => go!(self: to RawLessThanSign ScriptDataEscaped Escaped),
741 FromSet(c) => go!(self: emit c),
742 NotFromSet(b) => self.emit_chars(b),
743 }
744 },
745
746 //§ script-data-double-escaped-state
747 states::RawData(ScriptDataEscaped(DoubleEscaped)) => loop {
748 match pop_except_from!(self, input, small_char_set!('\r' '\0' '-' '<' '\n')) {
749 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
750 FromSet('-') => go!(self: emit '-'; to ScriptDataEscapedDash DoubleEscaped),
751 FromSet('<') => {
752 go!(self: emit '<'; to RawLessThanSign ScriptDataEscaped DoubleEscaped)
753 },
754 FromSet(c) => go!(self: emit c),
755 NotFromSet(b) => self.emit_chars(b),
756 }
757 },
758
759 //§ plaintext-state
760 states::Plaintext => loop {
761 match pop_except_from!(self, input, small_char_set!('\r' '\0' '\n')) {
762 FromSet('\0') => go!(self: error; emit '\u{fffd}'),
763 FromSet(c) => go!(self: emit c),
764 NotFromSet(b) => self.emit_chars(b),
765 }
766 },
767
768 //§ tag-open-state
769 states::TagOpen => loop {
770 match get_char!(self, input) {
771 '!' => go!(self: clear_temp; to MarkupDeclarationOpen),
772 '/' => go!(self: to EndTagOpen),
773 '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment),
774 c => match lower_ascii_letter(c) {
775 Some(cl) => go!(self: create_tag StartTag cl; to TagName),
776 None => go!(self: error; emit '<'; reconsume Data),
777 },
778 }
779 },
780
781 //§ end-tag-open-state
782 states::EndTagOpen => loop {
783 match get_char!(self, input) {
784 '>' => go!(self: error; to Data),
785 '\0' => {
786 go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment)
787 },
788 c => match lower_ascii_letter(c) {
789 Some(cl) => go!(self: create_tag EndTag cl; to TagName),
790 None => go!(self: error; clear_comment; push_comment c; to BogusComment),
791 },
792 }
793 },
794
795 //§ tag-name-state
796 states::TagName => loop {
797 match get_char!(self, input) {
798 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
799 '/' => go!(self: to SelfClosingStartTag),
800 '>' => go!(self: emit_tag Data),
801 '\0' => go!(self: error; push_tag '\u{fffd}'),
802 c => go!(self: push_tag (c.to_ascii_lowercase())),
803 }
804 },
805
806 //§ script-data-escaped-less-than-sign-state
807 states::RawLessThanSign(ScriptDataEscaped(Escaped)) => loop {
808 match get_char!(self, input) {
809 '/' => go!(self: clear_temp; to RawEndTagOpen ScriptDataEscaped Escaped),
810 c => match lower_ascii_letter(c) {
811 Some(cl) => go!(self: clear_temp; push_temp cl; emit '<'; emit c;
812 to ScriptDataEscapeStart DoubleEscaped),
813 None => go!(self: emit '<'; reconsume RawData ScriptDataEscaped Escaped),
814 },
815 }
816 },
817
818 //§ script-data-double-escaped-less-than-sign-state
819 states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => loop {
820 match get_char!(self, input) {
821 '/' => go!(self: clear_temp; emit '/'; to ScriptDataDoubleEscapeEnd),
822 _ => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
823 }
824 },
825
826 //§ rcdata-less-than-sign-state rawtext-less-than-sign-state script-data-less-than-sign-state
827 // otherwise
828 states::RawLessThanSign(kind) => loop {
829 match get_char!(self, input) {
830 '/' => go!(self: clear_temp; to RawEndTagOpen kind),
831 '!' if kind == ScriptData => {
832 go!(self: emit '<'; emit '!'; to ScriptDataEscapeStart Escaped)
833 },
834 _ => go!(self: emit '<'; reconsume RawData kind),
835 }
836 },
837
838 //§ rcdata-end-tag-open-state rawtext-end-tag-open-state script-data-end-tag-open-state script-data-escaped-end-tag-open-state
839 states::RawEndTagOpen(kind) => loop {
840 let c = get_char!(self, input);
841 match lower_ascii_letter(c) {
842 Some(cl) => go!(self: create_tag EndTag cl; push_temp c; to RawEndTagName kind),
843 None => go!(self: emit '<'; emit '/'; reconsume RawData kind),
844 }
845 },
846
847 //§ rcdata-end-tag-name-state rawtext-end-tag-name-state script-data-end-tag-name-state script-data-escaped-end-tag-name-state
848 states::RawEndTagName(kind) => loop {
849 let c = get_char!(self, input);
850 if self.have_appropriate_end_tag() {
851 match c {
852 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
853 '/' => go!(self: to SelfClosingStartTag),
854 '>' => go!(self: emit_tag Data),
855 _ => (),
856 }
857 }
858
859 match lower_ascii_letter(c) {
860 Some(cl) => go!(self: push_tag cl; push_temp c),
861 None => {
862 go!(self: discard_tag; emit '<'; emit '/'; emit_temp; reconsume RawData kind)
863 },
864 }
865 },
866
867 //§ script-data-double-escape-start-state
868 states::ScriptDataEscapeStart(DoubleEscaped) => loop {
869 let c = get_char!(self, input);
870 match c {
871 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
872 let esc = if &*self.temp_buf == "script" {
873 DoubleEscaped
874 } else {
875 Escaped
876 };
877 go!(self: emit c; to RawData ScriptDataEscaped esc);
878 },
879 _ => match lower_ascii_letter(c) {
880 Some(cl) => go!(self: push_temp cl; emit c),
881 None => go!(self: reconsume RawData ScriptDataEscaped Escaped),
882 },
883 }
884 },
885
886 //§ script-data-escape-start-state
887 states::ScriptDataEscapeStart(Escaped) => loop {
888 match get_char!(self, input) {
889 '-' => go!(self: emit '-'; to ScriptDataEscapeStartDash),
890 _ => go!(self: reconsume RawData ScriptData),
891 }
892 },
893
894 //§ script-data-escape-start-dash-state
895 states::ScriptDataEscapeStartDash => loop {
896 match get_char!(self, input) {
897 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash Escaped),
898 _ => go!(self: reconsume RawData ScriptData),
899 }
900 },
901
902 //§ script-data-escaped-dash-state script-data-double-escaped-dash-state
903 states::ScriptDataEscapedDash(kind) => loop {
904 match get_char!(self, input) {
905 '-' => go!(self: emit '-'; to ScriptDataEscapedDashDash kind),
906 '<' => {
907 if kind == DoubleEscaped {
908 go!(self: emit '<');
909 }
910 go!(self: to RawLessThanSign ScriptDataEscaped kind);
911 },
912 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
913 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
914 }
915 },
916
917 //§ script-data-escaped-dash-dash-state script-data-double-escaped-dash-dash-state
918 states::ScriptDataEscapedDashDash(kind) => loop {
919 match get_char!(self, input) {
920 '-' => go!(self: emit '-'),
921 '<' => {
922 if kind == DoubleEscaped {
923 go!(self: emit '<');
924 }
925 go!(self: to RawLessThanSign ScriptDataEscaped kind);
926 },
927 '>' => go!(self: emit '>'; to RawData ScriptData),
928 '\0' => go!(self: error; emit '\u{fffd}'; to RawData ScriptDataEscaped kind),
929 c => go!(self: emit c; to RawData ScriptDataEscaped kind),
930 }
931 },
932
933 //§ script-data-double-escape-end-state
934 states::ScriptDataDoubleEscapeEnd => loop {
935 let c = get_char!(self, input);
936 match c {
937 '\t' | '\n' | '\x0C' | ' ' | '/' | '>' => {
938 let esc = if &*self.temp_buf == "script" {
939 Escaped
940 } else {
941 DoubleEscaped
942 };
943 go!(self: emit c; to RawData ScriptDataEscaped esc);
944 },
945 _ => match lower_ascii_letter(c) {
946 Some(cl) => go!(self: push_temp cl; emit c),
947 None => go!(self: reconsume RawData ScriptDataEscaped DoubleEscaped),
948 },
949 }
950 },
951
952 //§ before-attribute-name-state
953 states::BeforeAttributeName => loop {
954 match get_char!(self, input) {
955 '\t' | '\n' | '\x0C' | ' ' => (),
956 '/' => go!(self: to SelfClosingStartTag),
957 '>' => go!(self: emit_tag Data),
958 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
959 c => match lower_ascii_letter(c) {
960 Some(cl) => go!(self: create_attr cl; to AttributeName),
961 None => {
962 go_match!(self: c,
963 '"' , '\'' , '<' , '=' => error);
964 go!(self: create_attr c; to AttributeName);
965 },
966 },
967 }
968 },
969
970 //§ attribute-name-state
971 states::AttributeName => loop {
972 match get_char!(self, input) {
973 '\t' | '\n' | '\x0C' | ' ' => go!(self: to AfterAttributeName),
974 '/' => go!(self: to SelfClosingStartTag),
975 '=' => go!(self: to BeforeAttributeValue),
976 '>' => go!(self: emit_tag Data),
977 '\0' => go!(self: error; push_name '\u{fffd}'),
978 c => match lower_ascii_letter(c) {
979 Some(cl) => go!(self: push_name cl),
980 None => {
981 go_match!(self: c,
982 '"' , '\'' , '<' => error);
983 go!(self: push_name c);
984 },
985 },
986 }
987 },
988
989 //§ after-attribute-name-state
990 states::AfterAttributeName => loop {
991 match get_char!(self, input) {
992 '\t' | '\n' | '\x0C' | ' ' => (),
993 '/' => go!(self: to SelfClosingStartTag),
994 '=' => go!(self: to BeforeAttributeValue),
995 '>' => go!(self: emit_tag Data),
996 '\0' => go!(self: error; create_attr '\u{fffd}'; to AttributeName),
997 c => match lower_ascii_letter(c) {
998 Some(cl) => go!(self: create_attr cl; to AttributeName),
999 None => {
1000 go_match!(self: c,
1001 '"' , '\'' , '<' => error);
1002 go!(self: create_attr c; to AttributeName);
1003 },
1004 },
1005 }
1006 },
1007
1008 //§ before-attribute-value-state
1009 // Use peek so we can handle the first attr character along with the rest,
1010 // hopefully in the same zero-copy buffer.
1011 states::BeforeAttributeValue => loop {
1012 match peek!(self, input) {
1013 '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input),
1014 '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted),
1015 '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted),
1016 '\0' => {
1017 go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted)
1018 },
1019 '>' => go!(self: discard_char input; error; emit_tag Data),
1020 _ => go!(self: to AttributeValue Unquoted),
1021 }
1022 },
1023
1024 //§ attribute-value-(double-quoted)-state
1025 states::AttributeValue(DoubleQuoted) => loop {
1026 match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) {
1027 FromSet('"') => go!(self: to AfterAttributeValueQuoted),
1028 FromSet('&') => go!(self: consume_char_ref '"'),
1029 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1030 FromSet(c) => go!(self: push_value c),
1031 NotFromSet(ref b) => go!(self: append_value b),
1032 }
1033 },
1034
1035 //§ attribute-value-(single-quoted)-state
1036 states::AttributeValue(SingleQuoted) => loop {
1037 match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) {
1038 FromSet('\'') => go!(self: to AfterAttributeValueQuoted),
1039 FromSet('&') => go!(self: consume_char_ref '\''),
1040 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1041 FromSet(c) => go!(self: push_value c),
1042 NotFromSet(ref b) => go!(self: append_value b),
1043 }
1044 },
1045
1046 //§ attribute-value-(unquoted)-state
1047 states::AttributeValue(Unquoted) => loop {
1048 match pop_except_from!(
1049 self,
1050 input,
1051 small_char_set!('\r' '\t' '\n' '\x0C' ' ' '&' '>' '\0')
1052 ) {
1053 FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => {
1054 go!(self: to BeforeAttributeName)
1055 },
1056 FromSet('&') => go!(self: consume_char_ref '>'),
1057 FromSet('>') => go!(self: emit_tag Data),
1058 FromSet('\0') => go!(self: error; push_value '\u{fffd}'),
1059 FromSet(c) => {
1060 go_match!(self: c,
1061 '"' , '\'' , '<' , '=' , '`' => error);
1062 go!(self: push_value c);
1063 },
1064 NotFromSet(ref b) => go!(self: append_value b),
1065 }
1066 },
1067
1068 //§ after-attribute-value-(quoted)-state
1069 states::AfterAttributeValueQuoted => loop {
1070 match get_char!(self, input) {
1071 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName),
1072 '/' => go!(self: to SelfClosingStartTag),
1073 '>' => go!(self: emit_tag Data),
1074 _ => go!(self: error; reconsume BeforeAttributeName),
1075 }
1076 },
1077
1078 //§ self-closing-start-tag-state
1079 states::SelfClosingStartTag => loop {
1080 match get_char!(self, input) {
1081 '>' => {
1082 self.current_tag_self_closing = true;
1083 go!(self: emit_tag Data);
1084 },
1085 _ => go!(self: error; reconsume BeforeAttributeName),
1086 }
1087 },
1088
1089 //§ comment-start-state
1090 states::CommentStart => loop {
1091 match get_char!(self, input) {
1092 '-' => go!(self: to CommentStartDash),
1093 '\0' => go!(self: error; push_comment '\u{fffd}'; to Comment),
1094 '>' => go!(self: error; emit_comment; to Data),
1095 c => go!(self: push_comment c; to Comment),
1096 }
1097 },
1098
1099 //§ comment-start-dash-state
1100 states::CommentStartDash => loop {
1101 match get_char!(self, input) {
1102 '-' => go!(self: to CommentEnd),
1103 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1104 '>' => go!(self: error; emit_comment; to Data),
1105 c => go!(self: push_comment '-'; push_comment c; to Comment),
1106 }
1107 },
1108
1109 //§ comment-state
1110 states::Comment => loop {
1111 match get_char!(self, input) {
1112 '-' => go!(self: to CommentEndDash),
1113 '\0' => go!(self: error; push_comment '\u{fffd}'),
1114 c => go!(self: push_comment c),
1115 }
1116 },
1117
1118 //§ comment-end-dash-state
1119 states::CommentEndDash => loop {
1120 match get_char!(self, input) {
1121 '-' => go!(self: to CommentEnd),
1122 '\0' => go!(self: error; append_comment "-\u{fffd}"; to Comment),
1123 c => go!(self: push_comment '-'; push_comment c; to Comment),
1124 }
1125 },
1126
1127 //§ comment-end-state
1128 states::CommentEnd => loop {
1129 match get_char!(self, input) {
1130 '>' => go!(self: emit_comment; to Data),
1131 '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment),
1132 '!' => go!(self: error; to CommentEndBang),
1133 '-' => go!(self: error; push_comment '-'),
1134 c => go!(self: error; append_comment "--"; push_comment c; to Comment),
1135 }
1136 },
1137
1138 //§ comment-end-bang-state
1139 states::CommentEndBang => loop {
1140 match get_char!(self, input) {
1141 '-' => go!(self: append_comment "--!"; to CommentEndDash),
1142 '>' => go!(self: emit_comment; to Data),
1143 '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment),
1144 c => go!(self: append_comment "--!"; push_comment c; to Comment),
1145 }
1146 },
1147
1148 //§ doctype-state
1149 states::Doctype => loop {
1150 match get_char!(self, input) {
1151 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName),
1152 _ => go!(self: error; reconsume BeforeDoctypeName),
1153 }
1154 },
1155
1156 //§ before-doctype-name-state
1157 states::BeforeDoctypeName => loop {
1158 match get_char!(self, input) {
1159 '\t' | '\n' | '\x0C' | ' ' => (),
1160 '\0' => {
1161 go!(self: error; create_doctype; push_doctype_name '\u{fffd}'; to DoctypeName)
1162 },
1163 '>' => go!(self: error; create_doctype; force_quirks; emit_doctype; to Data),
1164 c => go!(self: create_doctype; push_doctype_name (c.to_ascii_lowercase());
1165 to DoctypeName),
1166 }
1167 },
1168
1169 //§ doctype-name-state
1170 states::DoctypeName => loop {
1171 match get_char!(self, input) {
1172 '\t' | '\n' | '\x0C' | ' ' => go!(self: clear_temp; to AfterDoctypeName),
1173 '>' => go!(self: emit_doctype; to Data),
1174 '\0' => go!(self: error; push_doctype_name '\u{fffd}'),
1175 c => go!(self: push_doctype_name (c.to_ascii_lowercase())),
1176 }
1177 },
1178
1179 //§ after-doctype-name-state
1180 states::AfterDoctypeName => loop {
1181 if eat!(self, input, "public") {
1182 go!(self: to AfterDoctypeKeyword Public);
1183 } else if eat!(self, input, "system") {
1184 go!(self: to AfterDoctypeKeyword System);
1185 } else {
1186 match get_char!(self, input) {
1187 '\t' | '\n' | '\x0C' | ' ' => (),
1188 '>' => go!(self: emit_doctype; to Data),
1189 _ => go!(self: error; force_quirks; to BogusDoctype),
1190 }
1191 }
1192 },
1193
1194 //§ after-doctype-public-keyword-state after-doctype-system-keyword-state
1195 states::AfterDoctypeKeyword(kind) => loop {
1196 match get_char!(self, input) {
1197 '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeIdentifier kind),
1198 '"' => {
1199 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind)
1200 },
1201 '\'' => {
1202 go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind)
1203 },
1204 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1205 _ => go!(self: error; force_quirks; to BogusDoctype),
1206 }
1207 },
1208
1209 //§ before-doctype-public-identifier-state before-doctype-system-identifier-state
1210 states::BeforeDoctypeIdentifier(kind) => loop {
1211 match get_char!(self, input) {
1212 '\t' | '\n' | '\x0C' | ' ' => (),
1213 '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind),
1214 '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind),
1215 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1216 _ => go!(self: error; force_quirks; to BogusDoctype),
1217 }
1218 },
1219
1220 //§ doctype-public-identifier-(double-quoted)-state doctype-system-identifier-(double-quoted)-state
1221 states::DoctypeIdentifierDoubleQuoted(kind) => loop {
1222 match get_char!(self, input) {
1223 '"' => go!(self: to AfterDoctypeIdentifier kind),
1224 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1225 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1226 c => go!(self: push_doctype_id kind c),
1227 }
1228 },
1229
1230 //§ doctype-public-identifier-(single-quoted)-state doctype-system-identifier-(single-quoted)-state
1231 states::DoctypeIdentifierSingleQuoted(kind) => loop {
1232 match get_char!(self, input) {
1233 '\'' => go!(self: to AfterDoctypeIdentifier kind),
1234 '\0' => go!(self: error; push_doctype_id kind '\u{fffd}'),
1235 '>' => go!(self: error; force_quirks; emit_doctype; to Data),
1236 c => go!(self: push_doctype_id kind c),
1237 }
1238 },
1239
1240 //§ after-doctype-public-identifier-state
1241 states::AfterDoctypeIdentifier(Public) => loop {
1242 match get_char!(self, input) {
1243 '\t' | '\n' | '\x0C' | ' ' => {
1244 go!(self: to BetweenDoctypePublicAndSystemIdentifiers)
1245 },
1246 '>' => go!(self: emit_doctype; to Data),
1247 '"' => {
1248 go!(self: error; clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1249 },
1250 '\'' => {
1251 go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1252 },
1253 _ => go!(self: error; force_quirks; to BogusDoctype),
1254 }
1255 },
1256
1257 //§ after-doctype-system-identifier-state
1258 states::AfterDoctypeIdentifier(System) => loop {
1259 match get_char!(self, input) {
1260 '\t' | '\n' | '\x0C' | ' ' => (),
1261 '>' => go!(self: emit_doctype; to Data),
1262 _ => go!(self: error; to BogusDoctype),
1263 }
1264 },
1265
1266 //§ between-doctype-public-and-system-identifiers-state
1267 states::BetweenDoctypePublicAndSystemIdentifiers => loop {
1268 match get_char!(self, input) {
1269 '\t' | '\n' | '\x0C' | ' ' => (),
1270 '>' => go!(self: emit_doctype; to Data),
1271 '"' => {
1272 go!(self: clear_doctype_id System; to DoctypeIdentifierDoubleQuoted System)
1273 },
1274 '\'' => {
1275 go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System)
1276 },
1277 _ => go!(self: error; force_quirks; to BogusDoctype),
1278 }
1279 },
1280
1281 //§ bogus-doctype-state
1282 states::BogusDoctype => loop {
1283 match get_char!(self, input) {
1284 '>' => go!(self: emit_doctype; to Data),
1285 _ => (),
1286 }
1287 },
1288
1289 //§ bogus-comment-state
1290 states::BogusComment => loop {
1291 match get_char!(self, input) {
1292 '>' => go!(self: emit_comment; to Data),
1293 '\0' => go!(self: push_comment '\u{fffd}'),
1294 c => go!(self: push_comment c),
1295 }
1296 },
1297
1298 //§ markup-declaration-open-state
1299 states::MarkupDeclarationOpen => loop {
1300 if eat_exact!(self, input, "--") {
1301 go!(self: clear_comment; to CommentStart);
1302 } else if eat!(self, input, "doctype") {
1303 go!(self: to Doctype);
1304 } else {
1305 if self
1306 .sink
1307 .adjusted_current_node_present_but_not_in_html_namespace()
1308 {
1309 if eat_exact!(self, input, "[CDATA[") {
1310 go!(self: clear_temp; to CdataSection);
1311 }
1312 }
1313 go!(self: error; to BogusComment);
1314 }
1315 },
1316
1317 //§ cdata-section-state
1318 states::CdataSection => loop {
1319 match get_char!(self, input) {
1320 ']' => go!(self: to CdataSectionBracket),
1321 '\0' => go!(self: emit_temp; emit '\0'),
1322 c => go!(self: push_temp c),
1323 }
1324 },
1325
1326 //§ cdata-section-bracket
1327 states::CdataSectionBracket => match get_char!(self, input) {
1328 ']' => go!(self: to CdataSectionEnd),
1329 _ => go!(self: push_temp ']'; reconsume CdataSection),
1330 },
1331
1332 //§ cdata-section-end
1333 states::CdataSectionEnd => loop {
1334 match get_char!(self, input) {
1335 ']' => go!(self: push_temp ']'),
1336 '>' => go!(self: emit_temp; to Data),
1337 _ => go!(self: push_temp ']'; push_temp ']'; reconsume CdataSection),
1338 }
1339 },
1340 //§ END
1341 }
1342 }
1343
1344 fn step_char_ref_tokenizer(&mut self, input: &mut BufferQueue) -> ProcessResult<Sink::Handle> {
1345 // FIXME HACK: Take and replace the tokenizer so we don't
1346 // double-mut-borrow self. This is why it's boxed.
1347 let mut tok = self.char_ref_tokenizer.take().unwrap();
1348 let outcome = tok.step(self, input);
1349
1350 let progress = match outcome {
1351 char_ref::Done => {
1352 self.process_char_ref(tok.get_result());
1353 return ProcessResult::Continue;
1354 },
1355
1356 char_ref::Stuck => ProcessResult::Suspend,
1357 char_ref::Progress => ProcessResult::Continue,
1358 };
1359
1360 self.char_ref_tokenizer = Some(tok);
1361 progress
1362 }
1363
    /// Deliver the characters produced by the char-ref sub-tokenizer into the
    /// current context: character data, or the attribute value being built.
    fn process_char_ref(&mut self, char_ref: CharRef) {
        let CharRef {
            mut chars,
            mut num_chars,
        } = char_ref;

        // Zero characters means the reference did not resolve; fall back to
        // emitting the literal '&' that started it.
        if num_chars == 0 {
            chars[0] = '&';
            num_chars = 1;
        }

        // Route each produced character according to the state we were in
        // when the reference started.
        for i in 0..num_chars {
            let c = chars[i as usize];
            match self.state {
                // Character references in data / RCDATA become character tokens.
                states::Data | states::RawData(states::Rcdata) => go!(self: emit c),

                // Inside any attribute-value state, append to the value instead.
                states::AttributeValue(_) => go!(self: push_value c),

                // Char refs are only consumed from the states above, so anything
                // else indicates a tokenizer bug.
                _ => panic!(
                    "state {:?} should not be reachable in process_char_ref",
                    self.state
                ),
            }
        }
    }
1389
1390 /// Indicate that we have reached the end of the input.
1391 pub fn end(&mut self) {
1392 // Handle EOF in the char ref sub-tokenizer, if there is one.
1393 // Do this first because it might un-consume stuff.
1394 let mut input = BufferQueue::new();
1395 match self.char_ref_tokenizer.take() {
1396 None => (),
1397 Some(mut tok) => {
1398 tok.end_of_file(self, &mut input);
1399 self.process_char_ref(tok.get_result());
1400 },
1401 }
1402
1403 // Process all remaining buffered input.
1404 // If we're waiting for lookahead, we're not gonna get it.
1405 self.at_eof = true;
1406 assert!(matches!(self.run(&mut input), TokenizerResult::Done));
1407 assert!(input.is_empty());
1408
1409 loop {
1410 match self.eof_step() {
1411 ProcessResult::Continue => (),
1412 ProcessResult::Suspend => break,
1413 ProcessResult::Script(_) => unreachable!(),
1414 }
1415 }
1416
1417 self.sink.end();
1418
1419 if self.opts.profile {
1420 self.dump_profile();
1421 }
1422 }
1423
1424 fn dump_profile(&self) {
1425 let mut results: Vec<(states::State, u64)> =
1426 self.state_profile.iter().map(|(s, t)| (*s, *t)).collect();
1427 results.sort_by(|&(_, x), &(_, y)| y.cmp(&x));
1428
1429 let total: u64 = results
1430 .iter()
1431 .map(|&(_, t)| t)
1432 .fold(0, ::std::ops::Add::add);
1433 println!("\nTokenizer profile, in nanoseconds");
1434 println!("\n{:12} total in token sink", self.time_in_sink);
1435 println!("\n{:12} total in tokenizer", total);
1436
1437 for (k, v) in results.into_iter() {
1438 let pct = 100.0 * (v as f64) / (total as f64);
1439 println!("{:12} {:4.1}% {:?}", v, pct, k);
1440 }
1441 }
1442
    /// Perform one step of end-of-input processing for the current state,
    /// flushing any partially-built output the state was accumulating.
    fn eof_step(&mut self) -> ProcessResult<Sink::Handle> {
        debug!("processing EOF in state {:?}", self.state);
        match self.state {
            // Plain text-emitting states: just emit the EOF token.
            states::Data |
            states::RawData(Rcdata) |
            states::RawData(Rawtext) |
            states::RawData(ScriptData) |
            states::Plaintext => go!(self: eof),

            // Mid-tag states: EOF here is a parse error; return to Data so the
            // next eof_step emits the EOF token.
            states::TagName |
            states::RawData(ScriptDataEscaped(_)) |
            states::BeforeAttributeName |
            states::AttributeName |
            states::AfterAttributeName |
            states::BeforeAttributeValue |
            states::AttributeValue(_) |
            states::AfterAttributeValueQuoted |
            states::SelfClosingStartTag |
            states::ScriptDataEscapedDash(_) |
            states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data),

            // A lone '<' (and '</') seen before EOF is emitted as literal text.
            states::TagOpen => go!(self: error_eof; emit '<'; to Data),

            states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data),

            states::RawLessThanSign(ScriptDataEscaped(DoubleEscaped)) => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            states::RawLessThanSign(kind) => go!(self: emit '<'; to RawData kind),

            states::RawEndTagOpen(kind) => go!(self: emit '<'; emit '/'; to RawData kind),

            // Flush the partially-matched end-tag name buffered in temp_buf.
            states::RawEndTagName(kind) => {
                go!(self: emit '<'; emit '/'; emit_temp; to RawData kind)
            },

            states::ScriptDataEscapeStart(kind) => go!(self: to RawData ScriptDataEscaped kind),

            states::ScriptDataEscapeStartDash => go!(self: to RawData ScriptData),

            states::ScriptDataDoubleEscapeEnd => {
                go!(self: to RawData ScriptDataEscaped DoubleEscaped)
            },

            // Any comment state: report the error and emit what was collected.
            states::CommentStart |
            states::CommentStartDash |
            states::Comment |
            states::CommentEndDash |
            states::CommentEnd |
            states::CommentEndBang => go!(self: error_eof; emit_comment; to Data),

            // No doctype token exists yet; create a force-quirks one to emit.
            states::Doctype | states::BeforeDoctypeName => {
                go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data)
            },

            // A doctype token is in progress: mark it quirks and emit it.
            states::DoctypeName |
            states::AfterDoctypeName |
            states::AfterDoctypeKeyword(_) |
            states::BeforeDoctypeIdentifier(_) |
            states::DoctypeIdentifierDoubleQuoted(_) |
            states::DoctypeIdentifierSingleQuoted(_) |
            states::AfterDoctypeIdentifier(_) |
            states::BetweenDoctypePublicAndSystemIdentifiers => {
                go!(self: error_eof; force_quirks; emit_doctype; to Data)
            },

            states::BogusDoctype => go!(self: emit_doctype; to Data),

            states::BogusComment => go!(self: emit_comment; to Data),

            states::MarkupDeclarationOpen => go!(self: error; to BogusComment),

            // CDATA: flush buffered text, including any ']'s not yet resolved
            // into a "]]>" terminator.
            states::CdataSection => go!(self: emit_temp; error_eof; to Data),

            states::CdataSectionBracket => go!(self: push_temp ']'; to CdataSection),

            states::CdataSectionEnd => go!(self: push_temp ']'; push_temp ']'; to CdataSection),
        }
    }
1523 }
1524
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
    use super::option_push; // private items
    use crate::tendril::{SliceExt, StrTendril};

    use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};

    use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
    use super::interface::{EndTag, StartTag, Tag, TagKind};
    use super::interface::{TagToken, Token};

    use markup5ever::buffer_queue::BufferQueue;
    use std::mem::replace;

    use crate::LocalName;

    // LinesMatch implements the TokenSink trait. It is used for testing to see
    // if current_line is being updated when process_token is called. The lines
    // vector is a collection of the line numbers that each token is on.
    struct LinesMatch {
        tokens: Vec<Token>,
        current_str: StrTendril,
        lines: Vec<(Token, u64)>,
    }

    impl LinesMatch {
        fn new() -> LinesMatch {
            LinesMatch {
                tokens: vec![],
                current_str: StrTendril::new(),
                lines: vec![],
            }
        }

        // Record `token` at `line_number`, flushing buffered characters first.
        fn push(&mut self, token: Token, line_number: u64) {
            self.finish_str();
            self.lines.push((token, line_number));
        }

        // Collapse buffered character data into one CharacterTokens token.
        fn finish_str(&mut self) {
            if !self.current_str.is_empty() {
                let s = replace(&mut self.current_str, StrTendril::new());
                self.tokens.push(CharacterTokens(s));
            }
        }
    }

    impl TokenSink for LinesMatch {
        type Handle = ();

        fn process_token(
            &mut self,
            token: Token,
            line_number: u64,
        ) -> TokenSinkResult<Self::Handle> {
            match token {
                CharacterTokens(b) => {
                    self.current_str.push_slice(&b);
                },

                NullCharacterToken => {
                    self.current_str.push_char('\0');
                },

                ParseError(_) => {
                    panic!("unexpected parse error");
                },

                TagToken(mut t) => {
                    // The spec seems to indicate that one can emit
                    // erroneous end tags with attrs, but the test
                    // cases don't contain them.
                    match t.kind {
                        EndTag => {
                            t.self_closing = false;
                            t.attrs = vec![];
                        },
                        // Normalize attribute order so comparisons are stable.
                        _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                    }
                    self.push(TagToken(t), line_number);
                },

                EOFToken => (),

                _ => self.push(token, line_number),
            }
            TokenSinkResult::Continue
        }
    }

    // Take in tokens, process them, and return vector with line
    // numbers that each token is on
    fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<(Token, u64)> {
        let sink = LinesMatch::new();
        let mut tok = Tokenizer::new(sink, opts);
        let mut buffer = BufferQueue::new();
        for chunk in input.into_iter() {
            buffer.push_back(chunk);
            let _ = tok.feed(&mut buffer);
        }
        tok.end();
        tok.sink.lines
    }

    // Create a tag token
    fn create_tag(token: StrTendril, tagkind: TagKind) -> Token {
        TagToken(Tag {
            kind: tagkind,
            name: LocalName::from(&*token),
            self_closing: false,
            attrs: vec![],
        })
    }

    // Tokenizer options shared by the line-tracking tests.
    fn default_opts() -> TokenizerOpts {
        TokenizerOpts {
            exact_errors: false,
            discard_bom: true,
            profile: false,
            initial_state: None,
            last_start_tag_name: None,
        }
    }

    #[test]
    fn push_to_None_gives_singleton() {
        let mut s: Option<StrTendril> = None;
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_empty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::new());
        option_push(&mut s, 'x');
        assert_eq!(s, Some("x".to_tendril()));
    }

    #[test]
    fn push_to_nonempty_appends() {
        let mut s: Option<StrTendril> = Some(StrTendril::from_slice("y"));
        option_push(&mut s, 'x');
        assert_eq!(s, Some("yx".to_tendril()));
    }

    #[test]
    fn check_lines() {
        let vector = vec![
            StrTendril::from("<a>\n"),
            StrTendril::from("<b>\n"),
            StrTendril::from("</b>\n"),
            StrTendril::from("</a>\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, default_opts());
        assert_eq!(results, expected);
    }

    #[test]
    fn check_lines_with_new_line() {
        let vector = vec![
            StrTendril::from("<a>\r\n"),
            StrTendril::from("<b>\r\n"),
            StrTendril::from("</b>\r\n"),
            StrTendril::from("</a>\r\n"),
        ];
        let expected = vec![
            (create_tag(StrTendril::from("a"), StartTag), 1),
            (create_tag(StrTendril::from("b"), StartTag), 2),
            (create_tag(StrTendril::from("b"), EndTag), 3),
            (create_tag(StrTendril::from("a"), EndTag), 4),
        ];
        let results = tokenize(vector, default_opts());
        assert_eq!(results, expected);
    }
}
1713