1 //! Contains simple lexer for XML documents. 2 //! 3 //! This module is for internal use. Use `xml::pull` module to do parsing. 4 5 use std::fmt; 6 use std::collections::VecDeque; 7 use std::io::Read; 8 use std::result; 9 use std::borrow::Cow; 10 11 use common::{Position, TextPosition, is_whitespace_char, is_name_char}; 12 use reader::Error; 13 use util; 14 15 /// `Token` represents a single lexeme of an XML document. These lexemes 16 /// are used to perform actual parsing. 17 #[derive(Copy, Clone, PartialEq, Eq, Debug)] 18 pub enum Token { 19 /// `<?` 20 ProcessingInstructionStart, 21 /// `?>` 22 ProcessingInstructionEnd, 23 /// `<!DOCTYPE 24 DoctypeStart, 25 /// `<` 26 OpeningTagStart, 27 /// `</` 28 ClosingTagStart, 29 /// `>` 30 TagEnd, 31 /// `/>` 32 EmptyTagEnd, 33 /// `<!--` 34 CommentStart, 35 /// `-->` 36 CommentEnd, 37 /// A chunk of characters, used for errors recovery. 38 Chunk(&'static str), 39 /// Any non-special character except whitespace. 40 Character(char), 41 /// Whitespace character. 42 Whitespace(char), 43 /// `=` 44 EqualsSign, 45 /// `'` 46 SingleQuote, 47 /// `"` 48 DoubleQuote, 49 /// `<![CDATA[` 50 CDataStart, 51 /// `]]>` 52 CDataEnd, 53 /// `&` 54 ReferenceStart, 55 /// `;` 56 ReferenceEnd, 57 } 58 59 impl fmt::Display for Token { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result60 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 61 match *self { 62 Token::Chunk(s) => write!(f, "{}", s), 63 Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), 64 other => write!(f, "{}", match other { 65 Token::OpeningTagStart => "<", 66 Token::ProcessingInstructionStart => "<?", 67 Token::DoctypeStart => "<!DOCTYPE", 68 Token::ClosingTagStart => "</", 69 Token::CommentStart => "<!--", 70 Token::CDataStart => "<![CDATA[", 71 Token::TagEnd => ">", 72 Token::EmptyTagEnd => "/>", 73 Token::ProcessingInstructionEnd => "?>", 74 Token::CommentEnd => "-->", 75 Token::CDataEnd => "]]>", 76 Token::ReferenceStart => "&", 77 Token::ReferenceEnd => ";", 78 Token::EqualsSign => "=", 79 Token::SingleQuote => "'", 80 Token::DoubleQuote => "\"", 81 _ => unreachable!() 82 }) 83 } 84 } 85 } 86 87 impl Token { as_static_str(&self) -> Option<&'static str>88 pub fn as_static_str(&self) -> Option<&'static str> { 89 match *self { 90 Token::OpeningTagStart => Some("<"), 91 Token::ProcessingInstructionStart => Some("<?"), 92 Token::DoctypeStart => Some("<!DOCTYPE"), 93 Token::ClosingTagStart => Some("</"), 94 Token::CommentStart => Some("<!--"), 95 Token::CDataStart => Some("<![CDATA["), 96 Token::TagEnd => Some(">"), 97 Token::EmptyTagEnd => Some("/>"), 98 Token::ProcessingInstructionEnd => Some("?>"), 99 Token::CommentEnd => Some("-->"), 100 Token::CDataEnd => Some("]]>"), 101 Token::ReferenceStart => Some("&"), 102 Token::ReferenceEnd => Some(";"), 103 Token::EqualsSign => Some("="), 104 Token::SingleQuote => Some("'"), 105 Token::DoubleQuote => Some("\""), 106 Token::Chunk(s) => Some(s), 107 _ => None 108 } 109 } 110 111 // using String.push_str(token.to_string()) is simply way too slow push_to_string(&self, target: &mut String)112 pub fn push_to_string(&self, target: &mut String) { 113 match self.as_static_str() { 114 Some(s) => { target.push_str(s); } 115 None => { 116 match *self { 117 Token::Character(c) | Token::Whitespace(c) => target.push(c), 118 _ => unreachable!() 119 } 120 } 121 } 122 } 123 124 /// Returns `true` if this token contains data that can be interpreted 125 /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. 126 #[inline] contains_char_data(&self) -> bool127 pub fn contains_char_data(&self) -> bool { 128 match *self { 129 Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | 130 Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote => true, 131 _ => false 132 } 133 } 134 135 /// Returns `true` if this token corresponds to a white space character. 136 #[inline] is_whitespace(&self) -> bool137 pub fn is_whitespace(&self) -> bool { 138 match *self { 139 Token::Whitespace(_) => true, 140 _ => false 141 } 142 } 143 } 144 145 enum State { 146 /// Triggered on '<' 147 TagStarted, 148 /// Triggered on '<!' 149 CommentOrCDataOrDoctypeStarted, 150 /// Triggered on '<!-' 151 CommentStarted, 152 /// Triggered on '<!D' up to '<!DOCTYPE' 153 DoctypeStarted(DoctypeStartedSubstate), 154 /// Triggered after DoctypeStarted to handle sub elements 155 DoctypeFinishing(u8), 156 /// Triggered on '<![' up to '<![CDATA' 157 CDataStarted(CDataStartedSubstate), 158 /// Triggered on '?' 159 ProcessingInstructionClosing, 160 /// Triggered on '/' 161 EmptyTagClosing, 162 /// Triggered on '-' up to '--' 163 CommentClosing(ClosingSubstate), 164 /// Triggered on ']' up to ']]' 165 CDataClosing(ClosingSubstate), 166 /// Default state 167 Normal 168 } 169 170 #[derive(Copy, Clone)] 171 enum ClosingSubstate { 172 First, Second 173 } 174 175 #[derive(Copy, Clone)] 176 enum DoctypeStartedSubstate { 177 D, DO, DOC, DOCT, DOCTY, DOCTYP 178 } 179 180 #[derive(Copy, Clone)] 181 enum CDataStartedSubstate { 182 E, C, CD, CDA, CDAT, CDATA 183 } 184 185 /// `Result` represents lexing result. It is either a token or an error message. 186 pub type Result = result::Result<Option<Token>, Error>; 187 188 /// Helps to set up a dispatch table for lexing large unambigous tokens like 189 /// `<![CDATA[` or `<!DOCTYPE `. 190 macro_rules! dispatch_on_enum_state( 191 ($_self:ident, $s:expr, $c:expr, $is:expr, 192 $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+; 193 $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => ( 194 match $s { 195 $( 196 $st => match $c { 197 $stc => $_self.move_to($is($next_st)), 198 _ => $_self.handle_error($chunk, $c) 199 }, 200 )+ 201 $end_st => match $c { 202 $end_c => $e, 203 _ => $_self.handle_error($end_chunk, $c) 204 } 205 } 206 ) 207 ); 208 209 /// `Lexer` is a lexer for XML documents, which implements pull API. 210 /// 211 /// Main method is `next_token` which accepts an `std::io::Read` instance and 212 /// tries to read the next lexeme from it. 213 /// 214 /// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. 215 /// When it is not set, errors will be reported as `Err` objects with a string message. 216 /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods 217 /// to toggle the behavior. 218 pub struct Lexer { 219 pos: TextPosition, 220 head_pos: TextPosition, 221 char_queue: VecDeque<char>, 222 st: State, 223 skip_errors: bool, 224 inside_comment: bool, 225 inside_token: bool, 226 eof_handled: bool 227 } 228 229 impl Position for Lexer { 230 #[inline] 231 /// Returns the position of the last token produced by the lexer position(&self) -> TextPosition232 fn position(&self) -> TextPosition { self.pos } 233 } 234 235 impl Lexer { 236 /// Returns a new lexer with default state. new() -> Lexer237 pub fn new() -> Lexer { 238 Lexer { 239 pos: TextPosition::new(), 240 head_pos: TextPosition::new(), 241 char_queue: VecDeque::with_capacity(4), // TODO: check size 242 st: State::Normal, 243 skip_errors: false, 244 inside_comment: false, 245 inside_token: false, 246 eof_handled: false 247 } 248 } 249 250 /// Enables error handling so `next_token` will return `Some(Err(..))` 251 /// upon invalid lexeme. 252 #[inline] enable_errors(&mut self)253 pub fn enable_errors(&mut self) { self.skip_errors = false; } 254 255 /// Disables error handling so `next_token` will return `Some(Chunk(..))` 256 /// upon invalid lexeme with this lexeme content. 257 #[inline] disable_errors(&mut self)258 pub fn disable_errors(&mut self) { self.skip_errors = true; } 259 260 /// Enables special handling of some lexemes which should be done when we're parsing comment 261 /// internals. 262 #[inline] inside_comment(&mut self)263 pub fn inside_comment(&mut self) { self.inside_comment = true; } 264 265 /// Disables the effect of `inside_comment()` method. 266 #[inline] outside_comment(&mut self)267 pub fn outside_comment(&mut self) { self.inside_comment = false; } 268 269 /// Reset the eof handled flag of the lexer. 270 #[inline] reset_eof_handled(&mut self)271 pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } 272 273 /// Tries to read the next token from the buffer. 274 /// 275 /// It is possible to pass different instaces of `BufReader` each time 276 /// this method is called, but the resulting behavior is undefined in this case. 277 /// 278 /// Return value: 279 /// * `Err(reason) where reason: reader::Error` - when an error occurs; 280 /// * `Ok(None)` - upon end of stream is reached; 281 /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. next_token<B: Read>(&mut self, b: &mut B) -> Result282 pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result { 283 // Already reached end of buffer 284 if self.eof_handled { 285 return Ok(None); 286 } 287 288 if !self.inside_token { 289 self.pos = self.head_pos; 290 self.inside_token = true; 291 } 292 293 // Check if we have saved a char or two for ourselves 294 while let Some(c) = self.char_queue.pop_front() { 295 match try!(self.read_next_token(c)) { 296 Some(t) => { 297 self.inside_token = false; 298 return Ok(Some(t)); 299 } 300 None => {} // continue 301 } 302 } 303 304 loop { 305 // TODO: this should handle multiple encodings 306 let c = match try!(util::next_char_from(b)) { 307 Some(c) => c, // got next char 308 None => break, // nothing to read left 309 }; 310 311 match try!(self.read_next_token(c)) { 312 Some(t) => { 313 self.inside_token = false; 314 return Ok(Some(t)); 315 } 316 None => { 317 // continue 318 } 319 } 320 } 321 322 // Handle end of stream 323 self.eof_handled = true; 324 self.pos = self.head_pos; 325 match self.st { 326 State::TagStarted | State::CommentOrCDataOrDoctypeStarted | 327 State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | 328 State::CommentClosing(ClosingSubstate::Second) | 329 State::DoctypeFinishing(_) => 330 Err(self.error("Unexpected end of stream")), 331 State::ProcessingInstructionClosing => 332 Ok(Some(Token::Character('?'))), 333 State::EmptyTagClosing => 334 Ok(Some(Token::Character('/'))), 335 State::CommentClosing(ClosingSubstate::First) => 336 Ok(Some(Token::Character('-'))), 337 State::CDataClosing(ClosingSubstate::First) => 338 Ok(Some(Token::Character(']'))), 339 State::CDataClosing(ClosingSubstate::Second) => 340 Ok(Some(Token::Chunk("]]"))), 341 State::Normal => 342 Ok(None) 343 } 344 } 345 346 #[inline] error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error347 fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error { 348 (self, msg).into() 349 } 350 351 #[inline] read_next_token(&mut self, c: char) -> Result352 fn read_next_token(&mut self, c: char) -> Result { 353 let res = self.dispatch_char(c); 354 if self.char_queue.is_empty() { 355 if c == '\n' { 356 self.head_pos.new_line(); 357 } else { 358 self.head_pos.advance(1); 359 } 360 } 361 res 362 } 363 dispatch_char(&mut self, c: char) -> Result364 fn dispatch_char(&mut self, c: char) -> Result { 365 match self.st { 366 State::Normal => self.normal(c), 367 State::TagStarted => self.tag_opened(c), 368 State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), 369 State::CommentStarted => self.comment_started(c), 370 State::CDataStarted(s) => self.cdata_started(c, s), 371 State::DoctypeStarted(s) => self.doctype_started(c, s), 372 State::DoctypeFinishing(d) => self.doctype_finishing(c, d), 373 State::ProcessingInstructionClosing => self.processing_instruction_closing(c), 374 State::EmptyTagClosing => self.empty_element_closing(c), 375 State::CommentClosing(s) => self.comment_closing(c, s), 376 State::CDataClosing(s) => self.cdata_closing(c, s) 377 } 378 } 379 380 #[inline] move_to(&mut self, st: State) -> Result381 fn move_to(&mut self, st: State) -> Result { 382 self.st = st; 383 Ok(None) 384 } 385 386 #[inline] move_to_with(&mut self, st: State, token: Token) -> Result387 fn move_to_with(&mut self, st: State, token: Token) -> Result { 388 self.st = st; 389 Ok(Some(token)) 390 } 391 392 #[inline] move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result393 fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { 394 self.char_queue.extend(cs.iter().cloned()); 395 self.move_to_with(st, token) 396 } 397 handle_error(&mut self, chunk: &'static str, c: char) -> Result398 fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { 399 self.char_queue.push_back(c); 400 if self.skip_errors || (self.inside_comment && chunk != "--") { // FIXME: looks hacky 401 self.move_to_with(State::Normal, Token::Chunk(chunk)) 402 } else { 403 Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c))) 404 } 405 } 406 407 /// Encountered a char normal(&mut self, c: char) -> Result408 fn normal(&mut self, c: char) -> Result { 409 match c { 410 '<' => self.move_to(State::TagStarted), 411 '>' => Ok(Some(Token::TagEnd)), 412 '/' => self.move_to(State::EmptyTagClosing), 413 '=' => Ok(Some(Token::EqualsSign)), 414 '"' => Ok(Some(Token::DoubleQuote)), 415 '\'' => Ok(Some(Token::SingleQuote)), 416 '?' => self.move_to(State::ProcessingInstructionClosing), 417 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), 418 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), 419 '&' => Ok(Some(Token::ReferenceStart)), 420 ';' => Ok(Some(Token::ReferenceEnd)), 421 _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))), 422 _ => Ok(Some(Token::Character(c))) 423 } 424 } 425 426 /// Encountered '<' tag_opened(&mut self, c: char) -> Result427 fn tag_opened(&mut self, c: char) -> Result { 428 match c { 429 '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart), 430 '/' => self.move_to_with(State::Normal, Token::ClosingTagStart), 431 '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted), 432 _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), 433 _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart), 434 _ => self.handle_error("<", c) 435 } 436 } 437 438 /// Encountered '<!' comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result439 fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result { 440 match c { 441 '-' => self.move_to(State::CommentStarted), 442 '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), 443 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), 444 _ => self.handle_error("<!", c) 445 } 446 } 447 448 /// Encountered '<!-' comment_started(&mut self, c: char) -> Result449 fn comment_started(&mut self, c: char) -> Result { 450 match c { 451 '-' => self.move_to_with(State::Normal, Token::CommentStart), 452 _ => self.handle_error("<!-", c) 453 } 454 } 455 456 /// Encountered '<![' cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result457 fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { 458 use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA}; 459 dispatch_on_enum_state!(self, s, c, State::CDataStarted, 460 E ; 'C' ; C ; "<![", 461 C ; 'D' ; CD ; "<![C", 462 CD ; 'A' ; CDA ; "<![CD", 463 CDA ; 'T' ; CDAT ; "<![CDA", 464 CDAT ; 'A' ; CDATA ; "<![CDAT"; 465 CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart) 466 ) 467 } 468 469 /// Encountered '<!D' doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result470 fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result { 471 use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; 472 dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, 473 D ; 'O' ; DO ; "<!D", 474 DO ; 'C' ; DOC ; "<!DO", 475 DOC ; 'T' ; DOCT ; "<!DOC", 476 DOCT ; 'Y' ; DOCTY ; "<!DOCT", 477 DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; 478 DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart) 479 ) 480 } 481 482 /// State used while awaiting the closing bracket for the <!DOCTYPE tag doctype_finishing(&mut self, c: char, d: u8) -> Result483 fn doctype_finishing(&mut self, c: char, d: u8) -> Result { 484 match c { 485 '<' => self.move_to(State::DoctypeFinishing(d + 1)), 486 '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd), 487 '>' => self.move_to(State::DoctypeFinishing(d - 1)), 488 _ => Ok(None), 489 } 490 } 491 492 /// Encountered '?' processing_instruction_closing(&mut self, c: char) -> Result493 fn processing_instruction_closing(&mut self, c: char) -> Result { 494 match c { 495 '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd), 496 _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')), 497 } 498 } 499 500 /// Encountered '/' empty_element_closing(&mut self, c: char) -> Result501 fn empty_element_closing(&mut self, c: char) -> Result { 502 match c { 503 '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd), 504 _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')), 505 } 506 } 507 508 /// Encountered '-' comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result509 fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { 510 match s { 511 ClosingSubstate::First => match c { 512 '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), 513 _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-')) 514 }, 515 ClosingSubstate::Second => match c { 516 '>' => self.move_to_with(State::Normal, Token::CommentEnd), 517 // double dash not followed by a greater-than is a hard error inside comment 518 _ if self.inside_comment => self.handle_error("--", c), 519 // nothing else except comment closing starts with a double dash, and comment 520 // closing can never be after another dash, and also we're outside of a comment, 521 // therefore it is safe to push only the last read character to the list of unread 522 // characters and pass the double dash directly to the output 523 _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--")) 524 } 525 } 526 } 527 528 /// Encountered ']' cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result529 fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { 530 match s { 531 ClosingSubstate::First => match c { 532 ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), 533 _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')) 534 }, 535 ClosingSubstate::Second => match c { 536 '>' => self.move_to_with(State::Normal, Token::CDataEnd), 537 _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')) 538 } 539 } 540 } 541 } 542 543 #[cfg(test)] 544 mod tests { 545 use common::{Position}; 546 use std::io::{BufReader, Cursor}; 547 548 use super::{Lexer, Token}; 549 550 macro_rules! assert_oks( 551 (for $lex:ident and $buf:ident ; $($e:expr)+) => ({ 552 $( 553 assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf)); 554 )+ 555 }) 556 ); 557 558 macro_rules! assert_err( 559 (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({ 560 let err = $lex.next_token(&mut $buf); 561 assert!(err.is_err()); 562 let err = err.unwrap_err(); 563 assert_eq!($r as u64, err.position().row); 564 assert_eq!($c as u64, err.position().column); 565 assert_eq!($s, err.msg()); 566 }) 567 ); 568 569 macro_rules! assert_none( 570 (for $lex:ident and $buf:ident) => ( 571 assert_eq!(Ok(None), $lex.next_token(&mut $buf)); 572 ) 573 ); 574 make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>)575 fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) { 576 (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) 577 } 578 579 #[test] simple_lexer_test()580 fn simple_lexer_test() { 581 let (mut lex, mut buf) = make_lex_and_buf( 582 r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# 583 ); 584 585 assert_oks!(for lex and buf ; 586 Token::OpeningTagStart 587 Token::Character('a') 588 Token::Whitespace(' ') 589 Token::Character('p') 590 Token::EqualsSign 591 Token::SingleQuote 592 Token::Character('q') 593 Token::SingleQuote 594 Token::TagEnd 595 Token::Whitespace(' ') 596 Token::Character('x') 597 Token::OpeningTagStart 598 Token::Character('b') 599 Token::Whitespace(' ') 600 Token::Character('z') 601 Token::EqualsSign 602 Token::DoubleQuote 603 Token::Character('y') 604 Token::DoubleQuote 605 Token::TagEnd 606 Token::Character('d') 607 Token::Whitespace('\t') 608 Token::ClosingTagStart 609 Token::Character('b') 610 Token::TagEnd 611 Token::ClosingTagStart 612 Token::Character('a') 613 Token::TagEnd 614 Token::OpeningTagStart 615 Token::Character('p') 616 Token::EmptyTagEnd 617 Token::Whitespace(' ') 618 Token::ProcessingInstructionStart 619 Token::Character('n') 620 Token::Character('m') 621 Token::Whitespace(' ') 622 Token::ProcessingInstructionEnd 623 Token::Whitespace(' ') 624 Token::CommentStart 625 Token::Whitespace(' ') 626 Token::Character('a') 627 Token::Whitespace(' ') 628 Token::Character('c') 629 Token::Whitespace(' ') 630 Token::CommentEnd 631 Token::Whitespace(' ') 632 Token::ReferenceStart 633 Token::Character('n') 634 Token::Character('b') 635 Token::Character('s') 636 Token::Character('p') 637 Token::ReferenceEnd 638 ); 639 assert_none!(for lex and buf); 640 } 641 642 #[test] special_chars_test()643 fn special_chars_test() { 644 let (mut lex, mut buf) = make_lex_and_buf( 645 r#"?x!+ // -| ]z]]"# 646 ); 647 648 assert_oks!(for lex and buf ; 649 Token::Character('?') 650 Token::Character('x') 651 Token::Character('!') 652 Token::Character('+') 653 Token::Whitespace(' ') 654 Token::Character('/') 655 Token::Character('/') 656 Token::Whitespace(' ') 657 Token::Character('-') 658 Token::Character('|') 659 Token::Whitespace(' ') 660 Token::Character(']') 661 Token::Character('z') 662 Token::Chunk("]]") 663 ); 664 assert_none!(for lex and buf); 665 } 666 667 #[test] cdata_test()668 fn cdata_test() { 669 let (mut lex, mut buf) = make_lex_and_buf( 670 r#"<a><![CDATA[x y ?]]> </a>"# 671 ); 672 673 assert_oks!(for lex and buf ; 674 Token::OpeningTagStart 675 Token::Character('a') 676 Token::TagEnd 677 Token::CDataStart 678 Token::Character('x') 679 Token::Whitespace(' ') 680 Token::Character('y') 681 Token::Whitespace(' ') 682 Token::Character('?') 683 Token::CDataEnd 684 Token::Whitespace(' ') 685 Token::ClosingTagStart 686 Token::Character('a') 687 Token::TagEnd 688 ); 689 assert_none!(for lex and buf); 690 } 691 692 #[test] doctype_test()693 fn doctype_test() { 694 let (mut lex, mut buf) = make_lex_and_buf( 695 r#"<a><!DOCTYPE ab xx z> "# 696 ); 697 assert_oks!(for lex and buf ; 698 Token::OpeningTagStart 699 Token::Character('a') 700 Token::TagEnd 701 Token::DoctypeStart 702 Token::TagEnd 703 Token::Whitespace(' ') 704 ); 705 assert_none!(for lex and buf) 706 } 707 708 #[test] doctype_with_internal_subset_test()709 fn doctype_with_internal_subset_test() { 710 let (mut lex, mut buf) = make_lex_and_buf( 711 r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "# 712 ); 713 assert_oks!(for lex and buf ; 714 Token::OpeningTagStart 715 Token::Character('a') 716 Token::TagEnd 717 Token::DoctypeStart 718 Token::TagEnd 719 Token::Whitespace(' ') 720 ); 721 assert_none!(for lex and buf) 722 } 723 724 #[test] end_of_stream_handling_ok()725 fn end_of_stream_handling_ok() { 726 macro_rules! eof_check( 727 ($data:expr ; $token:expr) => ({ 728 let (mut lex, mut buf) = make_lex_and_buf($data); 729 assert_oks!(for lex and buf ; $token); 730 assert_none!(for lex and buf); 731 }) 732 ); 733 eof_check!("?" ; Token::Character('?')); 734 eof_check!("/" ; Token::Character('/')); 735 eof_check!("-" ; Token::Character('-')); 736 eof_check!("]" ; Token::Character(']')); 737 eof_check!("]]" ; Token::Chunk("]]")); 738 } 739 740 #[test] end_of_stream_handling_error()741 fn end_of_stream_handling_error() { 742 macro_rules! eof_check( 743 ($data:expr; $r:expr, $c:expr) => ({ 744 let (mut lex, mut buf) = make_lex_and_buf($data); 745 assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); 746 assert_none!(for lex and buf); 747 }) 748 ); 749 eof_check!("<" ; 0, 1); 750 eof_check!("<!" ; 0, 2); 751 eof_check!("<!-" ; 0, 3); 752 eof_check!("<![" ; 0, 3); 753 eof_check!("<![C" ; 0, 4); 754 eof_check!("<![CD" ; 0, 5); 755 eof_check!("<![CDA" ; 0, 6); 756 eof_check!("<![CDAT" ; 0, 7); 757 eof_check!("<![CDATA" ; 0, 8); 758 eof_check!("--" ; 0, 2); 759 } 760 761 #[test] error_in_comment_or_cdata_prefix()762 fn error_in_comment_or_cdata_prefix() { 763 let (mut lex, mut buf) = make_lex_and_buf("<!x"); 764 assert_err!(for lex and buf expect row 0 ; 0, 765 "Unexpected token '<!' before 'x'" 766 ); 767 768 let (mut lex, mut buf) = make_lex_and_buf("<!x"); 769 lex.disable_errors(); 770 assert_oks!(for lex and buf ; 771 Token::Chunk("<!") 772 Token::Character('x') 773 ); 774 assert_none!(for lex and buf); 775 } 776 777 #[test] error_in_comment_started()778 fn error_in_comment_started() { 779 let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); 780 assert_err!(for lex and buf expect row 0 ; 0, 781 "Unexpected token '<!-' before '\t'" 782 ); 783 784 let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); 785 lex.disable_errors(); 786 assert_oks!(for lex and buf ; 787 Token::Chunk("<!-") 788 Token::Whitespace('\t') 789 ); 790 assert_none!(for lex and buf); 791 } 792 793 #[test] error_in_comment_two_dashes_not_at_end()794 fn error_in_comment_two_dashes_not_at_end() { 795 let (mut lex, mut buf) = make_lex_and_buf("--x"); 796 lex.inside_comment(); 797 assert_err!(for lex and buf expect row 0; 0, 798 "Unexpected token '--' before 'x'" 799 ); 800 801 let (mut lex, mut buf) = make_lex_and_buf("--x"); 802 assert_oks!(for lex and buf ; 803 Token::Chunk("--") 804 Token::Character('x') 805 ); 806 } 807 808 macro_rules! check_case( 809 ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({ 810 let (mut lex, mut buf) = make_lex_and_buf($data); 811 assert_err!(for lex and buf expect row $r ; $c, $s); 812 813 let (mut lex, mut buf) = make_lex_and_buf($data); 814 lex.disable_errors(); 815 assert_oks!(for lex and buf ; 816 Token::Chunk($chunk) 817 Token::Character($app) 818 ); 819 assert_none!(for lex and buf); 820 }) 821 ); 822 823 #[test] error_in_cdata_started()824 fn error_in_cdata_started() { 825 check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); 826 check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); 827 check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['"); 828 check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['"); 829 check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['"); 830 check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'"); 831 } 832 833 #[test] error_in_doctype_started()834 fn error_in_doctype_started() { 835 check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'"); 836 check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'"); 837 check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'"); 838 check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'"); 839 check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'"); 840 check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'"); 841 } 842 843 844 845 #[test] issue_98_cdata_ending_with_right_bracket()846 fn issue_98_cdata_ending_with_right_bracket() { 847 let (mut lex, mut buf) = make_lex_and_buf( 848 r#"<![CDATA[Foo [Bar]]]>"# 849 ); 850 851 assert_oks!(for lex and buf ; 852 Token::CDataStart 853 Token::Character('F') 854 Token::Character('o') 855 Token::Character('o') 856 Token::Whitespace(' ') 857 Token::Character('[') 858 Token::Character('B') 859 Token::Character('a') 860 Token::Character('r') 861 Token::Character(']') 862 Token::CDataEnd 863 ); 864 assert_none!(for lex and buf); 865 } 866 } 867