1 //! Contains simple lexer for XML documents. 2 //! 3 //! This module is for internal use. Use `xml::pull` module to do parsing. 4 5 use std::fmt; 6 use std::collections::VecDeque; 7 use std::io::Read; 8 use std::result; 9 use std::borrow::Cow; 10 11 use common::{Position, TextPosition, is_whitespace_char, is_name_char}; 12 use reader::Error; 13 use util; 14 15 /// `Token` represents a single lexeme of an XML document. These lexemes 16 /// are used to perform actual parsing. 17 #[derive(Copy, Clone, PartialEq, Eq, Debug)] 18 pub enum Token { 19 /// `<?` 20 ProcessingInstructionStart, 21 /// `?>` 22 ProcessingInstructionEnd, 23 /// `<!DOCTYPE 24 DoctypeStart, 25 /// `<` 26 OpeningTagStart, 27 /// `</` 28 ClosingTagStart, 29 /// `>` 30 TagEnd, 31 /// `/>` 32 EmptyTagEnd, 33 /// `<!--` 34 CommentStart, 35 /// `-->` 36 CommentEnd, 37 /// A chunk of characters, used for errors recovery. 38 Chunk(&'static str), 39 /// Any non-special character except whitespace. 40 Character(char), 41 /// Whitespace character. 
42 Whitespace(char), 43 /// `=` 44 EqualsSign, 45 /// `'` 46 SingleQuote, 47 /// `"` 48 DoubleQuote, 49 /// `<![CDATA[` 50 CDataStart, 51 /// `]]>` 52 CDataEnd, 53 /// `&` 54 ReferenceStart, 55 /// `;` 56 ReferenceEnd, 57 } 58 59 impl fmt::Display for Token { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result60 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 61 match *self { 62 Token::Chunk(s) => write!(f, "{}", s), 63 Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c), 64 other => write!(f, "{}", match other { 65 Token::OpeningTagStart => "<", 66 Token::ProcessingInstructionStart => "<?", 67 Token::DoctypeStart => "<!DOCTYPE", 68 Token::ClosingTagStart => "</", 69 Token::CommentStart => "<!--", 70 Token::CDataStart => "<![CDATA[", 71 Token::TagEnd => ">", 72 Token::EmptyTagEnd => "/>", 73 Token::ProcessingInstructionEnd => "?>", 74 Token::CommentEnd => "-->", 75 Token::CDataEnd => "]]>", 76 Token::ReferenceStart => "&", 77 Token::ReferenceEnd => ";", 78 Token::EqualsSign => "=", 79 Token::SingleQuote => "'", 80 Token::DoubleQuote => "\"", 81 _ => unreachable!() 82 }) 83 } 84 } 85 } 86 87 impl Token { as_static_str(&self) -> Option<&'static str>88 pub fn as_static_str(&self) -> Option<&'static str> { 89 match *self { 90 Token::OpeningTagStart => Some("<"), 91 Token::ProcessingInstructionStart => Some("<?"), 92 Token::DoctypeStart => Some("<!DOCTYPE"), 93 Token::ClosingTagStart => Some("</"), 94 Token::CommentStart => Some("<!--"), 95 Token::CDataStart => Some("<![CDATA["), 96 Token::TagEnd => Some(">"), 97 Token::EmptyTagEnd => Some("/>"), 98 Token::ProcessingInstructionEnd => Some("?>"), 99 Token::CommentEnd => Some("-->"), 100 Token::CDataEnd => Some("]]>"), 101 Token::ReferenceStart => Some("&"), 102 Token::ReferenceEnd => Some(";"), 103 Token::EqualsSign => Some("="), 104 Token::SingleQuote => Some("'"), 105 Token::DoubleQuote => Some("\""), 106 Token::Chunk(s) => Some(s), 107 _ => None 108 } 109 } 110 111 // using 
String.push_str(token.to_string()) is simply way too slow push_to_string(&self, target: &mut String)112 pub fn push_to_string(&self, target: &mut String) { 113 match self.as_static_str() { 114 Some(s) => { target.push_str(s); } 115 None => { 116 match *self { 117 Token::Character(c) | Token::Whitespace(c) => target.push(c), 118 _ => unreachable!() 119 } 120 } 121 } 122 } 123 124 /// Returns `true` if this token contains data that can be interpreted 125 /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'. 126 #[inline] contains_char_data(&self) -> bool127 pub fn contains_char_data(&self) -> bool { 128 match *self { 129 Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd | 130 Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote => true, 131 _ => false 132 } 133 } 134 135 /// Returns `true` if this token corresponds to a white space character. 136 #[inline] is_whitespace(&self) -> bool137 pub fn is_whitespace(&self) -> bool { 138 match *self { 139 Token::Whitespace(_) => true, 140 _ => false 141 } 142 } 143 } 144 145 enum State { 146 /// Triggered on '<' 147 TagStarted, 148 /// Triggered on '<!' 149 CommentOrCDataOrDoctypeStarted, 150 /// Triggered on '<!-' 151 CommentStarted, 152 /// Triggered on '<!D' up to '<!DOCTYPE' 153 DoctypeStarted(DoctypeStartedSubstate), 154 /// Triggered on '<![' up to '<![CDATA' 155 CDataStarted(CDataStartedSubstate), 156 /// Triggered on '?' 
157 ProcessingInstructionClosing, 158 /// Triggered on '/' 159 EmptyTagClosing, 160 /// Triggered on '-' up to '--' 161 CommentClosing(ClosingSubstate), 162 /// Triggered on ']' up to ']]' 163 CDataClosing(ClosingSubstate), 164 /// Default state 165 Normal 166 } 167 168 #[derive(Copy, Clone)] 169 enum ClosingSubstate { 170 First, Second 171 } 172 173 #[derive(Copy, Clone)] 174 enum DoctypeStartedSubstate { 175 D, DO, DOC, DOCT, DOCTY, DOCTYP 176 } 177 178 #[derive(Copy, Clone)] 179 enum CDataStartedSubstate { 180 E, C, CD, CDA, CDAT, CDATA 181 } 182 183 /// `Result` represents lexing result. It is either a token or an error message. 184 pub type Result = result::Result<Option<Token>, Error>; 185 186 /// Helps to set up a dispatch table for lexing large unambigous tokens like 187 /// `<![CDATA[` or `<!DOCTYPE `. 188 macro_rules! dispatch_on_enum_state( 189 ($_self:ident, $s:expr, $c:expr, $is:expr, 190 $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+; 191 $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => ( 192 match $s { 193 $( 194 $st => match $c { 195 $stc => $_self.move_to($is($next_st)), 196 _ => $_self.handle_error($chunk, $c) 197 }, 198 )+ 199 $end_st => match $c { 200 $end_c => $e, 201 _ => $_self.handle_error($end_chunk, $c) 202 } 203 } 204 ) 205 ); 206 207 /// `Lexer` is a lexer for XML documents, which implements pull API. 208 /// 209 /// Main method is `next_token` which accepts an `std::io::Read` instance and 210 /// tries to read the next lexeme from it. 211 /// 212 /// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. 213 /// When it is not set, errors will be reported as `Err` objects with a string message. 214 /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods 215 /// to toggle the behavior. 
pub struct Lexer {
    // Position of the last token handed out by `next_token`.
    pos: TextPosition,
    // Position of the read head, i.e. of the next char to be consumed.
    head_pos: TextPosition,
    // Characters pushed back by `move_to_with_unread`/`handle_error`,
    // replayed before reading from the stream again.
    char_queue: VecDeque<char>,
    st: State,
    skip_errors: bool,
    inside_comment: bool,
    // True while a multi-character token is being assembled; `pos` is frozen
    // at the token's first character until it completes.
    inside_token: bool,
    eof_handled: bool
}

impl Position for Lexer {
    /// Returns the position of the last token produced by the lexer
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Reset the eof handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instaces of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - upon end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        // Starting a fresh token: pin its reported position at the read head.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,  // got next char
                None => break, // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream: flush whatever partial lexeme the state
        // machine was in the middle of, or report it as an error.
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    /// Builds a `reader::Error` at the lexer's current position.
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    /// Feeds one character to the state machine and advances the head
    /// position. The position is advanced only for characters coming from the
    /// stream (queue empty), so replayed characters are not counted twice.
    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    /// Routes a character to the handler for the current state.
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted => self.comment_started(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s)
        }
    }

    /// Switches to `st` without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Switches to `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    /// Switches to `st`, emits `token`, and schedules `cs` to be reprocessed
    /// before any further stream input.
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    /// Reports an invalid lexeme: either emits the accumulated `chunk` for
    /// recovery or returns an error, depending on `skip_errors`. The offending
    /// character `c` is queued for reprocessing in either case.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Ok(Some(Token::TagEnd)),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Ok(Some(Token::EqualsSign)),
            '"' => Ok(Some(Token::DoubleQuote)),
            '\'' => Ok(Some(Token::SingleQuote)),
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _ => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _ => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _ => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::Normal, Token::DoctypeStart)
        )
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use common::{Position};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    // Asserts that the lexer produces exactly the given sequence of tokens.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
            assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    // Asserts that the next lexing step fails at the given row/column with
    // the given message.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    // Asserts that the lexer has reached end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        // NOTE(review): the literal below contains a literal tab between 'd'
        // and '</b>', matching the Whitespace('\t') assertion further down.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    // Characters which are special only in certain states ('?', '/', '-', ']')
    // must fall through to plain characters/chunks in normal content.
    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    // Partial lexemes which are recoverable at EOF ('?', '/', '-', ']', ']]')
    // must be flushed as plain characters/chunks.
    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }

    // Partial lexemes which cannot be recovered at EOF ('<...', '--') must
    // produce an "Unexpected end of stream" error at the head position.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--" ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        // Strict mode: invalid '<!' continuation is a hard error.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // Recovery mode: the same input degrades to a chunk plus the char.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    // '--' not followed by '>' is an error only while inside a comment;
    // outside it passes through as a chunk.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }

    // Checks both the strict-mode error and the recovery-mode chunk output
    // for a broken multi-character prefix.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    // Regression test: a ']' immediately before ']]>' must not confuse the
    // CDATA-closing state machine.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}