//! Contains a simple lexer for XML documents.
//!
//! This module is for internal use. Use the `xml::pull` module to do parsing.

use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;

use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Chunk(s) => write!(f, "{}", s),
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            other => write!(f, "{}", match other {
                Token::OpeningTagStart => "<",
                Token::ProcessingInstructionStart => "<?",
                Token::DoctypeStart => "<!DOCTYPE",
                Token::ClosingTagStart => "</",
                Token::CommentStart => "<!--",
                Token::CDataStart => "<![CDATA[",
                Token::TagEnd => ">",
                Token::EmptyTagEnd => "/>",
                Token::ProcessingInstructionEnd => "?>",
                Token::CommentEnd => "-->",
                Token::CDataEnd => "]]>",
                Token::ReferenceStart => "&",
                Token::ReferenceEnd => ";",
                Token::EqualsSign => "=",
                Token::SingleQuote => "'",
                Token::DoubleQuote => "\"",
                _ => unreachable!()
            })
        }
    }
}

impl Token {
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart => Some("<!DOCTYPE"),
            Token::ClosingTagStart => Some("</"),
            Token::CommentStart => Some("<!--"),
            Token::CDataStart => Some("<![CDATA["),
            Token::TagEnd => Some(">"),
            Token::EmptyTagEnd => Some("/>"),
            Token::ProcessingInstructionEnd => Some("?>"),
            Token::CommentEnd => Some("-->"),
            Token::CDataEnd => Some("]]>"),
            Token::ReferenceStart => Some("&"),
            Token::ReferenceEnd => Some(";"),
            Token::EqualsSign => Some("="),
            Token::SingleQuote => Some("'"),
            Token::DoubleQuote => Some("\""),
            Token::Chunk(s) => Some(s),
            _ => None
        }
    }

    // using `String::push_str(&token.to_string())` is simply way too slow
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text. Surprisingly, this also means `>`, `=`, `"`, `'` and `-->`.
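    ///
    /// For instance (an illustrative, ignored doc-test, since this module is internal):
    ///
    /// ```ignore
    /// assert!(Token::TagEnd.contains_char_data());           // `>` can be part of text
    /// assert!(Token::Character('x').contains_char_data());
    /// assert!(!Token::OpeningTagStart.contains_char_data()); // `<` cannot
    /// ```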
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}

enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents lexing result. It is either a token or an error message.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _    => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);

/// `Lexer` is a lexer for XML documents, which implements a pull API.
///
/// The main method is `next_token`, which accepts an `std::io::Read` instance
/// and tries to read the next lexeme from it.
///
/// When the `skip_errors` flag is set, invalid lexemes are returned as `Chunk`s.
/// When it is not set, errors are reported as `Err` objects with a string message.
/// By default this flag is not set. Use the `enable_errors` and `disable_errors`
/// methods to toggle the behavior.
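///
/// A minimal usage sketch (an illustrative, ignored doc-test; it mirrors the
/// `error_in_comment_or_cdata_prefix` test below):
///
/// ```ignore
/// let mut lexer = Lexer::new();
/// lexer.disable_errors();  // invalid lexemes will come out as `Chunk`s
/// let mut reader = std::io::Cursor::new("<!x");
/// assert_eq!(lexer.next_token(&mut reader), Ok(Some(Token::Chunk("<!"))));
/// assert_eq!(lexer.next_token(&mut reader), Ok(Some(Token::Character('x'))));
/// assert_eq!(lexer.next_token(&mut reader), Ok(None));  // end of stream
/// ```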
pub struct Lexer {
    pos: TextPosition,
    head_pos: TextPosition,
    char_queue: VecDeque<char>,
    st: State,
    skip_errors: bool,
    inside_comment: bool,
    inside_token: bool,
    eof_handled: bool
}

impl Position for Lexer {
    /// Returns the position of the last token produced by the lexer
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of the `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Resets the "EOF handled" flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - when end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - when a complete token has been read from the stream.
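    ///
    /// An illustrative driving loop (ignored doc-test; `reader` is any `Read`
    /// value and `process` is a hypothetical callback):
    ///
    /// ```ignore
    /// loop {
    ///     match lexer.next_token(&mut reader) {
    ///         Ok(Some(token)) => process(token),  // a complete token was read
    ///         Ok(None) => break,                  // end of stream
    ///         Err(e) => return Err(e),            // lexing error
    ///     }
    /// }
    /// ```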
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,   // got next char
                None => break,  // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal => self.normal(c),
            State::TagStarted => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted => self.comment_started(c),
            State::CDataStarted(s) => self.cdata_started(c, s),
            State::DoctypeStarted(s) => self.doctype_started(c, s),
            State::DoctypeFinishing(d) => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing => self.processing_instruction_closing(c),
            State::EmptyTagClosing => self.empty_element_closing(c),
            State::CommentClosing(s) => self.comment_closing(c, s),
            State::CDataClosing(s) => self.cdata_closing(c, s)
        }
    }

    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

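    /// Moves the lexer to state `st` and emits `token`, first pushing the
    /// characters `cs` onto the unread queue so they are re-dispatched on the
    /// next `next_token` call. For example, after seeing `-x` outside of a
    /// comment, the lexer emits `Character('-')` and unreads `x`.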
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

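    // Recovery note (illustrative): when `skip_errors` is set, `handle_error`
    // emits the incomplete prefix as a `Chunk` and unreads the offending
    // character, so an input like `<!x` produces `Chunk("<!")` followed by
    // `Character('x')`; see `error_in_comment_or_cdata_prefix` in the tests.
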
    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<' => self.move_to(State::TagStarted),
            '>' => Ok(Some(Token::TagEnd)),
            '/' => self.move_to(State::EmptyTagClosing),
            '=' => Ok(Some(Token::EqualsSign)),
            '"' => Ok(Some(Token::DoubleQuote)),
            '\'' => Ok(Some(Token::SingleQuote)),
            '?' => self.move_to(State::ProcessingInstructionClosing),
            '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&' => Ok(Some(Token::ReferenceStart)),
            ';' => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _ => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?' => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/' => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!' => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _ => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _ => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// State used while awaiting the closing bracket of the `<!DOCTYPE` tag;
    /// `d` tracks the number of unmatched `<` brackets.
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            _ => Ok(None),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CommentEnd),
                // double dash not followed by a greater-than sign is a hard error inside a comment
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use common::Position;
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
            assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        // the input contains a tab (`\t`) after `d` and a `&nbsp;` entity at the end
        let (mut lex, mut buf) = make_lex_and_buf(
            "<a p='q'> x<b z=\"y\">d\t</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

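    // Illustrative companion case: everything inside the DOCTYPE internal
    // subset is swallowed, with nested `<`/`>` pairs tracked by the
    // `DoctypeFinishing` depth counter.
    #[test]
    fn doctype_with_nested_markup_declarations_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<!DOCTYPE ab [<!ENTITY x "y"><!ENTITY z "w">]>"#
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::TagEnd
        );
        assert_none!(for lex and buf)
    }
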
    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?" ; Token::Character('?'));
        eof_check!("/" ; Token::Character('/'));
        eof_check!("-" ; Token::Character('-'));
        eof_check!("]" ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }

    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<" ; 0, 1);
        eof_check!("<!" ; 0, 2);
        eof_check!("<!-" ; 0, 3);
        eof_check!("<![" ; 0, 3);
        eof_check!("<![C" ; 0, 4);
        eof_check!("<![CD" ; 0, 5);
        eof_check!("<![CDA" ; 0, 6);
        eof_check!("<![CDAT" ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--" ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }

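    // Illustrative check of the `Position` impl: the reported position is that
    // of the first character of the most recently returned token.
    #[test]
    fn token_position_test() {
        let (mut lex, mut buf) = make_lex_and_buf("a\nb");
        assert_oks!(for lex and buf ; Token::Character('a'));
        assert_eq!((0, 0), (lex.position().row, lex.position().column));
        assert_oks!(for lex and buf ; Token::Whitespace('\n'));
        assert_eq!((0, 1), (lex.position().row, lex.position().column));
        assert_oks!(for lex and buf ; Token::Character('b'));
        assert_eq!((1, 0), (lex.position().row, lex.position().column));
        assert_none!(for lex and buf);
    }
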
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}