//! Contains simple lexer for XML documents.
//!
//! This module is for internal use. Use `xml::pull` module to do parsing.

use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;

use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Chunk(s)                            => write!(f, "{}", s),
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            other => write!(f, "{}", match other {
                Token::OpeningTagStart            => "<",
                Token::ProcessingInstructionStart => "<?",
                Token::DoctypeStart               => "<!DOCTYPE",
                Token::ClosingTagStart            => "</",
                Token::CommentStart               => "<!--",
                Token::CDataStart                 => "<![CDATA[",
                Token::TagEnd                     => ">",
                Token::EmptyTagEnd                => "/>",
                Token::ProcessingInstructionEnd   => "?>",
                Token::CommentEnd                 => "-->",
                Token::CDataEnd                   => "]]>",
                Token::ReferenceStart             => "&",
                Token::ReferenceEnd               => ";",
                Token::EqualsSign                 => "=",
                Token::SingleQuote                => "'",
                Token::DoubleQuote                => "\"",
                _                                 => unreachable!()
            })
        }
    }
}

impl Token {
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart            => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart               => Some("<!DOCTYPE"),
            Token::ClosingTagStart            => Some("</"),
            Token::CommentStart               => Some("<!--"),
            Token::CDataStart                 => Some("<![CDATA["),
            Token::TagEnd                     => Some(">"),
            Token::EmptyTagEnd                => Some("/>"),
            Token::ProcessingInstructionEnd   => Some("?>"),
            Token::CommentEnd                 => Some("-->"),
            Token::CDataEnd                   => Some("]]>"),
            Token::ReferenceStart             => Some("&"),
            Token::ReferenceEnd               => Some(";"),
            Token::EqualsSign                 => Some("="),
            Token::SingleQuote                => Some("'"),
            Token::DoubleQuote                => Some("\""),
            Token::Chunk(s)                   => Some(s),
            _                                 => None
        }
    }

    // using String.push_str(token.to_string()) is simply way too slow
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}

enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after `DoctypeStarted` to handle sub-elements
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents a lexing result. It is either a token or an error message.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _    => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
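
// For illustration (an editor's sketch, not literal expansion output): the
// `dispatch_on_enum_state!` invocation in `cdata_started` below expands to
// roughly the following match:
//
//     match s {
//         E => match c {
//             'C' => self.move_to(State::CDataStarted(C)),
//             _   => self.handle_error("<![", c)
//         },
//         // ...analogous arms for C, CD, CDA and CDAT...
//         CDATA => match c {
//             '[' => self.move_to_with(State::Normal, Token::CDataStart),
//             _   => self.handle_error("<![CDATA", c)
//         }
//     }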

/// `Lexer` is a lexer for XML documents, which implements a pull API.
///
/// The main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When the `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. Use the `enable_errors` and `disable_errors` methods
/// to toggle the behavior.
pub struct Lexer {
    pos: TextPosition,
    head_pos: TextPosition,
    char_queue: VecDeque<char>,
    st: State,
    skip_errors: bool,
    inside_comment: bool,
    inside_token: bool,
    eof_handled: bool
}

impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer
    fn position(&self) -> TextPosition { self.pos }
}

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Err(..)`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Ok(Some(Token::Chunk(..)))`
    /// containing the invalid lexeme's content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of the `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Resets the `eof_handled` flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - when the end of the stream is reached;
    /// * `Ok(Some(token)) where token: Token` - when a complete token has been read from the stream.
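    ///
    /// A minimal usage sketch (an editor's illustration; marked `ignore` because this
    /// module is internal and the surrounding setup is elided):
    ///
    /// ```ignore
    /// let mut lexer = Lexer::new();
    /// let mut reader = std::io::Cursor::new(b"<a/>".to_vec());
    /// while let Some(token) = lexer.next_token(&mut reader).unwrap() {
    ///     print!("{}", token);  // prints `<`, `a`, `/>` in turn
    /// }
    /// ```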
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,   // got next char
                None => break,  // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_) | State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        // Advance the head position only for characters taken from the underlying
        // reader; requeued characters have already been counted once.
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal                         => self.normal(c),
            State::TagStarted                     => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted                 => self.comment_started(c),
            State::CDataStarted(s)                => self.cdata_started(c, s),
            State::DoctypeStarted(s)              => self.doctype_started(c, s),
            State::DoctypeFinishing(d)            => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
            State::EmptyTagClosing                => self.empty_element_closing(c),
            State::CommentClosing(s)              => self.comment_closing(c, s),
            State::CDataClosing(s)                => self.cdata_closing(c, s)
        }
    }

    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }
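
    // Recovery in practice (an editor's illustrative note): with errors disabled,
    // the invalid input "<!x" lexes as
    //
    //     Token::Chunk("<!"), Token::Character('x')
    //
    // because `handle_error` pushes the offending 'x' onto `char_queue`, emits the
    // accumulated chunk, and the queued character is then re-lexed from `Normal`.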
    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<'                        => self.move_to(State::TagStarted),
            '>'                        => Ok(Some(Token::TagEnd)),
            '/'                        => self.move_to(State::EmptyTagClosing),
            '='                        => Ok(Some(Token::EqualsSign)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            '?'                        => self.move_to(State::ProcessingInstructionClosing),
            '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _                          => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _                          => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _   => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _   => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// Encountered '<!DOCTYPE'
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<'           => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>'           => self.move_to(State::DoctypeFinishing(d - 1)),
            _             => Ok(None),
        }
    }
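
    // Worked example (an editor's illustrative note): for `<!DOCTYPE a [<!ENTITY b "c">]>`
    // the depth is 1 after `<!DOCTYPE`, 2 at the `<` of `<!ENTITY`, 1 again at the `>`
    // closing it, and the final `>` at depth 1 emits `Token::TagEnd`. Quotes are not
    // tracked here, so a `>` inside a quoted literal in the internal subset also
    // decrements the depth (see `doctype_with_internal_subset_test` below).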
    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
                // a double dash not followed by a greater-than sign is a hard error inside comments
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use common::{Position};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }
 "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::Whitespace(' ') Token::Character('p') Token::EqualsSign Token::SingleQuote Token::Character('q') Token::SingleQuote Token::TagEnd Token::Whitespace(' ') Token::Character('x') Token::OpeningTagStart Token::Character('b') Token::Whitespace(' ') Token::Character('z') Token::EqualsSign Token::DoubleQuote Token::Character('y') Token::DoubleQuote Token::TagEnd Token::Character('d') Token::Whitespace('\t') Token::ClosingTagStart Token::Character('b') Token::TagEnd Token::ClosingTagStart Token::Character('a') Token::TagEnd Token::OpeningTagStart Token::Character('p') Token::EmptyTagEnd Token::Whitespace(' ') Token::ProcessingInstructionStart Token::Character('n') Token::Character('m') Token::Whitespace(' ') Token::ProcessingInstructionEnd Token::Whitespace(' ') Token::CommentStart Token::Whitespace(' ') Token::Character('a') Token::Whitespace(' ') Token::Character('c') Token::Whitespace(' ') Token::CommentEnd Token::Whitespace(' ') Token::ReferenceStart Token::Character('n') Token::Character('b') Token::Character('s') Token::Character('p') Token::ReferenceEnd ); assert_none!(for lex and buf); } #[test] fn special_chars_test() { let (mut lex, mut buf) = make_lex_and_buf( r#"?x!+ // -| ]z]]"# ); assert_oks!(for lex and buf ; Token::Character('?') Token::Character('x') Token::Character('!') Token::Character('+') Token::Whitespace(' ') Token::Character('/') Token::Character('/') Token::Whitespace(' ') Token::Character('-') Token::Character('|') Token::Whitespace(' ') Token::Character(']') Token::Character('z') Token::Chunk("]]") ); assert_none!(for lex and buf); } #[test] fn cdata_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::CDataStart Token::Character('x') Token::Whitespace(' ') Token::Character('y') Token::Whitespace(' ') Token::Character('?') Token::CDataEnd Token::Whitespace(' ') Token::ClosingTagStart Token::Character('a') Token::TagEnd ); assert_none!(for lex and buf); } #[test] fn doctype_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::TagEnd Token::Whitespace(' ') ); assert_none!(for lex and buf) } #[test] fn doctype_with_internal_subset_test() { let (mut lex, mut buf) = make_lex_and_buf( r#" ]> "# ); assert_oks!(for lex and buf ; Token::OpeningTagStart Token::Character('a') Token::TagEnd Token::DoctypeStart Token::TagEnd Token::Whitespace(' ') ); assert_none!(for lex and buf) } #[test] fn end_of_stream_handling_ok() { macro_rules! eof_check( ($data:expr ; $token:expr) => ({ let (mut lex, mut buf) = make_lex_and_buf($data); assert_oks!(for lex and buf ; $token); assert_none!(for lex and buf); }) ); eof_check!("?" ; Token::Character('?')); eof_check!("/" ; Token::Character('/')); eof_check!("-" ; Token::Character('-')); eof_check!("]" ; Token::Character(']')); eof_check!("]]" ; Token::Chunk("]]")); } #[test] fn end_of_stream_handling_error() { macro_rules! 
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr ; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<"         ; 0, 1);
        eof_check!("<!"        ; 0, 2);
        eof_check!("<!-"       ; 0, 3);
        eof_check!("<!["       ; 0, 3);
        eof_check!("<![C"      ; 0, 4);
        eof_check!("<![CD"     ; 0, 5);
        eof_check!("<![CDA"    ; 0, 6);
        eof_check!("<![CDAT"   ; 0, 7);
        eof_check!("<![CDATA"  ; 0, 8);
        eof_check!("--"        ; 0, 2);
        eof_check!("<!DOCTYPE" ; 0, 9);
    }

    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }

    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}