1 //! Contains simple lexer for XML documents.
2 //!
3 //! This module is for internal use. Use `xml::pull` module to do parsing.
4 
5 use std::fmt;
6 use std::collections::VecDeque;
7 use std::io::Read;
8 use std::result;
9 use std::borrow::Cow;
10 
11 use common::{Position, TextPosition, is_whitespace_char, is_name_char};
12 use reader::Error;
13 use util;
14 
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}
58 
59 impl fmt::Display for Token {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result60     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
61         match *self {
62             Token::Chunk(s)                            => write!(f, "{}", s),
63             Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
64             other => write!(f, "{}", match other {
65                 Token::OpeningTagStart            => "<",
66                 Token::ProcessingInstructionStart => "<?",
67                 Token::DoctypeStart               => "<!DOCTYPE",
68                 Token::ClosingTagStart            => "</",
69                 Token::CommentStart               => "<!--",
70                 Token::CDataStart                 => "<![CDATA[",
71                 Token::TagEnd                     => ">",
72                 Token::EmptyTagEnd                => "/>",
73                 Token::ProcessingInstructionEnd   => "?>",
74                 Token::CommentEnd                 => "-->",
75                 Token::CDataEnd                   => "]]>",
76                 Token::ReferenceStart             => "&",
77                 Token::ReferenceEnd               => ";",
78                 Token::EqualsSign                 => "=",
79                 Token::SingleQuote                => "'",
80                 Token::DoubleQuote                => "\"",
81                 _                          => unreachable!()
82             })
83         }
84     }
85 }
86 
87 impl Token {
as_static_str(&self) -> Option<&'static str>88     pub fn as_static_str(&self) -> Option<&'static str> {
89         match *self {
90             Token::OpeningTagStart            => Some("<"),
91             Token::ProcessingInstructionStart => Some("<?"),
92             Token::DoctypeStart               => Some("<!DOCTYPE"),
93             Token::ClosingTagStart            => Some("</"),
94             Token::CommentStart               => Some("<!--"),
95             Token::CDataStart                 => Some("<![CDATA["),
96             Token::TagEnd                     => Some(">"),
97             Token::EmptyTagEnd                => Some("/>"),
98             Token::ProcessingInstructionEnd   => Some("?>"),
99             Token::CommentEnd                 => Some("-->"),
100             Token::CDataEnd                   => Some("]]>"),
101             Token::ReferenceStart             => Some("&"),
102             Token::ReferenceEnd               => Some(";"),
103             Token::EqualsSign                 => Some("="),
104             Token::SingleQuote                => Some("'"),
105             Token::DoubleQuote                => Some("\""),
106             Token::Chunk(s)                   => Some(s),
107             _                                 => None
108         }
109     }
110 
111     // using String.push_str(token.to_string()) is simply way too slow
push_to_string(&self, target: &mut String)112     pub fn push_to_string(&self, target: &mut String) {
113         match self.as_static_str() {
114             Some(s) => { target.push_str(s); }
115             None => {
116                 match *self {
117                     Token::Character(c) | Token::Whitespace(c) => target.push(c),
118                     _ => unreachable!()
119                 }
120             }
121         }
122     }
123 
124     /// Returns `true` if this token contains data that can be interpreted
125     /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
126     #[inline]
contains_char_data(&self) -> bool127     pub fn contains_char_data(&self) -> bool {
128         match *self {
129             Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
130             Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote => true,
131             _ => false
132         }
133     }
134 
135     /// Returns `true` if this token corresponds to a white space character.
136     #[inline]
is_whitespace(&self) -> bool137     pub fn is_whitespace(&self) -> bool {
138         match *self {
139             Token::Whitespace(_) => true,
140             _ => false
141         }
142     }
143 }
144 
/// Internal state of the lexer's finite state machine.
enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements. The counter is
    /// the current nesting depth of unclosed `<` brackets inside the doctype
    /// (see `doctype_finishing`).
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}
169 
/// Tracks how many characters of a two-character closing sequence
/// (`--` for comments, `]]` for CDATA) have been seen so far.
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}
174 
/// Tracks progress through the literal `DOCTYPE` keyword after `<!`;
/// each variant names the prefix matched so far.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}
179 
/// Tracks progress through the literal `CDATA` keyword after `<![`;
/// `E` is the empty (initial) substate before any keyword character.
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
184 
/// `Result` represents a lexing result. It is either an optional token
/// (`None` signals end of stream) or an error.
pub type Result = result::Result<Option<Token>, Error>;
187 
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// For each intermediate substate `$st`, the expected character `$stc` moves
/// the lexer to the next substate `$next_st` (wrapped into a `State` via `$is`);
/// any other character is handled as an error whose recovery chunk is `$chunk`
/// (the prefix recognized so far). In the final substate `$end_st`, seeing
/// `$end_c` evaluates `$e` instead of transitioning further.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _  => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
208 
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods
/// to toggle the behavior.
pub struct Lexer {
    /// Position of the last token produced; reported via `Position::position`.
    pos: TextPosition,
    /// Position of the read head, i.e. of the next character to be consumed.
    head_pos: TextPosition,
    /// Characters pushed back for re-reading (used for lookahead rollback
    /// and error recovery; see `move_to_with_unread` / `handle_error`).
    char_queue: VecDeque<char>,
    /// Current state of the lexing state machine.
    st: State,
    /// When `true`, invalid lexemes are emitted as `Token::Chunk`
    /// instead of producing errors.
    skip_errors: bool,
    /// When `true`, a `--` not followed by `>` is treated as a hard error
    /// (set while lexing comment internals).
    inside_comment: bool,
    /// `true` while in the middle of a multi-character token; keeps `pos`
    /// anchored at the token's first character across `next_token` calls.
    inside_token: bool,
    /// `true` once end of stream has been reported, so subsequent calls
    /// return `Ok(None)` immediately.
    eof_handled: bool
}
228 
impl Position for Lexer {
    /// Returns the position of the last token produced by the lexer.
    #[inline]
    fn position(&self) -> TextPosition { self.pos }
}
234 
impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Reset the eof handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instaces of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - upon end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        // Anchor `pos` at the first character of the token we are about to
        // produce; it stays fixed until the token is complete.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        // (characters unread via the queue are re-lexed before fresh input).
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,   // got next char
                None => break,  // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream: the state machine may have stopped in the
        // middle of a partially recognized lexeme; decide what that prefix
        // means when no more input can follow.
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            // A prefix that cannot form a complete token on its own.
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            // Lone '?', '/', '-', ']' degrade to plain character tokens.
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    /// Builds a positioned lexer error from a message.
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    /// Feeds one character to the state machine and advances the head position.
    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        // Only advance the head position for characters freshly read from the
        // stream; characters replayed from `char_queue` were already counted.
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    /// Routes a character to the handler for the current state.
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal                         => self.normal(c),
            State::TagStarted                     => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted                 => self.comment_started(c),
            State::CDataStarted(s)                => self.cdata_started(c, s),
            State::DoctypeStarted(s)              => self.doctype_started(c, s),
            State::DoctypeFinishing(d)            => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
            State::EmptyTagClosing                => self.empty_element_closing(c),
            State::CommentClosing(s)              => self.comment_closing(c, s),
            State::CDataClosing(s)                => self.cdata_closing(c, s)
        }
    }

    /// Transitions to `st` without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Transitions to `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    /// Transitions to `st`, emits `token`, and queues `cs` to be re-lexed
    /// before any fresh input (i.e. "unreads" them).
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    /// Handles an unexpected character `c` after a partially matched `chunk`.
    ///
    /// The offending character is pushed back so it is re-lexed after the
    /// chunk is dealt with. Depending on `skip_errors`, the chunk is either
    /// emitted as a `Token::Chunk` or reported as an error.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<'                        => self.move_to(State::TagStarted),
            '>'                        => Ok(Some(Token::TagEnd)),
            '/'                        => self.move_to(State::EmptyTagClosing),
            '='                        => Ok(Some(Token::EqualsSign)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            '?'                        => self.move_to(State::ProcessingInstructionClosing),
            '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _                          => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            // '<' followed by a name or whitespace starts an opening tag;
            // the character after '<' is unread so it becomes its own token.
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _                          => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _   => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _   => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    ///
    /// `d` is the current `<`/`>` nesting depth; everything inside the
    /// doctype (internal subset included) is swallowed without emitting
    /// tokens until the matching outermost '>' is found.
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            _ => Ok(None),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            // Not '?>': emit the '?' as a plain character and re-lex `c`.
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            // Not '/>': emit the '/' as a plain character and re-lex `c`.
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                // Not ']]>': emit only the first ']' and re-lex the second
                // ']' together with `c`, since ']]' might still start a
                // ']]>' later in the stream.
                _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}
542 
543 #[cfg(test)]
544 mod tests {
545     use common::{Position};
546     use std::io::{BufReader, Cursor};
547 
548     use super::{Lexer, Token};
549 
550     macro_rules! assert_oks(
551         (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
552             $(
553                 assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
554              )+
555         })
556     );
557 
558     macro_rules! assert_err(
559         (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
560             let err = $lex.next_token(&mut $buf);
561             assert!(err.is_err());
562             let err = err.unwrap_err();
563             assert_eq!($r as u64, err.position().row);
564             assert_eq!($c as u64, err.position().column);
565             assert_eq!($s, err.msg());
566         })
567     );
568 
569     macro_rules! assert_none(
570         (for $lex:ident and $buf:ident) => (
571             assert_eq!(Ok(None), $lex.next_token(&mut $buf));
572         )
573     );
574 
make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>)575     fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
576         (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
577     }
578 
579     #[test]
simple_lexer_test()580     fn simple_lexer_test() {
581         let (mut lex, mut buf) = make_lex_and_buf(
582             r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
583         );
584 
585         assert_oks!(for lex and buf ;
586             Token::OpeningTagStart
587             Token::Character('a')
588             Token::Whitespace(' ')
589             Token::Character('p')
590             Token::EqualsSign
591             Token::SingleQuote
592             Token::Character('q')
593             Token::SingleQuote
594             Token::TagEnd
595             Token::Whitespace(' ')
596             Token::Character('x')
597             Token::OpeningTagStart
598             Token::Character('b')
599             Token::Whitespace(' ')
600             Token::Character('z')
601             Token::EqualsSign
602             Token::DoubleQuote
603             Token::Character('y')
604             Token::DoubleQuote
605             Token::TagEnd
606             Token::Character('d')
607             Token::Whitespace('\t')
608             Token::ClosingTagStart
609             Token::Character('b')
610             Token::TagEnd
611             Token::ClosingTagStart
612             Token::Character('a')
613             Token::TagEnd
614             Token::OpeningTagStart
615             Token::Character('p')
616             Token::EmptyTagEnd
617             Token::Whitespace(' ')
618             Token::ProcessingInstructionStart
619             Token::Character('n')
620             Token::Character('m')
621             Token::Whitespace(' ')
622             Token::ProcessingInstructionEnd
623             Token::Whitespace(' ')
624             Token::CommentStart
625             Token::Whitespace(' ')
626             Token::Character('a')
627             Token::Whitespace(' ')
628             Token::Character('c')
629             Token::Whitespace(' ')
630             Token::CommentEnd
631             Token::Whitespace(' ')
632             Token::ReferenceStart
633             Token::Character('n')
634             Token::Character('b')
635             Token::Character('s')
636             Token::Character('p')
637             Token::ReferenceEnd
638         );
639         assert_none!(for lex and buf);
640     }
641 
642     #[test]
special_chars_test()643     fn special_chars_test() {
644         let (mut lex, mut buf) = make_lex_and_buf(
645             r#"?x!+ // -| ]z]]"#
646         );
647 
648         assert_oks!(for lex and buf ;
649             Token::Character('?')
650             Token::Character('x')
651             Token::Character('!')
652             Token::Character('+')
653             Token::Whitespace(' ')
654             Token::Character('/')
655             Token::Character('/')
656             Token::Whitespace(' ')
657             Token::Character('-')
658             Token::Character('|')
659             Token::Whitespace(' ')
660             Token::Character(']')
661             Token::Character('z')
662             Token::Chunk("]]")
663         );
664         assert_none!(for lex and buf);
665     }
666 
667     #[test]
cdata_test()668     fn cdata_test() {
669         let (mut lex, mut buf) = make_lex_and_buf(
670             r#"<a><![CDATA[x y ?]]> </a>"#
671         );
672 
673         assert_oks!(for lex and buf ;
674             Token::OpeningTagStart
675             Token::Character('a')
676             Token::TagEnd
677             Token::CDataStart
678             Token::Character('x')
679             Token::Whitespace(' ')
680             Token::Character('y')
681             Token::Whitespace(' ')
682             Token::Character('?')
683             Token::CDataEnd
684             Token::Whitespace(' ')
685             Token::ClosingTagStart
686             Token::Character('a')
687             Token::TagEnd
688         );
689         assert_none!(for lex and buf);
690     }
691 
692     #[test]
doctype_test()693     fn doctype_test() {
694         let (mut lex, mut buf) = make_lex_and_buf(
695             r#"<a><!DOCTYPE ab xx z> "#
696         );
697         assert_oks!(for lex and buf ;
698             Token::OpeningTagStart
699             Token::Character('a')
700             Token::TagEnd
701             Token::DoctypeStart
702             Token::TagEnd
703             Token::Whitespace(' ')
704         );
705         assert_none!(for lex and buf)
706     }
707 
708     #[test]
doctype_with_internal_subset_test()709     fn doctype_with_internal_subset_test() {
710         let (mut lex, mut buf) = make_lex_and_buf(
711             r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
712         );
713         assert_oks!(for lex and buf ;
714             Token::OpeningTagStart
715             Token::Character('a')
716             Token::TagEnd
717             Token::DoctypeStart
718             Token::TagEnd
719             Token::Whitespace(' ')
720         );
721         assert_none!(for lex and buf)
722     }
723 
724     #[test]
end_of_stream_handling_ok()725     fn end_of_stream_handling_ok() {
726         macro_rules! eof_check(
727             ($data:expr ; $token:expr) => ({
728                 let (mut lex, mut buf) = make_lex_and_buf($data);
729                 assert_oks!(for lex and buf ; $token);
730                 assert_none!(for lex and buf);
731             })
732         );
733         eof_check!("?"  ; Token::Character('?'));
734         eof_check!("/"  ; Token::Character('/'));
735         eof_check!("-"  ; Token::Character('-'));
736         eof_check!("]"  ; Token::Character(']'));
737         eof_check!("]]" ; Token::Chunk("]]"));
738     }
739 
740     #[test]
end_of_stream_handling_error()741     fn end_of_stream_handling_error() {
742         macro_rules! eof_check(
743             ($data:expr; $r:expr, $c:expr) => ({
744                 let (mut lex, mut buf) = make_lex_and_buf($data);
745                 assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
746                 assert_none!(for lex and buf);
747             })
748         );
749         eof_check!("<"        ; 0, 1);
750         eof_check!("<!"       ; 0, 2);
751         eof_check!("<!-"      ; 0, 3);
752         eof_check!("<!["      ; 0, 3);
753         eof_check!("<![C"     ; 0, 4);
754         eof_check!("<![CD"    ; 0, 5);
755         eof_check!("<![CDA"   ; 0, 6);
756         eof_check!("<![CDAT"  ; 0, 7);
757         eof_check!("<![CDATA" ; 0, 8);
758         eof_check!("--"       ; 0, 2);
759     }
760 
    /// An invalid character right after `<!` (here `x`, which starts neither
    /// `<!--`, `<![CDATA[`, nor `<!DOCTYPE`) is an error by default; with
    /// errors disabled the lexer recovers by emitting the consumed prefix as
    /// a `Chunk` followed by the offending character.
    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        // Same input in recovery mode: prefix is flushed, then 'x' passes through.
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }
776 
    /// A `<!-` prefix not followed by the second `-` of `<!--` is an error by
    /// default; with errors disabled the lexer emits the prefix as a `Chunk`
    /// and then the next character (here a tab, classified as `Whitespace`).
    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        // Same input in recovery mode.
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }
792 
    /// Inside a comment (signalled via `inside_comment()`), `--` not
    /// immediately followed by `>` is an error, per the XML rule that `--`
    /// may not appear within comment content. Outside comment context the
    /// same `--x` input lexes fine as a `Chunk("--")` plus a character.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        // Same input without comment context: no error, no disable_errors() needed.
        // NOTE(review): sibling tests end with `assert_none!`; this one omits it —
        // presumably an oversight, but left unchanged to preserve test behavior.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }
807 
    // Shared helper for the prefix-error tests below.
    //
    // Parameters:
    //   $chunk - the partially-matched prefix the lexer should flush as a
    //            `Token::Chunk` in recovery mode (e.g. "<![CD");
    //   $app   - the unexpected character that followed the prefix;
    //   $data  - the full input string ($chunk followed by $app);
    //   $r, $c - expected error row and column;
    //   $s     - expected error message.
    //
    // First pass: errors enabled, the input must fail with exactly ($r, $c, $s).
    // Second pass: errors disabled, the lexer must recover by emitting
    // Chunk($chunk) then Character($app), and then reach end of input.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );
822 
    /// Every proper prefix of `<![CDATA[` followed by a character that breaks
    /// the match must error (or, in recovery mode, flush the prefix as a
    /// chunk) — one case per prefix length from `<![` up to `<![CDATA`.
    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }
832 
    /// Every proper prefix of `<!DOCTYPE` followed by a character that breaks
    /// the match must error (or, in recovery mode, flush the prefix as a
    /// chunk) — one case per prefix length from `<!D` up to `<!DOCTYP`.
    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }
842 
843 
844 
    /// Regression test for issue #98: CDATA content that ends with `]` just
    /// before the `]]>` terminator (`…Bar]]]>`). The lexer must emit the
    /// first `]` as an ordinary `Character` and only consume the final `]]>`
    /// as `CDataEnd`, instead of mis-tokenizing the `]]]` run.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
866 }
867