1 //! Contains simple lexer for XML documents.
2 //!
3 //! This module is for internal use. Use `xml::pull` module to do parsing.
4 
5 use std::fmt;
6 use std::collections::VecDeque;
7 use std::io::Read;
8 use std::result;
9 use std::borrow::Cow;
10 
11 use common::{Position, TextPosition, is_whitespace_char, is_name_char};
12 use reader::Error;
13 use util;
14 
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for errors recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Delegate to `as_static_str` so the token -> text table lives in
        // exactly one place and the two representations cannot drift apart.
        match self.as_static_str() {
            Some(s) => write!(f, "{}", s),
            None => match *self {
                // Only the character-carrying variants have no static text
                Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
                _ => unreachable!()
            }
        }
    }
}

impl Token {
    /// Returns the fixed textual representation of this token, if it has one.
    ///
    /// `Character` and `Whitespace` carry an arbitrary `char` and therefore
    /// return `None`; every other variant (including `Chunk`, which already
    /// stores a `&'static str`) maps to a static string.
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart            => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart               => Some("<!DOCTYPE"),
            Token::ClosingTagStart            => Some("</"),
            Token::CommentStart               => Some("<!--"),
            Token::CDataStart                 => Some("<![CDATA["),
            Token::TagEnd                     => Some(">"),
            Token::EmptyTagEnd                => Some("/>"),
            Token::ProcessingInstructionEnd   => Some("?>"),
            Token::CommentEnd                 => Some("-->"),
            Token::CDataEnd                   => Some("]]>"),
            Token::ReferenceStart             => Some("&"),
            Token::ReferenceEnd               => Some(";"),
            Token::EqualsSign                 => Some("="),
            Token::SingleQuote                => Some("'"),
            Token::DoubleQuote                => Some("\""),
            Token::Chunk(s)                   => Some(s),
            _                                 => None
        }
    }

    /// Appends the textual representation of this token to `target`.
    ///
    /// Using `String::push_str(token.to_string())` is simply way too slow:
    /// this method avoids the intermediate allocation made by `to_string`.
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    // `as_static_str` covers every other variant
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}
145 
/// Internal lexer state machine. `Normal` is the start state; every other
/// variant records which token prefix has already been consumed.
enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

/// Progress through a two-character closing sequence (`--` or `]]`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the `DOCTYPE` keyword; each variant name spells out how
/// much of the keyword has been matched so far.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the `CDATA` keyword (after `<![`); `E` is the empty
/// prefix, i.e. nothing of the keyword has been matched yet.
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
185 
/// `Result` represents lexing result. It is either a token or an error message.
///
/// `Ok(None)` means that the end of the stream has been reached.
pub type Result = result::Result<Option<Token>, Error>;
188 
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Takes the lexer (`$_self`), the current substate (`$s`), the current
/// character (`$c`) and the substate-to-state constructor (`$is`), followed
/// by rows of `substate ; expected char ; next substate ; chunk read so far`
/// and one final row whose expression `$e` runs once the whole token has
/// matched. Any unexpected character is routed to `handle_error` together
/// with the chunk consumed so far, enabling error recovery.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _  => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
209 
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods
/// to toggle the behavior.
pub struct Lexer {
    /// Position of the last token produced.
    pos: TextPosition,
    /// Position of the read head; advanced for every character consumed.
    head_pos: TextPosition,
    /// Characters pushed back for re-reading (unconsumed lookahead).
    char_queue: VecDeque<char>,
    /// Current state of the lexing state machine.
    st: State,
    /// When set, invalid lexemes are emitted as `Chunk`s instead of errors.
    skip_errors: bool,
    /// When set, lexemes are handled as comment internals (see `handle_error`).
    inside_comment: bool,
    /// True while a token is being accumulated across multiple characters.
    inside_token: bool,
    /// Set once end of stream has been reported, so it is reported only once.
    eof_handled: bool
}
229 
impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer.
    fn position(&self) -> TextPosition { self.pos }
}
235 
236 impl Lexer {
237     /// Returns a new lexer with default state.
new() -> Lexer238     pub fn new() -> Lexer {
239         Lexer {
240             pos: TextPosition::new(),
241             head_pos: TextPosition::new(),
242             char_queue: VecDeque::with_capacity(4),  // TODO: check size
243             st: State::Normal,
244             skip_errors: false,
245             inside_comment: false,
246             inside_token: false,
247             eof_handled: false
248         }
249     }
250 
    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme. This is the default behavior.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals (see `handle_error` and `comment_closing` for the affected cases).
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Reset the eof handled flag of the lexer, allowing `next_token` to be
    /// called again after it has already reported end of stream.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
273 
274     /// Tries to read the next token from the buffer.
275     ///
276     /// It is possible to pass different instaces of `BufReader` each time
277     /// this method is called, but the resulting behavior is undefined in this case.
278     ///
279     /// Return value:
280     /// * `Err(reason) where reason: reader::Error` - when an error occurs;
281     /// * `Ok(None)` - upon end of stream is reached;
282     /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
next_token<B: Read>(&mut self, b: &mut B) -> Result283     pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
284         // Already reached end of buffer
285         if self.eof_handled {
286             return Ok(None);
287         }
288 
289         if !self.inside_token {
290             self.pos = self.head_pos;
291             self.inside_token = true;
292         }
293 
294         // Check if we have saved a char or two for ourselves
295         while let Some(c) = self.char_queue.pop_front() {
296             match try!(self.read_next_token(c)) {
297                 Some(t) => {
298                     self.inside_token = false;
299                     return Ok(Some(t));
300                 }
301                 None => {}  // continue
302             }
303         }
304 
305         loop {
306             // TODO: this should handle multiple encodings
307             let c = match try!(util::next_char_from(b)) {
308                 Some(c) => c,   // got next char
309                 None => break,  // nothing to read left
310             };
311 
312             match try!(self.read_next_token(c)) {
313                 Some(t) => {
314                     self.inside_token = false;
315                     return Ok(Some(t));
316                 }
317                 None => {
318                     // continue
319                 }
320             }
321         }
322 
323         // Handle end of stream
324         self.eof_handled = true;
325         self.pos = self.head_pos;
326         match self.st {
327             State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
328             State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
329             State::CommentClosing(ClosingSubstate::Second) |
330             State::DoctypeFinishing(_) =>
331                 Err(self.error("Unexpected end of stream")),
332             State::ProcessingInstructionClosing =>
333                 Ok(Some(Token::Character('?'))),
334             State::EmptyTagClosing =>
335                 Ok(Some(Token::Character('/'))),
336             State::CommentClosing(ClosingSubstate::First) =>
337                 Ok(Some(Token::Character('-'))),
338             State::CDataClosing(ClosingSubstate::First) =>
339                 Ok(Some(Token::Character(']'))),
340             State::CDataClosing(ClosingSubstate::Second) =>
341                 Ok(Some(Token::Chunk("]]"))),
342             State::Normal =>
343                 Ok(None)
344         }
345     }
346 
    /// Builds an `Error` from a message, attaching the current lexer position
    /// (relies on `Error`'s conversion from a `(&Lexer, message)` pair,
    /// defined in the `reader` module).
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }
351 
352     #[inline]
read_next_token(&mut self, c: char) -> Result353     fn read_next_token(&mut self, c: char) -> Result {
354         let res = self.dispatch_char(c);
355         if self.char_queue.is_empty() {
356             if c == '\n' {
357                 self.head_pos.new_line();
358             } else {
359                 self.head_pos.advance(1);
360             }
361         }
362         res
363     }
364 
dispatch_char(&mut self, c: char) -> Result365     fn dispatch_char(&mut self, c: char) -> Result {
366         match self.st {
367             State::Normal                         => self.normal(c),
368             State::TagStarted                     => self.tag_opened(c),
369             State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
370             State::CommentStarted                 => self.comment_started(c),
371             State::CDataStarted(s)                => self.cdata_started(c, s),
372             State::DoctypeStarted(s)              => self.doctype_started(c, s),
373             State::DoctypeFinishing(d)            => self.doctype_finishing(c, d),
374             State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
375             State::EmptyTagClosing                => self.empty_element_closing(c),
376             State::CommentClosing(s)              => self.comment_closing(c, s),
377             State::CDataClosing(s)                => self.cdata_closing(c, s)
378         }
379     }
380 
    /// Switches to state `st` without producing a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Switches to state `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    /// Switches to state `st`, emits `token`, and pushes `cs` back so they
    /// are re-read before any further input from the stream.
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }
398 
handle_error(&mut self, chunk: &'static str, c: char) -> Result399     fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
400         self.char_queue.push_back(c);
401         if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
402             self.move_to_with(State::Normal, Token::Chunk(chunk))
403         } else {
404             Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
405         }
406     }
407 
408     /// Encountered a char
normal(&mut self, c: char) -> Result409     fn normal(&mut self, c: char) -> Result {
410         match c {
411             '<'                        => self.move_to(State::TagStarted),
412             '>'                        => Ok(Some(Token::TagEnd)),
413             '/'                        => self.move_to(State::EmptyTagClosing),
414             '='                        => Ok(Some(Token::EqualsSign)),
415             '"'                        => Ok(Some(Token::DoubleQuote)),
416             '\''                       => Ok(Some(Token::SingleQuote)),
417             '?'                        => self.move_to(State::ProcessingInstructionClosing),
418             '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
419             ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
420             '&'                        => Ok(Some(Token::ReferenceStart)),
421             ';'                        => Ok(Some(Token::ReferenceEnd)),
422             _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
423             _                          => Ok(Some(Token::Character(c)))
424         }
425     }
426 
427     /// Encountered '<'
tag_opened(&mut self, c: char) -> Result428     fn tag_opened(&mut self, c: char) -> Result {
429         match c {
430             '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
431             '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
432             '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
433             _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
434             _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
435             _                          => self.handle_error("<", c)
436         }
437     }
438 
    /// Encountered '<!'; decides between a comment (`<!-`), a CDATA section
    /// (`<![`) and a DOCTYPE declaration (`<!D`) based on the next character.
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _   => self.handle_error("<!", c)
        }
    }
448 
449     /// Encountered '<!-'
comment_started(&mut self, c: char) -> Result450     fn comment_started(&mut self, c: char) -> Result {
451         match c {
452             '-' => self.move_to_with(State::Normal, Token::CommentStart),
453             _   => self.handle_error("<!-", c)
454         }
455     }
456 
    /// Encountered '<!['; walks through the `CDATA[` literal one character
    /// at a time via the dispatch table macro.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }
469 
    /// Encountered '<!D'; walks through the rest of the `DOCTYPE` literal one
    /// character at a time via the dispatch table macro.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }
482 
    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    ///
    /// `d` is the current `<`/`>` nesting depth (1 for the DOCTYPE tag
    /// itself, incremented for markup declarations of the internal subset).
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            // the '>' matching the initial '<!DOCTYPE' ends the declaration
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            // everything else inside the declaration is swallowed
            _ => Ok(None),
        }
    }
492 
493     /// Encountered '?'
processing_instruction_closing(&mut self, c: char) -> Result494     fn processing_instruction_closing(&mut self, c: char) -> Result {
495         match c {
496             '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
497             _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
498         }
499     }
500 
501     /// Encountered '/'
empty_element_closing(&mut self, c: char) -> Result502     fn empty_element_closing(&mut self, c: char) -> Result {
503         match c {
504             '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
505             _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
506         }
507     }
508 
509     /// Encountered '-'
comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result510     fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
511         match s {
512             ClosingSubstate::First => match c {
513                 '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
514                 _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
515             },
516             ClosingSubstate::Second => match c {
517                 '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
518                 // double dash not followed by a greater-than is a hard error inside comment
519                 _ if self.inside_comment => self.handle_error("--", c),
520                 // nothing else except comment closing starts with a double dash, and comment
521                 // closing can never be after another dash, and also we're outside of a comment,
522                 // therefore it is safe to push only the last read character to the list of unread
523                 // characters and pass the double dash directly to the output
524                 _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
525             }
526         }
527     }
528 
529     /// Encountered ']'
cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result530     fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
531         match s {
532             ClosingSubstate::First => match c {
533                 ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
534                 _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
535             },
536             ClosingSubstate::Second => match c {
537                 '>' => self.move_to_with(State::Normal, Token::CDataEnd),
538                 _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
539             }
540         }
541     }
542 }
543 
544 #[cfg(test)]
545 mod tests {
546     use common::{Position};
547     use std::io::{BufReader, Cursor};
548 
549     use super::{Lexer, Token};
550 
    // Asserts that the lexer produces exactly the given sequence of tokens.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
             )+
        })
    );

    // Asserts that the next lexer result is an error with the given
    // position (row, column) and message.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    // Asserts that the lexer reports end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    // Builds a fresh lexer plus a buffered reader over the given input string.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }
579 
    // Smoke test: a small document exercising tags, attributes, a processing
    // instruction, a comment and an entity reference.
    #[test]
    fn simple_lexer_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }
642 
    // Special characters outside of markup must come through as plain
    // characters; a lone "]]" is recovered as a chunk.
    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }
667 
    // A CDATA section is delimited correctly; its contents are lexed as
    // plain characters (including '?').
    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
692 
    // The interior of a DOCTYPE declaration is swallowed; only the start
    // token and the closing '>' are produced.
    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }
708 
    // Nested markup declarations inside the DOCTYPE internal subset are
    // tracked via the depth counter and swallowed as well.
    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }
724 
    // Incomplete two-character sequences at EOF degrade gracefully into
    // characters/chunks instead of errors.
    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?"  ; Token::Character('?'));
        eof_check!("/"  ; Token::Character('/'));
        eof_check!("-"  ; Token::Character('-'));
        eof_check!("]"  ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }
740 
    // Incomplete multi-character tokens at EOF must be reported as
    // "Unexpected end of stream" at the right position.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<"        ; 0, 1);
        eof_check!("<!"       ; 0, 2);
        eof_check!("<!-"      ; 0, 3);
        eof_check!("<!["      ; 0, 3);
        eof_check!("<![C"     ; 0, 4);
        eof_check!("<![CD"    ; 0, 5);
        eof_check!("<![CDA"   ; 0, 6);
        eof_check!("<![CDAT"  ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--"       ; 0, 2);
    }
761 
    // An invalid "<!" prefix is an error by default, but becomes a chunk
    // when error skipping is enabled.
    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }
777 
778     #[test]
error_in_comment_started()779     fn error_in_comment_started() {
780         let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
781         assert_err!(for lex and buf expect row 0 ; 0,
782             "Unexpected token '<!-' before '\t'"
783         );
784 
785         let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
786         lex.disable_errors();
787         assert_oks!(for lex and buf ;
788             Token::Chunk("<!-")
789             Token::Whitespace('\t')
790         );
791         assert_none!(for lex and buf);
792     }
793 
794     #[test]
error_in_comment_two_dashes_not_at_end()795     fn error_in_comment_two_dashes_not_at_end() {
796         let (mut lex, mut buf) = make_lex_and_buf("--x");
797         lex.inside_comment();
798         assert_err!(for lex and buf expect row 0; 0,
799             "Unexpected token '--' before 'x'"
800         );
801 
802         let (mut lex, mut buf) = make_lex_and_buf("--x");
803         assert_oks!(for lex and buf ;
804             Token::Chunk("--")
805             Token::Character('x')
806         );
807     }
808 
    // Checks one error-recovery case: lexing `$data` must first fail with
    // message `$s` at row `$r`, column `$c`; with errors disabled, the same
    // input must instead recover into `Token::Chunk($chunk)` followed by
    // `Token::Character($app)`, with nothing left in the stream.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            // Default mode: the malformed prefix is an error.
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            // Recovery mode: the prefix degrades into a chunk + character.
            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );
823 
    #[test]
    fn error_in_cdata_started() {
        // Every proper prefix of "<![CDATA[" followed by a character that
        // breaks the opener must error (and recover as a chunk when errors
        // are disabled — see `check_case`).
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }
833 
    #[test]
    fn error_in_doctype_started() {
        // Every proper prefix of "<!DOCTYPE" followed by a character that
        // breaks the opener must error (and recover as a chunk when errors
        // are disabled — see `check_case`).
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }
843 
844 
845 
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        // Regression test: CDATA content whose last data character is ']'
        // (i.e. the section ends in "]]]>") must emit that ']' as an
        // ordinary `Character` token immediately before `CDataEnd`, rather
        // than mis-detecting the "]]>" terminator.
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
867 }
868