1 //! Contains simple lexer for XML documents.
2 //!
//! This module is for internal use. Use the `xml::reader` module to do parsing.
4 
5 use std::fmt;
6 use std::collections::VecDeque;
7 use std::io::Read;
8 use std::result;
9 use std::borrow::Cow;
10 
11 use common::{Position, TextPosition, is_whitespace_char, is_name_char};
12 use reader::Error;
13 use util;
14 
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for error recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
            // Every other variant (including `Chunk`) has a fixed textual form.
            // Delegating to `as_static_str` keeps the token -> string table in
            // exactly one place, so new variants cannot silently diverge between
            // `Display` and `as_static_str`.
            other => write!(f, "{}", other.as_static_str()
                .expect("all tokens except Character and Whitespace have a static representation"))
        }
    }
}

impl Token {
    /// Returns the fixed string form of this token, or `None` for
    /// `Character` and `Whitespace`, which carry an arbitrary `char`.
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart            => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart               => Some("<!DOCTYPE"),
            Token::ClosingTagStart            => Some("</"),
            Token::CommentStart               => Some("<!--"),
            Token::CDataStart                 => Some("<![CDATA["),
            Token::TagEnd                     => Some(">"),
            Token::EmptyTagEnd                => Some("/>"),
            Token::ProcessingInstructionEnd   => Some("?>"),
            Token::CommentEnd                 => Some("-->"),
            Token::CDataEnd                   => Some("]]>"),
            Token::ReferenceStart             => Some("&"),
            Token::ReferenceEnd               => Some(";"),
            Token::EqualsSign                 => Some("="),
            Token::SingleQuote                => Some("'"),
            Token::DoubleQuote                => Some("\""),
            Token::Chunk(s)                   => Some(s),
            _                                 => None
        }
    }

    // using String.push_str(token.to_string()) is simply way too slow
    /// Appends the textual form of this token to `target` without going
    /// through the `Display` formatting machinery (avoids an intermediate
    /// allocation per token).
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => { target.push_str(s); }
            None => {
                match *self {
                    // The only variants without a static form carry a single char.
                    Token::Character(c) | Token::Whitespace(c) => target.push(c),
                    _ => unreachable!()
                }
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text. Surprisingly, this also means '>' and '=' and '"' and "'" and '-->'.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) | Token::CommentEnd |
            Token::TagEnd | Token::EqualsSign | Token::DoubleQuote | Token::SingleQuote => true,
            _ => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}
144 
/// Internal lexer automaton state. The lexer stays in `Normal` until it sees
/// a character that may begin a multi-character lexeme; intermediate states
/// remember the prefix read so far.
enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}
167 
/// Tracks how many characters of a two-character closing sequence
/// (`--` for comments, `]]` for CDATA) have been consumed so far.
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}
172 
/// Progress through the `<!DOCTYPE` keyword: each variant names the letters
/// matched so far after the `<!` prefix.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}
177 
/// Progress through the `CDATA` keyword: `E` means only `<![` has been seen;
/// the other variants name the letters matched so far.
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
182 
/// `Result` represents lexing result. It is either a token or an error message.
///
/// `Ok(None)` signals that the end of the input stream has been reached
/// (see `Lexer::next_token`).
pub type Result = result::Result<Option<Token>, Error>;
185 
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Arguments: the lexer (`$_self`), the current substate value, the current
/// character, and the `State` constructor wrapping the substate; then one
/// `substate ; expected char ; next substate ; chunk-read-so-far` row per
/// intermediate letter, and a final row whose last field is the expression
/// to evaluate when the whole token has been matched.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            // Intermediate substates: on the expected character advance to the
            // next substate, otherwise report the chunk read so far as an error.
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _  => $_self.handle_error($chunk, $c)
            },
            )+
            // Final substate: on the expected character run the completion
            // expression, otherwise report the full chunk as an error.
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
206 
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods
/// to toggle the behavior.
pub struct Lexer {
    // Position of the last token produced (snapshot of `head_pos` taken when
    // the token's first character is read).
    pos: TextPosition,
    // Position of the read head, advanced as characters are consumed.
    head_pos: TextPosition,
    // Characters pushed back for reprocessing (e.g. after error recovery).
    char_queue: VecDeque<char>,
    // Current state of the lexing automaton.
    st: State,
    // When true, invalid lexemes are emitted as `Chunk`s instead of errors.
    skip_errors: bool,
    // When true, `--` handling follows comment-interior rules (see `handle_error`).
    inside_comment: bool,
    // True while a multi-character token is partially read; keeps `pos` stable.
    inside_token: bool,
    // True once end of stream has been reported, so it is reported only once.
    eof_handled: bool
}
226 
impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer
    /// (`pos` is snapshotted from `head_pos` at the start of each token
    /// in `next_token`).
    fn position(&self) -> TextPosition { self.pos }
}
232 
impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Some(Err(..))`
    /// upon invalid lexeme.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return `Some(Chunk(..))`
    /// upon invalid lexeme with this lexeme content.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of `inside_comment()` method.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Reset the eof handled flag of the lexer.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instances of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - upon end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - in case a complete token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        // Remember where this token starts; `pos` stays fixed until the
        // token is fully read, even across multiple calls.
        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,   // got next char
                None => break,  // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        // A partially read lexeme at EOF is resolved here: prefixes that are
        // still valid character data are emitted as tokens, ambiguous prefixes
        // of markup ('<', '<!', '<![CDATA', '--' inside a comment, ...) are errors.
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second)  =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    /// Constructs an `Error` with the given message at the lexer's currently
    /// reported position (via the `Position` impl).
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    /// Feeds one character to the state machine, then advances `head_pos`.
    /// The position is only advanced when no characters are pending replay in
    /// `char_queue`, so that each input character is counted exactly once
    /// even when it is pushed back and reprocessed later.
    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    /// Routes the character to the handler for the current automaton state.
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal                         => self.normal(c),
            State::TagStarted                     => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted                 => self.comment_started(c),
            State::CDataStarted(s)                => self.cdata_started(c, s),
            State::DoctypeStarted(s)              => self.doctype_started(c, s),
            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
            State::EmptyTagClosing                => self.empty_element_closing(c),
            State::CommentClosing(s)              => self.comment_closing(c, s),
            State::CDataClosing(s)                => self.cdata_closing(c, s)
        }
    }

    /// Switches to the given state without emitting a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Switches to the given state and emits the given token.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    /// Pushes `cs` back for reprocessing on subsequent calls, then switches
    /// to the given state and emits the given token.
    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    /// Handles an invalid lexeme: the offending character is pushed back for
    /// reprocessing, and the partially matched `chunk` is either emitted as a
    /// `Chunk` token (when errors are skipped, or inside a comment for
    /// anything except `--`) or reported as an error.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<'                        => self.move_to(State::TagStarted),
            '>'                        => Ok(Some(Token::TagEnd)),
            '/'                        => self.move_to(State::EmptyTagClosing),
            '='                        => Ok(Some(Token::EqualsSign)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            '?'                        => self.move_to(State::ProcessingInstructionClosing),
            '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _                          => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _                          => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _   => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _   => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::Normal, Token::DoctypeStart)
        )
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                // ']]' not followed by '>' may still end with ']]>' later, e.g. ']]]>';
                // replay the leading ']' together with the current character.
                _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}
528 
#[cfg(test)]
mod tests {
    use common::{Position};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    // Asserts that the lexer produces exactly the given sequence of tokens
    // from the buffer, in order.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
             )+
        })
    );

    // Asserts that the next lexing result is an error with the given
    // row, column, and message.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    // Asserts that the lexer reports end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    // Creates a fresh lexer and an in-memory buffered reader over the string.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    // Incomplete lexemes at EOF that are still valid character data must be
    // emitted as tokens rather than errors.
    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?"  ; Token::Character('?'));
        eof_check!("/"  ; Token::Character('/'));
        eof_check!("-"  ; Token::Character('-'));
        eof_check!("]"  ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }

    // Each prefix below is an unfinished markup construct; EOF inside it is
    // an error positioned at the end of the data.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<"        ; 0, 1);
        eof_check!("<!"       ; 0, 2);
        eof_check!("<!-"      ; 0, 3);
        eof_check!("<!["      ; 0, 3);
        eof_check!("<![C"     ; 0, 4);
        eof_check!("<![CD"    ; 0, 5);
        eof_check!("<![CDA"   ; 0, 6);
        eof_check!("<![CDAT"  ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--"       ; 0, 2);
    }

    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    // '--' inside a comment is an error even with skip_errors unset;
    // outside a comment it is recovered as a chunk.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }

    // Checks both recovery modes for invalid data: with errors enabled the
    // data yields an error at the given position; with errors disabled it
    // yields the chunk followed by the offending character.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }



    // ']]' followed by ']' must not swallow the first ']' — see cdata_closing.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}
845