//! Contains an implementation of a pull-based XML parser.
2 
3 use std::mem;
4 use std::borrow::Cow;
5 use std::io::prelude::*;
6 
7 use common::{
8     self,
9     XmlVersion, Position, TextPosition,
10     is_name_start_char, is_name_char,
11 };
12 use name::OwnedName;
13 use attribute::OwnedAttribute;
14 use namespace::NamespaceStack;
15 
16 use reader::events::XmlEvent;
17 use reader::config::ParserConfig;
18 use reader::lexer::{Lexer, Token};
19 
// Generates "take" accessors on `MarkupData`.
//
// Each entry has the form `field -> method, Type, default` and entries are
// separated by `;`. The generated `method` moves the current value out of
// `field` and leaves `default` in its place.
macro_rules! gen_takes {
    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => {
        impl MarkupData {
            $(
                #[inline]
                fn $method(&mut self) -> $t {
                    mem::replace(&mut self.$field, $def)
                }
            )+
        }
    };
}
32 
33 gen_takes!(
34     name         -> take_name, String, String::new();
35     ref_data     -> take_ref_data, String, String::new();
36 
37     version      -> take_version, Option<common::XmlVersion>, None;
38     encoding     -> take_encoding, Option<String>, None;
39     standalone   -> take_standalone, Option<bool>, None;
40 
41     element_name -> take_element_name, Option<OwnedName>, None;
42 
43     attr_name    -> take_attr_name, Option<OwnedName>, None;
44     attributes   -> take_attributes, Vec<OwnedAttribute>, vec!()
45 );
46 
// Shorthand for building an error through the `error` method of a parser.
//
// `self_error!(parser; "message")` passes the message through unchanged;
// `self_error!(parser; "format {}", arg, ...)` runs it through `format!` first.
macro_rules! self_error {
    ($parser:ident; $msg:expr) => {
        $parser.error($msg)
    };
    ($parser:ident; $fmt:expr, $($arg:expr),+) => {
        $parser.error(format!($fmt, $($arg),+))
    };
}
51 
// One submodule per parser state. `dispatch_token` calls a method named after
// each of these modules; presumably that method is implemented in the module of
// the same name (verify in the submodules).
mod outside_tag;
mod inside_processing_instruction;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_cdata;
mod inside_reference;
61 
// Default values for the XML declaration parameters (version, encoding, standalone).
// NOTE(review): not referenced in this file; presumably used by the state
// submodules when a declaration is absent or incomplete -- confirm there.
static DEFAULT_VERSION: XmlVersion      = XmlVersion::Version10;
static DEFAULT_ENCODING: &'static str   = "UTF-8";
static DEFAULT_STANDALONE: Option<bool> = None;
65 
// Names of the elements which are currently open, innermost last:
// `emit_start_element` pushes onto it and `emit_end_element` pops from it.
type ElementStack = Vec<OwnedName>;
/// Outcome of a single parsing step: an `XmlEvent` on success, otherwise the
/// error type of the parent module's `Result`.
pub type Result = super::Result<XmlEvent>;
68 
/// Pull-based XML parser.
///
/// The parser is a state machine driven by tokens from a `Lexer`; each call to
/// `next` consumes tokens until a complete event (or an error) is available.
pub struct PullParser {
    // Parser configuration; `ignore_end_of_stream` is consulted in `next`
    config: ParserConfig,
    // Source of tokens; also provides the positions used for events and errors
    lexer: Lexer,
    // Current state of the state machine, see `dispatch_token`
    st: State,
    // Accumulates the characters of the name or attribute value being read
    buf: String,
    // Namespace stack used to resolve element and attribute prefixes
    nst: NamespaceStack,

    // Pieces of the markup construct which is being parsed
    data: MarkupData,
    // Terminal result; once set, every subsequent `next` call returns a clone of it
    final_result: Option<Result>,
    // Event queued to be returned by the following `next` call
    next_event: Option<Result>,
    // Names of the currently open elements
    est: ElementStack,
    // Queue of positions; the front entry is what `position()` reports
    pos: Vec<TextPosition>,

    // Consulted at the end of the stream: if still `false`, no root element was found.
    // NOTE(review): presumably set by the state handlers, not in this file.
    encountered_element: bool,
    // NOTE(review): not used in this file; presumably tracks whether the XML
    // declaration has been handled already -- confirm in the state handlers.
    parsed_declaration: bool,
    // NOTE(review): not used in this file; see the state handlers for its meaning.
    inside_whitespace: bool,
    // Whether the `:` prefix separator has already been read in the current qualified name
    read_prefix_separator: bool,
    // When set, the next `next` call pops `nst` before reading further tokens
    pop_namespace: bool
}
89 
90 impl PullParser {
91     /// Returns a new parser using the given config.
new(config: ParserConfig) -> PullParser92     pub fn new(config: ParserConfig) -> PullParser {
93         PullParser {
94             config: config,
95             lexer: Lexer::new(),
96             st: State::OutsideTag,
97             buf: String::new(),
98             nst: NamespaceStack::default(),
99 
100             data: MarkupData {
101                 name: String::new(),
102                 version: None,
103                 encoding: None,
104                 standalone: None,
105                 ref_data: String::new(),
106                 element_name: None,
107                 quote: None,
108                 attr_name: None,
109                 attributes: Vec::new()
110             },
111             final_result: None,
112             next_event: None,
113             est: Vec::new(),
114             pos: vec![TextPosition::new()],
115 
116             encountered_element: false,
117             parsed_declaration: false,
118             inside_whitespace: true,
119             read_prefix_separator: false,
120             pop_namespace: false
121         }
122     }
123 
124     /// Checks if this parser ignores the end of stream errors.
is_ignoring_end_of_stream(&self) -> bool125     pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream }
126 }
127 
impl Position for PullParser {
    /// Returns the position of the last event produced by the parser.
    ///
    /// This is the front entry of the internal position queue, which `next_pos`
    /// advances whenever an event is returned from `next`.
    #[inline]
    fn position(&self) -> TextPosition {
        self.pos[0]
    }
}
135 
/// State of the parser's state machine.
///
/// `dispatch_token` picks the token handler according to the current state.
#[derive(Clone, PartialEq)]
pub enum State {
    /// Not inside any markup construct; tokens go to `outside_tag`.
    OutsideTag,
    /// Inside an opening tag; tokens go to `inside_opening_tag`.
    InsideOpeningTag(OpeningTagSubstate),
    /// Inside a closing tag; tokens go to `inside_closing_tag_name`.
    InsideClosingTag(ClosingTagSubstate),
    /// Inside a processing instruction; tokens go to `inside_processing_instruction`.
    InsideProcessingInstruction(ProcessingInstructionSubstate),
    /// Inside a comment; tokens go to `inside_comment`.
    InsideComment,
    /// Inside a CDATA section; tokens go to `inside_cdata`.
    InsideCData,
    /// Inside the XML declaration; tokens go to `inside_declaration`.
    InsideDeclaration(DeclarationSubstate),
    /// Inside a doctype declaration; tokens go to `inside_doctype`.
    InsideDoctype,
    /// Inside a reference; carries the state which was active when the reference
    /// started (see `read_attribute_value`) and is passed to `inside_reference`.
    InsideReference(Box<State>)
}
148 
/// Sub-states of `State::InsideOpeningTag`.
///
/// NOTE(review): the transitions live in the `inside_opening_tag` handler, which
/// is not part of this file; the variant names describe the part of the tag
/// being read.
#[derive(Clone, PartialEq)]
pub enum OpeningTagSubstate {
    InsideName,

    InsideTag,

    InsideAttributeName,
    AfterAttributeName,

    InsideAttributeValue,
}
160 
/// Sub-states of `State::InsideClosingTag`.
///
/// NOTE(review): the transitions live in the `inside_closing_tag_name` handler,
/// which is not part of this file.
#[derive(Clone, PartialEq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName
}
166 
/// Sub-states of `State::InsideProcessingInstruction`.
///
/// NOTE(review): the transitions live in the `inside_processing_instruction`
/// handler, which is not part of this file.
#[derive(Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData
}
172 
/// Sub-states of `State::InsideDeclaration`.
///
/// The variants are grouped by the declaration parameter they refer to: version,
/// encoding and standalone.
/// NOTE(review): the transitions live in the `inside_declaration` handler, which
/// is not part of this file.
#[derive(Clone, PartialEq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue
}
194 
/// Kind of qualified name being read by `read_qualified_name`.
///
/// The target decides which tokens, besides whitespace, terminate the name.
#[derive(PartialEq)]
enum QualifiedNameTarget {
    /// Attribute name; also terminated by `Token::EqualsSign`.
    AttributeNameTarget,
    /// Opening tag name; also terminated by `Token::EmptyTagEnd` and `Token::TagEnd`.
    OpeningTagNameTarget,
    /// Closing tag name; also terminated by `Token::TagEnd`.
    ClosingTagNameTarget
}
201 
/// Kind of quote which opened an attribute value.
///
/// Stored in `MarkupData::quote` so that `read_attribute_value` can tell the
/// terminating quote from a quote of the other kind inside the value.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    SingleQuoteToken,
    DoubleQuoteToken
}
207 
208 impl QuoteToken {
from_token(t: &Token) -> QuoteToken209     fn from_token(t: &Token) -> QuoteToken {
210         match *t {
211             Token::SingleQuote => QuoteToken::SingleQuoteToken,
212             Token::DoubleQuote => QuoteToken::DoubleQuoteToken,
213             _ => panic!("Unexpected token: {}", t)
214         }
215     }
216 
as_token(self) -> Token217     fn as_token(self) -> Token {
218         match self {
219             QuoteToken::SingleQuoteToken => Token::SingleQuote,
220             QuoteToken::DoubleQuoteToken => Token::DoubleQuote
221         }
222     }
223 }
224 
// Pieces of the markup construct which is currently being parsed.
//
// All fields except `quote` get a `take_*` accessor generated by `gen_takes!`,
// which moves the value out and resets the field.
struct MarkupData {
    name: String,     // used for processing instruction name
    ref_data: String,  // used for reference content

    version: Option<common::XmlVersion>,  // used for XML declaration version
    encoding: Option<String>,  // used for XML declaration encoding
    standalone: Option<bool>,  // used for XML declaration standalone parameter

    element_name: Option<OwnedName>,  // used for element name

    quote: Option<QuoteToken>,  // used to hold opening quote for attribute value
    attr_name: Option<OwnedName>,  // used to hold attribute name
    attributes: Vec<OwnedAttribute>   // used to hold all accumulated attributes
}
239 
240 impl PullParser {
241     /// Returns next event read from the given buffer.
242     ///
243     /// This method should be always called with the same buffer. If you call it
244     /// providing different buffers each time, the result will be undefined.
next<R: Read>(&mut self, r: &mut R) -> Result245     pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
246         if let Some(ref ev) = self.final_result {
247             return ev.clone();
248         }
249 
250         if let Some(ev) = self.next_event.take() {
251             return ev;
252         }
253 
254         if self.pop_namespace {
255             self.pop_namespace = false;
256             self.nst.pop();
257         }
258 
259         loop {
260             // While lexer gives us Ok(maybe_token) -- we loop.
261             // Upon having a complete XML-event -- we return from the whole function.
262             match self.lexer.next_token(r) {
263                 Ok(maybe_token) =>
264                     match maybe_token {
265                         None => break,
266                         Some(token) =>
267                             match self.dispatch_token(token) {
268                                 None => {} // continue
269                                 Some(Ok(XmlEvent::EndDocument)) =>
270                                     return {
271                                         self.next_pos();
272                                         self.set_final_result(Ok(XmlEvent::EndDocument))
273                                     },
274                                 Some(Ok(xml_event)) =>
275                                     return {
276                                         self.next_pos();
277                                         Ok(xml_event)
278                                     },
279                                 Some(Err(xml_error)) =>
280                                     return {
281                                         self.next_pos();
282                                         self.set_final_result(Err(xml_error))
283                                     },
284                             }
285                     },
286                 Err(lexer_error) =>
287                     return self.set_final_result(Err(lexer_error)),
288             }
289         }
290 
291         // Handle end of stream
292         // Forward pos to the lexer head
293         self.next_pos();
294         let ev = if self.depth() == 0 {
295             if self.encountered_element && self.st == State::OutsideTag {  // all is ok
296                 Ok(XmlEvent::EndDocument)
297             } else if !self.encountered_element {
298                 self_error!(self; "Unexpected end of stream: no root element found")
299             } else {  // self.st != State::OutsideTag
300                 self_error!(self; "Unexpected end of stream")  // TODO: add expected hint?
301             }
302         } else {
303             if self.config.ignore_end_of_stream {
304                 self.final_result = None;
305                 self.lexer.reset_eof_handled();
306                 return self_error!(self; "Unexpected end of stream: still inside the root element");
307             } else {
308                 self_error!(self; "Unexpected end of stream: still inside the root element")
309             }
310         };
311         self.set_final_result(ev)
312     }
313 
    /// Records a terminal result.
    ///
    /// Stores a clone of `result` in `self.final_result`, which makes every
    /// subsequent `next` call return it, and hands `result` back to the caller.
    fn set_final_result(&mut self, result: Result) -> Result {
        self.final_result = Some(result.clone());
        result
    }
320 
    /// Builds an error result with the given message.
    ///
    /// The error is converted from the pair of the lexer and the message.
    /// NOTE(review): presumably the lexer supplies the error position; the
    /// conversion itself is defined outside of this file.
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result {
        Err((&self.lexer, msg).into())
    }
325 
326     #[inline]
next_pos(&mut self)327     fn next_pos(&mut self) {
328         if self.pos.len() > 1 {
329             self.pos.remove(0);
330         } else {
331             self.pos[0] = self.lexer.position();
332         }
333     }
334 
    /// Appends the current lexer position to the position queue.
    #[inline]
    fn push_pos(&mut self) {
        self.pos.push(self.lexer.position());
    }
339 
    /// Routes a token to the handler of the current state.
    ///
    /// Returns `None` when no event is ready yet (the caller keeps reading tokens)
    /// and `Some(result)` when the handler produced an event or an error.
    fn dispatch_token(&mut self, t: Token) -> Option<Result> {
        // The state is cloned so that the handlers are free to take `&mut self`
        match self.st.clone() {
            State::OutsideTag                     => self.outside_tag(t),
            State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s),
            State::InsideDeclaration(s)           => self.inside_declaration(t, s),
            State::InsideDoctype                  => self.inside_doctype(t),
            State::InsideOpeningTag(s)            => self.inside_opening_tag(t, s),
            State::InsideClosingTag(s)            => self.inside_closing_tag_name(t, s),
            State::InsideComment                  => self.inside_comment(t),
            State::InsideCData                    => self.inside_cdata(t),
            State::InsideReference(s)             => self.inside_reference(t, *s)
        }
    }
353 
    /// Returns the number of currently open elements.
    #[inline]
    fn depth(&self) -> usize {
        self.est.len()
    }
358 
359     #[inline]
buf_has_data(&self) -> bool360     fn buf_has_data(&self) -> bool {
361         self.buf.len() > 0
362     }
363 
    /// Moves the contents of the accumulation buffer out, leaving it empty.
    #[inline]
    fn take_buf(&mut self) -> String {
        mem::replace(&mut self.buf, String::new())
    }
368 
    /// Appends `c` to the accumulation buffer and returns `None`, i.e. no event
    /// is emitted and the state stays the same.
    #[inline]
    fn append_char_continue(&mut self, c: char) -> Option<Result> {
        self.buf.push(c);
        None
    }
374 
    /// Switches the parser to the state `st` and returns `ev` unchanged.
    #[inline]
    fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
        self.st = st;
        ev
    }
380 
381     #[inline]
into_state_continue(&mut self, st: State) -> Option<Result>382     fn into_state_continue(&mut self, st: State) -> Option<Result> {
383         self.into_state(st, None)
384     }
385 
386     #[inline]
into_state_emit(&mut self, st: State, ev: Result) -> Option<Result>387     fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
388         self.into_state(st, Some(ev))
389     }
390 
391     /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed,
392     /// an error is returned.
393     ///
394     /// # Parameters
395     /// * `t`       --- next token;
396     /// * `on_name` --- a callback which is executed when whitespace is encountered.
read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result>397     fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
398       where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
399         // We can get here for the first time only when self.data.name contains zero or one character,
400         // but first character cannot be a colon anyway
401         if self.buf.len() <= 1 {
402             self.read_prefix_separator = false;
403         }
404 
405         let invoke_callback = |this: &mut PullParser, t| {
406             let name = this.take_buf();
407             match name.parse() {
408                 Ok(name) => on_name(this, t, name),
409                 Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name))
410             }
411         };
412 
413         match t {
414             // There can be only one colon, and not as the first character
415             Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
416                 self.buf.push(':');
417                 self.read_prefix_separator = true;
418                 None
419             }
420 
421             Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) ||
422                                           self.buf_has_data() && is_name_char(c)) =>
423                 self.append_char_continue(c),
424 
425             Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
426 
427             Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
428 
429             Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
430                       target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
431 
432             Token::Whitespace(_) => invoke_callback(self, t),
433 
434             _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t))
435         }
436     }
437 
438     /// Dispatches tokens in order to process attribute value.
439     ///
440     /// # Parameters
441     /// * `t`        --- next token;
442     /// * `on_value` --- a callback which is called when terminating quote is encountered.
read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> where F: Fn(&mut PullParser, String) -> Option<Result>443     fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
444       where F: Fn(&mut PullParser, String) -> Option<Result> {
445         match t {
446             Token::Whitespace(_) if self.data.quote.is_none() => None,  // skip leading whitespace
447 
448             Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
449                 None => {  // Entered attribute value
450                     self.data.quote = Some(QuoteToken::from_token(&t));
451                     None
452                 }
453                 Some(q) if q.as_token() == t => {
454                     self.data.quote = None;
455                     let value = self.take_buf();
456                     on_value(self, value)
457                 }
458                 _ => {
459                     t.push_to_string(&mut self.buf);
460                     None
461                 }
462             },
463 
464             Token::ReferenceStart => {
465                 let st = Box::new(self.st.clone());
466                 self.into_state_continue(State::InsideReference(st))
467             }
468 
469             Token::OpeningTagStart =>
470                 Some(self_error!(self; "Unexpected token inside attribute value: <")),
471 
472             // Every character except " and ' and < is okay
473             _  => {
474                 t.push_to_string(&mut self.buf);
475                 None
476             }
477         }
478     }
479 
emit_start_element(&mut self, emit_end_element: bool) -> Option<Result>480     fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
481         let mut name = self.data.take_element_name().unwrap();
482         let mut attributes = self.data.take_attributes();
483 
484         // check whether the name prefix is bound and fix its namespace
485         match self.nst.get(name.borrow().prefix_repr()) {
486             Some("") => name.namespace = None,  // default namespace
487             Some(ns) => name.namespace = Some(ns.into()),
488             None => return Some(self_error!(self; "Element {} prefix is unbound", name))
489         }
490 
491         // check and fix accumulated attributes prefixes
492         for attr in attributes.iter_mut() {
493             if let Some(ref pfx) = attr.name.prefix {
494                 let new_ns = match self.nst.get(pfx) {
495                     Some("") => None,  // default namespace
496                     Some(ns) => Some(ns.into()),
497                     None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name))
498                 };
499                 attr.name.namespace = new_ns;
500             }
501         }
502 
503         if emit_end_element {
504             self.pop_namespace = true;
505             self.next_event = Some(Ok(XmlEvent::EndElement {
506                 name: name.clone()
507             }));
508         } else {
509             self.est.push(name.clone());
510         }
511         let namespace = self.nst.squash();
512         self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
513             name: name,
514             attributes: attributes,
515             namespace: namespace
516         }))
517     }
518 
emit_end_element(&mut self) -> Option<Result>519     fn emit_end_element(&mut self) -> Option<Result> {
520         let mut name = self.data.take_element_name().unwrap();
521 
522         // check whether the name prefix is bound and fix its namespace
523         match self.nst.get(name.borrow().prefix_repr()) {
524             Some("") => name.namespace = None,  // default namespace
525             Some(ns) => name.namespace = Some(ns.into()),
526             None => return Some(self_error!(self; "Element {} prefix is unbound", name))
527         }
528 
529         let op_name = self.est.pop().unwrap();
530 
531         if name == op_name {
532             self.pop_namespace = true;
533             self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name }))
534         } else {
535             Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name))
536         }
537     }
538 
539 }
540 
#[cfg(test)]
mod tests {
    use std::io::BufReader;

    use common::{Position, TextPosition};
    use name::OwnedName;
    use attribute::OwnedAttribute;
    use reader::parser::PullParser;
    use reader::ParserConfig;
    use reader::events::XmlEvent;

    // Creates a parser configured with `ParserConfig::new()`.
    fn new_parser() -> PullParser {
        PullParser::new(ParserConfig::new())
    }

    // Pulls the next event from parser `$p` reading from `$r` and panics unless
    // it matches the pattern `$t` (and, in the second form, satisfies `$c`).
    macro_rules! expect_event(
        ($r:expr, $p:expr, $t:pat) => (
            match $p.next(&mut $r) {
                $t => {}
                e => panic!("Unexpected event: {:?}", e)
            }
        );
        ($r:expr, $p:expr, $t:pat => $c:expr ) => (
            match $p.next(&mut $r) {
                $t if $c => {}
                e => panic!("Unexpected event: {:?}", e)
            }
        )
    );

    // Builds a `(reader, parser)` pair for the given document text.
    macro_rules! test_data(
        ($d:expr) => ({
            static DATA: &'static str = $d;
            let r = BufReader::new(DATA.as_bytes());
            let p = new_parser();
            (r, p)
        })
    );

    // A semicolon inside an attribute value must be kept as a part of the value.
    #[test]
    fn issue_3_semicolon_in_attribute_value() {
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz;zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
            *name == OwnedName::local("a") &&
             attributes.len() == 1 &&
             attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
             namespace.is_essentially_empty()
        );
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    // A numeric character reference in element content is resolved to its character.
    #[test]
    fn issue_140_entity_reference_inside_tag() {
        let (mut r, mut p) = test_data!(r#"
            <bla>&#9835;</bla>
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    // `<` inside an attribute value is an error reported at the position of the `<`.
    #[test]
    fn opening_tag_in_attribute_value() {
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz<zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Err(ref e) =>
            e.msg() == "Unexpected token inside attribute value: <" &&
            e.position() == TextPosition { row: 1, column: 24 }
        );
    }
}
623