1 //! Contains an implementation of pull-based XML parser. 2 3 use std::mem; 4 use std::borrow::Cow; 5 use std::io::prelude::*; 6 7 use common::{ 8 self, 9 XmlVersion, Position, TextPosition, 10 is_name_start_char, is_name_char, 11 }; 12 use name::OwnedName; 13 use attribute::OwnedAttribute; 14 use namespace::NamespaceStack; 15 16 use reader::events::XmlEvent; 17 use reader::config::ParserConfig; 18 use reader::lexer::{Lexer, Token}; 19 20 macro_rules! gen_takes( 21 ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => ( 22 $( 23 impl MarkupData { 24 #[inline] 25 fn $method(&mut self) -> $t { 26 mem::replace(&mut self.$field, $def) 27 } 28 } 29 )+ 30 ) 31 ); 32 33 gen_takes!( 34 name -> take_name, String, String::new(); 35 ref_data -> take_ref_data, String, String::new(); 36 37 version -> take_version, Option<common::XmlVersion>, None; 38 encoding -> take_encoding, Option<String>, None; 39 standalone -> take_standalone, Option<bool>, None; 40 41 element_name -> take_element_name, Option<OwnedName>, None; 42 43 attr_name -> take_attr_name, Option<OwnedName>, None; 44 attributes -> take_attributes, Vec<OwnedAttribute>, vec!() 45 ); 46 47 macro_rules! self_error( 48 ($this:ident; $msg:expr) => ($this.error($msg)); 49 ($this:ident; $fmt:expr, $($arg:expr),+) => ($this.error(format!($fmt, $($arg),+))) 50 ); 51 52 mod outside_tag; 53 mod inside_processing_instruction; 54 mod inside_declaration; 55 mod inside_doctype; 56 mod inside_opening_tag; 57 mod inside_closing_tag_name; 58 mod inside_comment; 59 mod inside_cdata; 60 mod inside_reference; 61 62 static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; 63 static DEFAULT_ENCODING: &'static str = "UTF-8"; 64 static DEFAULT_STANDALONE: Option<bool> = None; 65 66 type ElementStack = Vec<OwnedName>; 67 pub type Result = super::Result<XmlEvent>; 68 69 /// Pull-based XML parser. 70 pub struct PullParser { 71 config: ParserConfig, 72 lexer: Lexer, 73 st: State, 74 buf: String, 75 nst: NamespaceStack, 76 77 data: MarkupData, 78 final_result: Option<Result>, 79 next_event: Option<Result>, 80 est: ElementStack, 81 pos: Vec<TextPosition>, 82 83 encountered_element: bool, 84 parsed_declaration: bool, 85 inside_whitespace: bool, 86 read_prefix_separator: bool, 87 pop_namespace: bool 88 } 89 90 impl PullParser { 91 /// Returns a new parser using the given config. new(config: ParserConfig) -> PullParser92 pub fn new(config: ParserConfig) -> PullParser { 93 PullParser { 94 config: config, 95 lexer: Lexer::new(), 96 st: State::OutsideTag, 97 buf: String::new(), 98 nst: NamespaceStack::default(), 99 100 data: MarkupData { 101 name: String::new(), 102 version: None, 103 encoding: None, 104 standalone: None, 105 ref_data: String::new(), 106 element_name: None, 107 quote: None, 108 attr_name: None, 109 attributes: Vec::new() 110 }, 111 final_result: None, 112 next_event: None, 113 est: Vec::new(), 114 pos: vec![TextPosition::new()], 115 116 encountered_element: false, 117 parsed_declaration: false, 118 inside_whitespace: true, 119 read_prefix_separator: false, 120 pop_namespace: false 121 } 122 } 123 124 /// Checks if this parser ignores the end of stream errors. is_ignoring_end_of_stream(&self) -> bool125 pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.ignore_end_of_stream } 126 } 127 128 impl Position for PullParser { 129 /// Returns the position of the last event produced by the parser 130 #[inline] position(&self) -> TextPosition131 fn position(&self) -> TextPosition { 132 self.pos[0] 133 } 134 } 135 136 #[derive(Clone, PartialEq)] 137 pub enum State { 138 OutsideTag, 139 InsideOpeningTag(OpeningTagSubstate), 140 InsideClosingTag(ClosingTagSubstate), 141 InsideProcessingInstruction(ProcessingInstructionSubstate), 142 InsideComment, 143 InsideCData, 144 InsideDeclaration(DeclarationSubstate), 145 InsideDoctype, 146 InsideReference(Box<State>) 147 } 148 149 #[derive(Clone, PartialEq)] 150 pub enum OpeningTagSubstate { 151 InsideName, 152 153 InsideTag, 154 155 InsideAttributeName, 156 AfterAttributeName, 157 158 InsideAttributeValue, 159 } 160 161 #[derive(Clone, PartialEq)] 162 pub enum ClosingTagSubstate { 163 CTInsideName, 164 CTAfterName 165 } 166 167 #[derive(Clone, PartialEq)] 168 pub enum ProcessingInstructionSubstate { 169 PIInsideName, 170 PIInsideData 171 } 172 173 #[derive(Clone, PartialEq)] 174 pub enum DeclarationSubstate { 175 BeforeVersion, 176 InsideVersion, 177 AfterVersion, 178 179 InsideVersionValue, 180 AfterVersionValue, 181 182 InsideEncoding, 183 AfterEncoding, 184 185 InsideEncodingValue, 186 187 BeforeStandaloneDecl, 188 InsideStandaloneDecl, 189 AfterStandaloneDecl, 190 191 InsideStandaloneDeclValue, 192 AfterStandaloneDeclValue 193 } 194 195 #[derive(PartialEq)] 196 enum QualifiedNameTarget { 197 AttributeNameTarget, 198 OpeningTagNameTarget, 199 ClosingTagNameTarget 200 } 201 202 #[derive(Copy, Clone, PartialEq, Eq)] 203 enum QuoteToken { 204 SingleQuoteToken, 205 DoubleQuoteToken 206 } 207 208 impl QuoteToken { from_token(t: &Token) -> QuoteToken209 fn from_token(t: &Token) -> QuoteToken { 210 match *t { 211 Token::SingleQuote => QuoteToken::SingleQuoteToken, 212 Token::DoubleQuote => QuoteToken::DoubleQuoteToken, 213 _ => panic!("Unexpected token: {}", t) 214 } 215 } 216 as_token(self) -> Token217 fn as_token(self) -> Token { 218 match self { 219 QuoteToken::SingleQuoteToken => Token::SingleQuote, 220 QuoteToken::DoubleQuoteToken => Token::DoubleQuote 221 } 222 } 223 } 224 225 struct MarkupData { 226 name: String, // used for processing instruction name 227 ref_data: String, // used for reference content 228 229 version: Option<common::XmlVersion>, // used for XML declaration version 230 encoding: Option<String>, // used for XML declaration encoding 231 standalone: Option<bool>, // used for XML declaration standalone parameter 232 233 element_name: Option<OwnedName>, // used for element name 234 235 quote: Option<QuoteToken>, // used to hold opening quote for attribute value 236 attr_name: Option<OwnedName>, // used to hold attribute name 237 attributes: Vec<OwnedAttribute> // used to hold all accumulated attributes 238 } 239 240 impl PullParser { 241 /// Returns next event read from the given buffer. 242 /// 243 /// This method should be always called with the same buffer. If you call it 244 /// providing different buffers each time, the result will be undefined. next<R: Read>(&mut self, r: &mut R) -> Result245 pub fn next<R: Read>(&mut self, r: &mut R) -> Result { 246 if let Some(ref ev) = self.final_result { 247 return ev.clone(); 248 } 249 250 if let Some(ev) = self.next_event.take() { 251 return ev; 252 } 253 254 if self.pop_namespace { 255 self.pop_namespace = false; 256 self.nst.pop(); 257 } 258 259 loop { 260 // While lexer gives us Ok(maybe_token) -- we loop. 261 // Upon having a complete XML-event -- we return from the whole function. 262 match self.lexer.next_token(r) { 263 Ok(maybe_token) => 264 match maybe_token { 265 None => break, 266 Some(token) => 267 match self.dispatch_token(token) { 268 None => {} // continue 269 Some(Ok(XmlEvent::EndDocument)) => 270 return { 271 self.next_pos(); 272 self.set_final_result(Ok(XmlEvent::EndDocument)) 273 }, 274 Some(Ok(xml_event)) => 275 return { 276 self.next_pos(); 277 Ok(xml_event) 278 }, 279 Some(Err(xml_error)) => 280 return { 281 self.next_pos(); 282 self.set_final_result(Err(xml_error)) 283 }, 284 } 285 }, 286 Err(lexer_error) => 287 return self.set_final_result(Err(lexer_error)), 288 } 289 } 290 291 // Handle end of stream 292 // Forward pos to the lexer head 293 self.next_pos(); 294 let ev = if self.depth() == 0 { 295 if self.encountered_element && self.st == State::OutsideTag { // all is ok 296 Ok(XmlEvent::EndDocument) 297 } else if !self.encountered_element { 298 self_error!(self; "Unexpected end of stream: no root element found") 299 } else { // self.st != State::OutsideTag 300 self_error!(self; "Unexpected end of stream") // TODO: add expected hint? 301 } 302 } else { 303 if self.config.ignore_end_of_stream { 304 self.final_result = None; 305 self.lexer.reset_eof_handled(); 306 return self_error!(self; "Unexpected end of stream: still inside the root element"); 307 } else { 308 self_error!(self; "Unexpected end of stream: still inside the root element") 309 } 310 }; 311 self.set_final_result(ev) 312 } 313 314 // This function is to be called when a terminal event is reached. 315 // The function sets up the `self.final_result` into `Some(result)` and return `result`. set_final_result(&mut self, result: Result) -> Result316 fn set_final_result(&mut self, result: Result) -> Result { 317 self.final_result = Some(result.clone()); 318 result 319 } 320 321 #[inline] error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result322 fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Result { 323 Err((&self.lexer, msg).into()) 324 } 325 326 #[inline] next_pos(&mut self)327 fn next_pos(&mut self) { 328 if self.pos.len() > 1 { 329 self.pos.remove(0); 330 } else { 331 self.pos[0] = self.lexer.position(); 332 } 333 } 334 335 #[inline] push_pos(&mut self)336 fn push_pos(&mut self) { 337 self.pos.push(self.lexer.position()); 338 } 339 dispatch_token(&mut self, t: Token) -> Option<Result>340 fn dispatch_token(&mut self, t: Token) -> Option<Result> { 341 match self.st.clone() { 342 State::OutsideTag => self.outside_tag(t), 343 State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), 344 State::InsideDeclaration(s) => self.inside_declaration(t, s), 345 State::InsideDoctype => self.inside_doctype(t), 346 State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), 347 State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), 348 State::InsideComment => self.inside_comment(t), 349 State::InsideCData => self.inside_cdata(t), 350 State::InsideReference(s) => self.inside_reference(t, *s) 351 } 352 } 353 354 #[inline] depth(&self) -> usize355 fn depth(&self) -> usize { 356 self.est.len() 357 } 358 359 #[inline] buf_has_data(&self) -> bool360 fn buf_has_data(&self) -> bool { 361 self.buf.len() > 0 362 } 363 364 #[inline] take_buf(&mut self) -> String365 fn take_buf(&mut self) -> String { 366 mem::replace(&mut self.buf, String::new()) 367 } 368 369 #[inline] append_char_continue(&mut self, c: char) -> Option<Result>370 fn append_char_continue(&mut self, c: char) -> Option<Result> { 371 self.buf.push(c); 372 None 373 } 374 375 #[inline] into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result>376 fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> { 377 self.st = st; 378 ev 379 } 380 381 #[inline] into_state_continue(&mut self, st: State) -> Option<Result>382 fn into_state_continue(&mut self, st: State) -> Option<Result> { 383 self.into_state(st, None) 384 } 385 386 #[inline] into_state_emit(&mut self, st: State, ev: Result) -> Option<Result>387 fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> { 388 self.into_state(st, Some(ev)) 389 } 390 391 /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, 392 /// an error is returned. 393 /// 394 /// # Parameters 395 /// * `t` --- next token; 396 /// * `on_name` --- a callback which is executed when whitespace is encountered. read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result>397 fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> 398 where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> { 399 // We can get here for the first time only when self.data.name contains zero or one character, 400 // but first character cannot be a colon anyway 401 if self.buf.len() <= 1 { 402 self.read_prefix_separator = false; 403 } 404 405 let invoke_callback = |this: &mut PullParser, t| { 406 let name = this.take_buf(); 407 match name.parse() { 408 Ok(name) => on_name(this, t, name), 409 Err(_) => Some(self_error!(this; "Qualified name is invalid: {}", name)) 410 } 411 }; 412 413 match t { 414 // There can be only one colon, and not as the first character 415 Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => { 416 self.buf.push(':'); 417 self.read_prefix_separator = true; 418 None 419 } 420 421 Token::Character(c) if c != ':' && (!self.buf_has_data() && is_name_start_char(c) || 422 self.buf_has_data() && is_name_char(c)) => 423 self.append_char_continue(c), 424 425 Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), 426 427 Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), 428 429 Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || 430 target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), 431 432 Token::Whitespace(_) => invoke_callback(self, t), 433 434 _ => Some(self_error!(self; "Unexpected token inside qualified name: {}", t)) 435 } 436 } 437 438 /// Dispatches tokens in order to process attribute value. 439 /// 440 /// # Parameters 441 /// * `t` --- next token; 442 /// * `on_value` --- a callback which is called when terminating quote is encountered. read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> where F: Fn(&mut PullParser, String) -> Option<Result>443 fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> 444 where F: Fn(&mut PullParser, String) -> Option<Result> { 445 match t { 446 Token::Whitespace(_) if self.data.quote.is_none() => None, // skip leading whitespace 447 448 Token::DoubleQuote | Token::SingleQuote => match self.data.quote { 449 None => { // Entered attribute value 450 self.data.quote = Some(QuoteToken::from_token(&t)); 451 None 452 } 453 Some(q) if q.as_token() == t => { 454 self.data.quote = None; 455 let value = self.take_buf(); 456 on_value(self, value) 457 } 458 _ => { 459 t.push_to_string(&mut self.buf); 460 None 461 } 462 }, 463 464 Token::ReferenceStart => { 465 let st = Box::new(self.st.clone()); 466 self.into_state_continue(State::InsideReference(st)) 467 } 468 469 Token::OpeningTagStart => 470 Some(self_error!(self; "Unexpected token inside attribute value: <")), 471 472 // Every character except " and ' and < is okay 473 _ => { 474 t.push_to_string(&mut self.buf); 475 None 476 } 477 } 478 } 479 emit_start_element(&mut self, emit_end_element: bool) -> Option<Result>480 fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { 481 let mut name = self.data.take_element_name().unwrap(); 482 let mut attributes = self.data.take_attributes(); 483 484 // check whether the name prefix is bound and fix its namespace 485 match self.nst.get(name.borrow().prefix_repr()) { 486 Some("") => name.namespace = None, // default namespace 487 Some(ns) => name.namespace = Some(ns.into()), 488 None => return Some(self_error!(self; "Element {} prefix is unbound", name)) 489 } 490 491 // check and fix accumulated attributes prefixes 492 for attr in attributes.iter_mut() { 493 if let Some(ref pfx) = attr.name.prefix { 494 let new_ns = match self.nst.get(pfx) { 495 Some("") => None, // default namespace 496 Some(ns) => Some(ns.into()), 497 None => return Some(self_error!(self; "Attribute {} prefix is unbound", attr.name)) 498 }; 499 attr.name.namespace = new_ns; 500 } 501 } 502 503 if emit_end_element { 504 self.pop_namespace = true; 505 self.next_event = Some(Ok(XmlEvent::EndElement { 506 name: name.clone() 507 })); 508 } else { 509 self.est.push(name.clone()); 510 } 511 let namespace = self.nst.squash(); 512 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { 513 name: name, 514 attributes: attributes, 515 namespace: namespace 516 })) 517 } 518 emit_end_element(&mut self) -> Option<Result>519 fn emit_end_element(&mut self) -> Option<Result> { 520 let mut name = self.data.take_element_name().unwrap(); 521 522 // check whether the name prefix is bound and fix its namespace 523 match self.nst.get(name.borrow().prefix_repr()) { 524 Some("") => name.namespace = None, // default namespace 525 Some(ns) => name.namespace = Some(ns.into()), 526 None => return Some(self_error!(self; "Element {} prefix is unbound", name)) 527 } 528 529 let op_name = self.est.pop().unwrap(); 530 531 if name == op_name { 532 self.pop_namespace = true; 533 self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name: name })) 534 } else { 535 Some(self_error!(self; "Unexpected closing tag: {}, expected {}", name, op_name)) 536 } 537 } 538 539 } 540 541 #[cfg(test)] 542 mod tests { 543 use std::io::BufReader; 544 545 use common::{Position, TextPosition}; 546 use name::OwnedName; 547 use attribute::OwnedAttribute; 548 use reader::parser::PullParser; 549 use reader::ParserConfig; 550 use reader::events::XmlEvent; 551 new_parser() -> PullParser552 fn new_parser() -> PullParser { 553 PullParser::new(ParserConfig::new()) 554 } 555 556 macro_rules! expect_event( 557 ($r:expr, $p:expr, $t:pat) => ( 558 match $p.next(&mut $r) { 559 $t => {} 560 e => panic!("Unexpected event: {:?}", e) 561 } 562 ); 563 ($r:expr, $p:expr, $t:pat => $c:expr ) => ( 564 match $p.next(&mut $r) { 565 $t if $c => {} 566 e => panic!("Unexpected event: {:?}", e) 567 } 568 ) 569 ); 570 571 macro_rules! test_data( 572 ($d:expr) => ({ 573 static DATA: &'static str = $d; 574 let r = BufReader::new(DATA.as_bytes()); 575 let p = new_parser(); 576 (r, p) 577 }) 578 ); 579 580 #[test] issue_3_semicolon_in_attribute_value()581 fn issue_3_semicolon_in_attribute_value() { 582 let (mut r, mut p) = test_data!(r#" 583 <a attr="zzz;zzz" /> 584 "#); 585 586 expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); 587 expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) => 588 *name == OwnedName::local("a") && 589 attributes.len() == 1 && 590 attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") && 591 namespace.is_essentially_empty() 592 ); 593 expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a")); 594 expect_event!(r, p, Ok(XmlEvent::EndDocument)); 595 } 596 597 #[test] issue_140_entity_reference_inside_tag()598 fn issue_140_entity_reference_inside_tag() { 599 let (mut r, mut p) = test_data!(r#" 600 <bla>♫</bla> 601 "#); 602 603 expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); 604 expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla")); 605 expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}"); 606 expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla")); 607 expect_event!(r, p, Ok(XmlEvent::EndDocument)); 608 } 609 610 #[test] opening_tag_in_attribute_value()611 fn opening_tag_in_attribute_value() { 612 let (mut r, mut p) = test_data!(r#" 613 <a attr="zzz<zzz" /> 614 "#); 615 616 expect_event!(r, p, Ok(XmlEvent::StartDocument { .. })); 617 expect_event!(r, p, Err(ref e) => 618 e.msg() == "Unexpected token inside attribute value: <" && 619 e.position() == TextPosition { row: 1, column: 24 } 620 ); 621 } 622 } 623