1# $Id$ 2 3package XML::SAX::PurePerl; 4 5use strict; 6use vars qw/$VERSION/; 7 8$VERSION = '1.02'; 9 10use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar); 11use XML::SAX::PurePerl::Reader; 12use XML::SAX::PurePerl::EncodingDetect (); 13use XML::SAX::Exception; 14use XML::SAX::PurePerl::DocType (); 15use XML::SAX::PurePerl::DTDDecls (); 16use XML::SAX::PurePerl::XMLDecl (); 17use XML::SAX::DocumentLocator (); 18use XML::SAX::Base (); 19use XML::SAX qw(Namespaces); 20use XML::NamespaceSupport (); 21use IO::File; 22 23if ($] < 5.006) { 24 require XML::SAX::PurePerl::NoUnicodeExt; 25} 26else { 27 require XML::SAX::PurePerl::UnicodeExt; 28} 29 30use vars qw(@ISA); 31@ISA = ('XML::SAX::Base'); 32 33my %int_ents = ( 34 amp => '&', 35 lt => '<', 36 gt => '>', 37 quot => '"', 38 apos => "'", 39 ); 40 41my $xmlns_ns = "http://www.w3.org/2000/xmlns/"; 42my $xml_ns = "http://www.w3.org/XML/1998/namespace"; 43 44use Carp; 45sub _parse_characterstream { 46 my $self = shift; 47 my ($fh) = @_; 48 confess("CharacterStream is not yet correctly implemented"); 49 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); 50 return $self->_parse($reader); 51} 52 53sub _parse_bytestream { 54 my $self = shift; 55 my ($fh) = @_; 56 my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); 57 return $self->_parse($reader); 58} 59 60sub _parse_string { 61 my $self = shift; 62 my ($str) = @_; 63 my $reader = XML::SAX::PurePerl::Reader::String->new($str); 64 return $self->_parse($reader); 65} 66 67sub _parse_systemid { 68 my $self = shift; 69 my ($uri) = @_; 70 my $reader = XML::SAX::PurePerl::Reader::URI->new($uri); 71 return $self->_parse($reader); 72} 73 74sub _parse { 75 my ($self, $reader) = @_; 76 77 $reader->public_id($self->{ParseOptions}{Source}{PublicId}); 78 $reader->system_id($self->{ParseOptions}{Source}{SystemId}); 79 80 $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1}); 81 82 $self->set_document_locator( 83 XML::SAX::DocumentLocator->new( 84 sub { $reader->public_id }, 85 sub { $reader->system_id }, 86 sub { $reader->line }, 87 sub { $reader->column }, 88 sub { $reader->get_encoding }, 89 sub { $reader->get_xml_version }, 90 ), 91 ); 92 93 $self->start_document({}); 94 95 if (defined $self->{ParseOptions}{Source}{Encoding}) { 96 $reader->set_encoding($self->{ParseOptions}{Source}{Encoding}); 97 } 98 else { 99 $self->encoding_detect($reader); 100 } 101 102 # parse a document 103 $self->document($reader); 104 105 return $self->end_document({}); 106} 107 108sub parser_error { 109 my $self = shift; 110 my ($error, $reader) = @_; 111 112# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n"); 113 my $exception = XML::SAX::Exception::Parse->new( 114 Message => $error, 115 ColumnNumber => $reader->column, 116 LineNumber => $reader->line, 117 PublicId => $reader->public_id, 118 SystemId => $reader->system_id, 119 ); 120 121 $self->fatal_error($exception); 122 $exception->throw; 123} 124 125sub document { 126 my ($self, $reader) = @_; 127 128 # document ::= prolog element Misc* 129 130 $self->prolog($reader); 131 $self->element($reader) || 132 $self->parser_error("Document requires an element", $reader); 133 134 while(length($reader->data)) { 135 $self->Misc($reader) || 136 $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader); 137 } 138} 139 140sub prolog { 141 my ($self, $reader) = @_; 142 143 $self->XMLDecl($reader); 144 145 # consume all misc bits 146 1 while($self->Misc($reader)); 147 148 if ($self->doctypedecl($reader)) { 149 while (length($reader->data)) { 150 $self->Misc($reader) || last; 151 } 152 } 153} 154 155sub element { 156 my ($self, $reader) = @_; 157 158 return 0 unless $reader->match('<'); 159 160 my $name = $self->Name($reader) || $self->parser_error("Invalid element name", $reader); 161 162 my %attribs; 163 164 while( my ($k, $v) = $self->Attribute($reader) ) { 165 $attribs{$k} = $v; 166 } 167 168 my $have_namespaces = $self->get_feature(Namespaces); 169 170 # Namespace processing 171 $self->{NSHelper}->push_context; 172 my @new_ns; 173# my %attrs = @attribs; 174# while (my ($k,$v) = each %attrs) { 175 if ($have_namespaces) { 176 while ( my ($k, $v) = each %attribs ) { 177 if ($k =~ m/^xmlns(:(.*))?$/) { 178 my $prefix = $2 || ''; 179 $self->{NSHelper}->declare_prefix($prefix, $v); 180 my $ns = 181 { 182 Prefix => $prefix, 183 NamespaceURI => $v, 184 }; 185 push @new_ns, $ns; 186 $self->SUPER::start_prefix_mapping($ns); 187 } 188 } 189 } 190 191 # Create element object and fire event 192 my %attrib_hash; 193 while (my ($name, $value) = each %attribs ) { 194 # TODO normalise value here 195 my ($ns, $prefix, $lname); 196 if ($have_namespaces) { 197 ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name); 198 } 199 $ns ||= ''; $prefix ||= ''; $lname ||= ''; 200 $attrib_hash{"{$ns}$lname"} = { 201 Name => $name, 202 LocalName => $lname, 203 Prefix => $prefix, 204 NamespaceURI => $ns, 205 Value => $value, 206 }; 207 } 208 209 %attribs = (); # lose the memory since we recurse deep 210 211 my ($ns, $prefix, $lname); 212 if ($self->get_feature(Namespaces)) { 213 ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name); 214 } 215 else { 216 $lname = $name; 217 } 218 $ns ||= ''; $prefix ||= ''; $lname ||= ''; 219 220 # Process remainder of start_element 221 $self->skip_whitespace($reader); 222 my $have_content; 223 my $data = $reader->data(2); 224 if ($data =~ /^\/>/) { 225 $reader->move_along(2); 226 } 227 else { 228 $data =~ /^>/ or $self->parser_error("No close element tag", $reader); 229 $reader->move_along(1); 230 $have_content++; 231 } 232 233 my $el = 234 { 235 Name => $name, 236 LocalName => $lname, 237 Prefix => $prefix, 238 NamespaceURI => $ns, 239 Attributes => \%attrib_hash, 240 }; 241 $self->start_element($el); 242 243 # warn("($name\n"); 244 245 if ($have_content) { 246 $self->content($reader); 247 248 my $data = $reader->data(2); 249 $data =~ /^<\// or $self->parser_error("No close tag marker", $reader); 250 $reader->move_along(2); 251 my $end_name = $self->Name($reader); 252 $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader); 253 $self->skip_whitespace($reader); 254 $reader->match('>') or $self->parser_error("No close '>' on end tag", $reader); 255 } 256 257 my %end_el = %$el; 258 delete $end_el{Attributes}; 259 $self->end_element(\%end_el); 260 261 for my $ns (@new_ns) { 262 $self->end_prefix_mapping($ns); 263 } 264 $self->{NSHelper}->pop_context; 265 266 return 1; 267} 268 269sub content { 270 my ($self, $reader) = @_; 271 272 while (1) { 273 $self->CharData($reader); 274 275 my $data = $reader->data(2); 276 277 if ($data =~ /^<\//) { 278 return 1; 279 } 280 elsif ($data =~ /^&/) { 281 $self->Reference($reader) or $self->parser_error("bare & not allowed in content", $reader); 282 next; 283 } 284 elsif ($data =~ /^<!/) { 285 ($self->CDSect($reader) 286 or 287 $self->Comment($reader)) 288 and next; 289 } 290 elsif ($data =~ /^<\?/) { 291 $self->PI($reader) and next; 292 } 293 elsif ($data =~ /^</) { 294 $self->element($reader) and next; 295 } 296 last; 297 } 298 299 return 1; 300} 301 302sub CDSect { 303 my ($self, $reader) = @_; 304 305 my $data = $reader->data(9); 306 return 0 unless $data =~ /^<!\[CDATA\[/; 307 $reader->move_along(9); 308 309 $self->start_cdata({}); 310 311 $data = $reader->data; 312 while (1) { 313 $self->parser_error("EOF looking for CDATA section end", $reader) 314 unless length($data); 315 316 if ($data =~ /^(.*?)\]\]>/s) { 317 my $chars = $1; 318 $reader->move_along(length($chars) + 3); 319 $self->characters({Data => $chars}); 320 last; 321 } 322 else { 323 $self->characters({Data => $data}); 324 $reader->move_along(length($data)); 325 $data = $reader->data; 326 } 327 } 328 $self->end_cdata({}); 329 return 1; 330} 331 332sub CharData { 333 my ($self, $reader) = @_; 334 335 my $data = $reader->data; 336 337 while (1) { 338 return unless length($data); 339 340 if ($data =~ /^([^<&]*)[<&]/s) { 341 my $chars = $1; 342 $self->parser_error("String ']]>' not allowed in character data", $reader) 343 if $chars =~ /\]\]>/; 344 $reader->move_along(length($chars)); 345 $self->characters({Data => $chars}) if length($chars); 346 last; 347 } 348 else { 349 $self->characters({Data => $data}); 350 $reader->move_along(length($data)); 351 $data = $reader->data; 352 } 353 } 354} 355 356sub Misc { 357 my ($self, $reader) = @_; 358 if ($self->Comment($reader)) { 359 return 1; 360 } 361 elsif ($self->PI($reader)) { 362 return 1; 363 } 364 elsif ($self->skip_whitespace($reader)) { 365 return 1; 366 } 367 368 return 0; 369} 370 371sub Reference { 372 my ($self, $reader) = @_; 373 374 return 0 unless $reader->match('&'); 375 376 my $data = $reader->data; 377 378 # Fetch more data if we have an incomplete numeric reference 379 if ($data =~ /^(#\d*|#x[0-9a-fA-F]*)$/) { 380 $data = $reader->data(length($data) + 6); 381 } 382 383 if ($data =~ /^#x([0-9a-fA-F]+);/) { 384 my $ref = $1; 385 $reader->move_along(length($ref) + 3); 386 my $char = chr_ref(hex($ref)); 387 $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader) 388 unless $char =~ /$SingleChar/o; 389 $self->characters({ Data => $char }); 390 return 1; 391 } 392 elsif ($data =~ /^#([0-9]+);/) { 393 my $ref = $1; 394 $reader->move_along(length($ref) + 2); 395 my $char = chr_ref($ref); 396 $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader) 397 unless $char =~ /$SingleChar/o; 398 $self->characters({ Data => $char }); 399 return 1; 400 } 401 else { 402 # EntityRef 403 my $name = $self->Name($reader) 404 || $self->parser_error("Invalid name in entity", $reader); 405 $reader->match(';') or $self->parser_error("No semi-colon found after entity name", $reader); 406 407 # warn("got entity: \&$name;\n"); 408 409 # expand it 410 if ($self->_is_entity($name)) { 411 412 if ($self->_is_external($name)) { 413 my $value = $self->_get_entity($name); 414 my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value); 415 $self->encoding_detect($ent_reader); 416 $self->extParsedEnt($ent_reader); 417 } 418 else { 419 my $value = $self->_stringify_entity($name); 420 my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value); 421 $self->content($ent_reader); 422 } 423 return 1; 424 } 425 elsif ($name =~ /^(?:amp|gt|lt|quot|apos)$/) { 426 $self->characters({ Data => $int_ents{$name} }); 427 return 1; 428 } 429 else { 430 $self->parser_error("Undeclared entity", $reader); 431 } 432 } 433} 434 435sub AttReference { 436 my ($self, $name, $reader) = @_; 437 if ($name =~ /^#x([0-9a-fA-F]+)$/) { 438 my $chr = chr_ref(hex($1)); 439 $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader); 440 return $chr; 441 } 442 elsif ($name =~ /^#([0-9]+)$/) { 443 my $chr = chr_ref($1); 444 $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader); 445 return $chr; 446 } 447 else { 448 if ($self->_is_entity($name)) { 449 if ($self->_is_external($name)) { 450 $self->parser_error("No external entity references allowed in attribute values", $reader); 451 } 452 else { 453 my $value = $self->_stringify_entity($name); 454 return $value; 455 } 456 } 457 elsif ($name =~ /^(?:amp|lt|gt|quot|apos)$/) { 458 return $int_ents{$name}; 459 } 460 else { 461 $self->parser_error("Undeclared entity '$name'", $reader); 462 } 463 } 464} 465 466sub extParsedEnt { 467 my ($self, $reader) = @_; 468 469 $self->TextDecl($reader); 470 $self->content($reader); 471} 472 473sub _is_external { 474 my ($self, $name) = @_; 475# TODO: Fix this to use $reader to store the entities perhaps. 476 if ($self->{ParseOptions}{external_entities}{$name}) { 477 return 1; 478 } 479 return ; 480} 481 482sub _is_entity { 483 my ($self, $name) = @_; 484# TODO: ditto above 485 if (exists $self->{ParseOptions}{entities}{$name}) { 486 return 1; 487 } 488 return 0; 489} 490 491sub _stringify_entity { 492 my ($self, $name) = @_; 493# TODO: ditto above 494 if (exists $self->{ParseOptions}{expanded_entity}{$name}) { 495 return $self->{ParseOptions}{expanded_entity}{$name}; 496 } 497 # expand 498 my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name}); 499 my $ent = ''; 500 while(1) { 501 my $data = $reader->data; 502 $ent .= $data; 503 $reader->move_along(length($data)) or last; 504 } 505 return $self->{ParseOptions}{expanded_entity}{$name} = $ent; 506} 507 508sub _get_entity { 509 my ($self, $name) = @_; 510# TODO: ditto above 511 return $self->{ParseOptions}{entities}{$name}; 512} 513 514sub skip_whitespace { 515 my ($self, $reader) = @_; 516 517 my $data = $reader->data; 518 519 my $found = 0; 520 while ($data =~ s/^([\x20\x0A\x0D\x09]*)//) { 521 last unless length($1); 522 $found++; 523 $reader->move_along(length($1)); 524 $data = $reader->data; 525 } 526 527 return $found; 528} 529 530sub Attribute { 531 my ($self, $reader) = @_; 532 533 $self->skip_whitespace($reader) || return; 534 535 my $data = $reader->data(2); 536 return if $data =~ /^\/?>/; 537 538 if (my $name = $self->Name($reader)) { 539 $self->skip_whitespace($reader); 540 $reader->match('=') or $self->parser_error("No '=' in Attribute", $reader); 541 $self->skip_whitespace($reader); 542 my $value = $self->AttValue($reader); 543 544 if (!$self->cdata_attrib($name)) { 545 $value =~ s/^\x20*//; # discard leading spaces 546 $value =~ s/\x20*$//; # discard trailing spaces 547 $value =~ s/ {1,}/ /g; # all >1 space to single space 548 } 549 550 return $name, $value; 551 } 552 553 return; 554} 555 556sub cdata_attrib { 557 # TODO implement this! 558 return 1; 559} 560 561sub AttValue { 562 my ($self, $reader) = @_; 563 564 my $quote = $self->quote($reader); 565 566 my $value = ''; 567 568 while (1) { 569 my $data = $reader->data; 570 $self->parser_error("EOF found while looking for the end of attribute value", $reader) 571 unless length($data); 572 if ($data =~ /^([^$quote]*)$quote/) { 573 $reader->move_along(length($1) + 1); 574 $value .= $1; 575 last; 576 } 577 else { 578 $value .= $data; 579 $reader->move_along(length($data)); 580 } 581 } 582 583 if ($value =~ /</) { 584 $self->parser_error("< character not allowed in attribute values", $reader); 585 } 586 587 $value =~ s/[\x09\x0A\x0D]/\x20/g; 588 $value =~ s/&(#(x[0-9a-fA-F]+)|#([0-9]+)|\w+);/$self->AttReference($1, $reader)/geo; 589 590 return $value; 591} 592 593sub Comment { 594 my ($self, $reader) = @_; 595 596 my $data = $reader->data(4); 597 if ($data =~ /^<!--/) { 598 $reader->move_along(4); 599 my $comment_str = ''; 600 while (1) { 601 my $data = $reader->data; 602 $self->parser_error("End of data seen while looking for close comment marker", $reader) 603 unless length($data); 604 if ($data =~ /^(.*?)-->/s) { 605 $comment_str .= $1; 606 $self->parser_error("Invalid comment (dash)", $reader) if $comment_str =~ /-$/; 607 $reader->move_along(length($1) + 3); 608 last; 609 } 610 else { 611 $comment_str .= $data; 612 $reader->move_along(length($data)); 613 } 614 } 615 616 $self->comment({ Data => $comment_str }); 617 618 return 1; 619 } 620 return 0; 621} 622 623sub PI { 624 my ($self, $reader) = @_; 625 626 my $data = $reader->data(2); 627 628 if ($data =~ /^<\?/) { 629 $reader->move_along(2); 630 my ($target); 631 $target = $self->Name($reader) || 632 $self->parser_error("PI has no target", $reader); 633 634 my $pi_data = ''; 635 if ($self->skip_whitespace($reader)) { 636 while (1) { 637 my $data = $reader->data; 638 $self->parser_error("End of data seen while looking for close PI marker", $reader) 639 unless length($data); 640 if ($data =~ /^(.*?)\?>/s) { 641 $pi_data .= $1; 642 $reader->move_along(length($1) + 2); 643 last; 644 } 645 else { 646 $pi_data .= $data; 647 $reader->move_along(length($data)); 648 } 649 } 650 } 651 else { 652 my $data = $reader->data(2); 653 $data =~ /^\?>/ or $self->parser_error("PI closing sequence not found", $reader); 654 $reader->move_along(2); 655 } 656 657 $self->processing_instruction({ Target => $target, Data => $pi_data }); 658 659 return 1; 660 } 661 return 0; 662} 663 664sub Name { 665 my ($self, $reader) = @_; 666 667 my $name = ''; 668 while(1) { 669 my $data = $reader->data; 670 return unless length($data); 671 $data =~ /^([^\s>\/&\?;=<\)\(\[\],\%\#\!\*\|]*)/ or return; 672 $name .= $1; 673 my $len = length($1); 674 $reader->move_along($len); 675 last if ($len != length($data)); 676 } 677 678 return unless length($name); 679 680 $name =~ /$NameChar/o or $self->parser_error("Name <$name> does not match NameChar production", $reader); 681 682 return $name; 683} 684 685sub quote { 686 my ($self, $reader) = @_; 687 688 my $data = $reader->data; 689 690 $data =~ /^(['"])/ or $self->parser_error("Invalid quote token", $reader); 691 $reader->move_along(1); 692 return $1; 693} 694 6951; 696__END__ 697 698=head1 NAME 699 700XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface 701 702=head1 SYNOPSIS 703 704 use XML::Handler::Foo; 705 use XML::SAX::PurePerl; 706 my $handler = XML::Handler::Foo->new(); 707 my $parser = XML::SAX::PurePerl->new(Handler => $handler); 708 $parser->parse_uri("myfile.xml"); 709 710=head1 DESCRIPTION 711 712This module implements an XML parser in pure perl. It is written around the 713upcoming perl 5.8's unicode support and support for multiple document 714encodings (using the PerlIO layer), however it has been ported to work with 715ASCII/UTF8 documents under lower perl versions. 716 717The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in 718the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a 719better location soon. 720 721Please refer to the SAX2 documentation for how to use this module - it is merely a 722front end to SAX2, and implements nothing that is not in that spec (or at least tries 723not to - please email me if you find errors in this implementation). 724 725=head1 BUGS 726 727XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else 728in fact. However it is great as a fallback parser for XML::SAX, where the 729user might not be able to install an XS based parser or C library. 730 731Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations, 732though the code is in place to start doing this. Also parsing parameter entity 733references is causing me much confusion, since it's not exactly what I would call 734trivial, or well documented in the XML grammar. XML documents with internal subsets 735are likely to fail. 736 737I am however trying to work towards full conformance using the Oasis test suite. 738 739=head1 AUTHOR 740 741Matt Sergeant, matt@sergeant.org. Copyright 2001. 742 743Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com. 744 745=head1 LICENSE 746 747This is free software. You may use it or redistribute it under the same terms as 748Perl 5.7.2 itself. 749 750=cut 751 752