1# $Id$
2
3package XML::SAX::PurePerl;
4
5use strict;
6use vars qw/$VERSION/;
7
8$VERSION = '1.02';
9
10use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar);
11use XML::SAX::PurePerl::Reader;
12use XML::SAX::PurePerl::EncodingDetect ();
13use XML::SAX::Exception;
14use XML::SAX::PurePerl::DocType ();
15use XML::SAX::PurePerl::DTDDecls ();
16use XML::SAX::PurePerl::XMLDecl ();
17use XML::SAX::DocumentLocator ();
18use XML::SAX::Base ();
19use XML::SAX qw(Namespaces);
20use XML::NamespaceSupport ();
21use IO::File;
22
23if ($] < 5.006) {
24    require XML::SAX::PurePerl::NoUnicodeExt;
25}
26else {
27    require XML::SAX::PurePerl::UnicodeExt;
28}
29
30use vars qw(@ISA);
31@ISA = ('XML::SAX::Base');
32
33my %int_ents = (
34        amp => '&',
35        lt => '<',
36        gt => '>',
37        quot => '"',
38        apos => "'",
39        );
40
41my $xmlns_ns = "http://www.w3.org/2000/xmlns/";
42my $xml_ns = "http://www.w3.org/XML/1998/namespace";
43
44use Carp;
45sub _parse_characterstream {
46    my $self = shift;
47    my ($fh) = @_;
48    confess("CharacterStream is not yet correctly implemented");
49    my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
50    return $self->_parse($reader);
51}
52
53sub _parse_bytestream {
54    my $self = shift;
55    my ($fh) = @_;
56    my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh);
57    return $self->_parse($reader);
58}
59
60sub _parse_string {
61    my $self = shift;
62    my ($str) = @_;
63    my $reader = XML::SAX::PurePerl::Reader::String->new($str);
64    return $self->_parse($reader);
65}
66
67sub _parse_systemid {
68    my $self = shift;
69    my ($uri) = @_;
70    my $reader = XML::SAX::PurePerl::Reader::URI->new($uri);
71    return $self->_parse($reader);
72}
73
74sub _parse {
75    my ($self, $reader) = @_;
76
77    $reader->public_id($self->{ParseOptions}{Source}{PublicId});
78    $reader->system_id($self->{ParseOptions}{Source}{SystemId});
79
80    $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1});
81
82    $self->set_document_locator(
83        XML::SAX::DocumentLocator->new(
84            sub { $reader->public_id },
85            sub { $reader->system_id },
86            sub { $reader->line },
87            sub { $reader->column },
88            sub { $reader->get_encoding },
89            sub { $reader->get_xml_version },
90        ),
91    );
92
93    $self->start_document({});
94
95    if (defined $self->{ParseOptions}{Source}{Encoding}) {
96        $reader->set_encoding($self->{ParseOptions}{Source}{Encoding});
97    }
98    else {
99        $self->encoding_detect($reader);
100    }
101
102    # parse a document
103    $self->document($reader);
104
105    return $self->end_document({});
106}
107
108sub parser_error {
109    my $self = shift;
110    my ($error, $reader) = @_;
111
112# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n");
113    my $exception = XML::SAX::Exception::Parse->new(
114                Message => $error,
115                ColumnNumber => $reader->column,
116                LineNumber => $reader->line,
117                PublicId => $reader->public_id,
118                SystemId => $reader->system_id,
119            );
120
121    $self->fatal_error($exception);
122    $exception->throw;
123}
124
125sub document {
126    my ($self, $reader) = @_;
127
128    # document ::= prolog element Misc*
129
130    $self->prolog($reader);
131    $self->element($reader) ||
132        $self->parser_error("Document requires an element", $reader);
133
134    while(length($reader->data)) {
135        $self->Misc($reader) ||
136                $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader);
137    }
138}
139
140sub prolog {
141    my ($self, $reader) = @_;
142
143    $self->XMLDecl($reader);
144
145    # consume all misc bits
146    1 while($self->Misc($reader));
147
148    if ($self->doctypedecl($reader)) {
149        while (length($reader->data)) {
150            $self->Misc($reader) || last;
151        }
152    }
153}
154
155sub element {
156    my ($self, $reader) = @_;
157
158    return 0 unless $reader->match('<');
159
160    my $name = $self->Name($reader) || $self->parser_error("Invalid element name", $reader);
161
162    my %attribs;
163
164    while( my ($k, $v) = $self->Attribute($reader) ) {
165        $attribs{$k} = $v;
166    }
167
168    my $have_namespaces = $self->get_feature(Namespaces);
169
170    # Namespace processing
171    $self->{NSHelper}->push_context;
172    my @new_ns;
173#        my %attrs = @attribs;
174#        while (my ($k,$v) = each %attrs) {
175    if ($have_namespaces) {
176        while ( my ($k, $v) = each %attribs ) {
177            if ($k =~ m/^xmlns(:(.*))?$/) {
178                my $prefix = $2 || '';
179                $self->{NSHelper}->declare_prefix($prefix, $v);
180                my $ns =
181                    {
182                        Prefix       => $prefix,
183                        NamespaceURI => $v,
184                    };
185                push @new_ns, $ns;
186                $self->SUPER::start_prefix_mapping($ns);
187            }
188        }
189    }
190
191    # Create element object and fire event
192    my %attrib_hash;
193    while (my ($name, $value) = each %attribs ) {
194        # TODO normalise value here
195        my ($ns, $prefix, $lname);
196        if ($have_namespaces) {
197            ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name);
198        }
199        $ns ||= ''; $prefix ||= ''; $lname ||= '';
200        $attrib_hash{"{$ns}$lname"} = {
201            Name => $name,
202            LocalName => $lname,
203            Prefix => $prefix,
204            NamespaceURI => $ns,
205            Value => $value,
206        };
207    }
208
209    %attribs = (); # lose the memory since we recurse deep
210
211    my ($ns, $prefix, $lname);
212    if ($self->get_feature(Namespaces)) {
213        ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name);
214    }
215    else {
216        $lname = $name;
217    }
218    $ns ||= ''; $prefix ||= ''; $lname ||= '';
219
220    # Process remainder of start_element
221    $self->skip_whitespace($reader);
222    my $have_content;
223    my $data = $reader->data(2);
224    if ($data =~ /^\/>/) {
225        $reader->move_along(2);
226    }
227    else {
228        $data =~ /^>/ or $self->parser_error("No close element tag", $reader);
229        $reader->move_along(1);
230        $have_content++;
231    }
232
233    my $el =
234    {
235        Name => $name,
236        LocalName => $lname,
237        Prefix => $prefix,
238        NamespaceURI => $ns,
239        Attributes => \%attrib_hash,
240    };
241    $self->start_element($el);
242
243    # warn("($name\n");
244
245    if ($have_content) {
246        $self->content($reader);
247
248        my $data = $reader->data(2);
249        $data =~ /^<\// or $self->parser_error("No close tag marker", $reader);
250        $reader->move_along(2);
251        my $end_name = $self->Name($reader);
252        $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader);
253        $self->skip_whitespace($reader);
254        $reader->match('>') or $self->parser_error("No close '>' on end tag", $reader);
255    }
256
257    my %end_el = %$el;
258    delete $end_el{Attributes};
259    $self->end_element(\%end_el);
260
261    for my $ns (@new_ns) {
262        $self->end_prefix_mapping($ns);
263    }
264    $self->{NSHelper}->pop_context;
265
266    return 1;
267}
268
269sub content {
270    my ($self, $reader) = @_;
271
272    while (1) {
273        $self->CharData($reader);
274
275        my $data = $reader->data(2);
276
277        if ($data =~ /^<\//) {
278            return 1;
279        }
280        elsif ($data =~ /^&/) {
281            $self->Reference($reader) or $self->parser_error("bare & not allowed in content", $reader);
282            next;
283        }
284        elsif ($data =~ /^<!/) {
285            ($self->CDSect($reader)
286             or
287             $self->Comment($reader))
288             and next;
289        }
290        elsif ($data =~ /^<\?/) {
291            $self->PI($reader) and next;
292        }
293        elsif ($data =~ /^</) {
294            $self->element($reader) and next;
295        }
296        last;
297    }
298
299    return 1;
300}
301
302sub CDSect {
303    my ($self, $reader) = @_;
304
305    my $data = $reader->data(9);
306    return 0 unless $data =~ /^<!\[CDATA\[/;
307    $reader->move_along(9);
308
309    $self->start_cdata({});
310
311    $data = $reader->data;
312    while (1) {
313        $self->parser_error("EOF looking for CDATA section end", $reader)
314            unless length($data);
315
316        if ($data =~ /^(.*?)\]\]>/s) {
317            my $chars = $1;
318            $reader->move_along(length($chars) + 3);
319            $self->characters({Data => $chars});
320            last;
321        }
322        else {
323            $self->characters({Data => $data});
324            $reader->move_along(length($data));
325            $data = $reader->data;
326        }
327    }
328    $self->end_cdata({});
329    return 1;
330}
331
332sub CharData {
333    my ($self, $reader) = @_;
334
335    my $data = $reader->data;
336
337    while (1) {
338        return unless length($data);
339
340        if ($data =~ /^([^<&]*)[<&]/s) {
341            my $chars = $1;
342            $self->parser_error("String ']]>' not allowed in character data", $reader)
343                if $chars =~ /\]\]>/;
344            $reader->move_along(length($chars));
345            $self->characters({Data => $chars}) if length($chars);
346            last;
347        }
348        else {
349            $self->characters({Data => $data});
350            $reader->move_along(length($data));
351            $data = $reader->data;
352        }
353    }
354}
355
356sub Misc {
357    my ($self, $reader) = @_;
358    if ($self->Comment($reader)) {
359        return 1;
360    }
361    elsif ($self->PI($reader)) {
362        return 1;
363    }
364    elsif ($self->skip_whitespace($reader)) {
365        return 1;
366    }
367
368    return 0;
369}
370
371sub Reference {
372    my ($self, $reader) = @_;
373
374    return 0 unless $reader->match('&');
375
376    my $data = $reader->data;
377
378    # Fetch more data if we have an incomplete numeric reference
379    if ($data =~ /^(#\d*|#x[0-9a-fA-F]*)$/) {
380        $data = $reader->data(length($data) + 6);
381    }
382
383    if ($data =~ /^#x([0-9a-fA-F]+);/) {
384        my $ref = $1;
385        $reader->move_along(length($ref) + 3);
386        my $char = chr_ref(hex($ref));
387        $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
388            unless $char =~ /$SingleChar/o;
389        $self->characters({ Data => $char });
390        return 1;
391    }
392    elsif ($data =~ /^#([0-9]+);/) {
393        my $ref = $1;
394        $reader->move_along(length($ref) + 2);
395        my $char = chr_ref($ref);
396        $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader)
397            unless $char =~ /$SingleChar/o;
398        $self->characters({ Data => $char });
399        return 1;
400    }
401    else {
402        # EntityRef
403        my $name = $self->Name($reader)
404            || $self->parser_error("Invalid name in entity", $reader);
405        $reader->match(';') or $self->parser_error("No semi-colon found after entity name", $reader);
406
407        # warn("got entity: \&$name;\n");
408
409        # expand it
410        if ($self->_is_entity($name)) {
411
412            if ($self->_is_external($name)) {
413                my $value = $self->_get_entity($name);
414                my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value);
415                $self->encoding_detect($ent_reader);
416                $self->extParsedEnt($ent_reader);
417            }
418            else {
419                my $value = $self->_stringify_entity($name);
420                my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value);
421                $self->content($ent_reader);
422            }
423            return 1;
424        }
425        elsif ($name =~ /^(?:amp|gt|lt|quot|apos)$/) {
426            $self->characters({ Data => $int_ents{$name} });
427            return 1;
428        }
429        else {
430            $self->parser_error("Undeclared entity", $reader);
431        }
432    }
433}
434
435sub AttReference {
436    my ($self, $name, $reader) = @_;
437    if ($name =~ /^#x([0-9a-fA-F]+)$/) {
438        my $chr = chr_ref(hex($1));
439        $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader);
440        return $chr;
441    }
442    elsif ($name =~ /^#([0-9]+)$/) {
443        my $chr = chr_ref($1);
444        $chr =~ /$SingleChar/o or $self->parser_error("Character reference '&$name;' refers to an illegal XML character", $reader);
445        return $chr;
446    }
447    else {
448        if ($self->_is_entity($name)) {
449            if ($self->_is_external($name)) {
450                $self->parser_error("No external entity references allowed in attribute values", $reader);
451            }
452            else {
453                my $value = $self->_stringify_entity($name);
454                return $value;
455            }
456        }
457        elsif ($name =~ /^(?:amp|lt|gt|quot|apos)$/) {
458            return $int_ents{$name};
459        }
460        else {
461            $self->parser_error("Undeclared entity '$name'", $reader);
462        }
463    }
464}
465
466sub extParsedEnt {
467    my ($self, $reader) = @_;
468
469    $self->TextDecl($reader);
470    $self->content($reader);
471}
472
473sub _is_external {
474    my ($self, $name) = @_;
475# TODO: Fix this to use $reader to store the entities perhaps.
476    if ($self->{ParseOptions}{external_entities}{$name}) {
477        return 1;
478    }
479    return ;
480}
481
482sub _is_entity {
483    my ($self, $name) = @_;
484# TODO: ditto above
485    if (exists $self->{ParseOptions}{entities}{$name}) {
486        return 1;
487    }
488    return 0;
489}
490
491sub _stringify_entity {
492    my ($self, $name) = @_;
493# TODO: ditto above
494    if (exists $self->{ParseOptions}{expanded_entity}{$name}) {
495        return $self->{ParseOptions}{expanded_entity}{$name};
496    }
497    # expand
498    my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name});
499    my $ent = '';
500    while(1) {
501        my $data = $reader->data;
502        $ent .= $data;
503        $reader->move_along(length($data)) or last;
504    }
505    return $self->{ParseOptions}{expanded_entity}{$name} = $ent;
506}
507
508sub _get_entity {
509    my ($self, $name) = @_;
510# TODO: ditto above
511    return $self->{ParseOptions}{entities}{$name};
512}
513
514sub skip_whitespace {
515    my ($self, $reader) = @_;
516
517    my $data = $reader->data;
518
519    my $found = 0;
520    while ($data =~ s/^([\x20\x0A\x0D\x09]*)//) {
521        last unless length($1);
522        $found++;
523        $reader->move_along(length($1));
524        $data = $reader->data;
525    }
526
527    return $found;
528}
529
530sub Attribute {
531    my ($self, $reader) = @_;
532
533    $self->skip_whitespace($reader) || return;
534
535    my $data = $reader->data(2);
536    return if $data =~ /^\/?>/;
537
538    if (my $name = $self->Name($reader)) {
539        $self->skip_whitespace($reader);
540        $reader->match('=') or $self->parser_error("No '=' in Attribute", $reader);
541        $self->skip_whitespace($reader);
542        my $value = $self->AttValue($reader);
543
544        if (!$self->cdata_attrib($name)) {
545            $value =~ s/^\x20*//; # discard leading spaces
546            $value =~ s/\x20*$//; # discard trailing spaces
547            $value =~ s/ {1,}/ /g; # all >1 space to single space
548        }
549
550        return $name, $value;
551    }
552
553    return;
554}
555
556sub cdata_attrib {
557    # TODO implement this!
558    return 1;
559}
560
561sub AttValue {
562    my ($self, $reader) = @_;
563
564    my $quote = $self->quote($reader);
565
566    my $value = '';
567
568    while (1) {
569        my $data = $reader->data;
570        $self->parser_error("EOF found while looking for the end of attribute value", $reader)
571            unless length($data);
572        if ($data =~ /^([^$quote]*)$quote/) {
573            $reader->move_along(length($1) + 1);
574            $value .= $1;
575            last;
576        }
577        else {
578            $value .= $data;
579            $reader->move_along(length($data));
580        }
581    }
582
583    if ($value =~ /</) {
584        $self->parser_error("< character not allowed in attribute values", $reader);
585    }
586
587    $value =~ s/[\x09\x0A\x0D]/\x20/g;
588    $value =~ s/&(#(x[0-9a-fA-F]+)|#([0-9]+)|\w+);/$self->AttReference($1, $reader)/geo;
589
590    return $value;
591}
592
593sub Comment {
594    my ($self, $reader) = @_;
595
596    my $data = $reader->data(4);
597    if ($data =~ /^<!--/) {
598        $reader->move_along(4);
599        my $comment_str = '';
600        while (1) {
601            my $data = $reader->data;
602            $self->parser_error("End of data seen while looking for close comment marker", $reader)
603                unless length($data);
604            if ($data =~ /^(.*?)-->/s) {
605                $comment_str .= $1;
606                $self->parser_error("Invalid comment (dash)", $reader) if $comment_str =~ /-$/;
607                $reader->move_along(length($1) + 3);
608                last;
609            }
610            else {
611                $comment_str .= $data;
612                $reader->move_along(length($data));
613            }
614        }
615
616        $self->comment({ Data => $comment_str });
617
618        return 1;
619    }
620    return 0;
621}
622
623sub PI {
624    my ($self, $reader) = @_;
625
626    my $data = $reader->data(2);
627
628    if ($data =~ /^<\?/) {
629        $reader->move_along(2);
630        my ($target);
631        $target = $self->Name($reader) ||
632            $self->parser_error("PI has no target", $reader);
633
634        my $pi_data = '';
635        if ($self->skip_whitespace($reader)) {
636            while (1) {
637                my $data = $reader->data;
638                $self->parser_error("End of data seen while looking for close PI marker", $reader)
639                    unless length($data);
640                if ($data =~ /^(.*?)\?>/s) {
641                    $pi_data .= $1;
642                    $reader->move_along(length($1) + 2);
643                    last;
644                }
645                else {
646                    $pi_data .= $data;
647                    $reader->move_along(length($data));
648                }
649            }
650        }
651        else {
652            my $data = $reader->data(2);
653            $data =~ /^\?>/ or $self->parser_error("PI closing sequence not found", $reader);
654            $reader->move_along(2);
655        }
656
657        $self->processing_instruction({ Target => $target, Data => $pi_data });
658
659        return 1;
660    }
661    return 0;
662}
663
664sub Name {
665    my ($self, $reader) = @_;
666
667    my $name = '';
668    while(1) {
669        my $data = $reader->data;
670        return unless length($data);
671        $data =~ /^([^\s>\/&\?;=<\)\(\[\],\%\#\!\*\|]*)/ or return;
672        $name .= $1;
673        my $len = length($1);
674        $reader->move_along($len);
675        last if ($len != length($data));
676    }
677
678    return unless length($name);
679
680    $name =~ /$NameChar/o or $self->parser_error("Name <$name> does not match NameChar production", $reader);
681
682    return $name;
683}
684
685sub quote {
686    my ($self, $reader) = @_;
687
688    my $data = $reader->data;
689
690    $data =~ /^(['"])/ or $self->parser_error("Invalid quote token", $reader);
691    $reader->move_along(1);
692    return $1;
693}
694
6951;
696__END__
697
698=head1 NAME
699
700XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface
701
702=head1 SYNOPSIS
703
704  use XML::Handler::Foo;
705  use XML::SAX::PurePerl;
706  my $handler = XML::Handler::Foo->new();
707  my $parser = XML::SAX::PurePerl->new(Handler => $handler);
708  $parser->parse_uri("myfile.xml");
709
710=head1 DESCRIPTION
711
712This module implements an XML parser in pure perl. It is written around the
713upcoming perl 5.8's unicode support and support for multiple document
714encodings (using the PerlIO layer), however it has been ported to work with
715ASCII/UTF8 documents under lower perl versions.
716
717The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in
718the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a
719better location soon.
720
721Please refer to the SAX2 documentation for how to use this module - it is merely a
722front end to SAX2, and implements nothing that is not in that spec (or at least tries
723not to - please email me if you find errors in this implementation).
724
725=head1 BUGS
726
727XML::SAX::PurePerl is B<slow>. Very slow. I suggest you use something else
728in fact. However it is great as a fallback parser for XML::SAX, where the
729user might not be able to install an XS based parser or C library.
730
731Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations,
732though the code is in place to start doing this. Also parsing parameter entity
733references is causing me much confusion, since it's not exactly what I would call
734trivial, or well documented in the XML grammar. XML documents with internal subsets
735are likely to fail.
736
737I am however trying to work towards full conformance using the Oasis test suite.
738
739=head1 AUTHOR
740
741Matt Sergeant, matt@sergeant.org. Copyright 2001.
742
743Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com.
744
745=head1 LICENSE
746
747This is free software. You may use it or redistribute it under the same terms as
748Perl 5.7.2 itself.
749
750=cut
751
752