1package XML::SAX::Machine;
2{
3  $XML::SAX::Machine::VERSION = '0.46';
4}
5# ABSTRACT: Manage a collection of SAX processors
6
7
8
9use strict;
10
11use constant has_named_regexp_character_classes => $] > 5.006000;
12
13use Carp;
14use UNIVERSAL;
15use XML::SAX::EventMethodMaker qw( :all );
16use XML::SAX::Machines;
17
18## Tell the config stuff what options we'll be requesting, so we
19## don't get typoes in this code.  Very annoying, but I mispelt it
20## so often, that adding one statement like this seemed like a low
21## pain solution, since testing options like this can be long and
22## bothersome.
23XML::SAX::Machines->expected_processor_class_options(qw(
24    ConstructWithHashedOptions
25));
26
27
28
29sub new {
30    my $proto = shift;
31    my $class = ref $proto || $proto;
32
33    my @options_if_any = @_ && ref $_[-1] eq "HASH" ? %{pop()} : ();
34    my $self = bless { @options_if_any }, $class;
35
36    $self->{Parts} = [];
37    $self->{PartsByName} = {};  ## Mapping of names to parts
38
39    $self->_compile_specs( @_ );
40
41    ## Set this last in case any specs have handler "Exhaust"
42    $self->set_handler( $self->{Handler} ) if $self->{Handler};
43
44    return $self;
45}
46
47
48sub _find_part_rec {
49    my $self = shift;
50    my ( $id ) = @_;
51
52    if ( ref $id ) {
53        return exists $self->{PartsByProcessor}->{$id}
54            && $self->{PartsByProcessor}->{$id};
55    }
56
57    if ( $id =~ /^[+-]?\d+(?!\n)$/ ) {
58        return undef
59            if     $id >     $#{$self->{Parts}}
60                || $id < - ( $#{$self->{Parts}} + 1 );
61        return $self->{Parts}->[$id];
62    }
63
64    return $self->{PartsByName}->{$id}
65        if exists $self->{PartsByName}->{$id};
66
67    return undef;
68}
69
70
71sub find_part {
72    my $self = shift;
73    my ( $spec ) = @_;
74
75    return $self->{Handler} if $spec eq "Exhaust";
76
77    my $part_rec;
78
79    if ( 0 <= index $spec, "/" ) {
80        ## Take the sloooow road...
81        require File::Spec::Unix;
82        croak "find_part() path not absolute: '$spec'"
83            unless File::Spec::Unix->file_name_is_absolute( $spec );
84
85        ## Cannonical-ize it, do /foo/../ => / conversion
86        $spec = File::Spec::Unix->canonpath( $spec );
87        1 while $spec =~ s{/[^/]+/\.\.(/|(?!\n\Z))}{$1};
88
89        my @names = File::Spec::Unix->splitdir( $spec );
90        pop @names while @names && ! length $names[-1];
91        shift @names while @names && ! length $names[0];
92
93        croak "invalid find_part() specification: '$spec'"
94            unless File::Spec::Unix->file_name_is_absolute( $spec );
95
96        my @audit_trail;
97        my $proc = $self;
98        for ( @names ) {
99            push @audit_trail, $_;
100            $part_rec = $proc->_find_part_rec( $_ );
101            unless ( $part_rec ) {
102                croak "find_path() could not find '",
103                    join( "/", "", @audit_trail ),
104                    "' in ", ref $self;
105            }
106            $proc = $part_rec->{Processor};
107        }
108    }
109    else {
110        $part_rec = $self->_find_part_rec( $spec );
111    }
112
113    croak "find_path() could not find '$spec' in ", ref $self
114        unless $part_rec;
115
116    my $proc = $part_rec->{Processor};
117
118    ## Be paranoid here, just in case we have a bug somewhere.  I prefer
119    ## getting reasonable bug reports...
120    confess "find_path() found an undefined Processor reference as part '$_[0]' in ",
121        ref $self
122        unless defined $proc;
123
124    confess "find_path() found '$proc' instead of a Processor reference as part '$_[0]' in ",
125        ref $self
126        unless ref $proc;
127
128    confess "find_path() found a ",
129        ref $proc,
130        " reference instead of a Processor reference in part '$_[0]' in ",
131        ref $self
132        unless index( "SCALAR|ARRAY|HASH|Regexp|REF|CODE", ref $proc ) <= 0;
133
134    return $proc;
135}
136
137
138use vars qw( $AUTOLOAD );
139
140sub DESTROY {} ## Prevent AUTOLOADing of this.
141
142my $alpha_first_char = has_named_regexp_character_classes
143	    ? "^[[:alpha:]]"
144	    : "^[a-zA-Z]";
145
146sub AUTOLOAD {
147    my $self = shift;
148
149    $AUTOLOAD =~ s/.*://;
150
151    my $fc = substr $AUTOLOAD, 0, 1;
152    ## TODO: Find out how Perl determines "alphaness" and use that.
153    croak ref $self, " does not provide method $AUTOLOAD"
154        unless $fc eq uc $fc && $AUTOLOAD =~ /$alpha_first_char/o;
155
156    my $found = $self->find_part( $AUTOLOAD );
157    return $found;
158}
159
160
161sub parts {
162    my $self = shift;
163    croak "Can't set parts for a '", ref( $self ), "'" if @_;
164    confess "undef Parts" unless defined $self->{Parts};
165    return map $_->{Processor}, @{$self->{Parts}};
166}
167
168
169## TODO: Detect deep recursion in _all_part_recs().  In fact, detect deep
170## recursion when building the machine.
171
172sub _all_part_recs {
173    my $self = shift;
174    croak "Can't pass parms to ", ref( $self ), "->_all_part_recs" if @_;
175    confess "undef Parts" unless defined $self->{Parts};
176    my $proc;
177    return map {
178        $proc = $_->{Processor};
179        UNIVERSAL::can( $proc, "all_parts" )
180            ? ( $_, $proc->_all_part_recs )
181            : $_;
182    } @{$self->{Parts}};
183}
184
185
186sub all_parts {
187    my $self = shift;
188    croak "Can't pass parms to ", ref( $self ), "->_all_parts" if @_;
189    confess "undef Parts" unless defined $self->{Parts};
190    return map $_->{Processor}, $self->_all_part_recs;
191}
192
193
194#=item add_parts
195#
196#    $m->add_parts( { Foo => $foo, Bar => $bar } );
197#
198#On linear machines:
199#
200#    $m->add_parts( @parts );
201#
202#Adds one or more parts to the machine.  Does not connect them, you need
203#to do that manually (we need to add a $m->connect_parts() style API).
204#
205#=cut
206#
207#sub add_parts {
208#    my $self = shift;
209#confess "TODO";
210#}
211
212#=item remove_parts
213#
214#    $m->remove_parts( qw( Foo Bar ) );
215#
216#Slower, but possible:
217#
218#    $m->remove_parts( $m->Foo, $m->Bar );
219#
220#On linear machines:
221#
222#    $m->remove_parts( 1, 3 );
223#
224#Removes one or more parts from the machine.  Does not connect them
225#except on linear machines.  Attempts to disconnect any parts that
226#point to them, and that they point to.  This attempt will fail for any
227#part that does not provide a handler() or handlers() method.
228#
229#This is breadth-first recursive, like C<$m->find_part( $id )> is.  This
230#will remove *all* parts with the given names from a complex
231#machine (this does not apply to index numbers).
232#
233#Returns a list of the removed parts.
234#
235#If a name is not found, it is ignored.
236#
237#=cut
238#
239#sub remove_parts {
240#    my $self = shift;
241#
242#    my %names;
243#    my @found;
244#
245#    for my $doomed ( @_ ) {
246#        unless ( ref $doomed ) {
247#            $names{$doomed} = undef;
248#            if ( my $f = delete $self->{Parts}->{$doomed} ) {
249#                push @found, $f;
250#            }
251#            else {
252#                for my $c ( $self->parts ) {
253#                    if ( $c->can( "remove_parts" )
254#                       && ( my @f = $c->remove_parts( $doomed ) )
255#                    ) {
256#                        push @found, @f;
257#                    }
258#                }
259#            }
260#        }
261#        else {
262#            ## It's a reference.  Do this the slow, painful way.
263#            for my $name ( keys %{$self->{Parts}} ) {
264#                if ( $doomed == $self->{Parts}->{$name} ) {
265#                    $names{$name} = undef;
266#                    push @found, delete $self->{Parts}->{$name};
267#                }
268#            }
269#
270#            for my $c ( $self->parts ) {
271#                if ( $c->can( "remove_parts" )
272#                   && ( my @f = $c->remove_parts( $doomed ) )
273#                ) {
274#                    push @found, @f;
275#                }
276#            }
277#        }
278#    }
279#
280#    for my $c ( sort keys %{$self->{Connections}} ) {
281#        if ( exists $names{$self->{Connections}->{$c}} ) {
282###TODO: Unhook the processors if possible
283#            delete $self->{Connections}->{$c};
284#        }
285#        if ( exists $names{$c} ) {
286###TODO: Unhook the processors if possible
287#            delete $self->{Connections}->{$c};
288#        }
289#    }
290#
291#    return @found;
292#}
293
294
295sub set_handler {
296    my $self = shift;
297    my ( $handler, $type ) = reverse @_;
298
299    $type ||= "Handler";
300
301    for my $part_rec ( @{$self->{Parts}} ) {
302        my $proc = $part_rec->{Processor};
303        my $hs = $part_rec->{Handlers};
304
305        if ( grep ref $_ ? $_ == $self->{$type} : $_ eq "Exhaust", @$hs ) {
306            if ( @$hs == 1 && $proc->can( "set_handler" ) ) {
307                $proc->set_handler(
308                    $type ne "Handler" ? $type : (),
309                    $handler
310                );
311                next;
312            }
313
314            unless ( $proc->can( "set_handlers" ) ) {
315                croak ref $proc,
316                    @$hs == 1
317                        ? " has no set_handler or set_handlers method"
318                        : " has no set_handlers method"
319            }
320
321            $proc->set_handlers(
322                map {
323                    my $h;
324                    my $t;
325                    if ( ref $_ ) {
326                        $h = $_;
327                        $t = "Handler";
328                    }
329                    elsif ( $_ eq "Exhaust" ) {
330                        $h = $handler;
331                        $t = $type;
332                    } else {
333                        ( $h, $t ) = reverse split /=>/, $_;
334                        $h = $self->find_part( $h );
335                        $t = $type;
336                        croak "Can't locate part $_ to be a handler for ",
337                            $part_rec->string_description
338                            unless $h;
339                    }
340                    { $type => $h }
341                } @$hs
342            );
343        }
344    }
345
346    $self->{$type} = $handler;
347}
348
349
350my $warned_about_missing_sax_tracer;
351sub trace_parts {
352    my $self = shift;
353
354    unless ( eval "require Devel::TraceSAX; 1" ) {
355        warn $@ unless $warned_about_missing_sax_tracer++;
356        return;
357    }
358
359
360    for ( @_ ? map $self->_find_part_rec( $_ ), @_ : @{$self->{Parts}} ) {
361        Devel::TraceSAX::trace_SAX(
362            $_->{Processor},
363            $_->string_description
364        );
365    }
366
367    ## some parts are created lazily, let's trace those, too
368    $self->{TraceAdHocParts} ||= 1 unless @_;
369}
370
371
372
373sub trace_all_parts {
374    my $self = shift;
375
376    croak "Can't pass parms to trace_all_parts" if @_;
377
378    unless ( eval "require Devel::TraceSAX; 1" ) {
379        warn $@ unless $warned_about_missing_sax_tracer++;
380        return;
381    }
382
383    for ( @{$self->{Parts}} ) {
384        Devel::TraceSAX::trace_SAX(
385            $_->{Processor},
386            $_->string_description
387        );
388        $_->{Processor}->trace_all_parts
389            if $_->{Processor}->can( "trace_all_parts" );
390    }
391
392    ## some parts are created lazily, let's trace those, too
393    $self->{TraceAdHocParts} = 1;
394}
395
396
397
398sub untracify_parts {
399    my $self = shift;
400    for ( @_ ? map $self->find_part( $_ ), @_ : $self->parts ) {
401        XML::SAX::TraceViaISA::remove_tracing_subclass( $_ );
402    }
403}
404
405
406
407compile_methods __PACKAGE__, <<'EOCODE', sax_event_names "ParseMethods" ;
408    sub <METHOD> {
409        my $self = shift;
410        my $h = $self->find_part( "Intake" );
411        croak "SAX machine 'Intake' undefined"
412            unless $h;
413
414        if ( $h->can( "<METHOD>" ) ) {
415            my ( $ok, @result ) = eval {
416                ( 1, wantarray
417                    ? $h-><METHOD>( @_ )
418                    : scalar $h-><METHOD>( @_ )
419                );
420            };
421
422            ## Not sure how/where causes me to need this next line, but
423            ## in perl5.6.1 it seems necessary.
424            return wantarray ? @result : $result[0] if $ok;
425            die $@ unless $@ =~ /No .*routine defined/;
426            undef $@;
427
428            if ( $h->isa( "XML::SAX::Base" ) ) {
429                ## Due to a bug in old versions of X::S::B, we need to reset
430                ## this so that it will pass events on.
431                ## TODO: when newer X::S::B's are common, jack up the
432                ## version in Makefile.PL's PREREQ_PM :).
433                delete $h->{ParseOptions};
434            }
435        }
436
437        require XML::SAX::ParserFactory;
438        $self->{Parser} = XML::SAX::ParserFactory->parser(
439            Handler => $h
440        );
441
442        Devel::TraceSAX::trace_SAX(
443            $self->{Parser},
444            "Ad hoc parser (" . ref( $self->{Parser} ) . ")"
445        ) if $self->{TraceAdHocParts};
446
447        return $self->{Parser}-><METHOD>(@_);
448    }
449EOCODE
450
451
452compile_methods __PACKAGE__, <<'EOCODE', sax_event_names ;
453    sub <EVENT> {
454        my $self = shift;
455        my $h = $self->find_part( "Intake" );
456        croak "SAX machine 'Intake' undefined"
457            unless $h;
458
459        return $h-><EVENT>( @_ ) if $h->can( "<EVENT>" );
460    }
461EOCODE
462
463
464
465my %basic_types = (
466    ARRAY  => undef,
467    CODE   => undef,
468    GLOB   => undef,
469    HASH   => undef,
470    REF    => undef,  ## Never seen this one, but it's listed in perlfunc
471    Regexp => undef,
472    SCALAR => undef,
473);
474
475
476sub _resolve_spec {
477    my $self = shift;
478    my ( $spec ) = @_;
479
480    croak "undef passed instead of a filter to ", ref( $self ), "->new()"
481        unless defined $spec;
482
483    croak "Empty filter name ('') passed to ", ref( $self ), "->new()"
484        unless length $spec;
485
486    my $type = ref $spec;
487
488    if (
489           $type eq "SCALAR"
490## TODO:         || $type eq "ARRAY"  <== need XML::SAX::Writer to supt this.
491        || $type eq "GLOB"
492        || UNIVERSAL::isa( $spec, "IO::Handle" )
493        || ( ! $type && $spec =~ /^\s*([>|]|\+>)/ )
494    ) {
495## Cheat until XML::SAX::Writer cat grok it
496if ( ! $type ) {
497    use Symbol;
498    my $fh = gensym;
499    open $fh, $spec or croak "$! opening '$spec'" ;
500    $spec = $fh;
501}
502        require XML::SAX::Writer;
503        $spec = XML::SAX::Writer->new( Output => $spec );
504    }
505    elsif ( !$type ) {
506        if ( $spec !~ /^\s*<|\|\s*(?!\n)$/ ) {
507            ## Doesn't look like the caller wants to slurp a file
508            ## Let's require it now to catch errors early, then
509            ## new() it later after all requires are done.
510            ## delaying the new()s might help us from doing things
511            ## like blowing away output files and then finding
512            ## errors, for instance.
513            croak $@ unless $spec->can( "new" ) || eval "require $spec";
514        }
515    }
516    else {
517        croak "'$type' not supported in a SAX machine specification\n"
518            if exists $basic_types{$type};
519    }
520
521    return $spec;
522}
523
524my $is_name_like = has_named_regexp_character_classes
525	    ? '^[[:alpha:]]\w*(?!\n)$'
526	    :    '^[a-zA-Z]\w*(?!\n)$';
527
528sub _valid_name($) {
529    my ( $prospect ) = @_;
530    return 0 unless defined $prospect && length $prospect;
531    my $fc = substr $prospect, 0, 1;
532    ## I wonder how close to valid Perl method names this is?
533    ( $fc eq uc $fc && $prospect =~ /$is_name_like/o ) ? 1 : 0;
534}
535
536
537sub _push_spec {
538    my $self = shift;
539    my ( $name, $spec, @handlers ) =
540        ref $_[0]
541            ? ( undef, @_ )       ## Implictly unnamed: [ $obj, ... ]
542            : @_;                 ## Named or explicitly unnamed: [ $name, ...]
543
544    my $part = XML::SAX::Machine::Part->new(
545        Name      => $name,
546        Handlers  => \@handlers,
547    );
548
549#    if ( grep $_ eq "Exhaust", @handlers ) {
550#        $self->{OverusedNames}->{Exhaust} ||= undef
551#            if exists $self->{PartsByName}->{Exhaust};
552#
553#        $self->{PartsByName}->{Exhaust} = $self->{Parts}->[-1];
554#
555#        @handlers = grep $_ ne "Exhaust", @handlers;
556#    }
557
558    ## NOTE: This may
559    ## still return a non-reference, which is the type of processor
560    ## wanted here.  We construct those lazily below; see the docs
561    ## about order of construction.
562    my $proc = $self->_resolve_spec( $spec );
563    $part->{Processor} = $proc;
564    croak "SAX machine BUG: couldn't resolve spec '$spec'"
565        unless defined $proc;
566
567    push @{$self->{Parts}}, $part;
568    $part->{Number} = $#{$self->{Parts}};
569
570    if ( defined $name ) {
571        $self->{OverusedNames}->{$name} ||= undef
572            if exists $self->{PartsByName}->{$name};
573
574        $self->{IllegalNames}->{$name} ||= undef
575            unless _valid_name $name && $name ne "Exhaust";
576
577        $self->{PartsByName}->{$name} = $self->{Parts}->[-1];
578    }
579
580    ## This HASH is used to detect cycles even if the user uses
581    ## preconstructed references instead of named parts.
582    $self->{PartsByProcessor}->{$proc} = $part
583        if ref $proc;
584}
585
586
587sub _names_err_msgs {
588    my ( $s, @names ) = @_ ;
589    @names = map ref $_ eq "HASH" ? keys %$_ : $_, @names;
590    return () unless @names;
591
592    @names = keys %{ { map { ( $_ => undef ) } @names } };
593
594    if ( @names == 1 ) {
595        $s =~ s/%[A-Z]+//g;
596    }
597    else {
598        $s =~ s/%([A-Z]+)/\L$1/g;
599    }
600
601    return $s . join ", ", map "'$_'", sort @names ;
602}
603
604
605sub _build_part {
606    my $self = shift;
607    my ( $part ) = @_;
608
609    my $part_num = $part->{Number};
610
611    return if $self->{BuiltParts}->[$part_num];
612
613    confess "SAX machine BUG: cycle found too late"
614        if $self->{SeenParts}->[$part_num];
615    ++$self->{SeenParts}->[$part_num];
616
617    ## We retun a list of all cycles that have been discovered but
618    ## not yet completed.  We don't return cycles that have been
619    ## completely discovered; those are placed in DetectedCycles.
620    my @open_cycles;
621
622    eval {
623        ## This eval is to make sure we decrement SeenParts so that
624        ## we don't encounter spurious cycle found too late exceptions.
625
626        ## Build any handlers, detect cycles
627        my @handler_procs;
628
629## I decided not to autolink one handler to the next in order to keep
630## from causing hard to diagnose errors when unintended machines are
631## passed in.  The special purpose machines, like Pipeline, have
632## that logic built in.
633##        ## Link any part with no handlers to the next part.
634##        push @{$part->{Handlers}}, $part->{Number} + 1
635##            if ! @{$part->{Handlers}} && $part->{Number} < $#{$self->{Parts}};
636
637        for my $handler_spec ( @{$part->{Handlers}} ) {
638
639            my $handler;
640
641            if ( ref $handler_spec ) {
642                ## The caller specified a handler with a real reference, so
643                ## we don't need to build it, but we do need to do
644                ## cycle detection. _build_part won't build it in this case
645                ## but it will link it and do cycle detection.
646                $handler = $self->{PartsByProcessor}->{$handler_spec}
647                    if exists $self->{PartsByProcessor}->{$handler_spec};
648
649                if ( ! defined $handler ) {
650                    ## It's a processor not in this machine.  Hope the
651                    ## caller knows what it's doing.
652                    push @handler_procs, $handler_spec;
653                    next;
654                }
655            }
656            else {
657                $handler = $self->_find_part_rec( $handler_spec );
658                ## all handler specs were checked earlier, so "survive" this
659                ## failure and let the queued error message tell the user
660                ## about it.
661                next unless defined $handler;
662            }
663
664            if ( $self->{SeenParts}->[$handler->{Number}] ) {
665                ## Oop, a cycle, and we don't want to recurse or we'll
666                ## recurse forever.
667                push @open_cycles, $part eq $handler
668                    ? [ $handler ]
669                    : [ $part, $handler ];
670                next;
671            }
672
673            my @nested_cycles = $self->_build_part( $handler );
674
675            my $handler_proc = $handler->{Processor};
676
677            confess "SAX machine BUG: found a part with no processor: ",
678                $handler->string_description
679                unless defined $handler_proc;
680
681            confess "SAX machine BUG: found a unbuilt '",
682                $handler->{Processor},
683                "' processor: ",
684                $handler->string_description
685                unless ref $handler_proc;
686
687            push @handler_procs, $handler_proc;
688
689            for my $nested_cycle ( @nested_cycles ) {
690                if ( $nested_cycle->[-1] == $part ) {
691                    ## the returned cycle "ended" with our part, so
692                    ## we have a complete description of the cycle, log it
693                    ## and move on.
694                    push @{$self->{DetectedCycles}}, $nested_cycle;
695                }
696                else {
697                    ## This part is part of this cycle but not it's "beginning"
698                    push @open_cycles, [ $part, $nested_cycle ];
699                }
700            }
701        }
702
703        ## Create this processor if need be, otherwise just set the handlers.
704        my $proc = $part->{Processor};
705        confess "SAX machine BUG: undefined processor for ",
706            $part->string_description
707            unless defined $proc;
708
709        unless ( ref $proc ) {
710            ## TODO: Figure a way to specify the type of handler, probably
711            ## using a DTDHandler=>Name syntax, not sure.  Perhaps
712            ## using a hash would be best.
713
714            if ( $proc =~ /^\s*<|\|\s*(?!\n)$/ ) {
715                ## Looks like the caller wants to slurp a file
716                ## We open it ourselves to get all of Perl's magical
717                ## "open" goodness.  TODO: also check for a URL scheme
718                ## and handle that :).
719
720                ## TODO: Move this in to a/the parse method so it can
721                ## be repeated.
722                require Symbol;
723                my $fh = Symbol::gensym;
724                open $fh, $proc or croak "$! opening '$proc'";
725                require XML::SAX::ParserFactory;
726                require IO::Handle;
727                $proc = XML::SAX::ParserFactory->parser(
728                    Source => {
729                        ByteStream => $fh,
730                    },
731                    map {
732                        ( Handler => $_ ),
733                    } @handler_procs
734                );
735
736            }
737            elsif (
738                XML::SAX::Machines->processor_class_option(
739                    $proc,
740                    "ConstructWithHashedOptions"
741                )
742            ) {
743                ## This is designed to build options in a format compatible
744                ## with SAXT style constructors when multiple handlers are
745                ## defined.
746                $proc = $proc->new(
747                    map {
748                        { Handler => $_ }, ## Hashes
749                    } @handler_procs       ## 0 or more of 'em
750                );
751            }
752            else {
753                ## More common Foo->new( Handler => $h );
754                croak "$proc->new doesn't allow multiple handlers.\nSet ConstructWithOptionsHashes => 1 in XML::SAX::Machines::ConfigDefaults if need be"
755                    if @handler_procs > 1;
756                $proc = $proc->new(
757                    map {
758                        ( Handler => $_ ),  ## A plain list
759                    } @handler_procs        ## with 0 or 1 elts
760                );
761            }
762            $self->{PartsByProcessor}->{$proc} = $part;
763        }
764        elsif ( @handler_procs ) {
765            if ( $proc->can( "set_handlers" ) ) {
766                $proc->set_handlers( @handler_procs );
767            }
768            elsif ( $proc->can( "set_handler" ) ) {
769                if ( @handler_procs == 1 ) {
770                    $proc->set_handler( @handler_procs );
771                }
772                else {
773                    die "SAX machine part ", $part->string_description,
774                    " can only take one handler at a time\n";
775                }
776            }
777            else {
778                die "SAX machine part ", $part->string_description,
779                " does not provide a set_handler() or set_handlers() method\n"
780            }
781        }
782
783        $part->{Processor} = $proc;
784    };
785
786    --$self->{SeenParts}->[$part->{Number}];
787    $self->{BuiltParts}->[$part_num] = 1;
788
789
790    if ( $@ ) {
791        chomp $@;
792        $@ .= "\n        ...while building " . $part->string_description . "\n";
793        die $@;
794    }
795
796    return @open_cycles;
797}
798
799
800sub _compile_specs {
801    my $self = shift;
802
803    my @errors;
804
805    ## Init the permanent structures
806    $self->{Parts}            = [];
807    $self->{PartsByName}      = {};
808    $self->{PartsByProcessor} = {};
809
810    ## And some temporary structures.
811    $self->{IllegalNames}  = {};
812    $self->{OverusedNames} = {};
813
814    ## Scan the specs and figure out the connectivity, names and load
815    ## any requirements, etc.
816    for my $spec ( @_ ) {
817        eval {
818            $self->_push_spec(
819                ref $spec eq "ARRAY"
820                    ? @$spec
821                    : ( undef, $spec )
822            );
823        };
824        ## This could be ugly if $@ contains a stack trace, but it'll have
825        ## to do.
826        if ( $@ ) {
827            chomp $@;
828            push @errors, $@;
829        }
830    }
831
832    push @errors, (
833        _names_err_msgs(
834            "illegal SAX machine part name%S ",
835            $self->{IllegalNames}
836        ),
837        _names_err_msgs(
838            "undefined SAX machine part%S specified as handler%S ",
839            grep defined && ! $self->_find_part_rec( $_ ),
840                grep ! ref && $_ ne "Exhaust",
841                    map @{$_->{Handlers}},
842                        @{$self->{Parts}}
843        ),
844        _names_err_msgs(
845            "multiple SAX machine parts named ",
846            $self->{OverusedNames}
847        )
848    );
849
850    ## Free some memory and make object dumps smaller
851    delete $self->{IllegalNames};
852    delete $self->{OverusedNames};
853
854    ## If we made it this far, all classes have been loaded and all
855    ## non-processor refs have been converted in to processors.
856    ## Now
857    ## we need to build and that were specified by type name and do
858    ## them in reverse order so we can pass the
859    ## Handler option(s) in.
860    ## If multiple handlers are defined, then
861    ## we assume that the constructor takes a SAXT like parameter list.
862    ## TODO: figure out how to allow DocumentHandler, etc.  Perhaps allow
863    ## HASH refs in ARRAY syntax decls.
864
865    ## Some temporaries
866    $self->{BuiltParts}     = [];
867    $self->{SeenParts}      = [];
868    $self->{DetectedCycles} = [];
869
870    ## _build_part is recursive and builds any downstream handlers
871    ## needed to build a part.
872    for ( @{$self->{Parts}} ) {
873        eval {
874            push @{$self->{DetectedCycles}}, $self->_build_part( $_ );
875        };
876        if ( $@ ) {
877            chomp $@;
878            push @errors, $@;
879        }
880    }
881
882#    $self->{PartsByName}->{Intake}  ||= $self->{Parts}->[0];
883#    $self->{PartsByName}->{Exhaust} ||= $self->{Parts}->[-1];
884
885    if ( @{$self->{DetectedCycles}} ) {
886        ## Remove duplicate (cycles are found once for each processor in
887        ## the cycle.
888        my %unique_cycles;
889
890        for my $cycle ( @{$self->{DetectedCycles}} ) {
891            my $start = 0;
892            for ( 1..$#$cycle ) {
893                $start = $_
894                    if $cycle->[$_]->{Number} < $cycle->[$start]->{Number};
895            }
896            my $key = join(
897                ",",
898                map $_->{Number},
899                    @{$cycle}[$start..($#$cycle),0..($start-1)]
900            );
901            $unique_cycles{$key} ||= $cycle;
902        }
903
904        push @errors, map {
905            "Cycle detected in SAX machine: " .
906                join(
907                    "->",
908                    map $_->string_description, $_->[-1], @$_
909                );
910        } map $unique_cycles{$_}, sort keys %unique_cycles;
911    }
912
913    delete $self->{SeenParts};
914    delete $self->{BuiltParts};
915    delete $self->{DetectedCycles};
916
917    croak join "\n", @errors if @errors;
918}
919
920
921sub _SAX2_attrs {
922    my %a = @_;
923
924    return {
925        map {
926            defined $a{$_}
927                ? ( $_ => {
928                    LocalName => $_,
929                    Name      => $_,
930                    Value     => $a{$_},
931                } )
932                : () ;
933        } keys %a
934    };
935}
936
937
938my %ids;
939sub _idify($) {
940    $ids{$_[0]} = keys %ids unless exists $ids{$_[0]};
941    return $ids{$_[0]};
942}
943
944
945sub pointer_elt {
946    my $self = shift;
947    my ( $elt_type, $h_spec, $options ) = @_;
948
949    my $part_rec;
950
951    $h_spec = $self->{Handler}
952        if $h_spec eq "Exhaust" && defined $self->{Handler};
953
954    ## Look locally first in case the name is not
955    ## unique among parts in RootMachine.
956    $part_rec = $self->_find_part_rec( $h_spec )
957        if ! $part_rec;
958
959    ## Don't look for indexes in RootMachine
960    $part_rec = $options->{RootMachine}->_find_part_rec(
961        $h_spec
962    ) if ! $part_rec
963        && defined $options->{RootMachine}
964        && $h_spec != /^-?\d+$/ ;
965
966    my %attrs;
967
968    if ( $part_rec ) {
969        %attrs = (
970            name           => $part_rec->{Name} || $h_spec,
971            "handler-id"   => _idify $part_rec->{Processor},
972        );
973    }
974    else {
975        if ( ref $h_spec ) {
976            %attrs = (
977                type         => ref $h_spec,
978                "handler-id" => _idify $h_spec,
979            );
980        }
981        else {
982            %attrs = (
983                name => $h_spec,
984            );
985        }
986    }
987
988    return {
989        Name       => $elt_type,
990        LocalName  => $elt_type,
991        Attributes => _SAX2_attrs( %attrs ),
992    };
993}
994
995
996sub generate_part_descriptions {
997    my $self = shift;
998    my ( $options ) = @_;
999
1000    my $h = $options->{Handler};
1001    croak "No Handler passed" unless $h;
1002
1003    for my $part_rec ( @{$self->{Parts}} ) {
1004        my $proc = $part_rec->{Processor};
1005
1006        if ( $proc->can( "generate_description" ) ) {
1007            $proc->generate_description( {
1008                %$options,
1009                Name        => $part_rec->{Name},
1010                Description => $part_rec->string_description,
1011            } );
1012        }
1013        else {
1014            my $part_elt = {
1015                LocalName  => "part",
1016                Name       => "part",
1017                Attributes => _SAX2_attrs(
1018                    id          => _idify $proc,
1019                    type        => ref $part_rec,
1020                    name        => $part_rec->{Name},
1021                    description => $part_rec->string_description,
1022                ),
1023            };
1024            $h->start_element( $part_elt );
1025            for my $h_spec ( @{$part_rec->{Handlers}} ) {
1026                my $handler_elt = $self->pointer_elt( "handler", $h_spec );
1027
1028                $h->start_element( $handler_elt );
1029                $h->end_element(   $handler_elt );
1030            }
1031            $h->end_element( $part_elt );
1032        }
1033    }
1034}
1035
1036
1037sub generate_description {
1038    my $self = shift;
1039
1040    my $options =
1041        @_ == 1
1042            ? ref $_[0] eq "HASH"
1043                ? { %{$_[0]} }
1044                : {
1045                    Handler =>
1046                        ref $_[0]
1047                            ? $_[0]
1048                            : $self->_resolve_spec( $_[0] )
1049                }
1050            : { @_ };
1051
1052    my $h = $options->{Handler};
1053    croak "No Handler passed" unless $h;
1054
1055    unless ( $options->{Depth} ) {
1056        %ids = ();
1057        $options->{RootMachine} = $self;
1058
1059        $h->start_document({});
1060    }
1061
1062    ++$options->{Depth};
1063    my $root_elt = {
1064        LocalName => "sax-machine",
1065        Name      => "sax-machine",
1066        Attributes => _SAX2_attrs(
1067            id          => _idify $self,
1068            type        => ref $self,
1069            name        => $options->{Name},
1070            description => $options->{Description},
1071        ),
1072    };
1073
1074    $h->start_element( $root_elt );
1075
1076    ## Listing the handler first so it doesn't look like a part's
1077    ## handler (which it kinda does if it's hanging out *after* a <part .../>
1078    ## tag :).  Also makes following the links by hand a tad easier.
1079    if ( defined $self->{Handler} ) {
1080        my $handler_elt = $self->pointer_elt( "handler", $self->{Handler} );
1081        $handler_elt->{Attributes}->{name} = {
1082            Name      => "name",
1083            LocalName => "name",
1084            Value     => "Exhaust"
1085        } unless exists $handler_elt->{Attributes}->{Name};
1086
1087        $h->start_element( $handler_elt );
1088        $h->end_element(   $handler_elt );
1089    }
1090
1091    for ( sort keys %{$self->{PartsByName}} ) {
1092        if ( $self->{PartsByName}->{$_}->{Name} ne $_ ) {
1093        warn $self->{PartsByName}->{$_}->{Name}, " : ", $_;
1094            my $handler_elt = $self->pointer_elt( "alias", $_ );
1095            %{$handler_elt->{Attributes}} = (
1096                %{$handler_elt->{Attributes}},
1097                %{_SAX2_attrs( alias => $_ )},
1098            );
1099            $h->start_element( $handler_elt );
1100            $h->end_element(   $handler_elt );
1101        }
1102    }
1103
1104    $self->generate_part_descriptions( $options );
1105    $h->end_element( $root_elt );
1106
1107    --$options->{Depth};
1108    $h->end_document({}) unless $options->{Depth};
1109}
1110
1111
1112##
1113## This is a private class, only this class should use it directly.
1114##
1115package XML::SAX::Machine::Part;
1116{
1117  $XML::SAX::Machine::Part::VERSION = '0.46';
1118}
1119
1120use fields (
1121    'Name',       ## The caller-given name of the part
1122    'Number',     ## Where it sits in the parts list.
1123    'Processor',  ## The actual SAX processor
1124    'Handlers',   ## The handlers the caller specified
1125);
1126
1127
1128sub new {
1129    my $proto = shift;
1130    my $class = ref $proto || $proto;
1131
1132    my $self = bless {}, $class;
1133
1134    my %options = @_ ;
1135    $self->{$_} = $options{$_} for keys %options;
1136
1137    return $self;
1138}
1139
1140
1141sub string_description {
1142    my $self = shift;
1143
1144    return join(
1145        "",
1146        $self->{Name}
1147            ? $self->{Name}
1148            : ( "#", $self->{Number} ),
1149        " (",
1150        $self->{Processor}
1151            ? ( ref $self->{Processor} || $self->{Processor} )
1152            : "<undefined processor>",
1153        ")"
1154    );
1155}
1156
11571;
1158
1159__END__
1160
1161=pod
1162
1163=head1 NAME
1164
1165XML::SAX::Machine - Manage a collection of SAX processors
1166
1167=head1 VERSION
1168
1169version 0.46
1170
1171=head1 SYNOPSIS
1172
1173    ## Note: See XML::SAX::Pipeline and XML::SAX::Machines first,
1174    ## this is the gory, detailed interface.
1175
1176    use My::SAX::Machines qw( Machine );
1177    use My::SAX::Filter2;
1178    use My::SAX::Filter3;
1179
1180    my $filter3 = My::SAX::Filter3->new;
1181
1182    ## A simple pipeline.  My::SAX::Filter1 will be autoloaded.
1183    my $m = Machine(
1184        #
1185        # Name   => Class/object            => handler(s)
1186        #
1187        [ Intake => "My::SAX::Filter1"      => "B"        ],
1188        [ B      => My::SAX::Filter2->new() => "C"        ],
1189        [ C      => $filter3                => "D"        ],
1190        [ D      => \*STDOUT                              ],
1191    );
1192
1193    ## A parser will be created unless My::SAX::Filter1 can parse_file
1194    $m->parse_file( "foo.revml" );
1195
1196    my $m = Machine(
1197        [ Intake   => "My::SAX::Filter1"  => qw( Tee     ) ],
1198        [ Tee      => "XML::Filter::SAXT" => qw( Foo Bar ) ],
1199        [ Foo      => "My::SAX::Filter2"  => qw( Out1    ) ],
1200        [ Out1     => \$log                                ],
1201        [ Bar      => "My::SAX::Filter3"  => qw( Exhaust ) ],
1202    );
1203
1204=head1 DESCRIPTION
1205
1206B<WARNING>: This API is alpha!!!  It I<will> be changing.
1207
1208A generic SAX machine (an instance of XML::SAX::Machine) is a container
1209of SAX processors (referred to as "parts") connected in arbitrary ways.
1210
1211Each parameter to C<Machine()> (or C<XML::SAX::Machine->new()>)
1212represents one top level part of the machine.  Each part has a name, a
1213processor, and one or more handlers (usually specified by name, as shown
1214in the SYNOPSIS).
1215
1216Since SAX machines may be passed in as single top level parts, you can
1217also create nested, complex machines ($filter3 in the SYNOPSIS could be
1218a Pipeline, for example).
1219
1220A SAX machines can act as a normal SAX processors by connecting them to
1221other SAX processors:
1222
1223    my $w = My::Writer->new();
1224    my $m = Machine( ...., { Handler => $w } );
1225    my $g = My::Parser->new( Handler => $w );
1226
1227=head2 Part Names
1228
1229Although it's not required, each part in a machine can be named.  This
1230is useful for retrieving and manipulating the parts (see L</part>, for
1231instance), and for debugging, since debugging output (see
1232L</trace_parts> and L</trace_all_parts>) includes the names.
1233
1234Part names must be valid Perl subroutine names, beginning with an
1235uppercase character.  This is to allow convenience part accessors
1236methods like
1237
1238    $c = $m->NameOfAFilter;
1239
1240to work without ever colliding with the name of a method (all method
1241names are completely lower case).  Only filters named like this can be
1242accessed using the magical accessor functions.
1243
1244=head2 Reserved Names: Intake and Exhaust
1245
1246The names c<Intake> and C<Exhaust> are reserved.  C<Intake> refers to
1247the first part in the processing chain.  This is not necessarily the
1248first part in the constructor list, just the first part to receive
1249external events.
1250
1251C<Exhaust> refers to the output of the machine; no part may be named
1252C<Exhaust>, and any parts with a handler named C<Exhaust> will deliver
1253their output to the machine's handler.  Normally, only one part should
1254deliver it's output to the Exhaust port.
1255
1256Calling $m->set_handler() alters the Exhaust port, assuming any
1257processors pointing to the C<Exhaust> provide a C<set_handler()> method
1258like L<XML::SAX::Base>'s.
1259
1260C<Intake> and C<Exhaust> are usually assigned automatically by
1261single-purpose machines like L<XML::SAX::Pipeline> and
1262L<XML::SAX::Manifold>.
1263
1264=head2 SAX Processor Support
1265
1266The XML::SAX::Machine class is very agnostic about what SAX processors
1267it supports; about the only constraint is that it must be a blessed
1268reference (of any type) that does not happen to be a Perl IO::Handle
1269(which are assumed to be input or output filehandles).
1270
1271The major constraint placed on SAX processors is that they must provide
1272either a C<set_handler> or C<set_handlers> method (depending on how many
1273handlers a processor can feed) to allow the SAX::Machine to disconnect
1274and reconnect them.  Luckily, this is true of almost any processor
1275derived from XML::SAX::Base.  Unfortunately, many SAX older (SAX1)
1276processors do not meet this requirement; they assume that SAX processors
1277will only ever be connected together using their constructors.
1278
1279=head2 Connections
1280
1281SAX machines allow you to connect the parts however you like; each part
1282is given a name and a list of named handlers to feed.  The number of
1283handlers a part is allowed depends on the part; most filters only allow
1284once downstream handler, but filters like L<XML::Filter::SAXT> and
1285L<XML::Filter::Distributor> are meant to feed multiple handlers.
1286
1287Parts may not be connected in loops ("cycles" in graph theory terms).
1288The machines specified by:
1289
1290    [ A => "Foo" => "A" ],  ## Illegal!
1291
1292and
1293
1294    [ A => "Foo" => "B" ],  ## Illegal!
1295    [ B => "Foo" => "A" ],
1296
1297.  Configuring a machine this way would cause events to flow in an
1298infinite loop, and/or cause the first processor in the cycle to start
1299receiving events from the end of the cycle before the input document was
1300complete.  Besides, it's not a very useful topology :).
1301
1302SAX machines detect loops at construction time.
1303
1304=head1 NAME
1305
1306    XML::SAX::Machine - Manage a collection of SAX processors
1307
1308=head1 API
1309
1310=head2 Public Methods
1311
1312These methods are meant to be used by users of SAX machines.
1313
1314=over
1315
1316=item new()
1317
1318    my $m = $self->new( @machine_spec, \%options );
1319
1320Creates $self using %options, and compiles the machine spec.  This is
1321the longhand form of C<Machines( ... )>.
1322
1323=item find_part
1324
1325Gets a part contained by this machine by name, number or object reference:
1326
1327    $c = $m->find_part( $name );
1328    $c = $m->find_part( $number );
1329    $c = $m->find_part( $obj );    ## useful only to see if $obj is in $m
1330
1331If a machine contains other machines, parts of the contained machines
1332may be accessed by name using unix directory syntax:
1333
1334    $c = $m->find_part( "/Intake/Foo/Bar" );
1335
1336(all paths must be absolute).
1337
1338Parts may also be accessed by number using array indexing:
1339
1340    $c = $m->find_part(0);  ## Returns first part or undef if none
1341    $c = $m->find_part(-1); ## Returns last part or undef if none
1342    $c = $m->find_part( "Foo/0/1/-1" );
1343
1344There is no way to guarantee that a part's position number means
1345anything, since parts can be reconnected after their position numbers
1346are assigned, so using a part name is recommended.
1347
1348Throws an exception if the part is not found, so doing things like
1349
1350   $m->find_part( "Foo" )->bar()
1351
1352garner informative messages when "Foo" is not found.  If you want to
1353test a result code, do something like
1354
1355    my $p = eval { $m->find_part };
1356    unless ( $p ) {
1357        ...handle lookup failure...
1358    }
1359
1360=item parts
1361
1362    for ( $m->parts ) { ... }
1363
1364Gets an arbitrarily ordered list of top level parts in this machine.
1365This is all of the parts directly contained by this machine and none of
1366the parts that may be inside them.  So if a machine contains an
1367L<XML::SAX::Pipeline> as one of it's parts, the pipeline will be
1368returned but not the parts inside the pipeline.
1369
1370=item all_parts
1371
1372    for ( $m->all_parts ) { ... }
1373
1374Gets all parts in this machine, not just top level ones. This includes
1375any machines contained by this machine and their parts.
1376
1377=item set_handler
1378
1379    $m->set_handler( $handler );
1380    $m->set_handler( DTDHandler => $handler );
1381
1382Sets the machine's handler and sets the handlers for all parts that
1383have C<Exhaust> specified as their handlers.  Requires that any such
1384parts provide a C<set_handler> or (if the part has multiple handlers)
1385a C<set_handlers> method.
1386
1387NOTE: handler types other than "Handler" are only supported if they are
1388supported by whatever parts point at the C<Exhaust>.  If the handler type is
1389C<Handler>, then the appropriate method is called as:
1390
1391    $part->set_handler( $handler );
1392    $part->set_handlers( $handler0, $handler1, ... );
1393
1394If the type is some other handler type, these are called as:
1395
1396    $part->set_handler( $type => $handler );
1397    $part->set_handlers( { $type0 => $handler0 }, ... );
1398
1399=item trace_parts
1400
1401    $m->trace_parts;          ## trace all top-level parts
1402    $m->trace_parts( @ids );  ## trace the indicated parts
1403
1404Uses Devel::TraceSAX to enable tracing of all events received by the parts of
1405this machine.  Does not enable tracing of parts contained in machines in this
1406machine; for that, see trace_all_parts.
1407
1408=item trace_all_parts
1409
1410    $m->trace_all_parts;      ## trace all parts
1411
1412Uses Devel::TraceSAX to trace all events received by the parts of this
1413machine.
1414
1415=item untracify_parts
1416
1417    $m->untracify_parts( @ids );
1418
1419Converts the indicated parts to SAX processors with tracing enabled.
1420This may not work with processors that use AUTOLOAD.
1421
1422=back
1423
1424=head1 Events and parse routines
1425
1426XML::SAX::Machine provides all SAX1 and SAX2 events and delgates them to the
1427processor indicated by $m->find_part( "Intake" ).  This adds some overhead, so
1428if you are concerned about overhead, you might want to direct SAX events
1429directly to the Intake instead of to the machine.
1430
1431It also provides parse...() routines so it can whip up a parser if need
1432be.  This means: parse(), parse_uri(), parse_string(), and parse_file()
1433(see XML::SAX::EventMethodMaker for details).  There is no way to pass
1434methods directly to the parser unless you know that the Intake is a
1435parser and call it directly.  This is not so important for parsing,
1436because the overhead it takes to delegate is minor compared to the
1437effort needed to parse an XML document.
1438
1439=head2 Internal and Helper Methods
1440
1441These methods are meant to be used/overridden by subclasses.
1442
1443=over
1444
1445=item _compile_specs
1446
1447    my @comp = $self->_compile_specs( @_ );
1448
1449Runs through a list of module names, output specifiers, etc., and builds
1450the machine.
1451
1452    $scalar     --> "$scalar"->new
1453    $ARRAY_ref  --> pipeline @$ARRAY_ref
1454    $SCALAR_ref --> XML::SAX::Writer->new( Output => $SCALAR_ref )
1455    $GLOB_ref   --> XML::SAX::Writer->new( Output => $GLOB_ref )
1456
1457=item generate_description
1458
1459    $m->generate_description( $h );
1460    $m->generate_description( Handler => $h );
1461    $m->generate_description( Pipeline ... );
1462
1463Generates a series of SAX events to the handler of your choice.
1464
1465See L<XML::Handler::Machine2GraphViz> on CPAN for a way of visualizing
1466machine innards.
1467
1468=back
1469
1470=head1 TODO
1471
1472=over
1473
1474=item *
1475
1476Separate initialization from construction time; there should be somthing
1477like a $m->connect( ....machine_spec... ) that new() calls to allow you
1478to delay parts speficication and reconfigure existing machines.
1479
1480=item *
1481
1482Allow an XML doc to be passed in as a machine spec.
1483
1484=back
1485
1486=head1 LIMITATIONS
1487
1488=over
1489
1490=back
1491
1492=head1 AUTHOR
1493
1494    Barrie Slaymaker <barries@slaysys.com>
1495
1496=head1 LICENSE
1497
1498Artistic or GPL, any version.
1499
1500=head1 AUTHORS
1501
1502=over 4
1503
1504=item *
1505
1506Barry Slaymaker
1507
1508=item *
1509
1510Chris Prather <chris@prather.org>
1511
1512=back
1513
1514=head1 COPYRIGHT AND LICENSE
1515
1516This software is copyright (c) 2013 by Barry Slaymaker.
1517
1518This is free software; you can redistribute it and/or modify it under
1519the same terms as the Perl 5 programming language system itself.
1520
1521=cut
1522