1package XML::SAX::Machine; 2{ 3 $XML::SAX::Machine::VERSION = '0.46'; 4} 5# ABSTRACT: Manage a collection of SAX processors 6 7 8 9use strict; 10 11use constant has_named_regexp_character_classes => $] > 5.006000; 12 13use Carp; 14use UNIVERSAL; 15use XML::SAX::EventMethodMaker qw( :all ); 16use XML::SAX::Machines; 17 18## Tell the config stuff what options we'll be requesting, so we 19## don't get typoes in this code. Very annoying, but I mispelt it 20## so often, that adding one statement like this seemed like a low 21## pain solution, since testing options like this can be long and 22## bothersome. 23XML::SAX::Machines->expected_processor_class_options(qw( 24 ConstructWithHashedOptions 25)); 26 27 28 29sub new { 30 my $proto = shift; 31 my $class = ref $proto || $proto; 32 33 my @options_if_any = @_ && ref $_[-1] eq "HASH" ? %{pop()} : (); 34 my $self = bless { @options_if_any }, $class; 35 36 $self->{Parts} = []; 37 $self->{PartsByName} = {}; ## Mapping of names to parts 38 39 $self->_compile_specs( @_ ); 40 41 ## Set this last in case any specs have handler "Exhaust" 42 $self->set_handler( $self->{Handler} ) if $self->{Handler}; 43 44 return $self; 45} 46 47 48sub _find_part_rec { 49 my $self = shift; 50 my ( $id ) = @_; 51 52 if ( ref $id ) { 53 return exists $self->{PartsByProcessor}->{$id} 54 && $self->{PartsByProcessor}->{$id}; 55 } 56 57 if ( $id =~ /^[+-]?\d+(?!\n)$/ ) { 58 return undef 59 if $id > $#{$self->{Parts}} 60 || $id < - ( $#{$self->{Parts}} + 1 ); 61 return $self->{Parts}->[$id]; 62 } 63 64 return $self->{PartsByName}->{$id} 65 if exists $self->{PartsByName}->{$id}; 66 67 return undef; 68} 69 70 71sub find_part { 72 my $self = shift; 73 my ( $spec ) = @_; 74 75 return $self->{Handler} if $spec eq "Exhaust"; 76 77 my $part_rec; 78 79 if ( 0 <= index $spec, "/" ) { 80 ## Take the sloooow road... 81 require File::Spec::Unix; 82 croak "find_part() path not absolute: '$spec'" 83 unless File::Spec::Unix->file_name_is_absolute( $spec ); 84 85 ## Cannonical-ize it, do /foo/../ => / conversion 86 $spec = File::Spec::Unix->canonpath( $spec ); 87 1 while $spec =~ s{/[^/]+/\.\.(/|(?!\n\Z))}{$1}; 88 89 my @names = File::Spec::Unix->splitdir( $spec ); 90 pop @names while @names && ! length $names[-1]; 91 shift @names while @names && ! length $names[0]; 92 93 croak "invalid find_part() specification: '$spec'" 94 unless File::Spec::Unix->file_name_is_absolute( $spec ); 95 96 my @audit_trail; 97 my $proc = $self; 98 for ( @names ) { 99 push @audit_trail, $_; 100 $part_rec = $proc->_find_part_rec( $_ ); 101 unless ( $part_rec ) { 102 croak "find_path() could not find '", 103 join( "/", "", @audit_trail ), 104 "' in ", ref $self; 105 } 106 $proc = $part_rec->{Processor}; 107 } 108 } 109 else { 110 $part_rec = $self->_find_part_rec( $spec ); 111 } 112 113 croak "find_path() could not find '$spec' in ", ref $self 114 unless $part_rec; 115 116 my $proc = $part_rec->{Processor}; 117 118 ## Be paranoid here, just in case we have a bug somewhere. I prefer 119 ## getting reasonable bug reports... 120 confess "find_path() found an undefined Processor reference as part '$_[0]' in ", 121 ref $self 122 unless defined $proc; 123 124 confess "find_path() found '$proc' instead of a Processor reference as part '$_[0]' in ", 125 ref $self 126 unless ref $proc; 127 128 confess "find_path() found a ", 129 ref $proc, 130 " reference instead of a Processor reference in part '$_[0]' in ", 131 ref $self 132 unless index( "SCALAR|ARRAY|HASH|Regexp|REF|CODE", ref $proc ) <= 0; 133 134 return $proc; 135} 136 137 138use vars qw( $AUTOLOAD ); 139 140sub DESTROY {} ## Prevent AUTOLOADing of this. 141 142my $alpha_first_char = has_named_regexp_character_classes 143 ? "^[[:alpha:]]" 144 : "^[a-zA-Z]"; 145 146sub AUTOLOAD { 147 my $self = shift; 148 149 $AUTOLOAD =~ s/.*://; 150 151 my $fc = substr $AUTOLOAD, 0, 1; 152 ## TODO: Find out how Perl determines "alphaness" and use that. 153 croak ref $self, " does not provide method $AUTOLOAD" 154 unless $fc eq uc $fc && $AUTOLOAD =~ /$alpha_first_char/o; 155 156 my $found = $self->find_part( $AUTOLOAD ); 157 return $found; 158} 159 160 161sub parts { 162 my $self = shift; 163 croak "Can't set parts for a '", ref( $self ), "'" if @_; 164 confess "undef Parts" unless defined $self->{Parts}; 165 return map $_->{Processor}, @{$self->{Parts}}; 166} 167 168 169## TODO: Detect deep recursion in _all_part_recs(). In fact, detect deep 170## recursion when building the machine. 171 172sub _all_part_recs { 173 my $self = shift; 174 croak "Can't pass parms to ", ref( $self ), "->_all_part_recs" if @_; 175 confess "undef Parts" unless defined $self->{Parts}; 176 my $proc; 177 return map { 178 $proc = $_->{Processor}; 179 UNIVERSAL::can( $proc, "all_parts" ) 180 ? ( $_, $proc->_all_part_recs ) 181 : $_; 182 } @{$self->{Parts}}; 183} 184 185 186sub all_parts { 187 my $self = shift; 188 croak "Can't pass parms to ", ref( $self ), "->_all_parts" if @_; 189 confess "undef Parts" unless defined $self->{Parts}; 190 return map $_->{Processor}, $self->_all_part_recs; 191} 192 193 194#=item add_parts 195# 196# $m->add_parts( { Foo => $foo, Bar => $bar } ); 197# 198#On linear machines: 199# 200# $m->add_parts( @parts ); 201# 202#Adds one or more parts to the machine. Does not connect them, you need 203#to do that manually (we need to add a $m->connect_parts() style API). 204# 205#=cut 206# 207#sub add_parts { 208# my $self = shift; 209#confess "TODO"; 210#} 211 212#=item remove_parts 213# 214# $m->remove_parts( qw( Foo Bar ) ); 215# 216#Slower, but possible: 217# 218# $m->remove_parts( $m->Foo, $m->Bar ); 219# 220#On linear machines: 221# 222# $m->remove_parts( 1, 3 ); 223# 224#Removes one or more parts from the machine. Does not connect them 225#except on linear machines. Attempts to disconnect any parts that 226#point to them, and that they point to. This attempt will fail for any 227#part that does not provide a handler() or handlers() method. 228# 229#This is breadth-first recursive, like C<$m->find_part( $id )> is. This 230#will remove *all* parts with the given names from a complex 231#machine (this does not apply to index numbers). 232# 233#Returns a list of the removed parts. 234# 235#If a name is not found, it is ignored. 236# 237#=cut 238# 239#sub remove_parts { 240# my $self = shift; 241# 242# my %names; 243# my @found; 244# 245# for my $doomed ( @_ ) { 246# unless ( ref $doomed ) { 247# $names{$doomed} = undef; 248# if ( my $f = delete $self->{Parts}->{$doomed} ) { 249# push @found, $f; 250# } 251# else { 252# for my $c ( $self->parts ) { 253# if ( $c->can( "remove_parts" ) 254# && ( my @f = $c->remove_parts( $doomed ) ) 255# ) { 256# push @found, @f; 257# } 258# } 259# } 260# } 261# else { 262# ## It's a reference. Do this the slow, painful way. 263# for my $name ( keys %{$self->{Parts}} ) { 264# if ( $doomed == $self->{Parts}->{$name} ) { 265# $names{$name} = undef; 266# push @found, delete $self->{Parts}->{$name}; 267# } 268# } 269# 270# for my $c ( $self->parts ) { 271# if ( $c->can( "remove_parts" ) 272# && ( my @f = $c->remove_parts( $doomed ) ) 273# ) { 274# push @found, @f; 275# } 276# } 277# } 278# } 279# 280# for my $c ( sort keys %{$self->{Connections}} ) { 281# if ( exists $names{$self->{Connections}->{$c}} ) { 282###TODO: Unhook the processors if possible 283# delete $self->{Connections}->{$c}; 284# } 285# if ( exists $names{$c} ) { 286###TODO: Unhook the processors if possible 287# delete $self->{Connections}->{$c}; 288# } 289# } 290# 291# return @found; 292#} 293 294 295sub set_handler { 296 my $self = shift; 297 my ( $handler, $type ) = reverse @_; 298 299 $type ||= "Handler"; 300 301 for my $part_rec ( @{$self->{Parts}} ) { 302 my $proc = $part_rec->{Processor}; 303 my $hs = $part_rec->{Handlers}; 304 305 if ( grep ref $_ ? $_ == $self->{$type} : $_ eq "Exhaust", @$hs ) { 306 if ( @$hs == 1 && $proc->can( "set_handler" ) ) { 307 $proc->set_handler( 308 $type ne "Handler" ? $type : (), 309 $handler 310 ); 311 next; 312 } 313 314 unless ( $proc->can( "set_handlers" ) ) { 315 croak ref $proc, 316 @$hs == 1 317 ? " has no set_handler or set_handlers method" 318 : " has no set_handlers method" 319 } 320 321 $proc->set_handlers( 322 map { 323 my $h; 324 my $t; 325 if ( ref $_ ) { 326 $h = $_; 327 $t = "Handler"; 328 } 329 elsif ( $_ eq "Exhaust" ) { 330 $h = $handler; 331 $t = $type; 332 } else { 333 ( $h, $t ) = reverse split /=>/, $_; 334 $h = $self->find_part( $h ); 335 $t = $type; 336 croak "Can't locate part $_ to be a handler for ", 337 $part_rec->string_description 338 unless $h; 339 } 340 { $type => $h } 341 } @$hs 342 ); 343 } 344 } 345 346 $self->{$type} = $handler; 347} 348 349 350my $warned_about_missing_sax_tracer; 351sub trace_parts { 352 my $self = shift; 353 354 unless ( eval "require Devel::TraceSAX; 1" ) { 355 warn $@ unless $warned_about_missing_sax_tracer++; 356 return; 357 } 358 359 360 for ( @_ ? map $self->_find_part_rec( $_ ), @_ : @{$self->{Parts}} ) { 361 Devel::TraceSAX::trace_SAX( 362 $_->{Processor}, 363 $_->string_description 364 ); 365 } 366 367 ## some parts are created lazily, let's trace those, too 368 $self->{TraceAdHocParts} ||= 1 unless @_; 369} 370 371 372 373sub trace_all_parts { 374 my $self = shift; 375 376 croak "Can't pass parms to trace_all_parts" if @_; 377 378 unless ( eval "require Devel::TraceSAX; 1" ) { 379 warn $@ unless $warned_about_missing_sax_tracer++; 380 return; 381 } 382 383 for ( @{$self->{Parts}} ) { 384 Devel::TraceSAX::trace_SAX( 385 $_->{Processor}, 386 $_->string_description 387 ); 388 $_->{Processor}->trace_all_parts 389 if $_->{Processor}->can( "trace_all_parts" ); 390 } 391 392 ## some parts are created lazily, let's trace those, too 393 $self->{TraceAdHocParts} = 1; 394} 395 396 397 398sub untracify_parts { 399 my $self = shift; 400 for ( @_ ? map $self->find_part( $_ ), @_ : $self->parts ) { 401 XML::SAX::TraceViaISA::remove_tracing_subclass( $_ ); 402 } 403} 404 405 406 407compile_methods __PACKAGE__, <<'EOCODE', sax_event_names "ParseMethods" ; 408 sub <METHOD> { 409 my $self = shift; 410 my $h = $self->find_part( "Intake" ); 411 croak "SAX machine 'Intake' undefined" 412 unless $h; 413 414 if ( $h->can( "<METHOD>" ) ) { 415 my ( $ok, @result ) = eval { 416 ( 1, wantarray 417 ? $h-><METHOD>( @_ ) 418 : scalar $h-><METHOD>( @_ ) 419 ); 420 }; 421 422 ## Not sure how/where causes me to need this next line, but 423 ## in perl5.6.1 it seems necessary. 424 return wantarray ? @result : $result[0] if $ok; 425 die $@ unless $@ =~ /No .*routine defined/; 426 undef $@; 427 428 if ( $h->isa( "XML::SAX::Base" ) ) { 429 ## Due to a bug in old versions of X::S::B, we need to reset 430 ## this so that it will pass events on. 431 ## TODO: when newer X::S::B's are common, jack up the 432 ## version in Makefile.PL's PREREQ_PM :). 433 delete $h->{ParseOptions}; 434 } 435 } 436 437 require XML::SAX::ParserFactory; 438 $self->{Parser} = XML::SAX::ParserFactory->parser( 439 Handler => $h 440 ); 441 442 Devel::TraceSAX::trace_SAX( 443 $self->{Parser}, 444 "Ad hoc parser (" . ref( $self->{Parser} ) . ")" 445 ) if $self->{TraceAdHocParts}; 446 447 return $self->{Parser}-><METHOD>(@_); 448 } 449EOCODE 450 451 452compile_methods __PACKAGE__, <<'EOCODE', sax_event_names ; 453 sub <EVENT> { 454 my $self = shift; 455 my $h = $self->find_part( "Intake" ); 456 croak "SAX machine 'Intake' undefined" 457 unless $h; 458 459 return $h-><EVENT>( @_ ) if $h->can( "<EVENT>" ); 460 } 461EOCODE 462 463 464 465my %basic_types = ( 466 ARRAY => undef, 467 CODE => undef, 468 GLOB => undef, 469 HASH => undef, 470 REF => undef, ## Never seen this one, but it's listed in perlfunc 471 Regexp => undef, 472 SCALAR => undef, 473); 474 475 476sub _resolve_spec { 477 my $self = shift; 478 my ( $spec ) = @_; 479 480 croak "undef passed instead of a filter to ", ref( $self ), "->new()" 481 unless defined $spec; 482 483 croak "Empty filter name ('') passed to ", ref( $self ), "->new()" 484 unless length $spec; 485 486 my $type = ref $spec; 487 488 if ( 489 $type eq "SCALAR" 490## TODO: || $type eq "ARRAY" <== need XML::SAX::Writer to supt this. 491 || $type eq "GLOB" 492 || UNIVERSAL::isa( $spec, "IO::Handle" ) 493 || ( ! $type && $spec =~ /^\s*([>|]|\+>)/ ) 494 ) { 495## Cheat until XML::SAX::Writer cat grok it 496if ( ! $type ) { 497 use Symbol; 498 my $fh = gensym; 499 open $fh, $spec or croak "$! opening '$spec'" ; 500 $spec = $fh; 501} 502 require XML::SAX::Writer; 503 $spec = XML::SAX::Writer->new( Output => $spec ); 504 } 505 elsif ( !$type ) { 506 if ( $spec !~ /^\s*<|\|\s*(?!\n)$/ ) { 507 ## Doesn't look like the caller wants to slurp a file 508 ## Let's require it now to catch errors early, then 509 ## new() it later after all requires are done. 510 ## delaying the new()s might help us from doing things 511 ## like blowing away output files and then finding 512 ## errors, for instance. 513 croak $@ unless $spec->can( "new" ) || eval "require $spec"; 514 } 515 } 516 else { 517 croak "'$type' not supported in a SAX machine specification\n" 518 if exists $basic_types{$type}; 519 } 520 521 return $spec; 522} 523 524my $is_name_like = has_named_regexp_character_classes 525 ? '^[[:alpha:]]\w*(?!\n)$' 526 : '^[a-zA-Z]\w*(?!\n)$'; 527 528sub _valid_name($) { 529 my ( $prospect ) = @_; 530 return 0 unless defined $prospect && length $prospect; 531 my $fc = substr $prospect, 0, 1; 532 ## I wonder how close to valid Perl method names this is? 533 ( $fc eq uc $fc && $prospect =~ /$is_name_like/o ) ? 1 : 0; 534} 535 536 537sub _push_spec { 538 my $self = shift; 539 my ( $name, $spec, @handlers ) = 540 ref $_[0] 541 ? ( undef, @_ ) ## Implictly unnamed: [ $obj, ... ] 542 : @_; ## Named or explicitly unnamed: [ $name, ...] 543 544 my $part = XML::SAX::Machine::Part->new( 545 Name => $name, 546 Handlers => \@handlers, 547 ); 548 549# if ( grep $_ eq "Exhaust", @handlers ) { 550# $self->{OverusedNames}->{Exhaust} ||= undef 551# if exists $self->{PartsByName}->{Exhaust}; 552# 553# $self->{PartsByName}->{Exhaust} = $self->{Parts}->[-1]; 554# 555# @handlers = grep $_ ne "Exhaust", @handlers; 556# } 557 558 ## NOTE: This may 559 ## still return a non-reference, which is the type of processor 560 ## wanted here. We construct those lazily below; see the docs 561 ## about order of construction. 562 my $proc = $self->_resolve_spec( $spec ); 563 $part->{Processor} = $proc; 564 croak "SAX machine BUG: couldn't resolve spec '$spec'" 565 unless defined $proc; 566 567 push @{$self->{Parts}}, $part; 568 $part->{Number} = $#{$self->{Parts}}; 569 570 if ( defined $name ) { 571 $self->{OverusedNames}->{$name} ||= undef 572 if exists $self->{PartsByName}->{$name}; 573 574 $self->{IllegalNames}->{$name} ||= undef 575 unless _valid_name $name && $name ne "Exhaust"; 576 577 $self->{PartsByName}->{$name} = $self->{Parts}->[-1]; 578 } 579 580 ## This HASH is used to detect cycles even if the user uses 581 ## preconstructed references instead of named parts. 582 $self->{PartsByProcessor}->{$proc} = $part 583 if ref $proc; 584} 585 586 587sub _names_err_msgs { 588 my ( $s, @names ) = @_ ; 589 @names = map ref $_ eq "HASH" ? keys %$_ : $_, @names; 590 return () unless @names; 591 592 @names = keys %{ { map { ( $_ => undef ) } @names } }; 593 594 if ( @names == 1 ) { 595 $s =~ s/%[A-Z]+//g; 596 } 597 else { 598 $s =~ s/%([A-Z]+)/\L$1/g; 599 } 600 601 return $s . join ", ", map "'$_'", sort @names ; 602} 603 604 605sub _build_part { 606 my $self = shift; 607 my ( $part ) = @_; 608 609 my $part_num = $part->{Number}; 610 611 return if $self->{BuiltParts}->[$part_num]; 612 613 confess "SAX machine BUG: cycle found too late" 614 if $self->{SeenParts}->[$part_num]; 615 ++$self->{SeenParts}->[$part_num]; 616 617 ## We retun a list of all cycles that have been discovered but 618 ## not yet completed. We don't return cycles that have been 619 ## completely discovered; those are placed in DetectedCycles. 620 my @open_cycles; 621 622 eval { 623 ## This eval is to make sure we decrement SeenParts so that 624 ## we don't encounter spurious cycle found too late exceptions. 625 626 ## Build any handlers, detect cycles 627 my @handler_procs; 628 629## I decided not to autolink one handler to the next in order to keep 630## from causing hard to diagnose errors when unintended machines are 631## passed in. The special purpose machines, like Pipeline, have 632## that logic built in. 633## ## Link any part with no handlers to the next part. 634## push @{$part->{Handlers}}, $part->{Number} + 1 635## if ! @{$part->{Handlers}} && $part->{Number} < $#{$self->{Parts}}; 636 637 for my $handler_spec ( @{$part->{Handlers}} ) { 638 639 my $handler; 640 641 if ( ref $handler_spec ) { 642 ## The caller specified a handler with a real reference, so 643 ## we don't need to build it, but we do need to do 644 ## cycle detection. _build_part won't build it in this case 645 ## but it will link it and do cycle detection. 646 $handler = $self->{PartsByProcessor}->{$handler_spec} 647 if exists $self->{PartsByProcessor}->{$handler_spec}; 648 649 if ( ! defined $handler ) { 650 ## It's a processor not in this machine. Hope the 651 ## caller knows what it's doing. 652 push @handler_procs, $handler_spec; 653 next; 654 } 655 } 656 else { 657 $handler = $self->_find_part_rec( $handler_spec ); 658 ## all handler specs were checked earlier, so "survive" this 659 ## failure and let the queued error message tell the user 660 ## about it. 661 next unless defined $handler; 662 } 663 664 if ( $self->{SeenParts}->[$handler->{Number}] ) { 665 ## Oop, a cycle, and we don't want to recurse or we'll 666 ## recurse forever. 667 push @open_cycles, $part eq $handler 668 ? [ $handler ] 669 : [ $part, $handler ]; 670 next; 671 } 672 673 my @nested_cycles = $self->_build_part( $handler ); 674 675 my $handler_proc = $handler->{Processor}; 676 677 confess "SAX machine BUG: found a part with no processor: ", 678 $handler->string_description 679 unless defined $handler_proc; 680 681 confess "SAX machine BUG: found a unbuilt '", 682 $handler->{Processor}, 683 "' processor: ", 684 $handler->string_description 685 unless ref $handler_proc; 686 687 push @handler_procs, $handler_proc; 688 689 for my $nested_cycle ( @nested_cycles ) { 690 if ( $nested_cycle->[-1] == $part ) { 691 ## the returned cycle "ended" with our part, so 692 ## we have a complete description of the cycle, log it 693 ## and move on. 694 push @{$self->{DetectedCycles}}, $nested_cycle; 695 } 696 else { 697 ## This part is part of this cycle but not it's "beginning" 698 push @open_cycles, [ $part, $nested_cycle ]; 699 } 700 } 701 } 702 703 ## Create this processor if need be, otherwise just set the handlers. 704 my $proc = $part->{Processor}; 705 confess "SAX machine BUG: undefined processor for ", 706 $part->string_description 707 unless defined $proc; 708 709 unless ( ref $proc ) { 710 ## TODO: Figure a way to specify the type of handler, probably 711 ## using a DTDHandler=>Name syntax, not sure. Perhaps 712 ## using a hash would be best. 713 714 if ( $proc =~ /^\s*<|\|\s*(?!\n)$/ ) { 715 ## Looks like the caller wants to slurp a file 716 ## We open it ourselves to get all of Perl's magical 717 ## "open" goodness. TODO: also check for a URL scheme 718 ## and handle that :). 719 720 ## TODO: Move this in to a/the parse method so it can 721 ## be repeated. 722 require Symbol; 723 my $fh = Symbol::gensym; 724 open $fh, $proc or croak "$! opening '$proc'"; 725 require XML::SAX::ParserFactory; 726 require IO::Handle; 727 $proc = XML::SAX::ParserFactory->parser( 728 Source => { 729 ByteStream => $fh, 730 }, 731 map { 732 ( Handler => $_ ), 733 } @handler_procs 734 ); 735 736 } 737 elsif ( 738 XML::SAX::Machines->processor_class_option( 739 $proc, 740 "ConstructWithHashedOptions" 741 ) 742 ) { 743 ## This is designed to build options in a format compatible 744 ## with SAXT style constructors when multiple handlers are 745 ## defined. 746 $proc = $proc->new( 747 map { 748 { Handler => $_ }, ## Hashes 749 } @handler_procs ## 0 or more of 'em 750 ); 751 } 752 else { 753 ## More common Foo->new( Handler => $h ); 754 croak "$proc->new doesn't allow multiple handlers.\nSet ConstructWithOptionsHashes => 1 in XML::SAX::Machines::ConfigDefaults if need be" 755 if @handler_procs > 1; 756 $proc = $proc->new( 757 map { 758 ( Handler => $_ ), ## A plain list 759 } @handler_procs ## with 0 or 1 elts 760 ); 761 } 762 $self->{PartsByProcessor}->{$proc} = $part; 763 } 764 elsif ( @handler_procs ) { 765 if ( $proc->can( "set_handlers" ) ) { 766 $proc->set_handlers( @handler_procs ); 767 } 768 elsif ( $proc->can( "set_handler" ) ) { 769 if ( @handler_procs == 1 ) { 770 $proc->set_handler( @handler_procs ); 771 } 772 else { 773 die "SAX machine part ", $part->string_description, 774 " can only take one handler at a time\n"; 775 } 776 } 777 else { 778 die "SAX machine part ", $part->string_description, 779 " does not provide a set_handler() or set_handlers() method\n" 780 } 781 } 782 783 $part->{Processor} = $proc; 784 }; 785 786 --$self->{SeenParts}->[$part->{Number}]; 787 $self->{BuiltParts}->[$part_num] = 1; 788 789 790 if ( $@ ) { 791 chomp $@; 792 $@ .= "\n ...while building " . $part->string_description . "\n"; 793 die $@; 794 } 795 796 return @open_cycles; 797} 798 799 800sub _compile_specs { 801 my $self = shift; 802 803 my @errors; 804 805 ## Init the permanent structures 806 $self->{Parts} = []; 807 $self->{PartsByName} = {}; 808 $self->{PartsByProcessor} = {}; 809 810 ## And some temporary structures. 811 $self->{IllegalNames} = {}; 812 $self->{OverusedNames} = {}; 813 814 ## Scan the specs and figure out the connectivity, names and load 815 ## any requirements, etc. 816 for my $spec ( @_ ) { 817 eval { 818 $self->_push_spec( 819 ref $spec eq "ARRAY" 820 ? @$spec 821 : ( undef, $spec ) 822 ); 823 }; 824 ## This could be ugly if $@ contains a stack trace, but it'll have 825 ## to do. 826 if ( $@ ) { 827 chomp $@; 828 push @errors, $@; 829 } 830 } 831 832 push @errors, ( 833 _names_err_msgs( 834 "illegal SAX machine part name%S ", 835 $self->{IllegalNames} 836 ), 837 _names_err_msgs( 838 "undefined SAX machine part%S specified as handler%S ", 839 grep defined && ! $self->_find_part_rec( $_ ), 840 grep ! ref && $_ ne "Exhaust", 841 map @{$_->{Handlers}}, 842 @{$self->{Parts}} 843 ), 844 _names_err_msgs( 845 "multiple SAX machine parts named ", 846 $self->{OverusedNames} 847 ) 848 ); 849 850 ## Free some memory and make object dumps smaller 851 delete $self->{IllegalNames}; 852 delete $self->{OverusedNames}; 853 854 ## If we made it this far, all classes have been loaded and all 855 ## non-processor refs have been converted in to processors. 856 ## Now 857 ## we need to build and that were specified by type name and do 858 ## them in reverse order so we can pass the 859 ## Handler option(s) in. 860 ## If multiple handlers are defined, then 861 ## we assume that the constructor takes a SAXT like parameter list. 862 ## TODO: figure out how to allow DocumentHandler, etc. Perhaps allow 863 ## HASH refs in ARRAY syntax decls. 864 865 ## Some temporaries 866 $self->{BuiltParts} = []; 867 $self->{SeenParts} = []; 868 $self->{DetectedCycles} = []; 869 870 ## _build_part is recursive and builds any downstream handlers 871 ## needed to build a part. 872 for ( @{$self->{Parts}} ) { 873 eval { 874 push @{$self->{DetectedCycles}}, $self->_build_part( $_ ); 875 }; 876 if ( $@ ) { 877 chomp $@; 878 push @errors, $@; 879 } 880 } 881 882# $self->{PartsByName}->{Intake} ||= $self->{Parts}->[0]; 883# $self->{PartsByName}->{Exhaust} ||= $self->{Parts}->[-1]; 884 885 if ( @{$self->{DetectedCycles}} ) { 886 ## Remove duplicate (cycles are found once for each processor in 887 ## the cycle. 888 my %unique_cycles; 889 890 for my $cycle ( @{$self->{DetectedCycles}} ) { 891 my $start = 0; 892 for ( 1..$#$cycle ) { 893 $start = $_ 894 if $cycle->[$_]->{Number} < $cycle->[$start]->{Number}; 895 } 896 my $key = join( 897 ",", 898 map $_->{Number}, 899 @{$cycle}[$start..($#$cycle),0..($start-1)] 900 ); 901 $unique_cycles{$key} ||= $cycle; 902 } 903 904 push @errors, map { 905 "Cycle detected in SAX machine: " . 906 join( 907 "->", 908 map $_->string_description, $_->[-1], @$_ 909 ); 910 } map $unique_cycles{$_}, sort keys %unique_cycles; 911 } 912 913 delete $self->{SeenParts}; 914 delete $self->{BuiltParts}; 915 delete $self->{DetectedCycles}; 916 917 croak join "\n", @errors if @errors; 918} 919 920 921sub _SAX2_attrs { 922 my %a = @_; 923 924 return { 925 map { 926 defined $a{$_} 927 ? ( $_ => { 928 LocalName => $_, 929 Name => $_, 930 Value => $a{$_}, 931 } ) 932 : () ; 933 } keys %a 934 }; 935} 936 937 938my %ids; 939sub _idify($) { 940 $ids{$_[0]} = keys %ids unless exists $ids{$_[0]}; 941 return $ids{$_[0]}; 942} 943 944 945sub pointer_elt { 946 my $self = shift; 947 my ( $elt_type, $h_spec, $options ) = @_; 948 949 my $part_rec; 950 951 $h_spec = $self->{Handler} 952 if $h_spec eq "Exhaust" && defined $self->{Handler}; 953 954 ## Look locally first in case the name is not 955 ## unique among parts in RootMachine. 956 $part_rec = $self->_find_part_rec( $h_spec ) 957 if ! $part_rec; 958 959 ## Don't look for indexes in RootMachine 960 $part_rec = $options->{RootMachine}->_find_part_rec( 961 $h_spec 962 ) if ! $part_rec 963 && defined $options->{RootMachine} 964 && $h_spec != /^-?\d+$/ ; 965 966 my %attrs; 967 968 if ( $part_rec ) { 969 %attrs = ( 970 name => $part_rec->{Name} || $h_spec, 971 "handler-id" => _idify $part_rec->{Processor}, 972 ); 973 } 974 else { 975 if ( ref $h_spec ) { 976 %attrs = ( 977 type => ref $h_spec, 978 "handler-id" => _idify $h_spec, 979 ); 980 } 981 else { 982 %attrs = ( 983 name => $h_spec, 984 ); 985 } 986 } 987 988 return { 989 Name => $elt_type, 990 LocalName => $elt_type, 991 Attributes => _SAX2_attrs( %attrs ), 992 }; 993} 994 995 996sub generate_part_descriptions { 997 my $self = shift; 998 my ( $options ) = @_; 999 1000 my $h = $options->{Handler}; 1001 croak "No Handler passed" unless $h; 1002 1003 for my $part_rec ( @{$self->{Parts}} ) { 1004 my $proc = $part_rec->{Processor}; 1005 1006 if ( $proc->can( "generate_description" ) ) { 1007 $proc->generate_description( { 1008 %$options, 1009 Name => $part_rec->{Name}, 1010 Description => $part_rec->string_description, 1011 } ); 1012 } 1013 else { 1014 my $part_elt = { 1015 LocalName => "part", 1016 Name => "part", 1017 Attributes => _SAX2_attrs( 1018 id => _idify $proc, 1019 type => ref $part_rec, 1020 name => $part_rec->{Name}, 1021 description => $part_rec->string_description, 1022 ), 1023 }; 1024 $h->start_element( $part_elt ); 1025 for my $h_spec ( @{$part_rec->{Handlers}} ) { 1026 my $handler_elt = $self->pointer_elt( "handler", $h_spec ); 1027 1028 $h->start_element( $handler_elt ); 1029 $h->end_element( $handler_elt ); 1030 } 1031 $h->end_element( $part_elt ); 1032 } 1033 } 1034} 1035 1036 1037sub generate_description { 1038 my $self = shift; 1039 1040 my $options = 1041 @_ == 1 1042 ? ref $_[0] eq "HASH" 1043 ? { %{$_[0]} } 1044 : { 1045 Handler => 1046 ref $_[0] 1047 ? $_[0] 1048 : $self->_resolve_spec( $_[0] ) 1049 } 1050 : { @_ }; 1051 1052 my $h = $options->{Handler}; 1053 croak "No Handler passed" unless $h; 1054 1055 unless ( $options->{Depth} ) { 1056 %ids = (); 1057 $options->{RootMachine} = $self; 1058 1059 $h->start_document({}); 1060 } 1061 1062 ++$options->{Depth}; 1063 my $root_elt = { 1064 LocalName => "sax-machine", 1065 Name => "sax-machine", 1066 Attributes => _SAX2_attrs( 1067 id => _idify $self, 1068 type => ref $self, 1069 name => $options->{Name}, 1070 description => $options->{Description}, 1071 ), 1072 }; 1073 1074 $h->start_element( $root_elt ); 1075 1076 ## Listing the handler first so it doesn't look like a part's 1077 ## handler (which it kinda does if it's hanging out *after* a <part .../> 1078 ## tag :). Also makes following the links by hand a tad easier. 1079 if ( defined $self->{Handler} ) { 1080 my $handler_elt = $self->pointer_elt( "handler", $self->{Handler} ); 1081 $handler_elt->{Attributes}->{name} = { 1082 Name => "name", 1083 LocalName => "name", 1084 Value => "Exhaust" 1085 } unless exists $handler_elt->{Attributes}->{Name}; 1086 1087 $h->start_element( $handler_elt ); 1088 $h->end_element( $handler_elt ); 1089 } 1090 1091 for ( sort keys %{$self->{PartsByName}} ) { 1092 if ( $self->{PartsByName}->{$_}->{Name} ne $_ ) { 1093 warn $self->{PartsByName}->{$_}->{Name}, " : ", $_; 1094 my $handler_elt = $self->pointer_elt( "alias", $_ ); 1095 %{$handler_elt->{Attributes}} = ( 1096 %{$handler_elt->{Attributes}}, 1097 %{_SAX2_attrs( alias => $_ )}, 1098 ); 1099 $h->start_element( $handler_elt ); 1100 $h->end_element( $handler_elt ); 1101 } 1102 } 1103 1104 $self->generate_part_descriptions( $options ); 1105 $h->end_element( $root_elt ); 1106 1107 --$options->{Depth}; 1108 $h->end_document({}) unless $options->{Depth}; 1109} 1110 1111 1112## 1113## This is a private class, only this class should use it directly. 1114## 1115package XML::SAX::Machine::Part; 1116{ 1117 $XML::SAX::Machine::Part::VERSION = '0.46'; 1118} 1119 1120use fields ( 1121 'Name', ## The caller-given name of the part 1122 'Number', ## Where it sits in the parts list. 1123 'Processor', ## The actual SAX processor 1124 'Handlers', ## The handlers the caller specified 1125); 1126 1127 1128sub new { 1129 my $proto = shift; 1130 my $class = ref $proto || $proto; 1131 1132 my $self = bless {}, $class; 1133 1134 my %options = @_ ; 1135 $self->{$_} = $options{$_} for keys %options; 1136 1137 return $self; 1138} 1139 1140 1141sub string_description { 1142 my $self = shift; 1143 1144 return join( 1145 "", 1146 $self->{Name} 1147 ? $self->{Name} 1148 : ( "#", $self->{Number} ), 1149 " (", 1150 $self->{Processor} 1151 ? ( ref $self->{Processor} || $self->{Processor} ) 1152 : "<undefined processor>", 1153 ")" 1154 ); 1155} 1156 11571; 1158 1159__END__ 1160 1161=pod 1162 1163=head1 NAME 1164 1165XML::SAX::Machine - Manage a collection of SAX processors 1166 1167=head1 VERSION 1168 1169version 0.46 1170 1171=head1 SYNOPSIS 1172 1173 ## Note: See XML::SAX::Pipeline and XML::SAX::Machines first, 1174 ## this is the gory, detailed interface. 1175 1176 use My::SAX::Machines qw( Machine ); 1177 use My::SAX::Filter2; 1178 use My::SAX::Filter3; 1179 1180 my $filter3 = My::SAX::Filter3->new; 1181 1182 ## A simple pipeline. My::SAX::Filter1 will be autoloaded. 1183 my $m = Machine( 1184 # 1185 # Name => Class/object => handler(s) 1186 # 1187 [ Intake => "My::SAX::Filter1" => "B" ], 1188 [ B => My::SAX::Filter2->new() => "C" ], 1189 [ C => $filter3 => "D" ], 1190 [ D => \*STDOUT ], 1191 ); 1192 1193 ## A parser will be created unless My::SAX::Filter1 can parse_file 1194 $m->parse_file( "foo.revml" ); 1195 1196 my $m = Machine( 1197 [ Intake => "My::SAX::Filter1" => qw( Tee ) ], 1198 [ Tee => "XML::Filter::SAXT" => qw( Foo Bar ) ], 1199 [ Foo => "My::SAX::Filter2" => qw( Out1 ) ], 1200 [ Out1 => \$log ], 1201 [ Bar => "My::SAX::Filter3" => qw( Exhaust ) ], 1202 ); 1203 1204=head1 DESCRIPTION 1205 1206B<WARNING>: This API is alpha!!! It I<will> be changing. 1207 1208A generic SAX machine (an instance of XML::SAX::Machine) is a container 1209of SAX processors (referred to as "parts") connected in arbitrary ways. 1210 1211Each parameter to C<Machine()> (or C<XML::SAX::Machine->new()>) 1212represents one top level part of the machine. Each part has a name, a 1213processor, and one or more handlers (usually specified by name, as shown 1214in the SYNOPSIS). 1215 1216Since SAX machines may be passed in as single top level parts, you can 1217also create nested, complex machines ($filter3 in the SYNOPSIS could be 1218a Pipeline, for example). 1219 1220A SAX machines can act as a normal SAX processors by connecting them to 1221other SAX processors: 1222 1223 my $w = My::Writer->new(); 1224 my $m = Machine( ...., { Handler => $w } ); 1225 my $g = My::Parser->new( Handler => $w ); 1226 1227=head2 Part Names 1228 1229Although it's not required, each part in a machine can be named. This 1230is useful for retrieving and manipulating the parts (see L</part>, for 1231instance), and for debugging, since debugging output (see 1232L</trace_parts> and L</trace_all_parts>) includes the names. 1233 1234Part names must be valid Perl subroutine names, beginning with an 1235uppercase character. This is to allow convenience part accessors 1236methods like 1237 1238 $c = $m->NameOfAFilter; 1239 1240to work without ever colliding with the name of a method (all method 1241names are completely lower case). Only filters named like this can be 1242accessed using the magical accessor functions. 1243 1244=head2 Reserved Names: Intake and Exhaust 1245 1246The names c<Intake> and C<Exhaust> are reserved. C<Intake> refers to 1247the first part in the processing chain. This is not necessarily the 1248first part in the constructor list, just the first part to receive 1249external events. 1250 1251C<Exhaust> refers to the output of the machine; no part may be named 1252C<Exhaust>, and any parts with a handler named C<Exhaust> will deliver 1253their output to the machine's handler. Normally, only one part should 1254deliver it's output to the Exhaust port. 1255 1256Calling $m->set_handler() alters the Exhaust port, assuming any 1257processors pointing to the C<Exhaust> provide a C<set_handler()> method 1258like L<XML::SAX::Base>'s. 1259 1260C<Intake> and C<Exhaust> are usually assigned automatically by 1261single-purpose machines like L<XML::SAX::Pipeline> and 1262L<XML::SAX::Manifold>. 1263 1264=head2 SAX Processor Support 1265 1266The XML::SAX::Machine class is very agnostic about what SAX processors 1267it supports; about the only constraint is that it must be a blessed 1268reference (of any type) that does not happen to be a Perl IO::Handle 1269(which are assumed to be input or output filehandles). 1270 1271The major constraint placed on SAX processors is that they must provide 1272either a C<set_handler> or C<set_handlers> method (depending on how many 1273handlers a processor can feed) to allow the SAX::Machine to disconnect 1274and reconnect them. Luckily, this is true of almost any processor 1275derived from XML::SAX::Base. Unfortunately, many SAX older (SAX1) 1276processors do not meet this requirement; they assume that SAX processors 1277will only ever be connected together using their constructors. 1278 1279=head2 Connections 1280 1281SAX machines allow you to connect the parts however you like; each part 1282is given a name and a list of named handlers to feed. The number of 1283handlers a part is allowed depends on the part; most filters only allow 1284once downstream handler, but filters like L<XML::Filter::SAXT> and 1285L<XML::Filter::Distributor> are meant to feed multiple handlers. 1286 1287Parts may not be connected in loops ("cycles" in graph theory terms). 1288The machines specified by: 1289 1290 [ A => "Foo" => "A" ], ## Illegal! 1291 1292and 1293 1294 [ A => "Foo" => "B" ], ## Illegal! 1295 [ B => "Foo" => "A" ], 1296 1297. Configuring a machine this way would cause events to flow in an 1298infinite loop, and/or cause the first processor in the cycle to start 1299receiving events from the end of the cycle before the input document was 1300complete. Besides, it's not a very useful topology :). 1301 1302SAX machines detect loops at construction time. 1303 1304=head1 NAME 1305 1306 XML::SAX::Machine - Manage a collection of SAX processors 1307 1308=head1 API 1309 1310=head2 Public Methods 1311 1312These methods are meant to be used by users of SAX machines. 1313 1314=over 1315 1316=item new() 1317 1318 my $m = $self->new( @machine_spec, \%options ); 1319 1320Creates $self using %options, and compiles the machine spec. This is 1321the longhand form of C<Machines( ... )>. 1322 1323=item find_part 1324 1325Gets a part contained by this machine by name, number or object reference: 1326 1327 $c = $m->find_part( $name ); 1328 $c = $m->find_part( $number ); 1329 $c = $m->find_part( $obj ); ## useful only to see if $obj is in $m 1330 1331If a machine contains other machines, parts of the contained machines 1332may be accessed by name using unix directory syntax: 1333 1334 $c = $m->find_part( "/Intake/Foo/Bar" ); 1335 1336(all paths must be absolute). 1337 1338Parts may also be accessed by number using array indexing: 1339 1340 $c = $m->find_part(0); ## Returns first part or undef if none 1341 $c = $m->find_part(-1); ## Returns last part or undef if none 1342 $c = $m->find_part( "Foo/0/1/-1" ); 1343 1344There is no way to guarantee that a part's position number means 1345anything, since parts can be reconnected after their position numbers 1346are assigned, so using a part name is recommended. 1347 1348Throws an exception if the part is not found, so doing things like 1349 1350 $m->find_part( "Foo" )->bar() 1351 1352garner informative messages when "Foo" is not found. If you want to 1353test a result code, do something like 1354 1355 my $p = eval { $m->find_part }; 1356 unless ( $p ) { 1357 ...handle lookup failure... 1358 } 1359 1360=item parts 1361 1362 for ( $m->parts ) { ... } 1363 1364Gets an arbitrarily ordered list of top level parts in this machine. 1365This is all of the parts directly contained by this machine and none of 1366the parts that may be inside them. So if a machine contains an 1367L<XML::SAX::Pipeline> as one of it's parts, the pipeline will be 1368returned but not the parts inside the pipeline. 1369 1370=item all_parts 1371 1372 for ( $m->all_parts ) { ... } 1373 1374Gets all parts in this machine, not just top level ones. This includes 1375any machines contained by this machine and their parts. 1376 1377=item set_handler 1378 1379 $m->set_handler( $handler ); 1380 $m->set_handler( DTDHandler => $handler ); 1381 1382Sets the machine's handler and sets the handlers for all parts that 1383have C<Exhaust> specified as their handlers. Requires that any such 1384parts provide a C<set_handler> or (if the part has multiple handlers) 1385a C<set_handlers> method. 1386 1387NOTE: handler types other than "Handler" are only supported if they are 1388supported by whatever parts point at the C<Exhaust>. If the handler type is 1389C<Handler>, then the appropriate method is called as: 1390 1391 $part->set_handler( $handler ); 1392 $part->set_handlers( $handler0, $handler1, ... ); 1393 1394If the type is some other handler type, these are called as: 1395 1396 $part->set_handler( $type => $handler ); 1397 $part->set_handlers( { $type0 => $handler0 }, ... ); 1398 1399=item trace_parts 1400 1401 $m->trace_parts; ## trace all top-level parts 1402 $m->trace_parts( @ids ); ## trace the indicated parts 1403 1404Uses Devel::TraceSAX to enable tracing of all events received by the parts of 1405this machine. Does not enable tracing of parts contained in machines in this 1406machine; for that, see trace_all_parts. 1407 1408=item trace_all_parts 1409 1410 $m->trace_all_parts; ## trace all parts 1411 1412Uses Devel::TraceSAX to trace all events received by the parts of this 1413machine. 1414 1415=item untracify_parts 1416 1417 $m->untracify_parts( @ids ); 1418 1419Converts the indicated parts to SAX processors with tracing enabled. 1420This may not work with processors that use AUTOLOAD. 1421 1422=back 1423 1424=head1 Events and parse routines 1425 1426XML::SAX::Machine provides all SAX1 and SAX2 events and delgates them to the 1427processor indicated by $m->find_part( "Intake" ). This adds some overhead, so 1428if you are concerned about overhead, you might want to direct SAX events 1429directly to the Intake instead of to the machine. 1430 1431It also provides parse...() routines so it can whip up a parser if need 1432be. This means: parse(), parse_uri(), parse_string(), and parse_file() 1433(see XML::SAX::EventMethodMaker for details). There is no way to pass 1434methods directly to the parser unless you know that the Intake is a 1435parser and call it directly. This is not so important for parsing, 1436because the overhead it takes to delegate is minor compared to the 1437effort needed to parse an XML document. 1438 1439=head2 Internal and Helper Methods 1440 1441These methods are meant to be used/overridden by subclasses. 1442 1443=over 1444 1445=item _compile_specs 1446 1447 my @comp = $self->_compile_specs( @_ ); 1448 1449Runs through a list of module names, output specifiers, etc., and builds 1450the machine. 1451 1452 $scalar --> "$scalar"->new 1453 $ARRAY_ref --> pipeline @$ARRAY_ref 1454 $SCALAR_ref --> XML::SAX::Writer->new( Output => $SCALAR_ref ) 1455 $GLOB_ref --> XML::SAX::Writer->new( Output => $GLOB_ref ) 1456 1457=item generate_description 1458 1459 $m->generate_description( $h ); 1460 $m->generate_description( Handler => $h ); 1461 $m->generate_description( Pipeline ... ); 1462 1463Generates a series of SAX events to the handler of your choice. 1464 1465See L<XML::Handler::Machine2GraphViz> on CPAN for a way of visualizing 1466machine innards. 1467 1468=back 1469 1470=head1 TODO 1471 1472=over 1473 1474=item * 1475 1476Separate initialization from construction time; there should be somthing 1477like a $m->connect( ....machine_spec... ) that new() calls to allow you 1478to delay parts speficication and reconfigure existing machines. 1479 1480=item * 1481 1482Allow an XML doc to be passed in as a machine spec. 1483 1484=back 1485 1486=head1 LIMITATIONS 1487 1488=over 1489 1490=back 1491 1492=head1 AUTHOR 1493 1494 Barrie Slaymaker <barries@slaysys.com> 1495 1496=head1 LICENSE 1497 1498Artistic or GPL, any version. 1499 1500=head1 AUTHORS 1501 1502=over 4 1503 1504=item * 1505 1506Barry Slaymaker 1507 1508=item * 1509 1510Chris Prather <chris@prather.org> 1511 1512=back 1513 1514=head1 COPYRIGHT AND LICENSE 1515 1516This software is copyright (c) 2013 by Barry Slaymaker. 1517 1518This is free software; you can redistribute it and/or modify it under 1519the same terms as the Perl 5 programming language system itself. 1520 1521=cut 1522