1use strict; 2use warnings; # > perl 5.5 3 4# This is created in the caller's space 5# I realize (now!) that it's not clean, but it's been there for 10+ years... 6BEGIN 7{ sub ::PCDATA { '#PCDATA' } ## no critic (Subroutines::ProhibitNestedSubs); 8 sub ::CDATA { '#CDATA' } ## no critic (Subroutines::ProhibitNestedSubs); 9} 10 11use UNIVERSAL(); 12 13## if a sub returns a scalar, it better not bloody disappear in list context 14## no critic (Subroutines::ProhibitExplicitReturnUndef); 15 16my $perl_version; 17my $parser_version; 18 19###################################################################### 20package XML::Twig; 21###################################################################### 22 23require 5.004; 24 25use utf8; # > perl 5.5 26 27use vars qw($VERSION @ISA %valid_option); 28 29use Carp; 30use File::Spec; 31use File::Basename; 32 33use Config; # to get perl's path name in case we need to know if perlio is available 34 35*isa= *UNIVERSAL::isa; 36 37# flag, set to true if the weaken sub is available 38use vars qw( $weakrefs); 39 40# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs 41# wrt doctype handling. This is global for performance reasons. 42my $expat_1_95_2=0; 43 44# a slight non-xml mod: # is allowed as a first character 45my $REG_TAG_FIRST_LETTER; 46#$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])}; # < perl 5.6 - does not work for leading non-ascii letters 47$REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6 48 49my $REG_TAG_LETTER= q{(?:[\w_.-]*)}; 50 51# a simple name (no colon) 52my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)}; 53 54# a tag name, possibly including namespace 55my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)}; 56 57# tag name (leading # allowed) 58# first line is for perl 5.005, second line for modern perl, that accept character classes 59my $REG_TAG_NAME=$REG_NAME; 60 61# name or wildcard (* or '') (leading # allowed) 62my $REG_NAME_W = qq{(?:$REG_NAME|[*])}; 63 64# class and ids are deliberately permissive 65my $REG_NTOKEN_FIRST_LETTER; 66#$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])}; # < perl 5.6 - does not work for leading non-ascii letters 67$REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6 68 69my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)}; 70 71my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)}; 72my $REG_CLASS = $REG_NTOKEN; 73my $REG_ID = $REG_NTOKEN; 74 75# allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id> 76my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)}; 77 78my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp 79my $REG_MATCH = q{[!=]~}; # match (or not) 80my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted) 81my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number 82my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value 83my $REG_OP = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=}; # op 84my $REG_FUNCTION = q{(?:string|text)\(\s*\)}; 85my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)}; 86my $REG_COMP = q{(?:>=|<=|!=|<|>|=)}; 87 88my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))}; 89 90# keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones 91my $ST_TAG = '##tag'; 92my $ST_ELT = '##elt'; 93my $ST_NS = '##ns' ; 94 95# used in the handler trigger code 96my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)}; 97my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]}; 98 99# not all axis, only supported ones (in get_xpath) 100my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self', 101 'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self' 102 ); 103my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")"; 104 105# only used in the "xpath"engine (for get_xpath/findnodes) for now 106my $REG_PREDICATE_ALT = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]}; 107 108# used to convert XPath tests on strings to the perl equivalent 109my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le '); 110 111my( $FB_HTMLCREF, $FB_XMLCREF); 112 113my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0'; 114 115# default namespaces, both ways 116my %DEFAULT_NS= ( xml => "http://www.w3.org/XML/1998/namespace", 117 xmlns => "http://www.w3.org/2000/xmlns/", 118 ); 119my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS; 120 121# constants 122my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $NOTATION, $TEXT, $ASIS, $EMPTY, $BUFSIZE); 123 124# used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one 125# this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't 126# the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration 127my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN" => "http://www.w3.org/TR/REC-html40/loose.dtd", 128 "-//W3C//DTD HTML 4.01//EN" => "http://www.w3.org/TR/html4/strict.dtd", 129 "-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd", 130 "-//W3C//DTD HTML 4.01 Frameset//EN" => "http://www.w3.org/TR/html4/frameset.dtd", 131 "-//W3C//DTD XHTML 1.0 Strict//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", 132 "-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", 133 "-//W3C//DTD XHTML 1.0 Frameset//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", 134 "-//W3C//DTD XHTML 1.1//EN" => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", 135 "-//W3C//DTD XHTML Basic 1.0//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd", 136 "-//W3C//DTD XHTML Basic 1.1//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd", 137 "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd", 138 "-//WAPFORUM//DTD XHTML Mobile 1.1//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd", 139 "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd", 140 "-//W3C//DTD XHTML+RDFa 1.0//EN" => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd", 141 ); 142 143my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN"; 144 145my $SEP= qr/\s*(?:$|\|)/; 146 147BEGIN 148{ 149$VERSION = '3.52'; 150 151use XML::Parser; 152my $needVersion = '2.23'; 153($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn 154croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion; 155 156($perl_version= $])=~ s{_\d+}{}; 157 158if( $perl_version >= 5.008) 159 { eval "use Encode qw( :all)"; ## no critic ProhibitStringyEval 160 $FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF; 161 $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF; 162 } 163 164# test whether we can use weak references 165# set local empty signal handler to trap error messages 166{ local $SIG{__DIE__}; 167 if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken)) 168 { import Scalar::Util( 'weaken'); $weakrefs= 1; } 169 elsif( eval( 'require WeakRef')) 170 { import WeakRef; $weakrefs= 1; } 171 else 172 { $weakrefs= 0; } 173} 174 175import XML::Twig::Elt; 176import XML::Twig::Entity; 177import XML::Twig::Entity_list; 178 179# used to store the gi's 180# should be set for each twig really, at least when there are several 181# the init ensures that special gi's are always the same 182 183# constants: element types 184$PCDATA = '#PCDATA'; 185$CDATA = '#CDATA'; 186$PI = '#PI'; 187$COMMENT = '#COMMENT'; 188$ENT = '#ENT'; 189$NOTATION = '#NOTATION'; 190 191# element classes 192$ELT = '#ELT'; 193$TEXT = '#TEXT'; 194 195# element properties 196$ASIS = '#ASIS'; 197$EMPTY = '#EMPTY'; 198 199# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat 200$BUFSIZE = 32768; 201 202 203# gi => index 204%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5); 205# list of gi's 206@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT); 207 208# gi's under this value are special 209$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi; 210 211%XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"',); 212foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); } 213 214# now set some aliases 215*find_nodes = *get_xpath; # same as XML::XPath 216*findnodes = *get_xpath; # same as XML::LibXML 217*getElementsByTagName = *descendants; 218*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt 219*find_by_tag_name = *descendants; 220*getElementById = *elt_id; 221*getEltById = *elt_id; 222*toString = *sprint; 223*create_accessors = *att_accessors; 224 225} 226 227@ISA = qw(XML::Parser); 228 229# fake gi's used in twig_handlers and start_tag_handlers 230my $ALL = '_all_'; # the associated function is always called 231my $DEFAULT= '_default_'; # the function is called if no other handler has been 232 233# some defaults 234my $COMMENTS_DEFAULT= 'keep'; 235my $PI_DEFAULT = 'keep'; 236 237 238# handlers used in regular mode 239my %twig_handlers=( Start => \&_twig_start, 240 End => \&_twig_end, 241 Char => \&_twig_char, 242 Entity => \&_twig_entity, 243 Notation => \&_twig_notation, 244 XMLDecl => \&_twig_xmldecl, 245 Doctype => \&_twig_doctype, 246 Element => \&_twig_element, 247 Attlist => \&_twig_attlist, 248 CdataStart => \&_twig_cdatastart, 249 CdataEnd => \&_twig_cdataend, 250 Proc => \&_twig_pi, 251 Comment => \&_twig_comment, 252 Default => \&_twig_default, 253 ExternEnt => \&_twig_extern_ent, 254 ); 255 256# handlers used when twig_roots is used and we are outside of the roots 257my %twig_handlers_roots= 258 ( Start => \&_twig_start_check_roots, 259 End => \&_twig_end_check_roots, 260 Doctype => \&_twig_doctype, 261 Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl, 262 Element => undef, Attlist => undef, CdataStart => undef, 263 CdataEnd => undef, Proc => undef, Comment => undef, 264 Proc => \&_twig_pi_check_roots, 265 Default => sub {}, # hack needed for XML::Parser 2.27 266 ExternEnt => \&_twig_extern_ent, 267 ); 268 269# handlers used when twig_roots and print_outside_roots are used and we are 270# outside of the roots 271my %twig_handlers_roots_print_2_30= 272 ( Start => \&_twig_start_check_roots, 273 End => \&_twig_end_check_roots, 274 Char => \&_twig_print, 275 Entity => \&_twig_print_entity, 276 ExternEnt => \&_twig_print_entity, 277 DoctypeFin => \&_twig_doctype_fin_print, 278 XMLDecl => sub { _twig_xmldecl( @_); _twig_print( @_); }, 279 Doctype => \&_twig_print_doctype, # because recognized_string is broken here 280 # Element => \&_twig_print, Attlist => \&_twig_print, 281 CdataStart => \&_twig_print, CdataEnd => \&_twig_print, 282 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, 283 Default => \&_twig_print_check_doctype, 284 ExternEnt => \&_twig_extern_ent, 285 ); 286 287# handlers used when twig_roots, print_outside_roots and keep_encoding are used 288# and we are outside of the roots 289my %twig_handlers_roots_print_original_2_30= 290 ( Start => \&_twig_start_check_roots, 291 End => \&_twig_end_check_roots, 292 Char => \&_twig_print_original, 293 # I have no idea why I should not be using this handler! 294 Entity => \&_twig_print_entity, 295 ExternEnt => \&_twig_print_entity, 296 DoctypeFin => \&_twig_doctype_fin_print, 297 XMLDecl => sub { _twig_xmldecl( @_); _twig_print_original( @_) }, 298 Doctype => \&_twig_print_original_doctype, # because original_string is broken here 299 Element => \&_twig_print_original, Attlist => \&_twig_print_original, 300 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 301 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, 302 Default => \&_twig_print_original_check_doctype, 303 ); 304 305# handlers used when twig_roots and print_outside_roots are used and we are 306# outside of the roots 307my %twig_handlers_roots_print_2_27= 308 ( Start => \&_twig_start_check_roots, 309 End => \&_twig_end_check_roots, 310 Char => \&_twig_print, 311 # if the Entity handler is set then it prints the entity declaration 312 # before the entire internal subset (including the declaration!) is output 313 Entity => sub {}, 314 XMLDecl => \&_twig_print, Doctype => \&_twig_print, 315 CdataStart => \&_twig_print, CdataEnd => \&_twig_print, 316 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, 317 Default => \&_twig_print, 318 ExternEnt => \&_twig_extern_ent, 319 ); 320 321# handlers used when twig_roots, print_outside_roots and keep_encoding are used 322# and we are outside of the roots 323my %twig_handlers_roots_print_original_2_27= 324 ( Start => \&_twig_start_check_roots, 325 End => \&_twig_end_check_roots, 326 Char => \&_twig_print_original, 327 # for some reason original_string is wrong here 328 # this can be a problem if the doctype includes non ascii characters 329 XMLDecl => \&_twig_print, Doctype => \&_twig_print, 330 # if the Entity handler is set then it prints the entity declaration 331 # before the entire internal subset (including the declaration!) is output 332 Entity => sub {}, 333 #Element => undef, Attlist => undef, 334 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 335 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, 336 Default => \&_twig_print, # _twig_print_original does not work 337 ExternEnt => \&_twig_extern_ent, 338 ); 339 340 341my %twig_handlers_roots_print= $parser_version > 2.27 342 ? %twig_handlers_roots_print_2_30 343 : %twig_handlers_roots_print_2_27; 344my %twig_handlers_roots_print_original= $parser_version > 2.27 345 ? %twig_handlers_roots_print_original_2_30 346 : %twig_handlers_roots_print_original_2_27; 347 348 349# handlers used when the finish_print method has been called 350my %twig_handlers_finish_print= 351 ( Start => \&_twig_print, 352 End => \&_twig_print, Char => \&_twig_print, 353 Entity => \&_twig_print, XMLDecl => \&_twig_print, 354 Doctype => \&_twig_print, Element => \&_twig_print, 355 Attlist => \&_twig_print, CdataStart => \&_twig_print, 356 CdataEnd => \&_twig_print, Proc => \&_twig_print, 357 Comment => \&_twig_print, Default => \&_twig_print, 358 ExternEnt => \&_twig_extern_ent, 359 ); 360 361# handlers used when the finish_print method has been called and the keep_encoding 362# option is used 363my %twig_handlers_finish_print_original= 364 ( Start => \&_twig_print_original, End => \&_twig_print_end_original, 365 Char => \&_twig_print_original, Entity => \&_twig_print_original, 366 XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original, 367 Element => \&_twig_print_original, Attlist => \&_twig_print_original, 368 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 369 Proc => \&_twig_print_original, Comment => \&_twig_print_original, 370 Default => \&_twig_print_original, 371 ); 372 373# handlers used within ignored elements 374my %twig_handlers_ignore= 375 ( Start => \&_twig_ignore_start, 376 End => \&_twig_ignore_end, 377 Char => undef, Entity => undef, XMLDecl => undef, 378 Doctype => undef, Element => undef, Attlist => undef, 379 CdataStart => undef, CdataEnd => undef, Proc => undef, 380 Comment => undef, Default => undef, 381 ExternEnt => undef, 382 ); 383 384 385# those handlers are only used if the entities are NOT to be expanded 386my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default ); 387 388my @saved_default_handler; 389 390my $ID= 'id'; # default value, set by the Id argument 391my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers 392 393# all allowed options 394%valid_option= 395 ( # XML::Twig options 396 TwigHandlers => 1, Id => 1, 397 TwigRoots => 1, TwigPrintOutsideRoots => 1, 398 StartTagHandlers => 1, EndTagHandlers => 1, 399 ForceEndTagHandlersUsage => 1, 400 DoNotChainHandlers => 1, 401 IgnoreElts => 1, 402 Index => 1, 403 AttAccessors => 1, 404 EltAccessors => 1, 405 FieldAccessors => 1, 406 CharHandler => 1, 407 TopDownHandlers => 1, 408 KeepEncoding => 1, DoNotEscapeAmpInAtts => 1, 409 ParseStartTag => 1, KeepAttsOrder => 1, 410 LoadDTD => 1, DTDHandler => 1, DTDBase => 1, NoXxe => 1, 411 DoNotOutputDTD => 1, NoProlog => 1, 412 ExpandExternalEnts => 1, 413 DiscardSpaces => 1, KeepSpaces => 1, DiscardAllSpaces => 1, 414 DiscardSpacesIn => 1, KeepSpacesIn => 1, 415 PrettyPrint => 1, EmptyTags => 1, 416 EscapeGt => 1, 417 Quote => 1, 418 Comments => 1, Pi => 1, 419 OutputFilter => 1, InputFilter => 1, 420 OutputTextFilter => 1, 421 OutputEncoding => 1, 422 RemoveCdata => 1, 423 EltClass => 1, 424 MapXmlns => 1, KeepOriginalPrefix => 1, 425 SkipMissingEnts => 1, 426 # XML::Parser options 427 ErrorContext => 1, ProtocolEncoding => 1, 428 Namespaces => 1, NoExpand => 1, 429 Stream_Delimiter => 1, ParseParamEnt => 1, 430 NoLWP => 1, Non_Expat_Options => 1, 431 Xmlns => 1, CssSel => 1, 432 UseTidy => 1, TidyOptions => 1, 433 OutputHtmlDoctype => 1, 434 ); 435 436my $active_twig; # last active twig,for XML::Twig::s 437 438# predefined input and output filters 439use vars qw( %filter); 440%filter= ( html => \&html_encode, 441 safe => \&safe_encode, 442 safe_hex => \&safe_encode_hex, 443 ); 444 445 446# trigger types (used to sort them) 447my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3); 448 449sub new 450 { my ($class, %args) = @_; 451 my $handlers; 452 453 # change all nice_perlish_names into nicePerlishNames 454 %args= _normalize_args( %args); 455 456 # check options 457 unless( $args{MoreOptions}) 458 { foreach my $arg (keys %args) 459 { carp "invalid option $arg" unless $valid_option{$arg}; } 460 } 461 462 # a twig is really an XML::Parser 463 # my $self= XML::Parser->new(%args); 464 my $self; 465 $self= XML::Parser->new(%args); 466 467 bless $self, $class; 468 469 $self->{_twig_context_stack}= []; 470 471 # allow tag.class selectors in handler triggers 472 $css_sel= $args{CssSel} || 0; 473 474 475 if( exists $args{TwigHandlers}) 476 { $handlers= $args{TwigHandlers}; 477 $self->setTwigHandlers( $handlers); 478 delete $args{TwigHandlers}; 479 } 480 481 # take care of twig-specific arguments 482 if( exists $args{StartTagHandlers}) 483 { $self->setStartTagHandlers( $args{StartTagHandlers}); 484 delete $args{StartTagHandlers}; 485 } 486 487 if( exists $args{DoNotChainHandlers}) 488 { $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; } 489 490 if( exists $args{IgnoreElts}) 491 { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)] 492 if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; } 493 $self->setIgnoreEltsHandlers( $args{IgnoreElts}); 494 delete $args{IgnoreElts}; 495 } 496 497 if( exists $args{Index}) 498 { my $index= $args{Index}; 499 # we really want a hash name => path, we turn an array into a hash if necessary 500 if( ref( $index) eq 'ARRAY') 501 { my %index= map { $_ => $_ } @$index; 502 $index= \%index; 503 } 504 while( my( $name, $exp)= each %$index) 505 { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); } 506 } 507 508 $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt'; 509 if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; } 510 if( exists( $args{EltClass})) { delete $args{EltClass}; } 511 512 if( exists( $args{MapXmlns})) 513 { $self->{twig_map_xmlns}= $args{MapXmlns}; 514 $self->{Namespaces}=1; 515 delete $args{MapXmlns}; 516 } 517 518 if( exists( $args{KeepOriginalPrefix})) 519 { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix}; 520 delete $args{KeepOriginalPrefix}; 521 } 522 523 $self->{twig_dtd_handler}= $args{DTDHandler}; 524 delete $args{DTDHandler}; 525 526 if( $args{ExpandExternalEnts}) 527 { $self->set_expand_external_entities( 1); 528 $self->{twig_expand_external_ents}= $args{ExpandExternalEnts}; 529 $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts 530 if( $args{ExpandExternalEnts} == -1) 531 { $self->{twig_extern_ent_nofail}= 1; 532 $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail); 533 } 534 delete $args{LoadDTD}; 535 delete $args{ExpandExternalEnts}; 536 } 537 else 538 { $self->set_expand_external_entities( 0); } 539 540 if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP')) 541 { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler } 542 elsif( $args{NoXxe}) 543 { $self->{twig_ext_ent_handler}= 544 sub { my($xp, $base, $path) = @_; $xp->{ErrorMessage}.= "cannot use entities in document when the no_xxe option is on"; return undef; }; 545 } 546 else 547 { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler } 548 549 if( $args{DoNotEscapeAmpInAtts}) 550 { $self->set_do_not_escape_amp_in_atts( 1); 551 $self->{twig_do_not_escape_amp_in_atts}=1; 552 } 553 else 554 { $self->set_do_not_escape_amp_in_atts( 0); 555 $self->{twig_do_not_escape_amp_in_atts}=0; 556 } 557 558 # deal with TwigRoots argument, a hash of elements for which 559 # subtrees will be built (and associated handlers) 560 561 if( $args{TwigRoots}) 562 { $self->setTwigRoots( $args{TwigRoots}); 563 delete $args{TwigRoots}; 564 } 565 566 if( $args{EndTagHandlers}) 567 { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage}) 568 { croak "you should not use EndTagHandlers without TwigRoots\n", 569 "if you want to use it anyway, normally because you have ", 570 "a start_tag_handlers that calls 'ignore' and you want to ", 571 "call an ent_tag_handlers at the end of the element, then ", 572 "pass 'force_end_tag_handlers_usage => 1' as an argument ", 573 "to new"; 574 } 575 576 $self->setEndTagHandlers( $args{EndTagHandlers}); 577 delete $args{EndTagHandlers}; 578 } 579 580 if( $args{TwigPrintOutsideRoots}) 581 { croak "cannot use twig_print_outside_roots without twig_roots" 582 unless( $self->{twig_roots}); 583 # if the arg is a filehandle then store it 584 if( _is_fh( $args{TwigPrintOutsideRoots}) ) 585 { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; } 586 $self->{twig_default_print}= $args{TwigPrintOutsideRoots}; 587 } 588 589 # space policy 590 if( $args{KeepSpaces}) 591 { croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces}); 592 croak "cannot use both keep_spaces and discard_all_spaces" if( $args{DiscardAllSpaces}); 593 croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); 594 $self->{twig_keep_spaces}=1; 595 delete $args{KeepSpaces}; 596 } 597 if( $args{DiscardSpaces}) 598 { 599 croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); 600 croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces}); 601 croak "cannot use both discard_spaces and discard_spaces_in" if( $args{DiscardSpacesIn}); 602 $self->{twig_discard_spaces}=1; 603 delete $args{DiscardSpaces}; 604 } 605 if( $args{KeepSpacesIn}) 606 { croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn}); 607 croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces}); 608 $self->{twig_discard_spaces}=1; 609 $self->{twig_keep_spaces_in}={}; 610 my @tags= @{$args{KeepSpacesIn}}; 611 foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; } 612 delete $args{KeepSpacesIn}; 613 } 614 615 if( $args{DiscardAllSpaces}) 616 { 617 croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn}); 618 $self->{twig_discard_all_spaces}=1; 619 delete $args{DiscardAllSpaces}; 620 } 621 622 if( $args{DiscardSpacesIn}) 623 { $self->{twig_keep_spaces}=1; 624 $self->{twig_discard_spaces_in}={}; 625 my @tags= @{$args{DiscardSpacesIn}}; 626 foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; } 627 delete $args{DiscardSpacesIn}; 628 } 629 # discard spaces by default 630 $self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces}); 631 632 $args{Comments}||= $COMMENTS_DEFAULT; 633 if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; } 634 elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; } 635 elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; } 636 else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; } 637 delete $args{Comments}; 638 639 $args{Pi}||= $PI_DEFAULT; 640 if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; } 641 elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; } 642 elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; } 643 else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; } 644 delete $args{Pi}; 645 646 if( $args{KeepEncoding}) 647 { 648 # set it in XML::Twig::Elt so print functions know what to do 649 $self->set_keep_encoding( 1); 650 $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag; 651 delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ; 652 delete $args{KeepEncoding}; 653 } 654 else 655 { $self->set_keep_encoding( 0); 656 if( $args{ParseStartTag}) 657 { $self->{parse_start_tag}= $args{ParseStartTag}; } 658 else 659 { delete $self->{parse_start_tag}; } 660 delete $args{ParseStartTag}; 661 } 662 663 if( $args{OutputFilter}) 664 { $self->set_output_filter( $args{OutputFilter}); 665 delete $args{OutputFilter}; 666 } 667 else 668 { $self->set_output_filter( 0); } 669 670 if( $args{RemoveCdata}) 671 { $self->set_remove_cdata( $args{RemoveCdata}); 672 delete $args{RemoveCdata}; 673 } 674 else 675 { $self->set_remove_cdata( 0); } 676 677 if( $args{OutputTextFilter}) 678 { $self->set_output_text_filter( $args{OutputTextFilter}); 679 delete $args{OutputTextFilter}; 680 } 681 else 682 { $self->set_output_text_filter( 0); } 683 684 if( $args{KeepAttsOrder}) 685 { $self->{keep_atts_order}= $args{KeepAttsOrder}; 686 if( _use( 'Tie::IxHash')) 687 { $self->set_keep_atts_order( $self->{keep_atts_order}); } 688 else 689 { croak "Tie::IxHash not available, option keep_atts_order not allowed"; } 690 } 691 else 692 { $self->set_keep_atts_order( 0); } 693 694 695 if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); } 696 if( $args{EscapeGt}) { $self->escape_gt( $args{EscapeGt}); } 697 if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) } 698 699 if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; } 700 if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; } 701 if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; } 702 if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; } 703 if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; } 704 705 if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; } 706 if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; } 707 if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; } 708 709 if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; } 710 711 if( my $acc_a= $args{AttAccessors}) { $self->att_accessors( @$acc_a); } 712 if( my $acc_e= $args{EltAccessors}) { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e); } 713 if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); } 714 715 if( $args{UseTidy}) { $self->{use_tidy}= 1; } 716 $self->{tidy_options}= $args{TidyOptions} || {}; 717 718 if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; } 719 720 $self->set_quote( $args{Quote} || 'double'); 721 722 # set handlers 723 if( $self->{twig_roots}) 724 { if( $self->{twig_default_print}) 725 { if( $self->{twig_keep_encoding}) 726 { $self->setHandlers( %twig_handlers_roots_print_original); } 727 else 728 { $self->setHandlers( %twig_handlers_roots_print); } 729 } 730 else 731 { $self->setHandlers( %twig_handlers_roots); } 732 } 733 else 734 { $self->setHandlers( %twig_handlers); } 735 736 # XML::Parser::Expat does not like these handler to be set. So in order to 737 # use the various sets of handlers on XML::Parser or XML::Parser::Expat 738 # objects when needed, these ones have to be set only once, here, at 739 # XML::Parser level 740 $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final); 741 742 $self->{twig_entity_list}= XML::Twig::Entity_list->new; 743 $self->{twig_notation_list}= XML::Twig::Notation_list->new; 744 745 $self->{twig_id}= $ID; 746 $self->{twig_stored_spaces}=''; 747 748 $self->{twig_autoflush}= 1; # auto flush by default 749 750 $self->{twig}= $self; 751 if( $weakrefs) { weaken( $self->{twig}); } 752 753 return $self; 754 } 755 756sub parse 757 { 758 my $t= shift; 759 # if called as a class method, calls nparse, which creates the twig then parses it 760 if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); } 761 762 # requires 5.006 at least (or the ${^UNICODE} causes a problem) # > perl 5.5 763 # trap underlying bug in IO::Handle (see RT #17500) # > perl 5.5 764 # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > perl 5.5 765 if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] ) # > perl 5.5 766 { croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > perl 5.5 767 . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > perl 5.5 768 . "not to include 'D'"; # > perl 5.5 769 } # > perl 5.5 770 $t= eval { $t->SUPER::parse( @_); }; 771 772 if( !$t 773 && $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)} 774 && -f $_[0] 775 && ( ! ref( $_[0]) || ref( $_[0])) ne 'GLOB' # -f works on a filehandle, so this make sure $_[0] is a real file 776 ) 777 { croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; } 778 return _checked_parse_result( $t, $@); 779 } 780 781sub parsefile 782 { my $t= shift; 783 if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); } 784 $t= eval { $t->SUPER::parsefile( @_); }; 785 return _checked_parse_result( $t, $@); 786 } 787 788sub _checked_parse_result 789 { my( $t, $returned)= @_; 790 if( !$t) 791 { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now}) 792 { $t= $returned; 793 delete $t->{twig_finish_now}; 794 return $t->_twig_final; 795 } 796 else 797 { _croak( $returned, 0); } 798 } 799 800 $active_twig= $t; 801 return $t; 802 } 803 804sub active_twig { return $active_twig; } 805 806sub finish_now 807 { my $t= shift; 808 $t->{twig_finish_now}=1; 809 # XML::Parser 2.43 changed xpcroak in a way that caused test failures for XML::Twig 810 # the change was reverted in 2.44, but this is here to ensure that tests pass with 2.43 811 if( $XML::Parser::VERSION == 2.43) 812 { no warnings; 813 $t->parser->{twig_error}= $t; 814 *XML::Parser::Expat::xpcroak= sub { die $_[0]->{twig_error}; }; 815 die $t; 816 } 817 else 818 { die $t; } 819 } 820 821 822sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); } 823sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); } 824 825sub _parse_inplace 826 { my( $t, $method, $file, $suffix)= @_; 827 _use( 'File::Temp') || croak "need File::Temp to use inplace methods\n"; 828 _use( 'File::Basename'); 829 830 831 my $tmpdir= dirname( $file); 832 my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir); 833 my $original_fh= select $tmpfh; 834 835 # we can only use binmode :utf8 if perl was compiled with useperlio 836 # might be a problem if keep_encoding used but the file is already in utf8 837 if( $perl_version > 5.006 && !$t->{twig_keep_encoding} && _use_perlio()) { binmode( $tmpfh, ":utf8" ); } 838 839 $t->$method( $file); 840 841 select $original_fh; 842 close $tmpfh; 843 my $mode= (stat( $file))[2] & oct(7777); 844 chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!"; 845 846 if( $suffix) 847 { my $backup; 848 if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; } 849 else { $backup= $file . $suffix; } 850 851 rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!"; 852 } 853 rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!"; 854 855 return $t; 856 } 857 858 859sub parseurl 860 { my $t= shift; 861 $t->_parseurl( 0, @_); 862 } 863 864sub safe_parseurl 865 { my $t= shift; 866 $t->_parseurl( 1, @_); 867 } 868 869sub safe_parsefile_html 870 { my $t= shift; 871 eval { $t->parsefile_html( @_); }; 872 return $@ ? $t->_reset_twig_after_error : $t; 873 } 874 875sub safe_parseurl_html 876 { my $t= shift; 877 _use( 'LWP::Simple') or croak "missing LWP::Simple"; 878 eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ; 879 return $@ ? $t->_reset_twig_after_error : $t; 880 } 881 882sub parseurl_html 883 { my $t= shift; 884 _use( 'LWP::Simple') or croak "missing LWP::Simple"; 885 $t->parse_html( LWP::Simple::get( shift()), @_); 886 } 887 888 889# uses eval to catch the parser's death 890sub safe_parse_html 891 { my $t= shift; 892 eval { $t->parse_html( @_); } ; 893 return $@ ? $t->_reset_twig_after_error : $t; 894 } 895 896sub parsefile_html 897 { my $t= shift; 898 my $file= shift; 899 my $indent= $t->{ErrorContext} ? 1 : 0; 900 $t->set_empty_tag_style( 'html'); 901 my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml; 902 my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} }; 903 $t->parse( $html2xml->( _slurp( $file), $options), @_); 904 return $t; 905 } 906 907sub parse_html 908 { my $t= shift; 909 my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {}; 910 my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy}; 911 my $content= shift; 912 my $indent= $t->{ErrorContext} ? 1 : 0; 913 $t->set_empty_tag_style( 'html'); 914 my $html2xml= $use_tidy ? \&_tidy_html : \&_html2xml; 915 my $conv_options= $use_tidy ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} }; 916 $t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_); 917 return $t; 918 } 919 920sub xparse 921 { my $t= shift; 922 my $to_parse= $_[0]; 923 if( isa( $to_parse, 'GLOB')) { $t->parse( @_); } 924 elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_) 925 : $t->parse( @_); 926 } 927 elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; 928 $t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_); 929 } 930 elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; 931 my $doc= LWP::Simple::get( shift); 932 if( ! defined $doc) { $doc=''; } 933 my $xml_parse_ok= $t->safe_parse( $doc, @_); 934 if( $xml_parse_ok) 935 { return $xml_parse_ok; } 936 else 937 { my $diag= $@; 938 if( $doc=~ m{<html}i) 939 { $t->parse_html( $doc, @_); } 940 else 941 { croak $diag; } 942 } 943 } 944 elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift); 945 $t->_parse_as_xml_or_html( $content, @_); 946 } 947 else { $t->parsefile( @_); } 948 } 949 950sub _parse_as_xml_or_html 951 { my $t= shift; 952 if( _is_well_formed_xml( $_[0])) 953 { $t->parse( @_) } 954 else 955 { my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml; 956 my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => 0, html_doctype => $t->{html_doctype} }; 957 my $html= $html2xml->( $_[0], $options, @_); 958 if( _is_well_formed_xml( $html)) 959 { $t->parse( $html); } 960 else 961 { croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions 962 } 963 } 964 965{ my $parser; 966 sub _is_well_formed_xml 967 { $parser ||= XML::Parser->new; 968 eval { $parser->parse( $_[0]); }; 969 return $@ ? 0 : 1; 970 } 971} 972 973sub nparse 974 { my $class= shift; 975 my $to_parse= pop; 976 $class->new( @_)->xparse( $to_parse); 977 } 978 979sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); } 980sub nparse_e { shift()->nparse( error_context => 1, @_); } 981sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); } 982 983 984sub _html2xml 985 { my( $html, $options)= @_; 986 _use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n"; 987 my $tree= HTML::TreeBuilder->new; 988 $tree->ignore_ignorable_whitespace( 0); 989 $tree->ignore_unknown( 0); 990 $tree->no_space_compacting( 1); 991 $tree->store_comments( 1); 992 $tree->store_pis(1); 993 $tree->parse( $html); 994 $tree->eof; 995 996 my $xml=''; 997 if( $options->{html_doctype} && exists $tree->{_decl} ) 998 { my $decl= $tree->{_decl}->as_XML; 999 1000 # first try to fix declarations that are missing the SYSTEM part 1001 $decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >} 1002 { my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE}; 1003 qq{<!DOCTYPE $1 PUBLIC "$2" "$system">} 1004 1005 }xe; 1006 1007 # then check that the declaration looks OK (so it parses), if not remove it, 1008 # better to parse without the declaration than to die stupidly 1009 if( $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM 1010 || $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x # just SYSTEM 1011 ) 1012 { $xml= $decl; } 1013 } 1014 1015 $xml.= _as_XML( $tree); 1016 1017 1018 _fix_xml( $tree, \$xml); 1019 1020 if( $options->{indent}) { _indent_xhtml( \$xml); } 1021 $tree->delete; 1022 $xml=~ s{\s+$}{}s; # trim end 1023 return $xml; 1024 } 1025 1026sub _tidy_html 1027 { my( $html, $options)= @_; 1028 _use( 'HTML::Tidy') or croak "cannot cleanup HTML using HTML::Tidy (required by the use_tidy option): $@\n"; ; 1029 my $TIDY_DEFAULTS= { output_xhtml => 1, # duh! 1030 tidy_mark => 0, # do not add the "generated by tidy" comment 1031 numeric_entities => 1, 1032 char_encoding => 'utf8', 1033 bare => 1, 1034 clean => 1, 1035 doctype => 'transitional', 1036 fix_backslash => 1, 1037 merge_divs => 0, 1038 merge_spans => 0, 1039 sort_attributes => 'alpha', 1040 indent => 0, 1041 wrap => 0, 1042 break_before_br => 0, 1043 }; 1044 $options ||= {}; 1045 my $tidy_options= { %$TIDY_DEFAULTS, %$options}; 1046 my $tidy = HTML::Tidy->new( $tidy_options); 1047 $tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean 1048 my $xml= $tidy->clean( $html ); 1049 return $xml; 1050 } 1051 1052 1053{ my %xml_parser_encoding; 1054 sub _fix_xml 1055 { my( $tree, $xml)= @_; # $xml is a ref to the xml string 1056 1057 my $max_tries=5; 1058 my $add_decl; 1059 1060 while( ! _check_xml( $xml) && $max_tries--) 1061 { 1062 # a couple of fixes for weird HTML::TreeBuilder errors 1063 if( $@=~ m{^\s*xml (or text )?declaration not at start of (external )?entity}i) 1064 { $$xml=~ s{<\?xml.*?\?>}{}g; 1065 #warn " fixed xml declaration in the wrong place\n"; 1066 } 1067 elsif( $@=~ m{undefined entity}) 1068 { $$xml=~ s{&(amp;)?Amp;}{&}g if $HTML::TreeBuilder::VERSION < 4.00; 1069 if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); } 1070 $$xml=~ s{&(\w+);}{ my $ent= $1; if( $ent !~ m{^(amp|lt|gt|apos|quote)$}) { "&$ent;" } }eg; 1071 } 1072 elsif( $@=~ m{&Amp; used in html}) 1073 # if $Amp; is used instead of & then HTML::TreeBuilder's as_xml is tripped (old version) 1074 { $$xml=~ s{&(amp;)?Amp;}{&}g if $HTML::TreeBuilder::VERSION < 4.00; 1075 } 1076 elsif( $@=~ m{^\s*not well-formed \(invalid token\)}) 1077 { if( $HTML::TreeBuilder::VERSION < 4.00) 1078 { $$xml=~ s{&(amp;)?Amp;}{&}g; 1079 $$xml=~ s{(<[^>]* )(\d+=)"}{$1a$2"}g; # <table 1> comes out as <table 1="1">, "fix the attribute 1080 } 1081 my $q= '<img "=""" '; # extracted so vim doesn't get confused 1082 if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); } 1083 if( $$xml=~ m{$q}) 1084 { $$xml=~ s{$q}{<img }g; # happens with <img src="foo.png"" ... 1085 } 1086 else 1087 { my $encoding= _encoding_from_meta( $tree); 1088 unless( keys %xml_parser_encoding) { %xml_parser_encoding= _xml_parser_encodings(); } 1089 1090 if( ! $add_decl) 1091 { if( $xml_parser_encoding{$encoding}) 1092 { $add_decl=1; } 1093 elsif( $encoding eq 'euc-jp' && $xml_parser_encoding{'x-euc-jp-jisx0221'}) 1094 { $encoding="x-euc-jp-jisx0221"; $add_decl=1;} 1095 elsif( $encoding eq 'shift-jis' && $xml_parser_encoding{'x-sjis-jisx0221'}) 1096 { $encoding="x-sjis-jisx0221"; $add_decl=1;} 1097 1098 if( $add_decl) 1099 { $$xml=~ s{^(<\?xml.*?\?>)?}{<?xml version="1.0" encoding="$encoding"?>}s; 1100 #warn " added decl (encoding $encoding)\n"; 1101 } 1102 else 1103 { $$xml=~ s{^(<\?xml.*?\?>)?}{}s; 1104 #warn " converting to utf8 from $encoding\n"; 1105 $$xml= _to_utf8( $encoding, $$xml); 1106 } 1107 } 1108 else 1109 { $$xml=~ s{^(<\?xml.*?\?>)?}{}s; 1110 #warn " converting to utf8 from $encoding\n"; 1111 $$xml= _to_utf8( $encoding, $$xml); 1112 } 1113 } 1114 } 1115 } 1116 1117 # some versions of HTML::TreeBuilder escape CDATA sections 1118 $$xml=~ s{(<!\[CDATA\[.*?\]\]>)}{_unescape_cdata( $1)}eg; 1119 1120 } 1121 1122 sub _xml_parser_encodings 1123 { my @encodings=( 'iso-8859-1'); # this one is included by default, there is no map for it in @INC 1124 foreach my $inc (@INC) 1125 { push @encodings, map { basename( $_, '.enc') } glob( File::Spec->catdir( $inc => XML => Parser => Encodings => '*.enc')); } 1126 return map { $_ => 1 } @encodings; 1127 } 1128} 1129 1130 1131sub _unescape_cdata 1132 { my( $cdata)= @_; 1133 $cdata=~s{<}{<}g; 1134 $cdata=~s{>}{>}g; 1135 $cdata=~s{&}{&}g; 1136 return $cdata; 1137 } 1138 1139sub _as_XML { 1140 1141 # fork of HTML::Element::as_XML, which is a little too buggy and inconsistent between versions for my liking 1142 my ($elt) = @_; 1143 my $xml= ''; 1144 my $empty_element_map = $elt->_empty_element_map; 1145 1146 my ( $tag, $node, $start ); # per-iteration scratch 1147 $elt->traverse( 1148 sub { 1149 ( $node, $start ) = @_; 1150 if ( ref $node ) 1151 { # it's an element 1152 $tag = $node->{'_tag'}; 1153 if ($start) 1154 { # on the way in 1155 foreach my $att ( grep { ! m{^(_|/$)} } keys %$node ) 1156 { # fix attribute names instead of dying 1157 my $new_att= $att; 1158 if( $att=~ m{^\d}) { $new_att= "a$att"; } 1159 $new_att=~ s{[^\w\d:_-]}{}g; 1160 $new_att ||= 'a'; 1161 if( $new_att ne $att) { $node->{$new_att}= delete $node->{$att}; } 1162 } 1163 1164 if ( $empty_element_map->{$tag} && (!@{ $node->{'_content'} || []}) ) 1165 { $xml.= $node->starttag_XML( undef, 1 ); } 1166 else 1167 { $xml.= $node->starttag_XML(undef); } 1168 } 1169 else 1170 { # on the way out 1171 unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || [] } ) 1172 { $xml.= $node->endtag_XML(); 1173 } # otherwise it will have been an <... /> tag. 1174 } 1175 } 1176 elsif( $node=~ /<!\[CDATA\[/) # the content includes CDATA 1177 { foreach my $chunk (split /(<!\[CDATA\[.*?\]\]>)/s, $node) # chunks are CDATA sections or normal text 1178 { $xml.= $chunk =~ m{<!\[CDATA\[} ? $chunk : _xml_escape( $chunk); } 1179 } 1180 else # it's just text 1181 { $xml .= _xml_escape($node); } 1182 1; # keep traversing 1183 } 1184 ); 1185 return $xml; 1186} 1187 1188sub _xml_escape 1189 { my( $html)= @_; 1190 $html =~ s{&(?! # An ampersand that isn't followed by... 1191 ( \#[0-9]+; | # A hash mark, digits and semicolon, or 1192 \#x[0-9a-fA-F]+; | # A hash mark, "x", hex digits and semicolon, or 1193 [\w]+; # A valid unicode entity name and semicolon 1194 ) 1195 ) 1196 } 1197 {&}gx if 0; # Needs to be escaped to amp 1198 1199 $html=~ s{&}{&}g; 1200 1201 # in old versions of HTML::TreeBuilder & can come out as &Amp; 1202 if( $HTML::TreeBuilder::VERSION && $HTML::TreeBuilder::VERSION <= 3.23) { $html=~ s{&Amp;}{&}g; } 1203 1204 # simple character escapes 1205 $html =~ s/</</g; 1206 $html =~ s/>/>/g; 1207 $html =~ s/"/"/g; 1208 $html =~ s/'/'/g; 1209 1210 return $html; 1211 } 1212 1213 1214 1215 1216sub _check_xml 1217 { my( $xml)= @_; # $xml is a ref to the xml string 1218 my $ok= eval { XML::Parser->new->parse( $$xml); }; 1219 #if( $ok) { warn " parse OK\n"; } 1220 return $ok; 1221 } 1222 1223sub _encoding_from_meta 1224 { my( $tree)= @_; 1225 my $enc="iso-8859-1"; 1226 my @meta= $tree->find( 'meta'); 1227 foreach my $meta (@meta) 1228 { if( $meta->{'http-equiv'} && ($meta->{'http-equiv'} =~ m{^\s*content-type\s*}i) 1229 && $meta->{content} && ($meta->{content} =~ m{^\s*text/html\s*;\s*charset\s*=\s*(\S*)\s*}i) 1230 ) 1231 { $enc= lc $1; 1232 #warn " encoding from meta tag is '$enc'\n"; 1233 last; 1234 } 1235 } 1236 return $enc; 1237 } 1238 1239{ sub _to_utf8 1240 { my( $encoding, $string)= @_; 1241 local $SIG{__DIE__}; 1242 if( _use( 'Encode')) 1243 { Encode::from_to( $string, $encoding => 'utf8', 0x0400); } # 0x0400 is Encode::FB_XMLCREF 1244 elsif( _use( 'Text::Iconv')) 1245 { my $converter = eval { Text::Iconv->new( $encoding => "utf8") }; 1246 if( $converter) { $string= $converter->convert( $string); } 1247 } 1248 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 1249 { my $map= Unicode::Map8->new( $encoding); 1250 $string= $map->tou( $string)->utf8; 1251 } 1252 $string=~ s{[\x00-\x08\x0B\x0C\x0E-\x1F]}{}g; # get rid of control chars, portable in 5.6 1253 return $string; 1254 } 1255} 1256 1257 1258sub _indent_xhtml 1259 { my( $xhtml)= @_; # $xhtml is a ref 1260 my %block_tag= map { $_ => 1 } qw( html 1261 head 1262 meta title link script base 1263 body 1264 h1 h2 h3 h4 h5 h6 1265 p br address blockquote pre 1266 ol ul li dd dl dt 1267 table tr td th tbody tfoot thead col colgroup caption 1268 div frame frameset hr 1269 ); 1270 1271 my $level=0; 1272 $$xhtml=~ s{( (?:<!(?:--.*?-->|[CDATA[.*?]]>)) # ignore comments and CDATA sections 1273 | <(\w+)((?:\s+\w+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*/>) # empty tag 1274 | <(\w+) # start tag 1275 |</(\w+) # end tag 1276 ) 1277 } 1278 { if( $2 && $block_tag{$2}) { my $indent= " " x $level; 1279 "\n$indent<$2$3"; 1280 } 1281 elsif( $4 && $block_tag{$4}) { my $indent= " " x $level; 1282 $level++ unless( $4=~ m{/>}); 1283 my $nl= $4 eq 'html' ? '' : "\n"; 1284 "$nl$indent<$4"; 1285 } 1286 elsif( $5 && $block_tag{$5}) { $level--; "</$5"; } 1287 else { $1; } 1288 }xesg; 1289 } 1290 1291 1292sub add_stylesheet 1293 { my( $t, $type, $href)= @_; 1294 my %text_type= map { $_ => 1 } qw( xsl css); 1295 my $ss= $t->{twig_elt_class}->new( $PI); 1296 if( $text_type{$type}) 1297 { $ss->_set_pi( 'xml-stylesheet', qq{type="text/$type" href="$href"}); } 1298 else 1299 { croak "unsupported style sheet type '$type'"; } 1300 1301 $t->_add_cpi_outside_of_root( leading_cpi => $ss); 1302 return $t; 1303 } 1304 1305{ my %used; # module => 1 if require ok, 0 otherwise 1306 my %disallowed; # for testing, refuses to _use modules in this hash 1307 1308 sub _disallow_use ## no critic (Subroutines::ProhibitNestedSubs); 1309 { my( @modules)= @_; 1310 $disallowed{$_}= 1 foreach (@modules); 1311 } 1312 1313 sub _allow_use ## no critic (Subroutines::ProhibitNestedSubs); 1314 { my( @modules)= @_; 1315 $disallowed{$_}= 0 foreach (@modules); 1316 } 1317 1318 sub _use ## no critic (Subroutines::ProhibitNestedSubs); 1319 { my( $module, $version)= @_; 1320 $version ||= 0; 1321 if( $disallowed{$module}) { return 0; } 1322 if( $used{$module}) { return 1; } 1323 if( eval "require $module") { import $module; $used{$module}= 1; # no critic ProhibitStringyEval 1324 if( $version) 1325 { 1326 ## no critic (TestingAndDebugging::ProhibitNoStrict); 1327 no strict 'refs'; 1328 if( ${"${module}::VERSION"} >= $version ) { return 1; } 1329 else { return 0; } 1330 } 1331 else 1332 { return 1; } 1333 } 1334 else { $used{$module}= 0; return 0; } 1335 } 1336} 1337 1338# used to solve the [n] predicates while avoiding getting the entire list 1339# needs a prototype to accept passing bare blocks 1340sub _first_n(&$@) ## no critic (Subroutines::ProhibitSubroutinePrototypes); 1341 { my $coderef= shift; 1342 my $n= shift; 1343 my $i=0; 1344 if( $n > 0) 1345 { foreach (@_) { if( &$coderef) { $i++; return $_ if( $i == $n); } } } 1346 elsif( $n < 0) 1347 { foreach (reverse @_) { if( &$coderef) { $i--; return $_ if( $i == $n); } } } 1348 else 1349 { croak "illegal position number 0"; } 1350 return undef; 1351 } 1352 1353sub _slurp_uri 1354 { my( $uri, $base)= @_; 1355 if( $uri=~ m{^\w+://}) { _use( 'LWP::Simple'); return LWP::Simple::get( $uri); } 1356 else { return _slurp( _based_filename( $uri, $base)); } 1357 } 1358 1359sub _based_filename 1360 { my( $filename, $base)= @_; 1361 # cf. XML/Parser.pm's file_ext_ent_handler 1362 if (defined($base) and not ($filename =~ m{^(?:[\\/]|\w+:)})) 1363 { my $newpath = $base; 1364 $newpath =~ s{[^\\/:]*$}{$filename}; 1365 $filename = $newpath; 1366 } 1367 return $filename; 1368 } 1369 1370sub _slurp 1371 { my( $filename)= @_; 1372 my $to_slurp; 1373 open( $to_slurp, "<$filename") or croak "cannot open '$filename': $!"; 1374 local $/= undef; 1375 my $content= <$to_slurp>; 1376 close $to_slurp; 1377 return $content; 1378 } 1379 1380sub _slurp_fh 1381 { my( $fh)= @_; 1382 local $/= undef; 1383 my $content= <$fh>; 1384 return $content; 1385 } 1386 1387# I should really add extra options to allow better configuration of the 1388# LWP::UserAgent object 1389# this method forks (except on VMS!) 1390# - the child gets the data and copies it to the pipe, 1391# - the parent reads the stream and sends it to XML::Parser 1392# the data is cut it chunks the size of the XML::Parser::Expat buffer 1393# the method returns the twig and the status 1394sub _parseurl 1395 { my( $t, $safe, $url, $agent)= @_; 1396 _use( 'LWP') || croak "LWP not available, needed to use parseurl methods"; 1397 if( $^O ne 'VMS') 1398 { pipe( README, WRITEME) or croak "cannot create connected pipes: $!"; 1399 if( my $pid= fork) 1400 { # parent code: parse the incoming file 1401 close WRITEME; # no need to write 1402 my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README); 1403 close README; 1404 return $@ ? 0 : $t; 1405 } 1406 else 1407 { # child 1408 close README; # no need to read 1409 local $|=1; 1410 $agent ||= LWP::UserAgent->new; 1411 my $request = HTTP::Request->new( GET => $url); 1412 # _pass_url_content is called with chunks of data the same size as 1413 # the XML::Parser buffer 1414 my $response = $agent->request( $request, 1415 sub { _pass_url_content( \*WRITEME, @_); }, $BUFSIZE); 1416 $response->is_success or croak "$url ", $response->message; 1417 close WRITEME; 1418 CORE::exit(); # CORE is there for mod_perl (which redefines exit) 1419 } 1420 } 1421 else 1422 { # VMS branch (hard to test!) 1423 local $|=1; 1424 $agent ||= LWP::UserAgent->new; 1425 my $request = HTTP::Request->new( GET => $url); 1426 my $response = $agent->request( $request); 1427 $response->is_success or croak "$url ", $response->message; 1428 my $result= $safe ? $t->safe_parse($response->content) : $t->parse($response->content); 1429 return $@ ? 0 : $t; 1430 } 1431 1432 } 1433 1434# get the (hopefully!) XML data from the URL and 1435sub _pass_url_content 1436 { my( $fh, $data, $response, $protocol)= @_; 1437 print {$fh} $data; 1438 } 1439 1440sub add_options 1441 { my %args= map { $_, 1 } @_; 1442 %args= _normalize_args( %args); 1443 foreach (keys %args) { $valid_option{$_}++; } 1444 } 1445 1446sub _pretty_print_styles { return XML::Twig::Elt::_pretty_print_styles(); } 1447 1448sub _twig_store_internal_dtd 1449 { 1450 # warn " in _twig_store_internal_dtd...\n"; # DEBUG handler 1451 my( $p, $string)= @_; 1452 my $t= $p->{twig}; 1453 if( $t->{twig_keep_encoding}) { $string= $p->original_string(); } 1454 $t->{twig_doctype}->{internal} .= $string; 1455 return; 1456 } 1457 1458sub _twig_stop_storing_internal_dtd 1459 { # warn " in _twig_stop_storing_internal_dtd...\n"; # DEBUG handler 1460 my $p= shift; 1461 if( @saved_default_handler && defined $saved_default_handler[1]) 1462 { $p->setHandlers( @saved_default_handler); } 1463 else 1464 { 1465 $p->setHandlers( Default => undef); 1466 } 1467 $p->{twig}->{twig_doctype}->{internal}=~ s{^\s*\[}{}; 1468 $p->{twig}->{twig_doctype}->{internal}=~ s{\]\s*$}{}; 1469 return; 1470 } 1471 1472sub _twig_doctype_fin_print 1473 { # warn " in _twig_doctype_fin_print...\n"; # DEBUG handler 1474 my( $p)= shift; 1475 if( $p->{twig}->{twig_doctype}->{has_internal} && !$expat_1_95_2) { print ' ]>'; } 1476 return; 1477 } 1478 1479 1480sub _normalize_args 1481 { my %normalized_args; 1482 while( my $key= shift ) 1483 { $key= join '', map { ucfirst } split /_/, $key; 1484 #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig'); 1485 $normalized_args{$key}= shift ; 1486 } 1487 return %normalized_args; 1488 } 1489 1490sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); } 1491 1492sub _set_handler 1493 { my( $handlers, $whole_path, $handler)= @_; 1494 1495 my $H_SPECIAL = qr{($ALL|$DEFAULT|$COMMENT|$TEXT)}; 1496 my $H_PI = qr{(\?|$PI)\s*(([^\s]*)\s*)}; 1497 my $H_LEVEL = qr{level \s* \( \s* ([0-9]+) \s* \)}x; 1498 my $H_REGEXP = qr{\(\?([\^xism]*)(-[\^xism]*)?:(.*)\)}x; 1499 my $H_XPATH = qr{(/?/?$REG_TAG_PART? \s* ($REG_PREDICATE\s*)?)+}x; 1500 1501 my $prev_handler; 1502 1503 my $cpath= $whole_path; 1504 #warn "\$cpath: '$cpath\n"; 1505 while( $cpath && $cpath=~ s{^\s*($H_SPECIAL|$H_PI|$H_LEVEL|$H_REGEXP|$H_XPATH)\s*($|\|)}{}) 1506 { my $path= $1; 1507 #warn "\$cpath: '$cpath' - $path: '$path'\n"; 1508 $prev_handler ||= $handlers->{handlers}->{string}->{$path} || undef; # $prev_handler gets the first found handler 1509 1510 _set_special_handler ( $handlers, $path, $handler, $prev_handler) 1511 || _set_pi_handler ( $handlers, $path, $handler, $prev_handler) 1512 || _set_level_handler ( $handlers, $path, $handler, $prev_handler) 1513 || _set_regexp_handler ( $handlers, $path, $handler, $prev_handler) 1514 || _set_xpath_handler ( $handlers, $path, $handler, $prev_handler) 1515 || croak "unrecognized expression in handler: '$whole_path'"; 1516 1517 # this both takes care of the simple (gi) handlers and store 1518 # the handler code reference for other handlers 1519 $handlers->{handlers}->{string}->{$path}= $handler; 1520 } 1521 1522 if( $cpath) { croak "unrecognized expression in handler: '$whole_path'"; } 1523 1524 return $prev_handler; 1525 } 1526 1527 1528sub _set_special_handler 1529 { my( $handlers, $path, $handler, $prev_handler)= @_; 1530 if( $path =~ m{^\s*($ALL|$DEFAULT|$COMMENT|$TEXT)\s*$}io ) 1531 { $handlers->{handlers}->{$1}= $handler; 1532 return 1; 1533 } 1534 else 1535 { return 0; } 1536 } 1537 1538sub _set_xpath_handler 1539 { my( $handlers, $path, $handler, $prev_handler)= @_; 1540 if( my $handler_data= _parse_xpath_handler( $path, $handler)) 1541 { _add_handler( $handlers, $handler_data, $path, $prev_handler); 1542 return 1; 1543 } 1544 else 1545 { return 0; } 1546 } 1547 1548sub _add_handler 1549 { my( $handlers, $handler_data, $path, $prev_handler)= @_; 1550 1551 my $tag= $handler_data->{tag}; 1552 my @handlers= $handlers->{xpath_handler}->{$tag} ? @{$handlers->{xpath_handler}->{$tag}} : (); 1553 1554 if( $prev_handler) { @handlers= grep { $_->{path} ne $path } @handlers; } 1555 1556 push @handlers, $handler_data if( $handler_data->{handler}); 1557 1558 if( @handlers > 1) 1559 { @handlers= sort { (($b->{score}->{type} || 0) <=> ($a->{score}->{type} || 0)) 1560 || (($b->{score}->{anchored} || 0) <=> ($a->{score}->{anchored} || 0)) 1561 || (($b->{score}->{steps} || 0) <=> ($a->{score}->{steps} || 0)) 1562 || (($b->{score}->{predicates} || 0) <=> ($a->{score}->{predicates} || 0)) 1563 || (($b->{score}->{tests} || 0) <=> ($a->{score}->{tests} || 0)) 1564 || ($a->{path} cmp $b->{path}) 1565 } @handlers; 1566 } 1567 1568 $handlers->{xpath_handler}->{$tag}= \@handlers; 1569 } 1570 1571sub _set_pi_handler 1572 { my( $handlers, $path, $handler, $prev_handler)= @_; 1573 # PI conditions ( '?target' => \&handler or '?' => \&handler 1574 # or '#PItarget' => \&handler or '#PI' => \&handler) 1575 if( $path=~ /^\s*(?:\?|$PI)\s*(?:([^\s]*)\s*)$/) 1576 { my $target= $1 || ''; 1577 # update the path_handlers count, knowing that 1578 # either the previous or the new handler can be undef 1579 $handlers->{pi_handlers}->{$1}= $handler; 1580 return 1; 1581 } 1582 else 1583 { return 0; 1584 } 1585 } 1586 1587sub _set_level_handler 1588 { my( $handlers, $path, $handler, $prev_handler)= @_; 1589 if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox ) 1590 { my $level= $1; 1591 my $sub= sub { my( $stack)= @_; return( ($stack->[-1]->{$ST_TAG} !~ m{^#}) && (scalar @$stack == $level + 1) ) }; 1592 my $handler_data= { tag=> '*', score => { type => $LEVEL_TRIGGER}, trigger => $sub, 1593 path => $path, handler => $handler, test_on_text => 0 1594 }; 1595 _add_handler( $handlers, $handler_data, $path, $prev_handler); 1596 return 1; 1597 } 1598 else 1599 { return 0; } 1600 } 1601 1602sub _set_regexp_handler 1603 { my( $handlers, $path, $handler, $prev_handler)= @_; 1604 # if the expression was a regexp it is now a string (it was stringified when it became a hash key) 1605 if( $path=~ m{^\(\?([\^xism]*)(?:-[\^xism]*)?:(.*)\)$}) 1606 { my $regexp= qr/(?$1:$2)/; # convert it back into a regexp 1607 my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{$ST_TAG} =~ $regexp ) }; 1608 my $handler_data= { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub, 1609 path => $path, handler => $handler, test_on_text => 0 1610 }; 1611 _add_handler( $handlers, $handler_data, $path, $prev_handler); 1612 return 1; 1613 } 1614 else 1615 { return 0; } 1616 } 1617 1618my $DEBUG_HANDLER= 0; # 0 or 1 (output the handler checking code) or 2 (super verbose) 1619my $handler_string; # store the handler itself 1620sub _set_debug_handler { $DEBUG_HANDLER= shift; } 1621sub _warn_debug_handler { if( $DEBUG_HANDLER < 3) { warn @_; } else { $handler_string .= join( '', @_); } } 1622sub _return_debug_handler { my $string= $handler_string; $handler_string=''; return $string; } 1623 1624sub _parse_xpath_handler 1625 { my( $xpath, $handler)= @_; 1626 my $xpath_original= $xpath; 1627 1628 1629 if( $DEBUG_HANDLER >=1) { _warn_debug_handler( "\n\nparsing path '$xpath'\n"); } 1630 1631 my $path_to_check= $xpath; 1632 $path_to_check=~ s{/?/?$REG_TAG_PART?\s*(?:$REG_PREDICATE\s*)?}{}g; 1633 if( $DEBUG_HANDLER && $path_to_check=~ /\S/) { _warn_debug_handler( "left: $path_to_check\n"); } 1634 return if( $path_to_check=~ /\S/); 1635 1636 (my $xpath_to_display= $xpath)=~ s{(["{}'\[\]\@\$])}{\\$1}g; 1637 1638 my @xpath_steps; 1639 my $last_token_is_sep; 1640 1641 while( $xpath=~ s{^\s* 1642 ( (//?) # separator 1643 | (?:$REG_TAG_PART\s*(?:$REG_PREDICATE\s*)?) # tag name and optional predicate 1644 | (?:$REG_PREDICATE) # just a predicate 1645 ) 1646 } 1647 {}x 1648 ) 1649 { # check that we have alternating separators and steps 1650 if( $2) # found a separator 1651 { if( $last_token_is_sep) { return 0; } # 2 separators in a row 1652 $last_token_is_sep= 1; 1653 } 1654 else 1655 { if( defined( $last_token_is_sep) && !$last_token_is_sep) { return 0; } # 2 steps in a row 1656 $last_token_is_sep= 0; 1657 } 1658 1659 push @xpath_steps, $1; 1660 } 1661 if( $last_token_is_sep) { return 0; } # expression cannot end with a separator 1662 1663 my $i=-1; 1664 1665 my $perlfunc= _join_n( $NO_WARNINGS . ';', 1666 q|my( $stack)= @_; |, 1667 q|my @current_elts= (scalar @$stack); |, 1668 q|my @new_current_elts; |, 1669 q|my $elt; |, 1670 ($DEBUG_HANDLER >= 1) && (qq#warn q{checking path '$xpath_to_display'\n};#), 1671 ); 1672 1673 1674 my $last_tag=''; 1675 my $anchored= $xpath_original=~ m{^\s*/(?!/)} ? 1 : 0; 1676 my $score={ type => $XPATH_TRIGGER, anchored => $anchored }; 1677 my $flag= { test_on_text => 0 }; 1678 my $sep='/'; # '/' or '//' 1679 while( my $xpath_step= pop @xpath_steps) 1680 { my( $tag, $predicate)= $xpath_step =~ m{^($REG_TAG_PART)?(?:\[(.*)\])?\s*$}; 1681 $score->{steps}++; 1682 $tag||='*'; 1683 1684 my $warn_empty_stack= $DEBUG_HANDLER >= 2 ? qq{warn "return with empty stack\\n";} : ''; 1685 1686 if( $predicate) 1687 { if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate is: '$predicate'\n"); } 1688 # changes $predicate (from an XPath expression to a Perl one) 1689 if( $predicate=~ m{^\s*$REG_NUMBER\s*$}) { croak "position selector [$predicate] not supported on twig_handlers"; } 1690 _parse_predicate_in_handler( $predicate, $flag, $score); 1691 if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate becomes: '$predicate'\n"); } 1692 } 1693 1694 my $tag_cond= _tag_cond( $tag); 1695 my $cond= join( " && ", grep { $_ } $tag_cond, $predicate) || 1; 1696 1697 if( $css_sel && $tag=~ m{\.}) { $tag=~s{\.[^.]*$}{}; $tag ||='*'; } 1698 $tag=~ s{(.)#.+$}{$1}; 1699 1700 $last_tag ||= $tag; 1701 1702 if( $sep eq '/') 1703 { 1704 $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #, 1705 q# { next if( !$current_elt); #, 1706 q# $current_elt--; #, 1707 q# $elt= $stack->[$current_elt]; #, 1708 q# if( %s) { push @new_current_elts, $current_elt;} #, 1709 q# } #, 1710 ), 1711 $cond 1712 ); 1713 } 1714 elsif( $sep eq '//') 1715 { 1716 $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #, 1717 q# { next if( !$current_elt); #, 1718 q# $current_elt--; #, 1719 q# my $candidate= $current_elt; #, 1720 q# while( $candidate >=0) #, 1721 q# { $elt= $stack->[$candidate]; #, 1722 q# if( %s) { push @new_current_elts, $candidate;} #, 1723 q# $candidate--; #, 1724 q# } #, 1725 q# } #, 1726 ), 1727 $cond 1728 ); 1729 } 1730 my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq%fail at cond '$cond'%;#) : ''; 1731 $perlfunc .= sprintf( _join_n( q#unless( @new_current_elts) { %s return 0; } #, 1732 q#@current_elts= @new_current_elts; #, 1733 q#@new_current_elts=(); #, 1734 ), 1735 $warn 1736 ); 1737 1738 $sep= pop @xpath_steps; 1739 } 1740 1741 if( $anchored) # there should be a better way, but this works 1742 { 1743 my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq{fail, stack not empty};#) : ''; 1744 $perlfunc .= sprintf( _join_n( q#if( ! grep { $_ == 0 } @current_elts) { %s return 0;}#), $warn); 1745 } 1746 1747 $perlfunc.= qq{warn "handler for '$xpath_to_display' triggered\\n";\n} if( $DEBUG_HANDLER >=2); 1748 $perlfunc.= qq{return q{$xpath_original};\n}; 1749 _warn_debug_handler( "\nperlfunc:\n$perlfunc\n") if( $DEBUG_HANDLER>=1); 1750 my $s= eval "sub { $perlfunc }"; 1751 if( $@) 1752 { croak "wrong handler condition '$xpath' ($@);" } 1753 1754 _warn_debug_handler( "last tag: '$last_tag', test_on_text: '$flag->{test_on_text}'\n") if( $DEBUG_HANDLER >=1); 1755 _warn_debug_handler( "score: ", join( ' ', map { "$_: $score->{$_}" } sort keys %$score), "\n") if( $DEBUG_HANDLER >=1); 1756 return { tag=> $last_tag, score => $score, trigger => $s, path => $xpath_original, handler => $handler, test_on_text => $flag->{test_on_text} }; 1757 } 1758 1759sub _join_n { return join( "\n", @_, ''); } 1760 1761# the "tag" part can be <tag>, <tag>.<class> or <tag>#<id> (where tag can be *, or start with # for hidden tags) 1762sub _tag_cond 1763 { my( $full_tag)= @_; 1764 1765 my( $tag, $class, $id); 1766 if( $full_tag=~ m{^(.+)#(.+)$}) 1767 { ($tag, $id)= ($1, $2); } # <tag>#<id> 1768 else 1769 { ( $tag, $class)= $css_sel ? $full_tag=~ m{^(.*?)(?:\.([^.]*))?$} : ($full_tag, undef); } 1770 1771 my $tag_cond = $tag && $tag ne '*' ? qq#(\$elt->{'$ST_TAG'} eq "$tag")# : ''; 1772 my $id_cond = defined $id ? qq#(\$elt->{id} eq "$id")# : ''; 1773 my $class_cond = defined $class ? qq#(\$elt->{class}=~ m{(^| )$class( |\$)})# : ''; 1774 1775 my $full_cond= join( ' && ', grep { $_ } ( $tag_cond, $class_cond, $id_cond)); 1776 1777 return $full_cond; 1778 } 1779 1780# input: the predicate ($_[0]) which will be changed in place 1781# flags, a hashref with various flags (like test_on_text) 1782# the score 1783sub _parse_predicate_in_handler 1784 { my( $flag, $score)= @_[1..2]; 1785 $_[0]=~ s{( ($REG_STRING) # strings 1786 |\@($REG_TAG_NAME)(\s* $REG_MATCH \s* $REG_REGEXP) # @att and regexp 1787 |\@($REG_TAG_NAME)(?=\s*(?:[><=!])) # @att followed by a comparison operator 1788 |\@($REG_TAG_NAME) # @att (not followed by a comparison operator) 1789 |=~|!~ # matching operators 1790 |([><]=?|=|!=)(?=\s*[\d+-]) # test before a number 1791 |([><]=?|=|!=) # test, other cases 1792 |($REG_FUNCTION) # no arg functions 1793 # this bit is a mess, but it is the only solution with this half-baked parser 1794 |(string\(\s*$REG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child)=~ /regexp/ 1795 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_STRING) # string( child) = "value" (or other test) 1796 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_NUMBER) # string( child) = nb (or other test) 1797 |(and|or) 1798 # |($REG_NAME(?=\s*(and|or|$))) # nested tag name (needs to be after all other unquoted strings) 1799 |($REG_TAG_IN_PREDICATE) # nested tag name (needs to be after all other unquoted strings) 1800 1801 )} 1802 { my( $token, $str, $att_re_name, $att_re_regexp, $att, $bare_att, $num_test, $alpha_test, $func, $str_regexp, $str_test_alpha, $str_test_num, $and_or, $tag) 1803 = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14); 1804 1805 $score->{predicates}++; 1806 1807 # store tests on text (they are not always allowed) 1808 if( $func || $str_regexp || $str_test_num || $str_test_alpha ) { $flag->{test_on_text}= 1; } 1809 1810 if( defined $str) { $token } 1811 elsif( $tag) { qq{(\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->has_child( '$tag'))} } 1812 elsif( $att) { $att=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att'})} 1813 : qq{\$elt->{'$att'}} 1814 } 1815 elsif( $att_re_name) { $att_re_name=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att_re_name'}$att_re_regexp)} 1816 : qq{\$elt->{'$att_re_name'}$att_re_regexp} 1817 } 1818 # for some reason Devel::Cover flags the following lines as not tested. They are though. 1819 elsif( $bare_att) { $bare_att=~ m{^#} ? qq{(\$elt->{'$ST_ELT'} && defined(\$elt->{'$ST_ELT'}->{att}->{'$bare_att'}))} 1820 : qq{defined( \$elt->{'$bare_att'})} 1821 } 1822 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged 1823 elsif( $alpha_test) { $PERL_ALPHA_TEST{$alpha_test} } 1824 elsif( $func && $func=~ m{^string}) 1825 { "\$elt->{'$ST_ELT'}->text"; } 1826 elsif( $str_regexp && $str_regexp =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)}) 1827 { "defined( _first_n { \$_->text $2 $3 } 1, \$elt->{'$ST_ELT'}->_children( '$1'))"; } 1828 elsif( $str_test_alpha && $str_test_alpha =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_STRING)}) 1829 { my( $tag, $op, $str)= ($1, $2, $3); 1830 $str=~ s{(?<=.)'(?=.)}{\\'}g; # escape a quote within the string 1831 $str=~ s{^"}{'}; 1832 $str=~ s{"$}{'}; 1833 "defined( _first_n { \$_->text $PERL_ALPHA_TEST{$op} $str } 1, \$elt->{'$ST_ELT'}->children( '$tag'))"; } 1834 elsif( $str_test_num && $str_test_num =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_NUMBER)}) 1835 { my $test= ($2 eq '=') ? '==' : $2; 1836 "defined( _first_n { \$_->text $test $3 } 1, \$elt->{'$ST_ELT'}->children( '$1'))"; 1837 } 1838 elsif( $and_or) { $score->{tests}++; $and_or eq 'and' ? '&&' : '||' ; } 1839 else { $token; } 1840 }gexs; 1841 } 1842 1843 1844sub setCharHandler 1845 { my( $t, $handler)= @_; 1846 $t->{twig_char_handler}= $handler; 1847 } 1848 1849 1850sub _reset_handlers 1851 { my $handlers= shift; 1852 delete $handlers->{handlers}; 1853 delete $handlers->{path_handlers}; 1854 delete $handlers->{subpath_handlers}; 1855 $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers}); 1856 delete $handlers->{attcond_handlers}; 1857 } 1858 1859sub _set_handlers 1860 { my $handlers= shift || return; 1861 my $set_handlers= {}; 1862 foreach my $path (keys %{$handlers}) 1863 { _set_handler( $set_handlers, $path, $handlers->{$path}); } 1864 1865 return $set_handlers; 1866 } 1867 1868 1869sub setTwigHandler 1870 { my( $t, $path, $handler)= @_; 1871 $t->{twig_handlers} ||={}; 1872 return _set_handler( $t->{twig_handlers}, $path, $handler); 1873 } 1874 1875sub setTwigHandlers 1876 { my( $t, $handlers)= @_; 1877 my $previous_handlers= $t->{twig_handlers} || undef; 1878 _reset_handlers( $t->{twig_handlers}); 1879 $t->{twig_handlers}= _set_handlers( $handlers); 1880 return $previous_handlers; 1881 } 1882 1883sub setStartTagHandler 1884 { my( $t, $path, $handler)= @_; 1885 $t->{twig_starttag_handlers}||={}; 1886 return _set_handler( $t->{twig_starttag_handlers}, $path, $handler); 1887 } 1888 1889sub setStartTagHandlers 1890 { my( $t, $handlers)= @_; 1891 my $previous_handlers= $t->{twig_starttag_handlers} || undef; 1892 _reset_handlers( $t->{twig_starttag_handlers}); 1893 $t->{twig_starttag_handlers}= _set_handlers( $handlers); 1894 return $previous_handlers; 1895 } 1896 1897sub setIgnoreEltsHandler 1898 { my( $t, $path, $action)= @_; 1899 $t->{twig_ignore_elts_handlers}||={}; 1900 return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action ); 1901 } 1902 1903sub setIgnoreEltsHandlers 1904 { my( $t, $handlers)= @_; 1905 my $previous_handlers= $t->{twig_ignore_elts_handlers}; 1906 _reset_handlers( $t->{twig_ignore_elts_handlers}); 1907 $t->{twig_ignore_elts_handlers}= _set_handlers( $handlers); 1908 return $previous_handlers; 1909 } 1910 1911sub setEndTagHandler 1912 { my( $t, $path, $handler)= @_; 1913 $t->{twig_endtag_handlers}||={}; 1914 return _set_handler( $t->{twig_endtag_handlers}, $path,$handler); 1915 } 1916 1917sub setEndTagHandlers 1918 { my( $t, $handlers)= @_; 1919 my $previous_handlers= $t->{twig_endtag_handlers}; 1920 _reset_handlers( $t->{twig_endtag_handlers}); 1921 $t->{twig_endtag_handlers}= _set_handlers( $handlers); 1922 return $previous_handlers; 1923 } 1924 1925# a little more complex: set the twig_handlers only if a code ref is given 1926sub setTwigRoots 1927 { my( $t, $handlers)= @_; 1928 my $previous_roots= $t->{twig_roots}; 1929 _reset_handlers($t->{twig_roots}); 1930 $t->{twig_roots}= _set_handlers( $handlers); 1931 1932 _check_illegal_twig_roots_handlers( $t->{twig_roots}); 1933 1934 foreach my $path (keys %{$handlers}) 1935 { $t->{twig_handlers}||= {}; 1936 _set_handler( $t->{twig_handlers}, $path, $handlers->{$path}) 1937 if( ref($handlers->{$path}) && isa( $handlers->{$path}, 'CODE')); 1938 } 1939 return $previous_roots; 1940 } 1941 1942sub _check_illegal_twig_roots_handlers 1943 { my( $handlers)= @_; 1944 foreach my $tag_handlers (values %{$handlers->{xpath_handler}}) 1945 { foreach my $handler_data (@$tag_handlers) 1946 { if( my $type= $handler_data->{test_on_text}) 1947 { croak "string() condition not supported on twig_roots option"; } 1948 } 1949 } 1950 return; 1951 } 1952 1953 1954# just store the reference to the expat object in the twig 1955sub _twig_init 1956 { # warn " in _twig_init...\n"; # DEBUG handler 1957 1958 my $p= shift; 1959 my $t=$p->{twig}; 1960 1961 if( $t->{twig_parsing} ) { croak "cannot reuse a twig that is already parsing"; } 1962 $t->{twig_parsing}=1; 1963 1964 $t->{twig_parser}= $p; 1965 if( $weakrefs) { weaken( $t->{twig_parser}); } 1966 1967 # in case they had been created by a previous parse 1968 delete $t->{twig_dtd}; 1969 delete $t->{twig_doctype}; 1970 delete $t->{twig_xmldecl}; 1971 delete $t->{twig_root}; 1972 1973 # if needed set the output filehandle 1974 $t->_set_fh_to_twig_output_fh(); 1975 return; 1976 } 1977 1978# uses eval to catch the parser's death 1979sub safe_parse 1980 { my $t= shift; 1981 eval { $t->parse( @_); } ; 1982 return $@ ? $t->_reset_twig_after_error : $t; 1983 } 1984 1985sub safe_parsefile 1986 { my $t= shift; 1987 eval { $t->parsefile( @_); } ; 1988 return $@ ? $t->_reset_twig_after_error : $t; 1989 } 1990 1991# restore a twig in a proper state so it can be reused for a new parse 1992sub _reset_twig 1993 { my $t= shift; 1994 $t->{twig_parsing}= 0; 1995 delete $t->{twig_current}; 1996 delete $t->{extra_data}; 1997 delete $t->{twig_dtd}; 1998 delete $t->{twig_in_pcdata}; 1999 delete $t->{twig_in_cdata}; 2000 delete $t->{twig_stored_space}; 2001 delete $t->{twig_entity_list}; 2002 $t->root->delete if( $t->root); 2003 delete $t->{twig_root}; 2004 return $t; 2005 } 2006 2007sub _reset_twig_after_error 2008 { my $t= shift; 2009 $t->_reset_twig; 2010 return undef; 2011 } 2012 2013 2014sub _add_or_discard_stored_spaces 2015 { my $t= shift; 2016 2017 $t->{twig_right_after_root}=0; #XX 2018 2019 my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear 2020 return unless length $t->{twig_stored_spaces}; 2021 my $current_gi= $XML::Twig::index2gi[$current->{'gi'}]; 2022 2023 if( ! $t->{twig_discard_all_spaces}) 2024 { if( ! defined( $t->{twig_space_policy}->{$current_gi})) 2025 { $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); } 2026 if( $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $t->{twig_preserve_space}) 2027 { _insert_pcdata( $t, $t->{twig_stored_spaces} ); } 2028 } 2029 2030 $t->{twig_stored_spaces}=''; 2031 2032 return; 2033 } 2034 2035# the default twig handlers, which build the tree 2036sub _twig_start 2037 { # warn " in _twig_start...\n"; # DEBUG handler 2038 2039 #foreach my $s (@_) { next if ref $s; warn "$s: ", is_utf8( $s) ? "has flag" : "FLAG NOT SET"; } # YYY 2040 2041 my ($p, $gi, @att)= @_; 2042 my $t=$p->{twig}; 2043 2044 # empty the stored pcdata (space stored in case they are really part of 2045 # a pcdata element) or stored it if the space policy dictates so 2046 # create a pcdata element with the spaces if need be 2047 _add_or_discard_stored_spaces( $t); 2048 my $parent= $t->{twig_current}; 2049 2050 # if we were parsing PCDATA then we exit the pcdata 2051 if( $t->{twig_in_pcdata}) 2052 { $t->{twig_in_pcdata}= 0; 2053 delete $parent->{'twig_current'}; 2054 $parent= $parent->{parent}; 2055 } 2056 2057 # if we choose to keep the encoding then we need to parse the tag 2058 if( my $func = $t->{parse_start_tag}) 2059 { ($gi, @att)= &$func($p->original_string); } 2060 elsif( $t->{twig_entities_in_attribute}) 2061 { 2062 ($gi,@att)= _parse_start_tag( $p->recognized_string); 2063 $t->{twig_entities_in_attribute}=0; 2064 } 2065 2066 # if we are using an external DTD, we need to fill the default attributes 2067 if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); } 2068 2069 # filter the input data if need be 2070 if( my $filter= $t->{twig_input_filter}) 2071 { $gi= $filter->( $gi); 2072 foreach my $att (@att) { $att= $filter->($att); } 2073 } 2074 2075 my $ns_decl; 2076 if( $t->{twig_map_xmlns}) 2077 { $ns_decl= _replace_ns( $t, \$gi, \@att); } 2078 2079 my $elt= $t->{twig_elt_class}->new( $gi); 2080 $elt->set_atts( @att); 2081 2082 # now we can store the tag and atts 2083 my $context= { $ST_TAG => $gi, $ST_ELT => $elt, @att}; 2084 $context->{$ST_NS}= $ns_decl if $ns_decl; 2085 if( $weakrefs) { weaken( $context->{$ST_ELT}); } 2086 push @{$t->{_twig_context_stack}}, $context; 2087 2088 delete $parent->{'twig_current'} if( $parent); 2089 $t->{twig_current}= $elt; 2090 $elt->{'twig_current'}=1; 2091 2092 if( $parent) 2093 { my $prev_sibling= $parent->{last_child}; 2094 if( $prev_sibling) 2095 { $prev_sibling->{next_sibling}= $elt; 2096 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 2097 } 2098 2099 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 2100 unless( $parent->{first_child}) { $parent->{first_child}= $elt; } 2101 delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2102 } 2103 else 2104 { # processing root 2105 $t->set_root( $elt); 2106 # call dtd handler if need be 2107 $t->{twig_dtd_handler}->($t, $t->{twig_dtd}) 2108 if( defined $t->{twig_dtd_handler}); 2109 2110 # set this so we can catch external entities 2111 # (the handler was modified during DTD processing) 2112 if( $t->{twig_default_print}) 2113 { $p->setHandlers( Default => \&_twig_print); } 2114 elsif( $t->{twig_roots}) 2115 { $p->setHandlers( Default => sub { return }); } 2116 else 2117 { $p->setHandlers( Default => \&_twig_default); } 2118 } 2119 2120 $elt->{empty}= $p->recognized_string=~ m{/\s*>$}s ? 1 : 0; 2121 2122 $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data}); 2123 $t->{extra_data}=''; 2124 2125 # if the element is ID-ed then store that info 2126 my $id= $elt->{'att'}->{$ID}; 2127 if( defined $id) 2128 { $t->{twig_id_list}->{$id}= $elt; 2129 if( $weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 2130 } 2131 2132 # call user handler if need be 2133 if( $t->{twig_starttag_handlers}) 2134 { # call all appropriate handlers 2135 my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi); 2136 2137 local $_= $elt; 2138 2139 foreach my $handler ( @handlers) 2140 { $handler->($t, $elt) || last; } 2141 # call _all_ handler if needed 2142 if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL}) 2143 { $all->($t, $elt); } 2144 } 2145 2146 # check if the tag is in the list of tags to be ignored 2147 if( $t->{twig_ignore_elts_handlers}) 2148 { my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi); 2149 # only the first handler counts, it contains the action (discard/print/string) 2150 if( @handlers) { my $action= shift @handlers; $t->ignore( $elt, $action); } 2151 } 2152 2153 if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}++; } 2154 2155 2156 return; 2157 } 2158 2159sub _replace_ns 2160 { my( $t, $gi, $atts)= @_; 2161 my $decls; 2162 foreach my $new_prefix ( $t->parser->new_ns_prefixes) 2163 { my $uri= $t->parser->expand_ns_prefix( $new_prefix); 2164 # replace the prefix if it is mapped 2165 $decls->{$new_prefix}= $uri; 2166 if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri})) 2167 { $new_prefix= $mapped_prefix; } 2168 # now put the namespace declaration back in the element 2169 if( $new_prefix eq '#default') 2170 { push @$atts, "xmlns" => $uri; } 2171 else 2172 { push @$atts, "xmlns:$new_prefix" => $uri; } 2173 } 2174 2175 if( $t->{twig_keep_original_prefix}) 2176 { # things become more complex: we need to find the original prefix 2177 # and store both prefixes 2178 my $ns_info= $t->_ns_info( $$gi); 2179 my $map_att; 2180 if( $ns_info->{mapped_prefix}) 2181 { $$gi= "$ns_info->{mapped_prefix}:$$gi"; 2182 $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix}; 2183 } 2184 my $att_name=1; 2185 foreach( @$atts) 2186 { if( $att_name) 2187 { 2188 my $ns_info= $t->_ns_info( $_); 2189 if( $ns_info->{mapped_prefix}) 2190 { $_= "$ns_info->{mapped_prefix}:$_"; 2191 $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix}; 2192 } 2193 $att_name=0; 2194 } 2195 else 2196 { $att_name=1; } 2197 } 2198 push @$atts, '#original_gi', $map_att if( $map_att); 2199 } 2200 else 2201 { $$gi= $t->_replace_prefix( $$gi); 2202 my $att_name=1; 2203 foreach( @$atts) 2204 { if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; } 2205 else { $att_name=1; } 2206 } 2207 } 2208 return $decls; 2209 } 2210 2211 2212# extract prefix, local_name, uri, mapped_prefix from a name 2213# will only work if called from a start or end tag handler 2214sub _ns_info 2215 { my( $t, $name)= @_; 2216 my $ns_info={}; 2217 my $p= $t->parser; 2218 $ns_info->{uri}= $p->namespace( $name); 2219 return $ns_info unless( $ns_info->{uri}); 2220 2221 $ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri}); 2222 $ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix}; 2223 2224 return $ns_info; 2225 } 2226 2227sub _a_proper_ns_prefix 2228 { my( $p, $uri)= @_; 2229 foreach my $prefix ($p->current_ns_prefixes) 2230 { if( $p->expand_ns_prefix( $prefix) eq $uri) 2231 { return $prefix; } 2232 } 2233 return; 2234 } 2235 2236# returns the uri bound to a prefix in the original document 2237# only works in a handler 2238# can be used to deal with xsi:type attributes 2239sub original_uri 2240 { my( $t, $prefix)= @_; 2241 my $ST_NS = '##ns' ; 2242 foreach my $ns (map { $_->{$ST_NS} if $_->{$ST_NS} } reverse @{$t->{_twig_context_stack}}) 2243 { return $ns->{$prefix} || next; } 2244 return; 2245 } 2246 2247 2248sub _fill_default_atts 2249 { my( $t, $gi, $atts)= @_; 2250 my $dtd= $t->{twig_dtd}; 2251 my $attlist= $dtd->{att}->{$gi}; 2252 my %value= @$atts; 2253 foreach my $att (keys %$attlist) 2254 { if( !exists( $value{$att}) 2255 && exists( $attlist->{$att}->{default}) 2256 && ( $attlist->{$att}->{default} ne '#IMPLIED') 2257 ) 2258 { # the quotes are included in the default, so we need to remove them 2259 my $default_value= substr( $attlist->{$att}->{default}, 1, -1); 2260 push @$atts, $att, $default_value; 2261 } 2262 } 2263 return; 2264 } 2265 2266 2267# the default function to parse a start tag (in keep_encoding mode) 2268# can be overridden with the parse_start_tag method 2269# only works for 1-byte character sets 2270sub _parse_start_tag 2271 { my $string= shift; 2272 my( $gi, @atts); 2273 2274 # get the gi (between < and the first space, / or > character) 2275 #if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s) 2276 if( $string=~ s{^<\s*($REG_TAG_NAME)\s*[\s>/]}{}s) 2277 { $gi= $1; } 2278 else 2279 { croak "error parsing tag '$string'"; } 2280 while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s) 2281 { push @atts, $1, $3; } 2282 return $gi, @atts; 2283 } 2284 2285sub set_root 2286 { my( $t, $elt)= @_; 2287 $t->{twig_root}= $elt; 2288 if( $elt) 2289 { $elt->{twig}= $t; 2290 if( $weakrefs) { weaken( $elt->{twig}); } 2291 } 2292 return $t; 2293 } 2294 2295sub _twig_end 2296 { # warn " in _twig_end...\n"; # DEBUG handler 2297 my ($p, $gi) = @_; 2298 2299 my $t=$p->{twig}; 2300 2301 if( $t->{twig_in_pcdata} && (my $text_handler= $t->{TwigHandlers}->{$TEXT}) ) 2302 { local $_= $t->{twig_current}; $text_handler->( $t, $_) if $_; 2303 } 2304 2305 if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); } 2306 2307 _add_or_discard_stored_spaces( $t); 2308 2309 # the new twig_current is the parent 2310 my $elt= $t->{twig_current}; 2311 delete $elt->{'twig_current'}; 2312 2313 # if we were parsing PCDATA then we exit the pcdata too 2314 if( $t->{twig_in_pcdata}) 2315 { 2316 $t->{twig_in_pcdata}= 0; 2317 $elt= $elt->{parent} if($elt->{parent}); 2318 delete $elt->{'twig_current'}; 2319 } 2320 2321 # parent is the new current element 2322 my $parent= $elt->{parent}; 2323 $t->{twig_current}= $parent; 2324 2325 if( $parent) 2326 { $parent->{'twig_current'}=1; 2327 # twig_to_be_normalized 2328 if( $parent->{twig_to_be_normalized}) { $parent->normalize; $parent->{twig_to_be_normalized}=0; } 2329 } 2330 2331 if( $t->{extra_data}) 2332 { $elt->_set_extra_data_before_end_tag( $t->{extra_data}); 2333 $t->{extra_data}=''; 2334 } 2335 2336 if( $t->{twig_handlers}) 2337 { # look for handlers 2338 my @handlers= _handler( $t, $t->{twig_handlers}, $gi); 2339 2340 if( $t->{twig_tdh}) 2341 { if( @handlers) { push @{$t->{twig_handlers_to_trigger}}, [ $elt, \@handlers ]; } 2342 if( my $all= $t->{twig_handlers}->{handlers}->{$ALL}) 2343 { push @{$t->{twig_handlers_to_trigger}}, [ $elt, [$all] ]; } 2344 } 2345 else 2346 { 2347 local $_= $elt; # so we can use $_ in the handlers 2348 2349 foreach my $handler ( @handlers) 2350 { $handler->($t, $elt) || last; } 2351 # call _all_ handler if needed 2352 my $all= $t->{twig_handlers}->{handlers}->{$ALL}; 2353 if( $all) 2354 { $all->($t, $elt); } 2355 if( @handlers || $all) { $t->{twig_right_after_root}=0; } 2356 } 2357 } 2358 2359 # if twig_roots is set for the element then set appropriate handler 2360 if( $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) ) 2361 { if( $t->{twig_default_print}) 2362 { # select the proper fh (and store the currently selected one) 2363 $t->_set_fh_to_twig_output_fh(); 2364 if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX 2365 if( $t->{twig_keep_encoding}) 2366 { $p->setHandlers( %twig_handlers_roots_print_original); } 2367 else 2368 { $p->setHandlers( %twig_handlers_roots_print); } 2369 } 2370 else 2371 { $p->setHandlers( %twig_handlers_roots); } 2372 } 2373 2374 if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}--; } 2375 2376 pop @{$t->{_twig_context_stack}}; 2377 return; 2378 } 2379 2380sub _trigger_tdh 2381 { my( $t)= @_; 2382 2383 if( @{$t->{twig_handlers_to_trigger}}) 2384 { my @handlers_to_trigger_now= sort { $a->[0]->cmp( $b->[0]) } @{$t->{twig_handlers_to_trigger}}; 2385 foreach my $elt_handlers (@handlers_to_trigger_now) 2386 { my( $handled_elt, $handlers_to_trigger)= @$elt_handlers; 2387 foreach my $handler ( @$handlers_to_trigger) 2388 { local $_= $handled_elt; $handler->($t, $handled_elt) || last; } 2389 } 2390 } 2391 return; 2392 } 2393 2394# return the list of handler that can be activated for an element 2395# (either of CODE ref's or 1's for twig_roots) 2396 2397sub _handler 2398 { my( $t, $handlers, $gi)= @_; 2399 2400 my @found_handlers=(); 2401 my $found_handler; 2402 2403 foreach my $handler ( map { @$_ } grep { $_ } $handlers->{xpath_handler}->{$gi}, $handlers->{xpath_handler}->{'*'}) 2404 { my $trigger= $handler->{trigger}; 2405 if( my $found_path= $trigger->( $t->{_twig_context_stack})) 2406 { my $found_handler= $handler->{handler}; 2407 push @found_handlers, $found_handler; 2408 } 2409 } 2410 2411 # if no handler found call default handler if defined 2412 if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT}) 2413 { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; } 2414 2415 if( @found_handlers and $t->{twig_do_not_chain_handlers}) 2416 { @found_handlers= ($found_handlers[0]); } 2417 2418 return @found_handlers; # empty if no handler found 2419 2420 } 2421 2422 2423sub _replace_prefix 2424 { my( $t, $name)= @_; 2425 my $p= $t->parser; 2426 my $uri= $p->namespace( $name); 2427 # try to get the namespace from default if none is found (for attributes) 2428 # this should probably be an option 2429 if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); } 2430 if( $uri) 2431 { if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri} || $DEFAULT_URI2NS{$uri}) 2432 { return "$mapped_prefix:$name"; } 2433 else 2434 { my $prefix= _a_proper_ns_prefix( $p, $uri); 2435 if( $prefix eq '#default') { $prefix=''; } 2436 return $prefix ? "$prefix:$name" : $name; 2437 } 2438 } 2439 else 2440 { return $name; } 2441 } 2442 2443 2444sub _twig_char 2445 { # warn " in _twig_char...\n"; # DEBUG handler 2446 2447 my ($p, $string)= @_; 2448 my $t=$p->{twig}; 2449 2450 if( $t->{twig_keep_encoding}) 2451 { if( !$t->{twig_in_cdata}) 2452 { $string= $p->original_string(); } 2453 else 2454 { 2455 use bytes; # > perl 5.5 2456 if( length( $string) < 1024) 2457 { $string= $p->original_string(); } 2458 else 2459 { #warn "dodgy case"; 2460 # TODO original_string does not hold the entire string, but $string is wrong 2461 # I believe due to a bug in XML::Parser 2462 # for now, we use the original string, even if it means that it's been converted to utf8 2463 } 2464 } 2465 } 2466 2467 if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); } 2468 if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); } 2469 2470 my $elt= $t->{twig_current}; 2471 2472 if( $t->{twig_in_cdata}) 2473 { # text is the continuation of a previously created cdata 2474 $elt->{cdata}.= $t->{twig_stored_spaces} . $string; 2475 } 2476 elsif( $t->{twig_in_pcdata}) 2477 { # text is the continuation of a previously created pcdata 2478 if( $t->{extra_data}) 2479 { $elt->_push_extra_data_in_pcdata( $t->{extra_data}, length( $elt->{pcdata})); 2480 $t->{extra_data}=''; 2481 } 2482 $elt->{pcdata}.= $string; 2483 } 2484 else 2485 { 2486 # text is just space, which might be discarded later 2487 if( $string=~/\A\s*\Z/s) 2488 { 2489 if( $t->{extra_data}) 2490 { # we got extra data (comment, pi), lets add the spaces to it 2491 $t->{extra_data} .= $string; 2492 } 2493 else 2494 { # no extra data, just store the spaces 2495 $t->{twig_stored_spaces}.= $string; 2496 } 2497 } 2498 else 2499 { my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string); 2500 delete $elt->{'twig_current'}; 2501 $new_elt->{'twig_current'}=1; 2502 $t->{twig_current}= $new_elt; 2503 $t->{twig_in_pcdata}=1; 2504 if( $t->{extra_data}) 2505 { $new_elt->_push_extra_data_in_pcdata( $t->{extra_data}, 0); 2506 $t->{extra_data}=''; 2507 } 2508 } 2509 } 2510 return; 2511 } 2512 2513sub _twig_cdatastart 2514 { # warn " in _twig_cdatastart...\n"; # DEBUG handler 2515 2516 my $p= shift; 2517 my $t=$p->{twig}; 2518 2519 $t->{twig_in_cdata}=1; 2520 my $cdata= $t->{twig_elt_class}->new( $CDATA); 2521 my $twig_current= $t->{twig_current}; 2522 2523 if( $t->{twig_in_pcdata}) 2524 { # create the node as a sibling of the PCDATA 2525 $cdata->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ; 2526 $twig_current->{next_sibling}= $cdata; 2527 my $parent= $twig_current->{parent}; 2528 $cdata->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ; 2529 delete $parent->{empty}; $parent->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2530 $t->{twig_in_pcdata}=0; 2531 } 2532 else 2533 { # we have to create a PCDATA element if we need to store spaces 2534 if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) 2535 { _insert_pcdata( $t, $t->{twig_stored_spaces}); } 2536 $t->{twig_stored_spaces}=''; 2537 2538 # create the node as a child of the current element 2539 $cdata->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ; 2540 if( my $prev_sibling= $twig_current->{last_child}) 2541 { $cdata->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ; 2542 $prev_sibling->{next_sibling}= $cdata; 2543 } 2544 else 2545 { $twig_current->{first_child}= $cdata; } 2546 delete $twig_current->{empty}; $twig_current->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; 2547 2548 } 2549 2550 delete $twig_current->{'twig_current'}; 2551 $t->{twig_current}= $cdata; 2552 $cdata->{'twig_current'}=1; 2553 if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' }; 2554 return; 2555 } 2556 2557sub _twig_cdataend 2558 { # warn " in _twig_cdataend...\n"; # DEBUG handler 2559 2560 my $p= shift; 2561 my $t=$p->{twig}; 2562 2563 $t->{twig_in_cdata}=0; 2564 2565 my $elt= $t->{twig_current}; 2566 delete $elt->{'twig_current'}; 2567 my $cdata= $elt->{cdata}; 2568 $elt->{cdata}= $cdata; 2569 2570 push @{$t->{_twig_context_stack}}, { $ST_TAG => $CDATA }; 2571 2572 if( $t->{twig_handlers}) 2573 { # look for handlers 2574 my @handlers= _handler( $t, $t->{twig_handlers}, $CDATA); 2575 local $_= $elt; # so we can use $_ in the handlers 2576 foreach my $handler ( @handlers) { $handler->($t, $elt) || last; } 2577 } 2578 2579 pop @{$t->{_twig_context_stack}}; 2580 2581 $elt= $elt->{parent}; 2582 $t->{twig_current}= $elt; 2583 $elt->{'twig_current'}=1; 2584 2585 $t->{twig_long_cdata}=0; 2586 return; 2587 } 2588 2589sub _pi_elt_handlers 2590 { my( $t, $pi)= @_; 2591 my $pi_handlers= $t->{twig_handlers}->{pi_handlers} || return; 2592 foreach my $handler ( $pi_handlers->{$pi->{target}}, $pi_handlers->{''}) 2593 { if( $handler) { local $_= $pi; $handler->( $t, $pi) || last; } } 2594 } 2595 2596sub _pi_text_handler 2597 { my( $t, $target, $data)= @_; 2598 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target}) 2599 { return $handler->( $t, $target, $data); } 2600 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{''}) 2601 { return $handler->( $t, $target, $data); } 2602 return defined( $data) && $data ne '' ? "<?$target $data?>" : "<?$target?>" ; 2603 } 2604 2605sub _comment_elt_handler 2606 { my( $t, $comment)= @_; 2607 if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT}) 2608 { local $_= $comment; $handler->($t, $comment); } 2609 } 2610 2611sub _comment_text_handler 2612 { my( $t, $comment)= @_; 2613 if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT}) 2614 { $comment= $handler->($t, $comment); 2615 if( !defined $comment || $comment eq '') { return ''; } 2616 } 2617 return "<!--$comment-->"; 2618 } 2619 2620 2621 2622sub _twig_comment 2623 { # warn " in _twig_comment...\n"; # DEBUG handler 2624 2625 my( $p, $comment_text)= @_; 2626 my $t=$p->{twig}; 2627 2628 if( $t->{twig_keep_encoding}) { $comment_text= substr( $p->original_string(), 4, -3); } 2629 2630 $t->_twig_pi_comment( $p, $COMMENT, $t->{twig_keep_comments}, $t->{twig_process_comments}, 2631 '_set_comment', '_comment_elt_handler', '_comment_text_handler', $comment_text 2632 ); 2633 return; 2634 } 2635 2636sub _twig_pi 2637 { # warn " in _twig_pi...\n"; # DEBUG handler 2638 2639 my( $p, $target, $data)= @_; 2640 my $t=$p->{twig}; 2641 2642 if( $t->{twig_keep_encoding}) 2643 { my $pi_text= substr( $p->original_string(), 2, -2); 2644 ($target, $data)= split( /\s+/, $pi_text, 2); 2645 } 2646 2647 $t->_twig_pi_comment( $p, $PI, $t->{twig_keep_pi}, $t->{twig_process_pi}, 2648 '_set_pi', '_pi_elt_handlers', '_pi_text_handler', $target, $data 2649 ); 2650 return; 2651 } 2652 2653sub _twig_pi_comment 2654 { my( $t, $p, $type, $keep, $process, $set, $elt_handler, $text_handler, @parser_args)= @_; 2655 2656 if( $t->{twig_input_filter}) 2657 { foreach my $arg (@parser_args) { $arg= $t->{twig_input_filter}->( $arg); } } 2658 2659 # if pi/comments are to be kept then we piggyback them to the current element 2660 if( $keep) 2661 { # first add spaces 2662 if( $t->{twig_stored_spaces}) 2663 { $t->{extra_data}.= $t->{twig_stored_spaces}; 2664 $t->{twig_stored_spaces}= ''; 2665 } 2666 2667 my $extra_data= $t->$text_handler( @parser_args); 2668 $t->{extra_data}.= $extra_data; 2669 2670 } 2671 elsif( $process) 2672 { 2673 my $twig_current= $t->{twig_current}; # defined unless we are outside of the root 2674 2675 my $elt= $t->{twig_elt_class}->new( $type); 2676 $elt->$set( @parser_args); 2677 if( $t->{extra_data}) 2678 { $elt->set_extra_data( $t->{extra_data}); 2679 $t->{extra_data}=''; 2680 } 2681 2682 unless( $t->root) 2683 { $t->_add_cpi_outside_of_root( leading_cpi => $elt); 2684 } 2685 elsif( $t->{twig_in_pcdata}) 2686 { # create the node as a sibling of the PCDATA 2687 $elt->paste_after( $twig_current); 2688 $t->{twig_in_pcdata}=0; 2689 } 2690 elsif( $twig_current) 2691 { # we have to create a PCDATA element if we need to store spaces 2692 if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) 2693 { _insert_pcdata( $t, $t->{twig_stored_spaces}); } 2694 $t->{twig_stored_spaces}=''; 2695 # create the node as a child of the current element 2696 $elt->paste_last_child( $twig_current); 2697 } 2698 else 2699 { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); } 2700 2701 if( $twig_current) 2702 { delete $twig_current->{'twig_current'}; 2703 my $parent= $elt->{parent}; 2704 $t->{twig_current}= $parent; 2705 $parent->{'twig_current'}=1; 2706 } 2707 2708 $t->$elt_handler( $elt); 2709 } 2710 2711 } 2712 2713 2714# add a comment or pi before the first element 2715sub _add_cpi_outside_of_root 2716 { my($t, $type, $elt)= @_; # $type is 'leading_cpi' or 'trailing_cpi' 2717 $t->{$type} ||= $t->{twig_elt_class}->new( '#CPI'); 2718 # create the node as a child of the current element 2719 $elt->paste_last_child( $t->{$type}); 2720 return $t; 2721 } 2722 2723sub _twig_final 2724 { # warn " in _twig_final...\n"; # DEBUG handler 2725 2726 my $p= shift; 2727 my $t= $p->isa( 'XML::Twig') ? $p : $p->{twig}; 2728 2729 # store trailing data 2730 if( $t->{extra_data}) { $t->{trailing_cpi_text} = $t->{extra_data}; $t->{extra_data}=''; } 2731 $t->{trailing_spaces}= $t->{twig_stored_spaces} || ''; 2732 my $s= $t->{twig_stored_spaces}; $s=~s{\n}{\\n}g; 2733 if( $t->{twig_stored_spaces}) { my $s= $t->{twig_stored_spaces}; } 2734 2735 # restore the selected filehandle if needed 2736 $t->_set_fh_to_selected_fh(); 2737 2738 $t->_trigger_tdh if( $t->{twig_tdh}); 2739 2740 select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy 2741 2742 if( exists $t->{twig_autoflush_data}) 2743 { my @args; 2744 push @args, $t->{twig_autoflush_data}->{fh} if( $t->{twig_autoflush_data}->{fh}); 2745 push @args, @{$t->{twig_autoflush_data}->{args}} if( $t->{twig_autoflush_data}->{args}); 2746 $t->flush( @args); 2747 delete $t->{twig_autoflush_data}; 2748 $t->root->delete if $t->root; 2749 } 2750 2751 # tries to clean-up (probably not very well at the moment) 2752 #undef $p->{twig}; 2753 undef $t->{twig_parser}; 2754 delete $t->{twig_parsing}; 2755 @{$t}{ qw( twig_parser twig_parsing _twig_context_stack twig_current) }=(); 2756 2757 return $t; 2758 } 2759 2760sub _insert_pcdata 2761 { my( $t, $string)= @_; 2762 # create a new PCDATA element 2763 my $parent= $t->{twig_current}; # always defined 2764 my $elt; 2765 if( exists $t->{twig_alt_elt_class}) 2766 { $elt= $t->{twig_elt_class}->new( $PCDATA); 2767 $elt->{pcdata}= $string; 2768 } 2769 else 2770 { $elt= bless( { gi => $XML::Twig::gi2index{$PCDATA}, pcdata => $string }, 'XML::Twig::Elt'); } 2771 2772 my $prev_sibling= $parent->{last_child}; 2773 if( $prev_sibling) 2774 { $prev_sibling->{next_sibling}= $elt; 2775 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 2776 } 2777 else 2778 { $parent->{first_child}= $elt; } 2779 2780 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 2781 delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2782 $t->{twig_stored_spaces}=''; 2783 return $elt; 2784 } 2785 2786sub _space_policy 2787 { my( $t, $gi)= @_; 2788 my $policy; 2789 $policy=0 if( $t->{twig_discard_spaces}); 2790 $policy=1 if( $t->{twig_keep_spaces}); 2791 $policy=1 if( $t->{twig_keep_spaces_in} 2792 && $t->{twig_keep_spaces_in}->{$gi}); 2793 $policy=0 if( $t->{twig_discard_spaces_in} 2794 && $t->{twig_discard_spaces_in}->{$gi}); 2795 return $policy; 2796 } 2797 2798 2799sub _twig_entity 2800 { # warn " in _twig_entity...\n"; # DEBUG handler 2801 my( $p, $name, $val, $sysid, $pubid, $ndata, $param)= @_; 2802 my $t=$p->{twig}; 2803 2804 #{ no warnings; my $base= $p->base; warn "_twig_entity called: expand: '$t->{twig_expand_external_ents}', base: '$base', name: '$name', val: '$val', sysid: '$sysid', pubid: '$pubid', ndata: '$ndata', param: '$param'\n";} 2805 2806 my $missing_entity=0; 2807 2808 if( $sysid) 2809 { if($ndata) 2810 { if( ! -f _based_filename( $sysid, $p->base)) { $missing_entity= 1; } 2811 } 2812 else 2813 { if( $t->{twig_expand_external_ents}) 2814 { $val= eval { _slurp_uri( $sysid, $p->base) }; 2815 if( ! defined $val) 2816 { if( $t->{twig_extern_ent_nofail}) 2817 { $missing_entity= 1; } 2818 else 2819 { _croak( "cannot load SYSTEM entity '$name' from '$sysid': $@", 3); } 2820 } 2821 } 2822 } 2823 } 2824 2825 my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata, $param); 2826 if( $missing_entity) { $t->{twig_missing_system_entities}->{$name}= $ent; } 2827 2828 my $entity_list= $t->entity_list; 2829 if( $entity_list) { $entity_list->add( $ent); } 2830 2831 if( $parser_version > 2.27) 2832 { # this is really ugly, but with some versions of XML::Parser the value 2833 # of the entity is not properly returned by the default handler 2834 my $ent_decl= $ent->text; 2835 if( $t->{twig_keep_encoding}) 2836 { if( defined $ent->{val} && ($ent_decl !~ /["']/)) 2837 { my $val= $ent->{val}; 2838 $ent_decl .= $val =~ /"/ ? qq{'$val' } : qq{"$val" }; 2839 } 2840 # for my solaris box (perl 5.6.1, XML::Parser 2.31, expat?) 2841 $t->{twig_doctype}->{internal}=~ s{<!ENTITY\s+$name\s+$}{substr( $ent_decl, 0, -1)}e; 2842 } 2843 $t->{twig_doctype}->{internal} .= $ent_decl 2844 unless( $t->{twig_doctype}->{internal}=~ m{<!ENTITY\s+$name\s+}); 2845 } 2846 2847 return; 2848 } 2849 2850sub _twig_notation 2851 { my( $p, $name, $base, $sysid, $pubid ) = @_; 2852 my $t = $p->{twig}; 2853 2854 my $notation = XML::Twig::Notation->new( $name, $base, $sysid, $pubid ); 2855 my $notation_list = $t->notation_list(); 2856 if( $notation_list ) { $notation_list->add( $notation ); } 2857 2858 # internal should get the recognized_string, but XML::Parser does not provide it 2859 # so we need to re-create it ( $notation->text) and stick it there. 2860 $t->{twig_doctype}->{internal} .= $notation->text; 2861 2862 return; 2863 } 2864 2865 2866sub _twig_extern_ent 2867 { # warn " in _twig_extern_ent...I (", $_[0]->original_string, ")\n"; # DEBUG handler 2868 my( $p, $base, $sysid, $pubid)= @_; 2869 my $t= $p->{twig}; 2870 if( $t->{twig_no_expand}) 2871 { my $ent_name= $t->{twig_keep_encoding} ? $p->original_string : $p->recognized_string; 2872 _twig_insert_ent( $t, $ent_name); 2873 return ''; 2874 } 2875 my $ent_content= eval { $t->{twig_ext_ent_handler}->( $p, $base, $sysid) }; 2876 if( ! defined $ent_content) 2877 { 2878 my $ent_name = $p->recognized_string; 2879 my $file = _based_filename( $sysid, $base); 2880 my $error_message= "cannot expand $ent_name - cannot load '$file'"; 2881 if( $t->{twig_extern_ent_nofail}) { return "<!-- $error_message -->"; } 2882 else { _croak( $error_message); } 2883 } 2884 return $ent_content; 2885 } 2886 2887# I use this so I can change the $Carp::CarpLevel (which determines how many call frames to skip when reporting an error) 2888sub _croak 2889 { my( $message, $level)= @_; 2890 $Carp::CarpLevel= $level || 0; 2891 croak $message; 2892 } 2893 2894sub _twig_xmldecl 2895 { # warn " in _twig_xmldecl...\n"; # DEBUG handler 2896 2897 my $p= shift; 2898 my $t=$p->{twig}; 2899 $t->{twig_xmldecl}||={}; # could have been set by set_output_encoding 2900 $t->{twig_xmldecl}->{version}= shift; 2901 $t->{twig_xmldecl}->{encoding}= shift; 2902 $t->{twig_xmldecl}->{standalone}= shift; 2903 return; 2904 } 2905 2906sub _twig_doctype 2907 { # warn " in _twig_doctype...\n"; # DEBUG handler 2908 my( $p, $name, $sysid, $pub, $internal)= @_; 2909 my $t=$p->{twig}; 2910 $t->{twig_doctype}||= {}; # create 2911 $t->{twig_doctype}->{name}= $name; # always there 2912 $t->{twig_doctype}->{sysid}= $sysid; # 2913 $t->{twig_doctype}->{pub}= $pub; # 2914 2915 # now let's try to cope with XML::Parser 2.28 and above 2916 if( $parser_version > 2.27) 2917 { @saved_default_handler= $p->setHandlers( Default => \&_twig_store_internal_dtd, 2918 Entity => \&_twig_entity, 2919 ); 2920 $p->setHandlers( DoctypeFin => \&_twig_stop_storing_internal_dtd); 2921 $t->{twig_doctype}->{internal}=''; 2922 } 2923 else 2924 # for XML::Parser before 2.28 2925 { $internal||=''; 2926 $internal=~ s{^\s*\[}{}; 2927 $internal=~ s{]\s*$}{}; 2928 $t->{twig_doctype}->{internal}=$internal; 2929 } 2930 2931 # now check if we want to get the DTD info 2932 if( $t->{twig_read_external_dtd} && $sysid) 2933 { # let's build a fake document with an internal DTD 2934 if( $t->{DTDBase}) 2935 { _use( 'File::Spec'); 2936 $sysid=File::Spec->catfile($t->{DTDBase}, $sysid); 2937 } 2938 my $dtd= _slurp_uri( $sysid); 2939 # if the DTD includes an XML declaration, it needs to be moved before the DOCTYPE bit 2940 if( $dtd=~ s{^(\s*<\?xml(\s+\w+\s*=\s*("[^"]*"|'[^']*'))*\s*\?>)}{}) 2941 { $dtd= "$1<!DOCTYPE $name [$dtd]><$name/>"; } 2942 else 2943 { $dtd= "<!DOCTYPE $name [$dtd]><$name/>"; } 2944 2945 $t->save_global_state(); # save the globals (they will be reset by the following new) 2946 my $t_dtd= XML::Twig->new( load_DTD => 1, ParseParamEnt => 1, error_context => $t->{ErrorContext} || 0); # create a temp twig 2947 $t_dtd->parse( $dtd); # parse it 2948 $t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info 2949 #$t->{twig_dtd_is_external}=1; 2950 $t->entity_list->_add_list( $t_dtd->entity_list) if( $t_dtd->entity_list); # grab the entity info 2951 $t->notation_list->_add_list( $t_dtd->notation_list) if( $t_dtd->notation_list); # grab the notation info 2952 $t->restore_global_state(); 2953 } 2954 return; 2955 } 2956 2957sub _twig_element 2958 { # warn " in _twig_element...\n"; # DEBUG handler 2959 2960 my( $p, $name, $model)= @_; 2961 my $t=$p->{twig}; 2962 $t->{twig_dtd}||= {}; # may create the dtd 2963 $t->{twig_dtd}->{model}||= {}; # may create the model hash 2964 $t->{twig_dtd}->{elt_list}||= []; # ordered list of elements 2965 push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt 2966 $t->{twig_dtd}->{model}->{$name}= $model; # store the model 2967 if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) ) 2968 { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; 2969 unless( $text) 2970 { # this version of XML::Parser does not return the text in the *_string method 2971 # we need to rebuild it 2972 $text= "<!ELEMENT $name $model>"; 2973 } 2974 $t->{twig_doctype}->{internal} .= $text; 2975 } 2976 return; 2977 } 2978 2979sub _twig_attlist 2980 { # warn " in _twig_attlist...\n"; # DEBUG handler 2981 2982 my( $p, $gi, $att, $type, $default, $fixed)= @_; 2983 #warn "in attlist: gi: '$gi', att: '$att', type: '$type', default: '$default', fixed: '$fixed'\n"; 2984 my $t=$p->{twig}; 2985 $t->{twig_dtd}||= {}; # create dtd if need be 2986 $t->{twig_dtd}->{$gi}||= {}; # create elt if need be 2987 #$t->{twig_dtd}->{$gi}->{att}||= {}; # create att if need be 2988 if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) ) 2989 { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; 2990 unless( $text) 2991 { # this version of XML::Parser does not return the text in the *_string method 2992 # we need to rebuild it 2993 my $att_decl="$att $type"; 2994 $att_decl .= " #FIXED" if( $fixed); 2995 $att_decl .= " $default" if( defined $default); 2996 # 2 cases: there is already an attlist on that element or not 2997 if( $t->{twig_dtd}->{att}->{$gi}) 2998 { # there is already an attlist, add to it 2999 $t->{twig_doctype}->{internal}=~ s{(<!ATTLIST\s*$gi )(.*?)\n?>} 3000 { "$1$2\n" . ' ' x length( $1) . "$att_decl\n>"}es; 3001 } 3002 else 3003 { # create the attlist 3004 $t->{twig_doctype}->{internal}.= "<!ATTLIST $gi $att_decl>" 3005 } 3006 } 3007 } 3008 $t->{twig_dtd}->{att}->{$gi}->{$att}= {} ; 3009 $t->{twig_dtd}->{att}->{$gi}->{$att}->{type}= $type; 3010 $t->{twig_dtd}->{att}->{$gi}->{$att}->{default}= $default if( defined $default); 3011 $t->{twig_dtd}->{att}->{$gi}->{$att}->{fixed}= $fixed; 3012 return; 3013 } 3014 3015sub _twig_default 3016 { # warn " in _twig_default...\n"; # DEBUG handler 3017 3018 my( $p, $string)= @_; 3019 3020 my $t= $p->{twig}; 3021 3022 # we need to process the data in 2 cases: entity, or spaces after the closing tag 3023 3024 # after the closing tag (no twig_current and root has been created) 3025 if( ! $t->{twig_current} && $t->{twig_root} && $string=~ m{^\s+$}m) { $t->{twig_stored_spaces} .= $string; } 3026 3027 # process only if we have an entity 3028 if( $string=~ m{^&([^;]*);$}) 3029 { # the entity has to be pure pcdata, or we have a problem 3030 if( ($p->original_string=~ m{^<}) && ($p->original_string=~ m{>$}) ) 3031 { # string is a tag, entity is in an attribute 3032 $t->{twig_entities_in_attribute}=1 if( $t->{twig_do_not_escape_amp_in_atts}); 3033 } 3034 else 3035 { my $ent; 3036 if( $t->{twig_keep_encoding}) 3037 { _twig_char( $p, $string); 3038 $ent= substr( $string, 1, -1); 3039 } 3040 else 3041 { $ent= _twig_insert_ent( $t, $string); 3042 } 3043 3044 return $ent; 3045 } 3046 } 3047 } 3048 3049sub _twig_insert_ent 3050 { 3051 my( $t, $string)=@_; 3052 3053 my $twig_current= $t->{twig_current}; 3054 3055 my $ent= $t->{twig_elt_class}->new( $ENT); 3056 $ent->{ent}= $string; 3057 3058 _add_or_discard_stored_spaces( $t); 3059 3060 if( $t->{twig_in_pcdata}) 3061 { # create the node as a sibling of the #PCDATA 3062 3063 $ent->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ; 3064 $twig_current->{next_sibling}= $ent; 3065 my $parent= $twig_current->{parent}; 3066 $ent->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ; 3067 delete $parent->{empty}; $parent->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 3068 # the twig_current is now the parent 3069 delete $twig_current->{'twig_current'}; 3070 $t->{twig_current}= $parent; 3071 # we left pcdata 3072 $t->{twig_in_pcdata}=0; 3073 } 3074 else 3075 { # create the node as a child of the current element 3076 $ent->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ; 3077 if( my $prev_sibling= $twig_current->{last_child}) 3078 { $ent->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ; 3079 $prev_sibling->{next_sibling}= $ent; 3080 } 3081 else 3082 { if( $twig_current) { $twig_current->{first_child}= $ent; } } 3083 if( $twig_current) { delete $twig_current->{empty}; $twig_current->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; } 3084 } 3085 3086 # meant to trigger entity handler, does not seem to be activated at this time 3087 #if( my $handler= $t->{twig_handlers}->{gi}->{$ENT}) 3088 # { local $_= $ent; $handler->( $t, $ent); } 3089 3090 return $ent; 3091 } 3092 3093sub parser 3094 { return $_[0]->{twig_parser}; } 3095 3096# returns the declaration text (or a default one) 3097sub xmldecl 3098 { my $t= shift; 3099 return '' unless( $t->{twig_xmldecl} || $t->{output_encoding}); 3100 my $decl_string; 3101 my $decl= $t->{twig_xmldecl}; 3102 if( $decl) 3103 { my $version= $decl->{version}; 3104 $decl_string= q{<?xml}; 3105 $decl_string .= qq{ version="$version"}; 3106 3107 # encoding can either have been set (in $decl->{output_encoding}) 3108 # or come from the document (in $decl->{encoding}) 3109 if( $t->{output_encoding}) 3110 { my $encoding= $t->{output_encoding}; 3111 $decl_string .= qq{ encoding="$encoding"}; 3112 } 3113 elsif( $decl->{encoding}) 3114 { my $encoding= $decl->{encoding}; 3115 $decl_string .= qq{ encoding="$encoding"}; 3116 } 3117 3118 if( defined( $decl->{standalone})) 3119 { $decl_string .= q{ standalone="}; 3120 $decl_string .= $decl->{standalone} ? "yes" : "no"; 3121 $decl_string .= q{"}; 3122 } 3123 3124 $decl_string .= "?>\n"; 3125 } 3126 else 3127 { my $encoding= $t->{output_encoding}; 3128 $decl_string= qq{<?xml version="1.0" encoding="$encoding"?>}; 3129 } 3130 3131 my $output_filter= XML::Twig::Elt::output_filter(); 3132 return $output_filter ? $output_filter->( $decl_string) : $decl_string; 3133 } 3134 3135sub set_doctype 3136 { my( $t, $name, $system, $public, $internal)= @_; 3137 $t->{twig_doctype}= {} unless defined $t->{twig_doctype}; 3138 my $doctype= $t->{twig_doctype}; 3139 $doctype->{name} = $name if( defined $name); 3140 $doctype->{sysid} = $system if( defined $system); 3141 $doctype->{pub} = $public if( defined $public); 3142 $doctype->{internal} = $internal if( defined $internal); 3143 } 3144 3145sub doctype_name 3146 { my $t= shift; 3147 my $doctype= $t->{twig_doctype} or return ''; 3148 return $doctype->{name} || ''; 3149 } 3150 3151sub system_id 3152 { my $t= shift; 3153 my $doctype= $t->{twig_doctype} or return ''; 3154 return $doctype->{sysid} || ''; 3155 } 3156 3157sub public_id 3158 { my $t= shift; 3159 my $doctype= $t->{twig_doctype} or return ''; 3160 return $doctype->{pub} || ''; 3161 } 3162 3163sub internal_subset 3164 { my $t= shift; 3165 my $doctype= $t->{twig_doctype} or return ''; 3166 return $doctype->{internal} || ''; 3167 } 3168 3169# return the dtd object 3170sub dtd 3171 { my $t= shift; 3172 return $t->{twig_dtd}; 3173 } 3174 3175# return an element model, or the list of element models 3176sub model 3177 { my $t= shift; 3178 my $elt= shift; 3179 return $t->dtd->{model}->{$elt} if( $elt); 3180 return (sort keys %{$t->dtd->{model}}); 3181 } 3182 3183 3184# return the entity_list object 3185sub entity_list 3186 { my $t= shift; 3187 return $t->{twig_entity_list}; 3188 } 3189 3190# return the list of entity names 3191sub entity_names 3192 { my $t= shift; 3193 return $t->entity_list->entity_names; 3194 } 3195 3196# return the entity object 3197sub entity 3198 { my $t= shift; 3199 my $entity_name= shift; 3200 return $t->entity_list->ent( $entity_name); 3201 } 3202 3203# return the notation_list object 3204sub notation_list 3205 { my $t= shift; 3206 return $t->{twig_notation_list}; 3207 } 3208 3209# return the list of notation names 3210sub notation_names 3211 { my $t= shift; 3212 return $t->notation_list->notation_names; 3213 } 3214 3215# return the notation object 3216sub notation 3217 { my $t= shift; 3218 my $notation_name= shift; 3219 return $t->notation_list->notation( $notation_name); 3220 } 3221 3222 3223 3224 3225sub print_prolog 3226 { my $t= shift; 3227 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : $t->{twig_output_fh} || select() || \*STDOUT; 3228 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3229 no strict 'refs'; 3230 print {$fh} $t->prolog( @_); 3231 } 3232 3233sub prolog 3234 { my $t= shift; 3235 if( $t->{no_prolog}){ return ''; } 3236 3237 return $t->{no_prolog} ? '' 3238 : defined $t->{no_dtd_output} ? $t->xmldecl 3239 : $t->xmldecl . $t->doctype( @_); 3240 } 3241 3242sub doctype 3243 { my $t= shift; 3244 my %args= _normalize_args( @_); 3245 my $update_dtd = $args{UpdateDTD} || ''; 3246 my $doctype_text=''; 3247 3248 my $doctype= $t->{twig_doctype}; 3249 3250 if( $doctype) 3251 { $doctype_text .= qq{<!DOCTYPE $doctype->{name}} if( $doctype->{name}); 3252 $doctype_text .= qq{ PUBLIC "$doctype->{pub}"} if( $doctype->{pub}); 3253 $doctype_text .= qq{ SYSTEM} if( $doctype->{sysid} && !$doctype->{pub}); 3254 $doctype_text .= qq{ "$doctype->{sysid}"} if( $doctype->{sysid}); 3255 } 3256 3257 if( $update_dtd) 3258 { if( $doctype) 3259 { my $internal=$doctype->{internal}; 3260 # awful hack, but at least it works a little better that what was there before 3261 if( $internal) 3262 { # remove entity and notation declarations (they will be re-generated from the updated entity list) 3263 $internal=~ s{<! \s* ENTITY \s+ $REG_TAG_NAME \s+ ( ("[^"]*"|'[^']*') \s* | SYSTEM [^>]*) >\s*}{}xg; 3264 $internal=~ s{<! \s* NOTATION .*? >\s*}{}sxg; 3265 $internal=~ s{^\n}{}; 3266 } 3267 $internal .= $t->entity_list->text ||'' if( $t->entity_list); 3268 $internal .= $t->notation_list->text ||'' if( $t->notation_list); 3269 if( $internal) { $doctype_text .= "[\n$internal]>\n"; } 3270 } 3271 elsif( !$t->{'twig_dtd'} && ( keys %{$t->entity_list} || keys %{$t->notation_list} ) ) 3272 { $doctype_text .= "<!DOCTYPE " . $t->root->gi . " [\n" . $t->entity_list->text . $t->notation_list->text . "\n]>";} 3273 else 3274 { $doctype_text= $t->{twig_dtd}; 3275 $doctype_text .= $t->dtd_text; 3276 } 3277 } 3278 elsif( $doctype) 3279 { if( my $internal= $doctype->{internal}) 3280 { # add opening and closing brackets if not already there 3281 # plus some spaces and newlines for a nice formating 3282 # I test it here because I can't remember which version of 3283 # XML::Parser need it or not, nor guess which one will in the 3284 # future, so this about the best I can do 3285 $internal=~ s{^\s*(\[\s*)?}{ [\n}; 3286 $internal=~ s{\s*(\]\s*(>\s*)?)?\s*$}{\n]>\n}; 3287 3288 # XML::Parser does not include the NOTATION declarations in the DTD 3289 # at least in the current version. So put them back 3290 #if( $t->notation_list && $internal !~ m{<!\s*NOTATION}) 3291 # { $internal=~ s{(\n]>\n)$}{ "\n" . $t->notation_list->text . $1}es; } 3292 3293 $doctype_text .= $internal; 3294 } 3295 } 3296 3297 if( $doctype_text) 3298 { 3299 # terrible hack, as I can't figure out in which case the darn prolog 3300 # should get an extra > (depends on XML::Parser and expat versions) 3301 $doctype_text=~ s/(>\s*)*$/>\n/; # if($doctype_text); 3302 3303 my $output_filter= XML::Twig::Elt::output_filter(); 3304 return $output_filter ? $output_filter->( $doctype_text) : $doctype_text; 3305 } 3306 else 3307 { return $doctype_text; } 3308 } 3309 3310sub _leading_cpi 3311 { my $t= shift; 3312 my $leading_cpi= $t->{leading_cpi} || return ''; 3313 return $leading_cpi->sprint( 1); 3314 } 3315 3316sub _trailing_cpi 3317 { my $t= shift; 3318 my $trailing_cpi= $t->{trailing_cpi} || return ''; 3319 return $trailing_cpi->sprint( 1); 3320 } 3321 3322sub _trailing_cpi_text 3323 { my $t= shift; 3324 return $t->{trailing_cpi_text} || ''; 3325 } 3326 3327sub print_to_file 3328 { my( $t, $filename)= (shift, shift); 3329 my $out_fh; 3330# open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!"); # < perl 5.8 3331 my $mode= $t->{twig_keep_encoding} && ! _use_perlio() ? '>' : '>:utf8'; # >= perl 5.8 3332 open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8 3333 $t->print( $out_fh, @_); 3334 close $out_fh; 3335 return $t; 3336 } 3337 3338# probably only works on *nix (at least the chmod bit) 3339# first print to a temporary file, then rename that file to the desired file name, then change permissions 3340# to the original file permissions (or to the current umask) 3341sub safe_print_to_file 3342 { my( $t, $filename)= (shift, shift); 3343 my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ; 3344 XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n"; 3345 my $tmpdir= dirname( $filename); 3346 my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir); 3347 $t->print_to_file( $tmpfilename, @_); 3348 rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!"); 3349 chmod $perm, $filename; 3350 return $t; 3351 } 3352 3353 3354sub print 3355 { my $t= shift; 3356 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3357 my %args= _normalize_args( @_); 3358 3359 my $old_select = defined $fh ? select $fh : undef; 3360 my $old_pretty = defined ($args{PrettyPrint}) ? $t->set_pretty_print( $args{PrettyPrint}) : undef; 3361 my $old_empty_tag = defined ($args{EmptyTags}) ? $t->set_empty_tag_style( $args{EmptyTags}) : undef; 3362 3363 #if( !$t->{encoding} || lc( $t->{encoding}) eq 'utf-8') { my $out= $fh || \*STDOUT; binmode $out, ':utf8'; } 3364 3365 if( $perl_version > 5.006 && ! $t->{twig_keep_encoding} && _use_perlio() ) { binmode( $fh || \*STDOUT, ":utf8" ); } 3366 3367 print $t->prolog( %args) . $t->_leading_cpi( %args); 3368 $t->{twig_root}->print; 3369 print $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3370 . $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3371 . ( ($t->{twig_keep_spaces}||'') && ($t->{trailing_spaces} || '')) 3372 ; 3373 3374 3375 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3376 $t->set_empty_tag_style( $old_empty_tag) if( defined $old_empty_tag); 3377 if( $fh) { select $old_select; } 3378 3379 return $t; 3380 } 3381 3382 3383sub flush 3384 { my $t= shift; 3385 3386 $t->_trigger_tdh if $t->{twig_tdh}; 3387 3388 return if( $t->{twig_completely_flushed}); 3389 3390 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3391 my $old_select= defined $fh ? select $fh : undef; 3392 my $up_to= ref $_[0] ? shift : undef; 3393 my %args= _normalize_args( @_); 3394 3395 my $old_pretty; 3396 if( defined $args{PrettyPrint}) 3397 { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); 3398 delete $args{PrettyPrint}; 3399 } 3400 3401 my $old_empty_tag_style; 3402 if( $args{EmptyTags}) 3403 { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); 3404 delete $args{EmptyTags}; 3405 } 3406 3407 3408 # the "real" last element processed, as _twig_end has closed it 3409 my $last_elt; 3410 my $flush_trailing_data=0; 3411 if( $up_to) 3412 { $last_elt= $up_to; } 3413 elsif( $t->{twig_current}) 3414 { $last_elt= $t->{twig_current}->{last_child}; } 3415 else 3416 { $last_elt= $t->{twig_root}; 3417 $flush_trailing_data=1; 3418 $t->{twig_completely_flushed}=1; 3419 } 3420 3421 # flush the DTD unless it has ready flushed (ie root has been flushed) 3422 my $elt= $t->{twig_root}; 3423 unless( $elt->{'flushed'}) 3424 { # store flush info so we can auto-flush later 3425 if( $t->{twig_autoflush}) 3426 { $t->{twig_autoflush_data}={}; 3427 $t->{twig_autoflush_data}->{fh} = $fh if( $fh); 3428 $t->{twig_autoflush_data}->{args} = \@_ if( @_); 3429 } 3430 $t->print_prolog( %args); 3431 print $t->_leading_cpi; 3432 } 3433 3434 while( $elt) 3435 { my $next_elt; 3436 if( $last_elt && $last_elt->in( $elt)) 3437 { 3438 unless( $elt->{'flushed'}) 3439 { # just output the front tag 3440 print $elt->start_tag(); 3441 $elt->{'flushed'}=1; 3442 } 3443 $next_elt= $elt->{first_child}; 3444 } 3445 else 3446 { # an element before the last one or the last one, 3447 $next_elt= $elt->{next_sibling}; 3448 $elt->_flush(); 3449 $elt->delete; 3450 last if( $last_elt && ($elt == $last_elt)); 3451 } 3452 $elt= $next_elt; 3453 } 3454 3455 if( $flush_trailing_data) 3456 { print $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3457 , $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3458 } 3459 3460 select $old_select if( defined $old_select); 3461 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3462 $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); 3463 3464 if( my $ids= $t->{twig_id_list}) 3465 { while( my ($id, $elt)= each %$ids) 3466 { if( ! defined $elt) 3467 { delete $t->{twig_id_list}->{$id} } 3468 } 3469 } 3470 3471 return $t; 3472 } 3473 3474 3475# flushes up to an element 3476# this method just reorders the arguments and calls flush 3477sub flush_up_to 3478 { my $t= shift; 3479 my $up_to= shift; 3480 if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')) 3481 { my $fh= shift; 3482 $t->flush( $fh, $up_to, @_); 3483 } 3484 else 3485 { $t->flush( $up_to, @_); } 3486 3487 return $t; 3488 } 3489 3490 3491# same as print except the entire document text is returned as a string 3492sub sprint 3493 { my $t= shift; 3494 my %args= _normalize_args( @_); 3495 3496 my $old_pretty; 3497 if( defined $args{PrettyPrint}) 3498 { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); 3499 delete $args{PrettyPrint}; 3500 } 3501 3502 my $old_empty_tag_style; 3503 if( defined $args{EmptyTags}) 3504 { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); 3505 delete $args{EmptyTags}; 3506 } 3507 3508 my $string= $t->prolog( %args) # xml declaration and doctype 3509 . $t->_leading_cpi( %args) # leading comments and pi's in 'process' mode 3510 . ( ($t->{twig_root} && $t->{twig_root}->sprint) || '') 3511 . $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3512 . $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3513 ; 3514 if( $t->{twig_keep_spaces} && $t->{trailing_spaces}) { $string .= $t->{trailing_spaces}; } 3515 3516 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3517 $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); 3518 3519 return $string; 3520 } 3521 3522 3523# this method discards useless elements in a tree 3524# it does the same thing as a flush except it does not print it 3525# the second argument is an element, the last purged element 3526# (this argument is usually set through the purge_up_to method) 3527sub purge 3528 { my $t= shift; 3529 my $up_to= shift; 3530 3531 $t->_trigger_tdh if $t->{twig_tdh}; 3532 3533 # the "real" last element processed, as _twig_end has closed it 3534 my $last_elt; 3535 if( $up_to) 3536 { $last_elt= $up_to; } 3537 elsif( $t->{twig_current}) 3538 { $last_elt= $t->{twig_current}->{last_child}; } 3539 else 3540 { $last_elt= $t->{twig_root}; } 3541 3542 my $elt= $t->{twig_root}; 3543 3544 while( $elt) 3545 { my $next_elt; 3546 if( $last_elt && $last_elt->in( $elt)) 3547 { $elt->{'flushed'}=1; 3548 $next_elt= $elt->{first_child}; 3549 } 3550 else 3551 { # an element before the last one or the last one, 3552 $next_elt= $elt->{next_sibling}; 3553 $elt->delete; 3554 last if( $last_elt && ($elt == $last_elt) ); 3555 } 3556 $elt= $next_elt; 3557 } 3558 3559 if( my $ids= $t->{twig_id_list}) 3560 { while( my ($id, $elt)= each %$ids) { if( ! defined $elt) { delete $t->{twig_id_list}->{$id} } } } 3561 3562 return $t; 3563 } 3564 3565# flushes up to an element. This method just calls purge 3566sub purge_up_to 3567 { my $t= shift; 3568 return $t->purge( @_); 3569 } 3570 3571sub root 3572 { return $_[0]->{twig_root}; } 3573 3574sub normalize 3575 { return $_[0]->root->normalize; } 3576 3577 3578# create accessor methods on attribute names 3579{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3580sub att_accessors 3581 { 3582 my $twig_or_class= shift; 3583 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3584 : 'XML::Twig::Elt' 3585 ; 3586 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3587 no strict 'refs'; 3588 foreach my $att (@_) 3589 { _croak( "attempt to redefine existing method $att using att_accessors") 3590 if( $elt_class->can( $att) && !$accessor{$att}); 3591 3592 if( !$accessor{$att}) 3593 { *{"$elt_class\::$att"}= 3594 sub 3595 :lvalue # > perl 5.5 3596 { my $elt= shift; 3597 if( @_) { $elt->{att}->{$att}= $_[0]; } 3598 $elt->{att}->{$att}; 3599 }; 3600 $accessor{$att}=1; 3601 } 3602 } 3603 return $twig_or_class; 3604 } 3605} 3606 3607{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3608sub elt_accessors 3609 { 3610 my $twig_or_class= shift; 3611 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3612 : 'XML::Twig::Elt' 3613 ; 3614 3615 # if arg is a hash ref, it's exp => name, otherwise it's a list of tags 3616 my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]} 3617 : map { $_ => $_ } @_; 3618 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3619 no strict 'refs'; 3620 while( my( $alias, $exp)= each %exp_to_alias ) 3621 { if( $elt_class->can( $alias) && !$accessor{$alias}) 3622 { _croak( "attempt to redefine existing method $alias using elt_accessors"); } 3623 3624 if( !$accessor{$alias}) 3625 { *{"$elt_class\::$alias"}= 3626 sub 3627 { my $elt= shift; 3628 return wantarray ? $elt->children( $exp) : $elt->first_child( $exp); 3629 }; 3630 $accessor{$alias}=1; 3631 } 3632 } 3633 return $twig_or_class; 3634 } 3635} 3636 3637{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3638sub field_accessors 3639 { 3640 my $twig_or_class= shift; 3641 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3642 : 'XML::Twig::Elt' 3643 ; 3644 my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]} 3645 : map { $_ => $_ } @_; 3646 3647 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3648 no strict 'refs'; 3649 while( my( $alias, $exp)= each %exp_to_alias ) 3650 { if( $elt_class->can( $alias) && !$accessor{$alias}) 3651 { _croak( "attempt to redefine existing method $exp using field_accessors"); } 3652 if( !$accessor{$alias}) 3653 { *{"$elt_class\::$alias"}= 3654 sub 3655 { my $elt= shift; 3656 $elt->field( $exp) 3657 }; 3658 $accessor{$alias}=1; 3659 } 3660 } 3661 return $twig_or_class; 3662 } 3663} 3664 3665sub first_elt 3666 { my( $t, $cond)= @_; 3667 my $root= $t->root || return undef; 3668 return $root if( $root->passes( $cond)); 3669 return $root->next_elt( $cond); 3670 } 3671 3672sub last_elt 3673 { my( $t, $cond)= @_; 3674 my $root= $t->root || return undef; 3675 return $root->last_descendant( $cond); 3676 } 3677 3678sub next_n_elt 3679 { my( $t, $offset, $cond)= @_; 3680 $offset -- if( $t->root->matches( $cond) ); 3681 return $t->root->next_n_elt( $offset, $cond); 3682 } 3683 3684sub get_xpath 3685 { my $twig= shift; 3686 if( isa( $_[0], 'ARRAY')) 3687 { my $elt_array= shift; 3688 return _unique_elts( map { $_->get_xpath( @_) } @$elt_array); 3689 } 3690 else 3691 { return $twig->root->get_xpath( @_); } 3692 } 3693 3694# get a list of elts and return a sorted list of unique elts 3695sub _unique_elts 3696 { my @sorted= sort { $a ->cmp( $b) } @_; 3697 my @unique; 3698 while( my $current= shift @sorted) 3699 { push @unique, $current unless( @unique && ($unique[-1] == $current)); } 3700 return @unique; 3701 } 3702 3703sub findvalue 3704 { my $twig= shift; 3705 if( isa( $_[0], 'ARRAY')) 3706 { my $elt_array= shift; 3707 return join( '', map { $_->findvalue( @_) } @$elt_array); 3708 } 3709 else 3710 { return $twig->root->findvalue( @_); } 3711 } 3712 3713sub findvalues 3714 { my $twig= shift; 3715 if( isa( $_[0], 'ARRAY')) 3716 { my $elt_array= shift; 3717 return map { $_->findvalues( @_) } @$elt_array; 3718 } 3719 else 3720 { return $twig->root->findvalues( @_); } 3721 } 3722 3723sub set_id_seed 3724 { my $t= shift; 3725 XML::Twig::Elt->set_id_seed( @_); 3726 return $t; 3727 } 3728 3729# return an array ref to an index, or undef 3730sub index 3731 { my( $twig, $name, $index)= @_; 3732 return defined( $index) ? $twig->{_twig_index}->{$name}->[$index] : $twig->{_twig_index}->{$name}; 3733 } 3734 3735# return a list with just the root 3736# if a condition is given then return an empty list unless the root matches 3737sub children 3738 { my( $t, $cond)= @_; 3739 my $root= $t->root; 3740 unless( $cond && !($root->passes( $cond)) ) 3741 { return ($root); } 3742 else 3743 { return (); } 3744 } 3745 3746sub _children { return ($_[0]->root); } 3747 3748# weird, but here for completude 3749# used to solve (non-sensical) /doc[1] XPath queries 3750sub child 3751 { my $t= shift; 3752 my $nb= shift; 3753 return ($t->children( @_))[$nb]; 3754 } 3755 3756sub descendants 3757 { my( $t, $cond)= @_; 3758 my $root= $t->root; 3759 if( $root->passes( $cond) ) 3760 { return ($root, $root->descendants( $cond)); } 3761 else 3762 { return ( $root->descendants( $cond)); } 3763 } 3764 3765sub simplify { my $t= shift; $t->root->simplify( @_); } 3766sub subs_text { my $t= shift; $t->root->subs_text( @_); } 3767sub trim { my $t= shift; $t->root->trim( @_); } 3768 3769 3770sub set_keep_encoding 3771 { my( $t, $keep)= @_; 3772 $t->{twig_keep_encoding}= $keep; 3773 $t->{NoExpand}= $keep; 3774 return XML::Twig::Elt::set_keep_encoding( $keep); 3775 } 3776 3777sub set_expand_external_entities 3778 { return XML::Twig::Elt::set_expand_external_entities( @_); } 3779 3780sub escape_gt 3781 { my $t= shift; $t->{twig_escape_gt}= 1; return XML::Twig::Elt::escape_gt( @_); } 3782 3783sub do_not_escape_gt 3784 { my $t= shift; $t->{twig_escape_gt}= 0; return XML::Twig::Elt::do_not_escape_gt( @_); } 3785 3786sub elt_id 3787 { return $_[0]->{twig_id_list}->{$_[1]}; } 3788 3789# change it in ALL twigs at the moment 3790sub change_gi 3791 { my( $twig, $old_gi, $new_gi)= @_; 3792 my $index; 3793 return unless($index= $XML::Twig::gi2index{$old_gi}); 3794 $XML::Twig::index2gi[$index]= $new_gi; 3795 delete $XML::Twig::gi2index{$old_gi}; 3796 $XML::Twig::gi2index{$new_gi}= $index; 3797 return $twig; 3798 } 3799 3800 3801# builds the DTD from the stored (possibly updated) data 3802sub dtd_text 3803 { my $t= shift; 3804 my $dtd= $t->{twig_dtd}; 3805 my $doctype= $t->{twig_doctype} or return ''; 3806 my $string= "<!DOCTYPE ".$doctype->{name}; 3807 3808 $string .= " [\n"; 3809 3810 foreach my $gi (@{$dtd->{elt_list}}) 3811 { $string.= "<!ELEMENT $gi ".$dtd->{model}->{$gi}.">\n" ; 3812 if( $dtd->{att}->{$gi}) 3813 { my $attlist= $dtd->{att}->{$gi}; 3814 $string.= "<!ATTLIST $gi\n"; 3815 foreach my $att ( sort keys %{$attlist}) 3816 { 3817 if( $attlist->{$att}->{fixed}) 3818 { $string.= " $att $attlist->{$att}->{type} #FIXED $attlist->{$att}->{default}"; } 3819 else 3820 { $string.= " $att $attlist->{$att}->{type} $attlist->{$att}->{default}"; } 3821 $string.= "\n"; 3822 } 3823 $string.= ">\n"; 3824 } 3825 } 3826 $string.= $t->entity_list->text if( $t->entity_list); 3827 $string.= "\n]>\n"; 3828 return $string; 3829 } 3830 3831# prints the DTD from the stored (possibly updated) data 3832sub dtd_print 3833 { my $t= shift; 3834 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3835 if( $fh) { print $fh $t->dtd_text; } 3836 else { print $t->dtd_text; } 3837 return $t; 3838 } 3839 3840# build the subs that call directly expat 3841BEGIN 3842 { my @expat_methods= qw( depth in_element within_element context 3843 current_line current_column current_byte 3844 recognized_string original_string 3845 xpcroak xpcarp 3846 base current_element element_index 3847 xml_escape 3848 position_in_context); 3849 foreach my $method (@expat_methods) 3850 { 3851 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3852 no strict 'refs'; 3853 *{$method}= sub { my $t= shift; 3854 _croak( "calling $method after parsing is finished") unless( $t->{twig_parsing}); 3855 return $t->{twig_parser}->$method(@_); 3856 }; 3857 } 3858 } 3859 3860sub path 3861 { my( $t, $gi)= @_; 3862 if( $t->{twig_map_xmlns}) 3863 { return "/" . join( "/", map { $t->_replace_prefix( $_)} ($t->{twig_parser}->context, $gi)); } 3864 else 3865 { return "/" . join( "/", ($t->{twig_parser}->context, $gi)); } 3866 } 3867 3868sub finish 3869 { my $t= shift; 3870 return $t->{twig_parser}->finish; 3871 } 3872 3873# just finish the parse by printing the rest of the document 3874sub finish_print 3875 { my( $t, $fh)= @_; 3876 my $old_fh; 3877 unless( defined $fh) 3878 { $t->_set_fh_to_twig_output_fh(); } 3879 elsif( defined $fh) 3880 { $old_fh= select $fh; 3881 $t->{twig_original_selected_fh}= $old_fh if( $old_fh); 3882 } 3883 3884 my $p=$t->{twig_parser}; 3885 if( $t->{twig_keep_encoding}) 3886 { $p->setHandlers( %twig_handlers_finish_print); } 3887 else 3888 { $p->setHandlers( %twig_handlers_finish_print_original); } 3889 return $t; 3890 } 3891 3892sub set_remove_cdata { return XML::Twig::Elt::set_remove_cdata( @_); } 3893 3894sub output_filter { return XML::Twig::Elt::output_filter( @_); } 3895sub set_output_filter { return XML::Twig::Elt::set_output_filter( @_); } 3896 3897sub output_text_filter { return XML::Twig::Elt::output_text_filter( @_); } 3898sub set_output_text_filter { return XML::Twig::Elt::set_output_text_filter( @_); } 3899 3900sub set_input_filter 3901 { my( $t, $input_filter)= @_; 3902 my $old_filter= $t->{twig_input_filter}; 3903 if( !$input_filter || isa( $input_filter, 'CODE') ) 3904 { $t->{twig_input_filter}= $input_filter; } 3905 elsif( $input_filter eq 'latin1') 3906 { $t->{twig_input_filter}= latin1(); } 3907 elsif( $filter{$input_filter}) 3908 { $t->{twig_input_filter}= $filter{$input_filter}; } 3909 else 3910 { _croak( "invalid input filter: $input_filter"); } 3911 3912 return $old_filter; 3913 } 3914 3915sub set_empty_tag_style 3916 { return XML::Twig::Elt::set_empty_tag_style( @_); } 3917 3918sub set_pretty_print 3919 { return XML::Twig::Elt::set_pretty_print( @_); } 3920 3921sub set_quote 3922 { return XML::Twig::Elt::set_quote( @_); } 3923 3924sub set_indent 3925 { return XML::Twig::Elt::set_indent( @_); } 3926 3927sub set_keep_atts_order 3928 { shift; return XML::Twig::Elt::set_keep_atts_order( @_); } 3929 3930sub keep_atts_order 3931 { return XML::Twig::Elt::keep_atts_order( @_); } 3932 3933sub set_do_not_escape_amp_in_atts 3934 { return XML::Twig::Elt::set_do_not_escape_amp_in_atts( @_); } 3935 3936# save and restore package globals (the ones in XML::Twig::Elt) 3937# should probably return the XML::Twig object itself, but instead 3938# returns the state (as a hashref) for backward compatibility 3939sub save_global_state 3940 { my $t= shift; 3941 return $t->{twig_saved_state}= XML::Twig::Elt::global_state(); 3942 } 3943 3944sub restore_global_state 3945 { my $t= shift; 3946 XML::Twig::Elt::set_global_state( $t->{twig_saved_state}); 3947 } 3948 3949sub global_state 3950 { return XML::Twig::Elt::global_state(); } 3951 3952sub set_global_state 3953 { return XML::Twig::Elt::set_global_state( $_[1]); } 3954 3955sub dispose 3956 { my $t= shift; 3957 $t->DESTROY; 3958 return; 3959 } 3960 3961sub DESTROY 3962 { my $t= shift; 3963 if( $t->{twig_root} && isa( $t->{twig_root}, 'XML::Twig::Elt')) 3964 { $t->{twig_root}->delete } 3965 3966 # added to break circular references 3967 undef $t->{twig}; 3968 undef $t->{twig_root}->{twig} if( $t->{twig_root}); 3969 undef $t->{twig_parser}; 3970 3971 undef %$t;# prevents memory leaks (especially when using mod_perl) 3972 undef $t; 3973 } 3974 3975# return true if perl was compiled using perlio 3976# if perl is not available return true, these days perlio should be used 3977sub _use_perlio 3978 { my $perl= _this_perl(); 3979 return $perl ? grep /useperlio=define/, `$perl -V` : 1; 3980 } 3981 3982# returns the parth to the perl executable (if available) 3983sub _this_perl 3984 { # straight from perlvar 3985 my $secure_perl_path= $Config{perlpath}; 3986 if ($^O ne 'VMS') 3987 { $secure_perl_path .= $Config{_exe} unless $secure_perl_path =~ m/$Config{_exe}$/i; } 3988 if( ! -f $secure_perl_path) { $secure_perl_path= ''; } # when perl is not available (PDK) 3989 return $secure_perl_path; 3990 } 3991 3992# 3993# non standard handlers 3994# 3995 3996# kludge: expat 1.95.2 calls both Default AND Doctype handlers 3997# so if the default handler finds '<!DOCTYPE' then it must 3998# unset itself (_twig_print_doctype will reset it) 3999sub _twig_print_check_doctype 4000 { # warn " in _twig_print_check_doctype...\n"; # DEBUG handler 4001 4002 my $p= shift; 4003 my $string= $p->recognized_string(); 4004 if( $string eq '<!DOCTYPE') 4005 { 4006 $p->setHandlers( Default => undef); 4007 $p->setHandlers( Entity => undef); 4008 $expat_1_95_2=1; 4009 } 4010 else 4011 { print $string; } 4012 4013 return; 4014 } 4015 4016 4017sub _twig_print 4018 { # warn " in _twig_print...\n"; # DEBUG handler 4019 my $p= shift; 4020 if( $expat_1_95_2 && ($p->recognized_string eq '[') && !$p->{twig}->{expat_1_95_2_seen_bracket}) 4021 { # otherwise the opening square bracket of the doctype gets printed twice 4022 $p->{twig}->{expat_1_95_2_seen_bracket}=1; 4023 } 4024 else 4025 { if( $p->{twig}->{twig_right_after_root}) 4026 { my $s= $p->recognized_string(); print $s if $s=~ m{\S}; } 4027 else 4028 { print $p->recognized_string(); } 4029 } 4030 return; 4031 } 4032# recognized_string does not seem to work for entities, go figure! 4033# so this handler is used to print them anyway 4034sub _twig_print_entity 4035 { # warn " in _twig_print_entity...\n"; # DEBUG handler 4036 my $p= shift; 4037 XML::Twig::Entity->new( @_)->print; 4038 } 4039 4040# kludge: expat 1.95.2 calls both Default AND Doctype handlers 4041# so if the default handler finds '<!DOCTYPE' then it must 4042# unset itself (_twig_print_doctype will reset it) 4043sub _twig_print_original_check_doctype 4044 { # warn " in _twig_print_original_check_doctype...\n"; # DEBUG handler 4045 4046 my $p= shift; 4047 my $string= $p->original_string(); 4048 if( $string eq '<!DOCTYPE') 4049 { $p->setHandlers( Default => undef); 4050 $p->setHandlers( Entity => undef); 4051 $expat_1_95_2=1; 4052 } 4053 else 4054 { print $string; } 4055 4056 return; 4057 } 4058 4059sub _twig_print_original 4060 { # warn " in _twig_print_original...\n"; # DEBUG handler 4061 my $p= shift; 4062 print $p->original_string(); 4063 return; 4064 } 4065 4066 4067sub _twig_print_original_doctype 4068 { # warn " in _twig_print_original_doctype...\n"; # DEBUG handler 4069 4070 my( $p, $name, $sysid, $pubid, $internal)= @_; 4071 if( $name) 4072 { # with recent versions of XML::Parser original_string does not work, 4073 # hence we need to rebuild the doctype declaration 4074 my $doctype=''; 4075 $doctype .= qq{<!DOCTYPE $name} if( $name); 4076 $doctype .= qq{ PUBLIC "$pubid"} if( $pubid); 4077 $doctype .= qq{ SYSTEM} if( $sysid && !$pubid); 4078 $doctype .= qq{ "$sysid"} if( $sysid); 4079 $doctype .= ' [' if( $internal && !$expat_1_95_2) ; 4080 $doctype .= qq{>} unless( $internal || $expat_1_95_2); 4081 $p->{twig}->{twig_doctype}->{has_internal}=$internal; 4082 print $doctype; 4083 } 4084 $p->setHandlers( Default => \&_twig_print_original); 4085 return; 4086 } 4087 4088sub _twig_print_doctype 4089 { # warn " in _twig_print_doctype...\n"; # DEBUG handler 4090 my( $p, $name, $sysid, $pubid, $internal)= @_; 4091 if( $name) 4092 { # with recent versions of XML::Parser original_string does not work, 4093 # hence we need to rebuild the doctype declaration 4094 my $doctype=''; 4095 $doctype .= qq{<!DOCTYPE $name} if( $name); 4096 $doctype .= qq{ PUBLIC "$pubid"} if( $pubid); 4097 $doctype .= qq{ SYSTEM} if( $sysid && !$pubid); 4098 $doctype .= qq{ "$sysid"} if( $sysid); 4099 $doctype .= ' [' if( $internal) ; 4100 $doctype .= qq{>} unless( $internal || $expat_1_95_2); 4101 $p->{twig}->{twig_doctype}->{has_internal}=$internal; 4102 print $doctype; 4103 } 4104 $p->setHandlers( Default => \&_twig_print); 4105 return; 4106 } 4107 4108 4109sub _twig_print_original_default 4110 { # warn " in _twig_print_original_default...\n"; # DEBUG handler 4111 my $p= shift; 4112 print $p->original_string(); 4113 return; 4114 } 4115 4116# account for the case where the element is empty 4117sub _twig_print_end_original 4118 { # warn " in _twig_print_end_original...\n"; # DEBUG handler 4119 my $p= shift; 4120 print $p->original_string(); 4121 return; 4122 } 4123 4124sub _twig_start_check_roots 4125 { # warn " in _twig_start_check_roots...\n"; # DEBUG handler 4126 my $p= shift; 4127 my $gi= shift; 4128 4129 my $t= $p->{twig}; 4130 4131 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4132 4133 my $ns_decl; 4134 unless( $p->depth == 0) 4135 { if( $t->{twig_map_xmlns}) { $ns_decl= _replace_ns( $t, \$gi, \@_); } 4136 } 4137 4138 my $context= { $ST_TAG => $gi, @_}; 4139 $context->{$ST_NS}= $ns_decl if $ns_decl; 4140 push @{$t->{_twig_context_stack}}, $context; 4141 my %att= @_; 4142 4143 if( _handler( $t, $t->{twig_roots}, $gi)) 4144 { $p->setHandlers( %twig_handlers); # restore regular handlers 4145 $t->{twig_root_depth}= $p->depth; 4146 pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start 4147 _twig_start( $p, $gi, @_); 4148 return; 4149 } 4150 4151 # $tag will always be true if it needs to be printed (the tag string is never empty) 4152 my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4153 : $p->recognized_string 4154 : ''; 4155 4156 if( $p->depth == 0) 4157 { 4158 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4159 no strict 'refs'; 4160 print {$fh} $tag if( $tag); 4161 pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start 4162 _twig_start( $p, $gi, @_); 4163 $t->root->{'flushed'}=1; # or the root start tag gets output the first time we flush 4164 } 4165 elsif( $t->{twig_starttag_handlers}) 4166 { # look for start tag handlers 4167 4168 my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi); 4169 my $last_handler_res; 4170 foreach my $handler ( @handlers) 4171 { $last_handler_res= $handler->($t, $gi, %att); 4172 last unless $last_handler_res; 4173 } 4174 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4175 no strict 'refs'; 4176 print {$fh} $tag if( $tag && (!@handlers || $last_handler_res)); 4177 } 4178 else 4179 { 4180 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4181 no strict 'refs'; 4182 print {$fh} $tag if( $tag); 4183 } 4184 return; 4185 } 4186 4187sub _twig_end_check_roots 4188 { # warn " in _twig_end_check_roots...\n"; # DEBUG handler 4189 4190 my( $p, $gi, %att)= @_; 4191 my $t= $p->{twig}; 4192 # $tag can be empty (<elt/>), hence the undef and the tests for defined 4193 my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4194 : $p->recognized_string 4195 : undef; 4196 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4197 4198 if( $t->{twig_endtag_handlers}) 4199 { # look for end tag handlers 4200 my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi); 4201 my $last_handler_res=1; 4202 foreach my $handler ( @handlers) 4203 { $last_handler_res= $handler->($t, $gi) || last; } 4204 #if( ! $last_handler_res) 4205 # { pop @{$t->{_twig_context_stack}}; warn "tested"; 4206 # return; 4207 # } 4208 } 4209 { 4210 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4211 no strict 'refs'; 4212 print {$fh} $tag if( defined $tag); 4213 } 4214 if( $p->depth == 0) 4215 { 4216 _twig_end( $p, $gi); 4217 $t->root->{end_tag_flushed}=1; 4218 } 4219 4220 pop @{$t->{_twig_context_stack}}; 4221 return; 4222 } 4223 4224sub _twig_pi_check_roots 4225 { # warn " in _twig_pi_check_roots...\n"; # DEBUG handler 4226 my( $p, $target, $data)= @_; 4227 my $t= $p->{twig}; 4228 my $pi= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4229 : $p->recognized_string 4230 : undef; 4231 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4232 4233 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target} 4234 || $t->{twig_handlers}->{pi_handlers}->{''} 4235 ) 4236 { # if handler is called on pi, then it needs to be processed as a regular node 4237 my @flags= qw( twig_process_pi twig_keep_pi); 4238 my @save= @{$t}{@flags}; # save pi related flags 4239 @{$t}{@flags}= (1, 0); # override them, pi needs to be processed 4240 _twig_pi( @_); # call handler on the pi 4241 @{$t}{@flags}= @save;; # restore flag 4242 } 4243 else 4244 { 4245 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4246 no strict 'refs'; 4247 print {$fh} $pi if( defined( $pi)); 4248 } 4249 return; 4250 } 4251 4252 4253sub _output_ignored 4254 { my( $t, $p)= @_; 4255 my $action= $t->{twig_ignore_action}; 4256 4257 my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string'; 4258 4259 if( $action eq 'print' ) { print $p->$get_string; } 4260 else 4261 { my $string_ref; 4262 if( $action eq 'string') 4263 { $string_ref= \$t->{twig_buffered_string}; } 4264 elsif( ref( $action) && ref( $action) eq 'SCALAR') 4265 { $string_ref= $action; } 4266 else 4267 { _croak( "wrong ignore action: $action"); } 4268 4269 $$string_ref .= $p->$get_string; 4270 } 4271 } 4272 4273 4274 4275sub _twig_ignore_start 4276 { # warn " in _twig_ignore_start...\n"; # DEBUG handler 4277 4278 my( $p, $gi)= @_; 4279 my $t= $p->{twig}; 4280 $t->{twig_ignore_level}++; 4281 my $action= $t->{twig_ignore_action}; 4282 4283 $t->_output_ignored( $p) unless $action eq 'discard'; 4284 return; 4285 } 4286 4287sub _twig_ignore_end 4288 { # warn " in _twig_ignore_end...\n"; # DEBUG handler 4289 4290 my( $p, $gi)= @_; 4291 my $t= $p->{twig}; 4292 4293 my $action= $t->{twig_ignore_action}; 4294 $t->_output_ignored( $p) unless $action eq 'discard'; 4295 4296 $t->{twig_ignore_level}--; 4297 4298 if( ! $t->{twig_ignore_level}) 4299 { 4300 $t->{twig_current} = $t->{twig_ignore_elt}; 4301 $t->{twig_current}->{'twig_current'}=1; 4302 4303 $t->{twig_ignore_elt}->cut; # there could possibly be a memory leak here (delete would avoid it, 4304 # but could also delete elements that should not be deleted) 4305 4306 # restore the saved stack to the current level 4307 splice( @{$t->{_twig_context_stack}}, $p->depth+ 1 ); 4308 #warn "stack: ", _dump_stack( $t->{_twig_context_stack}), "\n"; 4309 4310 $p->setHandlers( @{$t->{twig_saved_handlers}}); 4311 # test for handlers 4312 if( $t->{twig_endtag_handlers}) 4313 { # look for end tag handlers 4314 my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi); 4315 my $last_handler_res=1; 4316 foreach my $handler ( @handlers) 4317 { $last_handler_res= $handler->($t, $gi) || last; } 4318 } 4319 pop @{$t->{_twig_context_stack}}; 4320 }; 4321 return; 4322 } 4323 4324#sub _dump_stack { my( $stack)= @_; return join( ":", map { $_->{$ST_TAG} } @$stack); } 4325 4326sub ignore 4327 { my( $t, $elt, $action)= @_; 4328 my $current= $t->{twig_current}; 4329 4330 if( ! ($elt && ref( $elt) && isa( $elt, 'XML::Twig::Elt'))) { $elt= $current; } 4331 4332 #warn "ignore: current = ", $current->tag, ", elt = ", $elt->tag, ")\n"; 4333 4334 # we need the ($elt == $current->{last_child}) test because the current element is set to the 4335 # parent _before_ handlers are called (and I can't figure out how to fix this) 4336 unless( ($elt == $current) || ($current->{last_child} && ($elt == $current->{last_child})) || $current->in( $elt)) 4337 { _croak( "element to be ignored must be ancestor of current element"); } 4338 4339 $t->{twig_ignore_level}= $current == $elt ? 1 : $t->_level_in_stack( $current) - $t->_level_in_stack($elt) + 1; 4340 #warn "twig_ignore_level: $t->{twig_ignore_level} (current: ", $current->tag, ", elt: ", $elt->tag, ")\n"; 4341 $t->{twig_ignore_elt} = $elt; # save it, so we can delete it later 4342 4343 $action ||= 'discard'; 4344 if( !($action eq 'print' || $action eq 'string' || ( ref( $action) && ref( $action) eq 'SCALAR'))) 4345 { $action= 'discard'; } 4346 4347 $t->{twig_ignore_action}= $action; 4348 4349 my $p= $t->{twig_parser}; 4350 my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers 4351 4352 my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string'; 4353 4354 my $default_handler; 4355 4356 if( $action ne 'discard') 4357 { if( $action eq 'print') 4358 { $p->setHandlers( Default => sub { print $_[0]->$get_string; }); } 4359 else 4360 { my $string_ref; 4361 if( $action eq 'string') 4362 { if( ! exists $t->{twig_buffered_string}) { $t->{twig_buffered_string}=''; } 4363 $string_ref= \$t->{twig_buffered_string}; 4364 } 4365 elsif( ref( $action) && ref( $action) eq 'SCALAR') 4366 { $string_ref= $action; } 4367 4368 $p->setHandlers( Default => sub { $$string_ref .= $_[0]->$get_string; }); 4369 } 4370 $t->_output_ignored( $p, $action); 4371 } 4372 4373 4374 $t->{twig_saved_handlers}= \@saved_handlers; # save current handlers 4375 } 4376 4377sub _level_in_stack 4378 { my( $t, $elt)= @_; 4379 my $level=1; 4380 foreach my $elt_in_stack ( @{$t->{_twig_context_stack}} ) 4381 { if( $elt_in_stack->{$ST_ELT} && ($elt == $elt_in_stack->{$ST_ELT})) { return $level } 4382 $level++; 4383 } 4384 } 4385 4386 4387 4388# select $t->{twig_output_fh} and store the current selected fh 4389sub _set_fh_to_twig_output_fh 4390 { my $t= shift; 4391 my $output_fh= $t->{twig_output_fh}; 4392 if( $output_fh && !$t->{twig_output_fh_selected}) 4393 { # there is an output fh 4394 $t->{twig_selected_fh}= select(); # store the currently selected fh 4395 $t->{twig_output_fh_selected}=1; 4396 select $output_fh; # select the output fh for the twig 4397 } 4398 } 4399 4400# select the fh that was stored in $t->{twig_selected_fh} 4401# (before $t->{twig_output_fh} was selected) 4402sub _set_fh_to_selected_fh 4403 { my $t= shift; 4404 return unless( $t->{twig_output_fh}); 4405 my $selected_fh= $t->{twig_selected_fh}; 4406 $t->{twig_output_fh_selected}=0; 4407 select $selected_fh; 4408 return; 4409 } 4410 4411 4412sub encoding 4413 { return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); } 4414 4415sub set_encoding 4416 { my( $t, $encoding)= @_; 4417 $t->{twig_xmldecl} ||={}; 4418 $t->set_xml_version( "1.0") unless( $t->xml_version); 4419 $t->{twig_xmldecl}->{encoding}= $encoding; 4420 return $t; 4421 } 4422 4423sub output_encoding 4424 { return $_[0]->{output_encoding}; } 4425 4426sub set_output_encoding 4427 { my( $t, $encoding)= @_; 4428 my $output_filter= $t->output_filter || ''; 4429 4430 if( ($encoding && $encoding !~ m{^utf-?8$}i) || $t->{twig_keep_encoding} || $output_filter) 4431 { $t->set_output_filter( _encoding_filter( $encoding || '')); } 4432 4433 $t->{output_encoding}= $encoding; 4434 return $t; 4435 } 4436 4437sub xml_version 4438 { return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); } 4439 4440sub set_xml_version 4441 { my( $t, $version)= @_; 4442 $t->{twig_xmldecl} ||={}; 4443 $t->{twig_xmldecl}->{version}= $version; 4444 return $t; 4445 } 4446 4447sub standalone 4448 { return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); } 4449 4450sub set_standalone 4451 { my( $t, $standalone)= @_; 4452 $t->{twig_xmldecl} ||={}; 4453 $t->set_xml_version( "1.0") unless( $t->xml_version); 4454 $t->{twig_xmldecl}->{standalone}= $standalone; 4455 return $t; 4456 } 4457 4458 4459# SAX methods 4460 4461sub toSAX1 4462 { _croak( "cannot use toSAX1 while parsing (use flush_toSAX1)") if (defined $_[0]->{twig_parser}); 4463 shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1, 4464 \&XML::Twig::Elt::_end_tag_data_SAX1 4465 ); 4466 } 4467 4468sub toSAX2 4469 { _croak( "cannot use toSAX2 while parsing (use flush_toSAX2)") if (defined $_[0]->{twig_parser}); 4470 shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2, 4471 \&XML::Twig::Elt::_end_tag_data_SAX2 4472 ); 4473 } 4474 4475 4476sub _toSAX 4477 { my( $t, $handler, $start_tag_data, $end_tag_data) = @_; 4478 4479 if( my $start_document = $handler->can( 'start_document')) 4480 { $start_document->( $handler); } 4481 4482 $t->_prolog_toSAX( $handler); 4483 4484 if( $t->root) { $t->root->_toSAX( $handler, $start_tag_data, $end_tag_data) ; } 4485 if( my $end_document = $handler->can( 'end_document')) 4486 { $end_document->( $handler); } 4487 } 4488 4489 4490sub flush_toSAX1 4491 { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1, 4492 \&XML::Twig::Elt::_end_tag_data_SAX1 4493 ); 4494 } 4495 4496sub flush_toSAX2 4497 { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2, 4498 \&XML::Twig::Elt::_end_tag_data_SAX2 4499 ); 4500 } 4501 4502sub _flush_toSAX 4503 { my( $t, $handler, $start_tag_data, $end_tag_data)= @_; 4504 4505 # the "real" last element processed, as _twig_end has closed it 4506 my $last_elt; 4507 if( $t->{twig_current}) 4508 { $last_elt= $t->{twig_current}->{last_child}; } 4509 else 4510 { $last_elt= $t->{twig_root}; } 4511 4512 my $elt= $t->{twig_root}; 4513 unless( $elt->{'flushed'}) 4514 { # init unless already done (ie root has been flushed) 4515 if( my $start_document = $handler->can( 'start_document')) 4516 { $start_document->( $handler); } 4517 # flush the DTD 4518 $t->_prolog_toSAX( $handler) 4519 } 4520 4521 while( $elt) 4522 { my $next_elt; 4523 if( $last_elt && $last_elt->in( $elt)) 4524 { 4525 unless( $elt->{'flushed'}) 4526 { # just output the front tag 4527 if( my $start_element = $handler->can( 'start_element')) 4528 { if( my $tag_data= $start_tag_data->( $elt)) 4529 { $start_element->( $handler, $tag_data); } 4530 } 4531 $elt->{'flushed'}=1; 4532 } 4533 $next_elt= $elt->{first_child}; 4534 } 4535 else 4536 { # an element before the last one or the last one, 4537 $next_elt= $elt->{next_sibling}; 4538 $elt->_toSAX( $handler, $start_tag_data, $end_tag_data); 4539 $elt->delete; 4540 last if( $last_elt && ($elt == $last_elt)); 4541 } 4542 $elt= $next_elt; 4543 } 4544 if( !$t->{twig_parsing}) 4545 { if( my $end_document = $handler->can( 'end_document')) 4546 { $end_document->( $handler); } 4547 } 4548 } 4549 4550 4551sub _prolog_toSAX 4552 { my( $t, $handler)= @_; 4553 $t->_xmldecl_toSAX( $handler); 4554 $t->_DTD_toSAX( $handler); 4555 } 4556 4557sub _xmldecl_toSAX 4558 { my( $t, $handler)= @_; 4559 my $decl= $t->{twig_xmldecl}; 4560 my $data= { Version => $decl->{version}, 4561 Encoding => $decl->{encoding}, 4562 Standalone => $decl->{standalone}, 4563 }; 4564 if( my $xml_decl= $handler->can( 'xml_decl')) 4565 { $xml_decl->( $handler, $data); } 4566 } 4567 4568sub _DTD_toSAX 4569 { my( $t, $handler)= @_; 4570 my $doctype= $t->{twig_doctype}; 4571 return unless( $doctype); 4572 my $data= { Name => $doctype->{name}, 4573 PublicId => $doctype->{pub}, 4574 SystemId => $doctype->{sysid}, 4575 }; 4576 4577 if( my $start_dtd= $handler->can( 'start_dtd')) 4578 { $start_dtd->( $handler, $data); } 4579 4580 # I should call code to export the internal subset here 4581 4582 if( my $end_dtd= $handler->can( 'end_dtd')) 4583 { $end_dtd->( $handler); } 4584 } 4585 4586# input/output filters 4587 4588sub latin1 4589 { local $SIG{__DIE__}; 4590 if( _use( 'Encode')) 4591 { return encode_convert( 'ISO-8859-15'); } 4592 elsif( _use( 'Text::Iconv')) 4593 { return iconv_convert( 'ISO-8859-15'); } 4594 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 4595 { return unicode_convert( 'ISO-8859-15'); } 4596 else 4597 { return \®exp2latin1; } 4598 } 4599 4600sub _encoding_filter 4601 { 4602 { local $SIG{__DIE__}; 4603 my $encoding= $_[1] || $_[0]; 4604 if( _use( 'Encode')) 4605 { my $sub= encode_convert( $encoding); 4606 return $sub; 4607 } 4608 elsif( _use( 'Text::Iconv')) 4609 { return iconv_convert( $encoding); } 4610 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 4611 { return unicode_convert( $encoding); } 4612 } 4613 _croak( "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed in order to use encoding options"); 4614 } 4615 4616# shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27) 4617sub regexp2latin1 4618 { my $text=shift; 4619 $text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1); 4620 my $lo = ord($2); 4621 chr((($hi & 0x03) <<6) | ($lo & 0x3F)) 4622 }ge; 4623 return $text; 4624 } 4625 4626 4627sub html_encode 4628 { _use( 'HTML::Entities') or croak "cannot use html_encode: missing HTML::Entities"; 4629 return HTML::Entities::encode_entities($_[0] ); 4630 } 4631 4632sub safe_encode 4633 { my $str= shift; 4634 if( $perl_version < 5.008) 4635 { # the no utf8 makes the regexp work in 5.6 4636 no utf8; # = perl 5.6 4637 $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} 4638 {_XmlUtf8Decode($1)}egs; 4639 } 4640 else 4641 { $str= encode( ascii => $str, $FB_HTMLCREF); } 4642 return $str; 4643 } 4644 4645sub safe_encode_hex 4646 { my $str= shift; 4647 if( $perl_version < 5.008) 4648 { # the no utf8 makes the regexp work in 5.6 4649 no utf8; # = perl 5.6 4650 $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} 4651 {_XmlUtf8Decode($1, 1)}egs; 4652 } 4653 else 4654 { $str= encode( ascii => $str, $FB_XMLCREF); } 4655 return $str; 4656 } 4657 4658# this one shamelessly lifted from XML::DOM 4659# does NOT work on 5.8.0 4660sub _XmlUtf8Decode 4661 { my ($str, $hex) = @_; 4662 my $len = length ($str); 4663 my $n; 4664 4665 if ($len == 2) 4666 { my @n = unpack "C2", $str; 4667 $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); 4668 } 4669 elsif ($len == 3) 4670 { my @n = unpack "C3", $str; 4671 $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f); 4672 } 4673 elsif ($len == 4) 4674 { my @n = unpack "C4", $str; 4675 $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) 4676 + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); 4677 } 4678 elsif ($len == 1) # just to be complete... 4679 { $n = ord ($str); } 4680 else 4681 { croak "bad value [$str] for _XmlUtf8Decode"; } 4682 4683 my $char= $hex ? sprintf ("&#x%x;", $n) : "&#$n;"; 4684 return $char; 4685 } 4686 4687 4688sub unicode_convert 4689 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4690 _use( 'Unicode::Map8') or croak "Unicode::Map8 not available, needed for encoding filter: $!"; 4691 _use( 'Unicode::String') or croak "Unicode::String not available, needed for encoding filter: $!"; 4692 import Unicode::String qw(utf8); 4693 my $sub= eval qq{ { $NO_WARNINGS; 4694 my \$cnv; 4695 BEGIN { \$cnv= Unicode::Map8->new(\$enc) 4696 or croak "Can't create converter to \$enc"; 4697 } 4698 sub { return \$cnv->to8 (utf8(\$_[0])->ucs2); } 4699 } 4700 }; 4701 unless( $sub) { croak $@; } 4702 return $sub; 4703 } 4704 4705sub iconv_convert 4706 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4707 _use( 'Text::Iconv') or croak "Text::Iconv not available, needed for encoding filter: $!"; 4708 my $sub= eval qq{ { $NO_WARNINGS; 4709 my \$cnv; 4710 BEGIN { \$cnv = Text::Iconv->new( 'utf8', \$enc) 4711 or croak "Can't create iconv converter to \$enc"; 4712 } 4713 sub { return \$cnv->convert( \$_[0]); } 4714 } 4715 }; 4716 unless( $sub) 4717 { if( $@=~ m{^Unsupported conversion: Invalid argument}) 4718 { croak "Unsupported encoding: $enc"; } 4719 else 4720 { croak $@; } 4721 } 4722 4723 return $sub; 4724 } 4725 4726sub encode_convert 4727 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4728 my $sub= eval qq{sub { $NO_WARNINGS; return encode( "$enc", \$_[0]); } }; 4729 croak "can't create Encode-based filter: $@" unless( $sub); 4730 return $sub; 4731 } 4732 4733 4734# XML::XPath compatibility 4735sub getRootNode { return $_[0]; } 4736sub getParentNode { return undef; } 4737sub getChildNodes { my @children= ($_[0]->root); return wantarray ? @children : \@children; } 4738 4739sub _weakrefs { return $weakrefs; } 4740sub _set_weakrefs { $weakrefs=shift() || 0; XML::Twig::Elt::set_destroy()if ! $weakrefs; } # for testing purposes 4741 4742sub _dump 4743 { my $t= shift; 4744 my $dump=''; 4745 4746 $dump="document\n"; # should dump twig level data here 4747 if( $t->root) { $dump .= $t->root->_dump( @_); } 4748 4749 return $dump; 4750 4751 } 4752 4753 47541; 4755 4756###################################################################### 4757package XML::Twig::Entity_list; 4758###################################################################### 4759 4760*isa= *UNIVERSAL::isa; 4761 4762sub new 4763 { my $class = shift; 4764 my $self={ entities => {}, updated => 0}; 4765 4766 bless $self, $class; 4767 return $self; 4768 4769 } 4770 4771sub add_new_ent 4772 { my $ent_list= shift; 4773 my $ent= XML::Twig::Entity->new( @_); 4774 $ent_list->add( $ent); 4775 return $ent_list; 4776 } 4777 4778sub _add_list 4779 { my( $ent_list, $to_add)= @_; 4780 my $ents_to_add= $to_add->{entities}; 4781 return $ent_list unless( $ents_to_add && %$ents_to_add); 4782 @{$ent_list->{entities}}{keys %$ents_to_add}= values %$ents_to_add; 4783 $ent_list->{updated}=1; 4784 return $ent_list; 4785 } 4786 4787sub add 4788 { my( $ent_list, $ent)= @_; 4789 $ent_list->{entities}->{$ent->{name}}= $ent; 4790 $ent_list->{updated}=1; 4791 return $ent_list; 4792 } 4793 4794sub ent 4795 { my( $ent_list, $ent_name)= @_; 4796 return $ent_list->{entities}->{$ent_name}; 4797 } 4798 4799# can be called with an entity or with an entity name 4800sub delete 4801 { my $ent_list= shift; 4802 if( isa( ref $_[0], 'XML::Twig::Entity')) 4803 { # the second arg is an entity 4804 my $ent= shift; 4805 delete $ent_list->{entities}->{$ent->{name}}; 4806 } 4807 else 4808 { # the second arg was not entity, must be a string then 4809 my $name= shift; 4810 delete $ent_list->{entities}->{$name}; 4811 } 4812 $ent_list->{updated}=1; 4813 return $ent_list; 4814 } 4815 4816sub print 4817 { my ($ent_list, $fh)= @_; 4818 my $old_select= defined $fh ? select $fh : undef; 4819 4820 foreach my $ent_name ( sort keys %{$ent_list->{entities}}) 4821 { my $ent= $ent_list->{entities}->{$ent_name}; 4822 # we have to test what the entity is or un-defined entities can creep in 4823 if( isa( $ent, 'XML::Twig::Entity')) { $ent->print(); } 4824 } 4825 select $old_select if( defined $old_select); 4826 return $ent_list; 4827 } 4828 4829sub text 4830 { my ($ent_list)= @_; 4831 return join "\n", map { $ent_list->{entities}->{$_}->text} sort keys %{$ent_list->{entities}}; 4832 } 4833 4834# return the list of entity names 4835sub entity_names 4836 { my $ent_list= shift; 4837 return (sort keys %{$ent_list->{entities}}) ; 4838 } 4839 4840 4841sub list 4842 { my ($ent_list)= @_; 4843 return map { $ent_list->{entities}->{$_} } sort keys %{$ent_list->{entities}}; 4844 } 4845 48461; 4847 4848###################################################################### 4849package XML::Twig::Entity; 4850###################################################################### 4851 4852#*isa= *UNIVERSAL::isa; 4853 4854sub new 4855 { my( $class, $name, $val, $sysid, $pubid, $ndata, $param)= @_; 4856 $class= ref( $class) || $class; 4857 4858 my $self={}; 4859 4860 $self->{name} = $name; 4861 $self->{val} = $val if( defined $val ); 4862 $self->{sysid} = $sysid if( defined $sysid); 4863 $self->{pubid} = $pubid if( defined $pubid); 4864 $self->{ndata} = $ndata if( defined $ndata); 4865 $self->{param} = $param if( defined $param); 4866 4867 bless $self, $class; 4868 return $self; 4869 } 4870 4871 4872sub name { return $_[0]->{name}; } 4873sub val { return $_[0]->{val}; } 4874sub sysid { return defined( $_[0]->{sysid}) ? $_[0]->{sysid} : ''; } 4875sub pubid { return defined( $_[0]->{pubid}) ? $_[0]->{pubid} : ''; } 4876sub ndata { return defined( $_[0]->{ndata}) ? $_[0]->{ndata} : ''; } 4877sub param { return defined( $_[0]->{param}) ? $_[0]->{param} : ''; } 4878 4879 4880sub print 4881 { my ($ent, $fh)= @_; 4882 my $text= $ent->text; 4883 if( $fh) { print $fh $text . "\n"; } 4884 else { print $text . "\n"; } 4885 } 4886 4887sub sprint 4888 { my ($ent)= @_; 4889 return $ent->text; 4890 } 4891 4892sub text 4893 { my ($ent)= @_; 4894 #warn "text called: '", $ent->_dump, "'\n"; 4895 return '' if( !$ent->{name}); 4896 my @tokens; 4897 push @tokens, '<!ENTITY'; 4898 4899 push @tokens, '%' if( $ent->{param}); 4900 push @tokens, $ent->{name}; 4901 4902 if( defined $ent->{val} && !defined( $ent->{sysid}) && !defined($ent->{pubid}) ) 4903 { push @tokens, _quoted_val( $ent->{val}); 4904 } 4905 elsif( defined $ent->{sysid}) 4906 { push @tokens, 'PUBLIC', _quoted_val( $ent->{pubid}) if( $ent->{pubid}); 4907 push @tokens, 'SYSTEM' unless( $ent->{pubid}); 4908 push @tokens, _quoted_val( $ent->{sysid}); 4909 push @tokens, 'NDATA', $ent->{ndata} if( $ent->{ndata}); 4910 } 4911 return join( ' ', @tokens) . '>'; 4912 } 4913 4914sub _quoted_val 4915 { my $q= $_[0]=~ m{"} ? q{'} : q{"}; 4916 return qq{$q$_[0]$q}; 4917 } 4918 4919sub _dump 4920 { my( $ent)= @_; return join( " - ", map { "$_ => '$ent->{$_}'" } grep { defined $ent->{$_} } sort keys %$ent); } 4921 49221; 4923 4924###################################################################### 4925package XML::Twig::Notation_list; 4926###################################################################### 4927 4928*isa= *UNIVERSAL::isa; 4929 4930sub new 4931 { my $class = shift; 4932 my $self={ notations => {}, updated => 0}; 4933 4934 bless $self, $class; 4935 return $self; 4936 4937 } 4938 4939sub add_new_notation 4940 { my $notation_list= shift; 4941 my $notation= XML::Twig::Notation->new( @_); 4942 $notation_list->add( $notation); 4943 return $notation_list; 4944 } 4945 4946sub _add_list 4947 { my( $notation_list, $to_add)= @_; 4948 my $notations_to_add= $to_add->{notations}; 4949 return $notation_list unless( $notations_to_add && %$notations_to_add); 4950 @{$notation_list->{notations}}{keys %$notations_to_add}= values %$notations_to_add; 4951 $notation_list->{updated}=1; 4952 return $notation_list; 4953 } 4954 4955sub add 4956 { my( $notation_list, $notation)= @_; 4957 $notation_list->{notations}->{$notation->{name}}= $notation; 4958 $notation_list->{updated}=1; 4959 return $notation_list; 4960 } 4961 4962sub notation 4963 { my( $notation_list, $notation_name)= @_; 4964 return $notation_list->{notations}->{$notation_name}; 4965 } 4966 4967# can be called with an notation or with an notation name 4968sub delete 4969 { my $notation_list= shift; 4970 if( isa( ref $_[0], 'XML::Twig::Notation')) 4971 { # the second arg is an notation 4972 my $notation= shift; 4973 delete $notation_list->{notations}->{$notation->{name}}; 4974 } 4975 else 4976 { # the second arg was not notation, must be a string then 4977 my $name= shift; 4978 delete $notation_list->{notations}->{$name}; 4979 } 4980 $notation_list->{updated}=1; 4981 return $notation_list; 4982 } 4983 4984sub print 4985 { my ($notation_list, $fh)= @_; 4986 my $old_select= defined $fh ? select $fh : undef; 4987 4988 foreach my $notation_name ( sort keys %{$notation_list->{notations}}) 4989 { my $notation= $notation_list->{notations}->{$notation_name}; 4990 # we have to test what the notation is or un-defined notations can creep in 4991 if( isa( $notation, 'XML::Twig::Notation')) { $notation->print(); } 4992 } 4993 select $old_select if( defined $old_select); 4994 return $notation_list; 4995 } 4996 4997sub text 4998 { my ($notation_list)= @_; 4999 return join "\n", map { $notation_list->{notations}->{$_}->text} sort keys %{$notation_list->{notations}}; 5000 } 5001 5002# return the list of notation names 5003sub notation_names 5004 { my $notation_list= shift; 5005 return (sort keys %{$notation_list->{notations}}) ; 5006 } 5007 5008 5009sub list 5010 { my ($notation_list)= @_; 5011 return map { $notation_list->{notations}->{$_} } sort keys %{$notation_list->{notations}}; 5012 } 5013 50141; 5015 5016###################################################################### 5017package XML::Twig::Notation; 5018###################################################################### 5019 5020#*isa= *UNIVERSAL::isa; 5021 5022BEGIN 5023 { *sprint= *text; 5024 } 5025 5026sub new 5027 { my( $class, $name, $base, $sysid, $pubid)= @_; 5028 $class= ref( $class) || $class; 5029 5030 my $self={}; 5031 5032 $self->{name} = $name; 5033 $self->{base} = $base if( defined $base ); 5034 $self->{sysid} = $sysid if( defined $sysid); 5035 $self->{pubid} = $pubid if( defined $pubid); 5036 5037 bless $self, $class; 5038 return $self; 5039 } 5040 5041 5042sub name { return $_[0]->{name}; } 5043sub base { return $_[0]->{base}; } 5044sub sysid { return $_[0]->{sysid}; } 5045sub pubid { return $_[0]->{pubid}; } 5046 5047 5048sub print 5049 { my ($notation, $fh)= @_; 5050 my $text= $notation->text; 5051 if( $fh) { print $fh $text . "\n"; } 5052 else { print $text . "\n"; } 5053 } 5054 5055sub text 5056 { my ($notation)= @_; 5057 return '' if( !$notation->{name}); 5058 my @tokens; 5059 push @tokens, '<!NOTATION'; 5060 push @tokens, $notation->{name}; 5061 push @tokens, ( 'PUBLIC', _quoted_val( $notation->{pubid} ) ) if $notation->{pubid}; 5062 push @tokens, ( 'SYSTEM') if ! $notation->{pubid} && $notation->{sysid}; 5063 push @tokens, (_quoted_val( $notation->{sysid}) ) if $notation->{sysid}; 5064 5065 return join( ' ', @tokens) . '>'; 5066 } 5067 5068sub _quoted_val 5069 { my $q= $_[0]=~ m{"} ? q{'} : q{"}; 5070 return qq{$q$_[0]$q}; 5071 } 5072 5073sub _dump 5074 { my( $notation)= @_; return join( " - ", map { "$_ => '$notation->{$_}'" } grep { defined $notation->{$_} } sort keys %$notation); } 5075 50761; 5077 5078###################################################################### 5079package XML::Twig::Elt; 5080###################################################################### 5081 5082use Carp; 5083*isa= *UNIVERSAL::isa; 5084 5085my $CDATA_START = "<![CDATA["; 5086my $CDATA_END = "]]>"; 5087my $PI_START = "<?"; 5088my $PI_END = "?>"; 5089my $COMMENT_START = "<!--"; 5090my $COMMENT_END = "-->"; 5091 5092my $XMLNS_URI = 'http://www.w3.org/2000/xmlns/'; 5093 5094 5095BEGIN 5096 { # set some aliases for methods 5097 *tag = *gi; 5098 *name = *gi; 5099 *set_tag = *set_gi; 5100 *set_name = *set_gi; 5101 *find_nodes = *get_xpath; # as in XML::DOM 5102 *findnodes = *get_xpath; # as in XML::LibXML 5103 *field = *first_child_text; 5104 *trimmed_field = *first_child_trimmed_text; 5105 *is_field = *contains_only_text; 5106 *is = *passes; 5107 *matches = *passes; 5108 *has_child = *first_child; 5109 *has_children = *first_child; 5110 *all_children_pass = *all_children_are; 5111 *all_children_match= *all_children_are; 5112 *getElementsByTagName= *descendants; 5113 *find_by_tag_name= *descendants_or_self; 5114 *unwrap = *erase; 5115 *inner_xml = *xml_string; 5116 *outer_xml = *sprint; 5117 *add_class = *add_to_class; 5118 5119 *first_child_is = *first_child_matches; 5120 *last_child_is = *last_child_matches; 5121 *next_sibling_is = *next_sibling_matches; 5122 *prev_sibling_is = *prev_sibling_matches; 5123 *next_elt_is = *next_elt_matches; 5124 *prev_elt_is = *prev_elt_matches; 5125 *parent_is = *parent_matches; 5126 *child_is = *child_matches; 5127 *inherited_att = *inherit_att; 5128 5129 *sort_children_by_value= *sort_children_on_value; 5130 5131 *has_atts= *att_nb; 5132 5133 # imports from XML::Twig 5134 *_is_fh= *XML::Twig::_is_fh; 5135 5136 # XML::XPath compatibility 5137 *string_value = *text; 5138 *toString = *sprint; 5139 *getName = *gi; 5140 *getRootNode = *twig; 5141 *getNextSibling = *_next_sibling; 5142 *getPreviousSibling = *_prev_sibling; 5143 *isElementNode = *is_elt; 5144 *isTextNode = *is_text; 5145 *isPI = *is_pi; 5146 *isPINode = *is_pi; 5147 *isProcessingInstructionNode= *is_pi; 5148 *isComment = *is_comment; 5149 *isCommentNode = *is_comment; 5150 *getTarget = *target; 5151 *getFirstChild = *_first_child; 5152 *getLastChild = *_last_child; 5153 5154 # try using weak references 5155 # test whether we can use weak references 5156 { local $SIG{__DIE__}; 5157 if( eval 'require Scalar::Util' && defined( &Scalar::Util::weaken) ) 5158 { import Scalar::Util qw(weaken); } 5159 elsif( eval 'require WeakRef') 5160 { import WeakRef; } 5161 } 5162} 5163 5164 5165# can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]]) 5166# - gi is an optional gi given to the element 5167# - $atts is a hashref to attributes for the element 5168# - @content is an optional list of text and elements that will 5169# be inserted under the element 5170sub new 5171 { my $class= shift; 5172 $class= ref $class || $class; 5173 my $elt = {}; 5174 bless ($elt, $class); 5175 5176 return $elt unless @_; 5177 5178 if( @_ == 1 && $_[0]=~ m{^\s*<}) { return $class->parse( @_); } 5179 5180 # if a gi is passed then use it 5181 my $gi= shift; 5182 $elt->{gi}=$XML::Twig::gi2index{$gi} or $elt->set_gi( $gi); 5183 5184 5185 my $atts= ref $_[0] eq 'HASH' ? shift : undef; 5186 5187 if( $atts && defined $atts->{$CDATA}) 5188 { delete $atts->{$CDATA}; 5189 5190 my $cdata= $class->new( $CDATA => @_); 5191 return $class->new( $gi, $atts, $cdata); 5192 } 5193 5194 if( $gi eq $PCDATA) 5195 { if( grep { ref $_ } @_) { croak "element $PCDATA can only be created from text"; } 5196 $elt->{pcdata}= join '', @_; 5197 } 5198 elsif( $gi eq $ENT) 5199 { $elt->{ent}= shift; } 5200 elsif( $gi eq $CDATA) 5201 { if( grep { ref $_ } @_) { croak "element $CDATA can only be created from text"; } 5202 $elt->{cdata}= join '', @_; 5203 } 5204 elsif( $gi eq $COMMENT) 5205 { if( grep { ref $_ } @_) { croak "element $COMMENT can only be created from text"; } 5206 $elt->{comment}= join '', @_; 5207 } 5208 elsif( $gi eq $PI) 5209 { if( grep { ref $_ } @_) { croak "element $PI can only be created from text"; } 5210 $elt->_set_pi( shift, join '', @_); 5211 } 5212 else 5213 { # the rest of the arguments are the content of the element 5214 if( @_) 5215 { $elt->set_content( @_); } 5216 else 5217 { $elt->{empty}= 1; } 5218 } 5219 5220 if( $atts) 5221 { # the attribute hash can be used to pass the asis status 5222 if( defined $atts->{$ASIS}) { $elt->set_asis( $atts->{$ASIS} ); delete $atts->{$ASIS}; } 5223 if( defined $atts->{$EMPTY}) { $elt->{empty}= $atts->{$EMPTY}; delete $atts->{$EMPTY}; } 5224 if( keys %$atts) { $elt->set_atts( $atts); } 5225 $elt->_set_id( $atts->{$ID}) if( $atts->{$ID}); 5226 } 5227 5228 return $elt; 5229 } 5230 5231# optimized version of $elt->new( PCDATA, $text); 5232sub _new_pcdata 5233 { my $class= $_[0]; 5234 $class= ref $class || $class; 5235 my $elt = {}; 5236 bless $elt, $class; 5237 $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA); 5238 $elt->{pcdata}= $_[1]; 5239 return $elt; 5240 } 5241 5242# this function creates an XM:::Twig::Elt from a string 5243# it is quite clumsy at the moment, as it just creates a 5244# new twig then returns its root 5245# there might also be memory leaks there 5246# additional arguments are passed to new XML::Twig 5247sub parse 5248 { my $class= shift; 5249 if( ref( $class)) { $class= ref( $class); } 5250 my $string= shift; 5251 my %args= @_; 5252 my $t= XML::Twig->new(%args); 5253 $t->parse( $string); 5254 my $elt= $t->root; 5255 # clean-up the node 5256 delete $elt->{twig}; # get rid of the twig data 5257 delete $elt->{twig_current}; # better get rid of this too 5258 if( $t->{twig_id_list}) { $elt->{twig_id_list}= $t->{twig_id_list}; } 5259 $elt->cut; 5260 undef $t->{twig_root}; 5261 return $elt; 5262 } 5263 5264sub set_inner_xml 5265 { my( $elt, $xml, @args)= @_; 5266 my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args); 5267 $elt->cut_children; 5268 $new_elt->paste_first_child( $elt); 5269 $new_elt->erase; 5270 return $elt; 5271 } 5272 5273sub set_outer_xml 5274 { my( $elt, $xml, @args)= @_; 5275 my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args); 5276 $elt->cut_children; 5277 $new_elt->replace( $elt); 5278 $new_elt->erase; 5279 return $new_elt; 5280 } 5281 5282 5283sub set_inner_html 5284 { my( $elt, $html)= @_; 5285 my $t= XML::Twig->new->parse_html( "<html>$html</html>"); 5286 my $new_elt= $t->root; 5287 if( $elt->tag eq 'head') 5288 { $new_elt->first_child( 'head')->unwrap; 5289 $new_elt->first_child( 'body')->cut; 5290 } 5291 elsif( $elt->tag ne 'html') 5292 { $new_elt->first_child( 'head')->cut; 5293 $new_elt->first_child( 'body')->unwrap; 5294 } 5295 $new_elt->cut; 5296 $elt->cut_children; 5297 $new_elt->paste_first_child( $elt); 5298 $new_elt->erase; 5299 return $elt; 5300 } 5301 5302sub set_gi 5303 { my ($elt, $gi)= @_; 5304 unless( defined $XML::Twig::gi2index{$gi}) 5305 { # new gi, create entries in %gi2index and @index2gi 5306 push @XML::Twig::index2gi, $gi; 5307 $XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi; 5308 } 5309 $elt->{gi}= $XML::Twig::gi2index{$gi}; 5310 return $elt; 5311 } 5312 5313sub gi { return $XML::Twig::index2gi[$_[0]->{gi}]; } 5314 5315sub local_name 5316 { my $elt= shift; 5317 return _local_name( $XML::Twig::index2gi[$elt->{'gi'}]); 5318 } 5319 5320sub ns_prefix 5321 { my $elt= shift; 5322 return _ns_prefix( $XML::Twig::index2gi[$elt->{'gi'}]); 5323 } 5324 5325# namespace prefix for any qname (can be used for elements or attributes) 5326sub _ns_prefix 5327 { my $qname= shift; 5328 if( $qname=~ m{^([^:]*):}) 5329 { return $1; } 5330 else 5331 { return( ''); } # should it be '' ? 5332 } 5333 5334# local name for any qname (can be used for elements or attributes) 5335sub _local_name 5336 { my $qname= shift; 5337 (my $local= $qname)=~ s{^[^:]*:}{}; 5338 return $local; 5339 } 5340 5341#sub get_namespace 5342sub namespace ## no critic (Subroutines::ProhibitNestedSubs); 5343 { my $elt= shift; 5344 my $prefix= defined $_[0] ? shift() : $elt->ns_prefix; 5345 my $ns_att= $prefix ? "xmlns:$prefix" : "xmlns"; 5346 my $expanded= $DEFAULT_NS{$prefix} || $elt->_inherit_att_through_cut( $ns_att) || ''; 5347 return $expanded; 5348 } 5349 5350sub declare_missing_ns ## no critic (Subroutines::ProhibitNestedSubs); 5351 { my $root= shift; 5352 my %missing_prefix; 5353 my $map= $root->_current_ns_prefix_map; 5354 5355 foreach my $prefix (keys %$map) 5356 { my $prefix_att= $prefix eq '#default' ? 'xmlns' : "xmlns:$prefix"; 5357 if( ! $root->{'att'}->{$prefix_att}) 5358 { $root->set_att( $prefix_att => $map->{$prefix}); } 5359 } 5360 return $root; 5361 } 5362 5363sub _current_ns_prefix_map 5364 { my( $elt)= shift; 5365 my $map; 5366 while( $elt) 5367 { foreach my $att ($elt->att_names) 5368 { my $prefix= $att eq 'xmlns' ? '#default' 5369 : $att=~ m{^xmlns:(.*)$} ? $1 5370 : next 5371 ; 5372 if( ! exists $map->{$prefix}) { $map->{$prefix}= $elt->{'att'}->{$att}; } 5373 } 5374 $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); 5375 } 5376 return $map; 5377 } 5378 5379sub set_ns_decl 5380 { my( $elt, $uri, $prefix)= @_; 5381 my $ns_att= $prefix ? "xmlns:$prefix" : 'xmlns'; 5382 $elt->set_att( $ns_att => $uri); 5383 return $elt; 5384 } 5385 5386sub set_ns_as_default 5387 { my( $root, $uri)= @_; 5388 my @ns_decl_to_remove; 5389 foreach my $elt ($root->descendants_or_self) 5390 { if( $elt->_ns_prefix && $elt->namespace eq $uri) 5391 { $elt->set_tag( $elt->local_name); } 5392 # store any namespace declaration for that uri 5393 foreach my $ns_decl (grep { $_=~ m{xmlns(:|$)} && $elt->{'att'}->{$_} eq $uri } $elt->att_names) 5394 { push @ns_decl_to_remove, [$elt, $ns_decl]; } 5395 } 5396 $root->set_ns_decl( $uri); 5397 # now remove the ns declarations (if done earlier then descendants of an element with the ns declaration 5398 # are not considered being in the namespace 5399 foreach my $ns_decl_to_remove ( @ns_decl_to_remove) 5400 { my( $elt, $ns_decl)= @$ns_decl_to_remove; 5401 $elt->del_att( $ns_decl); 5402 } 5403 5404 return $root; 5405 } 5406 5407 5408 5409# return #ELT for an element and #PCDATA... for others 5410sub get_type 5411 { my $gi_nb= $_[0]->{gi}; # the number, not the string 5412 return $ELT if( $gi_nb >= $XML::Twig::SPECIAL_GI); 5413 return $_[0]->gi; 5414 } 5415 5416# return the gi if it's a "real" element, 0 otherwise 5417sub is_elt 5418 { if( $_[0]->{gi} >= $XML::Twig::SPECIAL_GI) 5419 { return $_[0]->gi; } 5420 else 5421 { return 0; } 5422 } 5423 5424 5425sub is_pcdata 5426 { my $elt= shift; 5427 return (exists $elt->{'pcdata'}); 5428 } 5429 5430sub is_cdata 5431 { my $elt= shift; 5432 return (exists $elt->{'cdata'}); 5433 } 5434 5435sub is_pi 5436 { my $elt= shift; 5437 return (exists $elt->{'target'}); 5438 } 5439 5440sub is_comment 5441 { my $elt= shift; 5442 return (exists $elt->{'comment'}); 5443 } 5444 5445sub is_ent 5446 { my $elt= shift; 5447 return (exists $elt->{ent} || $elt->{ent_name}); 5448 } 5449 5450 5451sub is_text 5452 { my $elt= shift; 5453 return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'})); 5454 } 5455 5456sub is_empty 5457 { return $_[0]->{empty} || 0; } 5458 5459sub set_empty 5460 { $_[0]->{empty}= defined( $_[1]) ? $_[1] : 1; return $_[0]; } 5461 5462sub set_not_empty 5463 { delete $_[0]->{empty} if( $_[0]->{'empty'}); return $_[0]; } 5464 5465 5466sub set_asis 5467 { my $elt=shift; 5468 5469 foreach my $descendant ($elt, $elt->_descendants ) 5470 { $descendant->{asis}= 1; 5471 if( (exists $descendant->{'cdata'})) 5472 { $descendant->{gi}=$XML::Twig::gi2index{$PCDATA} or $descendant->set_gi( $PCDATA); 5473 $descendant->{pcdata}= $descendant->{cdata}; 5474 } 5475 5476 } 5477 return $elt; 5478 } 5479 5480sub set_not_asis 5481 { my $elt=shift; 5482 foreach my $descendant ($elt, $elt->descendants) 5483 { delete $descendant->{asis} if $descendant->{asis};} 5484 return $elt; 5485 } 5486 5487sub is_asis 5488 { return $_[0]->{asis}; } 5489 5490sub closed 5491 { my $elt= shift; 5492 my $t= $elt->twig || return; 5493 my $curr_elt= $t->{twig_current}; 5494 return 1 unless( $curr_elt); 5495 return $curr_elt->in( $elt); 5496 } 5497 5498sub set_pcdata 5499 { my( $elt, $pcdata)= @_; 5500 5501 if( $elt->{extra_data_in_pcdata}) 5502 { _try_moving_extra_data( $elt, $pcdata); 5503 } 5504 $elt->{pcdata}= $pcdata; 5505 return $elt; 5506 } 5507 5508sub _extra_data_in_pcdata { return $_[0]->{extra_data_in_pcdata}; } 5509sub _set_extra_data_in_pcdata { $_[0]->{extra_data_in_pcdata}= $_[1]; return $_[0]; } 5510sub _del_extra_data_in_pcdata { delete $_[0]->{extra_data_in_pcdata}; return $_[0]; } 5511sub _unshift_extra_data_in_pcdata 5512 { my $e= shift; 5513 $e->{extra_data_in_pcdata}||=[]; 5514 unshift @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() }; 5515 } 5516sub _push_extra_data_in_pcdata 5517 { my $e= shift; 5518 $e->{extra_data_in_pcdata}||=[]; 5519 push @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() }; 5520 } 5521 5522sub _extra_data_before_end_tag { return $_[0]->{extra_data_before_end_tag} || ''; } 5523sub _set_extra_data_before_end_tag { $_[0]->{extra_data_before_end_tag}= $_[1]; return $_[0]} 5524sub _del_extra_data_before_end_tag { delete $_[0]->{extra_data_before_end_tag}; return $_[0]} 5525sub _prefix_extra_data_before_end_tag 5526 { my( $elt, $data)= @_; 5527 if($elt->{extra_data_before_end_tag}) 5528 { $elt->{extra_data_before_end_tag}= $data . $elt->{extra_data_before_end_tag}; } 5529 else 5530 { $elt->{extra_data_before_end_tag}= $data; } 5531 return $elt; 5532 } 5533 5534# internal, in cases where we know there is no extra_data (inlined anyway!) 5535sub _set_pcdata { $_[0]->{pcdata}= $_[1]; } 5536 5537# try to figure out if we can keep the extra_data around 5538sub _try_moving_extra_data 5539 { my( $elt, $modified)=@_; 5540 my $initial= $elt->{pcdata}; 5541 my $cpis= $elt->{extra_data_in_pcdata}; 5542 5543 if( (my $offset= index( $modified, $initial)) != -1) 5544 { # text has been added 5545 foreach (@$cpis) { $_->{offset}+= $offset; } 5546 } 5547 elsif( ($offset= index( $initial, $modified)) != -1) 5548 { # text has been cut 5549 my $len= length( $modified); 5550 foreach my $cpi (@$cpis) { $cpi->{offset} -= $offset; } 5551 $elt->_set_extra_data_in_pcdata( [ grep { $_->{offset} >= 0 && $_->{offset} < $len } @$cpis ]); 5552 } 5553 else 5554 { _match_extra_data_words( $elt, $initial, $modified) 5555 || _match_extra_data_chars( $elt, $initial, $modified) 5556 || $elt->_del_extra_data_in_pcdata; 5557 } 5558 } 5559 5560sub _match_extra_data_words 5561 { my( $elt, $initial, $modified)= @_; 5562 my @initial= split /\b/, $initial; 5563 my @modified= split /\b/, $modified; 5564 5565 return _match_extra_data( $elt, length( $initial), \@initial, \@modified); 5566 } 5567 5568sub _match_extra_data_chars 5569 { my( $elt, $initial, $modified)= @_; 5570 my @initial= split //, $initial; 5571 my @modified= split //, $modified; 5572 5573 return _match_extra_data( $elt, length( $initial), \@initial, \@modified); 5574 } 5575 5576sub _match_extra_data 5577 { my( $elt, $length, $initial, $modified)= @_; 5578 5579 my $cpis= $elt->{extra_data_in_pcdata}; 5580 5581 if( @$initial <= @$modified) 5582 { 5583 my( $ok, $positions, $offsets)= _pos_offset( $initial, $modified); 5584 if( $ok) 5585 { my $offset=0; 5586 my $pos= shift @$positions; 5587 foreach my $cpi (@$cpis) 5588 { while( $cpi->{offset} >= $pos) 5589 { $offset= shift @$offsets; 5590 $pos= shift @$positions || $length +1; 5591 } 5592 $cpi->{offset} += $offset; 5593 } 5594 return 1; 5595 } 5596 } 5597 else 5598 { my( $ok, $positions, $offsets)= _pos_offset( $modified, $initial); 5599 if( $ok) 5600 { #print STDERR "pos: ", join( ':', @$positions), "\n", 5601 # "offset: ", join( ':', @$offsets), "\n"; 5602 my $offset=0; 5603 my $pos= shift @$positions; 5604 my $prev_pos= 0; 5605 5606 foreach my $cpi (@$cpis) 5607 { while( $cpi->{offset} >= $pos) 5608 { $offset= shift @$offsets; 5609 $prev_pos= $pos; 5610 $pos= shift @$positions || $length +1; 5611 } 5612 $cpi->{offset} -= $offset; 5613 if( $cpi->{offset} < $prev_pos) { delete $cpi->{text}; } 5614 } 5615 $elt->_set_extra_data_in_pcdata( [ grep { exists $_->{text} } @$cpis ]); 5616 return 1; 5617 } 5618 } 5619 return 0; 5620 } 5621 5622 5623sub _pos_offset 5624 { my( $short, $long)= @_; 5625 my( @pos, @offset); 5626 my( $s_length, $l_length)=(0,0); 5627 while (@$short) 5628 { my $s_word= shift @$short; 5629 my $l_word= shift @$long; 5630 if( $s_word ne $l_word) 5631 { while( @$long && $s_word ne $l_word) 5632 { $l_length += length( $l_word); 5633 $l_word= shift @$long; 5634 } 5635 if( !@$long && $s_word ne $l_word) { return 0; } 5636 push @pos, $s_length; 5637 push @offset, $l_length - $s_length; 5638 } 5639 my $length= length( $s_word); 5640 $s_length += $length; 5641 $l_length += $length; 5642 } 5643 return( 1, \@pos, \@offset); 5644 } 5645 5646sub append_pcdata 5647 { $_[0]->{'pcdata'}.= $_[1]; 5648 return $_[0]; 5649 } 5650 5651sub pcdata { return $_[0]->{pcdata}; } 5652 5653 5654sub append_extra_data 5655 { $_[0]->{extra_data}.= $_[1]; 5656 return $_[0]; 5657 } 5658 5659sub set_extra_data 5660 { $_[0]->{extra_data}= $_[1]; 5661 return $_[0]; 5662 } 5663sub extra_data { return $_[0]->{extra_data} || ''; } 5664 5665sub set_target 5666 { my( $elt, $target)= @_; 5667 $elt->{target}= $target; 5668 return $elt; 5669 } 5670sub target { return $_[0]->{target}; } 5671 5672sub set_data 5673 { $_[0]->{'data'}= $_[1]; 5674 return $_[0]; 5675 } 5676sub data { return $_[0]->{data}; } 5677 5678sub set_pi 5679 { my $elt= shift; 5680 unless( $elt->{gi} == $XML::Twig::gi2index{$PI}) 5681 { $elt->cut_children; 5682 $elt->{gi}=$XML::Twig::gi2index{$PI} or $elt->set_gi( $PI); 5683 } 5684 return $elt->_set_pi( @_); 5685 } 5686 5687sub _set_pi 5688 { $_[0]->set_target( $_[1]); 5689 $_[0]->{data}= $_[2]; 5690 return $_[0]; 5691 } 5692 5693sub pi_string { my $string= $PI_START . $_[0]->{target}; 5694 my $data= $_[0]->{data}; 5695 if( defined( $data) && $data ne '') { $string .= " $data"; } 5696 $string .= $PI_END ; 5697 return $string; 5698 } 5699 5700sub set_comment 5701 { my $elt= shift; 5702 unless( $elt->{gi} == $XML::Twig::gi2index{$COMMENT}) 5703 { $elt->cut_children; 5704 $elt->{gi}=$XML::Twig::gi2index{$COMMENT} or $elt->set_gi( $COMMENT); 5705 } 5706 $elt->{comment}= $_[0]; 5707 return $elt; 5708 } 5709 5710sub _set_comment { $_[0]->{comment}= $_[1]; return $_[0]; } 5711sub comment { return $_[0]->{comment}; } 5712sub comment_string { return $COMMENT_START . _comment_escaped_string( $_[0]->{comment}) . $COMMENT_END; } 5713# comments cannot start or end with 5714sub _comment_escaped_string 5715 { my( $c)= @_; 5716 $c=~ s{^-}{ -}; 5717 $c=~ s{-$}{- }; 5718 $c=~ s{--}{- -}g; 5719 return $c; 5720 } 5721 5722sub set_ent { $_[0]->{ent}= $_[1]; return $_[0]; } 5723sub ent { return $_[0]->{ent}; } 5724sub ent_name { return substr( $_[0]->{ent}, 1, -1);} 5725 5726sub set_cdata 5727 { my $elt= shift; 5728 unless( $elt->{gi} == $XML::Twig::gi2index{$CDATA}) 5729 { $elt->cut_children; 5730 $elt->insert_new_elt( first_child => $CDATA, @_); 5731 return $elt; 5732 } 5733 $elt->{cdata}= $_[0]; 5734 return $_[0]; 5735 } 5736 5737sub _set_cdata 5738 { $_[0]->{cdata}= $_[1]; 5739 return $_[0]; 5740 } 5741 5742sub append_cdata 5743 { $_[0]->{cdata}.= $_[1]; 5744 return $_[0]; 5745 } 5746sub cdata { return $_[0]->{cdata}; } 5747 5748 5749sub contains_only_text 5750 { my $elt= shift; 5751 return 0 unless $elt->is_elt; 5752 foreach my $child ($elt->_children) 5753 { return 0 if $child->is_elt; } 5754 return $elt; 5755 } 5756 5757sub contains_only 5758 { my( $elt, $exp)= @_; 5759 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 5760 foreach my $child (@children) 5761 { return 0 unless $child->is( $exp); } 5762 return @children || 1; 5763 } 5764 5765sub contains_a_single 5766 { my( $elt, $exp)= @_; 5767 my $child= $elt->{first_child} or return 0; 5768 return 0 unless $child->passes( $exp); 5769 return 0 if( $child->{next_sibling}); 5770 return $child; 5771 } 5772 5773 5774sub root 5775 { my $elt= shift; 5776 while( $elt->{parent}) { $elt= $elt->{parent}; } 5777 return $elt; 5778 } 5779 5780sub _root_through_cut 5781 { my $elt= shift; 5782 while( $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})) { $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); } 5783 return $elt; 5784 } 5785 5786sub twig 5787 { my $elt= shift; 5788 my $root= $elt->root; 5789 return $root->{twig}; 5790 } 5791 5792sub _twig_through_cut 5793 { my $elt= shift; 5794 my $root= $elt->_root_through_cut; 5795 return $root->{twig}; 5796 } 5797 5798 5799# used for navigation 5800# returns undef or the element, depending on whether $elt passes $cond 5801# $cond can be 5802# - empty: the element passes the condition 5803# - ELT ('#ELT'): the element passes the condition if it is a "real" element 5804# - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element 5805# - a string with an XPath condition (only a subset of XPath is actually 5806# supported). 5807# - a regexp: the element passes if its gi matches the regexp 5808# - a code ref: the element passes if the code, applied on the element, 5809# returns true 5810 5811my %cond_cache; # expression => coderef 5812 5813sub reset_cond_cache { %cond_cache=(); } 5814 5815{ 5816 sub _install_cond 5817 { my $cond= shift; 5818 my $test; 5819 my $init=''; 5820 5821 my $original_cond= $cond; 5822 5823 my $not= ($cond=~ s{^\s*!}{}) ? '!' : ''; 5824 5825 if( ref $cond eq 'CODE') { return $cond; } 5826 5827 if( ref $cond eq 'Regexp') 5828 { $test = qq{(\$_[0]->gi=~ /$cond/)}; } 5829 else 5830 { my @tests; 5831 while( $cond) 5832 { 5833 # the condition is a string 5834 if( $cond=~ s{$ELT$SEP}{}) 5835 { push @tests, qq{\$_[0]->is_elt}; } 5836 elsif( $cond=~ s{$TEXT$SEP}{}) 5837 { push @tests, qq{\$_[0]->is_text}; } 5838 elsif( $cond=~ s{^\s*($REG_TAG_PART)$SEP}{}) 5839 { push @tests, _gi_test( $1); } 5840 elsif( $cond=~ s{^\s*($REG_REGEXP)$SEP}{}) 5841 { # /regexp/ 5842 push @tests, qq{ \$_[0]->gi=~ $1 }; 5843 } 5844 elsif( $cond=~ s{^\s*($REG_TAG_PART)?\s* # $1 5845 \[\s*(-?)\s*(\d+)\s*\] # [$2] 5846 $SEP}{}xo 5847 ) 5848 { my( $gi, $neg, $index)= ($1, $2, $3); 5849 my $siblings= $neg ? q{$_[0]->_next_siblings} : q{$_[0]->_prev_siblings}; 5850 if( $gi && ($gi ne '*')) 5851 #{ $test= qq{((\$_[0]->gi eq "$gi") && (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index))}; } 5852 { push @tests, _and( _gi_test( $gi), qq{ (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index)}); } 5853 else 5854 { push @tests, qq{(scalar( $siblings) + 1 == $index)}; } 5855 } 5856 elsif( $cond=~ s{^\s*($REG_TAG_PART?)\s*($REG_PREDICATE)$SEP}{}) 5857 { my( $gi, $predicate)= ( $1, $2); 5858 push @tests, _and( _gi_test( $gi), _parse_predicate_in_step( $predicate)); 5859 } 5860 elsif( $cond=~ s{^\s*($REG_NAKED_PREDICATE)$SEP}{}) 5861 { push @tests, _parse_predicate_in_step( $1); } 5862 else 5863 { croak "wrong navigation condition '$original_cond' ($@)"; } 5864 } 5865 $test= @tests > 1 ? '(' . join( '||', map { "($_)" } @tests) . ')' : $tests[0]; 5866 } 5867 5868 #warn "init: '$init' - test: '$test'\n"; 5869 5870 my $sub= qq{sub { $NO_WARNINGS; $init; return $not($test) ? \$_[0] : undef; } }; 5871 my $s= eval $sub; 5872 #warn "cond: $cond\n$sub\n"; 5873 if( $@) 5874 { croak "wrong navigation condition '$original_cond' ($@);" } 5875 return $s; 5876 } 5877 5878 sub _gi_test 5879 { my( $full_gi)= @_; 5880 5881 # optimize if the gi exists, including the case where the gi includes a dot 5882 my $index= $XML::Twig::gi2index{$full_gi}; 5883 if( $index) { return qq{\$_[0]->{gi} == $index}; } 5884 5885 my( $gi, $class, $id)= $full_gi=~ m{^(.*?)(?:[.]([^.]*)|[#](.*))?$}; 5886 5887 my $gi_test=''; 5888 if( $gi && $gi ne '*' ) 5889 { # 2 options, depending on whether the gi exists in gi2index 5890 # start optimization 5891 my $index= $XML::Twig::gi2index{$gi}; 5892 if( $index) 5893 { # the gi exists, use its index as a faster shortcut 5894 $gi_test = qq{\$_[0]->{gi} == $index}; 5895 } 5896 else 5897 # end optimization 5898 { # it does not exist (but might be created later), compare the strings 5899 $gi_test = qq{ \$_[0]->gi eq "$gi"}; 5900 } 5901 } 5902 else 5903 { $gi_test= 1; } 5904 5905 my $class_test=''; 5906 #warn "class: '$class'"; 5907 if( $class) 5908 { $class_test = qq{ defined( \$_[0]->{att}->{class}) && \$_[0]->{att}->{class}=~ m{\\b$class\\b} }; } 5909 5910 my $id_test=''; 5911 #warn "id: '$id'"; 5912 if( $id) 5913 { $id_test = qq{ defined( \$_[0]->{att}->{$ID}) && \$_[0]->{att}->{$ID} eq '$id' }; } 5914 5915 5916 #warn "gi_test: '$gi_test' - class_test: '$class_test' returning ", _and( $gi_test, $class_test); 5917 return _and( $gi_test, $class_test, $id_test); 5918 } 5919 5920 5921 # input: the original predicate 5922 sub _parse_predicate_in_step 5923 { my $cond= shift; 5924 my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le '); 5925 5926 $cond=~ s{^\s*\[\s*}{}; 5927 $cond=~ s{\s*\]\s*$}{}; 5928 $cond=~ s{( ($REG_STRING|$REG_REGEXP) # strings or regexps 5929 |\@($REG_TAG_NAME)(?=\s*(?:[><=!]|!~|=~)) # @att (followed by a comparison operator) 5930 |\@($REG_TAG_NAME) # @att (not followed by a comparison operator) 5931 |=~|!~ # matching operators 5932 |([><]=?|=|!=)(?=\s*[\d+-]) # test before a number 5933 |([><]=?|=|!=) # test, other cases 5934 |($REG_FUNCTION) # no arg functions 5935 # this bit is a mess, but it is the only solution with this half-baked parser 5936 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child) =~ /regexp/ 5937 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*!?=\s*$REG_VALUE) # string( child) = "value" (or !=) 5938 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*[<>]=?\s*$REG_VALUE) # string( child) > "value" 5939 |(and|or) 5940 )} 5941 { my( $token, $string, $att, $bare_att, $num_test, $alpha_test, $func, $string_regexp, $string_eq, $string_test, $and_or) 5942 = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11); 5943 5944 if( defined $string) { $token } 5945 elsif( $att) { "( \$_[0]->{att} && exists( \$_[0]->{att}->{'$att'}) && \$_[0]->{att}->{'$att'})"; } 5946 elsif( $bare_att) { "(\$_[0]->{att} && defined( \$_[0]->{att}->{'$bare_att'}))"; } 5947 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged 5948 elsif( $alpha_test) { $PERL_ALPHA_TEST{$alpha_test} } 5949 elsif( $func && $func=~ m{^(?:string|text)}) 5950 { "\$_[0]->text"; } 5951 elsif( $string_regexp && $string_regexp =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)}) 5952 { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; } 5953 elsif( $string_eq && $string_eq =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*(!?=)\s*($REG_VALUE)}) 5954 {"(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $PERL_ALPHA_TEST{$2} $3) } 1, \$_[0]->_children)"; } 5955 elsif( $string_test && $string_test =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*([<>]=?)\s*($REG_VALUE)}) 5956 { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; } 5957 elsif( $and_or) { $and_or eq 'and' ? '&&' : '||' ; } 5958 else { $token; } 5959 }gexs; 5960 return "($cond)"; 5961 } 5962 5963 5964 sub _op 5965 { my $op= shift; 5966 if( $op eq '=') { $op= 'eq'; } 5967 elsif( $op eq '!=') { $op= 'ne'; } 5968 return $op; 5969 } 5970 5971 sub passes 5972 { my( $elt, $cond)= @_; 5973 return $elt unless $cond; 5974 my $sub= ($cond_cache{$cond} ||= _install_cond( $cond)); 5975 return $sub->( $elt); 5976 } 5977} 5978 5979sub set_parent 5980 { $_[0]->{parent}= $_[1]; 5981 if( $XML::Twig::weakrefs) { weaken( $_[0]->{parent}); } 5982 } 5983 5984sub parent 5985 { my $elt= shift; 5986 my $cond= shift || return $elt->{parent}; 5987 do { $elt= $elt->{parent} || return; } until ( $elt->passes( $cond)); 5988 return $elt; 5989 } 5990 5991sub set_first_child 5992 { $_[0]->{'first_child'}= $_[1]; 5993 } 5994 5995sub first_child 5996 { my $elt= shift; 5997 my $cond= shift || return $elt->{first_child}; 5998 my $child= $elt->{first_child}; 5999 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 6000 while( $child && !$test_cond->( $child)) 6001 { $child= $child->{next_sibling}; } 6002 return $child; 6003 } 6004 6005sub _first_child { return $_[0]->{first_child}; } 6006sub _last_child { return $_[0]->{last_child}; } 6007sub _next_sibling { return $_[0]->{next_sibling}; } 6008sub _prev_sibling { return $_[0]->{prev_sibling}; } 6009sub _parent { return $_[0]->{parent}; } 6010sub _next_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{next_sibling}) { push @siblings, $elt; } return @siblings; } 6011sub _prev_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{prev_sibling}) { push @siblings, $elt; } return @siblings; } 6012 6013# sets a field 6014# arguments $record, $cond, @content 6015sub set_field 6016 { my $record = shift; 6017 my $cond = shift; 6018 my $child= $record->first_child( $cond); 6019 if( $child) 6020 { $child->set_content( @_); } 6021 else 6022 { if( $cond=~ m{^\s*($REG_TAG_NAME)}) 6023 { my $gi= $1; 6024 $child= $record->insert_new_elt( last_child => $gi, @_); 6025 } 6026 else 6027 { croak "can't create a field name from $cond"; } 6028 } 6029 return $child; 6030 } 6031 6032sub set_last_child 6033 { $_[0]->{'last_child'}= $_[1]; 6034 delete $_->[0]->{empty}; 6035 if( $XML::Twig::weakrefs) { weaken( $_[0]->{'last_child'}); } 6036 } 6037 6038sub last_child 6039 { my $elt= shift; 6040 my $cond= shift || return $elt->{last_child}; 6041 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 6042 my $child= $elt->{last_child}; 6043 while( $child && !$test_cond->( $child) ) 6044 { $child= $child->{prev_sibling}; } 6045 return $child 6046 } 6047 6048 6049sub set_prev_sibling 6050 { $_[0]->{'prev_sibling'}= $_[1]; 6051 if( $XML::Twig::weakrefs) { weaken( $_[0]->{'prev_sibling'}); } 6052 } 6053 6054sub prev_sibling 6055 { my $elt= shift; 6056 my $cond= shift || return $elt->{prev_sibling}; 6057 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 6058 my $sibling= $elt->{prev_sibling}; 6059 while( $sibling && !$test_cond->( $sibling) ) 6060 { $sibling= $sibling->{prev_sibling}; } 6061 return $sibling; 6062 } 6063 6064sub set_next_sibling { $_[0]->{'next_sibling'}= $_[1]; } 6065 6066sub next_sibling 6067 { my $elt= shift; 6068 my $cond= shift || return $elt->{next_sibling}; 6069 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 6070 my $sibling= $elt->{next_sibling}; 6071 while( $sibling && !$test_cond->( $sibling) ) 6072 { $sibling= $sibling->{next_sibling}; } 6073 return $sibling; 6074 } 6075 6076# methods dealing with the class attribute, convenient if you work with xhtml 6077sub class { $_[0]->{att}->{class}; } 6078# lvalue version of class. separate from class to avoid problem like RT# 6079sub lclass 6080 :lvalue # > perl 5.5 6081 { $_[0]->{att}->{class}; } 6082 6083sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); } 6084 6085# adds a class to an element 6086sub add_to_class 6087 { my( $elt, $new_class)= @_; 6088 return $elt unless $new_class; 6089 my $class= $elt->class; 6090 my %class= $class ? map { $_ => 1 } split /\s+/, $class : (); 6091 $class{$new_class}= 1; 6092 $elt->set_class( join( ' ', sort keys %class)); 6093 } 6094 6095sub remove_class 6096 { my( $elt, $class_to_remove)= @_; 6097 return $elt unless $class_to_remove; 6098 my $class= $elt->class; 6099 my %class= $class ? map { $_ => 1 } split /\s+/, $class : (); 6100 delete $class{$class_to_remove}; 6101 $elt->set_class( join( ' ', sort keys %class)); 6102 } 6103 6104sub att_to_class { my( $elt, $att)= @_; $elt->set_class( $elt->{'att'}->{$att}); } 6105sub add_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); } 6106sub move_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); 6107 $elt->del_att( $att); 6108 } 6109sub tag_to_class { my( $elt)= @_; $elt->set_class( $elt->tag); } 6110sub add_tag_to_class { my( $elt)= @_; $elt->add_to_class( $elt->tag); } 6111sub set_tag_class { my( $elt, $new_tag)= @_; $elt->add_tag_to_class; $elt->set_tag( $new_tag); } 6112 6113sub tag_to_span 6114 { my( $elt)= @_; 6115 $elt->set_class( $elt->tag) unless( $elt->tag eq 'span' && $elt->class); # set class to span unless it would mean replacing it with span 6116 $elt->set_tag( 'span'); 6117 } 6118 6119sub tag_to_div 6120 { my( $elt)= @_; 6121 $elt->set_class( $elt->tag) unless( $elt->tag eq 'div' && $elt->class); # set class to div unless it would mean replacing it with div 6122 $elt->set_tag( 'div'); 6123 } 6124 6125sub in_class 6126 { my( $elt, $class)= @_; 6127 my $elt_class= $elt->class; 6128 return unless( defined $elt_class); 6129 return $elt->class=~ m{(?:^|\s)\Q$class\E(?:\s|$)} ? $elt : 0; 6130 } 6131 6132 6133# get or set all attributes 6134# argument can be a hash or a hashref 6135sub set_atts 6136 { my $elt= shift; 6137 my %atts; 6138 tie %atts, 'Tie::IxHash' if( keep_atts_order()); 6139 %atts= @_ == 1 ? %{$_[0]} : @_; 6140 $elt->{att}= \%atts; 6141 if( exists $atts{$ID}) { $elt->_set_id( $atts{$ID}); } 6142 return $elt; 6143 } 6144 6145sub atts { return $_[0]->{att}; } 6146sub att_names { return (sort keys %{$_[0]->{att}}); } 6147sub del_atts { $_[0]->{att}={}; return $_[0]; } 6148 6149# get or set a single attribute (set works for several atts) 6150sub set_att 6151 { my $elt= shift; 6152 6153 if( $_[0] && ref( $_[0]) && !$_[1]) 6154 { croak "improper call to set_att, usage is \$elt->set_att( att1 => 'val1', att2 => 'val2',...)"; } 6155 6156 unless( $elt->{att}) 6157 { $elt->{att}={}; 6158 tie %{$elt->{att}}, 'Tie::IxHash' if( keep_atts_order()); 6159 } 6160 6161 while(@_) 6162 { my( $att, $val)= (shift, shift); 6163 $elt->{att}->{$att}= $val; 6164 if( $att eq $ID) { $elt->_set_id( $val); } 6165 } 6166 return $elt; 6167 } 6168 6169sub att { $_[0]->{att}->{$_[1]}; } 6170# lvalue version of att. separate from class to avoid problem like RT# 6171sub latt 6172 :lvalue # > perl 5.5 6173 { $_[0]->{att}->{$_[1]}; } 6174 6175sub del_att 6176 { my $elt= shift; 6177 while( @_) { delete $elt->{'att'}->{shift()}; } 6178 return $elt; 6179 } 6180 6181sub att_exists { return exists $_[0]->{att}->{$_[1]}; } 6182 6183# delete an attribute from all descendants of an element 6184sub strip_att 6185 { my( $elt, $att)= @_; 6186 $_->del_att( $att) foreach ($elt->descendants_or_self( qq{*[\@$att]})); 6187 return $elt; 6188 } 6189 6190sub change_att_name 6191 { my( $elt, $old_name, $new_name)= @_; 6192 my $value= $elt->{'att'}->{$old_name}; 6193 return $elt unless( defined $value); 6194 $elt->del_att( $old_name) 6195 ->set_att( $new_name => $value); 6196 return $elt; 6197 } 6198 6199sub lc_attnames 6200 { my $elt= shift; 6201 foreach my $att ($elt->att_names) 6202 { if( $att ne lc $att) { $elt->change_att_name( $att, lc $att); } } 6203 return $elt; 6204 } 6205 6206sub set_twig_current { $_[0]->{twig_current}=1; } 6207sub del_twig_current { delete $_[0]->{twig_current}; } 6208 6209 6210# get or set the id attribute 6211sub set_id 6212 { my( $elt, $id)= @_; 6213 $elt->del_id() if( exists $elt->{att}->{$ID}); 6214 $elt->set_att($ID, $id); 6215 $elt->_set_id( $id); 6216 return $elt; 6217 } 6218 6219# only set id, does not update the attribute value 6220sub _set_id 6221 { my( $elt, $id)= @_; 6222 my $t= $elt->twig || $elt; 6223 $t->{twig_id_list}->{$id}= $elt; 6224 if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 6225 return $elt; 6226 } 6227 6228sub id { return $_[0]->{att}->{$ID}; } 6229 6230# methods used to add ids to elements that don't have one 6231BEGIN 6232{ my $id_nb = "0001"; 6233 my $id_seed = "twig_id_"; 6234 6235 sub set_id_seed ## no critic (Subroutines::ProhibitNestedSubs); 6236 { $id_seed= $_[1]; $id_nb=1; } 6237 6238 sub add_id ## no critic (Subroutines::ProhibitNestedSubs); 6239 { my $elt= shift; 6240 if( defined $elt->{'att'}->{$ID}) 6241 { return $elt->{'att'}->{$ID}; } 6242 else 6243 { my $id= $_[0] && ref( $_[0]) && isa( $_[0], 'CODE') ? $_[0]->( $elt) : $id_seed . $id_nb++; 6244 $elt->set_id( $id); 6245 return $id; 6246 } 6247 } 6248} 6249 6250 6251 6252# delete the id attribute and remove the element from the id list 6253sub del_id 6254 { my $elt= shift; 6255 if( ! exists $elt->{att}->{$ID}) { return $elt }; 6256 my $id= $elt->{att}->{$ID}; 6257 6258 delete $elt->{att}->{$ID}; 6259 6260 my $t= shift || $elt->twig; 6261 unless( $t) { return $elt; } 6262 if( exists $t->{twig_id_list}->{$id}) { delete $t->{twig_id_list}->{$id}; } 6263 6264 return $elt; 6265 } 6266 6267# return the list of children 6268sub children 6269 { my $elt= shift; 6270 my @children; 6271 my $child= $elt->first_child( @_); 6272 while( $child) 6273 { push @children, $child; 6274 $child= $child->next_sibling( @_); 6275 } 6276 return @children; 6277 } 6278 6279sub _children 6280 { my $elt= shift; 6281 my @children=(); 6282 my $child= $elt->{first_child}; 6283 while( $child) 6284 { push @children, $child; 6285 $child= $child->{next_sibling}; 6286 } 6287 return @children; 6288 } 6289 6290sub children_copy 6291 { my $elt= shift; 6292 my @children; 6293 my $child= $elt->first_child( @_); 6294 while( $child) 6295 { push @children, $child->copy; 6296 $child= $child->next_sibling( @_); 6297 } 6298 return @children; 6299 } 6300 6301 6302sub children_count 6303 { my $elt= shift; 6304 my $cond= shift; 6305 my $count=0; 6306 my $child= $elt->{first_child}; 6307 while( $child) 6308 { $count++ if( $child->passes( $cond)); 6309 $child= $child->{next_sibling}; 6310 } 6311 return $count; 6312 } 6313 6314sub children_text 6315 { my $elt= shift; 6316 return wantarray() ? map { $_->text} $elt->children( @_) 6317 : join( '', map { $_->text} $elt->children( @_) ) 6318 ; 6319 } 6320 6321sub children_trimmed_text 6322 { my $elt= shift; 6323 return wantarray() ? map { $_->trimmed_text} $elt->children( @_) 6324 : join( '', map { $_->trimmed_text} $elt->children( @_) ) 6325 ; 6326 } 6327 6328sub all_children_are 6329 { my( $parent, $cond)= @_; 6330 foreach my $child ($parent->_children) 6331 { return 0 unless( $child->passes( $cond)); } 6332 return $parent; 6333 } 6334 6335 6336sub ancestors 6337 { my( $elt, $cond)= @_; 6338 my @ancestors; 6339 while( $elt->{parent}) 6340 { $elt= $elt->{parent}; 6341 push @ancestors, $elt if( $elt->passes( $cond)); 6342 } 6343 return @ancestors; 6344 } 6345 6346sub ancestors_or_self 6347 { my( $elt, $cond)= @_; 6348 my @ancestors; 6349 while( $elt) 6350 { push @ancestors, $elt if( $elt->passes( $cond)); 6351 $elt= $elt->{parent}; 6352 } 6353 return @ancestors; 6354 } 6355 6356 6357sub _ancestors 6358 { my( $elt, $include_self)= @_; 6359 my @ancestors= $include_self ? ($elt) : (); 6360 while( $elt= $elt->{parent}) { push @ancestors, $elt; } 6361 return @ancestors; 6362 } 6363 6364 6365sub inherit_att 6366 { my $elt= shift; 6367 my $att= shift; 6368 my %tags= map { ($_, 1) } @_; 6369 6370 do 6371 { if( (defined $elt->{'att'}->{$att}) 6372 && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]}) 6373 ) 6374 { return $elt->{'att'}->{$att}; } 6375 } while( $elt= $elt->{parent}); 6376 return undef; 6377 } 6378 6379sub _inherit_att_through_cut 6380 { my $elt= shift; 6381 my $att= shift; 6382 my %tags= map { ($_, 1) } @_; 6383 6384 do 6385 { if( (defined $elt->{'att'}->{$att}) 6386 && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]}) 6387 ) 6388 { return $elt->{'att'}->{$att}; } 6389 } while( $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})); 6390 return undef; 6391 } 6392 6393 6394sub current_ns_prefixes 6395 { my $elt= shift; 6396 my %prefix; 6397 $prefix{''}=1 if( $elt->namespace( '')); 6398 while( $elt) 6399 { my @ns= grep { !m{^xml} } map { m{^([^:]+):} } ($XML::Twig::index2gi[$elt->{'gi'}], $elt->att_names); 6400 $prefix{$_}=1 foreach (@ns); 6401 $elt= $elt->{parent}; 6402 } 6403 6404 return (sort keys %prefix); 6405 } 6406 6407# kinda counter-intuitive actually: 6408# the next element is found by looking for the next open tag after from the 6409# current one, which is the first child, if it exists, or the next sibling 6410# or the first next sibling of an ancestor 6411# optional arguments are: 6412# - $subtree_root: a reference to an element, when the next element is not 6413# within $subtree_root anymore then next_elt returns undef 6414# - $cond: a condition, next_elt returns the next element matching the condition 6415 6416sub next_elt 6417 { my $elt= shift; 6418 my $subtree_root= 0; 6419 $subtree_root= shift if( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')); 6420 my $cond= shift; 6421 my $next_elt; 6422 6423 my $ind; # optimization 6424 my $test_cond; 6425 if( $cond) # optimization 6426 { unless( defined( $ind= $XML::Twig::gi2index{$cond}) ) # optimization 6427 { $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); } # optimization 6428 } # optimization 6429 6430 do 6431 { if( $next_elt= $elt->{first_child}) 6432 { # simplest case: the elt has a child 6433 } 6434 elsif( $next_elt= $elt->{next_sibling}) 6435 { # no child but a next sibling (just check we stay within the subtree) 6436 6437 # case where elt is subtree_root, is empty and has a sibling 6438 return undef if( $subtree_root && ($elt == $subtree_root)); 6439 6440 } 6441 else 6442 { # case where the element has no child and no next sibling: 6443 # get the first next sibling of an ancestor, checking subtree_root 6444 6445 # case where elt is subtree_root, is empty and has no sibling 6446 return undef if( $subtree_root && ($elt == $subtree_root)); 6447 6448 $next_elt= $elt->{parent} || return undef; 6449 6450 until( $next_elt->{next_sibling}) 6451 { return undef if( $subtree_root && ($subtree_root == $next_elt)); 6452 $next_elt= $next_elt->{parent} || return undef; 6453 } 6454 return undef if( $subtree_root && ($subtree_root == $next_elt)); 6455 $next_elt= $next_elt->{next_sibling}; 6456 } 6457 $elt= $next_elt; # just in case we need to loop 6458 } until( ! defined $elt 6459 || ! defined $cond 6460 || (defined $ind && ($elt->{gi} eq $ind)) # optimization 6461 || (defined $test_cond && ($test_cond->( $elt))) 6462 ); 6463 6464 return $elt; 6465 } 6466 6467# return the next_elt within the element 6468# just call next_elt with the element as first and second argument 6469sub first_descendant { return $_[0]->next_elt( @_); } 6470 6471# get the last descendant, # then return the element found or call prev_elt with the condition 6472sub last_descendant 6473 { my( $elt, $cond)= @_; 6474 my $last_descendant= $elt->_last_descendant; 6475 if( !$cond || $last_descendant->matches( $cond)) 6476 { return $last_descendant; } 6477 else 6478 { return $last_descendant->prev_elt( $elt, $cond); } 6479 } 6480 6481# no argument allowed here, just go down the last_child recursively 6482sub _last_descendant 6483 { my $elt= shift; 6484 while( my $child= $elt->{last_child}) { $elt= $child; } 6485 return $elt; 6486 } 6487 6488# counter-intuitive too: 6489# the previous element is found by looking 6490# for the first open tag backwards from the current one 6491# it's the last descendant of the previous sibling 6492# if it exists, otherwise it's simply the parent 6493sub prev_elt 6494 { my $elt= shift; 6495 my $subtree_root= 0; 6496 if( defined $_[0] and (ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'))) 6497 { $subtree_root= shift ; 6498 return undef if( $elt == $subtree_root); 6499 } 6500 my $cond= shift; 6501 # get prev elt 6502 my $prev_elt; 6503 do 6504 { return undef if( $elt == $subtree_root); 6505 if( $prev_elt= $elt->{prev_sibling}) 6506 { while( $prev_elt->{last_child}) 6507 { $prev_elt= $prev_elt->{last_child}; } 6508 } 6509 else 6510 { $prev_elt= $elt->{parent} || return undef; } 6511 $elt= $prev_elt; # in case we need to loop 6512 } until( $elt->passes( $cond)); 6513 6514 return $elt; 6515 } 6516 6517sub _following_elt 6518 { my( $elt)= @_; 6519 while( $elt && !$elt->{next_sibling}) 6520 { $elt= $elt->{parent}; } 6521 return $elt ? $elt->{next_sibling} : undef; 6522 } 6523 6524sub following_elt 6525 { my( $elt, $cond)= @_; 6526 $elt= $elt->_following_elt || return undef; 6527 return $elt if( !$cond || $elt->matches( $cond)); 6528 return $elt->next_elt( $cond); 6529 } 6530 6531sub following_elts 6532 { my( $elt, $cond)= @_; 6533 if( !$cond) { undef $cond; } 6534 my $following= $elt->following_elt( $cond); 6535 if( $following) 6536 { my @followings= $following; 6537 while( $following= $following->next_elt( $cond)) 6538 { push @followings, $following; } 6539 return( @followings); 6540 } 6541 else 6542 { return (); } 6543 } 6544 6545sub _preceding_elt 6546 { my( $elt)= @_; 6547 while( $elt && !$elt->{prev_sibling}) 6548 { $elt= $elt->{parent}; } 6549 return $elt ? $elt->{prev_sibling}->_last_descendant : undef; 6550 } 6551 6552sub preceding_elt 6553 { my( $elt, $cond)= @_; 6554 $elt= $elt->_preceding_elt || return undef; 6555 return $elt if( !$cond || $elt->matches( $cond)); 6556 return $elt->prev_elt( $cond); 6557 } 6558 6559sub preceding_elts 6560 { my( $elt, $cond)= @_; 6561 if( !$cond) { undef $cond; } 6562 my $preceding= $elt->preceding_elt( $cond); 6563 if( $preceding) 6564 { my @precedings= $preceding; 6565 while( $preceding= $preceding->prev_elt( $cond)) 6566 { push @precedings, $preceding; } 6567 return( @precedings); 6568 } 6569 else 6570 { return (); } 6571 } 6572 6573# used in get_xpath 6574sub _self 6575 { my( $elt, $cond)= @_; 6576 return $cond ? $elt->matches( $cond) : $elt; 6577 } 6578 6579sub next_n_elt 6580 { my $elt= shift; 6581 my $offset= shift || return undef; 6582 foreach (1..$offset) 6583 { $elt= $elt->next_elt( @_) || return undef; } 6584 return $elt; 6585 } 6586 6587# checks whether $elt is included in $ancestor, returns 1 in that case 6588sub in 6589 { my ($elt, $ancestor)= @_; 6590 if( ref( $ancestor) && isa( $ancestor, 'XML::Twig::Elt')) 6591 { # element 6592 while( $elt= $elt->{parent}) { return $elt if( $elt == $ancestor); } 6593 } 6594 else 6595 { # condition 6596 while( $elt= $elt->{parent}) { return $elt if( $elt->matches( $ancestor)); } 6597 } 6598 return 0; 6599 } 6600 6601sub first_child_text 6602 { my $elt= shift; 6603 my $dest=$elt->first_child(@_) or return ''; 6604 return $dest->text; 6605 } 6606 6607sub fields 6608 { my $elt= shift; 6609 return map { $elt->field( $_) } @_; 6610 } 6611 6612sub first_child_trimmed_text 6613 { my $elt= shift; 6614 my $dest=$elt->first_child(@_) or return ''; 6615 return $dest->trimmed_text; 6616 } 6617 6618sub first_child_matches 6619 { my $elt= shift; 6620 my $dest= $elt->{first_child} or return undef; 6621 return $dest->passes( @_); 6622 } 6623 6624sub last_child_text 6625 { my $elt= shift; 6626 my $dest=$elt->last_child(@_) or return ''; 6627 return $dest->text; 6628 } 6629 6630sub last_child_trimmed_text 6631 { my $elt= shift; 6632 my $dest=$elt->last_child(@_) or return ''; 6633 return $dest->trimmed_text; 6634 } 6635 6636sub last_child_matches 6637 { my $elt= shift; 6638 my $dest= $elt->{last_child} or return undef; 6639 return $dest->passes( @_); 6640 } 6641 6642sub child_text 6643 { my $elt= shift; 6644 my $dest=$elt->child(@_) or return ''; 6645 return $dest->text; 6646 } 6647 6648sub child_trimmed_text 6649 { my $elt= shift; 6650 my $dest=$elt->child(@_) or return ''; 6651 return $dest->trimmed_text; 6652 } 6653 6654sub child_matches 6655 { my $elt= shift; 6656 my $nb= shift; 6657 my $dest= $elt->child( $nb) or return undef; 6658 return $dest->passes( @_); 6659 } 6660 6661sub prev_sibling_text 6662 { my $elt= shift; 6663 my $dest= $elt->_prev_sibling(@_) or return ''; 6664 return $dest->text; 6665 } 6666 6667sub prev_sibling_trimmed_text 6668 { my $elt= shift; 6669 my $dest= $elt->_prev_sibling(@_) or return ''; 6670 return $dest->trimmed_text; 6671 } 6672 6673sub prev_sibling_matches 6674 { my $elt= shift; 6675 my $dest= $elt->{prev_sibling} or return undef; 6676 return $dest->passes( @_); 6677 } 6678 6679sub next_sibling_text 6680 { my $elt= shift; 6681 my $dest= $elt->next_sibling(@_) or return ''; 6682 return $dest->text; 6683 } 6684 6685sub next_sibling_trimmed_text 6686 { my $elt= shift; 6687 my $dest= $elt->next_sibling(@_) or return ''; 6688 return $dest->trimmed_text; 6689 } 6690 6691sub next_sibling_matches 6692 { my $elt= shift; 6693 my $dest= $elt->{next_sibling} or return undef; 6694 return $dest->passes( @_); 6695 } 6696 6697sub prev_elt_text 6698 { my $elt= shift; 6699 my $dest= $elt->prev_elt(@_) or return ''; 6700 return $dest->text; 6701 } 6702 6703sub prev_elt_trimmed_text 6704 { my $elt= shift; 6705 my $dest= $elt->prev_elt(@_) or return ''; 6706 return $dest->trimmed_text; 6707 } 6708 6709sub prev_elt_matches 6710 { my $elt= shift; 6711 my $dest= $elt->prev_elt or return undef; 6712 return $dest->passes( @_); 6713 } 6714 6715sub next_elt_text 6716 { my $elt= shift; 6717 my $dest= $elt->next_elt(@_) or return ''; 6718 return $dest->text; 6719 } 6720 6721sub next_elt_trimmed_text 6722 { my $elt= shift; 6723 my $dest= $elt->next_elt(@_) or return ''; 6724 return $dest->trimmed_text; 6725 } 6726 6727sub next_elt_matches 6728 { my $elt= shift; 6729 my $dest= $elt->next_elt or return undef; 6730 return $dest->passes( @_); 6731 } 6732 6733sub parent_text 6734 { my $elt= shift; 6735 my $dest= $elt->parent(@_) or return ''; 6736 return $dest->text; 6737 } 6738 6739sub parent_trimmed_text 6740 { my $elt= shift; 6741 my $dest= $elt->parent(@_) or return ''; 6742 return $dest->trimmed_text; 6743 } 6744 6745sub parent_matches 6746 { my $elt= shift; 6747 my $dest= $elt->{parent} or return undef; 6748 return $dest->passes( @_); 6749 } 6750 6751sub is_first_child 6752 { my $elt= shift; 6753 my $parent= $elt->{parent} or return 0; 6754 my $first_child= $parent->first_child( @_) or return 0; 6755 return ($first_child == $elt) ? $elt : 0; 6756 } 6757 6758sub is_last_child 6759 { my $elt= shift; 6760 my $parent= $elt->{parent} or return 0; 6761 my $last_child= $parent->last_child( @_) or return 0; 6762 return ($last_child == $elt) ? $elt : 0; 6763 } 6764 6765# returns the depth level of the element 6766# if 2 parameter are used then counts the 2cd element name in the 6767# ancestors list 6768sub level 6769 { my( $elt, $cond)= @_; 6770 my $level=0; 6771 my $name=shift || ''; 6772 while( $elt= $elt->{parent}) { $level++ if( !$cond || $elt->matches( $cond)); } 6773 return $level; 6774 } 6775 6776# checks whether $elt has an ancestor that satisfies $cond, returns the ancestor 6777sub in_context 6778 { my ($elt, $cond, $level)= @_; 6779 $level= -1 unless( $level) ; # $level-- will never hit 0 6780 6781 while( $level) 6782 { $elt= $elt->{parent} or return 0; 6783 if( $elt->matches( $cond)) { return $elt; } 6784 $level--; 6785 } 6786 return 0; 6787 } 6788 6789sub _descendants 6790 { my( $subtree_root, $include_self)= @_; 6791 my @descendants= $include_self ? ($subtree_root) : (); 6792 6793 my $elt= $subtree_root; 6794 my $next_elt; 6795 6796 MAIN: while( 1) 6797 { if( $next_elt= $elt->{first_child}) 6798 { # simplest case: the elt has a child 6799 } 6800 elsif( $next_elt= $elt->{next_sibling}) 6801 { # no child but a next sibling (just check we stay within the subtree) 6802 6803 # case where elt is subtree_root, is empty and has a sibling 6804 last MAIN if( $elt == $subtree_root); 6805 } 6806 else 6807 { # case where the element has no child and no next sibling: 6808 # get the first next sibling of an ancestor, checking subtree_root 6809 6810 # case where elt is subtree_root, is empty and has no sibling 6811 last MAIN if( $elt == $subtree_root); 6812 6813 # backtrack until we find a parent with a next sibling 6814 $next_elt= $elt->{parent} || last; 6815 until( $next_elt->{next_sibling}) 6816 { last MAIN if( $subtree_root == $next_elt); 6817 $next_elt= $next_elt->{parent} || last MAIN; 6818 } 6819 last MAIN if( $subtree_root == $next_elt); 6820 $next_elt= $next_elt->{next_sibling}; 6821 } 6822 $elt= $next_elt || last MAIN; 6823 push @descendants, $elt; 6824 } 6825 return @descendants; 6826 } 6827 6828 6829sub descendants 6830 { my( $subtree_root, $cond)= @_; 6831 my @descendants=(); 6832 my $elt= $subtree_root; 6833 6834 # this branch is pure optimization for speed: if $cond is a gi replace it 6835 # by the index of the gi and loop here 6836 # start optimization 6837 my $ind; 6838 if( !$cond || ( defined ( $ind= $XML::Twig::gi2index{$cond})) ) 6839 { 6840 my $next_elt; 6841 6842 while( 1) 6843 { if( $next_elt= $elt->{first_child}) 6844 { # simplest case: the elt has a child 6845 } 6846 elsif( $next_elt= $elt->{next_sibling}) 6847 { # no child but a next sibling (just check we stay within the subtree) 6848 6849 # case where elt is subtree_root, is empty and has a sibling 6850 last if( $subtree_root && ($elt == $subtree_root)); 6851 } 6852 else 6853 { # case where the element has no child and no next sibling: 6854 # get the first next sibling of an ancestor, checking subtree_root 6855 6856 # case where elt is subtree_root, is empty and has no sibling 6857 last if( $subtree_root && ($elt == $subtree_root)); 6858 6859 # backtrack until we find a parent with a next sibling 6860 $next_elt= $elt->{parent} || last undef; 6861 until( $next_elt->{next_sibling}) 6862 { last if( $subtree_root && ($subtree_root == $next_elt)); 6863 $next_elt= $next_elt->{parent} || last; 6864 } 6865 last if( $subtree_root && ($subtree_root == $next_elt)); 6866 $next_elt= $next_elt->{next_sibling}; 6867 } 6868 $elt= $next_elt || last; 6869 push @descendants, $elt if( !$cond || ($elt->{gi} eq $ind)); 6870 } 6871 } 6872 else 6873 # end optimization 6874 { # branch for a complex condition: use the regular (slow but simple) way 6875 while( $elt= $elt->next_elt( $subtree_root, $cond)) 6876 { push @descendants, $elt; } 6877 } 6878 return @descendants; 6879 } 6880 6881 6882sub descendants_or_self 6883 { my( $elt, $cond)= @_; 6884 my @descendants= $elt->passes( $cond) ? ($elt) : (); 6885 push @descendants, $elt->descendants( $cond); 6886 return @descendants; 6887 } 6888 6889sub sibling 6890 { my $elt= shift; 6891 my $nb= shift; 6892 if( $nb > 0) 6893 { foreach( 1..$nb) 6894 { $elt= $elt->next_sibling( @_) or return undef; } 6895 } 6896 elsif( $nb < 0) 6897 { foreach( 1..(-$nb)) 6898 { $elt= $elt->prev_sibling( @_) or return undef; } 6899 } 6900 else # $nb == 0 6901 { return $elt->passes( $_[0]); } 6902 return $elt; 6903 } 6904 6905sub sibling_text 6906 { my $elt= sibling( @_); 6907 return $elt ? $elt->text : undef; 6908 } 6909 6910 6911sub child 6912 { my $elt= shift; 6913 my $nb= shift; 6914 if( $nb >= 0) 6915 { $elt= $elt->first_child( @_) or return undef; 6916 foreach( 1..$nb) 6917 { $elt= $elt->next_sibling( @_) or return undef; } 6918 } 6919 else 6920 { $elt= $elt->last_child( @_) or return undef; 6921 foreach( 2..(-$nb)) 6922 { $elt= $elt->prev_sibling( @_) or return undef; } 6923 } 6924 return $elt; 6925 } 6926 6927sub prev_siblings 6928 { my $elt= shift; 6929 my @siblings=(); 6930 while( $elt= $elt->prev_sibling( @_)) 6931 { unshift @siblings, $elt; } 6932 return @siblings; 6933 } 6934 6935sub siblings 6936 { my $elt= shift; 6937 return grep { $_ ne $elt } $elt->{parent}->children( @_); 6938 } 6939 6940sub pos 6941 { my $elt= shift; 6942 return 0 if ($_[0] && !$elt->matches( @_)); 6943 my $pos=1; 6944 $pos++ while( $elt= $elt->prev_sibling( @_)); 6945 return $pos; 6946 } 6947 6948 6949sub next_siblings 6950 { my $elt= shift; 6951 my @siblings=(); 6952 while( $elt= $elt->next_sibling( @_)) 6953 { push @siblings, $elt; } 6954 return @siblings; 6955 } 6956 6957 6958# used by get_xpath: parses the xpath expression and generates a sub that performs the 6959# search 6960{ my %axis2method; 6961 BEGIN { %axis2method= ( child => 'children', 6962 descendant => 'descendants', 6963 'descendant-or-self' => 'descendants_or_self', 6964 parent => 'parent_is', 6965 ancestor => 'ancestors', 6966 'ancestor-or-self' => 'ancestors_or_self', 6967 'following-sibling' => 'next_siblings', 6968 'preceding-sibling' => 'prev_siblings', 6969 following => 'following_elts', 6970 preceding => 'preceding_elts', 6971 self => '_self', 6972 ); 6973 } 6974 6975 sub _install_xpath 6976 { my( $xpath_exp, $type)= @_; 6977 my $original_exp= $xpath_exp; 6978 my $sub= 'my $elt= shift; my @results;'; 6979 6980 # grab the root if expression starts with a / 6981 if( $xpath_exp=~ s{^/}{}) 6982 { $sub .= '@results= ($elt->twig) || croak "cannot use an XPath query starting with a / on a node not attached to a whole twig";'; } 6983 elsif( $xpath_exp=~ s{^\./}{}) 6984 { $sub .= '@results= ($elt);'; } 6985 else 6986 { $sub .= '@results= ($elt);'; } 6987 6988 6989 #warn "xpath_exp= '$xpath_exp'\n"; 6990 6991 while( $xpath_exp && 6992 $xpath_exp=~s{^\s*(/?) 6993 # the xxx=~/regexp/ is a pain as it includes / 6994 (\s*(?:(?:($REG_AXIS)::)?(\*|$REG_TAG_PART|\.\.|\.)\s*)?($REG_PREDICATE_ALT*) 6995 ) 6996 (/|$)}{}xo) 6997 6998 { my( $wildcard, $sub_exp, $axis, $gi, $predicates)= ($1, $2, $3, $4, $5); 6999 if( $axis && ! $gi) 7000 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp"); } 7001 7002 # grab a parent 7003 if( $sub_exp eq '..') 7004 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp") if( $wildcard); 7005 $sub .= '@results= map { $_->{parent}} @results;'; 7006 } 7007 # test the element itself 7008 elsif( $sub_exp=~ m{^\.(.*)$}s) 7009 { $sub .= "\@results= grep { \$_->matches( q{$1}) } \@results;" } 7010 # grab children 7011 else 7012 { 7013 if( !$axis) 7014 { $axis= $wildcard ? 'descendant' : 'child'; } 7015 if( !$gi or $gi eq '*') { $gi=''; } 7016 my $function; 7017 7018 # "special" predicates, that return just one element 7019 if( $predicates && ($predicates =~ m{^\s*\[\s*((-\s*)?\d+)\s*\]\s*$})) 7020 { # [<nb>] 7021 my $offset= $1; 7022 $offset-- if( $offset > 0); 7023 $function= $axis eq 'descendant' ? "next_n_elt( $offset, '$gi')" 7024 : $axis eq 'child' ? "child( $offset, '$gi')" 7025 : _croak_and_doublecheck_xpath( $original_exp, "error [$1] not supported along axis '$axis'") 7026 ; 7027 $sub .= "\@results= grep { \$_ } map { \$_->$function } \@results;" 7028 } 7029 elsif( $predicates && ($predicates =~ m{^\s*\[\s*last\s*\(\s*\)\s*\]\s*$}) ) 7030 { # last() 7031 _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp, usage of // and last() not supported") if( $wildcard); 7032 $sub .= "\@results= map { \$_->last_child( '$gi') } \@results;"; 7033 } 7034 else 7035 { # follow the axis 7036 #warn "axis: '$axis' - method: '$axis2method{$axis}' - gi: '$gi'\n"; 7037 7038 my $follow_axis= " \$_->$axis2method{$axis}( '$gi')"; 7039 my $step= $follow_axis; 7040 7041 # now filter using the predicate 7042 while( $predicates=~ s{^\s*($REG_PREDICATE_ALT)\s*}{}o) 7043 { my $pred= $1; 7044 $pred=~ s{^\s*\[\s*}{}; 7045 $pred=~ s{\s*\]\s*$}{}; 7046 my $test=""; 7047 my $pos; 7048 if( $pred=~ m{^(-?\s*\d+)$}) 7049 { my $pos= $1; 7050 if( $step=~ m{^\s*grep(.*) (\$_->\w+\(\s*'[^']*'\s*\))}) 7051 { $step= "XML::Twig::_first_n $1 $pos, $2"; } 7052 else 7053 { if( $pos > 0) { $pos--; } 7054 $step= "($step)[$pos]"; 7055 } 7056 #warn "number predicate '$pos' - generated step '$step'\n"; 7057 } 7058 else 7059 { my $syntax_error=0; 7060 do 7061 { if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_STRING)\s*}{}o) # string()="string" pred 7062 { $test .= "\$_->text eq $1"; } 7063 elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_STRING)\s*}{}o) # string()!="string" pred 7064 { $test .= "\$_->text ne $1"; } 7065 if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_NUMBER)\s*}{}o) # string()=<number> pred 7066 { $test .= "\$_->text eq $1"; } 7067 elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_NUMBER)\s*}{}o) # string()!=<number> pred 7068 { $test .= "\$_->text ne $1"; } 7069 elsif( $pred =~ s{^string\(\s*\)\s*(>|<|>=|<=)\s*($REG_NUMBER)\s*}{}o) # string()!=<number> pred 7070 { $test .= "\$_->text $1 $2"; } 7071 7072 elsif( $pred =~ s{^string\(\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # string()=~/regex/ pred 7073 { my( $match, $regexp)= ($1, $2); 7074 $test .= "\$_->text $match $regexp"; 7075 } 7076 elsif( $pred =~ s{^string\(\s*\)\s*}{}o) # string() pred 7077 { $test .= "\$_->text"; } 7078 elsif( $pred=~ s{^@($REG_TAG_NAME)\s*($REG_OP)\s*($REG_STRING|$REG_NUMBER)}{}o) # @att="val" pred 7079 { my( $att, $oper, $val)= ($1, _op( $2), $3); 7080 $test .= qq{((defined \$_->{'att'}->{"$att"}) && (\$_->{'att'}->{"$att"} $oper $val))}; 7081 } 7082 elsif( $pred =~ s{^@($REG_TAG_NAME)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # @att=~/regex/ pred XXX 7083 { my( $att, $match, $regexp)= ($1, $2, $3); 7084 $test .= qq{((defined \$_->{'att'}->{"$att"}) && (\$_->{'att'}->{"$att"} $match $regexp))};; 7085 } 7086 elsif( $pred=~ s{^@($REG_TAG_NAME)\s*}{}o) # @att pred 7087 { $test .= qq{(defined \$_->{'att'}->{"$1"})}; } 7088 elsif( $pred=~ s{^\s*(?:not|!)\s*@($REG_TAG_NAME)\s*}{}o) # not @att pred 7089 { $test .= qq{((\$_->is_elt) && (not defined \$_->{'att'}->{"$1"}))}; } 7090 elsif( $pred=~ s{^\s*([()])}{}) # ( or ) (just add to the test) 7091 { $test .= qq{$1}; } 7092 elsif( $pred=~ s{^\s*(and|or)\s*}{}) 7093 { $test .= lc " $1 "; } 7094 else 7095 { $syntax_error=1; } 7096 7097 } while( !$syntax_error && $pred); 7098 _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp at $pred") if( $pred); 7099 $step= " grep { $test } $step "; 7100 } 7101 } 7102 #warn "step: '$step'"; 7103 $sub .= "\@results= grep defined, map { $step } \@results;"; 7104 } 7105 } 7106 } 7107 7108 if( $xpath_exp) 7109 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp around $xpath_exp"); } 7110 7111 $sub .= q{return XML::Twig::_unique_elts( @results); }; 7112 #warn "generated: '$sub'\n"; 7113 my $s= eval "sub { $NO_WARNINGS; $sub }"; 7114 if( $@) 7115 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp ($@);") } 7116 return( $s); 7117 } 7118} 7119 7120sub _croak_and_doublecheck_xpath 7121 { my $xpath_expression= shift; 7122 my $mess= join( "\n", @_); 7123 if( $XML::Twig::XPath::VERSION || 0) 7124 { my $check_twig= XML::Twig::XPath->new; 7125 if( eval { $check_twig->{twig_xp}->_parse( $xpath_expression) }) 7126 { $mess .= "\nthe expression is a valid XPath statement, and you are using XML::Twig::XPath, but" 7127 . "\nyou are using either 'find_nodes' or 'get_xpath' where the method you likely wanted" 7128 . "\nto use is 'findnodes', which is the only one that uses the full XPath engine\n"; 7129 } 7130 } 7131 croak $mess; 7132 } 7133 7134 7135 7136{ # extremely elaborate caching mechanism 7137 my %xpath; # xpath_expression => subroutine_code; 7138 sub get_xpath 7139 { my( $elt, $xpath_exp, $offset)= @_; 7140 my $sub= ($xpath{$xpath_exp} ||= _install_xpath( $xpath_exp)); 7141 return $sub->( $elt) unless( defined $offset); 7142 my @res= $sub->( $elt); 7143 return $res[$offset]; 7144 } 7145} 7146 7147 7148sub findvalues 7149 { my $elt= shift; 7150 return map { $_->text } $elt->get_xpath( @_); 7151 } 7152 7153sub findvalue 7154 { my $elt= shift; 7155 return join '', map { $_->text } $elt->get_xpath( @_); 7156 } 7157 7158 7159# XML::XPath compatibility 7160sub getElementById { return $_[0]->twig->elt_id( $_[1]); } 7161sub getChildNodes { my @children= do { my $elt= $_[0]; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; return wantarray ? @children : \@children; } 7162 7163sub _flushed { return $_[0]->{flushed}; } 7164sub _set_flushed { $_[0]->{flushed}=1; } 7165sub _del_flushed { delete $_[0]->{flushed}; } 7166 7167sub cut 7168 { my $elt= shift; 7169 my( $parent, $prev_sibling, $next_sibling); 7170 $parent= $elt->{parent}; 7171 if( ! $parent && $elt->is_elt) 7172 { # are we cutting the root? 7173 my $t= $elt->{twig}; 7174 if( $t && ! $t->{twig_parsing}) 7175 { delete $t->{twig_root}; 7176 delete $elt->{twig}; 7177 return $elt; 7178 } # cutt`ing the root 7179 else 7180 { return; } # cutting an orphan, returning $elt would break backward compatibility 7181 } 7182 7183 # save the old links, that'll make it easier for some loops 7184 foreach my $link ( qw(parent prev_sibling next_sibling) ) 7185 { $elt->{former}->{$link}= $elt->{$link}; 7186 if( $XML::Twig::weakrefs) { weaken( $elt->{former}->{$link}); } 7187 } 7188 7189 # if we cut the current element then its parent becomes the current elt 7190 if( $elt->{twig_current}) 7191 { my $twig_current= $elt->{parent}; 7192 $elt->twig->{twig_current}= $twig_current; 7193 $twig_current->{'twig_current'}=1; 7194 delete $elt->{'twig_current'}; 7195 } 7196 7197 if( $parent->{first_child} && $parent->{first_child} == $elt) 7198 { $parent->{first_child}= $elt->{next_sibling}; 7199 # cutting can make the parent empty 7200 if( ! $parent->{first_child}) { $parent->{empty}= 1; } 7201 } 7202 7203 if( $parent->{last_child} && $parent->{last_child} == $elt) 7204 { delete $parent->{empty}; $parent->{last_child}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 7205 } 7206 7207 if( $prev_sibling= $elt->{prev_sibling}) 7208 { $prev_sibling->{next_sibling}= $elt->{next_sibling}; } 7209 if( $next_sibling= $elt->{next_sibling}) 7210 { $next_sibling->{prev_sibling}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 7211 7212 7213 $elt->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7214 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7215 $elt->{next_sibling}= undef; 7216 7217 # merge 2 (now) consecutive text nodes if they are of the same type 7218 # (type can be PCDATA or CDATA) 7219 if( $prev_sibling && $next_sibling && $prev_sibling->is_text && ( $XML::Twig::index2gi[$prev_sibling->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}])) 7220 { $prev_sibling->merge_text( $next_sibling); } 7221 7222 return $elt; 7223 } 7224 7225 7226sub former_next_sibling { return $_[0]->{former}->{next_sibling}; } 7227sub former_prev_sibling { return $_[0]->{former}->{prev_sibling}; } 7228sub former_parent { return $_[0]->{former}->{parent}; } 7229 7230sub cut_children 7231 { my( $elt, $exp)= @_; 7232 my @children= $elt->children( $exp); 7233 foreach (@children) { $_->cut; } 7234 if( ! $elt->has_children) { $elt->{empty}= 1; } 7235 return @children; 7236 } 7237 7238sub cut_descendants 7239 { my( $elt, $exp)= @_; 7240 my @descendants= $elt->descendants( $exp); 7241 foreach ($elt->descendants( $exp)) { $_->cut; } 7242 if( ! $elt->has_children) { $elt->{empty}= 1; } 7243 return @descendants; 7244 } 7245 7246 7247sub erase 7248 { my $elt= shift; 7249 #you cannot erase the current element 7250 if( $elt->{twig_current}) 7251 { croak "trying to erase an element before it has been completely parsed"; } 7252 if( my $parent= $elt->{parent}) 7253 { # normal case 7254 $elt->_move_extra_data_after_erase; 7255 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7256 if( @children) 7257 { 7258 # elt has children, move them up 7259 7260 # the first child may need to be merged with a previous text 7261 my $first_child= shift @children; 7262 $first_child->move( before => $elt); 7263 my $prev= $first_child->{prev_sibling}; 7264 if( $prev && $prev->is_text && ($XML::Twig::index2gi[$first_child->{'gi'}] eq $XML::Twig::index2gi[$prev->{'gi'}]) ) 7265 { $prev->merge_text( $first_child); } 7266 7267 # move the rest of the children 7268 foreach my $child (@children) 7269 { $child->move( before => $elt); } 7270 7271 # now the elt had no child, delete it 7272 $elt->delete; 7273 7274 # now see if we need to merge the last child with the next element 7275 my $last_child= $children[-1] || $first_child; # if no last child, then it's also the first child 7276 my $next= $last_child->{next_sibling}; 7277 if( $next && $next->is_text && ($XML::Twig::index2gi[$last_child->{'gi'}] eq $XML::Twig::index2gi[$next->{'gi'}]) ) 7278 { $last_child->merge_text( $next); } 7279 7280 # if parsing and have now a PCDATA text, mark so we can normalize later on if need be 7281 if( $parent->{twig_current} && $last_child->is_text) { $parent->{twig_to_be_normalized}=1; } 7282 } 7283 else 7284 { # no children, just cut the elt 7285 $elt->delete; 7286 } 7287 } 7288 else 7289 { # trying to erase the root (of a twig or of a cut/new element) 7290 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7291 unless( @children == 1) 7292 { croak "can only erase an element with no parent if it has a single child"; } 7293 $elt->_move_extra_data_after_erase; 7294 my $child= shift @children; 7295 $child->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ; 7296 my $twig= $elt->twig; 7297 $twig->set_root( $child); 7298 } 7299 7300 return $elt; 7301 7302 } 7303 7304sub _move_extra_data_after_erase 7305 { my( $elt)= @_; 7306 # extra_data 7307 if( my $extra_data= $elt->{extra_data}) 7308 { my $target= $elt->{first_child} || $elt->{next_sibling}; 7309 if( $target) 7310 { 7311 if( $target->is( $ELT)) 7312 { $target->set_extra_data( $extra_data . ($target->extra_data || '')); } 7313 elsif( $target->is( $TEXT)) 7314 { $target->_unshift_extra_data_in_pcdata( $extra_data, 0); } # TO CHECK 7315 } 7316 else 7317 { my $parent= $elt->{parent}; # always exists or the erase cannot be performed 7318 $parent->_prefix_extra_data_before_end_tag( $extra_data); 7319 } 7320 } 7321 7322 # extra_data_before_end_tag 7323 if( my $extra_data= $elt->{extra_data_before_end_tag}) 7324 { if( my $target= $elt->{next_sibling}) 7325 { if( $target->is( $ELT)) 7326 { $target->set_extra_data( $extra_data . ($target->extra_data || '')); } 7327 elsif( $target->is( $TEXT)) 7328 { 7329 $target->_unshift_extra_data_in_pcdata( $extra_data, 0); 7330 } 7331 } 7332 elsif( my $parent= $elt->{parent}) 7333 { $parent->_prefix_extra_data_before_end_tag( $extra_data); } 7334 } 7335 7336 return $elt; 7337 7338 } 7339BEGIN 7340 { my %method= ( before => \&paste_before, 7341 after => \&paste_after, 7342 first_child => \&paste_first_child, 7343 last_child => \&paste_last_child, 7344 within => \&paste_within, 7345 ); 7346 7347 # paste elt somewhere around ref 7348 # pos can be first_child (default), last_child, before, after or within 7349 sub paste ## no critic (Subroutines::ProhibitNestedSubs); 7350 { my $elt= shift; 7351 if( $elt->{parent}) 7352 { croak "cannot paste an element that belongs to a tree"; } 7353 my $pos; 7354 my $ref; 7355 if( ref $_[0]) 7356 { $pos= 'first_child'; 7357 croak "wrong argument order in paste, should be $_[1] first" if($_[1]); 7358 } 7359 else 7360 { $pos= shift; } 7361 7362 if( my $method= $method{$pos}) 7363 { 7364 unless( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')) 7365 { if( ! defined( $_[0])) 7366 { croak "missing target in paste"; } 7367 elsif( ! ref( $_[0])) 7368 { croak "wrong target type in paste (not a reference), should be XML::Twig::Elt or a subclass"; } 7369 else 7370 { my $ref= ref $_[0]; 7371 croak "wrong target type in paste: '$ref', should be XML::Twig::Elt or a subclass"; 7372 } 7373 } 7374 $ref= $_[0]; 7375 # check here so error message lists the caller file/line 7376 if( !$ref->{parent} && ($pos=~ m{^(before|after)$}) && !(exists $elt->{'target'}) && !(exists $elt->{'comment'})) 7377 { croak "cannot paste $1 root"; } 7378 $elt->$method( @_); 7379 } 7380 else 7381 { croak "tried to paste in wrong position '$pos', allowed positions " . 7382 " are 'first_child', 'last_child', 'before', 'after' and " . 7383 "'within'"; 7384 } 7385 if( (my $ids= $elt->{twig_id_list}) && (my $t= $ref->twig) ) 7386 { $t->{twig_id_list}||={}; 7387 foreach my $id (keys %$ids) 7388 { $t->{twig_id_list}->{$id}= $ids->{$id}; 7389 if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 7390 } 7391 } 7392 return $elt; 7393 } 7394 7395 7396 sub paste_before 7397 { my( $elt, $ref)= @_; 7398 my( $parent, $prev_sibling, $next_sibling ); 7399 7400 # trying to paste before an orphan (root or detached wlt) 7401 unless( $ref->{parent}) 7402 { if( my $t= $ref->twig) 7403 { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this 7404 { $t->_add_cpi_outside_of_root( leading_cpi => $elt); return; } 7405 else 7406 { croak "cannot paste before root"; } 7407 } 7408 else 7409 { croak "cannot paste before an orphan element"; } 7410 } 7411 $parent= $ref->{parent}; 7412 $prev_sibling= $ref->{prev_sibling}; 7413 $next_sibling= $ref; 7414 7415 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7416 if( $parent->{first_child} == $ref) { $parent->{first_child}= $elt; } 7417 7418 if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; } 7419 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7420 7421 $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 7422 $elt->{next_sibling}= $ref; 7423 return $elt; 7424 } 7425 7426 sub paste_after 7427 { my( $elt, $ref)= @_; 7428 my( $parent, $prev_sibling, $next_sibling ); 7429 7430 # trying to paste after an orphan (root or detached wlt) 7431 unless( $ref->{parent}) 7432 { if( my $t= $ref->twig) 7433 { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this 7434 { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); return; } 7435 else 7436 { croak "cannot paste after root"; } 7437 } 7438 else 7439 { croak "cannot paste after an orphan element"; } 7440 } 7441 $parent= $ref->{parent}; 7442 $prev_sibling= $ref; 7443 $next_sibling= $ref->{next_sibling}; 7444 7445 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7446 if( $parent->{last_child}== $ref) { delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 7447 7448 $prev_sibling->{next_sibling}= $elt; 7449 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7450 7451 if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 7452 $elt->{next_sibling}= $next_sibling; 7453 return $elt; 7454 7455 } 7456 7457 sub paste_first_child 7458 { my( $elt, $ref)= @_; 7459 my( $parent, $prev_sibling, $next_sibling ); 7460 $parent= $ref; 7461 $next_sibling= $ref->{first_child}; 7462 7463 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7464 $parent->{first_child}= $elt; 7465 unless( $parent->{last_child}) { delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 7466 7467 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7468 7469 if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 7470 $elt->{next_sibling}= $next_sibling; 7471 return $elt; 7472 } 7473 7474 sub paste_last_child 7475 { my( $elt, $ref)= @_; 7476 my( $parent, $prev_sibling, $next_sibling ); 7477 $parent= $ref; 7478 $prev_sibling= $ref->{last_child}; 7479 7480 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7481 delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 7482 unless( $parent->{first_child}) { $parent->{first_child}= $elt; } 7483 7484 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7485 if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; } 7486 7487 $elt->{next_sibling}= undef; 7488 return $elt; 7489 } 7490 7491 sub paste_within 7492 { my( $elt, $ref, $offset)= @_; 7493 my $text= $ref->is_text ? $ref : $ref->next_elt( $TEXT, $ref); 7494 my $new= $text->split_at( $offset); 7495 $elt->paste_before( $new); 7496 return $elt; 7497 } 7498 } 7499 7500# load an element into a structure similar to XML::Simple's 7501sub simplify 7502 { my $elt= shift; 7503 7504 # normalize option names 7505 my %options= @_; 7506 %options= map { my ($key, $val)= ($_, $options{$_}); 7507 $key=~ s{(\w)([A-Z])}{$1_\L$2}g; 7508 $key => $val 7509 } keys %options; 7510 7511 # check options 7512 my @allowed_options= qw( keyattr forcearray noattr content_key 7513 var var_regexp variables var_attr 7514 group_tags forcecontent 7515 normalise_space normalize_space 7516 ); 7517 my %allowed_options= map { $_ => 1 } @allowed_options; 7518 foreach my $option (keys %options) 7519 { carp "invalid option $option\n" unless( $allowed_options{$option}); } 7520 7521 $options{normalise_space} ||= $options{normalize_space} || 0; 7522 7523 $options{content_key} ||= 'content'; 7524 if( $options{content_key}=~ m{^-}) 7525 { # need to remove the - and to activate extra folding 7526 $options{content_key}=~ s{^-}{}; 7527 $options{extra_folding}= 1; 7528 } 7529 else 7530 { $options{extra_folding}= 0; } 7531 7532 $options{forcearray} ||=0; 7533 if( isa( $options{forcearray}, 'ARRAY')) 7534 { my %forcearray_tags= map { $_ => 1 } @{$options{forcearray}}; 7535 $options{forcearray_tags}= \%forcearray_tags; 7536 $options{forcearray}= 0; 7537 } 7538 7539 $options{keyattr} ||= ['name', 'key', 'id']; 7540 if( ref $options{keyattr} eq 'ARRAY') 7541 { foreach my $keyattr (@{$options{keyattr}}) 7542 { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)}); 7543 $prefix ||= ''; 7544 $options{key_for_all}->{$att}= 1; 7545 $options{remove_key_for_all}->{$att}=1 unless( $prefix eq '+'); 7546 $options{prefix_key_for_all}->{$att}=1 if( $prefix eq '-'); 7547 } 7548 } 7549 elsif( ref $options{keyattr} eq 'HASH') 7550 { while( my( $elt, $keyattr)= each %{$options{keyattr}}) 7551 { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)}); 7552 $prefix ||=''; 7553 $options{key_for_elt}->{$elt}= $att; 7554 $options{remove_key_for_elt}->{"$elt#$att"}=1 unless( $prefix); 7555 $options{prefix_key_for_elt}->{"$elt#$att"}=1 if( $prefix eq '-'); 7556 } 7557 } 7558 7559 7560 $options{var}||= $options{var_attr}; # for compat with XML::Simple 7561 if( $options{var}) { $options{var_values}= {}; } 7562 else { $options{var}=''; } 7563 7564 if( $options{variables}) 7565 { $options{var}||= 1; 7566 $options{var_values}= $options{variables}; 7567 } 7568 7569 if( $options{var_regexp} and !$options{var}) 7570 { warn "var option not used, var_regexp option ignored\n"; } 7571 $options{var_regexp} ||= '\$\{?(\w+)\}?'; 7572 7573 $elt->_simplify( \%options); 7574 7575 } 7576 7577sub _simplify 7578 { my( $elt, $options)= @_; 7579 7580 my $data; 7581 7582 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 7583 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7584 my %atts= $options->{noattr} || !$elt->{att} ? () : %{$elt->{att}}; 7585 my $nb_atts= keys %atts; 7586 my $nb_children= $elt->children_count + $nb_atts; 7587 7588 my %nb_children; 7589 foreach (@children) { $nb_children{$_->tag}++; } 7590 foreach (keys %atts) { $nb_children{$_}++; } 7591 7592 my $arrays; # tag => array where elements are stored 7593 7594 7595 # store children 7596 foreach my $child (@children) 7597 { if( $child->is_text) 7598 { # generate with a content key 7599 my $text= $elt->_text_with_vars( $options); 7600 if( $options->{normalise_space} >= 2) { $text= _normalize_space( $text); } 7601 if( $options->{force_content} 7602 || $nb_atts 7603 || (scalar @children > 1) 7604 ) 7605 { $data->{$options->{content_key}}= $text; } 7606 else 7607 { $data= $text; } 7608 } 7609 else 7610 { # element with sub-elements 7611 my $child_gi= $XML::Twig::index2gi[$child->{'gi'}]; 7612 7613 my $child_data= $child->_simplify( $options); 7614 7615 # first see if we need to simplify further the child data 7616 # simplify because of grouped tags 7617 if( my $grouped_tag= $options->{group_tags}->{$child_gi}) 7618 { # check that the child data is a hash with a single field 7619 unless( (ref( $child_data) eq 'HASH') 7620 && (keys %$child_data == 1) 7621 && defined ( my $grouped_child_data= $child_data->{$grouped_tag}) 7622 ) 7623 { croak "error in grouped tag $child_gi"; } 7624 else 7625 { $child_data= $grouped_child_data; } 7626 } 7627 # simplify because of extra folding 7628 if( $options->{extra_folding}) 7629 { if( (ref( $child_data) eq 'HASH') 7630 && (keys %$child_data == 1) 7631 && defined( my $content= $child_data->{$options->{content_key}}) 7632 ) 7633 { $child_data= $content; } 7634 } 7635 7636 if( my $keyatt= $child->_key_attr( $options)) 7637 { # simplify element with key 7638 my $key= $child->{'att'}->{$keyatt}; 7639 if( $options->{normalise_space} >= 1) { $key= _normalize_space( $key); } 7640 $data->{$child_gi}->{$key}= $child_data; 7641 } 7642 elsif( $options->{forcearray} 7643 || $options->{forcearray_tags}->{$child_gi} 7644 || ( $nb_children{$child_gi} > 1) 7645 ) 7646 { # simplify element to store in an array 7647 if( defined $child_data && $child_data ne "" ) 7648 { $data->{$child_gi} ||= []; 7649 push @{$data->{$child_gi}}, $child_data; 7650 } 7651 else 7652 { $data->{$child_gi}= [{}]; } 7653 } 7654 else 7655 { # simplify element to store as a hash field 7656 $data->{$child_gi}=$child_data; 7657 $data->{$child_gi}= defined $child_data && $child_data ne "" ? $child_data : {}; 7658 } 7659 } 7660 } 7661 7662 # store atts 7663 # TODO: deal with att that already have an element by that name 7664 foreach my $att (keys %atts) 7665 { # do not store if the att is a key that needs to be removed 7666 if( $options->{remove_key_for_all}->{$att} 7667 || $options->{remove_key_for_elt}->{"$gi#$att"} 7668 ) 7669 { next; } 7670 7671 my $att_text= $options->{var} ? _replace_vars_in_text( $atts{$att}, $options) : $atts{$att} ; 7672 if( $options->{normalise_space} >= 2) { $att_text= _normalize_space( $att_text); } 7673 7674 if( $options->{prefix_key_for_all}->{$att} 7675 || $options->{prefix_key_for_elt}->{"$gi#$att"} 7676 ) 7677 { # prefix the att 7678 $data->{"-$att"}= $att_text; 7679 } 7680 else 7681 { # normal case 7682 $data->{$att}= $att_text; 7683 } 7684 } 7685 7686 return $data; 7687 } 7688 7689sub _key_attr 7690 { my( $elt, $options)=@_; 7691 return if( $options->{noattr}); 7692 if( $options->{key_for_all}) 7693 { foreach my $att ($elt->att_names) 7694 { if( $options->{key_for_all}->{$att}) 7695 { return $att; } 7696 } 7697 } 7698 elsif( $options->{key_for_elt}) 7699 { if( my $key_for_elt= $options->{key_for_elt}->{$XML::Twig::index2gi[$elt->{'gi'}]} ) 7700 { return $key_for_elt if( defined( $elt->{'att'}->{$key_for_elt})); } 7701 } 7702 return; 7703 } 7704 7705sub _text_with_vars 7706 { my( $elt, $options)= @_; 7707 my $text; 7708 if( $options->{var}) 7709 { $text= _replace_vars_in_text( $elt->text, $options); 7710 $elt->_store_var( $options); 7711 } 7712 else 7713 { $text= $elt->text; } 7714 return $text; 7715 } 7716 7717 7718sub _normalize_space 7719 { my $text= shift; 7720 $text=~ s{\s+}{ }sg; 7721 $text=~ s{^\s}{}; 7722 $text=~ s{\s$}{}; 7723 return $text; 7724 } 7725 7726 7727sub att_nb 7728 { return 0 unless( my $atts= $_[0]->{att}); 7729 return scalar keys %$atts; 7730 } 7731 7732sub has_no_atts 7733 { return 1 unless( my $atts= $_[0]->{att}); 7734 return scalar keys %$atts ? 0 : 1; 7735 } 7736 7737sub _replace_vars_in_text 7738 { my( $text, $options)= @_; 7739 7740 $text=~ s{($options->{var_regexp})} 7741 { if( defined( my $value= $options->{var_values}->{$2})) 7742 { $value } 7743 else 7744 { warn "unknown variable $2\n"; 7745 $1 7746 } 7747 }gex; 7748 return $text; 7749 } 7750 7751sub _store_var 7752 { my( $elt, $options)= @_; 7753 if( defined (my $var_name= $elt->{'att'}->{$options->{var}})) 7754 { $options->{var_values}->{$var_name}= $elt->text; 7755 } 7756 } 7757 7758 7759# split a text element at a given offset 7760sub split_at 7761 { my( $elt, $offset)= @_; 7762 my $text_elt= $elt->is_text ? $elt : $elt->first_child( $TEXT) || return ''; 7763 my $string= $text_elt->text; 7764 my $left_string= substr( $string, 0, $offset); 7765 my $right_string= substr( $string, $offset); 7766 $text_elt->{pcdata}= (delete $text_elt->{empty} || 1) && $left_string; 7767 my $new_elt= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}], $right_string); 7768 $new_elt->paste( after => $elt); 7769 return $new_elt; 7770 } 7771 7772 7773# split an element or its text descendants into several, in place 7774# all elements (new and untouched) are returned 7775sub split 7776 { my $elt= shift; 7777 my @text_chunks; 7778 my @result; 7779 if( $elt->is_text) { @text_chunks= ($elt); } 7780 else { @text_chunks= $elt->descendants( $TEXT); } 7781 foreach my $text_chunk (@text_chunks) 7782 { push @result, $text_chunk->_split( 1, @_); } 7783 return @result; 7784 } 7785 7786# split an element or its text descendants into several, in place 7787# created elements (those which match the regexp) are returned 7788sub mark 7789 { my $elt= shift; 7790 my @text_chunks; 7791 my @result; 7792 if( $elt->is_text) { @text_chunks= ($elt); } 7793 else { @text_chunks= $elt->descendants( $TEXT); } 7794 foreach my $text_chunk (@text_chunks) 7795 { push @result, $text_chunk->_split( 0, @_); } 7796 return @result; 7797 } 7798 7799# split a single text element 7800# return_all defines what is returned: if it is true 7801# only returns the elements created by matches in the split regexp 7802# otherwise all elements (new and untouched) are returned 7803 7804 7805{ 7806 7807 sub _split 7808 { my $elt= shift; 7809 my $return_all= shift; 7810 my $regexp= shift; 7811 my @tags; 7812 7813 while( @_) 7814 { my $tag= shift(); 7815 if( ref $_[0]) 7816 { push @tags, { tag => $tag, atts => shift }; } 7817 else 7818 { push @tags, { tag => $tag }; } 7819 } 7820 7821 unless( @tags) { @tags= { tag => $elt->{parent}->gi }; } 7822 7823 my @result; # the returned list of elements 7824 my $text= $elt->text; 7825 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 7826 7827 # 2 uses: if split matches then the first substring reuses $elt 7828 # once a split has occurred then the last match needs to be put in 7829 # a new element 7830 my $previous_match= 0; 7831 7832 while( my( $pre_match, @matches)= $text=~ /^(.*?)$regexp(.*)$/gcs) 7833 { $text= pop @matches; 7834 if( $previous_match) 7835 { # match, not the first one, create a new text ($gi) element 7836 _utf8_ify( $pre_match) if( $] < 5.010); 7837 $elt= $elt->insert_new_elt( after => $gi, $pre_match); 7838 push @result, $elt if( $return_all); 7839 } 7840 else 7841 { # first match in $elt, re-use $elt for the first sub-string 7842 _utf8_ify( $pre_match) if( $] < 5.010); 7843 $elt->set_text( $pre_match); 7844 $previous_match++; # store the fact that there was a match 7845 push @result, $elt if( $return_all); 7846 } 7847 7848 # now deal with matches captured in the regexp 7849 if( @matches) 7850 { # match, with capture 7851 my $i=0; 7852 foreach my $match (@matches) 7853 { # create new element, text is the match 7854 _utf8_ify( $match) if( $] < 5.010); 7855 my $tag = _repl_match( $tags[$i]->{tag}, @matches) || '#PCDATA'; 7856 my $atts = \%{$tags[$i]->{atts}} || {}; 7857 my %atts= map { _repl_match( $_, @matches) => _repl_match( $atts->{$_}, @matches) } keys %$atts; 7858 $elt= $elt->insert_new_elt( after => $tag, \%atts, $match); 7859 push @result, $elt; 7860 $i= ($i + 1) % @tags; 7861 } 7862 } 7863 else 7864 { # match, no captures 7865 my $tag = $tags[0]->{tag}; 7866 my $atts = \%{$tags[0]->{atts}} || {}; 7867 $elt= $elt->insert_new_elt( after => $tag, $atts); 7868 push @result, $elt; 7869 } 7870 } 7871 if( $previous_match && $text) 7872 { # there was at least 1 match, and there is text left after the match 7873 $elt= $elt->insert_new_elt( after => $gi, $text); 7874 } 7875 7876 push @result, $elt if( $return_all); 7877 7878 return @result; # return all elements 7879 } 7880 7881sub _repl_match 7882 { my( $val, @matches)= @_; 7883 $val=~ s{\$(\d+)}{$matches[$1-1]}g; 7884 return $val; 7885 } 7886 7887 # evil hack needed as sometimes 7888 my $encode_is_loaded=0; # so we only load Encode once 7889 sub _utf8_ify 7890 { 7891 if( $perl_version >= 5.008 and $perl_version < 5.010 and !_keep_encoding()) 7892 { unless( $encode_is_loaded) { require Encode; import Encode; $encode_is_loaded++; } 7893 Encode::_utf8_on( $_[0]); # the flag should be set but is not 7894 } 7895 } 7896 7897 7898} 7899 7900{ my %replace_sub; # cache for complex expressions (expression => sub) 7901 7902 sub subs_text 7903 { my( $elt, $regexp, $replace)= @_; 7904 7905 my $replacement_string; 7906 my $is_string= _is_string( $replace); 7907 7908 my @parents; 7909 7910 foreach my $text_elt ($elt->descendants_or_self( $TEXT)) 7911 { 7912 if( $is_string) 7913 { my $text= $text_elt->text; 7914 $text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx; 7915 $text_elt->set_text( $text); 7916 } 7917 else 7918 { 7919 no utf8; # = perl 5.6 7920 my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace)); 7921 my $text= $text_elt->text; 7922 my $pos=0; # used to skip text that was previously matched 7923 my $found_hit; 7924 while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}sg)) 7925 { $found_hit=1; 7926 my $match_start = length( $pre_match_string); 7927 my $match = $match_start ? $text_elt->split_at( $match_start + $pos) : $text_elt; 7928 my $match_length = length( $match_string); 7929 my $post_match = $match->split_at( $match_length); 7930 $replace_sub->( $match, @var); 7931 7932 # go to next 7933 $text_elt= $post_match; 7934 $text= $post_match->text; 7935 7936 if( $found_hit) { push @parents, $text_elt->{parent} unless $parents[-1] && $parents[-1]== $text_elt->{parent}; } 7937 7938 } 7939 } 7940 } 7941 7942 foreach my $parent (@parents) { $parent->normalize; } 7943 7944 return $elt; 7945 } 7946 7947 7948 sub _is_string 7949 { return ($_[0]=~ m{&e[ln]t}) ? 0: 1 } 7950 7951 sub _replace_var 7952 { my( $string, @var)= @_; 7953 unshift @var, undef; 7954 $string=~ s{\$(\d)}{$var[$1]}g; 7955 return $string; 7956 } 7957 7958 sub _install_replace_sub 7959 { my $replace_exp= shift; 7960 my @item= split m{(&e[ln]t\s*\([^)]*\))}, $replace_exp; 7961 my $sub= q{ my( $match, @var)= @_; my $new; my $last_inserted=$match;}; 7962 my( $gi, $exp); 7963 foreach my $item (@item) 7964 { next if ! length $item; 7965 if( $item=~ m{^&elt\s*\(([^)]*)\)}) 7966 { $exp= $1; } 7967 elsif( $item=~ m{^&ent\s*\(\s*([^\s)]*)\s*\)}) 7968 { $exp= " '#ENT' => $1"; } 7969 else 7970 { $exp= qq{ '#PCDATA' => "$item"}; } 7971 $exp=~ s{\$(\d)}{my $i= $1-1; "\$var[$i]"}eg; # replace references to matches 7972 $sub.= qq{ \$new= \$match->new( $exp); }; 7973 $sub .= q{ $new->paste( after => $last_inserted); $last_inserted=$new;}; 7974 } 7975 $sub .= q{ $match->delete; }; 7976 #$sub=~ s/;/;\n/g; warn "subs: $sub"; 7977 my $coderef= eval "sub { $NO_WARNINGS; $sub }"; 7978 if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); } 7979 return $coderef; 7980 } 7981 7982 } 7983 7984 7985sub merge_text 7986 { my( $e1, $e2)= @_; 7987 croak "invalid merge: can only merge 2 elements" 7988 unless( isa( $e2, 'XML::Twig::Elt')); 7989 croak "invalid merge: can only merge 2 text elements" 7990 unless( $e1->is_text && $e2->is_text && ($e1->gi eq $e2->gi)); 7991 7992 my $t1_length= length( $e1->text); 7993 7994 $e1->set_text( $e1->text . $e2->text); 7995 7996 if( my $extra_data_in_pcdata= $e2->_extra_data_in_pcdata) 7997 { foreach my $data (@$extra_data_in_pcdata) { $e1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } } 7998 7999 $e2->delete; 8000 8001 return $e1; 8002 } 8003 8004sub merge 8005 { my( $e1, $e2)= @_; 8006 my @e2_children= $e2->_children; 8007 if( $e1->_last_child && $e1->_last_child->is_pcdata 8008 && @e2_children && $e2_children[0]->is_pcdata 8009 ) 8010 { my $t1_length= length( $e1->_last_child->{pcdata}); 8011 my $child1= $e1->_last_child; 8012 my $child2= shift @e2_children; 8013 $child1->{pcdata} .= $child2->{pcdata}; 8014 8015 my $extra_data= $e1->_extra_data_before_end_tag . $e2->extra_data; 8016 8017 if( $extra_data) 8018 { $e1->_del_extra_data_before_end_tag; 8019 $child1->_push_extra_data_in_pcdata( $extra_data, $t1_length); 8020 } 8021 8022 if( my $extra_data_in_pcdata= $child2->_extra_data_in_pcdata) 8023 { foreach my $data (@$extra_data_in_pcdata) { $child1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } } 8024 8025 if( my $extra_data_before_end_tag= $e2->_extra_data_before_end_tag) 8026 { $e1->_set_extra_data_before_end_tag( $extra_data_before_end_tag); } 8027 } 8028 8029 foreach my $e (@e2_children) { $e->move( last_child => $e1); } 8030 8031 $e2->delete; 8032 return $e1; 8033 } 8034 8035 8036# recursively copy an element and returns the copy (can be huge and long) 8037sub copy 8038 { my $elt= shift; 8039 my $copy= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}]); 8040 8041 if( $elt->extra_data) { $copy->set_extra_data( $elt->extra_data); } 8042 if( $elt->{extra_data_before_end_tag}) { $copy->_set_extra_data_before_end_tag( $elt->{extra_data_before_end_tag}); } 8043 8044 if( $elt->is_asis) { $copy->set_asis; } 8045 8046 if( (exists $elt->{'pcdata'})) 8047 { $copy->{pcdata}= (delete $copy->{empty} || 1) && $elt->{pcdata}; 8048 if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); } 8049 } 8050 elsif( (exists $elt->{'cdata'})) 8051 { $copy->{cdata}= $elt->{cdata}; 8052 if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); } 8053 } 8054 elsif( (exists $elt->{'target'})) 8055 { $copy->_set_pi( $elt->{target}, $elt->{data}); } 8056 elsif( (exists $elt->{'comment'})) 8057 { $copy->{comment}= $elt->{comment}; } 8058 elsif( (exists $elt->{'ent'})) 8059 { $copy->{ent}= $elt->{ent}; } 8060 else 8061 { my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 8062 if( my $atts= $elt->{att}) 8063 { my %atts; 8064 tie %atts, 'Tie::IxHash' if (keep_atts_order()); 8065 %atts= %{$atts}; # we want to do a real copy of the attributes 8066 $copy->set_atts( \%atts); 8067 } 8068 foreach my $child (@children) 8069 { my $child_copy= $child->copy; 8070 $child_copy->paste( 'last_child', $copy); 8071 } 8072 } 8073 # save links to the original location, which can be convenient and is used for namespace resolution 8074 foreach my $link ( qw(parent prev_sibling next_sibling) ) 8075 { $copy->{former}->{$link}= $elt->{$link}; 8076 if( $XML::Twig::weakrefs) { weaken( $copy->{former}->{$link}); } 8077 } 8078 8079 $copy->{empty}= $elt->{'empty'}; 8080 8081 return $copy; 8082 } 8083 8084 8085sub delete 8086 { my $elt= shift; 8087 $elt->cut; 8088 $elt->DESTROY unless $XML::Twig::weakrefs; 8089 return undef; 8090 } 8091 8092sub __destroy 8093 { my $elt= shift; 8094 return if( $XML::Twig::weakrefs); 8095 my $t= shift || $elt->twig; # optional argument, passed in recursive calls 8096 8097 foreach( @{[$elt->_children]}) { $_->DESTROY( $t); } 8098 8099 # the id reference needs to be destroyed 8100 # lots of tests to avoid warnings during the cleanup phase 8101 $elt->del_id( $t) if( $ID && $t && defined( $elt->{att}) && exists( $elt->{att}->{$ID})); 8102 if( $elt->{former}) { foreach (keys %{$elt->{former}}) { delete $elt->{former}->{$_}; } delete $elt->{former}; } 8103 foreach (qw( keys %$elt)) { delete $elt->{$_}; } 8104 undef $elt; 8105 } 8106 8107BEGIN 8108{ sub set_destroy { if( $XML::Twig::weakrefs) { undef *DESTROY } else { *DESTROY= *__destroy; } } 8109 set_destroy(); 8110} 8111 8112# ignores the element 8113sub ignore 8114 { my $elt= shift; 8115 my $t= $elt->twig; 8116 $t->ignore( $elt, @_); 8117 } 8118 8119BEGIN { 8120 my $pretty = 0; 8121 my $quote = '"'; 8122 my $INDENT = ' '; 8123 my $empty_tag_style = 0; 8124 my $remove_cdata = 0; 8125 my $keep_encoding = 0; 8126 my $expand_external_entities = 0; 8127 my $keep_atts_order = 0; 8128 my $do_not_escape_amp_in_atts = 0; 8129 my $WRAP = '80'; 8130 my $REPLACED_ENTS = qq{&<}; 8131 8132 my ($NSGMLS, $NICE, $INDENTED, $INDENTEDCT, $INDENTEDC, $WRAPPED, $RECORD1, $RECORD2, $INDENTEDA)= (1..9); 8133 my %KEEP_TEXT_TAG_ON_ONE_LINE= map { $_ => 1 } ( $INDENTED, $INDENTEDCT, $INDENTEDC, $INDENTEDA, $WRAPPED); 8134 my %WRAPPED = map { $_ => 1 } ( $WRAPPED, $INDENTEDA, $INDENTEDC); 8135 8136 my %pretty_print_style= 8137 ( none => 0, # no added \n 8138 nsgmls => $NSGMLS, # nsgmls-style, \n in tags 8139 # below this line styles are UNSAFE (the generated XML can be well-formed but invalid) 8140 nice => $NICE, # \n after open/close tags except when the 8141 # element starts with text 8142 indented => $INDENTED, # nice plus idented 8143 indented_close_tag => $INDENTEDCT, # nice plus idented 8144 indented_c => $INDENTEDC, # slightly more compact than indented (closing 8145 # tags are on the same line) 8146 wrapped => $WRAPPED, # text is wrapped at column 8147 record_c => $RECORD1, # for record-like data (compact) 8148 record => $RECORD2, # for record-like data (not so compact) 8149 indented_a => $INDENTEDA, # nice, indented, and with attributes on separate 8150 # lines as the nsgmls style, as well as wrapped 8151 # lines - to make the xml friendly to line-oriented tools 8152 cvs => $INDENTEDA, # alias for indented_a 8153 ); 8154 8155 my ($HTML, $EXPAND)= (1..2); 8156 my %empty_tag_style= 8157 ( normal => 0, # <tag/> 8158 html => $HTML, # <tag /> 8159 xhtml => $HTML, # <tag /> 8160 expand => $EXPAND, # <tag></tag> 8161 ); 8162 8163 my %quote_style= 8164 ( double => '"', 8165 single => "'", 8166 # smart => "smart", 8167 ); 8168 8169 my $xml_space_preserve; # set when an element includes xml:space="preserve" 8170 8171 my $output_filter; # filters the entire output (including < and >) 8172 my $output_text_filter; # filters only the text part (tag names, attributes, pcdata) 8173 8174 my $replaced_ents= $REPLACED_ENTS; 8175 8176 8177 # returns those pesky "global" variables so you can switch between twigs 8178 sub global_state ## no critic (Subroutines::ProhibitNestedSubs); 8179 { return 8180 { pretty => $pretty, 8181 quote => $quote, 8182 indent => $INDENT, 8183 empty_tag_style => $empty_tag_style, 8184 remove_cdata => $remove_cdata, 8185 keep_encoding => $keep_encoding, 8186 expand_external_entities => $expand_external_entities, 8187 output_filter => $output_filter, 8188 output_text_filter => $output_text_filter, 8189 keep_atts_order => $keep_atts_order, 8190 do_not_escape_amp_in_atts => $do_not_escape_amp_in_atts, 8191 wrap => $WRAP, 8192 replaced_ents => $replaced_ents, 8193 }; 8194 } 8195 8196 # restores the global variables 8197 sub set_global_state 8198 { my $state= shift; 8199 $pretty = $state->{pretty}; 8200 $quote = $state->{quote}; 8201 $INDENT = $state->{indent}; 8202 $empty_tag_style = $state->{empty_tag_style}; 8203 $remove_cdata = $state->{remove_cdata}; 8204 $keep_encoding = $state->{keep_encoding}; 8205 $expand_external_entities = $state->{expand_external_entities}; 8206 $output_filter = $state->{output_filter}; 8207 $output_text_filter = $state->{output_text_filter}; 8208 $keep_atts_order = $state->{keep_atts_order}; 8209 $do_not_escape_amp_in_atts = $state->{do_not_escape_amp_in_atts}; 8210 $WRAP = $state->{wrap}; 8211 $replaced_ents = $state->{replaced_ents}, 8212 } 8213 8214 # sets global state to defaults 8215 sub init_global_state 8216 { set_global_state( 8217 { pretty => 0, 8218 quote => '"', 8219 indent => $INDENT, 8220 empty_tag_style => 0, 8221 remove_cdata => 0, 8222 keep_encoding => 0, 8223 expand_external_entities => 0, 8224 output_filter => undef, 8225 output_text_filter => undef, 8226 keep_atts_order => undef, 8227 do_not_escape_amp_in_atts => 0, 8228 wrap => $WRAP, 8229 replaced_ents => $REPLACED_ENTS, 8230 }); 8231 } 8232 8233 8234 # set the pretty_print style (in $pretty) and returns the old one 8235 # can be called from outside the package with 2 arguments (elt, style) 8236 # or from inside with only one argument (style) 8237 # the style can be either a string (one of the keys of %pretty_print_style 8238 # or a number (presumably an old value saved) 8239 sub set_pretty_print 8240 { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases 8241 my $old_pretty= $pretty; 8242 if( $style=~ /^\d+$/) 8243 { croak "invalid pretty print style $style" unless( $style < keys %pretty_print_style); 8244 $pretty= $style; 8245 } 8246 else 8247 { croak "invalid pretty print style '$style'" unless( exists $pretty_print_style{$style}); 8248 $pretty= $pretty_print_style{$style}; 8249 } 8250 if( $WRAPPED{$pretty} ) 8251 { XML::Twig::_use( 'Text::Wrap') or croak( "Text::Wrap not available, cannot use style $style"); } 8252 return $old_pretty; 8253 } 8254 8255 sub _pretty_print { return $pretty; } 8256 8257 # set the empty tag style (in $empty_tag_style) and returns the old one 8258 # can be called from outside the package with 2 arguments (elt, style) 8259 # or from inside with only one argument (style) 8260 # the style can be either a string (one of the keys of %empty_tag_style 8261 # or a number (presumably an old value saved) 8262 sub set_empty_tag_style 8263 { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases 8264 my $old_style= $empty_tag_style; 8265 if( $style=~ /^\d+$/) 8266 { croak "invalid empty tag style $style" 8267 unless( $style < keys %empty_tag_style); 8268 $empty_tag_style= $style; 8269 } 8270 else 8271 { croak "invalid empty tag style '$style'" 8272 unless( exists $empty_tag_style{$style}); 8273 $empty_tag_style= $empty_tag_style{$style}; 8274 } 8275 return $old_style; 8276 } 8277 8278 sub _pretty_print_styles 8279 { return (sort { $pretty_print_style{$a} <=> $pretty_print_style{$b} || $a cmp $b } keys %pretty_print_style); } 8280 8281 sub set_quote 8282 { my $style= $_[1] || $_[0]; 8283 my $old_quote= $quote; 8284 croak "invalid quote '$style'" unless( exists $quote_style{$style}); 8285 $quote= $quote_style{$style}; 8286 return $old_quote; 8287 } 8288 8289 sub set_remove_cdata 8290 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8291 my $old_value= $remove_cdata; 8292 $remove_cdata= $new_value; 8293 return $old_value; 8294 } 8295 8296 8297 sub set_indent 8298 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8299 my $old_value= $INDENT; 8300 $INDENT= $new_value; 8301 return $old_value; 8302 } 8303 8304 sub set_wrap 8305 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8306 my $old_value= $WRAP; 8307 $WRAP= $new_value; 8308 return $old_value; 8309 } 8310 8311 8312 sub set_keep_encoding 8313 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8314 my $old_value= $keep_encoding; 8315 $keep_encoding= $new_value; 8316 return $old_value; 8317 } 8318 8319 sub set_replaced_ents 8320 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8321 my $old_value= $replaced_ents; 8322 $replaced_ents= $new_value; 8323 return $old_value; 8324 } 8325 8326 sub do_not_escape_gt 8327 { my $old_value= $replaced_ents; 8328 $replaced_ents= q{&<}; # & needs to be first 8329 return $old_value; 8330 } 8331 8332 sub escape_gt 8333 { my $old_value= $replaced_ents; 8334 $replaced_ents= qq{&<>}; # & needs to be first 8335 return $old_value; 8336 } 8337 8338 sub _keep_encoding { return $keep_encoding; } # so I can use elsewhere in the module 8339 8340 sub set_do_not_escape_amp_in_atts 8341 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8342 my $old_value= $do_not_escape_amp_in_atts; 8343 $do_not_escape_amp_in_atts= $new_value; 8344 return $old_value; 8345 } 8346 8347 sub output_filter { return $output_filter; } 8348 sub output_text_filter { return $output_text_filter; } 8349 8350 sub set_output_filter 8351 { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode 8352 # if called in object mode with no argument, the filter is undefined 8353 if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; } 8354 my $old_value= $output_filter; 8355 if( !$new_value || isa( $new_value, 'CODE') ) 8356 { $output_filter= $new_value; } 8357 elsif( $new_value eq 'latin1') 8358 { $output_filter= XML::Twig::latin1(); 8359 } 8360 elsif( $XML::Twig::filter{$new_value}) 8361 { $output_filter= $XML::Twig::filter{$new_value}; } 8362 else 8363 { croak "invalid output filter '$new_value'"; } 8364 8365 return $old_value; 8366 } 8367 8368 sub set_output_text_filter 8369 { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode 8370 # if called in object mode with no argument, the filter is undefined 8371 if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; } 8372 my $old_value= $output_text_filter; 8373 if( !$new_value || isa( $new_value, 'CODE') ) 8374 { $output_text_filter= $new_value; } 8375 elsif( $new_value eq 'latin1') 8376 { $output_text_filter= XML::Twig::latin1(); 8377 } 8378 elsif( $XML::Twig::filter{$new_value}) 8379 { $output_text_filter= $XML::Twig::filter{$new_value}; } 8380 else 8381 { croak "invalid output text filter '$new_value'"; } 8382 8383 return $old_value; 8384 } 8385 8386 sub set_expand_external_entities 8387 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8388 my $old_value= $expand_external_entities; 8389 $expand_external_entities= $new_value; 8390 return $old_value; 8391 } 8392 8393 sub set_keep_atts_order 8394 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8395 my $old_value= $keep_atts_order; 8396 $keep_atts_order= $new_value; 8397 return $old_value; 8398 8399 } 8400 8401 sub keep_atts_order { return $keep_atts_order; } # so I can use elsewhere in the module 8402 8403 my %html_empty_elt; 8404 BEGIN { %html_empty_elt= map { $_ => 1} qw( base meta link hr br param img area input col); } 8405 8406 sub start_tag 8407 { my( $elt, $option)= @_; 8408 8409 8410 return if( $elt->{gi} < $XML::Twig::SPECIAL_GI); 8411 8412 my $extra_data= $elt->{extra_data} || ''; 8413 8414 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 8415 my $att= $elt->{att}; # should be $elt->{att}, optimized into a pure hash look-up 8416 8417 my $ns_map= $att ? $att->{'#original_gi'} : ''; 8418 if( $ns_map) { $gi= _restore_original_prefix( $ns_map, $gi); } 8419 $gi=~ s{^#default:}{}; # remove default prefix 8420 8421 if( $output_text_filter) { $gi= $output_text_filter->( $gi); } 8422 8423 # get the attribute and their values 8424 my $att_sep = $pretty==$NSGMLS ? "\n" 8425 : $pretty==$INDENTEDA ? "\n" . $INDENT x ($elt->level+1) . ' ' 8426 : ' ' 8427 ; 8428 8429 my $replace_in_att_value= $replaced_ents . "$quote\t\r\n"; 8430 if( $option->{escape_gt} && $replaced_ents !~ m{>}) { $replace_in_att_value.= '>'; } 8431 8432 my $tag; 8433 my @att_names= grep { !( $_=~ m{^#(?!default:)} ) } $keep_atts_order ? keys %{$att} : sort keys %{$att}; 8434 if( @att_names) 8435 { my $atts= join $att_sep, map { my $output_att_name= $ns_map ? _restore_original_prefix( $ns_map, $_) : $_; 8436 if( $output_text_filter) 8437 { $output_att_name= $output_text_filter->( $output_att_name); } 8438 $output_att_name . '=' . $quote . _att_xml_string( $att->{$_}, $replace_in_att_value) . $quote 8439 8440 } 8441 @att_names 8442 ; 8443 if( $pretty==$INDENTEDA && @att_names == 1) { $att_sep= ' '; } 8444 $tag= "<$gi$att_sep$atts"; 8445 } 8446 else 8447 { $tag= "<$gi"; } 8448 8449 $tag .= "\n" if($pretty==$NSGMLS); 8450 8451 8452 # force empty if suitable HTML tag, otherwise use the value from the input tree 8453 if( ($empty_tag_style eq $HTML) && !$elt->{first_child} && !$elt->{extra_data_before_end_tag} && $html_empty_elt{$gi}) 8454 { $elt->{empty}= 1; } 8455 my $empty= defined $elt->{empty} ? $elt->{empty} 8456 : $elt->{first_child} ? 0 8457 : 1; 8458 8459 $tag .= (!$elt->{empty} || $elt->{extra_data_before_end_tag}) ? '>' # element has content 8460 : (($empty_tag_style eq $HTML) && $html_empty_elt{$gi}) ? ' />' # html empty element 8461 # cvs-friendly format 8462 : ( $pretty == $INDENTEDA && @att_names > 1) ? "\n" . $INDENT x $elt->level . "/>" 8463 : ( $pretty == $INDENTEDA && @att_names == 1) ? " />" 8464 : $empty_tag_style ? "></" . $XML::Twig::index2gi[$elt->{'gi'}] . ">" # $empty_tag_style is $HTML or $EXPAND 8465 : '/>' 8466 ; 8467 8468 if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; } 8469 8470#warn "TRACE: ", $tag,": ", Encode::is_utf8( $tag) ? "has flag" : "FLAG NOT SET"; 8471 8472 unless( $pretty) { return defined( $extra_data) ? $extra_data . $tag : $tag; } 8473 8474 my $prefix=''; 8475 my $return=''; # '' or \n is to be printed before the tag 8476 my $indent=0; # number of indents before the tag 8477 8478 if( $pretty==$RECORD1) 8479 { my $level= $elt->level; 8480 $return= "\n" if( $level < 2); 8481 $indent= 1 if( $level == 1); 8482 } 8483 8484 elsif( $pretty==$RECORD2) 8485 { $return= "\n"; 8486 $indent= $elt->level; 8487 } 8488 8489 elsif( $pretty==$NICE) 8490 { my $parent= $elt->{parent}; 8491 unless( !$parent || $parent->{contains_text}) 8492 { $return= "\n"; } 8493 $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text}) 8494 || $elt->contains_text); 8495 } 8496 8497 elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty}) 8498 { my $parent= $elt->{parent}; 8499 unless( !$parent || $parent->{contains_text}) 8500 { $return= "\n"; 8501 $indent= $elt->level; 8502 } 8503 $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text}) 8504 || $elt->contains_text); 8505 } 8506 8507 if( $return || $indent) 8508 { # check for elements in which spaces should be kept 8509 my $t= $elt->twig; 8510 return $extra_data . $tag if( $xml_space_preserve); 8511 if( $t && $t->{twig_keep_spaces_in}) 8512 { foreach my $ancestor ($elt->ancestors) 8513 { return $extra_data . $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) } 8514 } 8515 8516 $prefix= $INDENT x $indent; 8517 if( $extra_data) 8518 { $extra_data=~ s{\s+$}{}; 8519 $extra_data=~ s{^\s+}{}; 8520 $extra_data= $prefix . $extra_data . $return; 8521 } 8522 } 8523 8524 8525 return $return . $extra_data . $prefix . $tag; 8526 } 8527 8528 sub end_tag 8529 { my $elt= shift; 8530 return '' if( ($elt->{gi}<$XML::Twig::SPECIAL_GI) 8531 || ($elt->{'empty'} && !$elt->{extra_data_before_end_tag}) 8532 ); 8533 my $tag= "<"; 8534 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 8535 8536 if( my $map= $elt->{'att'}->{'#original_gi'}) { $gi= _restore_original_prefix( $map, $gi); } 8537 $gi=~ s{^#default:}{}; # remove default prefix 8538 8539 if( $output_text_filter) { $gi= $output_text_filter->( $XML::Twig::index2gi[$elt->{'gi'}]); } 8540 $tag .= "/$gi>"; 8541 8542 $tag = ($elt->{extra_data_before_end_tag} || '') . $tag; 8543 8544 if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; } 8545 8546 return $tag unless $pretty; 8547 8548 my $prefix=''; 8549 my $return=0; # 1 if a \n is to be printed before the tag 8550 my $indent=0; # number of indents before the tag 8551 8552 if( $pretty==$RECORD1) 8553 { $return= 1 if( $elt->level == 0); 8554 } 8555 8556 elsif( $pretty==$RECORD2) 8557 { unless( $elt->contains_text) 8558 { $return= 1 ; 8559 $indent= $elt->level; 8560 } 8561 } 8562 8563 elsif( $pretty==$NICE) 8564 { my $parent= $elt->{parent}; 8565 if( ( ($parent && !$parent->{contains_text}) || !$parent ) 8566 && ( !$elt->{contains_text} 8567 && ($elt->{has_flushed_child} || $elt->{first_child}) 8568 ) 8569 ) 8570 { $return= 1; } 8571 } 8572 8573 elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty}) 8574 { my $parent= $elt->{parent}; 8575 if( ( ($parent && !$parent->{contains_text}) || !$parent ) 8576 && ( !$elt->{contains_text} 8577 && ($elt->{has_flushed_child} || $elt->{first_child}) 8578 ) 8579 ) 8580 { $return= 1; 8581 $indent= $elt->level; 8582 } 8583 } 8584 8585 if( $return || $indent) 8586 { # check for elements in which spaces should be kept 8587 my $t= $elt->twig; 8588 return $tag if( $xml_space_preserve); 8589 if( $t && $t->{twig_keep_spaces_in}) 8590 { foreach my $ancestor ($elt, $elt->ancestors) 8591 { return $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) } 8592 } 8593 8594 if( $return) { $prefix= ($pretty== $INDENTEDCT) ? "\n$INDENT" : "\n"; } 8595 $prefix.= $INDENT x $indent; 8596 } 8597 8598 # add a \n at the end of the document (after the root element) 8599 $tag .= "\n" unless( $elt->{parent}); 8600 8601 return $prefix . $tag; 8602 } 8603 8604 sub _restore_original_prefix 8605 { my( $map, $name)= @_; 8606 my $prefix= _ns_prefix( $name); 8607 if( my $original_prefix= $map->{$prefix}) 8608 { if( $original_prefix eq '#default') 8609 { $name=~ s{^$prefix:}{}; } 8610 else 8611 { $name=~ s{^$prefix(?=:)}{$original_prefix}; } 8612 } 8613 return $name; 8614 } 8615 8616 # buffer used to hold the text to print/sprint, to avoid passing it back and forth between methods 8617 my @sprint; 8618 8619 # $elt is an element to print 8620 # $fh is an optional filehandle to print to 8621 # $pretty is an optional value, if true a \n is printed after the < of the 8622 # opening tag 8623 sub print 8624 { my $elt= shift; 8625 8626 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 8627 my $old_select= defined $fh ? select $fh : undef; 8628 print $elt->sprint( @_); 8629 select $old_select if( defined $old_select); 8630 } 8631 8632 8633# those next 2 methods need to be refactored, they are copies of the same methods in XML::Twig 8634sub print_to_file 8635 { my( $elt, $filename)= (shift, shift); 8636 my $out_fh; 8637# open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!"); # < perl 5.8 8638 my $mode= $keep_encoding ? '>' : '>:utf8'; # >= perl 5.8 8639 open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8 8640 $elt->print( $out_fh, @_); 8641 close $out_fh; 8642 return $elt; 8643 } 8644 8645# probably only works on *nix (at least the chmod bit) 8646# first print to a temporary file, then rename that file to the desired file name, then change permissions 8647# to the original file permissions (or to the current umask) 8648sub safe_print_to_file 8649 { my( $elt, $filename)= (shift, shift); 8650 my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ; 8651 XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n"; 8652 XML::Twig::_use( 'File::Basename') || croak "need File::Basename to use safe_print_to_file\n"; 8653 my $tmpdir= File::Basename::dirname( $filename); 8654 my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir); 8655 $elt->print_to_file( $tmpfilename, @_); 8656 rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!"); 8657 chmod $perm, $filename; 8658 return $elt; 8659 } 8660 8661 8662 # same as print but does not output the start tag if the element 8663 # is marked as flushed 8664 sub flush 8665 { my $elt= shift; 8666 my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt; 8667 $elt->twig->flush_up_to( $up_to, @_); 8668 } 8669 sub purge 8670 { my $elt= shift; 8671 my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt; 8672 $elt->twig->purge_up_to( $up_to, @_); 8673 } 8674 8675 sub _flush 8676 { my $elt= shift; 8677 8678 my $pretty; 8679 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 8680 my $old_select= defined $fh ? select $fh : undef; 8681 my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef; 8682 8683 $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve'); 8684 8685 $elt->__flush(); 8686 8687 $xml_space_preserve= 0; 8688 8689 select $old_select if( defined $old_select); 8690 set_pretty_print( $old_pretty) if( defined $old_pretty); 8691 } 8692 8693 sub __flush 8694 { my $elt= shift; 8695 8696 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 8697 { my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve'; 8698 $xml_space_preserve++ if $preserve; 8699 unless( $elt->{'flushed'}) 8700 { print $elt->start_tag(); 8701 } 8702 8703 # flush the children 8704 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 8705 foreach my $child (@children) 8706 { $child->_flush( $pretty); 8707 $child->{'flushed'}=1; 8708 } 8709 if( ! $elt->{end_tag_flushed}) 8710 { print $elt->end_tag; 8711 $elt->{end_tag_flushed}=1; 8712 $elt->{'flushed'}=1; 8713 } 8714 $xml_space_preserve-- if $preserve; 8715 # used for pretty printing 8716 if( my $parent= $elt->{parent}) { $parent->{has_flushed_child}= 1; } 8717 } 8718 else # text or special element 8719 { my $text; 8720 if( (exists $elt->{'pcdata'})) { $text= $elt->pcdata_xml_string; 8721 if( my $parent= $elt->{parent}) 8722 { $parent->{contains_text}= 1; } 8723 } 8724 elsif( (exists $elt->{'cdata'})) { $text= $elt->cdata_string; 8725 if( my $parent= $elt->{parent}) 8726 { $parent->{contains_text}= 1; } 8727 } 8728 elsif( (exists $elt->{'target'})) { $text= $elt->pi_string; } 8729 elsif( (exists $elt->{'comment'})) { $text= $elt->comment_string; } 8730 elsif( (exists $elt->{'ent'})) { $text= $elt->ent_string; } 8731 8732 print $output_filter ? $output_filter->( $text) : $text; 8733 } 8734 } 8735 8736 8737 sub xml_text 8738 { my( $elt, @options)= @_; 8739 8740 if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->xml_text_only; } 8741 8742 my $string=''; 8743 8744 if( ($elt->{gi} >= $XML::Twig::SPECIAL_GI) ) 8745 { # sprint the children 8746 my $child= $elt->{first_child} || ''; 8747 while( $child) 8748 { $string.= $child->xml_text; 8749 } continue { $child= $child->{next_sibling}; } 8750 } 8751 elsif( (exists $elt->{'pcdata'})) { $string .= $output_filter ? $output_filter->($elt->pcdata_xml_string) 8752 : $elt->pcdata_xml_string; 8753 } 8754 elsif( (exists $elt->{'cdata'})) { $string .= $output_filter ? $output_filter->($elt->cdata_string) 8755 : $elt->cdata_string; 8756 } 8757 elsif( (exists $elt->{'ent'})) { $string .= $elt->ent_string; } 8758 8759 return $string; 8760 } 8761 8762 sub xml_text_only 8763 { return join '', map { $_->xml_text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; } 8764 8765 # same as print but except... it does not print but rather returns the string 8766 # if the second parameter is set then only the content is returned, not the 8767 # start and end tags of the element (but the tags of the included elements are 8768 # returned) 8769 8770 sub sprint 8771 { my $elt= shift; 8772 my( $old_pretty, $old_empty_tag_style); 8773 8774 if( $_[0]) 8775 { if( isa( $_[0], 'HASH')) 8776 { # "proper way, using a hashref for options 8777 my %args= XML::Twig::_normalize_args( %{shift()}); 8778 if( defined $args{PrettyPrint}) { $old_pretty = set_pretty_print( $args{PrettyPrint}); } 8779 if( defined $args{EmptyTags}) { $old_empty_tag_style = set_empty_tag_style( $args{EmptyTags}); } 8780 } 8781 else 8782 { # "old" way, just using the option name 8783 my @other_opt; 8784 foreach my $opt (@_) 8785 { if( exists $pretty_print_style{$opt}) { $old_pretty = set_pretty_print( $opt); } 8786 elsif( exists $empty_tag_style{$opt}) { $old_empty_tag_style = set_empty_tag_style( $opt); } 8787 else { push @other_opt, $opt; } 8788 } 8789 @_= @other_opt; 8790 } 8791 } 8792 8793 $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve'); 8794 8795 @sprint=(); 8796 $elt->_sprint( @_); 8797 my $sprint= join( '', @sprint); 8798 if( $output_filter) { $sprint= $output_filter->( $sprint); } 8799 8800 if( ( ($pretty== $WRAPPED) || ($pretty==$INDENTEDC)) && !$xml_space_preserve) 8801 { $sprint= _wrap_text( $sprint); } 8802 $xml_space_preserve= 0; 8803 8804 8805 if( defined $old_pretty) { set_pretty_print( $old_pretty); } 8806 if( defined $old_empty_tag_style) { set_empty_tag_style( $old_empty_tag_style); } 8807 8808 return $sprint; 8809 } 8810 8811 sub _wrap_text 8812 { my( $string)= @_; 8813 my $wrapped; 8814 foreach my $line (split /\n/, $string) 8815 { my( $initial_indent)= $line=~ m{^(\s*)}; 8816 my $wrapped_line= Text::Wrap::wrap( '', $initial_indent . $INDENT, $line) . "\n"; 8817 8818 # fix glitch with Text::wrap when the first line is long and does not include spaces 8819 # the first line ends up being too short by 2 chars, but we'll have to live with it! 8820 $wrapped_line=~ s{^ +\n }{}s; # this prefix needs to be removed 8821 8822 $wrapped .= $wrapped_line; 8823 } 8824 8825 return $wrapped; 8826 } 8827 8828 8829 sub _sprint 8830 { my $elt= shift; 8831 my $no_tag= shift || 0; 8832 # in case there's some comments or PI's piggybacking 8833 8834 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 8835 { 8836 my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve'; 8837 $xml_space_preserve++ if $preserve; 8838 8839 push @sprint, $elt->start_tag unless( $no_tag); 8840 8841 # sprint the children 8842 my $child= $elt->{first_child}; 8843 while( $child) 8844 { $child->_sprint; 8845 $child= $child->{next_sibling}; 8846 } 8847 push @sprint, $elt->end_tag unless( $no_tag); 8848 $xml_space_preserve-- if $preserve; 8849 } 8850 else 8851 { push @sprint, $elt->{extra_data} if( $elt->{extra_data}) ; 8852 if( (exists $elt->{'pcdata'})) { push @sprint, $elt->pcdata_xml_string; } 8853 elsif( (exists $elt->{'cdata'})) { push @sprint, $elt->cdata_string; } 8854 elsif( (exists $elt->{'target'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; } 8855 push @sprint, $elt->pi_string; 8856 } 8857 elsif( (exists $elt->{'comment'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; } 8858 push @sprint, $elt->comment_string; 8859 } 8860 elsif( (exists $elt->{'ent'})) { push @sprint, $elt->ent_string; } 8861 } 8862 8863 return; 8864 } 8865 8866 # just a shortcut to $elt->sprint( 1) 8867 sub xml_string 8868 { my $elt= shift; 8869 isa( $_[0], 'HASH') ? $elt->sprint( shift(), 1) : $elt->sprint( 1); 8870 } 8871 8872 sub pcdata_xml_string 8873 { my $elt= shift; 8874 if( defined( my $string= $elt->{pcdata}) ) 8875 { 8876 if( ! $elt->{extra_data_in_pcdata}) 8877 { 8878 $string=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g unless( !$replaced_ents || $keep_encoding || $elt->{asis}); 8879 $string=~ s{\Q]]>}{]]>}g; 8880 } 8881 else 8882 { _gen_mark( $string); # used by _(un)?protect_extra_data 8883 foreach my $data (reverse @{$elt->{extra_data_in_pcdata}}) 8884 { my $substr= substr( $string, $data->{offset}); 8885 if( $keep_encoding || $elt->{asis}) 8886 { substr( $string, $data->{offset}, 0, $data->{text}); } 8887 else 8888 { substr( $string, $data->{offset}, 0, _protect_extra_data( $data->{text})); } 8889 } 8890 unless( $keep_encoding || $elt->{asis}) 8891 { 8892 $string=~ s{([$replaced_ents])}{$XML::Twig::base_ent{$1}}g ; 8893 $string=~ s{\Q]]>}{]]>}g; 8894 _unprotect_extra_data( $string); 8895 } 8896 } 8897 return $output_text_filter ? $output_text_filter->( $string) : $string; 8898 } 8899 else 8900 { return ''; } 8901 } 8902 8903 { my $mark; 8904 my( %char2ent, %ent2char); 8905 BEGIN 8906 { %char2ent= ( '<' => 'lt', '&' => 'amp', '>' => 'gt'); 8907 %ent2char= map { $char2ent{$_} => $_ } keys %char2ent; 8908 } 8909 8910 # generate a unique mark (a string) not found in the string, 8911 # used to mark < and & in the extra data 8912 sub _gen_mark 8913 { $mark="AAAA"; 8914 $mark++ while( index( $_[0], $mark) > -1); 8915 return $mark; 8916 } 8917 8918 sub _protect_extra_data 8919 { my( $extra_data)= @_; 8920 $extra_data=~ s{([<&>])}{:$mark:$char2ent{$1}:}g; 8921 return $extra_data; 8922 } 8923 8924 sub _unprotect_extra_data 8925 { $_[0]=~ s{:$mark:(\w+):}{$ent2char{$1}}g; } 8926 8927 } 8928 8929 sub cdata_string 8930 { my $cdata= $_[0]->{cdata}; 8931 unless( defined $cdata) { return ''; } 8932 if( $remove_cdata) 8933 { $cdata=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g; } 8934 else 8935 { $cdata= $CDATA_START . $cdata . $CDATA_END; } 8936 return $cdata; 8937 } 8938 8939 sub att_xml_string 8940 { my $elt= shift; 8941 my $att= shift; 8942 8943 my $replace= $replaced_ents . "$quote\n\r\t"; 8944 if($_[0] && $_[0]->{escape_gt} && ($replace!~ m{>}) ) { $replace .='>'; } 8945 8946 if( defined (my $string= $elt->{att}->{$att})) 8947 { return _att_xml_string( $string, $replace); } 8948 else 8949 { return ''; } 8950 } 8951 8952 # escaped xml string for an attribute value 8953 sub _att_xml_string 8954 { my( $string, $escape)= @_; 8955 if( !defined( $string)) { return ''; } 8956 if( $keep_encoding) 8957 { $string=~ s{$quote}{$XML::Twig::base_ent{$quote}}g; 8958 } 8959 else 8960 { 8961 if( $do_not_escape_amp_in_atts) 8962 { $escape=~ s{^.}{}; # seems like the most backward compatible way to remove & from the list 8963 $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g; 8964 $string=~ s{&(?!(\w+|#\d+|[xX][0-9a-fA-F]+);)}{&}g; # dodgy: escape & that do not start an entity 8965 } 8966 else 8967 { $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g; 8968 $string=~ s{\Q]]>}{]]>}g; 8969 } 8970 } 8971 8972 return $output_text_filter ? $output_text_filter->( $string) : $string; 8973 } 8974 8975 sub ent_string 8976 { my $ent= shift; 8977 my $ent_text= $ent->{ent}; 8978 my( $t, $el, $ent_string); 8979 if( $expand_external_entities 8980 && ($t= $ent->twig) 8981 && ($el= $t->entity_list) 8982 && ($ent_string= $el->{entities}->{$ent->ent_name}->{val}) 8983 ) 8984 { return $ent_string; } 8985 else 8986 { return $ent_text; } 8987 } 8988 8989 # returns just the text, no tags, for an element 8990 sub text 8991 { my( $elt, @options)= @_; 8992 8993 if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->text_only; } 8994 my $sep = (@options && grep { lc( $_) eq 'sep' } @options) ? ' ' : ''; 8995 8996 my $string; 8997 8998 if( (exists $elt->{'pcdata'})) { return $elt->{pcdata} . $sep; } 8999 elsif( (exists $elt->{'cdata'})) { return $elt->{cdata} . $sep; } 9000 elsif( (exists $elt->{'target'})) { return $elt->pi_string . $sep; } 9001 elsif( (exists $elt->{'comment'})) { return $elt->{comment} . $sep; } 9002 elsif( (exists $elt->{'ent'})) { return $elt->{ent} . $sep ; } 9003 9004 9005 my $child= $elt->{first_child} ||''; 9006 while( $child) 9007 { 9008 my $child_text= $child->text( @options); 9009 $string.= defined( $child_text) ? $sep . $child_text : ''; 9010 } continue { $child= $child->{next_sibling}; } 9011 9012 unless( defined $string) { $string=''; } 9013 9014 return $output_text_filter ? $output_text_filter->( $string) : $string; 9015 } 9016 9017 sub text_only 9018 { return join '', map { $_->text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; } 9019 9020 sub trimmed_text 9021 { my $elt= shift; 9022 my $text= $elt->text( @_); 9023 $text=~ s{\s+}{ }sg; 9024 $text=~ s{^\s*}{}; 9025 $text=~ s{\s*$}{}; 9026 return $text; 9027 } 9028 9029 sub trim 9030 { my( $elt)= @_; 9031 my $pcdata= $elt->first_descendant( $TEXT); 9032 (my $pcdata_text= $pcdata->text)=~ s{^\s+}{}s; 9033 $pcdata->set_text( $pcdata_text); 9034 $pcdata= $elt->last_descendant( $TEXT); 9035 ($pcdata_text= $pcdata->text)=~ s{\s+$}{}; 9036 $pcdata->set_text( $pcdata_text); 9037 foreach my $pcdata ($elt->descendants( $TEXT)) 9038 { ($pcdata_text= $pcdata->text)=~ s{\s+}{ }g; 9039 $pcdata->set_text( $pcdata_text); 9040 } 9041 return $elt; 9042 } 9043 9044 9045 # remove cdata sections (turns them into regular pcdata) in an element 9046 sub remove_cdata 9047 { my $elt= shift; 9048 foreach my $cdata ($elt->descendants_or_self( $CDATA)) 9049 { if( $keep_encoding) 9050 { my $data= $cdata->{cdata}; 9051 $data=~ s{([&<"'])}{$XML::Twig::base_ent{$1}}g; 9052 $cdata->{pcdata}= (delete $cdata->{empty} || 1) && $data; 9053 } 9054 else 9055 { $cdata->{pcdata}= (delete $cdata->{empty} || 1) && $cdata->{cdata}; } 9056 $cdata->{gi}=$XML::Twig::gi2index{$PCDATA} or $cdata->set_gi( $PCDATA); 9057 undef $cdata->{cdata}; 9058 } 9059 } 9060 9061sub _is_private { return _is_private_name( $_[0]->gi); } 9062sub _is_private_name { return $_[0]=~ m{^#(?!default:)}; } 9063 9064 9065} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...) 9066 9067# merges consecutive #PCDATAs in am element 9068sub normalize 9069 { my( $elt)= @_; 9070 my @descendants= $elt->descendants( $PCDATA); 9071 while( my $desc= shift @descendants) 9072 { if( ! length $desc->{pcdata}) { $desc->delete; next; } 9073 while( @descendants && $desc->{next_sibling} && $desc->{next_sibling}== $descendants[0]) 9074 { my $to_merge= shift @descendants; 9075 $desc->merge_text( $to_merge); 9076 } 9077 } 9078 return $elt; 9079 } 9080 9081# SAX export methods 9082sub toSAX1 9083 { _toSAX(@_, \&_start_tag_data_SAX1, \&_end_tag_data_SAX1); } 9084 9085sub toSAX2 9086 { _toSAX(@_, \&_start_tag_data_SAX2, \&_end_tag_data_SAX2); } 9087 9088sub _toSAX 9089 { my( $elt, $handler, $start_tag_data, $end_tag_data)= @_; 9090 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 9091 { my $data= $start_tag_data->( $elt); 9092 _start_prefix_mapping( $elt, $handler, $data); 9093 if( $data && (my $start_element = $handler->can( 'start_element'))) 9094 { unless( $elt->{'flushed'}) { $start_element->( $handler, $data); } } 9095 9096 foreach my $child ($elt->_children) 9097 { $child->_toSAX( $handler, $start_tag_data, $end_tag_data); } 9098 9099 if( (my $data= $end_tag_data->( $elt)) && (my $end_element = $handler->can( 'end_element')) ) 9100 { $end_element->( $handler, $data); } 9101 _end_prefix_mapping( $elt, $handler); 9102 } 9103 else # text or special element 9104 { if( (exists $elt->{'pcdata'}) && (my $characters= $handler->can( 'characters'))) 9105 { $characters->( $handler, { Data => $elt->{pcdata} }); } 9106 elsif( (exists $elt->{'cdata'})) 9107 { if( my $start_cdata= $handler->can( 'start_cdata')) 9108 { $start_cdata->( $handler); } 9109 if( my $characters= $handler->can( 'characters')) 9110 { $characters->( $handler, {Data => $elt->{cdata} }); } 9111 if( my $end_cdata= $handler->can( 'end_cdata')) 9112 { $end_cdata->( $handler); } 9113 } 9114 elsif( ((exists $elt->{'target'})) && (my $pi= $handler->can( 'processing_instruction'))) 9115 { $pi->( $handler, { Target =>$elt->{target}, Data => $elt->{data} }); } 9116 elsif( ((exists $elt->{'comment'})) && (my $comment= $handler->can( 'comment'))) 9117 { $comment->( $handler, { Data => $elt->{comment} }); } 9118 elsif( ((exists $elt->{'ent'}))) 9119 { 9120 if( my $se= $handler->can( 'skipped_entity')) 9121 { $se->( $handler, { Name => $elt->ent_name }); } 9122 elsif( my $characters= $handler->can( 'characters')) 9123 { if( defined $elt->ent_string) 9124 { $characters->( $handler, {Data => $elt->ent_string}); } 9125 else 9126 { $characters->( $handler, {Data => $elt->ent_name}); } 9127 } 9128 } 9129 9130 } 9131 } 9132 9133sub _start_tag_data_SAX1 9134 { my( $elt)= @_; 9135 my $name= $XML::Twig::index2gi[$elt->{'gi'}]; 9136 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 9137 my $attributes={}; 9138 my $atts= $elt->{att}; 9139 while( my( $att, $value)= each %$atts) 9140 { $attributes->{$att}= $value unless( ( $att=~ m{^#(?!default:)} )); } 9141 my $data= { Name => $name, Attributes => $attributes}; 9142 return $data; 9143 } 9144 9145sub _end_tag_data_SAX1 9146 { my( $elt)= @_; 9147 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 9148 return { Name => $XML::Twig::index2gi[$elt->{'gi'}] }; 9149 } 9150 9151sub _start_tag_data_SAX2 9152 { my( $elt)= @_; 9153 my $data={}; 9154 9155 my $name= $XML::Twig::index2gi[$elt->{'gi'}]; 9156 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 9157 $data->{Name} = $name; 9158 $data->{Prefix} = $elt->ns_prefix; 9159 $data->{LocalName} = $elt->local_name; 9160 $data->{NamespaceURI} = $elt->namespace; 9161 9162 # save a copy of the data so we can re-use it for the end tag 9163 my %sax2_data= %$data; 9164 $elt->{twig_elt_SAX2_data}= \%sax2_data; 9165 9166 # add the attributes 9167 $data->{Attributes}= $elt->_atts_to_SAX2; 9168 9169 return $data; 9170 } 9171 9172sub _atts_to_SAX2 9173 { my $elt= shift; 9174 my $SAX2_atts= {}; 9175 foreach my $att (keys %{$elt->{att}}) 9176 { 9177 next if( ( $att=~ m{^#(?!default:)} )); 9178 my $SAX2_att={}; 9179 $SAX2_att->{Name} = $att; 9180 $SAX2_att->{Prefix} = _ns_prefix( $att); 9181 $SAX2_att->{LocalName} = _local_name( $att); 9182 $SAX2_att->{NamespaceURI} = $elt->namespace( $SAX2_att->{Prefix}); 9183 $SAX2_att->{Value} = $elt->{'att'}->{$att}; 9184 my $SAX2_att_name= "{$SAX2_att->{NamespaceURI}}$SAX2_att->{LocalName}"; 9185 9186 $SAX2_atts->{$SAX2_att_name}= $SAX2_att; 9187 } 9188 return $SAX2_atts; 9189 } 9190 9191sub _start_prefix_mapping 9192 { my( $elt, $handler, $data)= @_; 9193 if( my $start_prefix_mapping= $handler->can( 'start_prefix_mapping') 9194 and my @new_prefix_mappings= grep { /^\{[^}]*\}xmlns/ || /^\{$XMLNS_URI\}/ } keys %{$data->{Attributes}} 9195 ) 9196 { foreach my $prefix (@new_prefix_mappings) 9197 { my $prefix_string= $data->{Attributes}->{$prefix}->{LocalName}; 9198 if( $prefix_string eq 'xmlns') { $prefix_string=''; } 9199 my $prefix_data= 9200 { Prefix => $prefix_string, 9201 NamespaceURI => $data->{Attributes}->{$prefix}->{Value} 9202 }; 9203 $start_prefix_mapping->( $handler, $prefix_data); 9204 $elt->{twig_end_prefix_mapping} ||= []; 9205 push @{$elt->{twig_end_prefix_mapping}}, $prefix_string; 9206 } 9207 } 9208 } 9209 9210sub _end_prefix_mapping 9211 { my( $elt, $handler)= @_; 9212 if( my $end_prefix_mapping= $handler->can( 'end_prefix_mapping')) 9213 { foreach my $prefix (@{$elt->{twig_end_prefix_mapping}}) 9214 { $end_prefix_mapping->( $handler, { Prefix => $prefix} ); } 9215 } 9216 } 9217 9218sub _end_tag_data_SAX2 9219 { my( $elt)= @_; 9220 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 9221 return $elt->{twig_elt_SAX2_data}; 9222 } 9223 9224sub contains_text 9225 { my $elt= shift; 9226 my $child= $elt->{first_child}; 9227 while ($child) 9228 { return 1 if( $child->is_text || (exists $child->{'ent'})); 9229 $child= $child->{next_sibling}; 9230 } 9231 return 0; 9232 } 9233 9234# creates a single pcdata element containing the text as child of the element 9235# options: 9236# - force_pcdata: when set to a true value forces the text to be in a #PCDATA 9237# even if the original element was a #CDATA 9238sub set_text 9239 { my( $elt, $string, %option)= @_; 9240 9241 if( $XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA) 9242 { return $elt->{pcdata}= (delete $elt->{empty} || 1) && $string; } 9243 elsif( $XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) 9244 { if( $option{force_pcdata}) 9245 { $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA); 9246 $elt->{cdata}= ''; 9247 return $elt->{pcdata}= (delete $elt->{empty} || 1) && $string; 9248 } 9249 else 9250 { $elt->{cdata}= $string; 9251 return $string; 9252 } 9253 } 9254 elsif( $elt->contains_a_single( $PCDATA) ) 9255 { # optimized so we have a slight chance of not losing embedded comments and pi's 9256 $elt->{first_child}->set_pcdata( $string); 9257 return $elt; 9258 } 9259 9260 foreach my $child (@{[$elt->_children]}) 9261 { $child->delete; } 9262 9263 my $pcdata= $elt->_new_pcdata( $string); 9264 $pcdata->paste( $elt); 9265 9266 delete $elt->{empty}; 9267 9268 return $elt; 9269 } 9270 9271# set the content of an element from a list of strings and elements 9272sub set_content 9273 { my $elt= shift; 9274 9275 return $elt unless defined $_[0]; 9276 9277 # attributes can be given as a hash (passed by ref) 9278 if( ref $_[0] eq 'HASH') 9279 { my $atts= shift; 9280 $elt->del_atts; # usually useless but better safe than sorry 9281 $elt->set_atts( $atts); 9282 return $elt unless defined $_[0]; 9283 } 9284 9285 # check next argument for #EMPTY 9286 if( !(ref $_[0]) && ($_[0] eq $EMPTY) ) 9287 { $elt->{empty}= 1; return $elt; } 9288 9289 # case where we really want to do a set_text, the element is '#PCDATA' 9290 # or contains a single PCDATA and we only want to add text in it 9291 if( ($XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA || $elt->contains_a_single( $PCDATA)) 9292 && (@_ == 1) && !( ref $_[0])) 9293 { $elt->set_text( $_[0]); 9294 return $elt; 9295 } 9296 elsif( ($XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) && (@_ == 1) && !( ref $_[0])) 9297 { $elt->{cdata}= $_[0]; 9298 return $elt; 9299 } 9300 9301 # delete the children 9302 foreach my $child (@{[$elt->_children]}) 9303 { $child->delete; } 9304 9305 if( @_) { delete $elt->{empty}; } 9306 9307 foreach my $child (@_) 9308 { if( ref( $child) && isa( $child, 'XML::Twig::Elt')) 9309 { # argument is an element 9310 $child->paste( 'last_child', $elt); 9311 } 9312 else 9313 { # argument is a string 9314 if( (my $pcdata= $elt->{last_child}) && $elt->{last_child}->is_pcdata) 9315 { # previous child is also pcdata: just concatenate 9316 $pcdata->{pcdata}= (delete $pcdata->{empty} || 1) && $pcdata->{pcdata} . $child 9317 } 9318 else 9319 { # previous child is not a string: create a new pcdata element 9320 $pcdata= $elt->_new_pcdata( $child); 9321 $pcdata->paste( 'last_child', $elt); 9322 } 9323 } 9324 } 9325 9326 9327 return $elt; 9328 } 9329 9330# inserts an element (whose gi is given) as child of the element 9331# all children of the element are now children of the new element 9332# returns the new element 9333sub insert 9334 { my ($elt, @args)= @_; 9335 # first cut the children 9336 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 9337 foreach my $child (@children) 9338 { $child->cut; } 9339 # insert elements 9340 while( my $gi= shift @args) 9341 { my $new_elt= $elt->new( $gi); 9342 # add attributes if needed 9343 if( defined( $args[0]) && ( isa( $args[0], 'HASH')) ) 9344 { $new_elt->set_atts( shift @args); } 9345 # paste the element 9346 $new_elt->paste( $elt); 9347 delete $elt->{empty}; 9348 $elt= $new_elt; 9349 } 9350 # paste back the children 9351 foreach my $child (@children) 9352 { $child->paste( 'last_child', $elt); } 9353 return $elt; 9354 } 9355 9356# insert a new element 9357# $elt->insert_new_element( $opt_position, $gi, $opt_atts_hash, @opt_content); 9358# the element is created with the same syntax as new 9359# position is the same as in paste, first_child by default 9360sub insert_new_elt 9361 { my $elt= shift; 9362 my $position= $_[0]; 9363 if( ($position eq 'before') || ($position eq 'after') 9364 || ($position eq 'first_child') || ($position eq 'last_child')) 9365 { shift; } 9366 else 9367 { $position= 'first_child'; } 9368 9369 my $new_elt= $elt->new( @_); 9370 $new_elt->paste( $position, $elt); 9371 9372 #if( defined $new_elt->{'att'}->{$ID}) { $new_elt->set_id( $new_elt->{'att'}->{$ID}); } 9373 9374 return $new_elt; 9375 } 9376 9377# wraps an element in elements which gi's are given as arguments 9378# $elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single 9379# cell in a table for example 9380# returns the new element 9381sub wrap_in 9382 { my $elt= shift; 9383 while( my $gi = shift @_) 9384 { my $new_elt = $elt->new( $gi); 9385 if( $elt->{twig_current}) 9386 { my $t= $elt->twig; 9387 $t->{twig_current}= $new_elt; 9388 delete $elt->{'twig_current'}; 9389 $new_elt->{'twig_current'}=1; 9390 } 9391 9392 if( my $parent= $elt->{parent}) 9393 { $new_elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $new_elt->{parent});} ; 9394 if( $parent->{first_child} == $elt) { $parent->{first_child}= $new_elt; } 9395 if( $parent->{last_child} == $elt) { delete $parent->{empty}; $parent->{last_child}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 9396 } 9397 else 9398 { # wrapping the root 9399 my $twig= $elt->twig; 9400 if( $twig && $twig->root && ($twig->root eq $elt) ) 9401 { $twig->set_root( $new_elt); 9402 } 9403 } 9404 9405 if( my $prev_sibling= $elt->{prev_sibling}) 9406 { $new_elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $new_elt->{prev_sibling});} ; 9407 $prev_sibling->{next_sibling}= $new_elt; 9408 } 9409 9410 if( my $next_sibling= $elt->{next_sibling}) 9411 { $new_elt->{next_sibling}= $next_sibling; 9412 $next_sibling->{prev_sibling}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 9413 } 9414 $new_elt->{first_child}= $elt; 9415 delete $new_elt->{empty}; $new_elt->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $new_elt->{last_child});} ; 9416 9417 $elt->{parent}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 9418 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 9419 $elt->{next_sibling}= undef; 9420 9421 # add the attributes if the next argument is a hash ref 9422 if( defined( $_[0]) && (isa( $_[0], 'HASH')) ) 9423 { $new_elt->set_atts( shift @_); } 9424 9425 $elt= $new_elt; 9426 } 9427 9428 return $elt; 9429 } 9430 9431sub replace 9432 { my( $elt, $ref)= @_; 9433 9434 if( $elt->{parent}) { $elt->cut; } 9435 9436 if( my $parent= $ref->{parent}) 9437 { $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 9438 if( $parent->{first_child} == $ref) { $parent->{first_child}= $elt; } 9439 if( $parent->{last_child} == $ref) { delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 9440 } 9441 elsif( $ref->twig && $ref == $ref->twig->root) 9442 { $ref->twig->set_root( $elt); } 9443 9444 if( my $prev_sibling= $ref->{prev_sibling}) 9445 { $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 9446 $prev_sibling->{next_sibling}= $elt; 9447 } 9448 if( my $next_sibling= $ref->{next_sibling}) 9449 { $elt->{next_sibling}= $next_sibling; 9450 $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 9451 } 9452 9453 $ref->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{parent});} ; 9454 $ref->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{prev_sibling});} ; 9455 $ref->{next_sibling}= undef; 9456 return $ref; 9457 } 9458 9459sub replace_with 9460 { my $ref= shift; 9461 my $elt= shift; 9462 $elt->replace( $ref); 9463 foreach my $new_elt (reverse @_) 9464 { $new_elt->paste( after => $elt); } 9465 return $elt; 9466 } 9467 9468 9469# move an element, same syntax as paste, except the element is first cut 9470sub move 9471 { my $elt= shift; 9472 $elt->cut; 9473 $elt->paste( @_); 9474 return $elt; 9475 } 9476 9477 9478# adds a prefix to an element, creating a pcdata child if needed 9479sub prefix 9480 { my ($elt, $prefix, $option)= @_; 9481 my $asis= ($option && ($option eq 'asis')) ? 1 : 0; 9482 if( (exists $elt->{'pcdata'}) 9483 && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis})) 9484 ) 9485 { $elt->{pcdata}= (delete $elt->{empty} || 1) && $prefix . $elt->{pcdata}; } 9486 elsif( $elt->{first_child} && $elt->{first_child}->is_pcdata 9487 && ( ($asis && $elt->{first_child}->{asis}) 9488 || (!$asis && ! $elt->{first_child}->{asis})) 9489 ) 9490 { 9491 $elt->{first_child}->set_pcdata( $prefix . $elt->{first_child}->pcdata); 9492 } 9493 else 9494 { my $new_elt= $elt->_new_pcdata( $prefix); 9495 my $pos= (exists $elt->{'pcdata'}) ? 'before' : 'first_child'; 9496 $new_elt->paste( $pos => $elt); 9497 if( $asis) { $new_elt->set_asis; } 9498 } 9499 return $elt; 9500 } 9501 9502# adds a suffix to an element, creating a pcdata child if needed 9503sub suffix 9504 { my ($elt, $suffix, $option)= @_; 9505 my $asis= ($option && ($option eq 'asis')) ? 1 : 0; 9506 if( (exists $elt->{'pcdata'}) 9507 && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis})) 9508 ) 9509 { $elt->{pcdata}= (delete $elt->{empty} || 1) && $elt->{pcdata} . $suffix; } 9510 elsif( $elt->{last_child} && $elt->{last_child}->is_pcdata 9511 && ( ($asis && $elt->{last_child}->{asis}) 9512 || (!$asis && ! $elt->{last_child}->{asis})) 9513 ) 9514 { $elt->{last_child}->set_pcdata( $elt->{last_child}->pcdata . $suffix); } 9515 else 9516 { my $new_elt= $elt->_new_pcdata( $suffix); 9517 my $pos= (exists $elt->{'pcdata'}) ? 'after' : 'last_child'; 9518 $new_elt->paste( $pos => $elt); 9519 if( $asis) { $new_elt->set_asis; } 9520 } 9521 return $elt; 9522 } 9523 9524# create a path to an element ('/root/.../gi) 9525sub path 9526 { my $elt= shift; 9527 my @context= ( $elt, $elt->ancestors); 9528 return "/" . join( "/", reverse map {$_->gi} @context); 9529 } 9530 9531sub xpath 9532 { my $elt= shift; 9533 my $xpath; 9534 foreach my $ancestor (reverse $elt->ancestors_or_self) 9535 { my $gi= $XML::Twig::index2gi[$ancestor->{'gi'}]; 9536 $xpath.= "/$gi"; 9537 my $index= $ancestor->prev_siblings( $gi) + 1; 9538 unless( ($index == 1) && !$ancestor->next_sibling( $gi)) 9539 { $xpath.= "[$index]"; } 9540 } 9541 return $xpath; 9542 } 9543 9544# methods used mainly by wrap_children 9545 9546# return a string with the 9547# for an element <foo><elt att="val">...</elt><elt2/><elt>...</elt></foo> 9548# returns '<elt att="val"><elt2><elt>' 9549sub _stringify_struct 9550 { my( $elt, %opt)= @_; 9551 my $string=''; 9552 my $pretty_print= set_pretty_print( 'none'); 9553 foreach my $child ($elt->_children) 9554 { $child->add_id; $string .= $child->start_tag( { escape_gt => 1 }) ||''; } 9555 set_pretty_print( $pretty_print); 9556 return $string; 9557 } 9558 9559# wrap a series of elements in a new one 9560sub _wrap_range 9561 { my $elt= shift; 9562 my $gi= shift; 9563 my $atts= isa( $_[0], 'HASH') ? shift : undef; 9564 my $range= shift; # the string with the tags to wrap 9565 9566 my $t= $elt->twig; 9567 9568 # get the tags to wrap 9569 my @to_wrap; 9570 while( $range=~ m{<\w+\s+[^>]*id=("[^"]*"|'[^']*')[^>]*>}g) 9571 { push @to_wrap, $t->elt_id( substr( $1, 1, -1)); } 9572 9573 return '' unless @to_wrap; 9574 9575 my $to_wrap= shift @to_wrap; 9576 my %atts= %$atts; 9577 my $new_elt= $to_wrap->wrap_in( $gi, \%atts); 9578 $_->move( last_child => $new_elt) foreach (@to_wrap); 9579 9580 return ''; 9581 } 9582 9583# wrap children matching a regexp in a new element 9584sub wrap_children 9585 { my( $elt, $regexp, $gi, $atts)= @_; 9586 9587 $atts ||={}; 9588 9589 my $elt_as_string= $elt->_stringify_struct; # stringify the elt structure 9590 $regexp=~ s{(<[^>]*>)}{_match_expr( $1)}eg; # in the regexp, replace gi's by the proper regexp 9591 $elt_as_string=~ s{($regexp)}{$elt->_wrap_range( $gi, $atts, $1)}eg; # then do the actual replace 9592 9593 return $elt; 9594 } 9595 9596sub _match_expr 9597 { my $tag= shift; 9598 my( $gi, %atts)= XML::Twig::_parse_start_tag( $tag); 9599 return _match_tag( $gi, %atts); 9600 } 9601 9602 9603sub _match_tag 9604 { my( $elt, %atts)= @_; 9605 my $string= "<$elt\\b"; 9606 foreach my $key (sort keys %atts) 9607 { my $val= qq{\Q$atts{$key}\E}; 9608 $string.= qq{[^>]*$key=(?:"$val"|'$val')}; 9609 } 9610 $string.= qq{[^>]*>}; 9611 return "(?:$string)"; 9612 } 9613 9614sub field_to_att 9615 { my( $elt, $cond, $att)= @_; 9616 $att ||= $cond; 9617 my $child= $elt->first_child( $cond) or return undef; 9618 $elt->set_att( $att => $child->text); 9619 $child->cut; 9620 return $elt; 9621 } 9622 9623sub att_to_field 9624 { my( $elt, $att, $tag)= @_; 9625 $tag ||= $att; 9626 my $child= $elt->insert_new_elt( first_child => $tag, $elt->{'att'}->{$att}); 9627 $elt->del_att( $att); 9628 return $elt; 9629 } 9630 9631# sort children methods 9632 9633sub sort_children_on_field 9634 { my $elt = shift; 9635 my $field = shift; 9636 my $get_key= sub { return $_[0]->field( $field) }; 9637 return $elt->sort_children( $get_key, @_); 9638 } 9639 9640sub sort_children_on_att 9641 { my $elt = shift; 9642 my $att = shift; 9643 my $get_key= sub { return $_[0]->{'att'}->{$att} }; 9644 return $elt->sort_children( $get_key, @_); 9645 } 9646 9647sub sort_children_on_value 9648 { my $elt = shift; 9649 #my $get_key= eval qq{ sub { $NO_WARNINGS; return \$_[0]->text } }; 9650 my $get_key= \&text; 9651 return $elt->sort_children( $get_key, @_); 9652 } 9653 9654sub sort_children 9655 { my( $elt, $get_key, %opt)=@_; 9656 $opt{order} ||= 'normal'; 9657 $opt{type} ||= 'alpha'; 9658 my( $par_a, $par_b)= ($opt{order} eq 'reverse') ? qw( b a) : qw ( a b) ; 9659 my $op= ($opt{type} eq 'numeric') ? '<=>' : 'cmp' ; 9660 my @children= $elt->cut_children; 9661 if( $opt{type} eq 'numeric') 9662 { @children= map { $_->[1] } 9663 sort { $a->[0] <=> $b->[0] } 9664 map { [ $get_key->( $_), $_] } @children; 9665 } 9666 elsif( $opt{type} eq 'alpha') 9667 { @children= map { $_->[1] } 9668 sort { $a->[0] cmp $b->[0] } 9669 map { [ $get_key->( $_), $_] } @children; 9670 } 9671 else 9672 { croak "wrong sort type '$opt{type}', should be either 'alpha' or 'numeric'"; } 9673 9674 @children= reverse @children if( $opt{order} eq 'reverse'); 9675 $elt->set_content( @children); 9676 } 9677 9678 9679# comparison methods 9680 9681sub before 9682 { my( $a, $b)=@_; 9683 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 9684 } 9685 9686sub after 9687 { my( $a, $b)=@_; 9688 if( $a->cmp( $b) == 1) { return 1; } else { return 0; } 9689 } 9690 9691sub lt 9692 { my( $a, $b)=@_; 9693 return 1 if( $a->cmp( $b) == -1); 9694 return 0; 9695 } 9696 9697sub le 9698 { my( $a, $b)=@_; 9699 return 1 unless( $a->cmp( $b) == 1); 9700 return 0; 9701 } 9702 9703sub gt 9704 { my( $a, $b)=@_; 9705 return 1 if( $a->cmp( $b) == 1); 9706 return 0; 9707 } 9708 9709sub ge 9710 { my( $a, $b)=@_; 9711 return 1 unless( $a->cmp( $b) == -1); 9712 return 0; 9713 } 9714 9715 9716sub cmp 9717 { my( $a, $b)=@_; 9718 9719 # easy cases 9720 return 0 if( $a == $b); 9721 return 1 if( $a->in($b)); # a in b => a starts after b 9722 return -1 if( $b->in($a)); # b in a => a starts before b 9723 9724 # ancestors does not include the element itself 9725 my @a_pile= ($a, $a->ancestors); 9726 my @b_pile= ($b, $b->ancestors); 9727 9728 # the 2 elements are not in the same twig 9729 return undef unless( $a_pile[-1] == $b_pile[-1]); 9730 9731 # find the first non common ancestors (they are siblings) 9732 my $a_anc= pop @a_pile; 9733 my $b_anc= pop @b_pile; 9734 9735 while( $a_anc == $b_anc) 9736 { $a_anc= pop @a_pile; 9737 $b_anc= pop @b_pile; 9738 } 9739 9740 # from there move left and right and figure out the order 9741 my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc); 9742 while() 9743 { $a_prev= $a_prev->{prev_sibling} || return( -1); 9744 return 1 if( $a_prev == $b_next); 9745 $a_next= $a_next->{next_sibling} || return( 1); 9746 return -1 if( $a_next == $b_prev); 9747 $b_prev= $b_prev->{prev_sibling} || return( 1); 9748 return -1 if( $b_prev == $a_next); 9749 $b_next= $b_next->{next_sibling} || return( -1); 9750 return 1 if( $b_next == $a_prev); 9751 } 9752 } 9753 9754sub _dump 9755 { my( $elt, $option)= @_; 9756 9757 my $atts = defined $option->{atts} ? $option->{atts} : 1; 9758 my $extra = defined $option->{extra} ? $option->{extra} : 0; 9759 my $short_text = defined $option->{short_text} ? $option->{short_text} : 40; 9760 9761 my $sp= '| '; 9762 my $indent= $sp x $elt->level; 9763 my $indent_sp= ' ' x $elt->level; 9764 9765 my $dump=''; 9766 if( $elt->is_elt) 9767 { 9768 $dump .= $indent . '|-' . $XML::Twig::index2gi[$elt->{'gi'}]; 9769 9770 if( $atts && (my @atts= $elt->att_names) ) 9771 { $dump .= ' ' . join( ' ', map { qq{$_="} . $elt->{'att'}->{$_} . qq{"} } @atts); } 9772 9773 $dump .= "\n"; 9774 if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); } 9775 $dump .= join( "", map { $_->_dump( $option) } do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }); 9776 } 9777 else 9778 { 9779 if( (exists $elt->{'pcdata'})) 9780 { $dump .= "$indent|-PCDATA: '" . _short_text( $elt->{pcdata}, $short_text) . "'\n" } 9781 elsif( (exists $elt->{'ent'})) 9782 { $dump .= "$indent|-ENTITY: '" . _short_text( $elt->{ent}, $short_text) . "'\n" } 9783 elsif( (exists $elt->{'cdata'})) 9784 { $dump .= "$indent|-CDATA: '" . _short_text( $elt->{cdata}, $short_text) . "'\n" } 9785 elsif( (exists $elt->{'comment'})) 9786 { $dump .= "$indent|-COMMENT: '" . _short_text( $elt->comment_string, $short_text) . "'\n" } 9787 elsif( (exists $elt->{'target'})) 9788 { $dump .= "$indent|-PI: '" . $elt->{target} . "' - '" . _short_text( $elt->{data}, $short_text) . "'\n" } 9789 if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); } 9790 } 9791 return $dump; 9792 } 9793 9794sub _dump_extra_data 9795 { my( $elt, $indent, $indent_sp, $short_text)= @_; 9796 my $dump=''; 9797 if( $elt->extra_data) 9798 { my $extra_data = $indent . "|-- (cpi before) '" . _short_text( $elt->extra_data, $short_text) . "'"; 9799 $extra_data=~ s{\n}{$indent_sp}g; 9800 $dump .= $extra_data . "\n"; 9801 } 9802 if( $elt->{extra_data_in_pcdata}) 9803 { foreach my $data ( @{$elt->{extra_data_in_pcdata}}) 9804 { my $extra_data = $indent . "|-- (cpi offset $data->{offset}) '" . _short_text( $data->{text}, $short_text) . "'"; 9805 $extra_data=~ s{\n}{$indent_sp}g; 9806 $dump .= $extra_data . "\n"; 9807 } 9808 } 9809 if( $elt->{extra_data_before_end_tag}) 9810 { my $extra_data = $indent . "|-- (cpi end) '" . _short_text( $elt->{extra_data_before_end_tag}, $short_text) . "'"; 9811 $extra_data=~ s{\n}{$indent_sp}g; 9812 $dump .= $extra_data . "\n"; 9813 } 9814 return $dump; 9815 } 9816 9817 9818sub _short_text 9819 { my( $string, $length)= @_; 9820 if( !$length || (length( $string) < $length) ) { return $string; } 9821 my $l1= (length( $string) -5) /2; 9822 my $l2= length( $string) - ($l1 + 5); 9823 return substr( $string, 0, $l1) . ' ... ' . substr( $string, -$l2); 9824 } 9825 9826 9827sub _and { return _join_defined( ' && ', @_); } 9828sub _join_defined { return join( shift(), grep { $_ } @_); } 9829 98301; 9831__END__ 9832 9833=head1 NAME 9834 9835XML::Twig - A perl module for processing huge XML documents in tree mode. 9836 9837=head1 SYNOPSIS 9838 9839Note that this documentation is intended as a reference to the module. 9840 9841Complete docs, including a tutorial, examples, an easier to use HTML version, 9842a quick reference card and a FAQ are available at L<http://www.xmltwig.org/xmltwig> 9843 9844Small documents (loaded in memory as a tree): 9845 9846 my $twig=XML::Twig->new(); # create the twig 9847 $twig->parsefile( 'doc.xml'); # build it 9848 my_process( $twig); # use twig methods to process it 9849 $twig->print; # output the twig 9850 9851Huge documents (processed in combined stream/tree mode): 9852 9853 # at most one div will be loaded in memory 9854 my $twig=XML::Twig->new( 9855 twig_handlers => 9856 { title => sub { $_->set_tag( 'h2') }, # change title tags to h2 9857 # $_ is the current element 9858 para => sub { $_->set_tag( 'p') }, # change para to p 9859 hidden => sub { $_->delete; }, # remove hidden elements 9860 list => \&my_list_process, # process list elements 9861 div => sub { $_[0]->flush; }, # output and free memory 9862 }, 9863 pretty_print => 'indented', # output will be nicely formatted 9864 empty_tags => 'html', # outputs <empty_tag /> 9865 ); 9866 $twig->parsefile( 'my_big.xml'); 9867 9868 sub my_list_process 9869 { my( $twig, $list)= @_; 9870 # ... 9871 } 9872 9873See L<XML::Twig 101|/XML::Twig 101> for other ways to use the module, as a 9874filter for example. 9875 9876=encoding utf8 9877 9878=head1 DESCRIPTION 9879 9880This module provides a way to process XML documents. It is build on top 9881of C<XML::Parser>. 9882 9883The module offers a tree interface to the document, while allowing you 9884to output the parts of it that have been completely processed. 9885 9886It allows minimal resource (CPU and memory) usage by building the tree 9887only for the parts of the documents that need actual processing, through the 9888use of the C<L<twig_roots> > and 9889C<L<twig_print_outside_roots> > options. The 9890C<L<finish> > and C<L<finish_print> > methods also help 9891to increase performances. 9892 9893XML::Twig tries to make simple things easy so it tries its best to takes care 9894of a lot of the (usually) annoying (but sometimes necessary) features that 9895come with XML and XML::Parser. 9896 9897=head1 TOOLS 9898 9899XML::Twig comes with a few command-line utilities: 9900 9901=head2 xml_pp - xml pretty-printer 9902 9903XML pretty printer using XML::Twig 9904 9905=head2 xml_grep - grep XML files looking for specific elements 9906 9907C<xml_grep> does a grep on XML files. Instead of using regular expressions 9908it uses XPath expressions (in fact the subset of XPath supported by 9909XML::Twig). 9910 9911=head2 xml_split - cut a big XML file into smaller chunks 9912 9913C<xml_split> takes a (presumably big) XML file and split it in several smaller 9914files, based on various criteria (level in the tree, size or an XPath 9915expression) 9916 9917=head2 xml_merge - merge back XML files split with xml_split 9918 9919C<xml_merge> takes several xml files that have been split using C<xml_split> 9920and recreates a single file. 9921 9922=head2 xml_spellcheck - spellcheck XML files 9923 9924C<xml_spellcheck> lets you spell check the content of an XML file. It extracts 9925the text (the content of elements and optionally of attributes), call a spell 9926checker on it and then recreates the XML document. 9927 9928 9929=head1 XML::Twig 101 9930 9931XML::Twig can be used either on "small" XML documents (that fit in memory) 9932or on huge ones, by processing parts of the document and outputting or 9933discarding them once they are processed. 9934 9935 9936=head2 Loading an XML document and processing it 9937 9938 my $t= XML::Twig->new(); 9939 $t->parse( '<d><title>title</title><para>p 1</para><para>p 2</para></d>'); 9940 my $root= $t->root; 9941 $root->set_tag( 'html'); # change doc to html 9942 $title= $root->first_child( 'title'); # get the title 9943 $title->set_tag( 'h1'); # turn it into h1 9944 my @para= $root->children( 'para'); # get the para children 9945 foreach my $para (@para) 9946 { $para->set_tag( 'p'); } # turn them into p 9947 $t->print; # output the document 9948 9949Other useful methods include: 9950 9951L<att>: C<< $elt->{'att'}->{'foo'} >> return the C<foo> attribute for an 9952element, 9953 9954L<set_att> : C<< $elt->set_att( foo => "bar") >> sets the C<foo> 9955attribute to the C<bar> value, 9956 9957L<next_sibling>: C<< $elt->{next_sibling} >> return the next sibling 9958in the document (in the example C<< $title->{next_sibling} >> is the first 9959C<para>, you can also (and actually should) use 9960C<< $elt->next_sibling( 'para') >> to get it 9961 9962The document can also be transformed through the use of the L<cut>, 9963L<copy>, L<paste> and L<move> methods: 9964C<< $title->cut; $title->paste( after => $p); >> for example 9965 9966And much, much more, see L<XML::Twig::Elt|/XML::Twig::Elt>. 9967 9968=head2 Processing an XML document chunk by chunk 9969 9970One of the strengths of XML::Twig is that it let you work with files that do 9971not fit in memory (BTW storing an XML document in memory as a tree is quite 9972memory-expensive, the expansion factor being often around 10). 9973 9974To do this you can define handlers, that will be called once a specific 9975element has been completely parsed. In these handlers you can access the 9976element and process it as you see fit, using the navigation and the 9977cut-n-paste methods, plus lots of convenient ones like C<L<prefix> >. 9978Once the element is completely processed you can then C<L<flush> > it, 9979which will output it and free the memory. You can also C<L<purge> > it 9980if you don't need to output it (if you are just extracting some data from 9981the document for example). The handler will be called again once the next 9982relevant element has been parsed. 9983 9984 my $t= XML::Twig->new( twig_handlers => 9985 { section => \§ion, 9986 para => sub { $_->set_tag( 'p'); } 9987 }, 9988 ); 9989 $t->parsefile( 'doc.xml'); 9990 9991 # the handler is called once a section is completely parsed, ie when 9992 # the end tag for section is found, it receives the twig itself and 9993 # the element (including all its sub-elements) as arguments 9994 sub section 9995 { my( $t, $section)= @_; # arguments for all twig_handlers 9996 $section->set_tag( 'div'); # change the tag name 9997 # let's use the attribute nb as a prefix to the title 9998 my $title= $section->first_child( 'title'); # find the title 9999 my $nb= $title->{'att'}->{'nb'}; # get the attribute 10000 $title->prefix( "$nb - "); # easy isn't it? 10001 $section->flush; # outputs the section and frees memory 10002 } 10003 10004 10005There is of course more to it: you can trigger handlers on more elaborate 10006conditions than just the name of the element, C<section/title> for example. 10007 10008 my $t= XML::Twig->new( twig_handlers => 10009 { 'section/title' => sub { $_->print } } 10010 ) 10011 ->parsefile( 'doc.xml'); 10012 10013Here C<< sub { $_->print } >> simply prints the current element (C<$_> is aliased 10014to the element in the handler). 10015 10016You can also trigger a handler on a test on an attribute: 10017 10018 my $t= XML::Twig->new( twig_handlers => 10019 { 'section[@level="1"]' => sub { $_->print } } 10020 ); 10021 ->parsefile( 'doc.xml'); 10022 10023You can also use C<L<start_tag_handlers> > to process an 10024element as soon as the start tag is found. Besides C<L<prefix> > you 10025can also use C<L<suffix> >, 10026 10027=head2 Processing just parts of an XML document 10028 10029The twig_roots mode builds only the required sub-trees from the document 10030Anything outside of the twig roots will just be ignored: 10031 10032 my $t= XML::Twig->new( 10033 # the twig will include just the root and selected titles 10034 twig_roots => { 'section/title' => \&print_n_purge, 10035 'annex/title' => \&print_n_purge 10036 } 10037 ); 10038 $t->parsefile( 'doc.xml'); 10039 10040 sub print_n_purge 10041 { my( $t, $elt)= @_; 10042 print $elt->text; # print the text (including sub-element texts) 10043 $t->purge; # frees the memory 10044 } 10045 10046You can use that mode when you want to process parts of a documents but are 10047not interested in the rest and you don't want to pay the price, either in 10048time or memory, to build the tree for the it. 10049 10050 10051=head2 Building an XML filter 10052 10053You can combine the C<twig_roots> and the C<twig_print_outside_roots> options to 10054build filters, which let you modify selected elements and will output the rest 10055of the document as is. 10056 10057This would convert prices in $ to prices in Euro in a document: 10058 10059 my $t= XML::Twig->new( 10060 twig_roots => { 'price' => \&convert, }, # process prices 10061 twig_print_outside_roots => 1, # print the rest 10062 ); 10063 $t->parsefile( 'doc.xml'); 10064 10065 sub convert 10066 { my( $t, $price)= @_; 10067 my $currency= $price->{'att'}->{'currency'}; # get the currency 10068 if( $currency eq 'USD') 10069 { $usd_price= $price->text; # get the price 10070 # %rate is just a conversion table 10071 my $euro_price= $usd_price * $rate{usd2euro}; 10072 $price->set_text( $euro_price); # set the new price 10073 $price->set_att( currency => 'EUR'); # don't forget this! 10074 } 10075 $price->print; # output the price 10076 } 10077 10078=head2 XML::Twig and various versions of Perl, XML::Parser and expat: 10079 10080XML::Twig is a lot more sensitive to variations in versions of perl, 10081XML::Parser and expat than to the OS, so this should cover some 10082reasonable configurations. 10083 10084The "recommended configuration" is perl 5.8.3+ (for good Unicode 10085support), XML::Parser 2.31+ and expat 1.95.5+ 10086 10087See L<http://testers.cpan.org/search?request=dist&dist=XML-Twig> for the 10088CPAN testers reports on XML::Twig, which list all tested configurations. 10089 10090An Atom feed of the CPAN Testers results is available at 10091L<http://xmltwig.org/rss/twig_testers.rss> 10092 10093Finally: 10094 10095=over 4 10096 10097=item XML::Twig does B<NOT> work with expat 1.95.4 10098 10099=item XML::Twig only works with XML::Parser 2.27 in perl 5.6.* 10100 10101Note that I can't compile XML::Parser 2.27 anymore, so I can't guarantee 10102that it still works 10103 10104=item XML::Parser 2.28 does not really work 10105 10106=back 10107 10108When in doubt, upgrade expat, XML::Parser and Scalar::Util 10109 10110Finally, for some optional features, XML::Twig depends on some additional 10111modules. The complete list, which depends somewhat on the version of Perl 10112that you are running, is given by running C<t/zz_dump_config.t> 10113 10114=head1 Simplifying XML processing 10115 10116=over 4 10117 10118=item Whitespaces 10119 10120Whitespaces that look non-significant are discarded, this behaviour can be 10121controlled using the C<L<keep_spaces> >, 10122C<L<keep_spaces_in> > and 10123C<L<discard_spaces_in> > options. 10124 10125=item Encoding 10126 10127You can specify that you want the output in the same encoding as the input 10128(provided you have valid XML, which means you have to specify the encoding 10129either in the document or when you create the Twig object) using the 10130C<L<keep_encoding> > option 10131 10132You can also use C<L<output_encoding>> to convert the internal UTF-8 format 10133to the required encoding. 10134 10135=item Comments and Processing Instructions (PI) 10136 10137Comments and PI's can be hidden from the processing, but still appear in the 10138output (they are carried by the "real" element closer to them) 10139 10140=item Pretty Printing 10141 10142XML::Twig can output the document pretty printed so it is easier to read for 10143us humans. 10144 10145=item Surviving an untimely death 10146 10147XML parsers are supposed to react violently when fed improper XML. 10148XML::Parser just dies. 10149 10150XML::Twig provides the C<L<safe_parse> > and the 10151C<L<safe_parsefile> > methods which wrap the parse in an eval 10152and return either the parsed twig or 0 in case of failure. 10153 10154=item Private attributes 10155 10156Attributes with a name starting with # (illegal in XML) will not be 10157output, so you can safely use them to store temporary values during 10158processing. Note that you can store anything in a private attribute, 10159not just text, it's just a regular Perl variable, so a reference to 10160an object or a huge data structure is perfectly fine. 10161 10162=back 10163 10164=head1 CLASSES 10165 10166XML::Twig uses a very limited number of classes. The ones you are most likely to use 10167are C<L<XML::Twig>> of course, which represents a complete XML document, including the 10168document itself (the root of the document itself is C<L<root>>), its handlers, its 10169input or output filters... The other main class is C<L<XML::Twig::Elt>>, which models 10170an XML element. Element here has a very wide definition: it can be a regular element, or 10171but also text, with an element C<L<tag>> of C<#PCDATA> (or C<#CDATA>), an entity (tag is 10172C<#ENT>), a Processing Instruction (C<#PI>), a comment (C<#COMMENT>). 10173 10174Those are the 2 commonly used classes. 10175 10176You might want to look the C<L<elt_class>> option if you want to subclass C<XML::Twig::Elt>. 10177 10178Attributes are just attached to their parent element, they are not objects per se. (Please 10179use the provided methods C<L<att>> and C<L<set_att>> to access them, if you access them 10180as a hash, then your code becomes implementation dependent and might break in the future). 10181 10182Other classes that are seldom used are C<L<XML::Twig::Entity_list>> and C<L<XML::Twig::Entity>>. 10183 10184If you use C<L<XML::Twig::XPath>> instead of C<XML::Twig>, elements are then created as 10185C<L<XML::Twig::XPath::Elt>> 10186 10187 10188=head1 METHODS 10189 10190=head2 XML::Twig 10191 10192A twig is a subclass of XML::Parser, so all XML::Parser methods can be 10193called on a twig object, including parse and parsefile. 10194C<setHandlers> on the other hand cannot be used, see C<L<BUGS> > 10195 10196 10197=over 4 10198 10199=item new 10200 10201This is a class method, the constructor for XML::Twig. Options are passed 10202as keyword value pairs. Recognized options are the same as XML::Parser, 10203plus some (in fact a lot!) XML::Twig specifics. 10204 10205New Options: 10206 10207=over 4 10208 10209=item twig_handlers 10210 10211This argument consists of a hash C<{ expression => \&handler}> where 10212expression is a an I<XPath-like expression> (+ some others). 10213 10214XPath expressions are limited to using the child and descendant axis 10215(indeed you can't specify an axis), and predicates cannot be nested. 10216You can use the C<string>, or C<< string(<tag>) >> function (except 10217in C<twig_roots> triggers). 10218 10219Additionally you can use regexps (/ delimited) to match attribute 10220and string values. 10221 10222Examples: 10223 10224 foo 10225 foo/bar 10226 foo//bar 10227 /foo/bar 10228 /foo//bar 10229 /foo/bar[@att1 = "val1" and @att2 = "val2"]/baz[@a >= 1] 10230 foo[string()=~ /^duh!+/] 10231 /foo[string(bar)=~ /\d+/]/baz[@att != 3] 10232 10233#CDATA can be used to call a handler for a CDATA section. 10234#COMMENT can be used to call a handler for comments 10235 10236Some additional (non-XPath) expressions are also provided for convenience: 10237 10238=over 4 10239 10240=item processing instructions 10241 10242C<'?'> or C<'#PI'> triggers the handler for any processing instruction, 10243and C<< '?<target>' >> or C<< '#PI <target>' >> triggers a handler for processing 10244instruction with the given target( ex: C<'#PI xml-stylesheet'>). 10245 10246=item level(<level>) 10247 10248Triggers the handler on any element at that level in the tree (root is level 1) 10249 10250=item _all_ 10251 10252Triggers the handler for B<all> elements in the tree 10253 10254=item _default_ 10255 10256Triggers the handler for each element that does NOT have any other handler. 10257 10258=back 10259 10260Expressions are evaluated against the input document. 10261Which means that even if you have changed the tag of an element (changing the 10262tag of a parent element from a handler for example) the change will not impact 10263the expression evaluation. There is an exception to this: "private" attributes 10264(which name start with a '#', and can only be created during the parsing, as 10265they are not valid XML) are checked against the current twig. 10266 10267Handlers are triggered in fixed order, sorted by their type (xpath expressions 10268first, then regexps, then level), then by whether they specify a full path 10269(starting at the root element) or 10270not, then by number of steps in the expression, then number of 10271predicates, then number of tests in predicates. Handlers where the last 10272step does not specify a step (C<foo/bar/*>) are triggered after other XPath 10273handlers. Finally C<_all_> handlers are triggered last. 10274 10275B<Important>: once a handler has been triggered if it returns 0 then no other 10276handler is called, except a C<_all_> handler which will be called anyway. 10277 10278If a handler returns a true value and other handlers apply, then the next 10279applicable handler will be called. Repeat, rinse, lather..; The exception 10280to that rule is when the C<L<do_not_chain_handlers>> 10281option is set, in which case only the first handler will be called. 10282 10283Note that it might be a good idea to explicitly return a short true value 10284(like 1) from handlers: this ensures that other applicable handlers are 10285called even if the last statement for the handler happens to evaluate to 10286false. This might also speedup the code by avoiding the result of the last 10287statement of the code to be copied and passed to the code managing handlers. 10288It can really pay to have 1 instead of a long string returned. 10289 10290When the closing tag for an element is parsed the corresponding handler is 10291called, with 2 arguments: the twig and the C<L<Element> >. The twig includes 10292the document tree that has been built so far, the element is the complete 10293sub-tree for the element. B<The fact that the handler is called only when the 10294closing tag for the element is found means that handlers for inner elements 10295are called before handlers for outer elements>. 10296 10297C<$_> is also set to the element, so it is easy to write inline handlers like 10298 10299 para => sub { $_->set_tag( 'p'); } 10300 10301Text is stored in elements whose tag name is #PCDATA (due to mixed content, 10302text and sub-element in an element there is no way to store the text as just 10303an attribute of the enclosing element, this is similar to the DOM model). 10304 10305B<Warning>: if you have used purge or flush on the twig the element might not 10306be complete, some of its children might have been entirely flushed or purged, 10307and the start tag might even have been printed (by C<flush>) already, so changing 10308its tag might not give the expected result. 10309 10310 10311=item twig_roots 10312 10313This argument let's you build the tree only for those elements you are 10314interested in. 10315 10316 Example: my $t= XML::Twig->new( twig_roots => { title => 1, subtitle => 1}); 10317 $t->parsefile( file); 10318 my $t= XML::Twig->new( twig_roots => { 'section/title' => 1}); 10319 $t->parsefile( file); 10320 10321 10322return a twig containing a document including only C<title> and C<subtitle> 10323elements, as children of the root element. 10324 10325You can use I<generic_attribute_condition>, I<attribute_condition>, 10326I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and 10327I<_all_> to trigger the building of the twig. 10328I<string_condition> and I<regexp_condition> cannot be used as the content 10329of the element, and the string, have not yet been parsed when the condition 10330is checked. 10331 10332B<WARNING>: path are checked for the document. Even if the C<twig_roots> option 10333is used they will be checked against the full document tree, not the virtual 10334tree created by XML::Twig 10335 10336 10337B<WARNING>: twig_roots elements should NOT be nested, that would hopelessly 10338confuse XML::Twig ;--( 10339 10340Note: you can set handlers (twig_handlers) using twig_roots 10341 Example: my $t= XML::Twig->new( twig_roots => 10342 { title => sub { $_[1]->print;}, 10343 subtitle => \&process_subtitle 10344 } 10345 ); 10346 $t->parsefile( file); 10347 10348 10349=item twig_print_outside_roots 10350 10351To be used in conjunction with the C<twig_roots> argument. When set to a true 10352value this will print the document outside of the C<twig_roots> elements. 10353 10354 Example: my $t= XML::Twig->new( twig_roots => { title => \&number_title }, 10355 twig_print_outside_roots => 1, 10356 ); 10357 $t->parsefile( file); 10358 { my $nb; 10359 sub number_title 10360 { my( $twig, $title); 10361 $nb++; 10362 $title->prefix( "$nb "); 10363 $title->print; 10364 } 10365 } 10366 10367 10368This example prints the document outside of the title element, calls 10369C<number_title> for each C<title> element, prints it, and then resumes printing 10370the document. The twig is built only for the C<title> elements. 10371 10372If the value is a reference to a file handle then the document outside the 10373C<twig_roots> elements will be output to this file handle: 10374 10375 open( my $out, '>', 'out_file.xml') or die "cannot open out file.xml out_file:$!"; 10376 my $t= XML::Twig->new( twig_roots => { title => \&number_title }, 10377 # default output to $out 10378 twig_print_outside_roots => $out, 10379 ); 10380 10381 { my $nb; 10382 sub number_title 10383 { my( $twig, $title); 10384 $nb++; 10385 $title->prefix( "$nb "); 10386 $title->print( $out); # you have to print to \*OUT here 10387 } 10388 } 10389 10390 10391=item start_tag_handlers 10392 10393A hash C<{ expression => \&handler}>. Sets element handlers that are called when 10394the element is open (at the end of the XML::Parser C<Start> handler). The handlers 10395are called with 2 params: the twig and the element. The element is empty at 10396that point, its attributes are created though. 10397 10398You can use I<generic_attribute_condition>, I<attribute_condition>, 10399I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and I<_all_> 10400to trigger the handler. 10401 10402I<string_condition> and I<regexp_condition> cannot be used as the content of 10403the element, and the string, have not yet been parsed when the condition is 10404checked. 10405 10406The main uses for those handlers are to change the tag name (you might have to 10407do it as soon as you find the open tag if you plan to C<flush> the twig at some 10408point in the element, and to create temporary attributes that will be used 10409when processing sub-element with C<twig_hanlders>. 10410 10411B<Note>: C<start_tag> handlers can be called outside of C<twig_roots> if this 10412argument is used. Since the element object is not built, in this case handlers 10413are called with the following arguments: C<$t> (the twig), C<$tag> (the tag of 10414the element) and C<%att> (a hash of the attributes of the element). 10415 10416If the C<twig_print_outside_roots> argument is also used, if the last handler 10417called returns a C<true> value, then the start tag will be output as it 10418appeared in the original document, if the handler returns a C<false> value 10419then the start tag will B<not> be printed (so you can print a modified string 10420yourself for example). 10421 10422Note that you can use the L<ignore> method in C<start_tag_handlers> 10423(and only there). 10424 10425=item end_tag_handlers 10426 10427A hash C<{ expression => \&handler}>. Sets element handlers that are called when 10428the element is closed (at the end of the XML::Parser C<End> handler). The handlers 10429are called with 2 params: the twig and the tag of the element. 10430 10431I<twig_handlers> are called when an element is completely parsed, so why have 10432this redundant option? There is only one use for C<end_tag_handlers>: when using 10433the C<twig_roots> option, to trigger a handler for an element B<outside> the roots. 10434It is for example very useful to number titles in a document using nested 10435sections: 10436 10437 my @no= (0); 10438 my $no; 10439 my $t= XML::Twig->new( 10440 start_tag_handlers => 10441 { section => sub { $no[$#no]++; $no= join '.', @no; push @no, 0; } }, 10442 twig_roots => 10443 { title => sub { $_->prefix( $no); $_->print; } }, 10444 end_tag_handlers => { section => sub { pop @no; } }, 10445 twig_print_outside_roots => 1 10446 ); 10447 $t->parsefile( $file); 10448 10449Using the C<end_tag_handlers> argument without C<twig_roots> will result in an 10450error. 10451 10452=item do_not_chain_handlers 10453 10454If this option is set to a true value, then only one handler will be called for 10455each element, even if several satisfy the condition 10456 10457Note that the C<_all_> handler will still be called regardless 10458 10459=item ignore_elts 10460 10461This option lets you ignore elements when building the twig. This is useful 10462in cases where you cannot use C<twig_roots> to ignore elements, for example if 10463the element to ignore is a sibling of elements you are interested in. 10464 10465Example: 10466 10467 my $twig= XML::Twig->new( ignore_elts => { elt => 'discard' }); 10468 $twig->parsefile( 'doc.xml'); 10469 10470This will build the complete twig for the document, except that all C<elt> 10471elements (and their children) will be left out. 10472 10473The keys in the hash are triggers, limited to the same subset as 10474C<L<start_tag_handlers>>. The values can be C<discard>, to discard 10475the element, C<print>, to output the element as-is, C<string> to 10476store the text of the ignored element(s), including markup, in a field of 10477the twig: C<< $t->{twig_buffered_string} >> or a reference to a scalar, in 10478which case the text of the ignored element(s), including markup, will be 10479stored in the scalar. Any other value will be treated as C<discard>. 10480 10481 10482=item char_handler 10483 10484A reference to a subroutine that will be called every time C<PCDATA> is found. 10485 10486The subroutine receives the string as argument, and returns the modified string: 10487 10488 # WE WANT ALL STRINGS IN UPPER CASE 10489 sub my_char_handler 10490 { my( $text)= @_; 10491 $text= uc( $text); 10492 return $text; 10493 } 10494 10495=item elt_class 10496 10497The name of a class used to store elements. this class should inherit from 10498C<XML::Twig::Elt> (and by default it is C<XML::Twig::Elt>). This option is used 10499to subclass the element class and extend it with new methods. 10500 10501This option is needed because during the parsing of the XML, elements are created 10502by C<XML::Twig>, without any control from the user code. 10503 10504=item keep_atts_order 10505 10506Setting this option to a true value causes the attribute hash to be tied to 10507a C<Tie::IxHash> object. 10508This means that C<Tie::IxHash> needs to be installed for this option to be 10509available. It also means that the hash keeps its order, so you will get 10510the attributes in order. This allows outputting the attributes in the same 10511order as they were in the original document. 10512 10513=item keep_encoding 10514 10515This is a (slightly?) evil option: if the XML document is not UTF-8 encoded and 10516you want to keep it that way, then setting keep_encoding will use theC<Expat> 10517original_string method for character, thus keeping the original encoding, as 10518well as the original entities in the strings. 10519 10520See the C<t/test6.t> test file to see what results you can expect from the 10521various encoding options. 10522 10523B<WARNING>: if the original encoding is multi-byte then attribute parsing will 10524be EXTREMELY unsafe under any Perl before 5.6, as it uses regular expressions 10525which do not deal properly with multi-byte characters. You can specify an 10526alternate function to parse the start tags with the C<parse_start_tag> option 10527(see below) 10528 10529B<WARNING>: this option is NOT used when parsing with XML::Parser non-blocking 10530parser (C<parse_start>, C<parse_more>, C<parse_done> methods) which you probably 10531should not use with XML::Twig anyway as they are totally untested! 10532 10533=item output_encoding 10534 10535This option generates an output_filter using C<Encode>, C<Text::Iconv> or 10536C<Unicode::Map8> and C<Unicode::Strings>, and sets the encoding in the XML 10537declaration. This is the easiest way to deal with encodings, if you need 10538more sophisticated features, look at C<output_filter> below 10539 10540 10541=item output_filter 10542 10543This option is used to convert the character encoding of the output document. 10544It is passed either a string corresponding to a predefined filter or 10545a subroutine reference. The filter will be called every time a document or 10546element is processed by the "print" functions (C<print>, C<sprint>, C<flush>). 10547 10548Pre-defined filters: 10549 10550=over 4 10551 10552=item latin1 10553 10554uses either C<Encode>, C<Text::Iconv> or C<Unicode::Map8> and C<Unicode::String> 10555or a regexp (which works only with XML::Parser 2.27), in this order, to convert 10556all characters to ISO-8859-15 (usually latin1 is synonym to ISO-8859-1, but 10557in practice it seems that ISO-8859-15, which includes the euro sign, is more 10558useful and probably what most people want). 10559 10560=item html 10561 10562does the same conversion as C<latin1>, plus encodes entities using 10563C<HTML::Entities> (oddly enough you will need to have HTML::Entities installed 10564for it to be available). This should only be used if the tags and attribute 10565names themselves are in US-ASCII, or they will be converted and the output will 10566not be valid XML any more 10567 10568=item safe 10569 10570converts the output to ASCII (US) only plus I<character entities> (C<&#nnn;>) 10571this should be used only if the tags and attribute names themselves are in 10572US-ASCII, or they will be converted and the output will not be valid XML any 10573more 10574 10575=item safe_hex 10576 10577same as C<safe> except that the character entities are in hex (C<&#xnnn;>) 10578 10579=item encode_convert ($encoding) 10580 10581Return a subref that can be used to convert utf8 strings to C<$encoding>). 10582Uses C<Encode>. 10583 10584 my $conv = XML::Twig::encode_convert( 'latin1'); 10585 my $t = XML::Twig->new(output_filter => $conv); 10586 10587=item iconv_convert ($encoding) 10588 10589this function is used to create a filter subroutine that will be used to 10590convert the characters to the target encoding using C<Text::Iconv> (which needs 10591to be installed, look at the documentation for the module and for the 10592C<iconv> library to find out which encodings are available on your system, 10593C<iconv -l> should give you a list of available encodings) 10594 10595 my $conv = XML::Twig::iconv_convert( 'latin1'); 10596 my $t = XML::Twig->new(output_filter => $conv); 10597 10598=item unicode_convert ($encoding) 10599 10600this function is used to create a filter subroutine that will be used to 10601convert the characters to the target encoding using C<Unicode::Strings> 10602and C<Unicode::Map8> (which need to be installed, look at the documentation 10603for the modules to find out which encodings are available on your system) 10604 10605 my $conv = XML::Twig::unicode_convert( 'latin1'); 10606 my $t = XML::Twig->new(output_filter => $conv); 10607 10608=back 10609 10610The C<text> and C<att> methods do not use the filter, so their 10611result are always in unicode. 10612 10613Those predeclared filters are based on subroutines that can be used 10614by themselves (as C<XML::Twig::foo>). 10615 10616=over 4 10617 10618=item html_encode ($string) 10619 10620Use C<HTML::Entities> to encode a utf8 string 10621 10622=item safe_encode ($string) 10623 10624Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters 10625in the string in C<< &#<nnnn>; >> format 10626 10627=item safe_encode_hex ($string) 10628 10629Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters 10630in the string in C<< &#x<nnnn>; >> format 10631 10632=item regexp2latin1 ($string) 10633 10634Use a regexp to encode a utf8 string into latin 1 (ISO-8859-1). Does not 10635work with Perl 5.8.0! 10636 10637=back 10638 10639=item output_text_filter 10640 10641same as output_filter, except it doesn't apply to the brackets and quotes 10642around attribute values. This is useful for all filters that could change 10643the tagging, basically anything that does not just change the encoding of 10644the output. C<html>, C<safe> and C<safe_hex> are better used with this option. 10645 10646=item input_filter 10647 10648This option is similar to C<output_filter> except the filter is applied to 10649the characters before they are stored in the twig, at parsing time. 10650 10651=item remove_cdata 10652 10653Setting this option to a true value will force the twig to output CDATA 10654sections as regular (escaped) PCDATA 10655 10656=item parse_start_tag 10657 10658If you use the C<keep_encoding> option then this option can be used to replace 10659the default parsing function. You should provide a coderef (a reference to a 10660subroutine) as the argument, this subroutine takes the original tag (given 10661by XML::Parser::Expat C<original_string()> method) and returns a tag and the 10662attributes in a hash (or in a list attribute_name/attribute value). 10663 10664=item no_xxe 10665 10666prevents external entities to be parsed. 10667 10668This is a security feature, in case the input XML cannot be trusted. With this 10669option set to a true value defining external entities in the document will cause 10670the parse to fail. 10671 10672This prevents an entity like C<< <!ENTITY xxe PUBLIC "bar" "/etc/passwd"> >> to 10673make the password fiel available in the document. 10674 10675 10676=item expand_external_ents 10677 10678When this option is used external entities (that are defined) are expanded 10679when the document is output using "print" functions such as C<L<print> >, 10680C<L<sprint> >, C<L<flush> > and C<L<xml_string> >. 10681Note that in the twig the entity will be stored as an element with a 10682tag 'C<#ENT>', the entity will not be expanded there, so you might want to 10683process the entities before outputting it. 10684 10685If an external entity is not available, then the parse will fail. 10686 10687A special case is when the value of this option is -1. In that case a missing 10688entity will not cause the parser to die, but its C<name>, C<sysid> and C<pubid> 10689will be stored in the twig as C<< $twig->{twig_missing_system_entities} >> 10690(a reference to an array of hashes { name => <name>, sysid => <sysid>, 10691pubid => <pubid> }). Yes, this is a bit of a hack, but it's useful in some 10692cases. 10693 10694=item load_DTD 10695 10696If this argument is set to a true value, C<parse> or C<parsefile> on the twig 10697will load the DTD information. This information can then be accessed through 10698the twig, in a C<DTD_handler> for example. This will load even an external DTD. 10699 10700Default and fixed values for attributes will also be filled, based on the DTD. 10701 10702Note that to do this the module will generate a temporary file in the current 10703directory. If this is a problem let me know and I will add an option to 10704specify an alternate directory. 10705 10706See L<DTD Handling> for more information 10707 10708=item DTD_base <path_to_DTD_directory> 10709 10710If the DTD is in a different directory, looks for it there, useful to make up 10711somewhat for the lack of catalog suport in C<expat>. You still need a SYSTEM 10712declaration 10713 10714=item DTD_handler 10715 10716Set a handler that will be called once the doctype (and the DTD) have been 10717loaded, with 2 arguments, the twig and the DTD. 10718 10719=item no_prolog 10720 10721Does not output a prolog (XML declaration and DTD) 10722 10723=item id 10724 10725This optional argument gives the name of an attribute that can be used as 10726an ID in the document. Elements whose ID is known can be accessed through 10727the elt_id method. id defaults to 'id'. 10728See C<L<BUGS> > 10729 10730=item discard_spaces 10731 10732If this optional argument is set to a true value then spaces are discarded 10733when they look non-significant: strings containing only spaces and at least 10734one line feed are discarded. This argument is set to true by default. 10735 10736The exact algorithm to drop spaces is: strings including only spaces (perl \s) 10737and at least one \n right before an open or close tag are dropped. 10738 10739=item discard_all_spaces 10740 10741If this argument is set to a true value, spaces are discarded more 10742aggressively than with C<discard_spaces>: strings not including a \n are also 10743dropped. This option is appropriate for data-oriented XML. 10744 10745 10746=item keep_spaces 10747 10748If this optional argument is set to a true value then all spaces in the 10749document are kept, and stored as C<PCDATA>. 10750 10751B<Warning>: adding this option can result in changes in the twig generated: 10752space that was previously discarded might end up in a new text element. see 10753the difference by calling the following code with 0 and 1 as arguments: 10754 10755 perl -MXML::Twig -e'print XML::Twig->new( keep_spaces => shift)->parse( "<d> \n<e/></d>")->_dump' 10756 10757 10758C<keep_spaces> and C<discard_spaces> cannot be both set. 10759 10760=item discard_spaces_in 10761 10762This argument sets C<keep_spaces> to true but will cause the twig builder to 10763discard spaces in the elements listed. 10764 10765The syntax for using this argument is: 10766 10767 XML::Twig->new( discard_spaces_in => [ 'elt1', 'elt2']); 10768 10769=item keep_spaces_in 10770 10771This argument sets C<discard_spaces> to true but will cause the twig builder to 10772keep spaces in the elements listed. 10773 10774The syntax for using this argument is: 10775 10776 XML::Twig->new( keep_spaces_in => [ 'elt1', 'elt2']); 10777 10778B<Warning>: adding this option can result in changes in the twig generated: 10779space that was previously discarded might end up in a new text element. 10780 10781=item pretty_print 10782 10783Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 10784'C<nice>', 'C<indented>', 'C<indented_c>', 'C<indented_a>', 10785'C<indented_close_tag>', 'C<cvs>', 'C<wrapped>', 'C<record>' and 'C<record_c>' 10786 10787pretty_print formats: 10788 10789=over 4 10790 10791=item none 10792 10793The document is output as one ling string, with no line breaks except those 10794found within text elements 10795 10796=item nsgmls 10797 10798Line breaks are inserted in safe places: that is within tags, between a tag 10799and an attribute, between attributes and before the > at the end of a tag. 10800 10801This is quite ugly but better than C<none>, and it is very safe, the document 10802will still be valid (conforming to its DTD). 10803 10804This is how the SGML parser C<sgmls> splits documents, hence the name. 10805 10806=item nice 10807 10808This option inserts line breaks before any tag that does not contain text (so 10809element with textual content are not broken as the \n is the significant). 10810 10811B<WARNING>: this option leaves the document well-formed but might make it 10812invalid (not conformant to its DTD). If you have elements declared as 10813 10814 <!ELEMENT foo (#PCDATA|bar)> 10815 10816then a C<foo> element including a C<bar> one will be printed as 10817 10818 <foo> 10819 <bar>bar is just pcdata</bar> 10820 </foo> 10821 10822This is invalid, as the parser will take the line break after the C<foo> tag 10823as a sign that the element contains PCDATA, it will then die when it finds the 10824C<bar> tag. This may or may not be important for you, but be aware of it! 10825 10826=item indented 10827 10828Same as C<nice> (and with the same warning) but indents elements according to 10829their level 10830 10831=item indented_c 10832 10833Same as C<indented> but a little more compact: the closing tags are on the 10834same line as the preceding text 10835 10836=item indented_close_tag 10837 10838Same as C<indented> except that the closing tag is also indented, to line up 10839with the tags within the element 10840 10841=item idented_a 10842 10843This formats XML files in a line-oriented version control friendly way. 10844The format is described in L<http://tinyurl.com/2kwscq> (that's an Oracle 10845document with an insanely long URL). 10846 10847Note that to be totaly conformant to the "spec", the order of attributes 10848should not be changed, so if they are not already in alphabetical order 10849you will need to use the C<L<keep_atts_order>> option. 10850 10851=item cvs 10852 10853Same as C<L<idented_a>>. 10854 10855=item wrapped 10856 10857Same as C<indented_c> but lines are wrapped using L<Text::Wrap::wrap>. The 10858default length for lines is the default for C<$Text::Wrap::columns>, and can 10859be changed by changing that variable. 10860 10861=item record 10862 10863This is a record-oriented pretty print, that display data in records, one field 10864per line (which looks a LOT like C<indented>) 10865 10866=item record_c 10867 10868Stands for record compact, one record per line 10869 10870=back 10871 10872 10873=item empty_tags 10874 10875Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). 10876 10877C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 10878'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 10879'C<< <tag></tag> >>' 10880 10881=item quote 10882 10883Set the quote character for attributes ('C<single>' or 'C<double>'). 10884 10885=item escape_gt 10886 10887By default XML::Twig does not escape the character > in its output, as it is not 10888mandated by the XML spec. With this option on, > will be replaced by C<>> 10889 10890=item comments 10891 10892Set the way comments are processed: 'C<drop>' (default), 'C<keep>' or 10893'C<process>' 10894 10895Comments processing options: 10896 10897=over 4 10898 10899=item drop 10900 10901drops the comments, they are not read, nor printed to the output 10902 10903=item keep 10904 10905comments are loaded and will appear on the output, they are not 10906accessible within the twig and will not interfere with processing 10907though 10908 10909B<Note>: comments in the middle of a text element such as 10910 10911 <p>text <!-- comment --> more text --></p> 10912 10913are kept at their original position in the text. Using ˝"print" 10914methods like C<print> or C<sprint> will return the comments in the 10915text. Using C<text> or C<field> on the other hand will not. 10916 10917Any use of C<set_pcdata> on the C<#PCDATA> element (directly or 10918through other methods like C<set_content>) will delete the comment(s). 10919 10920=item process 10921 10922comments are loaded in the twig and will be treated as regular elements 10923(their C<tag> is C<#COMMENT>) this can interfere with processing if you 10924expect C<< $elt->{first_child} >> to be an element but find a comment there. 10925Validation will not protect you from this as comments can happen anywhere. 10926You can use C<< $elt->first_child( 'tag') >> (which is a good habit anyway) 10927to get where you want. 10928 10929Consider using C<process> if you are outputting SAX events from XML::Twig. 10930 10931=back 10932 10933=item pi 10934 10935Set the way processing instructions are processed: 'C<drop>', 'C<keep>' 10936(default) or 'C<process>' 10937 10938Note that you can also set PI handlers in the C<twig_handlers> option: 10939 10940 '?' => \&handler 10941 '?target' => \&handler 2 10942 10943The handlers will be called with 2 parameters, the twig and the PI element if 10944C<pi> is set to C<process>, and with 3, the twig, the target and the data if 10945C<pi> is set to C<keep>. Of course they will not be called if C<pi> is set to 10946C<drop>. 10947 10948If C<pi> is set to C<keep> the handler should return a string that will be used 10949as-is as the PI text (it should look like "C< <?target data?> >" or '' if you 10950want to remove the PI), 10951 10952Only one handler will be called, C<?target> or C<?> if no specific handler for 10953that target is available. 10954 10955=item map_xmlns 10956 10957This option is passed a hashref that maps uri's to prefixes. The prefixes in 10958the document will be replaced by the ones in the map. The mapped prefixes can 10959(actually have to) be used to trigger handlers, navigate or query the document. 10960 10961Here is an example: 10962 10963 my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"}, 10964 twig_handlers => 10965 { 'svg:circle' => sub { $_->set_att( r => 20) } }, 10966 pretty_print => 'indented', 10967 ) 10968 ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg"> 10969 <gr:circle cx="10" cy="90" r="10"/> 10970 </doc>' 10971 ) 10972 ->print; 10973 10974This will output: 10975 10976 <doc xmlns:svg="http://www.w3.org/2000/svg"> 10977 <svg:circle cx="10" cy="90" r="20"/> 10978 </doc> 10979 10980=item keep_original_prefix 10981 10982When used with C<L<map_xmlns>> this option will make C<XML::Twig> use the original 10983namespace prefixes when outputting a document. The mapped prefix will still be used 10984for triggering handlers and in navigation and query methods. 10985 10986 my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"}, 10987 twig_handlers => 10988 { 'svg:circle' => sub { $_->set_att( r => 20) } }, 10989 keep_original_prefix => 1, 10990 pretty_print => 'indented', 10991 ) 10992 ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg"> 10993 <gr:circle cx="10" cy="90" r="10"/> 10994 </doc>' 10995 ) 10996 ->print; 10997 10998This will output: 10999 11000 <doc xmlns:gr="http://www.w3.org/2000/svg"> 11001 <gr:circle cx="10" cy="90" r="20"/> 11002 </doc> 11003 11004=item original_uri ($prefix) 11005 11006called within a handler, this will return the uri bound to the namespace prefix 11007in the original document. 11008 11009=item index ($arrayref or $hashref) 11010 11011This option creates lists of specific elements during the parsing of the XML. 11012It takes a reference to either a list of triggering expressions or to a hash 11013name => expression, and for each one generates the list of elements that 11014match the expression. The list can be accessed through the C<L<index>> method. 11015 11016example: 11017 11018 # using an array ref 11019 my $t= XML::Twig->new( index => [ 'div', 'table' ]) 11020 ->parsefile( "foo.xml"); 11021 my $divs= $t->index( 'div'); 11022 my $first_div= $divs->[0]; 11023 my $last_table= $t->index( table => -1); 11024 11025 # using a hashref to name the indexes 11026 my $t= XML::Twig->new( index => { email => 'a[@href=~/^ \s*mailto:/]'}) 11027 ->parsefile( "foo.xml"); 11028 my $last_emails= $t->index( email => -1); 11029 11030Note that the index is not maintained after the parsing. If elements are 11031deleted, renamed or otherwise hurt during processing, the index is NOT updated. 11032(changing the id element OTOH will update the index) 11033 11034=item att_accessors <list of attribute names> 11035 11036creates methods that give direct access to attribute: 11037 11038 my $t= XML::Twig->new( att_accessors => [ 'href', 'src']) 11039 ->parsefile( $file); 11040 my $first_href= $t->first_elt( 'img')->src; # same as ->att( 'src') 11041 $t->first_elt( 'img')->src( 'new_logo.png') # changes the attribute value 11042 11043=item elt_accessors 11044 11045creates methods that give direct access to the first child element (in scalar context) 11046or the list of elements (in list context): 11047 11048the list of accessors to create can be given 1 2 different ways: in an array, 11049or in a hash alias => expression 11050 my $t= XML::Twig->new( elt_accessors => [ 'head']) 11051 ->parsefile( $file); 11052 my $title_text= $t->root->head->field( 'title'); 11053 # same as $title_text= $t->root->first_child( 'head')->field( 'title'); 11054 11055 my $t= XML::Twig->new( elt_accessors => { warnings => 'p[@class="warning"]', d2 => 'div[2]'}, ) 11056 ->parsefile( $file); 11057 my $body= $t->first_elt( 'body'); 11058 my @warnings= $body->warnings; # same as $body->children( 'p[@class="warning"]'); 11059 my $s2= $body->d2; # same as $body->first_child( 'div[2]') 11060 11061=item field_accessors 11062 11063creates methods that give direct access to the first child element text: 11064 11065 my $t= XML::Twig->new( field_accessors => [ 'h1']) 11066 ->parsefile( $file); 11067 my $div_title_text= $t->first_elt( 'div')->title; 11068 # same as $title_text= $t->first_elt( 'div')->field( 'title'); 11069 11070=item use_tidy 11071 11072set this option to use HTML::Tidy instead of HTML::TreeBuilder to convert 11073HTML to XML. HTML, especially real (real "crap") HTML found in the wild, 11074so depending on the data, one module or the other does a better job at 11075the conversion. Also, HTML::Tidy can be a bit difficult to install, so 11076XML::Twig offers both option. TIMTOWTDI 11077 11078=item output_html_doctype 11079 11080when using HTML::TreeBuilder to convert HTML, this option causes the DOCTYPE 11081declaration to be output, which may be important for some legacy browsers. 11082Without that option the DOCTYPE definition is NOT output. Also if the definition 11083is completely wrong (ie not easily parsable), it is not output either. 11084 11085=back 11086 11087B<Note>: I _HATE_ the Java-like name of arguments used by most XML modules. 11088So in pure TIMTOWTDI fashion all arguments can be written either as 11089C<UglyJavaLikeName> or as C<readable_perl_name>: C<twig_print_outside_roots> 11090or C<TwigPrintOutsideRoots> (or even C<twigPrintOutsideRoots> {shudder}). 11091XML::Twig normalizes them before processing them. 11092 11093=item parse ( $source) 11094 11095The C<$source> parameter should either be a string containing the whole XML 11096document, or it should be an open C<IO::Handle> (aka a filehandle). 11097 11098A die call is thrown if a parse error occurs. Otherwise it will return 11099the twig built by the parse. Use C<safe_parse> if you want the parsing 11100to return even when an error occurs. 11101 11102If this method is called as a class method 11103(C<< XML::Twig->parse( $some_xml_or_html) >>) then an XML::Twig object is 11104created, using the parameters except the last one (eg 11105C<< XML::Twig->parse( pretty_print => 'indented', $some_xml_or_html) >>) 11106and C<L<xparse>> is called on it. 11107 11108Note that when parsing a filehandle, the handle should NOT be open with an 11109encoding (ie open with C<open( my $in, '<', $filename)>. The file will be 11110parsed by C<expat>, so specifying the encoding actually causes problems 11111for the parser (as in: it can crash it, see 11112https://rt.cpan.org/Ticket/Display.html?id=78877). For parsing a file it 11113is actually recommended to use C<parsefile> on the file name, instead of 11114<parse> on the open file. 11115 11116=item parsestring 11117 11118This is just an alias for C<parse> for backwards compatibility. 11119 11120=item parsefile (FILE [, OPT => OPT_VALUE [...]]) 11121 11122Open C<FILE> for reading, then call C<parse> with the open handle. The file 11123is closed no matter how C<parse> returns. 11124 11125A C<die> call is thrown if a parse error occurs. Otherwise it will return 11126the twig built by the parse. Use C<safe_parsefile> if you want the parsing 11127to return even when an error occurs. 11128 11129=item parsefile_inplace ( $file, $optional_extension) 11130 11131Parse and update a file "in place". It does this by creating a temp file, 11132selecting it as the default for print() statements (and methods), then parsing 11133the input file. If the parsing is successful, then the temp file is 11134moved to replace the input file. 11135 11136If an extension is given then the original file is backed-up (the rules for 11137the extension are the same as the rule for the -i option in perl). 11138 11139=item parsefile_html_inplace ( $file, $optional_extension) 11140 11141Same as parsefile_inplace, except that it parses HTML instead of XML 11142 11143=item parseurl ($url $optional_user_agent) 11144 11145Gets the data from C<$url> and parse it. The data is piped to the parser in 11146chunks the size of the XML::Parser::Expat buffer, so memory consumption and 11147hopefully speed are optimal. 11148 11149For most (read "small") XML it is probably as efficient (and easier to debug) 11150to just C<get> the XML file and then parse it as a string. 11151 11152 use XML::Twig; 11153 use LWP::Simple; 11154 my $twig= XML::Twig->new(); 11155 $twig->parse( LWP::Simple::get( $URL )); 11156 11157or 11158 11159 use XML::Twig; 11160 my $twig= XML::Twig->nparse( $URL); 11161 11162 11163If the C<$optional_user_agent> argument is used then it is used, otherwise a 11164new one is created. 11165 11166=item safe_parse ( SOURCE [, OPT => OPT_VALUE [...]]) 11167 11168This method is similar to C<parse> except that it wraps the parsing in an 11169C<eval> block. It returns the twig on success and 0 on failure (the twig object 11170also contains the parsed twig). C<$@> contains the error message on failure. 11171 11172Note that the parsing still stops as soon as an error is detected, there is 11173no way to keep going after an error. 11174 11175=item safe_parsefile (FILE [, OPT => OPT_VALUE [...]]) 11176 11177This method is similar to C<parsefile> except that it wraps the parsing in an 11178C<eval> block. It returns the twig on success and 0 on failure (the twig object 11179also contains the parsed twig) . C<$@> contains the error message on failure 11180 11181Note that the parsing still stops as soon as an error is detected, there is 11182no way to keep going after an error. 11183 11184=item safe_parseurl ($url $optional_user_agent) 11185 11186Same as C<parseurl> except that it wraps the parsing in an C<eval> block. It 11187returns the twig on success and 0 on failure (the twig object also contains 11188the parsed twig) . C<$@> contains the error message on failure 11189 11190=item parse_html ($string_or_fh) 11191 11192parse an HTML string or file handle (by converting it to XML using 11193HTML::TreeBuilder, which needs to be available). 11194 11195This works nicely, but some information gets lost in the process: 11196newlines are removed, and (at least on the version I use), comments 11197get an extra CDATA section inside ( <!-- foo --> becomes 11198<!-- <![CDATA[ foo ]]> --> 11199 11200=item parsefile_html ($file) 11201 11202parse an HTML file (by converting it to XML using HTML::TreeBuilder, which 11203needs to be available, or HTML::Tidy if the C<use_tidy> option was used). 11204The file is loaded completely in memory and converted to XML before being parsed. 11205 11206this method is to be used with caution though, as it doesn't know about the 11207file encoding, it is usually better to use C<L<parse_html>>, which gives you 11208a chance to open the file with the proper encoding layer. 11209 11210=item parseurl_html ($url $optional_user_agent) 11211 11212parse an URL as html the same way C<L<parse_html>> does 11213 11214=item safe_parseurl_html ($url $optional_user_agent) 11215 11216Same as C<L<parseurl_html>>> except that it wraps the parsing in an C<eval> 11217block. It returns the twig on success and 0 on failure (the twig object also 11218contains the parsed twig) . C<$@> contains the error message on failure 11219 11220=item safe_parsefile_html ($file $optional_user_agent) 11221 11222Same as C<L<parsefile_html>>> except that it wraps the parsing in an C<eval> 11223block. It returns the twig on success and 0 on failure (the twig object also 11224contains the parsed twig) . C<$@> contains the error message on failure 11225 11226=item safe_parse_html ($string_or_fh) 11227 11228Same as C<L<parse_html>> except that it wraps the parsing in an C<eval> block. 11229It returns the twig on success and 0 on failure (the twig object also contains 11230the parsed twig) . C<$@> contains the error message on failure 11231 11232=item xparse ($thing_to_parse) 11233 11234parse the C<$thing_to_parse>, whether it is a filehandle, a string, an HTML 11235file, an HTML URL, an URL or a file. 11236 11237Note that this is mostly a convenience method for one-off scripts. For example 11238files that end in '.htm' or '.html' are parsed first as XML, and if this fails 11239as HTML. This is certainly not the most efficient way to do this in general. 11240 11241=item nparse ($optional_twig_options, $thing_to_parse) 11242 11243create a twig with the C<$optional_options>, and parse the C<$thing_to_parse>, 11244whether it is a filehandle, a string, an HTML file, an HTML URL, an URL or a 11245file. 11246 11247Examples: 11248 11249 XML::Twig->nparse( "file.xml"); 11250 XML::Twig->nparse( error_context => 1, "file://file.xml"); 11251 11252=item nparse_pp ($optional_twig_options, $thing_to_parse) 11253 11254same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>. 11255 11256=item nparse_e ($optional_twig_options, $thing_to_parse) 11257 11258same as C<L<nparse>> but also sets the C<error_context> option to 1. 11259 11260=item nparse_ppe ($optional_twig_options, $thing_to_parse) 11261 11262same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented> 11263and the C<error_context> option to 1. 11264 11265=item parser 11266 11267This method returns the C<expat> object (actually the XML::Parser::Expat object) 11268used during parsing. It is useful for example to call XML::Parser::Expat methods 11269on it. To get the line of a tag for example use C<< $t->parser->current_line >>. 11270 11271=item setTwigHandlers ($handlers) 11272 11273Set the twig_handlers. C<$handlers> is a reference to a hash similar to the 11274one in the C<twig_handlers> option of new. All previous handlers are unset. 11275The method returns the reference to the previous handlers. 11276 11277=item setTwigHandler ($exp $handler) 11278 11279Set a single twig_handler for elements matching C<$exp>. C<$handler> is a 11280reference to a subroutine. If the handler was previously set then the reference 11281to the previous handler is returned. 11282 11283=item setStartTagHandlers ($handlers) 11284 11285Set the start_tag handlers. C<$handlers> is a reference to a hash similar to the 11286one in the C<start_tag_handlers> option of new. All previous handlers are unset. 11287The method returns the reference to the previous handlers. 11288 11289=item setStartTagHandler ($exp $handler) 11290 11291Set a single start_tag handlers for elements matching C<$exp>. C<$handler> is a 11292reference to a subroutine. If the handler was previously set then the reference 11293to the previous handler is returned. 11294 11295=item setEndTagHandlers ($handlers) 11296 11297Set the end_tag handlers. C<$handlers> is a reference to a hash similar to the 11298one in the C<end_tag_handlers> option of new. All previous handlers are unset. 11299The method returns the reference to the previous handlers. 11300 11301=item setEndTagHandler ($exp $handler) 11302 11303Set a single end_tag handlers for elements matching C<$exp>. C<$handler> is a 11304reference to a subroutine. If the handler was previously set then the 11305reference to the previous handler is returned. 11306 11307=item setTwigRoots ($handlers) 11308 11309Same as using the C<L<twig_roots>> option when creating the twig 11310 11311=item setCharHandler ($exp $handler) 11312 11313Set a C<char_handler> 11314 11315=item setIgnoreEltsHandler ($exp) 11316 11317Set a C<ignore_elt> handler (elements that match C<$exp> will be ignored 11318 11319=item setIgnoreEltsHandlers ($exp) 11320 11321Set all C<ignore_elt> handlers (previous handlers are replaced) 11322 11323=item dtd 11324 11325Return the dtd (an L<XML::Twig::DTD> object) of a twig 11326 11327=item xmldecl 11328 11329Return the XML declaration for the document, or a default one if it doesn't 11330have one 11331 11332=item doctype 11333 11334Return the doctype for the document 11335 11336=item doctype_name 11337 11338returns the doctype of the document from the doctype declaration 11339 11340=item system_id 11341 11342returns the system value of the DTD of the document from the doctype declaration 11343 11344=item public_id 11345 11346returns the public doctype of the document from the doctype declaration 11347 11348=item internal_subset 11349 11350returns the internal subset of the DTD 11351 11352=item dtd_text 11353 11354Return the DTD text 11355 11356=item dtd_print 11357 11358Print the DTD 11359 11360=item model ($tag) 11361 11362Return the model (in the DTD) for the element C<$tag> 11363 11364=item root 11365 11366Return the root element of a twig 11367 11368=item set_root ($elt) 11369 11370Set the root of a twig 11371 11372=item first_elt ($optional_condition) 11373 11374Return the first element matching C<$optional_condition> of a twig, if 11375no condition is given then the root is returned 11376 11377=item last_elt ($optional_condition) 11378 11379Return the last element matching C<$optional_condition> of a twig, if 11380no condition is given then the last element of the twig is returned 11381 11382=item elt_id ($id) 11383 11384Return the element whose C<id> attribute is $id 11385 11386=item getEltById 11387 11388Same as C<L<elt_id>> 11389 11390=item index ($index_name, $optional_index) 11391 11392If the C<$optional_index> argument is present, return the corresponding element 11393in the index (created using the C<index> option for C<XML::Twig->new>) 11394 11395If the argument is not present, return an arrayref to the index 11396 11397=item normalize 11398 11399merge together all consecutive pcdata elements in the document (if for example 11400you have turned some elements into pcdata using C<L<erase>>, this will give you 11401a "clean" document in which there all text elements are as long as possible). 11402 11403=item encoding 11404 11405This method returns the encoding of the XML document, as defined by the 11406C<encoding> attribute in the XML declaration (ie it is C<undef> if the attribute 11407is not defined) 11408 11409=item set_encoding 11410 11411This method sets the value of the C<encoding> attribute in the XML declaration. 11412Note that if the document did not have a declaration it is generated (with 11413an XML version of 1.0) 11414 11415=item xml_version 11416 11417This method returns the XML version, as defined by the C<version> attribute in 11418the XML declaration (ie it is C<undef> if the attribute is not defined) 11419 11420=item set_xml_version 11421 11422This method sets the value of the C<version> attribute in the XML declaration. 11423If the declaration did not exist it is created. 11424 11425=item standalone 11426 11427This method returns the value of the C<standalone> declaration for the document 11428 11429=item set_standalone 11430 11431This method sets the value of the C<standalone> attribute in the XML 11432declaration. Note that if the document did not have a declaration it is 11433generated (with an XML version of 1.0) 11434 11435=item set_output_encoding 11436 11437Set the C<encoding> "attribute" in the XML declaration 11438 11439=item set_doctype ($name, $system, $public, $internal) 11440 11441Set the doctype of the element. If an argument is C<undef> (or not present) 11442then its former value is retained, if a false ('' or 0) value is passed then 11443the former value is deleted; 11444 11445=item entity_list 11446 11447Return the entity list of a twig 11448 11449=item entity_names 11450 11451Return the list of all defined entities 11452 11453=item entity ($entity_name) 11454 11455Return the entity 11456 11457=item notation_list 11458 11459Return the notation list of a twig 11460 11461=item notation_names 11462 11463Return the list of all defined notations 11464 11465=item notation ($notation_name) 11466 11467Return the notation 11468 11469=item change_gi ($old_gi, $new_gi) 11470 11471Performs a (very fast) global change. All elements C<$old_gi> are now 11472C<$new_gi>. This is a bit dangerous though and should be avoided if 11473< possible, as the new tag might be ignored in subsequent processing. 11474 11475See C<L<BUGS> > 11476 11477=item flush ($optional_filehandle, %options) 11478 11479Flushes a twig up to (and including) the current element, then deletes 11480all unnecessary elements from the tree that's kept in memory. 11481C<flush> keeps track of which elements need to be open/closed, so if you 11482flush from handlers you don't have to worry about anything. Just keep 11483flushing the twig every time you're done with a sub-tree and it will 11484come out well-formed. After the whole parsing don't forget toC<flush> 11485one more time to print the end of the document. 11486The doctype and entity declarations are also printed. 11487 11488flush take an optional filehandle as an argument. 11489 11490If you use C<flush> at any point during parsing, the document will be flushed 11491one last time at the end of the parsing, to the proper filehandle. 11492 11493options: use the C<update_DTD> option if you have updated the (internal) DTD 11494and/or the entity list and you want the updated DTD to be output 11495 11496The C<pretty_print> option sets the pretty printing of the document. 11497 11498 Example: $t->flush( Update_DTD => 1); 11499 $t->flush( $filehandle, pretty_print => 'indented'); 11500 $t->flush( \*FILE); 11501 11502 11503=item flush_up_to ($elt, $optional_filehandle, %options) 11504 11505Flushes up to the C<$elt> element. This allows you to keep part of the 11506tree in memory when you C<flush>. 11507 11508options: see flush. 11509 11510=item purge 11511 11512Does the same as a C<flush> except it does not print the twig. It just deletes 11513all elements that have been completely parsed so far. 11514 11515=item purge_up_to ($elt) 11516 11517Purges up to the C<$elt> element. This allows you to keep part of the tree in 11518memory when you C<purge>. 11519 11520=item print ($optional_filehandle, %options) 11521 11522Prints the whole document associated with the twig. To be used only AFTER the 11523parse. 11524 11525options: see C<flush>. 11526 11527=item print_to_file ($filename, %options) 11528 11529Prints the whole document associated with the twig to file C<$filename>. 11530To be used only AFTER the parse. 11531 11532options: see C<flush>. 11533 11534=item safe_print_to_file ($filename, %options) 11535 11536Prints the whole document associated with the twig to file C<$filename>. 11537This variant, which probably only works on *nix prints to a temp file, 11538then move the temp file to overwrite the original file. 11539 11540This is a bit safer when 2 processes an potentiallywrite the same file: 11541only the last one will succeed, but the file won't be corruted. I often 11542use this for cron jobs, so testing the code doesn't interfere with the 11543cron job running at the same time. 11544 11545options: see C<flush>. 11546 11547=item sprint 11548 11549Return the text of the whole document associated with the twig. To be used only 11550AFTER the parse. 11551 11552options: see C<flush>. 11553 11554=item trim 11555 11556Trim the document: gets rid of initial and trailing spaces, and replaces multiple spaces 11557by a single one. 11558 11559=item toSAX1 ($handler) 11560 11561Send SAX events for the twig to the SAX1 handler C<$handler> 11562 11563=item toSAX2 ($handler) 11564 11565Send SAX events for the twig to the SAX2 handler C<$handler> 11566 11567=item flush_toSAX1 ($handler) 11568 11569Same as flush, except that SAX events are sent to the SAX1 handler 11570C<$handler> instead of the twig being printed 11571 11572=item flush_toSAX2 ($handler) 11573 11574Same as flush, except that SAX events are sent to the SAX2 handler 11575C<$handler> instead of the twig being printed 11576 11577=item ignore 11578 11579This method should be called during parsing, usually in C<start_tag_handlers>. 11580It causes the element to be skipped during the parsing: the twig is not built 11581for this element, it will not be accessible during parsing or after it. The 11582element will not take up any memory and parsing will be faster. 11583 11584Note that this method can also be called on an element. If the element is a 11585parent of the current element then this element will be ignored (the twig will 11586not be built any more for it and what has already been built will be deleted). 11587 11588=item set_pretty_print ($style) 11589 11590Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 11591'C<nice>', 'C<indented>', C<indented_c>, 'C<wrapped>', 'C<record>' and 11592'C<record_c>' 11593 11594B<WARNING:> the pretty print style is a B<GLOBAL> variable, so once set it's 11595applied to B<ALL> C<print>'s (and C<sprint>'s). Same goes if you use XML::Twig 11596with C<mod_perl> . This should not be a problem as the XML that's generated 11597is valid anyway, and XML processors (as well as HTML processors, including 11598browsers) should not care. Let me know if this is a big problem, but at the 11599moment the performance/cleanliness trade-off clearly favors the global 11600approach. 11601 11602=item set_empty_tag_style ($style) 11603 11604Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). As 11605with C<L<set_pretty_print>> this sets a global flag. 11606 11607C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 11608'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 11609'C<< <tag></tag> >>' 11610 11611=item set_remove_cdata ($flag) 11612 11613set (or unset) the flag that forces the twig to output CDATA sections as 11614regular (escaped) PCDATA 11615 11616=item print_prolog ($optional_filehandle, %options) 11617 11618Prints the prolog (XML declaration + DTD + entity declarations) of a document. 11619 11620options: see C<L<flush>>. 11621 11622=item prolog ($optional_filehandle, %options) 11623 11624Return the prolog (XML declaration + DTD + entity declarations) of a document. 11625 11626options: see C<L<flush>>. 11627 11628=item finish 11629 11630Call Expat C<finish> method. 11631Unsets all handlers (including internal ones that set context), but expat 11632continues parsing to the end of the document or until it finds an error. 11633It should finish up a lot faster than with the handlers set. 11634 11635=item finish_print 11636 11637Stops twig processing, flush the twig and proceed to finish printing the 11638document as fast as possible. Use this method when modifying a document and 11639the modification is done. 11640 11641=item finish_now 11642 11643Stops twig processing, does not finish parsing the document (which could 11644actually be not well-formed after the point where C<finish_now> is called). 11645Execution resumes after the C<Lparse>> or C<L<parsefile>> call. The content 11646of the twig is what has been parsed so far (all open elements at the time 11647C<finish_now> is called are considered closed). 11648 11649=item set_expand_external_entities 11650 11651Same as using the C<L<expand_external_ents>> option when creating the twig 11652 11653=item set_input_filter 11654 11655Same as using the C<L<input_filter>> option when creating the twig 11656 11657=item set_keep_atts_order 11658 11659Same as using the C<L<keep_atts_order>> option when creating the twig 11660 11661=item set_keep_encoding 11662 11663Same as using the C<L<keep_encoding>> option when creating the twig 11664 11665=item escape_gt 11666 11667usually XML::Twig does not escape > in its output. Using this option 11668makes it replace > by > 11669 11670=item do_not_escape_gt 11671 11672reverts XML::Twig behavior to its default of not escaping > in its output. 11673 11674=item set_output_filter 11675 11676Same as using the C<L<output_filter>> option when creating the twig 11677 11678=item set_output_text_filter 11679 11680Same as using the C<L<output_text_filter>> option when creating the twig 11681 11682=item add_stylesheet ($type, @options) 11683 11684Adds an external stylesheet to an XML document. 11685 11686Supported types and options: 11687 11688=over 4 11689 11690=item xsl 11691 11692option: the url of the stylesheet 11693 11694Example: 11695 11696 $t->add_stylesheet( xsl => "xsl_style.xsl"); 11697 11698will generate the following PI at the beginning of the document: 11699 11700 <?xml-stylesheet type="text/xsl" href="xsl_style.xsl"?> 11701 11702=item css 11703 11704option: the url of the stylesheet 11705 11706=item active_twig 11707 11708a class method that returns the last processed twig, so you don't necessarily 11709need the object to call methods on it. 11710 11711=back 11712 11713=item Methods inherited from XML::Parser::Expat 11714 11715A twig inherits all the relevant methods from XML::Parser::Expat. These 11716methods can only be used during the parsing phase (they will generate 11717a fatal error otherwise). 11718 11719Inherited methods are: 11720 11721=over 4 11722 11723=item depth 11724 11725Returns the size of the context list. 11726 11727=item in_element 11728 11729Returns true if NAME is equal to the name of the innermost cur‐ 11730rently opened element. If namespace processing is being used and 11731you want to check against a name that may be in a namespace, then 11732use the generate_ns_name method to create the NAME argument. 11733 11734=item within_element 11735 11736Returns the number of times the given name appears in the context 11737list. If namespace processing is being used and you want to check 11738against a name that may be in a namespace, then use the gener‐ 11739ate_ns_name method to create the NAME argument. 11740 11741=item context 11742 11743Returns a list of element names that represent open elements, with 11744the last one being the innermost. Inside start and end tag han‐ 11745dlers, this will be the tag of the parent element. 11746 11747=item current_line 11748 11749Returns the line number of the current position of the parse. 11750 11751=item current_column 11752 11753Returns the column number of the current position of the parse. 11754 11755=item current_byte 11756 11757Returns the current position of the parse. 11758 11759=item position_in_context 11760 11761Returns a string that shows the current parse position. LINES 11762should be an integer >= 0 that represents the number of lines on 11763either side of the current parse line to place into the returned 11764string. 11765 11766=item base ([NEWBASE]) 11767 11768Returns the current value of the base for resolving relative URIs. 11769If NEWBASE is supplied, changes the base to that value. 11770 11771=item current_element 11772 11773Returns the name of the innermost currently opened element. Inside 11774start or end handlers, returns the parent of the element associated 11775with those tags. 11776 11777=item element_index 11778 11779Returns an integer that is the depth-first visit order of the cur‐ 11780rent element. This will be zero outside of the root element. For 11781example, this will return 1 when called from the start handler for 11782the root element start tag. 11783 11784=item recognized_string 11785 11786Returns the string from the document that was recognized in order 11787to call the current handler. For instance, when called from a start 11788handler, it will give us the start-tag string. The string is 11789encoded in UTF-8. This method doesn't return a meaningful string 11790inside declaration handlers. 11791 11792=item original_string 11793 11794Returns the verbatim string from the document that was recognized 11795in order to call the current handler. The string is in the original 11796document encoding. This method doesn't return a meaningful string 11797inside declaration handlers. 11798 11799=item xpcroak 11800 11801Concatenate onto the given message the current line number within 11802the XML document plus the message implied by ErrorContext. Then 11803croak with the formed message. 11804 11805=item xpcarp 11806 11807Concatenate onto the given message the current line number within 11808the XML document plus the message implied by ErrorContext. Then 11809carp with the formed message. 11810 11811=item xml_escape(TEXT [, CHAR [, CHAR ...]]) 11812 11813Returns TEXT with markup characters turned into character entities. 11814Any additional characters provided as arguments are also turned 11815into character references where found in TEXT. 11816 11817(this method is broken on some versions of expat/XML::Parser) 11818 11819=back 11820 11821=item path ( $optional_tag) 11822 11823Return the element context in a form similar to XPath's short 11824form: 'C</root/tag1/../tag>' 11825 11826=item get_xpath ( $optional_array_ref, $xpath, $optional_offset) 11827 11828Performs a C<get_xpath> on the document root (see <Elt|"Elt">) 11829 11830If the C<$optional_array_ref> argument is used the array must contain 11831elements. The C<$xpath> expression is applied to each element in turn 11832and the result is union of all results. This way a first query can be 11833refined in further steps. 11834 11835 11836=item find_nodes ( $optional_array_ref, $xpath, $optional_offset) 11837 11838same as C<get_xpath> 11839 11840=item findnodes ( $optional_array_ref, $xpath, $optional_offset) 11841 11842same as C<get_xpath> (similar to the XML::LibXML method) 11843 11844=item findvalue ( $optional_array_ref, $xpath, $optional_offset) 11845 11846Return the C<join> of all texts of the results of applying C<L<get_xpath>> 11847to the node (similar to the XML::LibXML method) 11848 11849=item findvalues ( $optional_array_ref, $xpath, $optional_offset) 11850 11851Return an array of all texts of the results of applying C<L<get_xpath>> 11852to the node 11853 11854=item subs_text ($regexp, $replace) 11855 11856subs_text does text substitution on the whole document, similar to perl's 11857C< s///> operator. 11858 11859=item dispose 11860 11861Useful only if you don't have C<Scalar::Util> or C<WeakRef> installed. 11862 11863Reclaims properly the memory used by an XML::Twig object. As the object has 11864circular references it never goes out of scope, so if you want to parse lots 11865of XML documents then the memory leak becomes a problem. Use 11866C<< $twig->dispose >> to clear this problem. 11867 11868=item att_accessors (list_of_attribute_names) 11869 11870A convenience method that creates l-valued accessors for attributes. 11871So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11872that can be called on elements: 11873 11874 $elt->foo; # equivalent to $elt->{'att'}->{'foo'}; 11875 $elt->foo( 'bar'); # equivalent to $elt->set_att( foo => 'bar'); 11876 11877The methods are l-valued only under those perl's that support this 11878feature (5.6 and above) 11879 11880=item create_accessors (list_of_attribute_names) 11881 11882Same as att_accessors 11883 11884=item elt_accessors (list_of_attribute_names) 11885 11886A convenience method that creates accessors for elements. 11887So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11888that can be called on elements: 11889 11890 $elt->foo; # equivalent to $elt->first_child( 'foo'); 11891 11892=item field_accessors (list_of_attribute_names) 11893 11894A convenience method that creates accessors for element values (C<field>). 11895So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11896that can be called on elements: 11897 11898 $elt->foo; # equivalent to $elt->field( 'foo'); 11899 11900=item set_do_not_escape_amp_in_atts 11901 11902An evil method, that I only document because Test::Pod::Coverage complaints otherwise, 11903but really, you don't want to know about it. 11904 11905=back 11906 11907=head2 XML::Twig::Elt 11908 11909=over 4 11910 11911=item new ($optional_tag, $optional_atts, @optional_content) 11912 11913The C<tag> is optional (but then you can't have a content ), the C<$optional_atts> 11914argument is a reference to a hash of attributes, the content can be just a 11915string or a list of strings and element. A content of 'C<#EMPTY>' creates an empty 11916element; 11917 11918 Examples: my $elt= XML::Twig::Elt->new(); 11919 my $elt= XML::Twig::Elt->new( para => { align => 'center' }); 11920 my $elt= XML::Twig::Elt->new( para => { align => 'center' }, 'foo'); 11921 my $elt= XML::Twig::Elt->new( br => '#EMPTY'); 11922 my $elt= XML::Twig::Elt->new( 'para'); 11923 my $elt= XML::Twig::Elt->new( para => 'this is a para'); 11924 my $elt= XML::Twig::Elt->new( para => $elt3, 'another para'); 11925 11926The strings are not parsed, the element is not attached to any twig. 11927 11928B<WARNING>: if you rely on ID's then you will have to set the id yourself. At 11929this point the element does not belong to a twig yet, so the ID attribute 11930is not known so it won't be stored in the ID list. 11931 11932Note that C<#COMMENT>, C<#PCDATA> or C<#CDATA> are valid tag names, that will 11933create text elements. 11934 11935To create an element C<foo> containing a CDATA section: 11936 11937 my $foo= XML::Twig::Elt->new( '#CDATA' => "content of the CDATA section") 11938 ->wrap_in( 'foo'); 11939 11940An attribute of '#CDATA', will create the content of the element as CDATA: 11941 11942 my $elt= XML::Twig::Elt->new( 'p' => { '#CDATA' => 1}, 'foo < bar'); 11943 11944creates an element 11945 11946 <p><![CDATA[foo < bar]]></> 11947 11948=item parse ($string, %args) 11949 11950Creates an element from an XML string. The string is actually 11951parsed as a new twig, then the root of that twig is returned. 11952The arguments in C<%args> are passed to the twig. 11953As always if the parse fails the parser will die, so use an 11954eval if you want to trap syntax errors. 11955 11956As obviously the element does not exist beforehand this method has to be 11957called on the class: 11958 11959 my $elt= parse XML::Twig::Elt( "<a> string to parse, with <sub/> 11960 <elements>, actually tons of </elements> 11961 h</a>"); 11962 11963=item set_inner_xml ($string) 11964 11965Sets the content of the element to be the tree created from the string 11966 11967=item set_inner_html ($string) 11968 11969Sets the content of the element, after parsing the string with an HTML 11970parser (HTML::Parser) 11971 11972=item set_outer_xml ($string) 11973 11974Replaces the element with the tree created from the string 11975 11976=item print ($optional_filehandle, $optional_pretty_print_style) 11977 11978Prints an entire element, including the tags, optionally to a 11979C<$optional_filehandle>, optionally with a C<$pretty_print_style>. 11980 11981The print outputs XML data so base entities are escaped. 11982 11983=item print_to_file ($filename, %options) 11984 11985Prints the element to file C<$filename>. 11986 11987options: see C<flush>. 11988=item sprint ($elt, $optional_no_enclosing_tag) 11989 11990Return the xml string for an entire element, including the tags. 11991If the optional second argument is true then only the string inside the 11992element is returned (the start and end tag for $elt are not). 11993The text is XML-escaped: base entities (& and < in text, & < and " in 11994attribute values) are turned into entities. 11995 11996=item gi 11997 11998Return the gi of the element (the gi is the C<generic identifier> the tag 11999name in SGML parlance). 12000 12001C<tag> and C<name> are synonyms of C<gi>. 12002 12003=item tag 12004 12005Same as C<L<gi>> 12006 12007=item name 12008 12009Same as C<L<tag>> 12010 12011=item set_gi ($tag) 12012 12013Set the gi (tag) of an element 12014 12015=item set_tag ($tag) 12016 12017Set the tag (=C<L<tag>>) of an element 12018 12019=item set_name ($name) 12020 12021Set the name (=C<L<tag>>) of an element 12022 12023=item root 12024 12025Return the root of the twig in which the element is contained. 12026 12027=item twig 12028 12029Return the twig containing the element. 12030 12031=item parent ($optional_condition) 12032 12033Return the parent of the element, or the first ancestor matching the 12034C<$optional_condition> 12035 12036=item first_child ($optional_condition) 12037 12038Return the first child of the element, or the first child matching the 12039C<$optional_condition> 12040 12041=item has_child ($optional_condition) 12042 12043Return the first child of the element, or the first child matching the 12044C<$optional_condition> (same as L<first_child>) 12045 12046=item has_children ($optional_condition) 12047 12048Return the first child of the element, or the first child matching the 12049C<$optional_condition> (same as L<first_child>) 12050 12051 12052=item first_child_text ($optional_condition) 12053 12054Return the text of the first child of the element, or the first child 12055 matching the C<$optional_condition> 12056If there is no first_child then returns ''. This avoids getting the 12057child, checking for its existence then getting the text for trivial cases. 12058 12059Similar methods are available for the other navigation methods: 12060 12061=over 4 12062 12063=item last_child_text 12064 12065=item prev_sibling_text 12066 12067=item next_sibling_text 12068 12069=item prev_elt_text 12070 12071=item next_elt_text 12072 12073=item child_text 12074 12075=item parent_text 12076 12077=back 12078 12079All this methods also exist in "trimmed" variant: 12080 12081=over 4 12082 12083=item first_child_trimmed_text 12084 12085=item last_child_trimmed_text 12086 12087=item prev_sibling_trimmed_text 12088 12089=item next_sibling_trimmed_text 12090 12091=item prev_elt_trimmed_text 12092 12093=item next_elt_trimmed_text 12094 12095=item child_trimmed_text 12096 12097=item parent_trimmed_text 12098 12099=back 12100 12101=item field ($condition) 12102 12103Same method as C<first_child_text> with a different name 12104 12105=item fields ($condition_list) 12106 12107Return the list of field (text of first child matching the conditions), 12108missing fields are returned as the empty string. 12109 12110Same method as C<first_child_text> with a different name 12111 12112=item trimmed_field ($optional_condition) 12113 12114Same method as C<first_child_trimmed_text> with a different name 12115 12116=item set_field ($condition, $optional_atts, @list_of_elt_and_strings) 12117 12118Set the content of the first child of the element that matches 12119C<$condition>, the rest of the arguments is the same as for C<L<set_content>> 12120 12121If no child matches C<$condition> _and_ if C<$condition> is a valid 12122XML element name, then a new element by that name is created and 12123inserted as the last child. 12124 12125=item first_child_matches ($optional_condition) 12126 12127Return the element if the first child of the element (if it exists) passes 12128the C<$optional_condition> C<undef> otherwise 12129 12130 if( $elt->first_child_matches( 'title')) ... 12131 12132is equivalent to 12133 12134 if( $elt->{first_child} && $elt->{first_child}->passes( 'title')) 12135 12136C<first_child_is> is an other name for this method 12137 12138Similar methods are available for the other navigation methods: 12139 12140=over 4 12141 12142=item last_child_matches 12143 12144=item prev_sibling_matches 12145 12146=item next_sibling_matches 12147 12148=item prev_elt_matches 12149 12150=item next_elt_matches 12151 12152=item child_matches 12153 12154=item parent_matches 12155 12156=back 12157 12158=item is_first_child ($optional_condition) 12159 12160returns true (the element) if the element is the first child of its parent 12161(optionally that satisfies the C<$optional_condition>) 12162 12163=item is_last_child ($optional_condition) 12164 12165returns true (the element) if the element is the last child of its parent 12166(optionally that satisfies the C<$optional_condition>) 12167 12168=item prev_sibling ($optional_condition) 12169 12170Return the previous sibling of the element, or the previous sibling matching 12171C<$optional_condition> 12172 12173=item next_sibling ($optional_condition) 12174 12175Return the next sibling of the element, or the first one matching 12176C<$optional_condition>. 12177 12178=item next_elt ($optional_elt, $optional_condition) 12179 12180Return the next elt (optionally matching C<$optional_condition>) of the element. This 12181is defined as the next element which opens after the current element opens. 12182Which usually means the first child of the element. 12183Counter-intuitive as it might look this allows you to loop through the 12184whole document by starting from the root. 12185 12186The C<$optional_elt> is the root of a subtree. When the C<next_elt> is out of the 12187subtree then the method returns undef. You can then walk a sub-tree with: 12188 12189 my $elt= $subtree_root; 12190 while( $elt= $elt->next_elt( $subtree_root)) 12191 { # insert processing code here 12192 } 12193 12194=item prev_elt ($optional_condition) 12195 12196Return the previous elt (optionally matching C<$optional_condition>) of the 12197element. This is the first element which opens before the current one. 12198It is usually either the last descendant of the previous sibling or 12199simply the parent 12200 12201=item next_n_elt ($offset, $optional_condition) 12202 12203Return the C<$offset>-th element that matches the C<$optional_condition> 12204 12205=item following_elt 12206 12207Return the following element (as per the XPath following axis) 12208 12209=item preceding_elt 12210 12211Return the preceding element (as per the XPath preceding axis) 12212 12213=item following_elts 12214 12215Return the list of following elements (as per the XPath following axis) 12216 12217=item preceding_elts 12218 12219Return the list of preceding elements (as per the XPath preceding axis) 12220 12221=item children ($optional_condition) 12222 12223Return the list of children (optionally which matches C<$optional_condition>) of 12224the element. The list is in document order. 12225 12226=item children_count ($optional_condition) 12227 12228Return the number of children of the element (optionally which matches 12229C<$optional_condition>) 12230 12231=item children_text ($optional_condition) 12232 12233In array context, returns an array containing the text of children of the 12234element (optionally which matches C<$optional_condition>) 12235 12236In scalar context, returns the concatenation of the text of children of 12237the element 12238 12239=item children_trimmed_text ($optional_condition) 12240 12241In array context, returns an array containing the trimmed text of children 12242of the element (optionally which matches C<$optional_condition>) 12243 12244In scalar context, returns the concatenation of the trimmed text of children of 12245the element 12246 12247 12248=item children_copy ($optional_condition) 12249 12250Return a list of elements that are copies of the children of the element, 12251optionally which matches C<$optional_condition> 12252 12253=item descendants ($optional_condition) 12254 12255Return the list of all descendants (optionally which matches 12256C<$optional_condition>) of the element. This is the equivalent of the 12257C<getElementsByTagName> of the DOM (by the way, if you are really a DOM 12258addict, you can use C<getElementsByTagName> instead) 12259 12260=item getElementsByTagName ($optional_condition) 12261 12262Same as C<L<descendants>> 12263 12264=item find_by_tag_name ($optional_condition) 12265 12266Same as C<L<descendants>> 12267 12268=item descendants_or_self ($optional_condition) 12269 12270Same as C<L<descendants>> except that the element itself is included in the list 12271if it matches the C<$optional_condition> 12272 12273=item first_descendant ($optional_condition) 12274 12275Return the first descendant of the element that matches the condition 12276 12277=item last_descendant ($optional_condition) 12278 12279Return the last descendant of the element that matches the condition 12280 12281=item ancestors ($optional_condition) 12282 12283Return the list of ancestors (optionally matching C<$optional_condition>) of the 12284element. The list is ordered from the innermost ancestor to the outermost one 12285 12286NOTE: the element itself is not part of the list, in order to include it 12287you will have to use ancestors_or_self 12288 12289=item ancestors_or_self ($optional_condition) 12290 12291Return the list of ancestors (optionally matching C<$optional_condition>) of the 12292element, including the element (if it matches the condition>). 12293The list is ordered from the innermost ancestor to the outermost one 12294 12295=item passes ($condition) 12296 12297Return the element if it passes the C<$condition> 12298 12299=item att ($att) 12300 12301Return the value of attribute C<$att> or C<undef> 12302 12303=item latt ($att) 12304 12305Return the value of attribute C<$att> or C<undef> 12306 12307this method is an lvalue, so you can do C<< $elt->latt( 'foo')= 'bar' >> or C<< $elt->latt( 'foo')++; >> 12308 12309=item set_att ($att, $att_value) 12310 12311Set the attribute of the element to the given value 12312 12313You can actually set several attributes this way: 12314 12315 $elt->set_att( att1 => "val1", att2 => "val2"); 12316 12317=item del_att ($att) 12318 12319Delete the attribute for the element 12320 12321You can actually delete several attributes at once: 12322 12323 $elt->del_att( 'att1', 'att2', 'att3'); 12324 12325=item att_exists ($att) 12326 12327Returns true if the attribute C<$att> exists for the element, false 12328otherwise 12329 12330=item cut 12331 12332Cut the element from the tree. The element still exists, it can be copied 12333or pasted somewhere else, it is just not attached to the tree anymore. 12334 12335Note that the "old" links to the parent, previous and next siblings can 12336still be accessed using the former_* methods 12337 12338=item former_next_sibling 12339 12340Returns the former next sibling of a cut node (or undef if the node has not been cut) 12341 12342This makes it easier to write loops where you cut elements: 12343 12344 my $child= $parent->first_child( 'achild'); 12345 while( $child->{'att'}->{'cut'}) 12346 { $child->cut; $child= ($child->{former} && $child->{former}->{next_sibling}); } 12347 12348=item former_prev_sibling 12349 12350Returns the former previous sibling of a cut node (or undef if the node has not been cut) 12351 12352=item former_parent 12353 12354Returns the former parent of a cut node (or undef if the node has not been cut) 12355 12356=item cut_children ($optional_condition) 12357 12358Cut all the children of the element (or all of those which satisfy the 12359C<$optional_condition>). 12360 12361Return the list of children 12362 12363=item cut_descendants ($optional_condition) 12364 12365Cut all the descendants of the element (or all of those which satisfy the 12366C<$optional_condition>). 12367 12368Return the list of descendants 12369 12370=item copy ($elt) 12371 12372Return a copy of the element. The copy is a "deep" copy: all sub-elements of 12373the element are duplicated. 12374 12375=item paste ($optional_position, $ref) 12376 12377Paste a (previously C<cut> or newly generated) element. Die if the element 12378already belongs to a tree. 12379 12380Note that the calling element is pasted: 12381 12382 $child->paste( first_child => $existing_parent); 12383 $new_sibling->paste( after => $this_sibling_is_already_in_the_tree); 12384 12385or 12386 12387 my $new_elt= XML::Twig::Elt->new( tag => $content); 12388 $new_elt->paste( $position => $existing_elt); 12389 12390Example: 12391 12392 my $t= XML::Twig->new->parse( 'doc.xml') 12393 my $toc= $t->root->new( 'toc'); 12394 $toc->paste( $t->root); # $toc is pasted as first child of the root 12395 foreach my $title ($t->findnodes( '/doc/section/title')) 12396 { my $title_toc= $title->copy; 12397 # paste $title_toc as the last child of toc 12398 $title_toc->paste( last_child => $toc) 12399 } 12400 12401Position options: 12402 12403=over 4 12404 12405=item first_child (default) 12406 12407The element is pasted as the first child of C<$ref> 12408 12409=item last_child 12410 12411The element is pasted as the last child of C<$ref> 12412 12413=item before 12414 12415The element is pasted before C<$ref>, as its previous sibling. 12416 12417=item after 12418 12419The element is pasted after C<$ref>, as its next sibling. 12420 12421=item within 12422 12423In this case an extra argument, C<$offset>, should be supplied. The element 12424will be pasted in the reference element (or in its first text child) at the 12425given offset. To achieve this the reference element will be split at the 12426offset. 12427 12428=back 12429 12430Note that you can call directly the underlying method: 12431 12432=over 4 12433 12434=item paste_before 12435 12436=item paste_after 12437 12438=item paste_first_child 12439 12440=item paste_last_child 12441 12442=item paste_within 12443 12444=back 12445 12446=item move ($optional_position, $ref) 12447 12448Move an element in the tree. 12449This is just a C<cut> then a C<paste>. The syntax is the same as C<paste>. 12450 12451=item replace ($ref) 12452 12453Replaces an element in the tree. Sometimes it is just not possible toC<cut> 12454an element then C<paste> another in its place, so C<replace> comes in handy. 12455The calling element replaces C<$ref>. 12456 12457=item replace_with (@elts) 12458 12459Replaces the calling element with one or more elements 12460 12461=item delete 12462 12463Cut the element and frees the memory. 12464 12465=item prefix ($text, $optional_option) 12466 12467Add a prefix to an element. If the element is a C<PCDATA> element the text 12468is added to the pcdata, if the elements first child is a C<PCDATA> then the 12469text is added to it's pcdata, otherwise a new C<PCDATA> element is created 12470and pasted as the first child of the element. 12471 12472If the option is C<asis> then the prefix is added asis: it is created in 12473a separate C<PCDATA> element with an C<asis> property. You can then write: 12474 12475 $elt1->prefix( '<b>', 'asis'); 12476 12477to create a C<< <b> >> in the output of C<print>. 12478 12479=item suffix ($text, $optional_option) 12480 12481Add a suffix to an element. If the element is a C<PCDATA> element the text 12482is added to the pcdata, if the elements last child is a C<PCDATA> then the 12483text is added to it's pcdata, otherwise a new PCDATA element is created 12484and pasted as the last child of the element. 12485 12486If the option is C<asis> then the suffix is added asis: it is created in 12487a separate C<PCDATA> element with an C<asis> property. You can then write: 12488 12489 $elt2->suffix( '</b>', 'asis'); 12490 12491=item trim 12492 12493Trim the element in-place: spaces at the beginning and at the end of the element 12494are discarded and multiple spaces within the element (or its descendants) are 12495replaced by a single space. 12496 12497Note that in some cases you can still end up with multiple spaces, if they are 12498split between several elements: 12499 12500 <doc> text <b> hah! </b> yep</doc> 12501 12502gets trimmed to 12503 12504 <doc>text <b> hah! </b> yep</doc> 12505 12506This is somewhere in between a bug and a feature. 12507 12508=item normalize 12509 12510merge together all consecutive pcdata elements in the element (if for example 12511you have turned some elements into pcdata using C<L<erase>>, this will give you 12512a "clean" element in which there all text fragments are as long as possible). 12513 12514 12515=item simplify (%options) 12516 12517Return a data structure suspiciously similar to XML::Simple's. Options are 12518identical to XMLin options, see XML::Simple doc for more details (or use 12519DATA::dumper or YAML to dump the data structure) 12520 12521B<Note>: there is no magic here, if you write 12522C<< $twig->parsefile( $file )->simplify(); >> then it will load the entire 12523document in memory. I am afraid you will have to put some work into it to 12524get just the bits you want and discard the rest. Look at the synopsis or 12525the XML::Twig 101 section at the top of the docs for more information. 12526 12527=over 4 12528 12529=item content_key 12530 12531=item forcearray 12532 12533=item keyattr 12534 12535=item noattr 12536 12537=item normalize_space 12538 12539aka normalise_space 12540 12541=item variables (%var_hash) 12542 12543%var_hash is a hash { name => value } 12544 12545This option allows variables in the XML to be expanded when the file is read. (there is no facility for putting the variable names back if you regenerate XML using XMLout). 12546 12547A 'variable' is any text of the form ${name} (or $name) which occurs in an attribute value or in the text content of an element. If 'name' matches a key in the supplied hashref, ${name} will be replaced with the corresponding value from the hashref. If no matching key is found, the variable will not be replaced. 12548 12549=item var_att ($attribute_name) 12550 12551This option gives the name of an attribute that will be used to create 12552variables in the XML: 12553 12554 <dirs> 12555 <dir name="prefix">/usr/local</dir> 12556 <dir name="exec_prefix">$prefix/bin</dir> 12557 </dirs> 12558 12559use C<< var => 'name' >> to get $prefix replaced by /usr/local in the 12560generated data structure 12561 12562By default variables are captured by the following regexp: /$(\w+)/ 12563 12564=item var_regexp (regexp) 12565 12566This option changes the regexp used to capture variables. The variable 12567name should be in $1 12568 12569=item group_tags { grouping tag => grouped tag, grouping tag 2 => grouped tag 2...} 12570 12571Option used to simplify the structure: elements listed will not be used. 12572Their children will be, they will be considered children of the element 12573parent. 12574 12575If the element is: 12576 12577 <config host="laptop.xmltwig.org"> 12578 <server>localhost</server> 12579 <dirs> 12580 <dir name="base">/home/mrodrigu/standards</dir> 12581 <dir name="tools">$base/tools</dir> 12582 </dirs> 12583 <templates> 12584 <template name="std_def">std_def.templ</template> 12585 <template name="dummy">dummy</template> 12586 </templates> 12587 </config> 12588 12589Then calling simplify with C<< group_tags => { dirs => 'dir', 12590templates => 'template'} >> 12591makes the data structure be exactly as if the start and end tags for C<dirs> and 12592C<templates> were not there. 12593 12594A YAML dump of the structure 12595 12596 base: '/home/mrodrigu/standards' 12597 host: laptop.xmltwig.org 12598 server: localhost 12599 template: 12600 - std_def.templ 12601 - dummy.templ 12602 tools: '$base/tools' 12603 12604 12605=back 12606 12607=item split_at ($offset) 12608 12609Split a text (C<PCDATA> or C<CDATA>) element in 2 at C<$offset>, the original 12610element now holds the first part of the string and a new element holds the 12611right part. The new element is returned 12612 12613If the element is not a text element then the first text child of the element 12614is split 12615 12616=item split ( $optional_regexp, $tag1, $atts1, $tag2, $atts2...) 12617 12618Split the text descendants of an element in place, the text is split using 12619the C<$regexp>, if the regexp includes () then the matched separators will be 12620wrapped in elements. C<$1> is wrapped in $tag1, with attributes C<$atts1> if 12621C<$atts1> is given (as a hashref), C<$2> is wrapped in $tag2... 12622 12623if $elt is C<< <p>tati tata <b>tutu tati titi</b> tata tati tata</p> >> 12624 12625 $elt->split( qr/(ta)ti/, 'foo', {type => 'toto'} ) 12626 12627will change $elt to 12628 12629 <p><foo type="toto">ta</foo> tata <b>tutu <foo type="toto">ta</foo> 12630 titi</b> tata <foo type="toto">ta</foo> tata</p> 12631 12632The regexp can be passed either as a string or as C<qr//> (perl 5.005 and 12633later), it defaults to \s+ just as the C<split> built-in (but this would be 12634quite a useless behaviour without the C<$optional_tag> parameter) 12635 12636C<$optional_tag> defaults to PCDATA or CDATA, depending on the initial element 12637type 12638 12639The list of descendants is returned (including un-touched original elements 12640and newly created ones) 12641 12642=item mark ( $regexp, $optional_tag, $optional_attribute_ref) 12643 12644This method behaves exactly as L<split>, except only the newly created 12645elements are returned 12646 12647=item wrap_children ( $regexp_string, $tag, $optional_attribute_hashref) 12648 12649Wrap the children of the element that match the regexp in an element C<$tag>. 12650If $optional_attribute_hashref is passed then the new element will 12651have these attributes. 12652 12653The $regexp_string includes tags, within pointy brackets, as in 12654C<< <title><para>+ >> and the usual Perl modifiers (+*?...). 12655Tags can be further qualified with attributes: 12656C<< <para type="warning" classif="cosmic_secret">+ >>. The values 12657for attributes should be xml-escaped: C<< <candy type="M&Ms">* >> 12658(C<E<lt>>, C<&> B<C<E<gt>>> and C<"> should be escaped). 12659 12660Note that elements might get extra C<id> attributes in the process. See L<add_id>. 12661Use L<strip_att> to remove unwanted id's. 12662 12663Here is an example: 12664 12665If the element C<$elt> has the following content: 12666 12667 <elt> 12668 <p>para 1</p> 12669 <l_l1_1>list 1 item 1 para 1</l_l1_1> 12670 <l_l1>list 1 item 1 para 2</l_l1> 12671 <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n> 12672 <l_l1_n>list 1 item 3 para 1</l_l1_n> 12673 <l_l1>list 1 item 3 para 2</l_l1> 12674 <l_l1>list 1 item 3 para 3</l_l1> 12675 <l_l1_1>list 2 item 1 para 1</l_l1_1> 12676 <l_l1>list 2 item 1 para 2</l_l1> 12677 <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n> 12678 <l_l1_n>list 2 item 3 para 1</l_l1_n> 12679 <l_l1>list 2 item 3 para 2</l_l1> 12680 <l_l1>list 2 item 3 para 3</l_l1> 12681 </elt> 12682 12683Then the code 12684 12685 $elt->wrap_children( q{<l_l1_1><l_l1>*} , li => { type => "ul1" }); 12686 $elt->wrap_children( q{<l_l1_n><l_l1>*} , li => { type => "ul" }); 12687 12688 $elt->wrap_children( q{<li type="ul1"><li type="ul">+}, "ul"); 12689 $elt->strip_att( 'id'); 12690 $elt->strip_att( 'type'); 12691 $elt->print; 12692 12693will output: 12694 12695 <elt> 12696 <p>para 1</p> 12697 <ul> 12698 <li> 12699 <l_l1_1>list 1 item 1 para 1</l_l1_1> 12700 <l_l1>list 1 item 1 para 2</l_l1> 12701 </li> 12702 <li> 12703 <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n> 12704 </li> 12705 <li> 12706 <l_l1_n>list 1 item 3 para 1</l_l1_n> 12707 <l_l1>list 1 item 3 para 2</l_l1> 12708 <l_l1>list 1 item 3 para 3</l_l1> 12709 </li> 12710 </ul> 12711 <ul> 12712 <li> 12713 <l_l1_1>list 2 item 1 para 1</l_l1_1> 12714 <l_l1>list 2 item 1 para 2</l_l1> 12715 </li> 12716 <li> 12717 <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n> 12718 </li> 12719 <li> 12720 <l_l1_n>list 2 item 3 para 1</l_l1_n> 12721 <l_l1>list 2 item 3 para 2</l_l1> 12722 <l_l1>list 2 item 3 para 3</l_l1> 12723 </li> 12724 </ul> 12725 </elt> 12726 12727=item subs_text ($regexp, $replace) 12728 12729subs_text does text substitution, similar to perl's C< s///> operator. 12730 12731C<$regexp> must be a perl regexp, created with the C<qr> operator. 12732 12733C<$replace> can include C<$1, $2>... from the C<$regexp>. It can also be 12734used to create element and entities, by using 12735C<< &elt( tag => { att => val }, text) >> (similar syntax as C<L<new>>) and 12736C<< &ent( name) >>. 12737 12738Here is a rather complex example: 12739 12740 $elt->subs_text( qr{(?<!do not )link to (http://([^\s,]*))}, 12741 'see &elt( a =>{ href => $1 }, $2)' 12742 ); 12743 12744This will replace text like I<link to http://www.xmltwig.org> by 12745I<< see <a href="www.xmltwig.org">www.xmltwig.org</a> >>, but not 12746I<do not link to...> 12747 12748Generating entities (here replacing spaces with ): 12749 12750 $elt->subs_text( qr{ }, '&ent( " ")'); 12751 12752or, using a variable: 12753 12754 my $ent=" "; 12755 $elt->subs_text( qr{ }, "&ent( '$ent')"); 12756 12757Note that the substitution is always global, as in using the C<g> modifier 12758in a perl substitution, and that it is performed on all text descendants 12759of the element. 12760 12761B<Bug>: in the C<$regexp>, you can only use C<\1>, C<\2>... if the replacement 12762expression does not include elements or attributes. eg 12763 12764 $t->subs_text( qr/((t[aiou])\2)/, '$2'); # ok, replaces toto, tata, titi, tutu by to, ta, ti, tu 12765 $t->subs_text( qr/((t[aiou])\2)/, '&elt(p => $1)' ); # NOK, does not find toto... 12766 12767=item add_id ($optional_coderef) 12768 12769Add an id to the element. 12770 12771The id is an attribute, C<id> by default, see the C<id> option for XML::Twig 12772C<new> to change it. Use an id starting with C<#> to get an id that's not 12773output by L<print>, L<flush> or L<sprint>, yet that allows you to use the 12774L<elt_id> method to get the element easily. 12775 12776If the element already has an id, no new id is generated. 12777 12778By default the method create an id of the form C<< twig_id_<nnnn> >>, 12779where C<< <nnnn> >> is a number, incremented each time the method is called 12780successfully. 12781 12782=item set_id_seed ($prefix) 12783 12784by default the id generated by C<L<add_id>> is C<< twig_id_<nnnn> >>, 12785C<set_id_seed> changes the prefix to C<$prefix> and resets the number 12786to 1 12787 12788=item strip_att ($att) 12789 12790Remove the attribute C<$att> from all descendants of the element (including 12791the element) 12792 12793Return the element 12794 12795=item change_att_name ($old_name, $new_name) 12796 12797Change the name of the attribute from C<$old_name> to C<$new_name>. If there is no 12798attribute C<$old_name> nothing happens. 12799 12800=item lc_attnames 12801 12802Lower cases the name all the attributes of the element. 12803 12804=item sort_children_on_value( %options) 12805 12806Sort the children of the element in place according to their text. 12807All children are sorted. 12808 12809Return the element, with its children sorted. 12810 12811 12812C<%options> are 12813 12814 type : numeric | alpha (default: alpha) 12815 order : normal | reverse (default: normal) 12816 12817Return the element, with its children sorted 12818 12819 12820=item sort_children_on_att ($att, %options) 12821 12822Sort the children of the element in place according to attribute C<$att>. 12823C<%options> are the same as for C<sort_children_on_value> 12824 12825Return the element. 12826 12827 12828=item sort_children_on_field ($tag, %options) 12829 12830Sort the children of the element in place, according to the field C<$tag> (the 12831text of the first child of the child with this tag). C<%options> are the same 12832as for C<sort_children_on_value>. 12833 12834Return the element, with its children sorted 12835 12836 12837=item sort_children( $get_key, %options) 12838 12839Sort the children of the element in place. The C<$get_key> argument is 12840a reference to a function that returns the sort key when passed an element. 12841 12842For example: 12843 12844 $elt->sort_children( sub { $_[0]->{'att'}->{"nb"} + $_[0]->text }, 12845 type => 'numeric', order => 'reverse' 12846 ); 12847 12848=item field_to_att ($cond, $att) 12849 12850Turn the text of the first sub-element matched by C<$cond> into the value of 12851attribute C<$att> of the element. If C<$att> is omitted then C<$cond> is used 12852as the name of the attribute, which makes sense only if C<$cond> is a valid 12853element (and attribute) name. 12854 12855The sub-element is then cut. 12856 12857=item att_to_field ($att, $tag) 12858 12859Take the value of attribute C<$att> and create a sub-element C<$tag> as first 12860child of the element. If C<$tag> is omitted then C<$att> is used as the name of 12861the sub-element. 12862 12863 12864=item get_xpath ($xpath, $optional_offset) 12865 12866Return a list of elements satisfying the C<$xpath>. C<$xpath> is an XPATH-like 12867expression. 12868 12869A subset of the XPATH abbreviated syntax is covered: 12870 12871 tag 12872 tag[1] (or any other positive number) 12873 tag[last()] 12874 tag[@att] (the attribute exists for the element) 12875 tag[@att="val"] 12876 tag[@att=~ /regexp/] 12877 tag[att1="val1" and att2="val2"] 12878 tag[att1="val1" or att2="val2"] 12879 tag[string()="toto"] (returns tag elements which text (as per the text method) 12880 is toto) 12881 tag[string()=~/regexp/] (returns tag elements which text (as per the text 12882 method) matches regexp) 12883 expressions can start with / (search starts at the document root) 12884 expressions can start with . (search starts at the current element) 12885 // can be used to get all descendants instead of just direct children 12886 * matches any tag 12887 12888So the following examples from the 12889F<XPath recommendationL<http://www.w3.org/TR/xpath.html#path-abbrev>> work: 12890 12891 para selects the para element children of the context node 12892 * selects all element children of the context node 12893 para[1] selects the first para child of the context node 12894 para[last()] selects the last para child of the context node 12895 */para selects all para grandchildren of the context node 12896 /doc/chapter[5]/section[2] selects the second section of the fifth chapter 12897 of the doc 12898 chapter//para selects the para element descendants of the chapter element 12899 children of the context node 12900 //para selects all the para descendants of the document root and thus selects 12901 all para elements in the same document as the context node 12902 //olist/item selects all the item elements in the same document as the 12903 context node that have an olist parent 12904 .//para selects the para element descendants of the context node 12905 .. selects the parent of the context node 12906 para[@type="warning"] selects all para children of the context node that have 12907 a type attribute with value warning 12908 employee[@secretary and @assistant] selects all the employee children of the 12909 context node that have both a secretary attribute and an assistant 12910 attribute 12911 12912 12913The elements will be returned in the document order. 12914 12915If C<$optional_offset> is used then only one element will be returned, the one 12916with the appropriate offset in the list, starting at 0 12917 12918Quoting and interpolating variables can be a pain when the Perl syntax and the 12919XPATH syntax collide, so use alternate quoting mechanisms like q or qq 12920(I like q{} and qq{} myself). 12921 12922Here are some more examples to get you started: 12923 12924 my $p1= "p1"; 12925 my $p2= "p2"; 12926 my @res= $t->get_xpath( qq{p[string( "$p1") or string( "$p2")]}); 12927 12928 my $a= "a1"; 12929 my @res= $t->get_xpath( qq{//*[@att="$a"]}); 12930 12931 my $val= "a1"; 12932 my $exp= qq{//p[ \@att='$val']}; # you need to use \@ or you will get a warning 12933 my @res= $t->get_xpath( $exp); 12934 12935Note that the only supported regexps delimiters are / and that you must 12936backslash all / in regexps AND in regular strings. 12937 12938XML::Twig does not provide natively full XPATH support, but you can use 12939C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the 12940XPath engine, with full coverage of the spec. 12941 12942C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the 12943XPath engine, with full coverage of the spec. 12944 12945=item find_nodes 12946 12947same asC<get_xpath> 12948 12949=item findnodes 12950 12951same as C<get_xpath> 12952 12953 12954=item text @optional_options 12955 12956Return a string consisting of all the C<PCDATA> and C<CDATA> in an element, 12957without any tags. The text is not XML-escaped: base entities such as C<&> 12958and C<< < >> are not escaped. 12959 12960The 'C<no_recurse>' option will only return the text of the element, not 12961of any included sub-elements (same as C<L<text_only>>). 12962 12963=item text_only 12964 12965Same as C<L<text>> except that the text returned doesn't include 12966the text of sub-elements. 12967 12968=item trimmed_text 12969 12970Same as C<text> except that the text is trimmed: leading and trailing spaces 12971are discarded, consecutive spaces are collapsed 12972 12973=item set_text ($string) 12974 12975Set the text for the element: if the element is a C<PCDATA>, just set its 12976text, otherwise cut all the children of the element and create a single 12977C<PCDATA> child for it, which holds the text. 12978 12979=item merge ($elt2) 12980 12981Move the content of C<$elt2> within the element 12982 12983=item insert ($tag1, [$optional_atts1], $tag2, [$optional_atts2],...) 12984 12985For each tag in the list inserts an element C<$tag> as the only child of the 12986element. The element gets the optional attributes inC<< $optional_atts<n>. >> 12987All children of the element are set as children of the new element. 12988The upper level element is returned. 12989 12990 $p->insert( table => { border=> 1}, 'tr', 'td') 12991 12992put C<$p> in a table with a visible border, a single C<tr> and a single C<td> 12993and return the C<table> element: 12994 12995 <p><table border="1"><tr><td>original content of p</td></tr></table></p> 12996 12997=item wrap_in (@tag) 12998 12999Wrap elements in C<@tag> as the successive ancestors of the element, returns the 13000new element. 13001C<< $elt->wrap_in( 'td', 'tr', 'table') >> wraps the element as a single cell in a 13002table for example. 13003 13004Optionally each tag can be followed by a hashref of attributes, that will be 13005set on the wrapping element: 13006 13007 $elt->wrap_in( p => { class => "advisory" }, div => { class => "intro", id => "div_intro" }); 13008 13009=item insert_new_elt ($opt_position, $tag, $opt_atts_hashref, @opt_content) 13010 13011Combines a C<L<new> > and a C<L<paste> >: creates a new element using 13012C<$tag>, C<$opt_atts_hashref >and C<@opt_content> which are arguments similar 13013to those for C<new>, then paste it, using C<$opt_position> or C<'first_child'>, 13014relative to C<$elt>. 13015 13016Return the newly created element 13017 13018=item erase 13019 13020Erase the element: the element is deleted and all of its children are 13021pasted in its place. 13022 13023=item set_content ( $optional_atts, @list_of_elt_and_strings) 13024 ( $optional_atts, '#EMPTY') 13025 13026Set the content for the element, from a list of strings and 13027elements. Cuts all the element children, then pastes the list 13028elements as the children. This method will create a C<PCDATA> element 13029for any strings in the list. 13030 13031The C<$optional_atts> argument is the ref of a hash of attributes. If this 13032argument is used then the previous attributes are deleted, otherwise they 13033are left untouched. 13034 13035B<WARNING>: if you rely on ID's then you will have to set the id yourself. At 13036this point the element does not belong to a twig yet, so the ID attribute 13037is not known so it won't be stored in the ID list. 13038 13039A content of 'C<#EMPTY>' creates an empty element; 13040 13041=item namespace ($optional_prefix) 13042 13043Return the URI of the namespace that C<$optional_prefix> or the element name 13044belongs to. If the name doesn't belong to any namespace, C<undef> is returned. 13045 13046=item local_name 13047 13048Return the local name (without the prefix) for the element 13049 13050=item ns_prefix 13051 13052Return the namespace prefix for the element 13053 13054=item current_ns_prefixes 13055 13056Return a list of namespace prefixes valid for the element. The order of the 13057prefixes in the list has no meaning. If the default namespace is currently 13058bound, '' appears in the list. 13059 13060 13061=item inherit_att ($att, @optional_tag_list) 13062 13063Return the value of an attribute inherited from parent tags. The value 13064returned is found by looking for the attribute in the element then in turn 13065in each of its ancestors. If the C<@optional_tag_list> is supplied only those 13066ancestors whose tag is in the list will be checked. 13067 13068=item all_children_are ($optional_condition) 13069 13070return 1 if all children of the element pass the C<$optional_condition>, 130710 otherwise 13072 13073=item level ($optional_condition) 13074 13075Return the depth of the element in the twig (root is 0). 13076If C<$optional_condition> is given then only ancestors that match the condition are 13077counted. 13078 13079B<WARNING>: in a tree created using the C<twig_roots> option this will not return 13080the level in the document tree, level 0 will be the document root, level 1 13081will be the C<twig_roots> elements. During the parsing (in a C<twig_handler>) 13082you can use the C<depth> method on the twig object to get the real parsing depth. 13083 13084=item in ($potential_parent) 13085 13086Return true if the element is in the potential_parent (C<$potential_parent> is 13087an element) 13088 13089=item in_context ($cond, $optional_level) 13090 13091Return true if the element is included in an element which passes C<$cond> 13092optionally within C<$optional_level> levels. The returned value is the 13093including element. 13094 13095=item pcdata 13096 13097Return the text of a C<PCDATA> element or C<undef> if the element is not 13098C<PCDATA>. 13099 13100=item pcdata_xml_string 13101 13102Return the text of a C<PCDATA> element or undef if the element is not C<PCDATA>. 13103The text is "XML-escaped" ('&' and '<' are replaced by '&' and '<') 13104 13105=item set_pcdata ($text) 13106 13107Set the text of a C<PCDATA> element. This method does not check that the element is 13108indeed a C<PCDATA> so usually you should use C<L<set_text>> instead. 13109 13110=item append_pcdata ($text) 13111 13112Add the text at the end of a C<PCDATA> element. 13113 13114=item is_cdata 13115 13116Return 1 if the element is a C<CDATA> element, returns 0 otherwise. 13117 13118=item is_text 13119 13120Return 1 if the element is a C<CDATA> or C<PCDATA> element, returns 0 otherwise. 13121 13122=item cdata 13123 13124Return the text of a C<CDATA> element or C<undef> if the element is not 13125C<CDATA>. 13126 13127=item cdata_string 13128 13129Return the XML string of a C<CDATA> element, including the opening and 13130closing markers. 13131 13132=item set_cdata ($text) 13133 13134Set the text of a C<CDATA> element. 13135 13136=item append_cdata ($text) 13137 13138Add the text at the end of a C<CDATA> element. 13139 13140=item remove_cdata 13141 13142Turns all C<CDATA> sections in the element into regular C<PCDATA> elements. This is useful 13143when converting XML to HTML, as browsers do not support CDATA sections. 13144 13145=item extra_data 13146 13147Return the extra_data (comments and PI's) attached to an element 13148 13149=item set_extra_data ($extra_data) 13150 13151Set the extra_data (comments and PI's) attached to an element 13152 13153=item append_extra_data ($extra_data) 13154 13155Append extra_data to the existing extra_data before the element (if no 13156previous extra_data exists then it is created) 13157 13158=item set_asis 13159 13160Set a property of the element that causes it to be output without being XML 13161escaped by the print functions: if it contains C<< a < b >> it will be output 13162as such and not as C<< a < b >>. This can be useful to create text elements 13163that will be output as markup. Note that all C<PCDATA> descendants of the 13164element are also marked as having the property (they are the ones that are 13165actually impacted by the change). 13166 13167If the element is a C<CDATA> element it will also be output asis, without the 13168C<CDATA> markers. The same goes for any C<CDATA> descendant of the element 13169 13170=item set_not_asis 13171 13172Unsets the C<asis> property for the element and its text descendants. 13173 13174=item is_asis 13175 13176Return the C<asis> property status of the element ( 1 or C<undef>) 13177 13178=item closed 13179 13180Return true if the element has been closed. Might be useful if you are 13181somewhere in the tree, during the parse, and have no idea whether a parent 13182element is completely loaded or not. 13183 13184=item get_type 13185 13186Return the type of the element: 'C<#ELT>' for "real" elements, or 'C<#PCDATA>', 13187'C<#CDATA>', 'C<#COMMENT>', 'C<#ENT>', 'C<#PI>' 13188 13189=item is_elt 13190 13191Return the tag if the element is a "real" element, or 0 if it is C<PCDATA>, 13192C<CDATA>... 13193 13194=item contains_only_text 13195 13196Return 1 if the element does not contain any other "real" element 13197 13198=item contains_only ($exp) 13199 13200Return the list of children if all children of the element match 13201the expression C<$exp> 13202 13203 if( $para->contains_only( 'tt')) { ... } 13204 13205=item contains_a_single ($exp) 13206 13207If the element contains a single child that matches the expression C<$exp> 13208returns that element. Otherwise returns 0. 13209 13210=item is_field 13211 13212same as C<contains_only_text> 13213 13214=item is_pcdata 13215 13216Return 1 if the element is a C<PCDATA> element, returns 0 otherwise. 13217 13218=item is_ent 13219 13220Return 1 if the element is an entity (an unexpanded entity) element, 13221return 0 otherwise. 13222 13223=item is_empty 13224 13225Return 1 if the element is empty, 0 otherwise 13226 13227=item set_empty 13228 13229Flags the element as empty. No further check is made, so if the element 13230is actually not empty the output will be messed. The only effect of this 13231method is that the output will be C<< <tag att="value""/> >>. 13232 13233=item set_not_empty 13234 13235Flags the element as not empty. if it is actually empty then the element will 13236be output as C<< <tag att="value""></tag> >> 13237 13238=item is_pi 13239 13240Return 1 if the element is a processing instruction (C<#PI>) element, 13241return 0 otherwise. 13242 13243=item target 13244 13245Return the target of a processing instruction 13246 13247=item set_target ($target) 13248 13249Set the target of a processing instruction 13250 13251=item data 13252 13253Return the data part of a processing instruction 13254 13255=item set_data ($data) 13256 13257Set the data of a processing instruction 13258 13259=item set_pi ($target, $data) 13260 13261Set the target and data of a processing instruction 13262 13263=item pi_string 13264 13265Return the string form of a processing instruction 13266(C<< <?target data?> >>) 13267 13268=item is_comment 13269 13270Return 1 if the element is a comment (C<#COMMENT>) element, 13271return 0 otherwise. 13272 13273=item set_comment ($comment_text) 13274 13275Set the text for a comment 13276 13277=item comment 13278 13279Return the content of a comment (just the text, not the C<< <!-- >> 13280and C<< --> >>) 13281 13282=item comment_string 13283 13284Return the XML string for a comment (C<< <!-- comment --> >>) 13285 13286Note that an XML comment cannot start or end with a '-', or include '--' 13287(http://www.w3.org/TR/2008/REC-xml-20081126/#sec-comments), 13288if that is the case (because you have created the comment yourself presumably, 13289as it could not be in the input XML), then a space will be inserted before 13290an initial '-', after a trailing one or between two '-' in the comment 13291(which could presumably mangle javascript "hidden" in an XHTML comment); 13292 13293=item set_ent ($entity) 13294 13295Set an (non-expanded) entity (C<#ENT>). C<$entity>) is the entity 13296text (C<&ent;>) 13297 13298=item ent 13299 13300Return the entity for an entity (C<#ENT>) element (C<&ent;>) 13301 13302=item ent_name 13303 13304Return the entity name for an entity (C<#ENT>) element (C<ent>) 13305 13306=item ent_string 13307 13308Return the entity, either expanded if the expanded version is available, 13309or non-expanded (C<&ent;>) otherwise 13310 13311=item child ($offset, $optional_condition) 13312 13313Return the C<$offset>-th child of the element, optionally the C<$offset>-th 13314child that matches C<$optional_condition>. The children are treated as a list, so 13315C<< $elt->child( 0) >> is the first child, while C<< $elt->child( -1) >> is 13316the last child. 13317 13318=item child_text ($offset, $optional_condition) 13319 13320Return the text of a child or C<undef> if the sibling does not exist. Arguments 13321are the same as child. 13322 13323=item last_child ($optional_condition) 13324 13325Return the last child of the element, or the last child matching 13326C<$optional_condition> (ie the last of the element children matching 13327the condition). 13328 13329=item last_child_text ($optional_condition) 13330 13331Same as C<first_child_text> but for the last child. 13332 13333=item sibling ($offset, $optional_condition) 13334 13335Return the next or previous C<$offset>-th sibling of the element, or the 13336C<$offset>-th one matching C<$optional_condition>. If C<$offset> is negative then a 13337previous sibling is returned, if $offset is positive then a next sibling is 13338returned. C<$offset=0> returns the element if there is no condition or 13339if the element matches the condition>, C<undef> otherwise. 13340 13341=item sibling_text ($offset, $optional_condition) 13342 13343Return the text of a sibling or C<undef> if the sibling does not exist. 13344Arguments are the same as C<sibling>. 13345 13346=item prev_siblings ($optional_condition) 13347 13348Return the list of previous siblings (optionally matching C<$optional_condition>) 13349for the element. The elements are ordered in document order. 13350 13351=item next_siblings ($optional_condition) 13352 13353Return the list of siblings (optionally matching C<$optional_condition>) 13354following the element. The elements are ordered in document order. 13355 13356=item siblings ($optional_condition) 13357 13358Return the list of siblings (optionally matching C<$optional_condition>) 13359of the element (excluding the element itself). The elements are ordered 13360in document order. 13361 13362=item pos ($optional_condition) 13363 13364Return the position of the element in the children list. The first child has a 13365position of 1 (as in XPath). 13366 13367If the C<$optional_condition> is given then only siblings that match the condition 13368are counted. If the element itself does not match the condition then 133690 is returned. 13370 13371=item atts 13372 13373Return a hash ref containing the element attributes 13374 13375=item set_atts ({ att1=>$att1_val, att2=> $att2_val... }) 13376 13377Set the element attributes with the hash ref supplied as the argument. The previous 13378attributes are lost (ie the attributes set by C<set_atts> replace all of the 13379attributes of the element). 13380 13381You can also pass a list instead of a hashref: C<< $elt->set_atts( att1 => 'val1',...) >> 13382 13383=item del_atts 13384 13385Deletes all the element attributes. 13386 13387=item att_nb 13388 13389Return the number of attributes for the element 13390 13391=item has_atts 13392 13393Return true if the element has attributes (in fact return the number of 13394attributes, thus being an alias to C<L<att_nb>> 13395 13396=item has_no_atts 13397 13398Return true if the element has no attributes, false (0) otherwise 13399 13400=item att_names 13401 13402return a list of the attribute names for the element 13403 13404=item att_xml_string ($att, $options) 13405 13406Return the attribute value, where '&', '<' and quote (" or the value of the quote option 13407at twig creation) are XML-escaped. 13408 13409The options are passed as a hashref, setting C<escape_gt> to a true value will also escape 13410'>' ($elt( 'myatt', { escape_gt => 1 }); 13411 13412=item set_id ($id) 13413 13414Set the C<id> attribute of the element to the value. 13415See C<L<elt_id> > to change the id attribute name 13416 13417=item id 13418 13419Gets the id attribute value 13420 13421=item del_id ($id) 13422 13423Deletes the C<id> attribute of the element and remove it from the id list 13424for the document 13425 13426=item class 13427 13428Return the C<class> attribute for the element (methods on the C<class> 13429attribute are quite convenient when dealing with XHTML, or plain XML that 13430will eventually be displayed using CSS) 13431 13432=item lclass 13433 13434same as class, except that 13435this method is an lvalue, so you can do C<< $elt->lclass= "foo" >> 13436 13437=item set_class ($class) 13438 13439Set the C<class> attribute for the element to C<$class> 13440 13441=item add_class ($class) 13442 13443Add C<$class> to the element C<class> attribute: the new class is added 13444only if it is not already present. 13445 13446Note that classes are then sorted alphabetically, so the C<class> attribute 13447can be changed even if the class is already there 13448 13449=item remove_class ($class) 13450 13451Remove C<$class> from the element C<class> attribute. 13452 13453Note that classes are then sorted alphabetically, so the C<class> attribute can be 13454changed even if the class is already there 13455 13456 13457=item add_to_class ($class) 13458 13459alias for add_class 13460 13461=item att_to_class ($att) 13462 13463Set the C<class> attribute to the value of attribute C<$att> 13464 13465=item add_att_to_class ($att) 13466 13467Add the value of attribute C<$att> to the C<class> attribute of the element 13468 13469=item move_att_to_class ($att) 13470 13471Add the value of attribute C<$att> to the C<class> attribute of the element 13472and delete the attribute 13473 13474=item tag_to_class 13475 13476Set the C<class> attribute of the element to the element tag 13477 13478=item add_tag_to_class 13479 13480Add the element tag to its C<class> attribute 13481 13482=item set_tag_class ($new_tag) 13483 13484Add the element tag to its C<class> attribute and sets the tag to C<$new_tag> 13485 13486=item in_class ($class) 13487 13488Return true (C<1>) if the element is in the class C<$class> (if C<$class> is 13489one of the tokens in the element C<class> attribute) 13490 13491=item tag_to_span 13492 13493Change the element tag tp C<span> and set its class to the old tag 13494 13495=item tag_to_div 13496 13497Change the element tag tp C<div> and set its class to the old tag 13498 13499=item DESTROY 13500 13501Frees the element from memory. 13502 13503=item start_tag 13504 13505Return the string for the start tag for the element, including 13506the C<< /> >> at the end of an empty element tag 13507 13508=item end_tag 13509 13510Return the string for the end tag of an element. For an empty 13511element, this returns the empty string (''). 13512 13513=item xml_string @optional_options 13514 13515Equivalent to C<< $elt->sprint( 1) >>, returns the string for the entire 13516element, excluding the element's tags (but nested element tags are present) 13517 13518The 'C<no_recurse>' option will only return the text of the element, not 13519of any included sub-elements (same as C<L<xml_text_only>>). 13520 13521=item inner_xml 13522 13523Another synonym for xml_string 13524 13525=item outer_xml 13526 13527An other synonym for sprint 13528 13529=item xml_text 13530 13531Return the text of the element, encoded (and processed by the current 13532C<L<output_filter>> or C<L<output_encoding>> options, without any tag. 13533 13534=item xml_text_only 13535 13536Same as C<L<xml_text>> except that the text returned doesn't include 13537the text of sub-elements. 13538 13539=item set_pretty_print ($style) 13540 13541Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 13542'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>' 13543 13544pretty_print styles: 13545 13546=over 4 13547 13548=item none 13549 13550the default, no C<\n> is used 13551 13552=item nsgmls 13553 13554nsgmls style, with C<\n> added within tags 13555 13556=item nice 13557 13558adds C<\n> wherever possible (NOT SAFE, can lead to invalid XML) 13559 13560=item indented 13561 13562same as C<nice> plus indents elements (NOT SAFE, can lead to invalid XML) 13563 13564=item record 13565 13566table-oriented pretty print, one field per line 13567 13568=item record_c 13569 13570table-oriented pretty print, more compact than C<record>, one record per line 13571 13572=back 13573 13574=item set_empty_tag_style ($style) 13575 13576Set the method to output empty tags, amongst 'C<normal>' (default), 'C<html>', 13577and 'C<expand>', 13578 13579C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 13580'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 13581'C<< <tag></tag> >>' 13582 13583=item set_remove_cdata ($flag) 13584 13585set (or unset) the flag that forces the twig to output CDATA sections as 13586regular (escaped) PCDATA 13587 13588 13589=item set_indent ($string) 13590 13591Set the indentation for the indented pretty print style (default is 2 spaces) 13592 13593=item set_quote ($quote) 13594 13595Set the quotes used for attributes. can be 'C<double>' (default) or 'C<single>' 13596 13597=item cmp ($elt) 13598 13599 Compare the order of the 2 elements in a twig. 13600 13601 C<$a> is the <A>..</A> element, C<$b> is the <B>...</B> element 13602 13603 document $a->cmp( $b) 13604 <A> ... </A> ... <B> ... </B> -1 13605 <A> ... <B> ... </B> ... </A> -1 13606 <B> ... </B> ... <A> ... </A> 1 13607 <B> ... <A> ... </A> ... </B> 1 13608 $a == $b 0 13609 $a and $b not in the same tree undef 13610 13611=item before ($elt) 13612 13613Return 1 if C<$elt> starts before the element, 0 otherwise. If the 2 elements 13614are not in the same twig then return C<undef>. 13615 13616 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 13617 13618=item after ($elt) 13619 13620Return 1 if $elt starts after the element, 0 otherwise. If the 2 elements 13621are not in the same twig then return C<undef>. 13622 13623 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 13624 13625=item other comparison methods 13626 13627=over 4 13628 13629=item lt 13630 13631=item le 13632 13633=item gt 13634 13635=item ge 13636 13637=back 13638 13639=item path 13640 13641Return the element context in a form similar to XPath's short 13642form: 'C</root/tag1/../tag>' 13643 13644=item xpath 13645 13646Return a unique XPath expression that can be used to find the element 13647again. 13648 13649It looks like C</doc/sect[3]/title>: unique elements do not have an index, 13650the others do. 13651 13652=item flush 13653 13654flushes the twig up to the current element (strictly equivalent to 13655C<< $elt->root->flush >>) 13656 13657=item private methods 13658 13659Low-level methods on the twig: 13660 13661=over 4 13662 13663=item set_parent ($parent) 13664 13665=item set_first_child ($first_child) 13666 13667=item set_last_child ($last_child) 13668 13669=item set_prev_sibling ($prev_sibling) 13670 13671=item set_next_sibling ($next_sibling) 13672 13673=item set_twig_current 13674 13675=item del_twig_current 13676 13677=item twig_current 13678 13679=item contains_text 13680 13681=back 13682 13683Those methods should not be used, unless of course you find some creative 13684and interesting, not to mention useful, ways to do it. 13685 13686=back 13687 13688=head2 cond 13689 13690Most of the navigation functions accept a condition as an optional argument 13691The first element (or all elements for C<L<children> > or 13692C<L<ancestors> >) that passes the condition is returned. 13693 13694The condition is a single step of an XPath expression using the XPath subset 13695defined by C<L<get_xpath>>. Additional conditions are: 13696 13697The condition can be 13698 13699=over 4 13700 13701=item #ELT 13702 13703return a "real" element (not a PCDATA, CDATA, comment or pi element) 13704 13705=item #TEXT 13706 13707return a PCDATA or CDATA element 13708 13709=item regular expression 13710 13711return an element whose tag matches the regexp. The regexp has to be created 13712with C<qr//> (hence this is available only on perl 5.005 and above) 13713 13714=item code reference 13715 13716applies the code, passing the current element as argument, if the code returns 13717true then the element is returned, if it returns false then the code is applied 13718to the next candidate. 13719 13720=back 13721 13722=head2 XML::Twig::XPath 13723 13724XML::Twig implements a subset of XPath through the C<L<get_xpath>> method. 13725 13726If you want to use the whole XPath power, then you can use C<XML::Twig::XPath> 13727instead. In this case C<XML::Twig> uses C<XML::XPath> to execute XPath queries. 13728You will of course need C<XML::XPath> installed to be able to use C<XML::Twig::XPath>. 13729 13730See L<XML::XPath> for more information. 13731 13732The methods you can use are: 13733 13734=over 4 13735 13736=item findnodes ($path) 13737 13738return a list of nodes found by C<$path>. 13739 13740=item findnodes_as_string ($path) 13741 13742return the nodes found reproduced as XML. The result is not guaranteed 13743to be valid XML though. 13744 13745=item findvalue ($path) 13746 13747return the concatenation of the text content of the result nodes 13748 13749=back 13750 13751In order for C<XML::XPath> to be used as the XPath engine the following methods 13752are included in C<XML::Twig>: 13753 13754in XML::Twig 13755 13756=over 4 13757 13758=item getRootNode 13759 13760=item getParentNode 13761 13762=item getChildNodes 13763 13764=back 13765 13766in XML::Twig::Elt 13767 13768=over 4 13769 13770=item string_value 13771 13772=item toString 13773 13774=item getName 13775 13776=item getRootNode 13777 13778=item getNextSibling 13779 13780=item getPreviousSibling 13781 13782=item isElementNode 13783 13784=item isTextNode 13785 13786=item isPI 13787 13788=item isPINode 13789 13790=item isProcessingInstructionNode 13791 13792=item isComment 13793 13794=item isCommentNode 13795 13796=item getTarget 13797 13798=item getChildNodes 13799 13800=item getElementById 13801 13802=back 13803 13804=head2 XML::Twig::XPath::Elt 13805 13806The methods you can use are the same as on C<XML::Twig::XPath> elements: 13807 13808=over 4 13809 13810=item findnodes ($path) 13811 13812return a list of nodes found by C<$path>. 13813 13814=item findnodes_as_string ($path) 13815 13816return the nodes found reproduced as XML. The result is not guaranteed 13817to be valid XML though. 13818 13819=item findvalue ($path) 13820 13821return the concatenation of the text content of the result nodes 13822 13823=back 13824 13825 13826=head2 XML::Twig::Entity_list 13827 13828=over 4 13829 13830=item new 13831 13832Create an entity list. 13833 13834=item add ($ent) 13835 13836Add an entity to an entity list. 13837 13838=item add_new_ent ($name, $val, $sysid, $pubid, $ndata, $param) 13839 13840Create a new entity and add it to the entity list 13841 13842=item delete ($ent or $tag). 13843 13844Delete an entity (defined by its name or by the Entity object) 13845from the list. 13846 13847=item print ($optional_filehandle) 13848 13849Print the entity list. 13850 13851=item list 13852 13853Return the list as an array 13854 13855=back 13856 13857 13858=head2 XML::Twig::Entity 13859 13860=over 4 13861 13862=item new ($name, $val, $sysid, $pubid, $ndata, $param) 13863 13864Same arguments as the Entity handler for XML::Parser. 13865 13866=item print ($optional_filehandle) 13867 13868Print an entity declaration. 13869 13870=item name 13871 13872Return the name of the entity 13873 13874=item val 13875 13876Return the value of the entity 13877 13878=item sysid 13879 13880Return the system id for the entity (for NDATA entities) 13881 13882=item pubid 13883 13884Return the public id for the entity (for NDATA entities) 13885 13886=item ndata 13887 13888Return true if the entity is an NDATA entity 13889 13890=item param 13891 13892Return true if the entity is a parameter entity 13893 13894 13895=item text 13896 13897Return the entity declaration text. 13898 13899=back 13900 13901=head2 XML::Twig::Notation_list 13902 13903=over 4 13904 13905=item new 13906 13907Create an notation list. 13908 13909=item add ($notation) 13910 13911Add an notation to an notation list. 13912 13913=item add_new_notation ($name, $base, $sysid, $pubid) 13914 13915Create a new notation and add it to the notation list 13916 13917=item delete ($notation or $tag). 13918 13919Delete an notation (defined by its name or by the Notation object) 13920from the list. 13921 13922=item print ($optional_filehandle) 13923 13924Print the notation list. 13925 13926=item list 13927 13928Return the list as an array 13929 13930=back 13931 13932 13933=head2 XML::Twig::Notation 13934 13935=over 4 13936 13937=item new ($name, $base, $sysid, $pubid) 13938 13939Same argumnotations as the Notation handler for XML::Parser. 13940 13941=item print ($optional_filehandle) 13942 13943Print an notation declaration. 13944 13945=item name 13946 13947Return the name of the notation 13948 13949=item base 13950 13951Return the base to be used for resolving a relative URI 13952 13953=item sysid 13954 13955Return the system id for the notation 13956 13957=item pubid 13958 13959Return the public id for the notation 13960 13961 13962=item text 13963 13964Return the notation declaration text. 13965 13966=back 13967 13968 13969=head1 EXAMPLES 13970 13971Additional examples (and a complete tutorial) can be found on the 13972F<XML::Twig PageL<http://www.xmltwig.org/xmltwig/>> 13973 13974To figure out what flush does call the following script with an 13975XML file and an element name as arguments 13976 13977 use XML::Twig; 13978 13979 my ($file, $elt)= @ARGV; 13980 my $t= XML::Twig->new( twig_handlers => 13981 { $elt => sub {$_[0]->flush; print "\n[flushed here]\n";} }); 13982 $t->parsefile( $file, ErrorContext => 2); 13983 $t->flush; 13984 print "\n"; 13985 13986 13987=head1 NOTES 13988 13989=head2 Subclassing XML::Twig 13990 13991Useful methods: 13992 13993=over 4 13994 13995=item elt_class 13996 13997In order to subclass C<XML::Twig> you will probably need to subclass also 13998C<L<XML::Twig::Elt>>. Use the C<elt_class> option when you create the 13999C<XML::Twig> object to get the elements created in a different class 14000(which should be a subclass of C<XML::Twig::Elt>. 14001 14002=item add_options 14003 14004If you inherit C<XML::Twig> new method but want to add more options to it 14005you can use this method to prevent XML::Twig to issue warnings for those 14006additional options. 14007 14008=back 14009 14010=head2 DTD Handling 14011 14012There are 3 possibilities here. They are: 14013 14014=over 4 14015 14016=item No DTD 14017 14018No doctype, no DTD information, no entity information, the world is simple... 14019 14020=item Internal DTD 14021 14022The XML document includes an internal DTD, and maybe entity declarations. 14023 14024If you use the load_DTD option when creating the twig the DTD information and 14025the entity declarations can be accessed. 14026 14027The DTD and the entity declarations will be C<flush>'ed (or C<print>'ed) either 14028as is (if they have not been modified) or as reconstructed (poorly, comments 14029are lost, order is not kept, due to it's content this DTD should not be viewed 14030by anyone) if they have been modified. You can also modify them directly by 14031changing the C<< $twig->{twig_doctype}->{internal} >> field (straight from 14032XML::Parser, see the C<Doctype> handler doc) 14033 14034=item External DTD 14035 14036The XML document includes a reference to an external DTD, and maybe entity 14037declarations. 14038 14039If you use the C<load_DTD> when creating the twig the DTD information and the 14040entity declarations can be accessed. The entity declarations will be 14041C<flush>'ed (or C<print>'ed) either as is (if they have not been modified) or 14042as reconstructed (badly, comments are lost, order is not kept). 14043 14044You can change the doctype through the C<< $twig->set_doctype >> method and 14045print the dtd through the C<< $twig->dtd_text >> or C<< $twig->dtd_print >> 14046 methods. 14047 14048If you need to modify the entity list this is probably the easiest way to do it. 14049 14050=back 14051 14052 14053=head2 Flush 14054 14055Remember that element handlers are called when the element is CLOSED, so 14056if you have handlers for nested elements the inner handlers will be called 14057first. It makes it for example trickier than it would seem to number nested 14058sections (or clauses, or divs), as the titles in the inner sections are handled 14059before the outer sections. 14060 14061 14062=head1 BUGS 14063 14064=over 4 14065 14066=item segfault during parsing 14067 14068This happens when parsing huge documents, or lots of small ones, with a version 14069of Perl before 5.16. 14070 14071This is due to a bug in the way weak references are handled in Perl itself. 14072 14073The fix is either to upgrade to Perl 5.16 or later (C<perlbrew> is a great 14074tool to manage several installations of perl on the same machine). 14075 14076An other, NOT RECOMMENDED, way of fixing the problem, is to switch off weak 14077references by writing C<XML::Twig::_set_weakrefs( 0);> at the top of the code. 14078This is totally unsupported, and may lead to other problems though, 14079 14080=item entity handling 14081 14082Due to XML::Parser behaviour, non-base entities in attribute values disappear if 14083they are not declared in the document: 14084C<att="val&ent;"> will be turned into C<< att => val >>, unless you use the 14085C<keep_encoding> argument to C<< XML::Twig->new >> 14086 14087=item DTD handling 14088 14089The DTD handling methods are quite bugged. No one uses them and 14090it seems very difficult to get them to work in all cases, including with 14091several slightly incompatible versions of XML::Parser and of libexpat. 14092 14093Basically you can read the DTD, output it back properly, and update entities, 14094but not much more. 14095 14096So use XML::Twig with standalone documents, or with documents referring to an 14097external DTD, but don't expect it to properly parse and even output back the 14098DTD. 14099 14100=item memory leak 14101 14102If you use a REALLY old Perl (5.005!) and 14103a lot of twigs you might find that you leak quite a lot of memory 14104(about 2Ks per twig). You can use the C<L<dispose> > method to free 14105that memory after you are done. 14106 14107If you create elements the same thing might happen, use the C<L<delete>> 14108method to get rid of them. 14109 14110Alternatively installing the C<Scalar::Util> (or C<WeakRef>) module on a version 14111of Perl that supports it (>5.6.0) will get rid of the memory leaks automagically. 14112 14113=item ID list 14114 14115The ID list is NOT updated when elements are cut or deleted. 14116 14117=item change_gi 14118 14119This method will not function properly if you do: 14120 14121 $twig->change_gi( $old1, $new); 14122 $twig->change_gi( $old2, $new); 14123 $twig->change_gi( $new, $even_newer); 14124 14125=item sanity check on XML::Parser method calls 14126 14127XML::Twig should really prevent calls to some XML::Parser methods, especially 14128the C<setHandlers> method. 14129 14130=item pretty printing 14131 14132Pretty printing (at least using the 'C<indented>' style) is hard to get right! 14133Only elements that belong to the document will be properly indented. Printing 14134elements that do not belong to the twig makes it impossible for XML::Twig to 14135figure out their depth, and thus their indentation level. 14136 14137Also there is an unavoidable bug when using C<flush> and pretty printing for 14138elements with mixed content that start with an embedded element: 14139 14140 <elt><b>b</b>toto<b>bold</b></elt> 14141 14142 will be output as 14143 14144 <elt> 14145 <b>b</b>toto<b>bold</b></elt> 14146 14147if you flush the twig when you find the C<< <b> >> element 14148 14149 14150=back 14151 14152=head1 Globals 14153 14154These are the things that can mess up calling code, especially if threaded. 14155They might also cause problem under mod_perl. 14156 14157=over 4 14158 14159=item Exported constants 14160 14161Whether you want them or not you get them! These are subroutines to use 14162as constant when creating or testing elements 14163 14164 PCDATA return '#PCDATA' 14165 CDATA return '#CDATA' 14166 PI return '#PI', I had the choice between PROC and PI :--( 14167 14168=item Module scoped values: constants 14169 14170these should cause no trouble: 14171 14172 %base_ent= ( '>' => '>', 14173 '<' => '<', 14174 '&' => '&', 14175 "'" => ''', 14176 '"' => '"', 14177 ); 14178 CDATA_START = "<![CDATA["; 14179 CDATA_END = "]]>"; 14180 PI_START = "<?"; 14181 PI_END = "?>"; 14182 COMMENT_START = "<!--"; 14183 COMMENT_END = "-->"; 14184 14185pretty print styles 14186 14187 ( $NSGMLS, $NICE, $INDENTED, $INDENTED_C, $WRAPPED, $RECORD1, $RECORD2)= (1..7); 14188 14189empty tag output style 14190 14191 ( $HTML, $EXPAND)= (1..2); 14192 14193=item Module scoped values: might be changed 14194 14195Most of these deal with pretty printing, so the worst that can 14196happen is probably that XML output does not look right, but is 14197still valid and processed identically by XML processors. 14198 14199C<$empty_tag_style> can mess up HTML bowsers though and changing C<$ID> 14200would most likely create problems. 14201 14202 $pretty=0; # pretty print style 14203 $quote='"'; # quote for attributes 14204 $INDENT= ' '; # indent for indented pretty print 14205 $empty_tag_style= 0; # how to display empty tags 14206 $ID # attribute used as an id ('id' by default) 14207 14208=item Module scoped values: definitely changed 14209 14210These 2 variables are used to replace tags by an index, thus 14211saving some space when creating a twig. If they really cause 14212you too much trouble, let me know, it is probably possible to 14213create either a switch or at least a version of XML::Twig that 14214does not perform this optimization. 14215 14216 %gi2index; # tag => index 14217 @index2gi; # list of tags 14218 14219=back 14220 14221If you need to manipulate all those values, you can use the following methods on the 14222XML::Twig object: 14223 14224=over 4 14225 14226=item global_state 14227 14228Return a hashref with all the global variables used by XML::Twig 14229 14230The hash has the following fields: C<pretty>, C<quote>, C<indent>, 14231C<empty_tag_style>, C<keep_encoding>, C<expand_external_entities>, 14232C<output_filter>, C<output_text_filter>, C<keep_atts_order> 14233 14234=item set_global_state ($state) 14235 14236Set the global state, C<$state> is a hashref 14237 14238=item save_global_state 14239 14240Save the current global state 14241 14242=item restore_global_state 14243 14244Restore the previously saved (using C<Lsave_global_state>> state 14245 14246=back 14247 14248=head1 TODO 14249 14250=over 4 14251 14252=item SAX handlers 14253 14254Allowing XML::Twig to work on top of any SAX parser 14255 14256=item multiple twigs are not well supported 14257 14258A number of twig features are just global at the moment. These include 14259the ID list and the "tag pool" (if you use C<change_gi> then you change the tag 14260for ALL twigs). 14261 14262A future version will try to support this while trying not to be to 14263hard on performance (at least when a single twig is used!). 14264 14265=back 14266 14267=head1 AUTHOR 14268 14269Michel Rodriguez <mirod@cpan.org> 14270 14271=head1 LICENSE 14272 14273This library is free software; you can redistribute it and/or modify 14274it under the same terms as Perl itself. 14275 14276Bug reports should be sent using: 14277F<RT L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>> 14278 14279Comments can be sent to mirod@cpan.org 14280 14281The XML::Twig page is at L<http://www.xmltwig.org/xmltwig/> 14282It includes the development version of the module, a slightly better version 14283of the documentation, examples, a tutorial and a: 14284F<Processing XML efficiently with Perl and XML::Twig: 14285L<http://www.xmltwig.org/xmltwig/tutorial/index.html>> 14286 14287=head1 SEE ALSO 14288 14289Complete docs, including a tutorial, examples, an easier to use HTML version of 14290the docs, a quick reference card and a FAQ are available at 14291L<http://www.xmltwig.org/xmltwig/> 14292 14293git repository at L<http://github.com/mirod/xmltwig> 14294 14295L<XML::Parser>, L<XML::Parser::Expat>, L<XML::XPath>, L<Encode>, 14296L<Text::Iconv>, L<Scalar::Utils> 14297 14298 14299=head2 Alternative Modules 14300 14301XML::Twig is not the only XML::Processing module available on CPAN (far from 14302it!). 14303 14304The main alternative I would recommend is L<XML::LibXML>. 14305 14306Here is a quick comparison of the 2 modules: 14307 14308XML::LibXML, actually C<libxml2> on which it is based, sticks to the standards, 14309and implements a good number of them in a rather strict way: XML, XPath, DOM, 14310RelaxNG, I must be forgetting a couple (XInclude?). It is fast and rather 14311frugal memory-wise. 14312 14313XML::Twig is older: when I started writing it XML::Parser/expat was the only 14314game in town. It implements XML and that's about it (plus a subset of XPath, 14315and you can use XML::Twig::XPath if you have XML::XPathEngine installed for full 14316support). It is slower and requires more memory for a full tree than 14317XML::LibXML. On the plus side (yes, there is a plus side!) it lets you process 14318a big document in chunks, and thus let you tackle documents that couldn't be 14319loaded in memory by XML::LibXML, and it offers a lot (and I mean a LOT!) of 14320higher-level methods, for everything, from adding structure to "low-level" XML, 14321to shortcuts for XHTML conversions and more. It also DWIMs quite a bit, getting 14322comments and non-significant whitespaces out of the way but preserving them in 14323the output for example. As it does not stick to the DOM, is also usually leads 14324to shorter code than in XML::LibXML. 14325 14326Beyond the pure features of the 2 modules, XML::LibXML seems to be preferred by 14327"XML-purists", while XML::Twig seems to be more used by Perl Hackers who have 14328to deal with XML. As you have noted, XML::Twig also comes with quite a lot of 14329docs, but I am sure if you ask for help about XML::LibXML here or on Perlmonks 14330you will get answers. 14331 14332Note that it is actually quite hard for me to compare the 2 modules: on one hand 14333I know XML::Twig inside-out and I can get it to do pretty much anything I need 14334to (or I improve it ;--), while I have a very basic knowledge of XML::LibXML. 14335So feature-wise, I'd rather use XML::Twig ;--). On the other hand, I am 14336painfully aware of some of the deficiencies, potential bugs and plain ugly code 14337that lurk in XML::Twig, even though you are unlikely to be affected by them 14338(unless for example you need to change the DTD of a document programmatically), 14339while I haven't looked much into XML::LibXML so it still looks shinny and clean 14340to me. 14341 14342That said, if you need to process a document that is too big to fit memory 14343and XML::Twig is too slow for you, my reluctant advice would be to use "bare" 14344XML::Parser. It won't be as easy to use as XML::Twig: basically with XML::Twig 14345you trade some speed (depending on what you do from a factor 3 to... none) 14346for ease-of-use, but it will be easier IMHO than using SAX (albeit not 14347standard), and at this point a LOT faster (see the last test in 14348L<http://www.xmltwig.org/article/simple_benchmark/>). 14349 14350=cut 14351 14352 14353