1use strict; 2use warnings; # > perl 5.5 3 4# This is created in the caller's space 5# I realize (now!) that it's not clean, but it's been there for 10+ years... 6BEGIN 7{ sub ::PCDATA { '#PCDATA' } ## no critic (Subroutines::ProhibitNestedSubs); 8 sub ::CDATA { '#CDATA' } ## no critic (Subroutines::ProhibitNestedSubs); 9} 10 11use UNIVERSAL(); 12 13## if a sub returns a scalar, it better not bloody disappear in list context 14## no critic (Subroutines::ProhibitExplicitReturnUndef); 15 16my $perl_version; 17my $parser_version; 18 19###################################################################### 20package XML::Twig; 21###################################################################### 22 23require 5.004; 24 25use utf8; # > perl 5.5 26 27use vars qw($VERSION @ISA %valid_option); 28 29use Carp; 30use File::Spec; 31use File::Basename; 32 33*isa= *UNIVERSAL::isa; 34 35# flag, set to true if the weaken sub is available 36use vars qw( $weakrefs); 37 38# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs 39# wrt doctype handling. This is global for performance reasons. 40my $expat_1_95_2=0; 41 42# a slight non-xml mod: # is allowed as a first character 43my $REG_TAG_FIRST_LETTER; 44#$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])}; # < perl 5.6 - does not work for leading non-ascii letters 45$REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6 46 47my $REG_TAG_LETTER= q{(?:[\w_.-]*)}; 48 49# a simple name (no colon) 50my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)}; 51 52# a tag name, possibly including namespace 53my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)}; 54 55# tag name (leading # allowed) 56# first line is for perl 5.005, second line for modern perl, that accept character classes 57my $REG_TAG_NAME=$REG_NAME; 58 59# name or wildcard (* or '') (leading # allowed) 60my $REG_NAME_W = qq{(?:$REG_NAME|[*])}; 61 62# class and ids are deliberatly permissive 63my $REG_NTOKEN_FIRST_LETTER; 64#$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])}; # < perl 5.6 - does not work for leading non-ascii letters 65$REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6 66 67my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)}; 68 69my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)}; 70my $REG_CLASS = $REG_NTOKEN; 71my $REG_ID = $REG_NTOKEN; 72 73# allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id> 74my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)}; 75 76my $REG_REGEXP = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)}; # regexp 77my $REG_MATCH = q{[!=]~}; # match (or not) 78my $REG_STRING = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')}; # string (simple or double quoted) 79my $REG_NUMBER = q{(?:\d+(?:\.\d*)?|\.\d+)}; # number 80my $REG_VALUE = qq{(?:$REG_STRING|$REG_NUMBER)}; # value 81my $REG_OP = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=}; # op 82my $REG_FUNCTION = q{(?:string|text)\(\s*\)}; 83my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)}; 84my $REG_COMP = q{(?:>=|<=|!=|<|>|=)}; 85 86my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))}; 87 88# keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones 89my $ST_TAG = '##tag'; 90my $ST_ELT = '##elt'; 91my $ST_NS = '##ns' ; 92 93# used in the handler trigger code 94my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)}; 95my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]}; 96 97# not all axis, only supported ones (in get_xpath) 98my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self', 99 'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self' 100 ); 101my $REG_AXIS = "(?:" . join( '|', @supported_axis) .")"; 102 103# only used in the "xpath"engine (for get_xpath/findnodes) for now 104my $REG_PREDICATE_ALT = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]}; 105 106# used to convert XPath tests on strings to the perl equivalent 107my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le '); 108 109my( $FB_HTMLCREF, $FB_XMLCREF); 110 111my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0'; 112 113# default namespaces, both ways 114my %DEFAULT_NS= ( xml => "http://www.w3.org/XML/1998/namespace", 115 xmlns => "http://www.w3.org/2000/xmlns/", 116 ); 117my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS; 118 119# constants 120my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $TEXT, $ASIS, $EMPTY, $BUFSIZE); 121 122# used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one 123# this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't 124# the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration 125my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN" => "http://www.w3.org/TR/REC-html40/loose.dtd", 126 "-//W3C//DTD HTML 4.01//EN" => "http://www.w3.org/TR/html4/strict.dtd", 127 "-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd", 128 "-//W3C//DTD HTML 4.01 Frameset//EN" => "http://www.w3.org/TR/html4/frameset.dtd", 129 "-//W3C//DTD XHTML 1.0 Strict//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", 130 "-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", 131 "-//W3C//DTD XHTML 1.0 Frameset//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", 132 "-//W3C//DTD XHTML 1.1//EN" => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", 133 "-//W3C//DTD XHTML Basic 1.0//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd", 134 "-//W3C//DTD XHTML Basic 1.1//EN" => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd", 135 "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd", 136 "-//WAPFORUM//DTD XHTML Mobile 1.1//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd", 137 "-//WAPFORUM//DTD XHTML Mobile 1.2//EN" => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd", 138 "-//W3C//DTD XHTML+RDFa 1.0//EN" => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd", 139 ); 140 141my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN"; 142 143my $SEP= qr/\s*(?:$|\|)/; 144 145BEGIN 146{ 147$VERSION = '3.48'; 148 149use XML::Parser; 150my $needVersion = '2.23'; 151($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn 152croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion; 153 154($perl_version= $])=~ s{_\d+}{}; 155 156if( $perl_version >= 5.008) 157 { eval "use Encode qw( :all)"; ## no critic ProhibitStringyEval 158 $FB_XMLCREF = 0x0400; # Encode::FB_XMLCREF; 159 $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF; 160 } 161 162# test whether we can use weak references 163# set local empty signal handler to trap error messages 164{ local $SIG{__DIE__}; 165 if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken)) 166 { import Scalar::Util( 'weaken'); $weakrefs= 1; } 167 elsif( eval( 'require WeakRef')) 168 { import WeakRef; $weakrefs= 1; } 169 else 170 { $weakrefs= 0; } 171} 172 173import XML::Twig::Elt; 174import XML::Twig::Entity; 175import XML::Twig::Entity_list; 176 177# used to store the gi's 178# should be set for each twig really, at least when there are several 179# the init ensures that special gi's are always the same 180 181# constants: element types 182$PCDATA = '#PCDATA'; 183$CDATA = '#CDATA'; 184$PI = '#PI'; 185$COMMENT = '#COMMENT'; 186$ENT = '#ENT'; 187 188# element classes 189$ELT = '#ELT'; 190$TEXT = '#TEXT'; 191 192# element properties 193$ASIS = '#ASIS'; 194$EMPTY = '#EMPTY'; 195 196# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat 197$BUFSIZE = 32768; 198 199 200# gi => index 201%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5); 202# list of gi's 203@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT); 204 205# gi's under this value are special 206$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi; 207 208%XML::Twig::base_ent= ( '>' => '>', '<' => '<', '&' => '&', "'" => ''', '"' => '"',); 209foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); } 210 211# now set some aliases 212*find_nodes = *get_xpath; # same as XML::XPath 213*findnodes = *get_xpath; # same as XML::LibXML 214*getElementsByTagName = *descendants; 215*descendants_or_self = *descendants; # valid in XML::Twig, not in XML::Twig::Elt 216*find_by_tag_name = *descendants; 217*getElementById = *elt_id; 218*getEltById = *elt_id; 219*toString = *sprint; 220*create_accessors = *att_accessors; 221 222} 223 224@ISA = qw(XML::Parser); 225 226# fake gi's used in twig_handlers and start_tag_handlers 227my $ALL = '_all_'; # the associated function is always called 228my $DEFAULT= '_default_'; # the function is called if no other handler has been 229 230# some defaults 231my $COMMENTS_DEFAULT= 'keep'; 232my $PI_DEFAULT = 'keep'; 233 234 235# handlers used in regular mode 236my %twig_handlers=( Start => \&_twig_start, 237 End => \&_twig_end, 238 Char => \&_twig_char, 239 Entity => \&_twig_entity, 240 XMLDecl => \&_twig_xmldecl, 241 Doctype => \&_twig_doctype, 242 Element => \&_twig_element, 243 Attlist => \&_twig_attlist, 244 CdataStart => \&_twig_cdatastart, 245 CdataEnd => \&_twig_cdataend, 246 Proc => \&_twig_pi, 247 Comment => \&_twig_comment, 248 Default => \&_twig_default, 249 ExternEnt => \&_twig_extern_ent, 250 ); 251 252# handlers used when twig_roots is used and we are outside of the roots 253my %twig_handlers_roots= 254 ( Start => \&_twig_start_check_roots, 255 End => \&_twig_end_check_roots, 256 Doctype => \&_twig_doctype, 257 Char => undef, Entity => undef, XMLDecl => \&_twig_xmldecl, 258 Element => undef, Attlist => undef, CdataStart => undef, 259 CdataEnd => undef, Proc => undef, Comment => undef, 260 Proc => \&_twig_pi_check_roots, 261 Default => sub {}, # hack needed for XML::Parser 2.27 262 ExternEnt => \&_twig_extern_ent, 263 ); 264 265# handlers used when twig_roots and print_outside_roots are used and we are 266# outside of the roots 267my %twig_handlers_roots_print_2_30= 268 ( Start => \&_twig_start_check_roots, 269 End => \&_twig_end_check_roots, 270 Char => \&_twig_print, 271 Entity => \&_twig_print_entity, 272 ExternEnt => \&_twig_print_entity, 273 DoctypeFin => \&_twig_doctype_fin_print, 274 XMLDecl => sub { _twig_xmldecl( @_); _twig_print( @_); }, 275 Doctype => \&_twig_print_doctype, # because recognized_string is broken here 276 # Element => \&_twig_print, Attlist => \&_twig_print, 277 CdataStart => \&_twig_print, CdataEnd => \&_twig_print, 278 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, 279 Default => \&_twig_print_check_doctype, 280 ExternEnt => \&_twig_extern_ent, 281 ); 282 283# handlers used when twig_roots, print_outside_roots and keep_encoding are used 284# and we are outside of the roots 285my %twig_handlers_roots_print_original_2_30= 286 ( Start => \&_twig_start_check_roots, 287 End => \&_twig_end_check_roots, 288 Char => \&_twig_print_original, 289 # I have no idea why I should not be using this handler! 290 Entity => \&_twig_print_entity, 291 ExternEnt => \&_twig_print_entity, 292 DoctypeFin => \&_twig_doctype_fin_print, 293 XMLDecl => sub { _twig_xmldecl( @_); _twig_print_original( @_) }, 294 Doctype => \&_twig_print_original_doctype, # because original_string is broken here 295 Element => \&_twig_print_original, Attlist => \&_twig_print_original, 296 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 297 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, 298 Default => \&_twig_print_original_check_doctype, 299 ); 300 301# handlers used when twig_roots and print_outside_roots are used and we are 302# outside of the roots 303my %twig_handlers_roots_print_2_27= 304 ( Start => \&_twig_start_check_roots, 305 End => \&_twig_end_check_roots, 306 Char => \&_twig_print, 307 # if the Entity handler is set then it prints the entity declaration 308 # before the entire internal subset (including the declaration!) is output 309 Entity => sub {}, 310 XMLDecl => \&_twig_print, Doctype => \&_twig_print, 311 CdataStart => \&_twig_print, CdataEnd => \&_twig_print, 312 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print, 313 Default => \&_twig_print, 314 ExternEnt => \&_twig_extern_ent, 315 ); 316 317# handlers used when twig_roots, print_outside_roots and keep_encoding are used 318# and we are outside of the roots 319my %twig_handlers_roots_print_original_2_27= 320 ( Start => \&_twig_start_check_roots, 321 End => \&_twig_end_check_roots, 322 Char => \&_twig_print_original, 323 # for some reason original_string is wrong here 324 # this can be a problem if the doctype includes non ascii characters 325 XMLDecl => \&_twig_print, Doctype => \&_twig_print, 326 # if the Entity handler is set then it prints the entity declaration 327 # before the entire internal subset (including the declaration!) is output 328 Entity => sub {}, 329 #Element => undef, Attlist => undef, 330 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 331 Proc => \&_twig_pi_check_roots, Comment => \&_twig_print_original, 332 Default => \&_twig_print, # _twig_print_original does not work 333 ExternEnt => \&_twig_extern_ent, 334 ); 335 336 337my %twig_handlers_roots_print= $parser_version > 2.27 338 ? %twig_handlers_roots_print_2_30 339 : %twig_handlers_roots_print_2_27; 340my %twig_handlers_roots_print_original= $parser_version > 2.27 341 ? %twig_handlers_roots_print_original_2_30 342 : %twig_handlers_roots_print_original_2_27; 343 344 345# handlers used when the finish_print method has been called 346my %twig_handlers_finish_print= 347 ( Start => \&_twig_print, 348 End => \&_twig_print, Char => \&_twig_print, 349 Entity => \&_twig_print, XMLDecl => \&_twig_print, 350 Doctype => \&_twig_print, Element => \&_twig_print, 351 Attlist => \&_twig_print, CdataStart => \&_twig_print, 352 CdataEnd => \&_twig_print, Proc => \&_twig_print, 353 Comment => \&_twig_print, Default => \&_twig_print, 354 ExternEnt => \&_twig_extern_ent, 355 ); 356 357# handlers used when the finish_print method has been called and the keep_encoding 358# option is used 359my %twig_handlers_finish_print_original= 360 ( Start => \&_twig_print_original, End => \&_twig_print_end_original, 361 Char => \&_twig_print_original, Entity => \&_twig_print_original, 362 XMLDecl => \&_twig_print_original, Doctype => \&_twig_print_original, 363 Element => \&_twig_print_original, Attlist => \&_twig_print_original, 364 CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original, 365 Proc => \&_twig_print_original, Comment => \&_twig_print_original, 366 Default => \&_twig_print_original, 367 ); 368 369# handlers used within ignored elements 370my %twig_handlers_ignore= 371 ( Start => \&_twig_ignore_start, 372 End => \&_twig_ignore_end, 373 Char => undef, Entity => undef, XMLDecl => undef, 374 Doctype => undef, Element => undef, Attlist => undef, 375 CdataStart => undef, CdataEnd => undef, Proc => undef, 376 Comment => undef, Default => undef, 377 ExternEnt => undef, 378 ); 379 380 381# those handlers are only used if the entities are NOT to be expanded 382my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default ); 383 384my @saved_default_handler; 385 386my $ID= 'id'; # default value, set by the Id argument 387my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers 388 389# all allowed options 390%valid_option= 391 ( # XML::Twig options 392 TwigHandlers => 1, Id => 1, 393 TwigRoots => 1, TwigPrintOutsideRoots => 1, 394 StartTagHandlers => 1, EndTagHandlers => 1, 395 ForceEndTagHandlersUsage => 1, 396 DoNotChainHandlers => 1, 397 IgnoreElts => 1, 398 Index => 1, 399 AttAccessors => 1, 400 EltAccessors => 1, 401 FieldAccessors => 1, 402 CharHandler => 1, 403 TopDownHandlers => 1, 404 KeepEncoding => 1, DoNotEscapeAmpInAtts => 1, 405 ParseStartTag => 1, KeepAttsOrder => 1, 406 LoadDTD => 1, DTDHandler => 1, 407 DoNotOutputDTD => 1, NoProlog => 1, 408 ExpandExternalEnts => 1, 409 DiscardSpaces => 1, KeepSpaces => 1, DiscardAllSpaces => 1, 410 DiscardSpacesIn => 1, KeepSpacesIn => 1, 411 PrettyPrint => 1, EmptyTags => 1, 412 EscapeGt => 1, 413 Quote => 1, 414 Comments => 1, Pi => 1, 415 OutputFilter => 1, InputFilter => 1, 416 OutputTextFilter => 1, 417 OutputEncoding => 1, 418 RemoveCdata => 1, 419 EltClass => 1, 420 MapXmlns => 1, KeepOriginalPrefix => 1, 421 SkipMissingEnts => 1, 422 # XML::Parser options 423 ErrorContext => 1, ProtocolEncoding => 1, 424 Namespaces => 1, NoExpand => 1, 425 Stream_Delimiter => 1, ParseParamEnt => 1, 426 NoLWP => 1, Non_Expat_Options => 1, 427 Xmlns => 1, CssSel => 1, 428 UseTidy => 1, TidyOptions => 1, 429 OutputHtmlDoctype => 1, 430 ); 431 432my $active_twig; # last active twig,for XML::Twig::s 433 434# predefined input and output filters 435use vars qw( %filter); 436%filter= ( html => \&html_encode, 437 safe => \&safe_encode, 438 safe_hex => \&safe_encode_hex, 439 ); 440 441 442# trigger types (used to sort them) 443my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3); 444 445sub new 446 { my ($class, %args) = @_; 447 my $handlers; 448 449 # change all nice_perlish_names into nicePerlishNames 450 %args= _normalize_args( %args); 451 452 # check options 453 unless( $args{MoreOptions}) 454 { foreach my $arg (keys %args) 455 { carp "invalid option $arg" unless $valid_option{$arg}; } 456 } 457 458 # a twig is really an XML::Parser 459 # my $self= XML::Parser->new(%args); 460 my $self; 461 $self= XML::Parser->new(%args); 462 463 bless $self, $class; 464 465 $self->{_twig_context_stack}= []; 466 467 # allow tag.class selectors in handler triggers 468 $css_sel= $args{CssSel} || 0; 469 470 471 if( exists $args{TwigHandlers}) 472 { $handlers= $args{TwigHandlers}; 473 $self->setTwigHandlers( $handlers); 474 delete $args{TwigHandlers}; 475 } 476 477 # take care of twig-specific arguments 478 if( exists $args{StartTagHandlers}) 479 { $self->setStartTagHandlers( $args{StartTagHandlers}); 480 delete $args{StartTagHandlers}; 481 } 482 483 if( exists $args{DoNotChainHandlers}) 484 { $self->{twig_do_not_chain_handlers}= $args{DoNotChainHandlers}; } 485 486 if( exists $args{IgnoreElts}) 487 { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)] 488 if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; } 489 $self->setIgnoreEltsHandlers( $args{IgnoreElts}); 490 delete $args{IgnoreElts}; 491 } 492 493 if( exists $args{Index}) 494 { my $index= $args{Index}; 495 # we really want a hash name => path, we turn an array into a hash if necessary 496 if( ref( $index) eq 'ARRAY') 497 { my %index= map { $_ => $_ } @$index; 498 $index= \%index; 499 } 500 while( my( $name, $exp)= each %$index) 501 { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); } 502 } 503 504 $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt'; 505 if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; } 506 if( exists( $args{EltClass})) { delete $args{EltClass}; } 507 508 if( exists( $args{MapXmlns})) 509 { $self->{twig_map_xmlns}= $args{MapXmlns}; 510 $self->{Namespaces}=1; 511 delete $args{MapXmlns}; 512 } 513 514 if( exists( $args{KeepOriginalPrefix})) 515 { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix}; 516 delete $args{KeepOriginalPrefix}; 517 } 518 519 $self->{twig_dtd_handler}= $args{DTDHandler}; 520 delete $args{DTDHandler}; 521 522 if( $args{ExpandExternalEnts}) 523 { $self->set_expand_external_entities( 1); 524 $self->{twig_expand_external_ents}= $args{ExpandExternalEnts}; 525 $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts 526 if( $args{ExpandExternalEnts} == -1) 527 { $self->{twig_extern_ent_nofail}= 1; 528 $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail); 529 } 530 delete $args{LoadDTD}; 531 delete $args{ExpandExternalEnts}; 532 } 533 else 534 { $self->set_expand_external_entities( 0); } 535 536 if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP')) 537 { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler } 538 else 539 { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler } 540 541 if( $args{DoNotEscapeAmpInAtts}) 542 { $self->set_do_not_escape_amp_in_atts( 1); 543 $self->{twig_do_not_escape_amp_in_atts}=1; 544 } 545 else 546 { $self->set_do_not_escape_amp_in_atts( 0); 547 $self->{twig_do_not_escape_amp_in_atts}=0; 548 } 549 550 # deal with TwigRoots argument, a hash of elements for which 551 # subtrees will be built (and associated handlers) 552 553 if( $args{TwigRoots}) 554 { $self->setTwigRoots( $args{TwigRoots}); 555 delete $args{TwigRoots}; 556 } 557 558 if( $args{EndTagHandlers}) 559 { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage}) 560 { croak "you should not use EndTagHandlers without TwigRoots\n", 561 "if you want to use it anyway, normally because you have ", 562 "a start_tag_handlers that calls 'ignore' and you want to ", 563 "call an ent_tag_handlers at the end of the element, then ", 564 "pass 'force_end_tag_handlers_usage => 1' as an argument ", 565 "to new"; 566 } 567 568 $self->setEndTagHandlers( $args{EndTagHandlers}); 569 delete $args{EndTagHandlers}; 570 } 571 572 if( $args{TwigPrintOutsideRoots}) 573 { croak "cannot use twig_print_outside_roots without twig_roots" 574 unless( $self->{twig_roots}); 575 # if the arg is a filehandle then store it 576 if( _is_fh( $args{TwigPrintOutsideRoots}) ) 577 { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; } 578 $self->{twig_default_print}= $args{TwigPrintOutsideRoots}; 579 } 580 581 # space policy 582 if( $args{KeepSpaces}) 583 { croak "cannot use both keep_spaces and discard_spaces" if( $args{DiscardSpaces}); 584 croak "cannot use both keep_spaces and discard_all_spaces" if( $args{DiscardAllSpaces}); 585 croak "cannot use both keep_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); 586 $self->{twig_keep_spaces}=1; 587 delete $args{KeepSpaces}; 588 } 589 if( $args{DiscardSpaces}) 590 { 591 croak "cannot use both discard_spaces and keep_spaces_in" if( $args{KeepSpacesIn}); 592 croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces}); 593 croak "cannot use both discard_spaces and discard_spaces_in" if( $args{DiscardSpacesIn}); 594 $self->{twig_discard_spaces}=1; 595 delete $args{DiscardSpaces}; 596 } 597 if( $args{KeepSpacesIn}) 598 { croak "cannot use both keep_spaces_in and discard_spaces_in" if( $args{DiscardSpacesIn}); 599 croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces}); 600 $self->{twig_discard_spaces}=1; 601 $self->{twig_keep_spaces_in}={}; 602 my @tags= @{$args{KeepSpacesIn}}; 603 foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; } 604 delete $args{KeepSpacesIn}; 605 } 606 607 if( $args{DiscardAllSpaces}) 608 { 609 croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn}); 610 $self->{twig_discard_all_spaces}=1; 611 delete $args{DiscardAllSpaces}; 612 } 613 614 if( $args{DiscardSpacesIn}) 615 { $self->{twig_keep_spaces}=1; 616 $self->{twig_discard_spaces_in}={}; 617 my @tags= @{$args{DiscardSpacesIn}}; 618 foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; } 619 delete $args{DiscardSpacesIn}; 620 } 621 # discard spaces by default 622 $self->{twig_discard_spaces}= 1 unless( $self->{twig_keep_spaces}); 623 624 $args{Comments}||= $COMMENTS_DEFAULT; 625 if( $args{Comments} eq 'drop') { $self->{twig_keep_comments}= 0; } 626 elsif( $args{Comments} eq 'keep') { $self->{twig_keep_comments}= 1; } 627 elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; } 628 else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; } 629 delete $args{Comments}; 630 631 $args{Pi}||= $PI_DEFAULT; 632 if( $args{Pi} eq 'drop') { $self->{twig_keep_pi}= 0; } 633 elsif( $args{Pi} eq 'keep') { $self->{twig_keep_pi}= 1; } 634 elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; } 635 else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; } 636 delete $args{Pi}; 637 638 if( $args{KeepEncoding}) 639 { 640 # set it in XML::Twig::Elt so print functions know what to do 641 $self->set_keep_encoding( 1); 642 $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag; 643 delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ; 644 delete $args{KeepEncoding}; 645 } 646 else 647 { $self->set_keep_encoding( 0); 648 if( $args{ParseStartTag}) 649 { $self->{parse_start_tag}= $args{ParseStartTag}; } 650 else 651 { delete $self->{parse_start_tag}; } 652 delete $args{ParseStartTag}; 653 } 654 655 if( $args{OutputFilter}) 656 { $self->set_output_filter( $args{OutputFilter}); 657 delete $args{OutputFilter}; 658 } 659 else 660 { $self->set_output_filter( 0); } 661 662 if( $args{RemoveCdata}) 663 { $self->set_remove_cdata( $args{RemoveCdata}); 664 delete $args{RemoveCdata}; 665 } 666 else 667 { $self->set_remove_cdata( 0); } 668 669 if( $args{OutputTextFilter}) 670 { $self->set_output_text_filter( $args{OutputTextFilter}); 671 delete $args{OutputTextFilter}; 672 } 673 else 674 { $self->set_output_text_filter( 0); } 675 676 if( exists $args{KeepAttsOrder}) 677 { $self->{keep_atts_order}= $args{KeepAttsOrder}; 678 if( _use( 'Tie::IxHash')) 679 { $self->set_keep_atts_order( $self->{keep_atts_order}); } 680 else 681 { croak "Tie::IxHash not available, option keep_atts_order not allowed"; } 682 } 683 else 684 { $self->set_keep_atts_order( 0); } 685 686 687 if( $args{PrettyPrint}) { $self->set_pretty_print( $args{PrettyPrint}); } 688 if( $args{EscapeGt}) { $self->escape_gt( $args{EscapeGt}); } 689 if( $args{EmptyTags}) { $self->set_empty_tag_style( $args{EmptyTags}) } 690 691 if( exists $args{Id}) { $ID= $args{Id}; delete $args{ID}; } 692 if( $args{NoProlog}) { $self->{no_prolog}= 1; delete $args{NoProlog}; } 693 if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1; delete $args{DoNotOutputDTD}; } 694 if( $args{LoadDTD}) { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD}; } 695 if( $args{CharHandler}) { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; } 696 697 if( $args{InputFilter}) { $self->set_input_filter( $args{InputFilter}); delete $args{InputFilter}; } 698 if( $args{NoExpand}) { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; } 699 if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; } 700 701 if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; } 702 703 if( my $acc_a= $args{AttAccessors}) { $self->att_accessors( @$acc_a); } 704 if( my $acc_e= $args{EltAccessors}) { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e); } 705 if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); } 706 707 if( $args{UseTidy}) { $self->{use_tidy}= 1; } 708 $self->{tidy_options}= $args{TidyOptions} || {}; 709 710 if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; } 711 712 $self->set_quote( $args{Quote} || 'double'); 713 714 # set handlers 715 if( $self->{twig_roots}) 716 { if( $self->{twig_default_print}) 717 { if( $self->{twig_keep_encoding}) 718 { $self->setHandlers( %twig_handlers_roots_print_original); } 719 else 720 { $self->setHandlers( %twig_handlers_roots_print); } 721 } 722 else 723 { $self->setHandlers( %twig_handlers_roots); } 724 } 725 else 726 { $self->setHandlers( %twig_handlers); } 727 728 # XML::Parser::Expat does not like these handler to be set. So in order to 729 # use the various sets of handlers on XML::Parser or XML::Parser::Expat 730 # objects when needed, these ones have to be set only once, here, at 731 # XML::Parser level 732 $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final); 733 734 $self->{twig_entity_list}= XML::Twig::Entity_list->new; 735 736 $self->{twig_id}= $ID; 737 $self->{twig_stored_spaces}=''; 738 739 $self->{twig_autoflush}= 1; # auto flush by default 740 741 $self->{twig}= $self; 742 if( $weakrefs) { weaken( $self->{twig}); } 743 744 return $self; 745 } 746 747sub parse 748 { 749 my $t= shift; 750 # if called as a class method, calls nparse, which creates the twig then parses it 751 if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); } 752 753 # requires 5.006 at least (or the ${^UNICODE} causes a problem) # > perl 5.5 754 # trap underlying bug in IO::Handle (see RT #17500) # > perl 5.5 755 # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe # > perl 5.5 756 if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] ) # > perl 5.5 757 { croak "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n" # > perl 5.5 758 . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n" # > perl 5.5 759 . "not to include 'D'"; # > perl 5.5 760 } # > perl 5.5 761 $t= eval { $t->SUPER::parse( @_); }; 762 763 if( !$t 764 && $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)} 765 && -f $_[0] 766 ) 767 { croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; } 768 return _checked_parse_result( $t, $@); 769 } 770 771sub parsefile 772 { my $t= shift; 773 if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); } 774 $t= eval { $t->SUPER::parsefile( @_); }; 775 return _checked_parse_result( $t, $@); 776 } 777 778sub _checked_parse_result 779 { my( $t, $returned)= @_; 780 if( !$t) 781 { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now}) 782 { $t= $returned; 783 delete $t->{twig_finish_now}; 784 return $t->_twig_final; 785 } 786 else 787 { _croak( $returned, 0); } 788 } 789 790 $active_twig= $t; 791 return $t; 792 } 793 794sub active_twig { return $active_twig; } 795 796sub finish_now 797 { my $t= shift; 798 $t->{twig_finish_now}=1; 799 die $t; 800 } 801 802 803sub parsefile_inplace { shift->_parse_inplace( parsefile => @_); } 804sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); } 805 806sub _parse_inplace 807 { my( $t, $method, $file, $suffix)= @_; 808 _use( 'File::Temp') || croak "need File::Temp to use inplace methods\n"; 809 _use( 'File::Basename'); 810 811 812 my $tmpdir= dirname( $file); 813 my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir); 814 my $original_fh= select $tmpfh; 815 816 unless( $t->{twig_keep_encoding} || $perl_version < 5.006) 817 { if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio 818 { binmode( $tmpfh, ":utf8" ); } 819 } 820 821 $t->$method( $file); 822 823 select $original_fh; 824 close $tmpfh; 825 my $mode= (stat( $file))[2] & oct(7777); 826 chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!"; 827 828 if( $suffix) 829 { my $backup; 830 if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; } 831 else { $backup= $file . $suffix; } 832 833 rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!"; 834 } 835 rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!"; 836 837 return $t; 838 } 839 840 841sub parseurl 842 { my $t= shift; 843 $t->_parseurl( 0, @_); 844 } 845 846sub safe_parseurl 847 { my $t= shift; 848 $t->_parseurl( 1, @_); 849 } 850 851sub safe_parsefile_html 852 { my $t= shift; 853 eval { $t->parsefile_html( @_); }; 854 return $@ ? $t->_reset_twig_after_error : $t; 855 } 856 857sub safe_parseurl_html 858 { my $t= shift; 859 _use( 'LWP::Simple') or croak "missing LWP::Simple"; 860 eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ; 861 return $@ ? $t->_reset_twig_after_error : $t; 862 } 863 864sub parseurl_html 865 { my $t= shift; 866 _use( 'LWP::Simple') or croak "missing LWP::Simple"; 867 $t->parse_html( LWP::Simple::get( shift()), @_); 868 } 869 870 871# uses eval to catch the parser's death 872sub safe_parse_html 873 { my $t= shift; 874 eval { $t->parse_html( @_); } ; 875 return $@ ? $t->_reset_twig_after_error : $t; 876 } 877 878sub parsefile_html 879 { my $t= shift; 880 my $file= shift; 881 my $indent= $t->{ErrorContext} ? 1 : 0; 882 $t->set_empty_tag_style( 'html'); 883 my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml; 884 my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} }; 885 $t->parse( $html2xml->( _slurp( $file), $options), @_); 886 return $t; 887 } 888 889sub parse_html 890 { my $t= shift; 891 my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {}; 892 my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy}; 893 my $content= shift; 894 my $indent= $t->{ErrorContext} ? 1 : 0; 895 $t->set_empty_tag_style( 'html'); 896 my $html2xml= $use_tidy ? \&_tidy_html : \&_html2xml; 897 my $conv_options= $use_tidy ? $t->{tidy_options} || {} : { indent => $indent, html_doctype => $t->{html_doctype} }; 898 $t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_); 899 return $t; 900 } 901 902sub xparse 903 { my $t= shift; 904 my $to_parse= $_[0]; 905 if( isa( $to_parse, 'GLOB')) { $t->parse( @_); } 906 elsif( $to_parse=~ m{^\s*<}) { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_) 907 : $t->parse( @_); 908 } 909 elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; 910 $t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_); 911 } 912 elsif( $to_parse=~ m{^\w+://}) { _use( 'LWP::Simple') or croak "missing LWP::Simple"; 913 my $doc= LWP::Simple::get( shift); 914 if( ! defined $doc) { $doc=''; } 915 my $xml_parse_ok= $t->safe_parse( $doc, @_); 916 if( $xml_parse_ok) 917 { return $xml_parse_ok; } 918 else 919 { my $diag= $@; 920 if( $doc=~ m{<html}i) 921 { $t->parse_html( $doc, @_); } 922 else 923 { croak $diag; } 924 } 925 } 926 elsif( $to_parse=~ m{\.html?$}) { my $content= _slurp( shift); 927 $t->_parse_as_xml_or_html( $content, @_); 928 } 929 else { $t->parsefile( @_); } 930 } 931 932sub _parse_as_xml_or_html 933 { my $t= shift; 934 if( _is_well_formed_xml( $_[0])) 935 { $t->parse( @_) } 936 else 937 { my $html2xml= $t->{use_tidy} ? \&_tidy_html : \&_html2xml; 938 my $options= $t->{use_tidy} ? $t->{tidy_options} || {} : { indent => 0, html_doctype => $t->{html_doctype} }; 939 my $html= $html2xml->( $_[0], $options, @_); 940 if( _is_well_formed_xml( $html)) 941 { $t->parse( $html); } 942 else 943 { croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions 944 } 945 } 946 947{ my $parser; 948 sub _is_well_formed_xml 949 { $parser ||= XML::Parser->new; 950 eval { $parser->parse( $_[0]); }; 951 return $@ ? 0 : 1; 952 } 953} 954 955sub nparse 956 { my $class= shift; 957 my $to_parse= pop; 958 $class->new( @_)->xparse( $to_parse); 959 } 960 961sub nparse_pp { shift()->nparse( pretty_print => 'indented', @_); } 962sub nparse_e { shift()->nparse( error_context => 1, @_); } 963sub nparse_ppe { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); } 964 965 966sub _html2xml 967 { my( $html, $options)= @_; 968 _use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n"; 969 my $tree= HTML::TreeBuilder->new; 970 $tree->ignore_ignorable_whitespace( 0); 971 $tree->ignore_unknown( 0); 972 $tree->no_space_compacting( 1); 973 $tree->store_comments( 1); 974 $tree->store_pis(1); 975 $tree->parse( $html); 976 $tree->eof; 977 978 my $xml=''; 979 if( $options->{html_doctype} && exists $tree->{_decl} ) 980 { my $decl= $tree->{_decl}->as_XML; 981 982 # first try to fix declarations that are missing the SYSTEM part 983 $decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >} 984 { my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE}; 985 qq{<!DOCTYPE $1 PUBLIC "$2" "$system">} 986 987 }xe; 988 989 # then check that the declaration looks OK (so it parses), if not remove it, 990 # better to parse without the declaration than to die stupidly 991 if( $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM 992 || $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x # just SYSTEM 993 ) 994 { $xml= $decl; } 995 } 996 997 $xml.= _as_XML( $tree); 998 999 1000 _fix_xml( $tree, \$xml); 1001 1002 if( $options->{indent}) { _indent_xhtml( \$xml); } 1003 $tree->delete; 1004 $xml=~ s{\s+$}{}s; # trim end 1005 return $xml; 1006 } 1007 1008sub _tidy_html 1009 { my( $html, $options)= @_; 1010 _use( 'HTML::Tidy') or croak "cannot cleanup HTML using HTML::Tidy (required by the use_tidy option): $@\n"; ; 1011 my $TIDY_DEFAULTS= { output_xhtml => 1, # duh! 1012 tidy_mark => 0, # do not add the "generated by tidy" comment 1013 numeric_entities => 1, 1014 char_encoding => 'utf8', 1015 bare => 1, 1016 clean => 1, 1017 doctype => 'transitional', 1018 fix_backslash => 1, 1019 merge_divs => 0, 1020 merge_spans => 0, 1021 sort_attributes => 'alpha', 1022 indent => 0, 1023 wrap => 0, 1024 break_before_br => 0, 1025 }; 1026 $options ||= {}; 1027 my $tidy_options= { %$TIDY_DEFAULTS, %$options}; 1028 my $tidy = HTML::Tidy->new( $tidy_options); 1029 $tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean 1030 my $xml= $tidy->clean( $html ); 1031 return $xml; 1032 } 1033 1034 1035{ my %xml_parser_encoding; 1036 sub _fix_xml 1037 { my( $tree, $xml)= @_; # $xml is a ref to the xml string 1038 1039 my $max_tries=5; 1040 my $add_decl; 1041 1042 while( ! _check_xml( $xml) && $max_tries--) 1043 { 1044 # a couple of fixes for weird HTML::TreeBuilder errors 1045 if( $@=~ m{^\s*xml (or text )?declaration not at start of (external )?entity}i) 1046 { $$xml=~ s{<\?xml.*?\?>}{}g; 1047 #warn " fixed xml declaration in the wrong place\n"; 1048 } 1049 elsif( $@=~ m{undefined entity}) 1050 { $$xml=~ s{&(amp;)?Amp;}{&}g if $HTML::TreeBuilder::VERSION < 4.00; 1051 if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); } 1052 $$xml=~ s{&(\w+);}{ my $ent= $1; if( $ent !~ m{^(amp|lt|gt|apos|quote)$}) { "&$ent;" } }eg; 1053 } 1054 elsif( $@=~ m{&Amp; used in html}) 1055 # if $Amp; is used instead of & then HTML::TreeBuilder's as_xml is tripped (old version) 1056 { $$xml=~ s{&(amp;)?Amp;}{&}g if $HTML::TreeBuilder::VERSION < 4.00; 1057 } 1058 elsif( $@=~ m{^\s*not well-formed \(invalid token\)}) 1059 { if( $HTML::TreeBuilder::VERSION < 4.00) 1060 { $$xml=~ s{&(amp;)?Amp;}{&}g; 1061 $$xml=~ s{(<[^>]* )(\d+=)"}{$1a$2"}g; # <table 1> comes out as <table 1="1">, "fix the attribute 1062 } 1063 my $q= '<img "=""" '; # extracted so vim doesn't get confused 1064 if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); } 1065 if( $$xml=~ m{$q}) 1066 { $$xml=~ s{$q}{<img }g; # happens with <img src="foo.png"" ... 1067 } 1068 else 1069 { my $encoding= _encoding_from_meta( $tree); 1070 unless( keys %xml_parser_encoding) { %xml_parser_encoding= _xml_parser_encodings(); } 1071 1072 if( ! $add_decl) 1073 { if( $xml_parser_encoding{$encoding}) 1074 { $add_decl=1; } 1075 elsif( $encoding eq 'euc-jp' && $xml_parser_encoding{'x-euc-jp-jisx0221'}) 1076 { $encoding="x-euc-jp-jisx0221"; $add_decl=1;} 1077 elsif( $encoding eq 'shift-jis' && $xml_parser_encoding{'x-sjis-jisx0221'}) 1078 { $encoding="x-sjis-jisx0221"; $add_decl=1;} 1079 1080 if( $add_decl) 1081 { $$xml=~ s{^(<\?xml.*?\?>)?}{<?xml version="1.0" encoding="$encoding"?>}s; 1082 #warn " added decl (encoding $encoding)\n"; 1083 } 1084 else 1085 { $$xml=~ s{^(<\?xml.*?\?>)?}{}s; 1086 #warn " converting to utf8 from $encoding\n"; 1087 $$xml= _to_utf8( $encoding, $$xml); 1088 } 1089 } 1090 else 1091 { $$xml=~ s{^(<\?xml.*?\?>)?}{}s; 1092 #warn " converting to utf8 from $encoding\n"; 1093 $$xml= _to_utf8( $encoding, $$xml); 1094 } 1095 } 1096 } 1097 } 1098 1099 # some versions of HTML::TreeBuilder escape CDATA sections 1100 $$xml=~ s{(<!\[CDATA\[.*?\]\]>)}{_unescape_cdata( $1)}eg; 1101 1102 } 1103 1104 sub _xml_parser_encodings 1105 { my @encodings=( 'iso-8859-1'); # this one is included by default, there is no map for it in @INC 1106 foreach my $inc (@INC) 1107 { push @encodings, map { basename( $_, '.enc') } glob( File::Spec->catdir( $inc => XML => Parser => Encodings => '*.enc')); } 1108 return map { $_ => 1 } @encodings; 1109 } 1110} 1111 1112 1113sub _unescape_cdata 1114 { my( $cdata)= @_; 1115 $cdata=~s{<}{<}g; 1116 $cdata=~s{>}{>}g; 1117 $cdata=~s{&}{&}g; 1118 return $cdata; 1119 } 1120 1121sub _as_XML { 1122 1123 # fork of HTML::Element::as_XML, which is a little too buggy and inconsistent between versions for my liking 1124 my ($elt) = @_; 1125 my $xml= ''; 1126 my $empty_element_map = $elt->_empty_element_map; 1127 1128 my ( $tag, $node, $start ); # per-iteration scratch 1129 $elt->traverse( 1130 sub { 1131 ( $node, $start ) = @_; 1132 if ( ref $node ) 1133 { # it's an element 1134 $tag = $node->{'_tag'}; 1135 if ($start) 1136 { # on the way in 1137 foreach my $att ( grep { ! m{^(_|/$)} } keys %$node ) 1138 { # fix attribute names instead of dying 1139 my $new_att= $att; 1140 if( $att=~ m{^\d}) { $new_att= "a$att"; } 1141 $new_att=~ s{[^\w\d:_-]}{}g; 1142 $new_att ||= 'a'; 1143 if( $new_att ne $att) { $node->{$new_att}= delete $node->{$att}; } 1144 } 1145 1146 if ( $empty_element_map->{$tag} && (!@{ $node->{'_content'} || []}) ) 1147 { $xml.= $node->starttag_XML( undef, 1 ); } 1148 else 1149 { $xml.= $node->starttag_XML(undef); } 1150 } 1151 else 1152 { # on the way out 1153 unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || [] } ) 1154 { $xml.= $node->endtag_XML(); 1155 } # otherwise it will have been an <... /> tag. 1156 } 1157 } 1158 elsif( $node=~ /<!\[CDATA\[/) # the content includes CDATA 1159 { foreach my $chunk (split /(<!\[CDATA\[.*?\]\]>)/s, $node) # chunks are CDATA sections or normal text 1160 { $xml.= $chunk =~ m{<!\[CDATA\[} ? $chunk : _xml_escape( $chunk); } 1161 } 1162 else # it's just text 1163 { $xml .= _xml_escape($node); } 1164 1; # keep traversing 1165 } 1166 ); 1167 return $xml; 1168} 1169 1170sub _xml_escape 1171 { my( $html)= @_; 1172 $html =~ s{&(?! # An ampersand that isn't followed by... 1173 ( \#[0-9]+; | # A hash mark, digits and semicolon, or 1174 \#x[0-9a-fA-F]+; | # A hash mark, "x", hex digits and semicolon, or 1175 [\w]+; # A valid unicode entity name and semicolon 1176 ) 1177 ) 1178 } 1179 {&}gx if 0; # Needs to be escaped to amp 1180 1181 $html=~ s{&}{&}g; 1182 1183 # in old versions of HTML::TreeBuilder & can come out as &Amp; 1184 if( $HTML::TreeBuilder::VERSION && $HTML::TreeBuilder::VERSION <= 3.23) { $html=~ s{&Amp;}{&}g; } 1185 1186 # simple character escapes 1187 $html =~ s/</</g; 1188 $html =~ s/>/>/g; 1189 $html =~ s/"/"/g; 1190 $html =~ s/'/'/g; 1191 1192 return $html; 1193 } 1194 1195 1196 1197 1198sub _check_xml 1199 { my( $xml)= @_; # $xml is a ref to the xml string 1200 my $ok= eval { XML::Parser->new->parse( $$xml); }; 1201 #if( $ok) { warn " parse OK\n"; } 1202 return $ok; 1203 } 1204 1205sub _encoding_from_meta 1206 { my( $tree)= @_; 1207 my $enc="iso-8859-1"; 1208 my @meta= $tree->find( 'meta'); 1209 foreach my $meta (@meta) 1210 { if( $meta->{'http-equiv'} && ($meta->{'http-equiv'} =~ m{^\s*content-type\s*}i) 1211 && $meta->{content} && ($meta->{content} =~ m{^\s*text/html\s*;\s*charset\s*=\s*(\S*)\s*}i) 1212 ) 1213 { $enc= lc $1; 1214 #warn " encoding from meta tag is '$enc'\n"; 1215 last; 1216 } 1217 } 1218 return $enc; 1219 } 1220 1221{ sub _to_utf8 1222 { my( $encoding, $string)= @_; 1223 local $SIG{__DIE__}; 1224 if( _use( 'Encode')) 1225 { Encode::from_to( $string, $encoding => 'utf8', 0x0400); } # 0x0400 is Encode::FB_XMLCREF 1226 elsif( _use( 'Text::Iconv')) 1227 { my $converter = eval { Text::Iconv->new( $encoding => "utf8") }; 1228 if( $converter) { $string= $converter->convert( $string); } 1229 } 1230 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 1231 { my $map= Unicode::Map8->new( $encoding); 1232 $string= $map->tou( $string)->utf8; 1233 } 1234 $string=~ s{[\x00-\x08\x0B\x0C\x0E-\x1F]}{}g; # get rid of control chars, portable in 5.6 1235 return $string; 1236 } 1237} 1238 1239 1240sub _indent_xhtml 1241 { my( $xhtml)= @_; # $xhtml is a ref 1242 my %block_tag= map { $_ => 1 } qw( html 1243 head 1244 meta title link script base 1245 body 1246 h1 h2 h3 h4 h5 h6 1247 p br address blockquote pre 1248 ol ul li dd dl dt 1249 table tr td th tbody tfoot thead col colgroup caption 1250 div frame frameset hr 1251 ); 1252 1253 my $level=0; 1254 $$xhtml=~ s{( (?:<!(?:--.*?-->|[CDATA[.*?]]>)) # ignore comments and CDATA sections 1255 | <(\w+)((?:\s+\w+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*/>) # empty tag 1256 | <(\w+) # start tag 1257 |</(\w+) # end tag 1258 ) 1259 } 1260 { if( $2 && $block_tag{$2}) { my $indent= " " x $level; 1261 "\n$indent<$2$3"; 1262 } 1263 elsif( $4 && $block_tag{$4}) { my $indent= " " x $level; 1264 $level++ unless( $4=~ m{/>}); 1265 my $nl= $4 eq 'html' ? '' : "\n"; 1266 "$nl$indent<$4"; 1267 } 1268 elsif( $5 && $block_tag{$5}) { $level--; "</$5"; } 1269 else { $1; } 1270 }xesg; 1271 } 1272 1273 1274sub add_stylesheet 1275 { my( $t, $type, $href)= @_; 1276 my %text_type= map { $_ => 1 } qw( xsl css); 1277 my $ss= $t->{twig_elt_class}->new( $PI); 1278 if( $text_type{$type}) 1279 { $ss->_set_pi( 'xml-stylesheet', qq{type="text/$type" href="$href"}); } 1280 else 1281 { croak "unsupported style sheet type '$type'"; } 1282 1283 $t->_add_cpi_outside_of_root( leading_cpi => $ss); 1284 return $t; 1285 } 1286 1287{ my %used; # module => 1 if require ok, 0 otherwise 1288 my %disallowed; # for testing, refuses to _use modules in this hash 1289 1290 sub _disallow_use ## no critic (Subroutines::ProhibitNestedSubs); 1291 { my( @modules)= @_; 1292 $disallowed{$_}= 1 foreach (@modules); 1293 } 1294 1295 sub _allow_use ## no critic (Subroutines::ProhibitNestedSubs); 1296 { my( @modules)= @_; 1297 $disallowed{$_}= 0 foreach (@modules); 1298 } 1299 1300 sub _use ## no critic (Subroutines::ProhibitNestedSubs); 1301 { my( $module, $version)= @_; 1302 $version ||= 0; 1303 if( $disallowed{$module}) { return 0; } 1304 if( $used{$module}) { return 1; } 1305 if( eval "require $module") { import $module; $used{$module}= 1; # no critic ProhibitStringyEval 1306 if( $version) 1307 { 1308 ## no critic (TestingAndDebugging::ProhibitNoStrict); 1309 no strict 'refs'; 1310 if( ${"${module}::VERSION"} >= $version ) { return 1; } 1311 else { return 0; } 1312 } 1313 else 1314 { return 1; } 1315 } 1316 else { $used{$module}= 0; return 0; } 1317 } 1318} 1319 1320# used to solve the [n] predicates while avoiding getting the entire list 1321# needs a prototype to accept passing bare blocks 1322sub _first_n(&$@) ## no critic (Subroutines::ProhibitSubroutinePrototypes); 1323 { my $coderef= shift; 1324 my $n= shift; 1325 my $i=0; 1326 if( $n > 0) 1327 { foreach (@_) { if( &$coderef) { $i++; return $_ if( $i == $n); } } } 1328 elsif( $n < 0) 1329 { foreach (reverse @_) { if( &$coderef) { $i--; return $_ if( $i == $n); } } } 1330 else 1331 { croak "illegal position number 0"; } 1332 return undef; 1333 } 1334 1335sub _slurp_uri 1336 { my( $uri, $base)= @_; 1337 if( $uri=~ m{^\w+://}) { _use( 'LWP::Simple'); return LWP::Simple::get( $uri); } 1338 else { return _slurp( _based_filename( $uri, $base)); } 1339 } 1340 1341sub _based_filename 1342 { my( $filename, $base)= @_; 1343 # cf. XML/Parser.pm's file_ext_ent_handler 1344 if (defined($base) and not ($filename =~ m{^(?:[\\/]|\w+:)})) 1345 { my $newpath = $base; 1346 $newpath =~ s{[^\\/:]*$}{$filename}; 1347 $filename = $newpath; 1348 } 1349 return $filename; 1350 } 1351 1352sub _slurp 1353 { my( $filename)= @_; 1354 my $to_slurp; 1355 open( $to_slurp, "<$filename") or croak "cannot open '$filename': $!"; 1356 local $/= undef; 1357 my $content= <$to_slurp>; 1358 close $to_slurp; 1359 return $content; 1360 } 1361 1362sub _slurp_fh 1363 { my( $fh)= @_; 1364 local $/= undef; 1365 my $content= <$fh>; 1366 return $content; 1367 } 1368 1369# I should really add extra options to allow better configuration of the 1370# LWP::UserAgent object 1371# this method forks (except on VMS!) 1372# - the child gets the data and copies it to the pipe, 1373# - the parent reads the stream and sends it to XML::Parser 1374# the data is cut it chunks the size of the XML::Parser::Expat buffer 1375# the method returns the twig and the status 1376sub _parseurl 1377 { my( $t, $safe, $url, $agent)= @_; 1378 _use( 'LWP') || croak "LWP not available, needed to use parseurl methods"; 1379 if( $^O ne 'VMS') 1380 { pipe( README, WRITEME) or croak "cannot create connected pipes: $!"; 1381 if( my $pid= fork) 1382 { # parent code: parse the incoming file 1383 close WRITEME; # no need to write 1384 my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README); 1385 close README; 1386 return $@ ? 0 : $t; 1387 } 1388 else 1389 { # child 1390 close README; # no need to read 1391 local $|=1; 1392 $agent ||= LWP::UserAgent->new; 1393 my $request = HTTP::Request->new( GET => $url); 1394 # _pass_url_content is called with chunks of data the same size as 1395 # the XML::Parser buffer 1396 my $response = $agent->request( $request, 1397 sub { _pass_url_content( \*WRITEME, @_); }, $BUFSIZE); 1398 $response->is_success or croak "$url ", $response->message; 1399 close WRITEME; 1400 CORE::exit(); # CORE is there for mod_perl (which redefines exit) 1401 } 1402 } 1403 else 1404 { # VMS branch (hard to test!) 1405 local $|=1; 1406 $agent ||= LWP::UserAgent->new; 1407 my $request = HTTP::Request->new( GET => $url); 1408 my $response = $agent->request( $request); 1409 $response->is_success or croak "$url ", $response->message; 1410 my $result= $safe ? $t->safe_parse($response->content) : $t->parse($response->content); 1411 return $@ ? 0 : $t; 1412 } 1413 1414 } 1415 1416# get the (hopefully!) XML data from the URL and 1417sub _pass_url_content 1418 { my( $fh, $data, $response, $protocol)= @_; 1419 print {$fh} $data; 1420 } 1421 1422sub add_options 1423 { my %args= map { $_, 1 } @_; 1424 %args= _normalize_args( %args); 1425 foreach (keys %args) { $valid_option{$_}++; } 1426 } 1427 1428sub _pretty_print_styles { return XML::Twig::Elt::_pretty_print_styles(); } 1429 1430sub _twig_store_internal_dtd 1431 { 1432 # warn " in _twig_store_internal_dtd...\n"; # DEBUG handler 1433 my( $p, $string)= @_; 1434 my $t= $p->{twig}; 1435 if( $t->{twig_keep_encoding}) { $string= $p->original_string(); } 1436 $t->{twig_doctype}->{internal} .= $string; 1437 return; 1438 } 1439 1440sub _twig_stop_storing_internal_dtd 1441 { # warn " in _twig_stop_storing_internal_dtd...\n"; # DEBUG handler 1442 my $p= shift; 1443 if( @saved_default_handler && defined $saved_default_handler[1]) 1444 { $p->setHandlers( @saved_default_handler); } 1445 else 1446 { 1447 $p->setHandlers( Default => undef); 1448 } 1449 $p->{twig}->{twig_doctype}->{internal}=~ s{^\s*\[}{}; 1450 $p->{twig}->{twig_doctype}->{internal}=~ s{\]\s*$}{}; 1451 return; 1452 } 1453 1454sub _twig_doctype_fin_print 1455 { # warn " in _twig_doctype_fin_print...\n"; # DEBUG handler 1456 my( $p)= shift; 1457 if( $p->{twig}->{twig_doctype}->{has_internal} && !$expat_1_95_2) { print ' ]>'; } 1458 return; 1459 } 1460 1461 1462sub _normalize_args 1463 { my %normalized_args; 1464 while( my $key= shift ) 1465 { $key= join '', map { ucfirst } split /_/, $key; 1466 #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig'); 1467 $normalized_args{$key}= shift ; 1468 } 1469 return %normalized_args; 1470 } 1471 1472sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); } 1473 1474sub _set_handler 1475 { my( $handlers, $whole_path, $handler)= @_; 1476 1477 my $H_SPECIAL = qr{($ALL|$DEFAULT|$COMMENT|$TEXT)}; 1478 my $H_PI = qr{(\?|$PI)\s*(([^\s]*)\s*)}; 1479 my $H_LEVEL = qr{level \s* \( \s* ([0-9]+) \s* \)}x; 1480 my $H_REGEXP = qr{\(\?([\^xism]*)(-[\^xism]*)?:(.*)\)}x; 1481 my $H_XPATH = qr{(/?/?$REG_TAG_PART? \s* ($REG_PREDICATE\s*)?)+}x; 1482 1483 my $prev_handler; 1484 1485 my $cpath= $whole_path; 1486 #warn "\$cpath: '$cpath\n"; 1487 while( $cpath && $cpath=~ s{^\s*($H_SPECIAL|$H_PI|$H_LEVEL|$H_REGEXP|$H_XPATH)\s*($|\|)}{}) 1488 { my $path= $1; 1489 #warn "\$cpath: '$cpath' - $path: '$path'\n"; 1490 $prev_handler ||= $handlers->{handlers}->{string}->{$path} || undef; # $prev_handler gets the first found handler 1491 1492 _set_special_handler ( $handlers, $path, $handler, $prev_handler) 1493 || _set_pi_handler ( $handlers, $path, $handler, $prev_handler) 1494 || _set_level_handler ( $handlers, $path, $handler, $prev_handler) 1495 || _set_regexp_handler ( $handlers, $path, $handler, $prev_handler) 1496 || _set_xpath_handler ( $handlers, $path, $handler, $prev_handler) 1497 || croak "unrecognized expression in handler: '$whole_path'"; 1498 1499 # this both takes care of the simple (gi) handlers and store 1500 # the handler code reference for other handlers 1501 $handlers->{handlers}->{string}->{$path}= $handler; 1502 } 1503 1504 if( $cpath) { croak "unrecognized expression in handler: '$whole_path'"; } 1505 1506 return $prev_handler; 1507 } 1508 1509 1510sub _set_special_handler 1511 { my( $handlers, $path, $handler, $prev_handler)= @_; 1512 if( $path =~ m{^\s*($ALL|$DEFAULT|$COMMENT|$TEXT)\s*$}io ) 1513 { $handlers->{handlers}->{$1}= $handler; 1514 return 1; 1515 } 1516 else 1517 { return 0; } 1518 } 1519 1520sub _set_xpath_handler 1521 { my( $handlers, $path, $handler, $prev_handler)= @_; 1522 if( my $handler_data= _parse_xpath_handler( $path, $handler)) 1523 { _add_handler( $handlers, $handler_data, $path, $prev_handler); 1524 return 1; 1525 } 1526 else 1527 { return 0; } 1528 } 1529 1530sub _add_handler 1531 { my( $handlers, $handler_data, $path, $prev_handler)= @_; 1532 1533 my $tag= $handler_data->{tag}; 1534 my @handlers= $handlers->{xpath_handler}->{$tag} ? @{$handlers->{xpath_handler}->{$tag}} : (); 1535 1536 if( $prev_handler) { @handlers= grep { $_->{path} ne $path } @handlers; } 1537 1538 push @handlers, $handler_data if( $handler_data->{handler}); 1539 1540 if( @handlers > 1) 1541 { @handlers= sort { (($b->{score}->{type} || 0) <=> ($a->{score}->{type} || 0)) 1542 || (($b->{score}->{anchored} || 0) <=> ($a->{score}->{anchored} || 0)) 1543 || (($b->{score}->{steps} || 0) <=> ($a->{score}->{steps} || 0)) 1544 || (($b->{score}->{predicates} || 0) <=> ($a->{score}->{predicates} || 0)) 1545 || (($b->{score}->{tests} || 0) <=> ($a->{score}->{tests} || 0)) 1546 || ($a->{path} cmp $b->{path}) 1547 } @handlers; 1548 } 1549 1550 $handlers->{xpath_handler}->{$tag}= \@handlers; 1551 } 1552 1553sub _set_pi_handler 1554 { my( $handlers, $path, $handler, $prev_handler)= @_; 1555 # PI conditions ( '?target' => \&handler or '?' => \&handler 1556 # or '#PItarget' => \&handler or '#PI' => \&handler) 1557 if( $path=~ /^\s*(?:\?|$PI)\s*(?:([^\s]*)\s*)$/) 1558 { my $target= $1 || ''; 1559 # update the path_handlers count, knowing that 1560 # either the previous or the new handler can be undef 1561 $handlers->{pi_handlers}->{$1}= $handler; 1562 return 1; 1563 } 1564 else 1565 { return 0; 1566 } 1567 } 1568 1569sub _set_level_handler 1570 { my( $handlers, $path, $handler, $prev_handler)= @_; 1571 if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox ) 1572 { my $level= $1; 1573 my $sub= sub { my( $stack)= @_; return( ($stack->[-1]->{$ST_TAG} !~ m{^#}) && (scalar @$stack == $level + 1) ) }; 1574 my $handler_data= { tag=> '*', score => { type => $LEVEL_TRIGGER}, trigger => $sub, 1575 path => $path, handler => $handler, test_on_text => 0 1576 }; 1577 _add_handler( $handlers, $handler_data, $path, $prev_handler); 1578 return 1; 1579 } 1580 else 1581 { return 0; } 1582 } 1583 1584sub _set_regexp_handler 1585 { my( $handlers, $path, $handler, $prev_handler)= @_; 1586 # if the expression was a regexp it is now a string (it was stringified when it became a hash key) 1587 if( $path=~ m{^\(\?([\^xism]*)(?:-[\^xism]*)?:(.*)\)$}) 1588 { my $regexp= qr/(?$1:$2)/; # convert it back into a regexp 1589 my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{$ST_TAG} =~ $regexp ) }; 1590 my $handler_data= { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub, 1591 path => $path, handler => $handler, test_on_text => 0 1592 }; 1593 _add_handler( $handlers, $handler_data, $path, $prev_handler); 1594 return 1; 1595 } 1596 else 1597 { return 0; } 1598 } 1599 1600my $DEBUG_HANDLER= 0; # 0 or 1 (output the handler checking code) or 2 (super verbose) 1601my $handler_string; # store the handler itself 1602sub _set_debug_handler { $DEBUG_HANDLER= shift; } 1603sub _warn_debug_handler { if( $DEBUG_HANDLER < 3) { warn @_; } else { $handler_string .= join( '', @_); } } 1604sub _return_debug_handler { my $string= $handler_string; $handler_string=''; return $string; } 1605 1606sub _parse_xpath_handler 1607 { my( $xpath, $handler)= @_; 1608 my $xpath_original= $xpath; 1609 1610 1611 if( $DEBUG_HANDLER >=1) { _warn_debug_handler( "\n\nparsing path '$xpath'\n"); } 1612 1613 my $path_to_check= $xpath; 1614 $path_to_check=~ s{/?/?$REG_TAG_PART?\s*(?:$REG_PREDICATE\s*)?}{}g; 1615 if( $DEBUG_HANDLER && $path_to_check=~ /\S/) { _warn_debug_handler( "left: $path_to_check\n"); } 1616 return if( $path_to_check=~ /\S/); 1617 1618 (my $xpath_to_display= $xpath)=~ s{(["{}'\[\]\@\$])}{\\$1}g; 1619 1620 my @xpath_steps; 1621 my $last_token_is_sep; 1622 1623 while( $xpath=~ s{^\s* 1624 ( (//?) # separator 1625 | (?:$REG_TAG_PART\s*(?:$REG_PREDICATE\s*)?) # tag name and optional predicate 1626 | (?:$REG_PREDICATE) # just a predicate 1627 ) 1628 } 1629 {}x 1630 ) 1631 { # check that we have alternating separators and steps 1632 if( $2) # found a separator 1633 { if( $last_token_is_sep) { return 0; } # 2 separators in a row 1634 $last_token_is_sep= 1; 1635 } 1636 else 1637 { if( defined( $last_token_is_sep) && !$last_token_is_sep) { return 0; } # 2 steps in a row 1638 $last_token_is_sep= 0; 1639 } 1640 1641 push @xpath_steps, $1; 1642 } 1643 if( $last_token_is_sep) { return 0; } # expression cannot end with a separator 1644 1645 my $i=-1; 1646 1647 my $perlfunc= _join_n( $NO_WARNINGS . ';', 1648 q|my( $stack)= @_; |, 1649 q|my @current_elts= (scalar @$stack); |, 1650 q|my @new_current_elts; |, 1651 q|my $elt; |, 1652 ($DEBUG_HANDLER >= 1) && (qq#warn q{checking path '$xpath_to_display'\n};#), 1653 ); 1654 1655 1656 my $last_tag=''; 1657 my $anchored= $xpath_original=~ m{^\s*/(?!/)} ? 1 : 0; 1658 my $score={ type => $XPATH_TRIGGER, anchored => $anchored }; 1659 my $flag= { test_on_text => 0 }; 1660 my $sep='/'; # '/' or '//' 1661 while( my $xpath_step= pop @xpath_steps) 1662 { my( $tag, $predicate)= $xpath_step =~ m{^($REG_TAG_PART)?(?:\[(.*)\])?\s*$}; 1663 $score->{steps}++; 1664 $tag||='*'; 1665 1666 my $warn_empty_stack= $DEBUG_HANDLER >= 2 ? qq{warn "return with empty stack\\n";} : ''; 1667 1668 if( $predicate) 1669 { if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate is: '$predicate'\n"); } 1670 # changes $predicate (from an XPath expression to a Perl one) 1671 if( $predicate=~ m{^\s*$REG_NUMBER\s*$}) { croak "position selector [$predicate] not supported on twig_handlers"; } 1672 _parse_predicate_in_handler( $predicate, $flag, $score); 1673 if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate becomes: '$predicate'\n"); } 1674 } 1675 1676 my $tag_cond= _tag_cond( $tag); 1677 my $cond= join( " && ", grep { $_ } $tag_cond, $predicate) || 1; 1678 1679 if( $css_sel && $tag=~ m{\.}) { $tag=~s{\.[^.]*$}{}; $tag ||='*'; } 1680 $tag=~ s{(.)#.+$}{$1}; 1681 1682 $last_tag ||= $tag; 1683 1684 if( $sep eq '/') 1685 { 1686 $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #, 1687 q# { next if( !$current_elt); #, 1688 q# $current_elt--; #, 1689 q# $elt= $stack->[$current_elt]; #, 1690 q# if( %s) { push @new_current_elts, $current_elt;} #, 1691 q# } #, 1692 ), 1693 $cond 1694 ); 1695 } 1696 elsif( $sep eq '//') 1697 { 1698 $perlfunc .= sprintf( _join_n( q#foreach my $current_elt (@current_elts) #, 1699 q# { next if( !$current_elt); #, 1700 q# $current_elt--; #, 1701 q# my $candidate= $current_elt; #, 1702 q# while( $candidate >=0) #, 1703 q# { $elt= $stack->[$candidate]; #, 1704 q# if( %s) { push @new_current_elts, $candidate;} #, 1705 q# $candidate--; #, 1706 q# } #, 1707 q# } #, 1708 ), 1709 $cond 1710 ); 1711 } 1712 my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq%fail at cond '$cond'%;#) : ''; 1713 $perlfunc .= sprintf( _join_n( q#unless( @new_current_elts) { %s return 0; } #, 1714 q#@current_elts= @new_current_elts; #, 1715 q#@new_current_elts=(); #, 1716 ), 1717 $warn 1718 ); 1719 1720 $sep= pop @xpath_steps; 1721 } 1722 1723 if( $anchored) # there should be a better way, but this works 1724 { 1725 my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq{fail, stack not empty};#) : ''; 1726 $perlfunc .= sprintf( _join_n( q#if( ! grep { $_ == 0 } @current_elts) { %s return 0;}#), $warn); 1727 } 1728 1729 $perlfunc.= qq{warn "handler for '$xpath_to_display' triggered\\n";\n} if( $DEBUG_HANDLER >=2); 1730 $perlfunc.= qq{return q{$xpath_original};\n}; 1731 _warn_debug_handler( "\nperlfunc:\n$perlfunc\n") if( $DEBUG_HANDLER>=1); 1732 my $s= eval "sub { $perlfunc }"; 1733 if( $@) 1734 { croak "wrong handler condition '$xpath' ($@);" } 1735 1736 _warn_debug_handler( "last tag: '$last_tag', test_on_text: '$flag->{test_on_text}'\n") if( $DEBUG_HANDLER >=1); 1737 _warn_debug_handler( "score: ", join( ' ', map { "$_: $score->{$_}" } sort keys %$score), "\n") if( $DEBUG_HANDLER >=1); 1738 return { tag=> $last_tag, score => $score, trigger => $s, path => $xpath_original, handler => $handler, test_on_text => $flag->{test_on_text} }; 1739 } 1740 1741sub _join_n { return join( "\n", @_, ''); } 1742 1743# the "tag" part can be <tag>, <tag>.<class> or <tag>#<id> (where tag can be *, or start with # for hidden tags) 1744sub _tag_cond 1745 { my( $full_tag)= @_; 1746 1747 my( $tag, $class, $id); 1748 if( $full_tag=~ m{^(.+)#(.+)$}) 1749 { ($tag, $id)= ($1, $2); } # <tag>#<id> 1750 else 1751 { ( $tag, $class)= $css_sel ? $full_tag=~ m{^(.*?)(?:\.([^.]*))?$} : ($full_tag, undef); } 1752 1753 my $tag_cond = $tag && $tag ne '*' ? qq#(\$elt->{'$ST_TAG'} eq "$tag")# : ''; 1754 my $id_cond = defined $id ? qq#(\$elt->{id} eq "$id")# : ''; 1755 my $class_cond = defined $class ? qq#(\$elt->{class}=~ m{(^| )$class( |\$)})# : ''; 1756 1757 my $full_cond= join( ' && ', grep { $_ } ( $tag_cond, $class_cond, $id_cond)); 1758 1759 return $full_cond; 1760 } 1761 1762# input: the predicate ($_[0]) which will be changed in place 1763# flags, a hashref with various flags (like test_on_text) 1764# the score 1765sub _parse_predicate_in_handler 1766 { my( $flag, $score)= @_[1..2]; 1767 $_[0]=~ s{( ($REG_STRING) # strings 1768 |\@($REG_TAG_NAME)(\s* $REG_MATCH \s* $REG_REGEXP) # @att and regexp 1769 |\@($REG_TAG_NAME)(?=\s*(?:[><=!])) # @att followed by a comparison operator 1770 |\@($REG_TAG_NAME) # @att (not followed by a comparison operator) 1771 |=~|!~ # matching operators 1772 |([><]=?|=|!=)(?=\s*[\d+-]) # test before a number 1773 |([><]=?|=|!=) # test, other cases 1774 |($REG_FUNCTION) # no arg functions 1775 # this bit is a mess, but it is the only solution with this half-baked parser 1776 |(string\(\s*$REG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child)=~ /regexp/ 1777 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_STRING) # string( child) = "value" (or other test) 1778 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_NUMBER) # string( child) = nb (or other test) 1779 |(and|or) 1780 # |($REG_NAME(?=\s*(and|or|$))) # nested tag name (needs to be after all other unquoted strings) 1781 |($REG_TAG_IN_PREDICATE) # nested tag name (needs to be after all other unquoted strings) 1782 1783 )} 1784 { my( $token, $str, $att_re_name, $att_re_regexp, $att, $bare_att, $num_test, $alpha_test, $func, $str_regexp, $str_test_alpha, $str_test_num, $and_or, $tag) 1785 = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14); 1786 1787 $score->{predicates}++; 1788 1789 # store tests on text (they are not always allowed) 1790 if( $func || $str_regexp || $str_test_num || $str_test_alpha ) { $flag->{test_on_text}= 1; } 1791 1792 if( defined $str) { $token } 1793 elsif( $tag) { qq{(\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->has_child( '$tag'))} } 1794 elsif( $att) { $att=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att'})} 1795 : qq{\$elt->{'$att'}} 1796 } 1797 elsif( $att_re_name) { $att_re_name=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att_re_name'}$att_re_regexp)} 1798 : qq{\$elt->{'$att_re_name'}$att_re_regexp} 1799 } 1800 # for some reason Devel::Cover flags the following lines as not tested. They are though. 1801 elsif( $bare_att) { $bare_att=~ m{^#} ? qq{(\$elt->{'$ST_ELT'} && defined(\$elt->{'$ST_ELT'}->{att}->{'$bare_att'}))} 1802 : qq{defined( \$elt->{'$bare_att'})} 1803 } 1804 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged 1805 elsif( $alpha_test) { $PERL_ALPHA_TEST{$alpha_test} } 1806 elsif( $func && $func=~ m{^string}) 1807 { "\$elt->{'$ST_ELT'}->text"; } 1808 elsif( $str_regexp && $str_regexp =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)}) 1809 { "defined( _first_n { \$_->text $2 $3 } 1, \$elt->{'$ST_ELT'}->_children( '$1'))"; } 1810 elsif( $str_test_alpha && $str_test_alpha =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_STRING)}) 1811 { my( $tag, $op, $str)= ($1, $2, $3); 1812 $str=~ s{(?<=.)'(?=.)}{\\'}g; # escape a quote within the string 1813 $str=~ s{^"}{'}; 1814 $str=~ s{"$}{'}; 1815 "defined( _first_n { \$_->text $PERL_ALPHA_TEST{$op} $str } 1, \$elt->{'$ST_ELT'}->children( '$tag'))"; } 1816 elsif( $str_test_num && $str_test_num =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_NUMBER)}) 1817 { my $test= ($2 eq '=') ? '==' : $2; 1818 "defined( _first_n { \$_->text $test $3 } 1, \$elt->{'$ST_ELT'}->children( '$1'))"; 1819 } 1820 elsif( $and_or) { $score->{tests}++; $and_or eq 'and' ? '&&' : '||' ; } 1821 else { $token; } 1822 }gexs; 1823 } 1824 1825 1826sub setCharHandler 1827 { my( $t, $handler)= @_; 1828 $t->{twig_char_handler}= $handler; 1829 } 1830 1831 1832sub _reset_handlers 1833 { my $handlers= shift; 1834 delete $handlers->{handlers}; 1835 delete $handlers->{path_handlers}; 1836 delete $handlers->{subpath_handlers}; 1837 $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers}); 1838 delete $handlers->{attcond_handlers}; 1839 } 1840 1841sub _set_handlers 1842 { my $handlers= shift || return; 1843 my $set_handlers= {}; 1844 foreach my $path (keys %{$handlers}) 1845 { _set_handler( $set_handlers, $path, $handlers->{$path}); } 1846 1847 return $set_handlers; 1848 } 1849 1850 1851sub setTwigHandler 1852 { my( $t, $path, $handler)= @_; 1853 $t->{twig_handlers} ||={}; 1854 return _set_handler( $t->{twig_handlers}, $path, $handler); 1855 } 1856 1857sub setTwigHandlers 1858 { my( $t, $handlers)= @_; 1859 my $previous_handlers= $t->{twig_handlers} || undef; 1860 _reset_handlers( $t->{twig_handlers}); 1861 $t->{twig_handlers}= _set_handlers( $handlers); 1862 return $previous_handlers; 1863 } 1864 1865sub setStartTagHandler 1866 { my( $t, $path, $handler)= @_; 1867 $t->{twig_starttag_handlers}||={}; 1868 return _set_handler( $t->{twig_starttag_handlers}, $path, $handler); 1869 } 1870 1871sub setStartTagHandlers 1872 { my( $t, $handlers)= @_; 1873 my $previous_handlers= $t->{twig_starttag_handlers} || undef; 1874 _reset_handlers( $t->{twig_starttag_handlers}); 1875 $t->{twig_starttag_handlers}= _set_handlers( $handlers); 1876 return $previous_handlers; 1877 } 1878 1879sub setIgnoreEltsHandler 1880 { my( $t, $path, $action)= @_; 1881 $t->{twig_ignore_elts_handlers}||={}; 1882 return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action ); 1883 } 1884 1885sub setIgnoreEltsHandlers 1886 { my( $t, $handlers)= @_; 1887 my $previous_handlers= $t->{twig_ignore_elts_handlers}; 1888 _reset_handlers( $t->{twig_ignore_elts_handlers}); 1889 $t->{twig_ignore_elts_handlers}= _set_handlers( $handlers); 1890 return $previous_handlers; 1891 } 1892 1893sub setEndTagHandler 1894 { my( $t, $path, $handler)= @_; 1895 $t->{twig_endtag_handlers}||={}; 1896 return _set_handler( $t->{twig_endtag_handlers}, $path,$handler); 1897 } 1898 1899sub setEndTagHandlers 1900 { my( $t, $handlers)= @_; 1901 my $previous_handlers= $t->{twig_endtag_handlers}; 1902 _reset_handlers( $t->{twig_endtag_handlers}); 1903 $t->{twig_endtag_handlers}= _set_handlers( $handlers); 1904 return $previous_handlers; 1905 } 1906 1907# a little more complex: set the twig_handlers only if a code ref is given 1908sub setTwigRoots 1909 { my( $t, $handlers)= @_; 1910 my $previous_roots= $t->{twig_roots}; 1911 _reset_handlers($t->{twig_roots}); 1912 $t->{twig_roots}= _set_handlers( $handlers); 1913 1914 _check_illegal_twig_roots_handlers( $t->{twig_roots}); 1915 1916 foreach my $path (keys %{$handlers}) 1917 { $t->{twig_handlers}||= {}; 1918 _set_handler( $t->{twig_handlers}, $path, $handlers->{$path}) 1919 if( ref($handlers->{$path}) && isa( $handlers->{$path}, 'CODE')); 1920 } 1921 return $previous_roots; 1922 } 1923 1924sub _check_illegal_twig_roots_handlers 1925 { my( $handlers)= @_; 1926 foreach my $tag_handlers (values %{$handlers->{xpath_handler}}) 1927 { foreach my $handler_data (@$tag_handlers) 1928 { if( my $type= $handler_data->{test_on_text}) 1929 { croak "string() condition not supported on twig_roots option"; } 1930 } 1931 } 1932 return; 1933 } 1934 1935 1936# just store the reference to the expat object in the twig 1937sub _twig_init 1938 { # warn " in _twig_init...\n"; # DEBUG handler 1939 1940 my $p= shift; 1941 my $t=$p->{twig}; 1942 1943 if( $t->{twig_parsing} ) { croak "cannot reuse a twig that is already parsing"; } 1944 $t->{twig_parsing}=1; 1945 1946 $t->{twig_parser}= $p; 1947 if( $weakrefs) { weaken( $t->{twig_parser}); } 1948 1949 # in case they had been created by a previous parse 1950 delete $t->{twig_dtd}; 1951 delete $t->{twig_doctype}; 1952 delete $t->{twig_xmldecl}; 1953 delete $t->{twig_root}; 1954 1955 # if needed set the output filehandle 1956 $t->_set_fh_to_twig_output_fh(); 1957 return; 1958 } 1959 1960# uses eval to catch the parser's death 1961sub safe_parse 1962 { my $t= shift; 1963 eval { $t->parse( @_); } ; 1964 return $@ ? $t->_reset_twig_after_error : $t; 1965 } 1966 1967sub safe_parsefile 1968 { my $t= shift; 1969 eval { $t->parsefile( @_); } ; 1970 return $@ ? $t->_reset_twig_after_error : $t; 1971 } 1972 1973# restore a twig in a proper state so it can be reused for a new parse 1974sub _reset_twig 1975 { my $t= shift; 1976 $t->{twig_parsing}= 0; 1977 delete $t->{twig_current}; 1978 delete $t->{extra_data}; 1979 delete $t->{twig_dtd}; 1980 delete $t->{twig_in_pcdata}; 1981 delete $t->{twig_in_cdata}; 1982 delete $t->{twig_stored_space}; 1983 delete $t->{twig_entity_list}; 1984 $t->root->delete if( $t->root); 1985 delete $t->{twig_root}; 1986 return $t; 1987 } 1988 1989sub _reset_twig_after_error 1990 { my $t= shift; 1991 $t->_reset_twig; 1992 return undef; 1993 } 1994 1995 1996sub _add_or_discard_stored_spaces 1997 { my $t= shift; 1998 1999 $t->{twig_right_after_root}=0; #XX 2000 2001 my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear 2002 return unless length $t->{twig_stored_spaces}; 2003 my $current_gi= $XML::Twig::index2gi[$current->{'gi'}]; 2004 2005 if( ! $t->{twig_discard_all_spaces}) 2006 { if( ! defined( $t->{twig_space_policy}->{$current_gi})) 2007 { $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); } 2008 if( $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $t->{twig_preserve_space}) 2009 { _insert_pcdata( $t, $t->{twig_stored_spaces} ); } 2010 } 2011 2012 $t->{twig_stored_spaces}=''; 2013 2014 return; 2015 } 2016 2017# the default twig handlers, which build the tree 2018sub _twig_start 2019 { # warn " in _twig_start...\n"; # DEBUG handler 2020 2021 #foreach my $s (@_) { next if ref $s; warn "$s: ", is_utf8( $s) ? "has flag" : "FLAG NOT SET"; } # YYY 2022 2023 my ($p, $gi, @att)= @_; 2024 my $t=$p->{twig}; 2025 2026 # empty the stored pcdata (space stored in case they are really part of 2027 # a pcdata element) or stored it if the space policy dictates so 2028 # create a pcdata element with the spaces if need be 2029 _add_or_discard_stored_spaces( $t); 2030 my $parent= $t->{twig_current}; 2031 2032 # if we were parsing PCDATA then we exit the pcdata 2033 if( $t->{twig_in_pcdata}) 2034 { $t->{twig_in_pcdata}= 0; 2035 delete $parent->{'twig_current'}; 2036 $parent= $parent->{parent}; 2037 } 2038 2039 # if we choose to keep the encoding then we need to parse the tag 2040 if( my $func = $t->{parse_start_tag}) 2041 { ($gi, @att)= &$func($p->original_string); } 2042 elsif( $t->{twig_entities_in_attribute}) 2043 { 2044 ($gi,@att)= _parse_start_tag( $p->recognized_string); 2045 $t->{twig_entities_in_attribute}=0; 2046 } 2047 2048 # if we are using an external DTD, we need to fill the default attributes 2049 if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); } 2050 2051 # filter the input data if need be 2052 if( my $filter= $t->{twig_input_filter}) 2053 { $gi= $filter->( $gi); 2054 foreach my $att (@att) { $att= $filter->($att); } 2055 } 2056 2057 my $ns_decl; 2058 if( $t->{twig_map_xmlns}) 2059 { $ns_decl= _replace_ns( $t, \$gi, \@att); } 2060 2061 my $elt= $t->{twig_elt_class}->new( $gi); 2062 $elt->set_atts( @att); 2063 2064 # now we can store the tag and atts 2065 my $context= { $ST_TAG => $gi, $ST_ELT => $elt, @att}; 2066 $context->{$ST_NS}= $ns_decl if $ns_decl; 2067 if( $weakrefs) { weaken( $context->{$ST_ELT}); } 2068 push @{$t->{_twig_context_stack}}, $context; 2069 2070 delete $parent->{'twig_current'} if( $parent); 2071 $t->{twig_current}= $elt; 2072 $elt->{'twig_current'}=1; 2073 2074 if( $parent) 2075 { my $prev_sibling= $parent->{last_child}; 2076 if( $prev_sibling) 2077 { $prev_sibling->{next_sibling}= $elt; 2078 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 2079 } 2080 2081 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 2082 unless( $parent->{first_child}) { $parent->{first_child}= $elt; } 2083 $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2084 } 2085 else 2086 { # processing root 2087 $t->set_root( $elt); 2088 # call dtd handler if need be 2089 $t->{twig_dtd_handler}->($t, $t->{twig_dtd}) 2090 if( defined $t->{twig_dtd_handler}); 2091 2092 # set this so we can catch external entities 2093 # (the handler was modified during DTD processing) 2094 if( $t->{twig_default_print}) 2095 { $p->setHandlers( Default => \&_twig_print); } 2096 elsif( $t->{twig_roots}) 2097 { $p->setHandlers( Default => sub { return }); } 2098 else 2099 { $p->setHandlers( Default => \&_twig_default); } 2100 } 2101 2102 $elt->{empty}= $p->recognized_string=~ m{/\s*>$}s ? 1 : 0; 2103 2104 $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data}); 2105 $t->{extra_data}=''; 2106 2107 # if the element is ID-ed then store that info 2108 my $id= $elt->{'att'}->{$ID}; 2109 if( defined $id) 2110 { $t->{twig_id_list}->{$id}= $elt; 2111 if( $weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 2112 } 2113 2114 # call user handler if need be 2115 if( $t->{twig_starttag_handlers}) 2116 { # call all appropriate handlers 2117 my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi); 2118 2119 local $_= $elt; 2120 2121 foreach my $handler ( @handlers) 2122 { $handler->($t, $elt) || last; } 2123 # call _all_ handler if needed 2124 if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL}) 2125 { $all->($t, $elt); } 2126 } 2127 2128 # check if the tag is in the list of tags to be ignored 2129 if( $t->{twig_ignore_elts_handlers}) 2130 { my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi); 2131 # only the first handler counts, it contains the action (discard/print/string) 2132 if( @handlers) { my $action= shift @handlers; $t->ignore( $elt, $action); } 2133 } 2134 2135 if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}++; } 2136 2137 2138 return; 2139 } 2140 2141sub _replace_ns 2142 { my( $t, $gi, $atts)= @_; 2143 my $decls; 2144 foreach my $new_prefix ( $t->parser->new_ns_prefixes) 2145 { my $uri= $t->parser->expand_ns_prefix( $new_prefix); 2146 # replace the prefix if it is mapped 2147 $decls->{$new_prefix}= $uri; 2148 if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri})) 2149 { $new_prefix= $mapped_prefix; } 2150 # now put the namespace declaration back in the element 2151 if( $new_prefix eq '#default') 2152 { push @$atts, "xmlns" => $uri; } 2153 else 2154 { push @$atts, "xmlns:$new_prefix" => $uri; } 2155 } 2156 2157 if( $t->{twig_keep_original_prefix}) 2158 { # things become more complex: we need to find the original prefix 2159 # and store both prefixes 2160 my $ns_info= $t->_ns_info( $$gi); 2161 my $map_att; 2162 if( $ns_info->{mapped_prefix}) 2163 { $$gi= "$ns_info->{mapped_prefix}:$$gi"; 2164 $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix}; 2165 } 2166 my $att_name=1; 2167 foreach( @$atts) 2168 { if( $att_name) 2169 { 2170 my $ns_info= $t->_ns_info( $_); 2171 if( $ns_info->{mapped_prefix}) 2172 { $_= "$ns_info->{mapped_prefix}:$_"; 2173 $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix}; 2174 } 2175 $att_name=0; 2176 } 2177 else 2178 { $att_name=1; } 2179 } 2180 push @$atts, '#original_gi', $map_att if( $map_att); 2181 } 2182 else 2183 { $$gi= $t->_replace_prefix( $$gi); 2184 my $att_name=1; 2185 foreach( @$atts) 2186 { if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; } 2187 else { $att_name=1; } 2188 } 2189 } 2190 return $decls; 2191 } 2192 2193 2194# extract prefix, local_name, uri, mapped_prefix from a name 2195# will only work if called from a start or end tag handler 2196sub _ns_info 2197 { my( $t, $name)= @_; 2198 my $ns_info={}; 2199 my $p= $t->parser; 2200 $ns_info->{uri}= $p->namespace( $name); 2201 return $ns_info unless( $ns_info->{uri}); 2202 2203 $ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri}); 2204 $ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix}; 2205 2206 return $ns_info; 2207 } 2208 2209sub _a_proper_ns_prefix 2210 { my( $p, $uri)= @_; 2211 foreach my $prefix ($p->current_ns_prefixes) 2212 { if( $p->expand_ns_prefix( $prefix) eq $uri) 2213 { return $prefix; } 2214 } 2215 return; 2216 } 2217 2218# returns the uri bound to a prefix in the original document 2219# only works in a handler 2220# can be used to deal with xsi:type attributes 2221sub original_uri 2222 { my( $t, $prefix)= @_; 2223 my $ST_NS = '##ns' ; 2224 foreach my $ns (map { $_->{$ST_NS} if $_->{$ST_NS} } reverse @{$t->{_twig_context_stack}}) 2225 { return $ns->{$prefix} || next; } 2226 return; 2227 } 2228 2229 2230sub _fill_default_atts 2231 { my( $t, $gi, $atts)= @_; 2232 my $dtd= $t->{twig_dtd}; 2233 my $attlist= $dtd->{att}->{$gi}; 2234 my %value= @$atts; 2235 foreach my $att (keys %$attlist) 2236 { if( !exists( $value{$att}) 2237 && exists( $attlist->{$att}->{default}) 2238 && ( $attlist->{$att}->{default} ne '#IMPLIED') 2239 ) 2240 { # the quotes are included in the default, so we need to remove them 2241 my $default_value= substr( $attlist->{$att}->{default}, 1, -1); 2242 push @$atts, $att, $default_value; 2243 } 2244 } 2245 return; 2246 } 2247 2248 2249# the default function to parse a start tag (in keep_encoding mode) 2250# can be overridden with the parse_start_tag method 2251# only works for 1-byte character sets 2252sub _parse_start_tag 2253 { my $string= shift; 2254 my( $gi, @atts); 2255 2256 # get the gi (between < and the first space, / or > character) 2257 #if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s) 2258 if( $string=~ s{^<\s*($REG_TAG_NAME)\s*[\s>/]}{}s) 2259 { $gi= $1; } 2260 else 2261 { croak "error parsing tag '$string'"; } 2262 while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s) 2263 { push @atts, $1, $3; } 2264 return $gi, @atts; 2265 } 2266 2267sub set_root 2268 { my( $t, $elt)= @_; 2269 $t->{twig_root}= $elt; 2270 if( $elt) 2271 { $elt->{twig}= $t; 2272 if( $weakrefs) { weaken( $elt->{twig}); } 2273 } 2274 return $t; 2275 } 2276 2277sub _twig_end 2278 { # warn " in _twig_end...\n"; # DEBUG handler 2279 my ($p, $gi) = @_; 2280 2281 my $t=$p->{twig}; 2282 2283 if( $t->{twig_in_pcdata} && (my $text_handler= $t->{TwigHandlers}->{$TEXT}) ) 2284 { local $_= $t->{twig_current}; $text_handler->( $t, $_) if $_; 2285 } 2286 2287 if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); } 2288 2289 _add_or_discard_stored_spaces( $t); 2290 2291 # the new twig_current is the parent 2292 my $elt= $t->{twig_current}; 2293 delete $elt->{'twig_current'}; 2294 2295 # if we were parsing PCDATA then we exit the pcdata too 2296 if( $t->{twig_in_pcdata}) 2297 { 2298 $t->{twig_in_pcdata}= 0; 2299 $elt= $elt->{parent} if($elt->{parent}); 2300 delete $elt->{'twig_current'}; 2301 } 2302 2303 # parent is the new current element 2304 my $parent= $elt->{parent}; 2305 $t->{twig_current}= $parent; 2306 2307 if( $parent) 2308 { $parent->{'twig_current'}=1; 2309 # twig_to_be_normalized 2310 if( $parent->{twig_to_be_normalized}) { $parent->normalize; $parent->{twig_to_be_normalized}=0; } 2311 } 2312 2313 if( $t->{extra_data}) 2314 { $elt->_set_extra_data_before_end_tag( $t->{extra_data}); 2315 $t->{extra_data}=''; 2316 } 2317 2318 if( $t->{twig_handlers}) 2319 { # look for handlers 2320 my @handlers= _handler( $t, $t->{twig_handlers}, $gi); 2321 2322 if( $t->{twig_tdh}) 2323 { if( @handlers) { push @{$t->{twig_handlers_to_trigger}}, [ $elt, \@handlers ]; } 2324 if( my $all= $t->{twig_handlers}->{handlers}->{$ALL}) 2325 { push @{$t->{twig_handlers_to_trigger}}, [ $elt, [$all] ]; } 2326 } 2327 else 2328 { 2329 local $_= $elt; # so we can use $_ in the handlers 2330 2331 foreach my $handler ( @handlers) 2332 { $handler->($t, $elt) || last; } 2333 # call _all_ handler if needed 2334 my $all= $t->{twig_handlers}->{handlers}->{$ALL}; 2335 if( $all) 2336 { $all->($t, $elt); } 2337 if( @handlers || $all) { $t->{twig_right_after_root}=0; } 2338 } 2339 } 2340 2341 # if twig_roots is set for the element then set appropriate handler 2342 if( $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) ) 2343 { if( $t->{twig_default_print}) 2344 { # select the proper fh (and store the currently selected one) 2345 $t->_set_fh_to_twig_output_fh(); 2346 if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX 2347 if( $t->{twig_keep_encoding}) 2348 { $p->setHandlers( %twig_handlers_roots_print_original); } 2349 else 2350 { $p->setHandlers( %twig_handlers_roots_print); } 2351 } 2352 else 2353 { $p->setHandlers( %twig_handlers_roots); } 2354 } 2355 2356 if( $elt->{'att'}->{'xml:space'} && ( $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}--; } 2357 2358 pop @{$t->{_twig_context_stack}}; 2359 return; 2360 } 2361 2362sub _trigger_tdh 2363 { my( $t)= @_; 2364 2365 if( @{$t->{twig_handlers_to_trigger}}) 2366 { my @handlers_to_trigger_now= sort { $a->[0]->cmp( $b->[0]) } @{$t->{twig_handlers_to_trigger}}; 2367 foreach my $elt_handlers (@handlers_to_trigger_now) 2368 { my( $handled_elt, $handlers_to_trigger)= @$elt_handlers; 2369 foreach my $handler ( @$handlers_to_trigger) 2370 { local $_= $handled_elt; $handler->($t, $handled_elt) || last; } 2371 } 2372 } 2373 return; 2374 } 2375 2376# return the list of handler that can be activated for an element 2377# (either of CODE ref's or 1's for twig_roots) 2378 2379sub _handler 2380 { my( $t, $handlers, $gi)= @_; 2381 2382 my @found_handlers=(); 2383 my $found_handler; 2384 2385 foreach my $handler ( map { @$_ } grep { $_ } $handlers->{xpath_handler}->{$gi}, $handlers->{xpath_handler}->{'*'}) 2386 { my $trigger= $handler->{trigger}; 2387 if( my $found_path= $trigger->( $t->{_twig_context_stack})) 2388 { my $found_handler= $handler->{handler}; 2389 push @found_handlers, $found_handler; 2390 } 2391 } 2392 2393 # if no handler found call default handler if defined 2394 if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT}) 2395 { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; } 2396 2397 if( @found_handlers and $t->{twig_do_not_chain_handlers}) 2398 { @found_handlers= ($found_handlers[0]); } 2399 2400 return @found_handlers; # empty if no handler found 2401 2402 } 2403 2404 2405sub _replace_prefix 2406 { my( $t, $name)= @_; 2407 my $p= $t->parser; 2408 my $uri= $p->namespace( $name); 2409 # try to get the namespace from default if none is found (for attributes) 2410 # this should probably be an option 2411 if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); } 2412 if( $uri) 2413 { if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri} || $DEFAULT_URI2NS{$uri}) 2414 { return "$mapped_prefix:$name"; } 2415 else 2416 { my $prefix= _a_proper_ns_prefix( $p, $uri); 2417 if( $prefix eq '#default') { $prefix=''; } 2418 return $prefix ? "$prefix:$name" : $name; 2419 } 2420 } 2421 else 2422 { return $name; } 2423 } 2424 2425 2426sub _twig_char 2427 { # warn " in _twig_char...\n"; # DEBUG handler 2428 2429 my ($p, $string)= @_; 2430 my $t=$p->{twig}; 2431 2432 if( $t->{twig_keep_encoding}) 2433 { if( !$t->{twig_in_cdata}) 2434 { $string= $p->original_string(); } 2435 else 2436 { 2437 use bytes; # > perl 5.5 2438 if( length( $string) < 1024) 2439 { $string= $p->original_string(); } 2440 else 2441 { #warn "dodgy case"; 2442 # TODO original_string does not hold the entire string, but $string is wrong 2443 # I believe due to a bug in XML::Parser 2444 # for now, we use the original string, even if it means that it's been converted to utf8 2445 } 2446 } 2447 } 2448 2449 if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); } 2450 if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); } 2451 2452 my $elt= $t->{twig_current}; 2453 2454 if( $t->{twig_in_cdata}) 2455 { # text is the continuation of a previously created cdata 2456 $elt->{cdata}.= $t->{twig_stored_spaces} . $string; 2457 } 2458 elsif( $t->{twig_in_pcdata}) 2459 { # text is the continuation of a previously created pcdata 2460 if( $t->{extra_data}) 2461 { $elt->_push_extra_data_in_pcdata( $t->{extra_data}, length( $elt->{pcdata})); 2462 $t->{extra_data}=''; 2463 } 2464 $elt->{pcdata}.= $string; 2465 } 2466 else 2467 { 2468 # text is just space, which might be discarded later 2469 if( $string=~/\A\s*\Z/s) 2470 { 2471 if( $t->{extra_data}) 2472 { # we got extra data (comment, pi), lets add the spaces to it 2473 $t->{extra_data} .= $string; 2474 } 2475 else 2476 { # no extra data, just store the spaces 2477 $t->{twig_stored_spaces}.= $string; 2478 } 2479 } 2480 else 2481 { my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string); 2482 delete $elt->{'twig_current'}; 2483 $new_elt->{'twig_current'}=1; 2484 $t->{twig_current}= $new_elt; 2485 $t->{twig_in_pcdata}=1; 2486 if( $t->{extra_data}) 2487 { $new_elt->_push_extra_data_in_pcdata( $t->{extra_data}, 0); 2488 $t->{extra_data}=''; 2489 } 2490 } 2491 } 2492 return; 2493 } 2494 2495sub _twig_cdatastart 2496 { # warn " in _twig_cdatastart...\n"; # DEBUG handler 2497 2498 my $p= shift; 2499 my $t=$p->{twig}; 2500 2501 $t->{twig_in_cdata}=1; 2502 my $cdata= $t->{twig_elt_class}->new( $CDATA); 2503 my $twig_current= $t->{twig_current}; 2504 2505 if( $t->{twig_in_pcdata}) 2506 { # create the node as a sibling of the PCDATA 2507 $cdata->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ; 2508 $twig_current->{next_sibling}= $cdata; 2509 my $parent= $twig_current->{parent}; 2510 $cdata->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ; 2511 $parent->{empty}=0; $parent->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2512 $t->{twig_in_pcdata}=0; 2513 } 2514 else 2515 { # we have to create a PCDATA element if we need to store spaces 2516 if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) 2517 { _insert_pcdata( $t, $t->{twig_stored_spaces}); } 2518 $t->{twig_stored_spaces}=''; 2519 2520 # create the node as a child of the current element 2521 $cdata->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ; 2522 if( my $prev_sibling= $twig_current->{last_child}) 2523 { $cdata->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ; 2524 $prev_sibling->{next_sibling}= $cdata; 2525 } 2526 else 2527 { $twig_current->{first_child}= $cdata; } 2528 $twig_current->{empty}=0; $twig_current->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; 2529 2530 } 2531 2532 delete $twig_current->{'twig_current'}; 2533 $t->{twig_current}= $cdata; 2534 $cdata->{'twig_current'}=1; 2535 if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' }; 2536 return; 2537 } 2538 2539sub _twig_cdataend 2540 { # warn " in _twig_cdataend...\n"; # DEBUG handler 2541 2542 my $p= shift; 2543 my $t=$p->{twig}; 2544 2545 $t->{twig_in_cdata}=0; 2546 2547 my $elt= $t->{twig_current}; 2548 delete $elt->{'twig_current'}; 2549 my $cdata= $elt->{cdata}; 2550 $elt->_set_cdata( $cdata); 2551 2552 push @{$t->{_twig_context_stack}}, { $ST_TAG => $CDATA }; 2553 2554 if( $t->{twig_handlers}) 2555 { # look for handlers 2556 my @handlers= _handler( $t, $t->{twig_handlers}, $CDATA); 2557 local $_= $elt; # so we can use $_ in the handlers 2558 foreach my $handler ( @handlers) { $handler->($t, $elt) || last; } 2559 } 2560 2561 pop @{$t->{_twig_context_stack}}; 2562 2563 $elt= $elt->{parent}; 2564 $t->{twig_current}= $elt; 2565 $elt->{'twig_current'}=1; 2566 2567 $t->{twig_long_cdata}=0; 2568 return; 2569 } 2570 2571sub _pi_elt_handlers 2572 { my( $t, $pi)= @_; 2573 my $pi_handlers= $t->{twig_handlers}->{pi_handlers} || return; 2574 foreach my $handler ( $pi_handlers->{$pi->{target}}, $pi_handlers->{''}) 2575 { if( $handler) { local $_= $pi; $handler->( $t, $pi) || last; } } 2576 } 2577 2578sub _pi_text_handler 2579 { my( $t, $target, $data)= @_; 2580 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target}) 2581 { return $handler->( $t, $target, $data); } 2582 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{''}) 2583 { return $handler->( $t, $target, $data); } 2584 return defined( $data) && $data ne '' ? "<?$target $data?>" : "<?$target?>" ; 2585 } 2586 2587sub _comment_elt_handler 2588 { my( $t, $comment)= @_; 2589 if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT}) 2590 { local $_= $comment; $handler->($t, $comment); } 2591 } 2592 2593sub _comment_text_handler 2594 { my( $t, $comment)= @_; 2595 if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT}) 2596 { $comment= $handler->($t, $comment); 2597 if( !defined $comment || $comment eq '') { return ''; } 2598 } 2599 return "<!--$comment-->"; 2600 } 2601 2602 2603 2604sub _twig_comment 2605 { # warn " in _twig_comment...\n"; # DEBUG handler 2606 2607 my( $p, $comment_text)= @_; 2608 my $t=$p->{twig}; 2609 2610 if( $t->{twig_keep_encoding}) { $comment_text= substr( $p->original_string(), 4, -3); } 2611 2612 $t->_twig_pi_comment( $p, $COMMENT, $t->{twig_keep_comments}, $t->{twig_process_comments}, 2613 '_set_comment', '_comment_elt_handler', '_comment_text_handler', $comment_text 2614 ); 2615 return; 2616 } 2617 2618sub _twig_pi 2619 { # warn " in _twig_pi...\n"; # DEBUG handler 2620 2621 my( $p, $target, $data)= @_; 2622 my $t=$p->{twig}; 2623 2624 if( $t->{twig_keep_encoding}) 2625 { my $pi_text= substr( $p->original_string(), 2, -2); 2626 ($target, $data)= split( /\s+/, $pi_text, 2); 2627 } 2628 2629 $t->_twig_pi_comment( $p, $PI, $t->{twig_keep_pi}, $t->{twig_process_pi}, 2630 '_set_pi', '_pi_elt_handlers', '_pi_text_handler', $target, $data 2631 ); 2632 return; 2633 } 2634 2635sub _twig_pi_comment 2636 { my( $t, $p, $type, $keep, $process, $set, $elt_handler, $text_handler, @parser_args)= @_; 2637 2638 if( $t->{twig_input_filter}) 2639 { foreach my $arg (@parser_args) { $arg= $t->{twig_input_filter}->( $arg); } } 2640 2641 # if pi/comments are to be kept then we piggyback them to the current element 2642 if( $keep) 2643 { # first add spaces 2644 if( $t->{twig_stored_spaces}) 2645 { $t->{extra_data}.= $t->{twig_stored_spaces}; 2646 $t->{twig_stored_spaces}= ''; 2647 } 2648 2649 my $extra_data= $t->$text_handler( @parser_args); 2650 $t->{extra_data}.= $extra_data; 2651 2652 } 2653 elsif( $process) 2654 { 2655 my $twig_current= $t->{twig_current}; # defined unless we are outside of the root 2656 2657 my $elt= $t->{twig_elt_class}->new( $type); 2658 $elt->$set( @parser_args); 2659 if( $t->{extra_data}) 2660 { $elt->set_extra_data( $t->{extra_data}); 2661 $t->{extra_data}=''; 2662 } 2663 2664 unless( $t->root) 2665 { $t->_add_cpi_outside_of_root( leading_cpi => $elt); 2666 } 2667 elsif( $t->{twig_in_pcdata}) 2668 { # create the node as a sibling of the PCDATA 2669 $elt->paste_after( $twig_current); 2670 $t->{twig_in_pcdata}=0; 2671 } 2672 elsif( $twig_current) 2673 { # we have to create a PCDATA element if we need to store spaces 2674 if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces}) 2675 { _insert_pcdata( $t, $t->{twig_stored_spaces}); } 2676 $t->{twig_stored_spaces}=''; 2677 # create the node as a child of the current element 2678 $elt->paste_last_child( $twig_current); 2679 } 2680 else 2681 { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); } 2682 2683 if( $twig_current) 2684 { delete $twig_current->{'twig_current'}; 2685 my $parent= $elt->{parent}; 2686 $t->{twig_current}= $parent; 2687 $parent->{'twig_current'}=1; 2688 } 2689 2690 $t->$elt_handler( $elt); 2691 } 2692 2693 } 2694 2695 2696# add a comment or pi before the first element 2697sub _add_cpi_outside_of_root 2698 { my($t, $type, $elt)= @_; # $type is 'leading_cpi' or 'trailing_cpi' 2699 $t->{$type} ||= $t->{twig_elt_class}->new( '#CPI'); 2700 # create the node as a child of the current element 2701 $elt->paste_last_child( $t->{$type}); 2702 return $t; 2703 } 2704 2705sub _twig_final 2706 { # warn " in _twig_final...\n"; # DEBUG handler 2707 2708 my $p= shift; 2709 my $t= $p->isa( 'XML::Twig') ? $p : $p->{twig}; 2710 2711 # store trailing data 2712 if( $t->{extra_data}) { $t->{trailing_cpi_text} = $t->{extra_data}; $t->{extra_data}=''; } 2713 $t->{trailing_spaces}= $t->{twig_stored_spaces} || ''; 2714 my $s= $t->{twig_stored_spaces}; $s=~s{\n}{\\n}g; 2715 if( $t->{twig_stored_spaces}) { my $s= $t->{twig_stored_spaces}; } 2716 2717 # restore the selected filehandle if needed 2718 $t->_set_fh_to_selected_fh(); 2719 2720 $t->_trigger_tdh if( $t->{twig_tdh}); 2721 2722 select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy 2723 2724 if( exists $t->{twig_autoflush_data}) 2725 { my @args; 2726 push @args, $t->{twig_autoflush_data}->{fh} if( $t->{twig_autoflush_data}->{fh}); 2727 push @args, @{$t->{twig_autoflush_data}->{args}} if( $t->{twig_autoflush_data}->{args}); 2728 $t->flush( @args); 2729 delete $t->{twig_autoflush_data}; 2730 $t->root->delete if $t->root; 2731 } 2732 2733 # tries to clean-up (probably not very well at the moment) 2734 #undef $p->{twig}; 2735 undef $t->{twig_parser}; 2736 delete $t->{twig_parsing}; 2737 @{$t}{ qw( twig_parser twig_parsing _twig_context_stack twig_current) }=(); 2738 2739 return $t; 2740 } 2741 2742sub _insert_pcdata 2743 { my( $t, $string)= @_; 2744 # create a new PCDATA element 2745 my $parent= $t->{twig_current}; # always defined 2746 my $elt; 2747 if( exists $t->{twig_alt_elt_class}) 2748 { $elt= $t->{twig_elt_class}->new( $PCDATA); 2749 $elt->_set_pcdata( $string); 2750 } 2751 else 2752 { $elt= bless( { gi => $XML::Twig::gi2index{$PCDATA}, pcdata => $string }, 'XML::Twig::Elt'); } 2753 2754 my $prev_sibling= $parent->{last_child}; 2755 if( $prev_sibling) 2756 { $prev_sibling->{next_sibling}= $elt; 2757 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 2758 } 2759 else 2760 { $parent->{first_child}= $elt; } 2761 2762 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 2763 $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 2764 $t->{twig_stored_spaces}=''; 2765 return $elt; 2766 } 2767 2768sub _space_policy 2769 { my( $t, $gi)= @_; 2770 my $policy; 2771 $policy=0 if( $t->{twig_discard_spaces}); 2772 $policy=1 if( $t->{twig_keep_spaces}); 2773 $policy=1 if( $t->{twig_keep_spaces_in} 2774 && $t->{twig_keep_spaces_in}->{$gi}); 2775 $policy=0 if( $t->{twig_discard_spaces_in} 2776 && $t->{twig_discard_spaces_in}->{$gi}); 2777 return $policy; 2778 } 2779 2780 2781sub _twig_entity 2782 { # warn " in _twig_entity...\n"; # DEBUG handler 2783 my( $p, $name, $val, $sysid, $pubid, $ndata, $param)= @_; 2784 my $t=$p->{twig}; 2785 2786 #{ no warnings; my $base= $p->base; warn "_twig_entity called: expand: '$t->{twig_expand_external_ents}', base: '$base', name: '$name', val: '$val', sysid: '$sysid', pubid: '$pubid', ndata: '$ndata', param: '$param'\n";} 2787 2788 my $missing_entity=0; 2789 2790 if( $sysid) 2791 { if($ndata) 2792 { if( ! -f _based_filename( $sysid, $p->base)) { $missing_entity= 1; } 2793 } 2794 else 2795 { if( $t->{twig_expand_external_ents}) 2796 { $val= eval { _slurp_uri( $sysid, $p->base) }; 2797 if( ! defined $val) 2798 { if( $t->{twig_extern_ent_nofail}) 2799 { $missing_entity= 1; } 2800 else 2801 { _croak( "cannot load SYSTEM entity '$name' from '$sysid': $@", 3); } 2802 } 2803 } 2804 } 2805 } 2806 2807 my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata, $param); 2808 if( $missing_entity) { $t->{twig_missing_system_entities}->{$name}= $ent; } 2809 2810 my $entity_list= $t->entity_list; 2811 if( $entity_list) { $entity_list->add( $ent); } 2812 2813 if( $parser_version > 2.27) 2814 { # this is really ugly, but with some versions of XML::Parser the value 2815 # of the entity is not properly returned by the default handler 2816 my $ent_decl= $ent->text; 2817 if( $t->{twig_keep_encoding}) 2818 { if( defined $ent->{val} && ($ent_decl !~ /["']/)) 2819 { my $val= $ent->{val}; 2820 $ent_decl .= $val =~ /"/ ? qq{'$val' } : qq{"$val" }; 2821 } 2822 # for my solaris box (perl 5.6.1, XML::Parser 2.31, expat?) 2823 $t->{twig_doctype}->{internal}=~ s{<!ENTITY\s+$name\s+$}{substr( $ent_decl, 0, -1)}e; 2824 } 2825 $t->{twig_doctype}->{internal} .= $ent_decl 2826 unless( $t->{twig_doctype}->{internal}=~ m{<!ENTITY\s+$name\s+}); 2827 } 2828 2829 return; 2830 } 2831 2832 2833sub _twig_extern_ent 2834 { # warn " in _twig_extern_ent...I (", $_[0]->original_string, ")\n"; # DEBUG handler 2835 my( $p, $base, $sysid, $pubid)= @_; 2836 my $t= $p->{twig}; 2837 if( $t->{twig_no_expand}) 2838 { my $ent_name= $t->{twig_keep_encoding} ? $p->original_string : $p->recognized_string; 2839 _twig_insert_ent( $t, $ent_name); 2840 return ''; 2841 } 2842 my $ent_content= eval { $t->{twig_ext_ent_handler}->( $p, $base, $sysid) }; 2843 if( ! defined $ent_content) 2844 { 2845 my $ent_name = $p->recognized_string; 2846 my $file = _based_filename( $sysid, $base); 2847 my $error_message= "cannot expand $ent_name - cannot load '$file'"; 2848 if( $t->{twig_extern_ent_nofail}) { return "<!-- $error_message -->"; } 2849 else { _croak( $error_message); } 2850 } 2851 return $ent_content; 2852 } 2853 2854# I use this so I can change the $Carp::CarpLevel (which determines how many call frames to skip when reporting an error) 2855sub _croak 2856 { my( $message, $level)= @_; 2857 $Carp::CarpLevel= $level || 0; 2858 croak $message; 2859 } 2860 2861sub _twig_xmldecl 2862 { # warn " in _twig_xmldecl...\n"; # DEBUG handler 2863 2864 my $p= shift; 2865 my $t=$p->{twig}; 2866 $t->{twig_xmldecl}||={}; # could have been set by set_output_encoding 2867 $t->{twig_xmldecl}->{version}= shift; 2868 $t->{twig_xmldecl}->{encoding}= shift; 2869 $t->{twig_xmldecl}->{standalone}= shift; 2870 return; 2871 } 2872 2873sub _twig_doctype 2874 { # warn " in _twig_doctype...\n"; # DEBUG handler 2875 my( $p, $name, $sysid, $pub, $internal)= @_; 2876 my $t=$p->{twig}; 2877 $t->{twig_doctype}||= {}; # create 2878 $t->{twig_doctype}->{name}= $name; # always there 2879 $t->{twig_doctype}->{sysid}= $sysid; # 2880 $t->{twig_doctype}->{pub}= $pub; # 2881 2882 # now let's try to cope with XML::Parser 2.28 and above 2883 if( $parser_version > 2.27) 2884 { @saved_default_handler= $p->setHandlers( Default => \&_twig_store_internal_dtd, 2885 Entity => \&_twig_entity, 2886 ); 2887 $p->setHandlers( DoctypeFin => \&_twig_stop_storing_internal_dtd); 2888 $t->{twig_doctype}->{internal}=''; 2889 } 2890 else 2891 # for XML::Parser before 2.28 2892 { $internal||=''; 2893 $internal=~ s{^\s*\[}{}; 2894 $internal=~ s{]\s*$}{}; 2895 $t->{twig_doctype}->{internal}=$internal; 2896 } 2897 2898 # now check if we want to get the DTD info 2899 if( $t->{twig_read_external_dtd} && $sysid) 2900 { # let's build a fake document with an internal DTD 2901 my $dtd= "<!DOCTYPE $name [" . _slurp_uri( $sysid) . "]><$name/>"; 2902 2903 $t->save_global_state(); # save the globals (they will be reset by the following new) 2904 my $t_dtd= XML::Twig->new( load_DTD => 1, ParseParamEnt => 1, error_context => $t->{ErrorContext} || 0); # create a temp twig 2905 $t_dtd->parse( $dtd); # parse it 2906 $t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info 2907 #$t->{twig_dtd_is_external}=1; 2908 $t->entity_list->_add_list( $t_dtd->entity_list) if( $t_dtd->entity_list); # grab the entity info 2909 $t->restore_global_state(); 2910 } 2911 return; 2912 } 2913 2914sub _twig_element 2915 { # warn " in _twig_element...\n"; # DEBUG handler 2916 2917 my( $p, $name, $model)= @_; 2918 my $t=$p->{twig}; 2919 $t->{twig_dtd}||= {}; # may create the dtd 2920 $t->{twig_dtd}->{model}||= {}; # may create the model hash 2921 $t->{twig_dtd}->{elt_list}||= []; # ordered list of elements 2922 push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt 2923 $t->{twig_dtd}->{model}->{$name}= $model; # store the model 2924 if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) ) 2925 { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; 2926 unless( $text) 2927 { # this version of XML::Parser does not return the text in the *_string method 2928 # we need to rebuild it 2929 $text= "<!ELEMENT $name $model>"; 2930 } 2931 $t->{twig_doctype}->{internal} .= $text; 2932 } 2933 return; 2934 } 2935 2936sub _twig_attlist 2937 { # warn " in _twig_attlist...\n"; # DEBUG handler 2938 2939 my( $p, $gi, $att, $type, $default, $fixed)= @_; 2940 #warn "in attlist: gi: '$gi', att: '$att', type: '$type', default: '$default', fixed: '$fixed'\n"; 2941 my $t=$p->{twig}; 2942 $t->{twig_dtd}||= {}; # create dtd if need be 2943 $t->{twig_dtd}->{$gi}||= {}; # create elt if need be 2944 #$t->{twig_dtd}->{$gi}->{att}||= {}; # create att if need be 2945 if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) ) 2946 { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string; 2947 unless( $text) 2948 { # this version of XML::Parser does not return the text in the *_string method 2949 # we need to rebuild it 2950 my $att_decl="$att $type"; 2951 $att_decl .= " #FIXED" if( $fixed); 2952 $att_decl .= " $default" if( defined $default); 2953 # 2 cases: there is already an attlist on that element or not 2954 if( $t->{twig_dtd}->{att}->{$gi}) 2955 { # there is already an attlist, add to it 2956 $t->{twig_doctype}->{internal}=~ s{(<!ATTLIST\s*$gi )(.*?)\n?>} 2957 { "$1$2\n" . ' ' x length( $1) . "$att_decl\n>"}es; 2958 } 2959 else 2960 { # create the attlist 2961 $t->{twig_doctype}->{internal}.= "<!ATTLIST $gi $att_decl>" 2962 } 2963 } 2964 } 2965 $t->{twig_dtd}->{att}->{$gi}->{$att}= {} ; 2966 $t->{twig_dtd}->{att}->{$gi}->{$att}->{type}= $type; 2967 $t->{twig_dtd}->{att}->{$gi}->{$att}->{default}= $default if( defined $default); 2968 $t->{twig_dtd}->{att}->{$gi}->{$att}->{fixed}= $fixed; 2969 return; 2970 } 2971 2972sub _twig_default 2973 { # warn " in _twig_default...\n"; # DEBUG handler 2974 2975 my( $p, $string)= @_; 2976 2977 my $t= $p->{twig}; 2978 2979 # we need to process the data in 2 cases: entity, or spaces after the closing tag 2980 2981 # after the closing tag (no twig_current and root has been created) 2982 if( ! $t->{twig_current} && $t->{twig_root} && $string=~ m{^\s+$}m) { $t->{twig_stored_spaces} .= $string; } 2983 2984 # process only if we have an entity 2985 if( $string=~ m{^&([^;]*);$}) 2986 { # the entity has to be pure pcdata, or we have a problem 2987 if( ($p->original_string=~ m{^<}) && ($p->original_string=~ m{>$}) ) 2988 { # string is a tag, entity is in an attribute 2989 $t->{twig_entities_in_attribute}=1 if( $t->{twig_do_not_escape_amp_in_atts}); 2990 } 2991 else 2992 { my $ent; 2993 if( $t->{twig_keep_encoding}) 2994 { _twig_char( $p, $string); 2995 $ent= substr( $string, 1, -1); 2996 } 2997 else 2998 { $ent= _twig_insert_ent( $t, $string); 2999 } 3000 3001 return $ent; 3002 } 3003 } 3004 } 3005 3006sub _twig_insert_ent 3007 { 3008 my( $t, $string)=@_; 3009 3010 my $twig_current= $t->{twig_current}; 3011 3012 my $ent= $t->{twig_elt_class}->new( $ENT); 3013 $ent->{ent}= $string; 3014 3015 _add_or_discard_stored_spaces( $t); 3016 3017 if( $t->{twig_in_pcdata}) 3018 { # create the node as a sibling of the #PCDATA 3019 3020 $ent->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ; 3021 $twig_current->{next_sibling}= $ent; 3022 my $parent= $twig_current->{parent}; 3023 $ent->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ; 3024 $parent->{empty}=0; $parent->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 3025 # the twig_current is now the parent 3026 delete $twig_current->{'twig_current'}; 3027 $t->{twig_current}= $parent; 3028 # we left pcdata 3029 $t->{twig_in_pcdata}=0; 3030 } 3031 else 3032 { # create the node as a child of the current element 3033 $ent->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ; 3034 if( my $prev_sibling= $twig_current->{last_child}) 3035 { $ent->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ; 3036 $prev_sibling->{next_sibling}= $ent; 3037 } 3038 else 3039 { if( $twig_current) { $twig_current->{first_child}= $ent; } } 3040 if( $twig_current) { $twig_current->{empty}=0; $twig_current->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; } 3041 } 3042 3043 # meant to trigger entity handler, does not seem to be activated at this time 3044 #if( my $handler= $t->{twig_handlers}->{gi}->{$ENT}) 3045 # { local $_= $ent; $handler->( $t, $ent); } 3046 3047 return $ent; 3048 } 3049 3050sub parser 3051 { return $_[0]->{twig_parser}; } 3052 3053# returns the declaration text (or a default one) 3054sub xmldecl 3055 { my $t= shift; 3056 return '' unless( $t->{twig_xmldecl} || $t->{output_encoding}); 3057 my $decl_string; 3058 my $decl= $t->{twig_xmldecl}; 3059 if( $decl) 3060 { my $version= $decl->{version}; 3061 $decl_string= q{<?xml}; 3062 $decl_string .= qq{ version="$version"}; 3063 3064 # encoding can either have been set (in $decl->{output_encoding}) 3065 # or come from the document (in $decl->{encoding}) 3066 if( $t->{output_encoding}) 3067 { my $encoding= $t->{output_encoding}; 3068 $decl_string .= qq{ encoding="$encoding"}; 3069 } 3070 elsif( $decl->{encoding}) 3071 { my $encoding= $decl->{encoding}; 3072 $decl_string .= qq{ encoding="$encoding"}; 3073 } 3074 3075 if( defined( $decl->{standalone})) 3076 { $decl_string .= q{ standalone="}; 3077 $decl_string .= $decl->{standalone} ? "yes" : "no"; 3078 $decl_string .= q{"}; 3079 } 3080 3081 $decl_string .= "?>\n"; 3082 } 3083 else 3084 { my $encoding= $t->{output_encoding}; 3085 $decl_string= qq{<?xml version="1.0" encoding="$encoding"?>}; 3086 } 3087 3088 my $output_filter= XML::Twig::Elt::output_filter(); 3089 return $output_filter ? $output_filter->( $decl_string) : $decl_string; 3090 } 3091 3092sub set_doctype 3093 { my( $t, $name, $system, $public, $internal)= @_; 3094 $t->{twig_doctype}= {} unless defined $t->{twig_doctype}; 3095 my $doctype= $t->{twig_doctype}; 3096 $doctype->{name} = $name if( defined $name); 3097 $doctype->{sysid} = $system if( defined $system); 3098 $doctype->{pub} = $public if( defined $public); 3099 $doctype->{internal} = $internal if( defined $internal); 3100 } 3101 3102sub doctype_name 3103 { my $t= shift; 3104 my $doctype= $t->{twig_doctype} or return ''; 3105 return $doctype->{name} || ''; 3106 } 3107 3108sub system_id 3109 { my $t= shift; 3110 my $doctype= $t->{twig_doctype} or return ''; 3111 return $doctype->{sysid} || ''; 3112 } 3113 3114sub public_id 3115 { my $t= shift; 3116 my $doctype= $t->{twig_doctype} or return ''; 3117 return $doctype->{pub} || ''; 3118 } 3119 3120sub internal_subset 3121 { my $t= shift; 3122 my $doctype= $t->{twig_doctype} or return ''; 3123 return $doctype->{internal} || ''; 3124 } 3125 3126# return the dtd object 3127sub dtd 3128 { my $t= shift; 3129 return $t->{twig_dtd}; 3130 } 3131 3132# return an element model, or the list of element models 3133sub model 3134 { my $t= shift; 3135 my $elt= shift; 3136 return $t->dtd->{model}->{$elt} if( $elt); 3137 return (sort keys %{$t->dtd->{model}}); 3138 } 3139 3140 3141# return the entity_list object 3142sub entity_list 3143 { my $t= shift; 3144 return $t->{twig_entity_list}; 3145 } 3146 3147# return the list of entity names 3148sub entity_names 3149 { my $t= shift; 3150 return $t->entity_list->entity_names; 3151 } 3152 3153# return the entity object 3154sub entity 3155 { my $t= shift; 3156 my $entity_name= shift; 3157 return $t->entity_list->ent( $entity_name); 3158 } 3159 3160 3161sub print_prolog 3162 { my $t= shift; 3163 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : $t->{twig_output_fh} || select() || \*STDOUT; 3164 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3165 no strict 'refs'; 3166 print {$fh} $t->prolog( @_); 3167 } 3168 3169sub prolog 3170 { my $t= shift; 3171 if( $t->{no_prolog}){ return ''; } 3172 3173 return $t->{no_prolog} ? '' 3174 : defined $t->{no_dtd_output} ? $t->xmldecl 3175 : $t->xmldecl . $t->doctype( @_); 3176 } 3177 3178sub doctype 3179 { my $t= shift; 3180 my %args= _normalize_args( @_); 3181 my $update_dtd = $args{UpdateDTD} || ''; 3182 my $doctype_text=''; 3183 3184 my $doctype= $t->{twig_doctype}; 3185 3186 if( $doctype) 3187 { $doctype_text .= qq{<!DOCTYPE $doctype->{name}} if( $doctype->{name}); 3188 $doctype_text .= qq{ PUBLIC "$doctype->{pub}"} if( $doctype->{pub}); 3189 $doctype_text .= qq{ SYSTEM} if( $doctype->{sysid} && !$doctype->{pub}); 3190 $doctype_text .= qq{ "$doctype->{sysid}"} if( $doctype->{sysid}); 3191 } 3192 3193 if( $update_dtd) 3194 { if( $doctype) 3195 { my $internal=$doctype->{internal}; 3196 # awful hack, but at least it works a little better that what was there before 3197 if( $internal) 3198 { # remove entity declarations (they will be re-generated from the updated entity list) 3199 $internal=~ s{<! \s* ENTITY \s+ $REG_TAG_NAME \s+ ( ("[^"]*"|'[^']*') \s* | SYSTEM [^>]*) >\s*}{}xg; 3200 $internal=~ s{^\n}{}; 3201 } 3202 $internal .= $t->entity_list->text ||'' if( $t->entity_list); 3203 if( $internal) { $doctype_text .= "[\n$internal]>\n"; } 3204 } 3205 elsif( !$t->{'twig_dtd'} && keys %{$t->entity_list}) 3206 { $doctype_text .= "<!DOCTYPE " . $t->root->gi . " [\n" . $t->entity_list->text . "\n]>";;} 3207 else 3208 { $doctype_text= $t->{twig_dtd}; 3209 $doctype_text .= $t->dtd_text; 3210 } 3211 } 3212 elsif( $doctype) 3213 { if( my $internal= $doctype->{internal}) 3214 { # add opening and closing brackets if not already there 3215 # plus some spaces and newlines for a nice formating 3216 # I test it here because I can't remember which version of 3217 # XML::Parser need it or not, nor guess which one will in the 3218 # future, so this about the best I can do 3219 $internal=~ s{^\s*(\[\s*)?}{ [\n}; 3220 $internal=~ s{\s*(\]\s*(>\s*)?)?\s*$}{\n]>\n}; 3221 $doctype_text .= $internal; 3222 } 3223 } 3224 3225 if( $doctype_text) 3226 { 3227 # terrible hack, as I can't figure out in which case the darn prolog 3228 # should get an extra > (depends on XML::Parser and expat versions) 3229 $doctype_text=~ s/(>\s*)*$/>\n/; # if($doctype_text); 3230 3231 my $output_filter= XML::Twig::Elt::output_filter(); 3232 return $output_filter ? $output_filter->( $doctype_text) : $doctype_text; 3233 } 3234 else 3235 { return $doctype_text; } 3236 } 3237 3238sub _leading_cpi 3239 { my $t= shift; 3240 my $leading_cpi= $t->{leading_cpi} || return ''; 3241 return $leading_cpi->sprint( 1); 3242 } 3243 3244sub _trailing_cpi 3245 { my $t= shift; 3246 my $trailing_cpi= $t->{trailing_cpi} || return ''; 3247 return $trailing_cpi->sprint( 1); 3248 } 3249 3250sub _trailing_cpi_text 3251 { my $t= shift; 3252 return $t->{trailing_cpi_text} || ''; 3253 } 3254 3255sub print_to_file 3256 { my( $t, $filename)= (shift, shift); 3257 my $out_fh; 3258# open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!"); # < perl 5.8 3259 my $mode= $t->{twig_keep_encoding} ? '>' : '>:utf8'; # >= perl 5.8 3260 open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8 3261 $t->print( $out_fh, @_); 3262 close $out_fh; 3263 return $t; 3264 } 3265 3266# probably only works on *nix (at least the chmod bit) 3267# first print to a temporary file, then rename that file to the desired file name, then change permissions 3268# to the original file permissions (or to the current umask) 3269sub safe_print_to_file 3270 { my( $t, $filename)= (shift, shift); 3271 my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ; 3272 XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n"; 3273 my $tmpdir= dirname( $filename); 3274 my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir); 3275 $t->print_to_file( $tmpfilename, @_); 3276 rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!"); 3277 chmod $perm, $filename; 3278 return $t; 3279 } 3280 3281 3282sub print 3283 { my $t= shift; 3284 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3285 my %args= _normalize_args( @_); 3286 3287 my $old_select = defined $fh ? select $fh : undef; 3288 my $old_pretty = defined ($args{PrettyPrint}) ? $t->set_pretty_print( $args{PrettyPrint}) : undef; 3289 my $old_empty_tag = defined ($args{EmptyTags}) ? $t->set_empty_tag_style( $args{EmptyTags}) : undef; 3290 3291 #if( !$t->{encoding} || lc( $t->{encoding}) eq 'utf-8') { my $out= $fh || \*STDOUT; binmode $out, ':utf8'; } 3292 3293 if( $perl_version > 5.006 && ! $t->{twig_keep_encoding}) 3294 { if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio 3295 { binmode( $fh || \*STDOUT, ":utf8" ); } 3296 } 3297 3298 print $t->prolog( %args) . $t->_leading_cpi( %args); 3299 $t->{twig_root}->print; 3300 print $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3301 . $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3302 . ( ($t->{twig_keep_spaces}||'') && ($t->{trailing_spaces} || '')) 3303 ; 3304 3305 3306 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3307 $t->set_empty_tag_style( $old_empty_tag) if( defined $old_empty_tag); 3308 if( $fh) { select $old_select; } 3309 3310 return $t; 3311 } 3312 3313 3314sub flush 3315 { my $t= shift; 3316 3317 $t->_trigger_tdh if $t->{twig_tdh}; 3318 3319 return if( $t->{twig_completely_flushed}); 3320 3321 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3322 my $old_select= defined $fh ? select $fh : undef; 3323 my $up_to= ref $_[0] ? shift : undef; 3324 my %args= _normalize_args( @_); 3325 3326 my $old_pretty; 3327 if( defined $args{PrettyPrint}) 3328 { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); 3329 delete $args{PrettyPrint}; 3330 } 3331 3332 my $old_empty_tag_style; 3333 if( $args{EmptyTags}) 3334 { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); 3335 delete $args{EmptyTags}; 3336 } 3337 3338 3339 # the "real" last element processed, as _twig_end has closed it 3340 my $last_elt; 3341 my $flush_trailing_data=0; 3342 if( $up_to) 3343 { $last_elt= $up_to; } 3344 elsif( $t->{twig_current}) 3345 { $last_elt= $t->{twig_current}->_last_child; } 3346 else 3347 { $last_elt= $t->{twig_root}; 3348 $flush_trailing_data=1; 3349 $t->{twig_completely_flushed}=1; 3350 } 3351 3352 # flush the DTD unless it has ready flushed (ie root has been flushed) 3353 my $elt= $t->{twig_root}; 3354 unless( $elt->_flushed) 3355 { # store flush info so we can auto-flush later 3356 if( $t->{twig_autoflush}) 3357 { $t->{twig_autoflush_data}={}; 3358 $t->{twig_autoflush_data}->{fh} = $fh if( $fh); 3359 $t->{twig_autoflush_data}->{args} = \@_ if( @_); 3360 } 3361 $t->print_prolog( %args); 3362 print $t->_leading_cpi; 3363 } 3364 3365 while( $elt) 3366 { my $next_elt; 3367 if( $last_elt && $last_elt->in( $elt)) 3368 { 3369 unless( $elt->_flushed) 3370 { # just output the front tag 3371 print $elt->start_tag(); 3372 $elt->_set_flushed; 3373 } 3374 $next_elt= $elt->{first_child}; 3375 } 3376 else 3377 { # an element before the last one or the last one, 3378 $next_elt= $elt->{next_sibling}; 3379 $elt->_flush(); 3380 $elt->delete; 3381 last if( $last_elt && ($elt == $last_elt)); 3382 } 3383 $elt= $next_elt; 3384 } 3385 3386 if( $flush_trailing_data) 3387 { print $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3388 , $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3389 } 3390 3391 select $old_select if( defined $old_select); 3392 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3393 $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); 3394 3395 if( my $ids= $t->{twig_id_list}) 3396 { while( my ($id, $elt)= each %$ids) 3397 { if( ! defined $elt) 3398 { delete $t->{twig_id_list}->{$id} } 3399 } 3400 } 3401 3402 return $t; 3403 } 3404 3405 3406# flushes up to an element 3407# this method just reorders the arguments and calls flush 3408sub flush_up_to 3409 { my $t= shift; 3410 my $up_to= shift; 3411 if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')) 3412 { my $fh= shift; 3413 $t->flush( $fh, $up_to, @_); 3414 } 3415 else 3416 { $t->flush( $up_to, @_); } 3417 3418 return $t; 3419 } 3420 3421 3422# same as print except the entire document text is returned as a string 3423sub sprint 3424 { my $t= shift; 3425 my %args= _normalize_args( @_); 3426 3427 my $old_pretty; 3428 if( defined $args{PrettyPrint}) 3429 { $old_pretty= $t->set_pretty_print( $args{PrettyPrint}); 3430 delete $args{PrettyPrint}; 3431 } 3432 3433 my $old_empty_tag_style; 3434 if( defined $args{EmptyTags}) 3435 { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags}); 3436 delete $args{EmptyTags}; 3437 } 3438 3439 my $string= $t->prolog( %args) # xml declaration and doctype 3440 . $t->_leading_cpi( %args) # leading comments and pi's in 'process' mode 3441 . ( ($t->{twig_root} && $t->{twig_root}->sprint) || '') 3442 . $t->_trailing_cpi # trailing comments and pi's (elements, in 'process' mode) 3443 . $t->_trailing_cpi_text # trailing comments and pi's (in 'keep' mode) 3444 ; 3445 if( $t->{twig_keep_spaces} && $t->{trailing_spaces}) { $string .= $t->{trailing_spaces}; } 3446 3447 $t->set_pretty_print( $old_pretty) if( defined $old_pretty); 3448 $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style); 3449 3450 return $string; 3451 } 3452 3453 3454# this method discards useless elements in a tree 3455# it does the same thing as a flush except it does not print it 3456# the second argument is an element, the last purged element 3457# (this argument is usually set through the purge_up_to method) 3458sub purge 3459 { my $t= shift; 3460 my $up_to= shift; 3461 3462 $t->_trigger_tdh if $t->{twig_tdh}; 3463 3464 # the "real" last element processed, as _twig_end has closed it 3465 my $last_elt; 3466 if( $up_to) 3467 { $last_elt= $up_to; } 3468 elsif( $t->{twig_current}) 3469 { $last_elt= $t->{twig_current}->_last_child; } 3470 else 3471 { $last_elt= $t->{twig_root}; } 3472 3473 my $elt= $t->{twig_root}; 3474 3475 while( $elt) 3476 { my $next_elt; 3477 if( $last_elt && $last_elt->in( $elt)) 3478 { $elt->_set_flushed; 3479 $next_elt= $elt->{first_child}; 3480 } 3481 else 3482 { # an element before the last one or the last one, 3483 $next_elt= $elt->{next_sibling}; 3484 $elt->delete; 3485 last if( $last_elt && ($elt == $last_elt) ); 3486 } 3487 $elt= $next_elt; 3488 } 3489 3490 if( my $ids= $t->{twig_id_list}) 3491 { while( my ($id, $elt)= each %$ids) { if( ! defined $elt) { delete $t->{twig_id_list}->{$id} } } } 3492 3493 return $t; 3494 } 3495 3496# flushes up to an element. This method just calls purge 3497sub purge_up_to 3498 { my $t= shift; 3499 return $t->purge( @_); 3500 } 3501 3502sub root 3503 { return $_[0]->{twig_root}; } 3504 3505sub normalize 3506 { return $_[0]->root->normalize; } 3507 3508 3509# create accessor methods on attribute names 3510{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3511sub att_accessors 3512 { 3513 my $twig_or_class= shift; 3514 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3515 : 'XML::Twig::Elt' 3516 ; 3517 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3518 no strict 'refs'; 3519 foreach my $att (@_) 3520 { _croak( "attempt to redefine existing method $att using att_accessors") 3521 if( $elt_class->can( $att) && !$accessor{$att}); 3522 3523 if( !$accessor{$att}) 3524 { *{"$elt_class\::$att"}= 3525 sub 3526 :lvalue # > perl 5.5 3527 { my $elt= shift; 3528 if( @_) { $elt->{att}->{$att}= $_[0]; } 3529 $elt->{att}->{$att}; 3530 }; 3531 $accessor{$att}=1; 3532 } 3533 } 3534 return $twig_or_class; 3535 } 3536} 3537 3538{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3539sub elt_accessors 3540 { 3541 my $twig_or_class= shift; 3542 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3543 : 'XML::Twig::Elt' 3544 ; 3545 3546 # if arg is a hash ref, it's exp => name, otherwise it's a list of tags 3547 my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]} 3548 : map { $_ => $_ } @_; 3549 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3550 no strict 'refs'; 3551 while( my( $alias, $exp)= each %exp_to_alias ) 3552 { if( $elt_class->can( $alias) && !$accessor{$alias}) 3553 { _croak( "attempt to redefine existing method $alias using elt_accessors"); } 3554 3555 if( !$accessor{$alias}) 3556 { *{"$elt_class\::$alias"}= 3557 sub 3558 { my $elt= shift; 3559 return wantarray ? $elt->children( $exp) : $elt->first_child( $exp); 3560 }; 3561 $accessor{$alias}=1; 3562 } 3563 } 3564 return $twig_or_class; 3565 } 3566} 3567 3568{ my %accessor; # memorize accessor names so re-creating them won't trigger an error 3569sub field_accessors 3570 { 3571 my $twig_or_class= shift; 3572 my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class} 3573 : 'XML::Twig::Elt' 3574 ; 3575 my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]} 3576 : map { $_ => $_ } @_; 3577 3578 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3579 no strict 'refs'; 3580 while( my( $alias, $exp)= each %exp_to_alias ) 3581 { if( $elt_class->can( $alias) && !$accessor{$alias}) 3582 { _croak( "attempt to redefine existing method $exp using field_accessors"); } 3583 if( !$accessor{$alias}) 3584 { *{"$elt_class\::$alias"}= 3585 sub 3586 { my $elt= shift; 3587 $elt->field( $exp) 3588 }; 3589 $accessor{$alias}=1; 3590 } 3591 } 3592 return $twig_or_class; 3593 } 3594} 3595 3596sub first_elt 3597 { my( $t, $cond)= @_; 3598 my $root= $t->root || return undef; 3599 return $root if( $root->passes( $cond)); 3600 return $root->next_elt( $cond); 3601 } 3602 3603sub last_elt 3604 { my( $t, $cond)= @_; 3605 my $root= $t->root || return undef; 3606 return $root->last_descendant( $cond); 3607 } 3608 3609sub next_n_elt 3610 { my( $t, $offset, $cond)= @_; 3611 $offset -- if( $t->root->matches( $cond) ); 3612 return $t->root->next_n_elt( $offset, $cond); 3613 } 3614 3615sub get_xpath 3616 { my $twig= shift; 3617 if( isa( $_[0], 'ARRAY')) 3618 { my $elt_array= shift; 3619 return _unique_elts( map { $_->get_xpath( @_) } @$elt_array); 3620 } 3621 else 3622 { return $twig->root->get_xpath( @_); } 3623 } 3624 3625# get a list of elts and return a sorted list of unique elts 3626sub _unique_elts 3627 { my @sorted= sort { $a ->cmp( $b) } @_; 3628 my @unique; 3629 while( my $current= shift @sorted) 3630 { push @unique, $current unless( @unique && ($unique[-1] == $current)); } 3631 return @unique; 3632 } 3633 3634sub findvalue 3635 { my $twig= shift; 3636 if( isa( $_[0], 'ARRAY')) 3637 { my $elt_array= shift; 3638 return join( '', map { $_->findvalue( @_) } @$elt_array); 3639 } 3640 else 3641 { return $twig->root->findvalue( @_); } 3642 } 3643 3644sub findvalues 3645 { my $twig= shift; 3646 if( isa( $_[0], 'ARRAY')) 3647 { my $elt_array= shift; 3648 return map { $_->findvalues( @_) } @$elt_array; 3649 } 3650 else 3651 { return $twig->root->findvalues( @_); } 3652 } 3653 3654sub set_id_seed 3655 { my $t= shift; 3656 XML::Twig::Elt->set_id_seed( @_); 3657 return $t; 3658 } 3659 3660# return an array ref to an index, or undef 3661sub index 3662 { my( $twig, $name, $index)= @_; 3663 return defined( $index) ? $twig->{_twig_index}->{$name}->[$index] : $twig->{_twig_index}->{$name}; 3664 } 3665 3666# return a list with just the root 3667# if a condition is given then return an empty list unless the root matches 3668sub children 3669 { my( $t, $cond)= @_; 3670 my $root= $t->root; 3671 unless( $cond && !($root->passes( $cond)) ) 3672 { return ($root); } 3673 else 3674 { return (); } 3675 } 3676 3677sub _children { return ($_[0]->root); } 3678 3679# weird, but here for completude 3680# used to solve (non-sensical) /doc[1] XPath queries 3681sub child 3682 { my $t= shift; 3683 my $nb= shift; 3684 return ($t->children( @_))[$nb]; 3685 } 3686 3687sub descendants 3688 { my( $t, $cond)= @_; 3689 my $root= $t->root; 3690 if( $root->passes( $cond) ) 3691 { return ($root, $root->descendants( $cond)); } 3692 else 3693 { return ( $root->descendants( $cond)); } 3694 } 3695 3696sub simplify { my $t= shift; $t->root->simplify( @_); } 3697sub subs_text { my $t= shift; $t->root->subs_text( @_); } 3698sub trim { my $t= shift; $t->root->trim( @_); } 3699 3700 3701sub set_keep_encoding 3702 { my( $t, $keep)= @_; 3703 $t->{twig_keep_encoding}= $keep; 3704 $t->{NoExpand}= $keep; 3705 return XML::Twig::Elt::set_keep_encoding( $keep); 3706 } 3707 3708sub set_expand_external_entities 3709 { return XML::Twig::Elt::set_expand_external_entities( @_); } 3710 3711sub escape_gt 3712 { my $t= shift; $t->{twig_escape_gt}= 1; return XML::Twig::Elt::escape_gt( @_); } 3713 3714sub do_not_escape_gt 3715 { my $t= shift; $t->{twig_escape_gt}= 0; return XML::Twig::Elt::do_not_escape_gt( @_); } 3716 3717sub elt_id 3718 { return $_[0]->{twig_id_list}->{$_[1]}; } 3719 3720# change it in ALL twigs at the moment 3721sub change_gi 3722 { my( $twig, $old_gi, $new_gi)= @_; 3723 my $index; 3724 return unless($index= $XML::Twig::gi2index{$old_gi}); 3725 $XML::Twig::index2gi[$index]= $new_gi; 3726 delete $XML::Twig::gi2index{$old_gi}; 3727 $XML::Twig::gi2index{$new_gi}= $index; 3728 return $twig; 3729 } 3730 3731 3732# builds the DTD from the stored (possibly updated) data 3733sub dtd_text 3734 { my $t= shift; 3735 my $dtd= $t->{twig_dtd}; 3736 my $doctype= $t->{twig_doctype} or return ''; 3737 my $string= "<!DOCTYPE ".$doctype->{name}; 3738 3739 $string .= " [\n"; 3740 3741 foreach my $gi (@{$dtd->{elt_list}}) 3742 { $string.= "<!ELEMENT $gi ".$dtd->{model}->{$gi}.">\n" ; 3743 if( $dtd->{att}->{$gi}) 3744 { my $attlist= $dtd->{att}->{$gi}; 3745 $string.= "<!ATTLIST $gi\n"; 3746 foreach my $att ( sort keys %{$attlist}) 3747 { 3748 if( $attlist->{$att}->{fixed}) 3749 { $string.= " $att $attlist->{$att}->{type} #FIXED $attlist->{$att}->{default}"; } 3750 else 3751 { $string.= " $att $attlist->{$att}->{type} $attlist->{$att}->{default}"; } 3752 $string.= "\n"; 3753 } 3754 $string.= ">\n"; 3755 } 3756 } 3757 $string.= $t->entity_list->text if( $t->entity_list); 3758 $string.= "\n]>\n"; 3759 return $string; 3760 } 3761 3762# prints the DTD from the stored (possibly updated) data 3763sub dtd_print 3764 { my $t= shift; 3765 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 3766 if( $fh) { print $fh $t->dtd_text; } 3767 else { print $t->dtd_text; } 3768 return $t; 3769 } 3770 3771# build the subs that call directly expat 3772BEGIN 3773 { my @expat_methods= qw( depth in_element within_element context 3774 current_line current_column current_byte 3775 recognized_string original_string 3776 xpcroak xpcarp 3777 base current_element element_index 3778 xml_escape 3779 position_in_context); 3780 foreach my $method (@expat_methods) 3781 { 3782 ## no critic (TestingAndDebugging::ProhibitNoStrict); 3783 no strict 'refs'; 3784 *{$method}= sub { my $t= shift; 3785 _croak( "calling $method after parsing is finished") unless( $t->{twig_parsing}); 3786 return $t->{twig_parser}->$method(@_); 3787 }; 3788 } 3789 } 3790 3791sub path 3792 { my( $t, $gi)= @_; 3793 if( $t->{twig_map_xmlns}) 3794 { return "/" . join( "/", map { $t->_replace_prefix( $_)} ($t->{twig_parser}->context, $gi)); } 3795 else 3796 { return "/" . join( "/", ($t->{twig_parser}->context, $gi)); } 3797 } 3798 3799sub finish 3800 { my $t= shift; 3801 return $t->{twig_parser}->finish; 3802 } 3803 3804# just finish the parse by printing the rest of the document 3805sub finish_print 3806 { my( $t, $fh)= @_; 3807 my $old_fh; 3808 unless( defined $fh) 3809 { $t->_set_fh_to_twig_output_fh(); } 3810 elsif( defined $fh) 3811 { $old_fh= select $fh; 3812 $t->{twig_original_selected_fh}= $old_fh if( $old_fh); 3813 } 3814 3815 my $p=$t->{twig_parser}; 3816 if( $t->{twig_keep_encoding}) 3817 { $p->setHandlers( %twig_handlers_finish_print); } 3818 else 3819 { $p->setHandlers( %twig_handlers_finish_print_original); } 3820 return $t; 3821 } 3822 3823sub set_remove_cdata { return XML::Twig::Elt::set_remove_cdata( @_); } 3824 3825sub output_filter { return XML::Twig::Elt::output_filter( @_); } 3826sub set_output_filter { return XML::Twig::Elt::set_output_filter( @_); } 3827 3828sub output_text_filter { return XML::Twig::Elt::output_text_filter( @_); } 3829sub set_output_text_filter { return XML::Twig::Elt::set_output_text_filter( @_); } 3830 3831sub set_input_filter 3832 { my( $t, $input_filter)= @_; 3833 my $old_filter= $t->{twig_input_filter}; 3834 if( !$input_filter || isa( $input_filter, 'CODE') ) 3835 { $t->{twig_input_filter}= $input_filter; } 3836 elsif( $input_filter eq 'latin1') 3837 { $t->{twig_input_filter}= latin1(); } 3838 elsif( $filter{$input_filter}) 3839 { $t->{twig_input_filter}= $filter{$input_filter}; } 3840 else 3841 { _croak( "invalid input filter: $input_filter"); } 3842 3843 return $old_filter; 3844 } 3845 3846sub set_empty_tag_style 3847 { return XML::Twig::Elt::set_empty_tag_style( @_); } 3848 3849sub set_pretty_print 3850 { return XML::Twig::Elt::set_pretty_print( @_); } 3851 3852sub set_quote 3853 { return XML::Twig::Elt::set_quote( @_); } 3854 3855sub set_indent 3856 { return XML::Twig::Elt::set_indent( @_); } 3857 3858sub set_keep_atts_order 3859 { shift; return XML::Twig::Elt::set_keep_atts_order( @_); } 3860 3861sub keep_atts_order 3862 { return XML::Twig::Elt::keep_atts_order( @_); } 3863 3864sub set_do_not_escape_amp_in_atts 3865 { return XML::Twig::Elt::set_do_not_escape_amp_in_atts( @_); } 3866 3867# save and restore package globals (the ones in XML::Twig::Elt) 3868# should probably return the XML::Twig object itself, but instead 3869# returns the state (as a hashref) for backward compatibility 3870sub save_global_state 3871 { my $t= shift; 3872 return $t->{twig_saved_state}= XML::Twig::Elt::global_state(); 3873 } 3874 3875sub restore_global_state 3876 { my $t= shift; 3877 XML::Twig::Elt::set_global_state( $t->{twig_saved_state}); 3878 } 3879 3880sub global_state 3881 { return XML::Twig::Elt::global_state(); } 3882 3883sub set_global_state 3884 { return XML::Twig::Elt::set_global_state( $_[1]); } 3885 3886sub dispose 3887 { my $t= shift; 3888 $t->DESTROY; 3889 return; 3890 } 3891 3892sub DESTROY 3893 { my $t= shift; 3894 if( $t->{twig_root} && isa( $t->{twig_root}, 'XML::Twig::Elt')) 3895 { $t->{twig_root}->delete } 3896 3897 # added to break circular references 3898 undef $t->{twig}; 3899 undef $t->{twig_root}->{twig} if( $t->{twig_root}); 3900 undef $t->{twig_parser}; 3901 3902 undef %$t;# prevents memory leaks (especially when using mod_perl) 3903 undef $t; 3904 } 3905 3906 3907# 3908# non standard handlers 3909# 3910 3911# kludge: expat 1.95.2 calls both Default AND Doctype handlers 3912# so if the default handler finds '<!DOCTYPE' then it must 3913# unset itself (_twig_print_doctype will reset it) 3914sub _twig_print_check_doctype 3915 { # warn " in _twig_print_check_doctype...\n"; # DEBUG handler 3916 3917 my $p= shift; 3918 my $string= $p->recognized_string(); 3919 if( $string eq '<!DOCTYPE') 3920 { 3921 $p->setHandlers( Default => undef); 3922 $p->setHandlers( Entity => undef); 3923 $expat_1_95_2=1; 3924 } 3925 else 3926 { print $string; } 3927 3928 return; 3929 } 3930 3931 3932sub _twig_print 3933 { # warn " in _twig_print...\n"; # DEBUG handler 3934 my $p= shift; 3935 if( $expat_1_95_2 && ($p->recognized_string eq '[') && !$p->{twig}->{expat_1_95_2_seen_bracket}) 3936 { # otherwise the opening square bracket of the doctype gets printed twice 3937 $p->{twig}->{expat_1_95_2_seen_bracket}=1; 3938 } 3939 else 3940 { if( $p->{twig}->{twig_right_after_root}) 3941 { my $s= $p->recognized_string(); print $s if $s=~ m{\S}; } 3942 else 3943 { print $p->recognized_string(); } 3944 } 3945 return; 3946 } 3947# recognized_string does not seem to work for entities, go figure! 3948# so this handler is used to print them anyway 3949sub _twig_print_entity 3950 { # warn " in _twig_print_entity...\n"; # DEBUG handler 3951 my $p= shift; 3952 XML::Twig::Entity->new( @_)->print; 3953 } 3954 3955# kludge: expat 1.95.2 calls both Default AND Doctype handlers 3956# so if the default handler finds '<!DOCTYPE' then it must 3957# unset itself (_twig_print_doctype will reset it) 3958sub _twig_print_original_check_doctype 3959 { # warn " in _twig_print_original_check_doctype...\n"; # DEBUG handler 3960 3961 my $p= shift; 3962 my $string= $p->original_string(); 3963 if( $string eq '<!DOCTYPE') 3964 { $p->setHandlers( Default => undef); 3965 $p->setHandlers( Entity => undef); 3966 $expat_1_95_2=1; 3967 } 3968 else 3969 { print $string; } 3970 3971 return; 3972 } 3973 3974sub _twig_print_original 3975 { # warn " in _twig_print_original...\n"; # DEBUG handler 3976 my $p= shift; 3977 print $p->original_string(); 3978 return; 3979 } 3980 3981 3982sub _twig_print_original_doctype 3983 { # warn " in _twig_print_original_doctype...\n"; # DEBUG handler 3984 3985 my( $p, $name, $sysid, $pubid, $internal)= @_; 3986 if( $name) 3987 { # with recent versions of XML::Parser original_string does not work, 3988 # hence we need to rebuild the doctype declaration 3989 my $doctype=''; 3990 $doctype .= qq{<!DOCTYPE $name} if( $name); 3991 $doctype .= qq{ PUBLIC "$pubid"} if( $pubid); 3992 $doctype .= qq{ SYSTEM} if( $sysid && !$pubid); 3993 $doctype .= qq{ "$sysid"} if( $sysid); 3994 $doctype .= ' [' if( $internal && !$expat_1_95_2) ; 3995 $doctype .= qq{>} unless( $internal || $expat_1_95_2); 3996 $p->{twig}->{twig_doctype}->{has_internal}=$internal; 3997 print $doctype; 3998 } 3999 $p->setHandlers( Default => \&_twig_print_original); 4000 return; 4001 } 4002 4003sub _twig_print_doctype 4004 { # warn " in _twig_print_doctype...\n"; # DEBUG handler 4005 my( $p, $name, $sysid, $pubid, $internal)= @_; 4006 if( $name) 4007 { # with recent versions of XML::Parser original_string does not work, 4008 # hence we need to rebuild the doctype declaration 4009 my $doctype=''; 4010 $doctype .= qq{<!DOCTYPE $name} if( $name); 4011 $doctype .= qq{ PUBLIC "$pubid"} if( $pubid); 4012 $doctype .= qq{ SYSTEM} if( $sysid && !$pubid); 4013 $doctype .= qq{ "$sysid"} if( $sysid); 4014 $doctype .= ' [' if( $internal) ; 4015 $doctype .= qq{>} unless( $internal || $expat_1_95_2); 4016 $p->{twig}->{twig_doctype}->{has_internal}=$internal; 4017 print $doctype; 4018 } 4019 $p->setHandlers( Default => \&_twig_print); 4020 return; 4021 } 4022 4023 4024sub _twig_print_original_default 4025 { # warn " in _twig_print_original_default...\n"; # DEBUG handler 4026 my $p= shift; 4027 print $p->original_string(); 4028 return; 4029 } 4030 4031# account for the case where the element is empty 4032sub _twig_print_end_original 4033 { # warn " in _twig_print_end_original...\n"; # DEBUG handler 4034 my $p= shift; 4035 print $p->original_string(); 4036 return; 4037 } 4038 4039sub _twig_start_check_roots 4040 { # warn " in _twig_start_check_roots...\n"; # DEBUG handler 4041 my $p= shift; 4042 my $gi= shift; 4043 4044 my $t= $p->{twig}; 4045 4046 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4047 4048 my $ns_decl; 4049 unless( $p->depth == 0) 4050 { if( $t->{twig_map_xmlns}) { $ns_decl= _replace_ns( $t, \$gi, \@_); } 4051 } 4052 4053 my $context= { $ST_TAG => $gi, @_}; 4054 $context->{$ST_NS}= $ns_decl if $ns_decl; 4055 push @{$t->{_twig_context_stack}}, $context; 4056 my %att= @_; 4057 4058 if( _handler( $t, $t->{twig_roots}, $gi)) 4059 { $p->setHandlers( %twig_handlers); # restore regular handlers 4060 $t->{twig_root_depth}= $p->depth; 4061 pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start 4062 _twig_start( $p, $gi, @_); 4063 return; 4064 } 4065 4066 # $tag will always be true if it needs to be printed (the tag string is never empty) 4067 my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4068 : $p->recognized_string 4069 : ''; 4070 4071 if( $p->depth == 0) 4072 { 4073 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4074 no strict 'refs'; 4075 print {$fh} $tag if( $tag); 4076 pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start 4077 _twig_start( $p, $gi, @_); 4078 $t->root->_set_flushed; # or the root start tag gets output the first time we flush 4079 } 4080 elsif( $t->{twig_starttag_handlers}) 4081 { # look for start tag handlers 4082 4083 my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi); 4084 my $last_handler_res; 4085 foreach my $handler ( @handlers) 4086 { $last_handler_res= $handler->($t, $gi, %att); 4087 last unless $last_handler_res; 4088 } 4089 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4090 no strict 'refs'; 4091 print {$fh} $tag if( $tag && (!@handlers || $last_handler_res)); 4092 } 4093 else 4094 { 4095 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4096 no strict 'refs'; 4097 print {$fh} $tag if( $tag); 4098 } 4099 return; 4100 } 4101 4102sub _twig_end_check_roots 4103 { # warn " in _twig_end_check_roots...\n"; # DEBUG handler 4104 4105 my( $p, $gi, %att)= @_; 4106 my $t= $p->{twig}; 4107 # $tag can be empty (<elt/>), hence the undef and the tests for defined 4108 my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4109 : $p->recognized_string 4110 : undef; 4111 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4112 4113 if( $t->{twig_endtag_handlers}) 4114 { # look for end tag handlers 4115 my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi); 4116 my $last_handler_res=1; 4117 foreach my $handler ( @handlers) 4118 { $last_handler_res= $handler->($t, $gi) || last; } 4119 #if( ! $last_handler_res) 4120 # { pop @{$t->{_twig_context_stack}}; warn "tested"; 4121 # return; 4122 # } 4123 } 4124 { 4125 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4126 no strict 'refs'; 4127 print {$fh} $tag if( defined $tag); 4128 } 4129 if( $p->depth == 0) 4130 { 4131 _twig_end( $p, $gi); 4132 $t->root->{end_tag_flushed}=1; 4133 } 4134 4135 pop @{$t->{_twig_context_stack}}; 4136 return; 4137 } 4138 4139sub _twig_pi_check_roots 4140 { # warn " in _twig_pi_check_roots...\n"; # DEBUG handler 4141 my( $p, $target, $data)= @_; 4142 my $t= $p->{twig}; 4143 my $pi= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string 4144 : $p->recognized_string 4145 : undef; 4146 my $fh= $t->{twig_output_fh} || select() || \*STDOUT; 4147 4148 if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target} 4149 || $t->{twig_handlers}->{pi_handlers}->{''} 4150 ) 4151 { # if handler is called on pi, then it needs to be processed as a regular node 4152 my @flags= qw( twig_process_pi twig_keep_pi); 4153 my @save= @{$t}{@flags}; # save pi related flags 4154 @{$t}{@flags}= (1, 0); # override them, pi needs to be processed 4155 _twig_pi( @_); # call handler on the pi 4156 @{$t}{@flags}= @save;; # restore flag 4157 } 4158 else 4159 { 4160 ## no critic (TestingAndDebugging::ProhibitNoStrict); 4161 no strict 'refs'; 4162 print {$fh} $pi if( defined( $pi)); 4163 } 4164 return; 4165 } 4166 4167 4168sub _output_ignored 4169 { my( $t, $p)= @_; 4170 my $action= $t->{twig_ignore_action}; 4171 4172 my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string'; 4173 4174 if( $action eq 'print' ) { print $p->$get_string; } 4175 else 4176 { my $string_ref; 4177 if( $action eq 'string') 4178 { $string_ref= \$t->{twig_buffered_string}; } 4179 elsif( ref( $action) && ref( $action) eq 'SCALAR') 4180 { $string_ref= $action; } 4181 else 4182 { _croak( "wrong ignore action: $action"); } 4183 4184 $$string_ref .= $p->$get_string; 4185 } 4186 } 4187 4188 4189 4190sub _twig_ignore_start 4191 { # warn " in _twig_ignore_start...\n"; # DEBUG handler 4192 4193 my( $p, $gi)= @_; 4194 my $t= $p->{twig}; 4195 $t->{twig_ignore_level}++; 4196 my $action= $t->{twig_ignore_action}; 4197 4198 $t->_output_ignored( $p) unless $action eq 'discard'; 4199 return; 4200 } 4201 4202sub _twig_ignore_end 4203 { # warn " in _twig_ignore_end...\n"; # DEBUG handler 4204 4205 my( $p, $gi)= @_; 4206 my $t= $p->{twig}; 4207 4208 my $action= $t->{twig_ignore_action}; 4209 $t->_output_ignored( $p) unless $action eq 'discard'; 4210 4211 $t->{twig_ignore_level}--; 4212 4213 if( ! $t->{twig_ignore_level}) 4214 { 4215 $t->{twig_current} = $t->{twig_ignore_elt}; 4216 $t->{twig_current}->set_twig_current; 4217 4218 $t->{twig_ignore_elt}->cut; # there could possibly be a memory leak here (delete would avoid it, 4219 # but could also delete elements that should not be deleted) 4220 4221 # restore the saved stack to the current level 4222 splice( @{$t->{_twig_context_stack}}, $p->depth+ 1 ); 4223 #warn "stack: ", _dump_stack( $t->{_twig_context_stack}), "\n"; 4224 4225 $p->setHandlers( @{$t->{twig_saved_handlers}}); 4226 # test for handlers 4227 if( $t->{twig_endtag_handlers}) 4228 { # look for end tag handlers 4229 my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi); 4230 my $last_handler_res=1; 4231 foreach my $handler ( @handlers) 4232 { $last_handler_res= $handler->($t, $gi) || last; } 4233 } 4234 pop @{$t->{_twig_context_stack}}; 4235 }; 4236 return; 4237 } 4238 4239#sub _dump_stack { my( $stack)= @_; return join( ":", map { $_->{$ST_TAG} } @$stack); } 4240 4241sub ignore 4242 { my( $t, $elt, $action)= @_; 4243 my $current= $t->{twig_current}; 4244 4245 if( ! ($elt && ref( $elt) && isa( $elt, 'XML::Twig::Elt'))) { $elt= $current; } 4246 4247 #warn "ignore: current = ", $current->tag, ", elt = ", $elt->tag, ")\n"; 4248 4249 # we need the ($elt == $current->{last_child}) test because the current element is set to the 4250 # parent _before_ handlers are called (and I can't figure out how to fix this) 4251 unless( ($elt == $current) || ($current->{last_child} && ($elt == $current->{last_child})) || $current->in( $elt)) 4252 { _croak( "element to be ignored must be ancestor of current element"); } 4253 4254 $t->{twig_ignore_level}= $current == $elt ? 1 : $t->_level_in_stack( $current) - $t->_level_in_stack($elt) + 1; 4255 #warn "twig_ignore_level: $t->{twig_ignore_level} (current: ", $current->tag, ", elt: ", $elt->tag, ")\n"; 4256 $t->{twig_ignore_elt} = $elt; # save it, so we can delete it later 4257 4258 $action ||= 'discard'; 4259 if( !($action eq 'print' || $action eq 'string' || ( ref( $action) && ref( $action) eq 'SCALAR'))) 4260 { $action= 'discard'; } 4261 4262 $t->{twig_ignore_action}= $action; 4263 4264 my $p= $t->{twig_parser}; 4265 my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers 4266 4267 my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string'; 4268 4269 my $default_handler; 4270 4271 if( $action ne 'discard') 4272 { if( $action eq 'print') 4273 { $p->setHandlers( Default => sub { print $_[0]->$get_string; }); } 4274 else 4275 { my $string_ref; 4276 if( $action eq 'string') 4277 { if( ! exists $t->{twig_buffered_string}) { $t->{twig_buffered_string}=''; } 4278 $string_ref= \$t->{twig_buffered_string}; 4279 } 4280 elsif( ref( $action) && ref( $action) eq 'SCALAR') 4281 { $string_ref= $action; } 4282 4283 $p->setHandlers( Default => sub { $$string_ref .= $_[0]->$get_string; }); 4284 } 4285 $t->_output_ignored( $p, $action); 4286 } 4287 4288 4289 $t->{twig_saved_handlers}= \@saved_handlers; # save current handlers 4290 } 4291 4292sub _level_in_stack 4293 { my( $t, $elt)= @_; 4294 my $level=1; 4295 foreach my $elt_in_stack ( @{$t->{_twig_context_stack}} ) 4296 { if( $elt_in_stack->{$ST_ELT} && ($elt == $elt_in_stack->{$ST_ELT})) { return $level } 4297 $level++; 4298 } 4299 } 4300 4301 4302 4303# select $t->{twig_output_fh} and store the current selected fh 4304sub _set_fh_to_twig_output_fh 4305 { my $t= shift; 4306 my $output_fh= $t->{twig_output_fh}; 4307 if( $output_fh && !$t->{twig_output_fh_selected}) 4308 { # there is an output fh 4309 $t->{twig_selected_fh}= select(); # store the currently selected fh 4310 $t->{twig_output_fh_selected}=1; 4311 select $output_fh; # select the output fh for the twig 4312 } 4313 } 4314 4315# select the fh that was stored in $t->{twig_selected_fh} 4316# (before $t->{twig_output_fh} was selected) 4317sub _set_fh_to_selected_fh 4318 { my $t= shift; 4319 return unless( $t->{twig_output_fh}); 4320 my $selected_fh= $t->{twig_selected_fh}; 4321 $t->{twig_output_fh_selected}=0; 4322 select $selected_fh; 4323 return; 4324 } 4325 4326 4327sub encoding 4328 { return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); } 4329 4330sub set_encoding 4331 { my( $t, $encoding)= @_; 4332 $t->{twig_xmldecl} ||={}; 4333 $t->set_xml_version( "1.0") unless( $t->xml_version); 4334 $t->{twig_xmldecl}->{encoding}= $encoding; 4335 return $t; 4336 } 4337 4338sub output_encoding 4339 { return $_[0]->{output_encoding}; } 4340 4341sub set_output_encoding 4342 { my( $t, $encoding)= @_; 4343 my $output_filter= $t->output_filter || ''; 4344 4345 if( ($encoding && $encoding !~ m{^utf-?8$}i) || $t->{twig_keep_encoding} || $output_filter) 4346 { $t->set_output_filter( _encoding_filter( $encoding || '')); } 4347 4348 $t->{output_encoding}= $encoding; 4349 return $t; 4350 } 4351 4352sub xml_version 4353 { return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); } 4354 4355sub set_xml_version 4356 { my( $t, $version)= @_; 4357 $t->{twig_xmldecl} ||={}; 4358 $t->{twig_xmldecl}->{version}= $version; 4359 return $t; 4360 } 4361 4362sub standalone 4363 { return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); } 4364 4365sub set_standalone 4366 { my( $t, $standalone)= @_; 4367 $t->{twig_xmldecl} ||={}; 4368 $t->set_xml_version( "1.0") unless( $t->xml_version); 4369 $t->{twig_xmldecl}->{standalone}= $standalone; 4370 return $t; 4371 } 4372 4373 4374# SAX methods 4375 4376sub toSAX1 4377 { _croak( "cannot use toSAX1 while parsing (use flush_toSAX1)") if (defined $_[0]->{twig_parser}); 4378 shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1, 4379 \&XML::Twig::Elt::_end_tag_data_SAX1 4380 ); 4381 } 4382 4383sub toSAX2 4384 { _croak( "cannot use toSAX2 while parsing (use flush_toSAX2)") if (defined $_[0]->{twig_parser}); 4385 shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2, 4386 \&XML::Twig::Elt::_end_tag_data_SAX2 4387 ); 4388 } 4389 4390 4391sub _toSAX 4392 { my( $t, $handler, $start_tag_data, $end_tag_data) = @_; 4393 4394 if( my $start_document = $handler->can( 'start_document')) 4395 { $start_document->( $handler); } 4396 4397 $t->_prolog_toSAX( $handler); 4398 4399 if( $t->root) { $t->root->_toSAX( $handler, $start_tag_data, $end_tag_data) ; } 4400 if( my $end_document = $handler->can( 'end_document')) 4401 { $end_document->( $handler); } 4402 } 4403 4404 4405sub flush_toSAX1 4406 { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1, 4407 \&XML::Twig::Elt::_end_tag_data_SAX1 4408 ); 4409 } 4410 4411sub flush_toSAX2 4412 { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2, 4413 \&XML::Twig::Elt::_end_tag_data_SAX2 4414 ); 4415 } 4416 4417sub _flush_toSAX 4418 { my( $t, $handler, $start_tag_data, $end_tag_data)= @_; 4419 4420 # the "real" last element processed, as _twig_end has closed it 4421 my $last_elt; 4422 if( $t->{twig_current}) 4423 { $last_elt= $t->{twig_current}->_last_child; } 4424 else 4425 { $last_elt= $t->{twig_root}; } 4426 4427 my $elt= $t->{twig_root}; 4428 unless( $elt->_flushed) 4429 { # init unless already done (ie root has been flushed) 4430 if( my $start_document = $handler->can( 'start_document')) 4431 { $start_document->( $handler); } 4432 # flush the DTD 4433 $t->_prolog_toSAX( $handler) 4434 } 4435 4436 while( $elt) 4437 { my $next_elt; 4438 if( $last_elt && $last_elt->in( $elt)) 4439 { 4440 unless( $elt->_flushed) 4441 { # just output the front tag 4442 if( my $start_element = $handler->can( 'start_element')) 4443 { if( my $tag_data= $start_tag_data->( $elt)) 4444 { $start_element->( $handler, $tag_data); } 4445 } 4446 $elt->_set_flushed; 4447 } 4448 $next_elt= $elt->{first_child}; 4449 } 4450 else 4451 { # an element before the last one or the last one, 4452 $next_elt= $elt->{next_sibling}; 4453 $elt->_toSAX( $handler, $start_tag_data, $end_tag_data); 4454 $elt->delete; 4455 last if( $last_elt && ($elt == $last_elt)); 4456 } 4457 $elt= $next_elt; 4458 } 4459 if( !$t->{twig_parsing}) 4460 { if( my $end_document = $handler->can( 'end_document')) 4461 { $end_document->( $handler); } 4462 } 4463 } 4464 4465 4466sub _prolog_toSAX 4467 { my( $t, $handler)= @_; 4468 $t->_xmldecl_toSAX( $handler); 4469 $t->_DTD_toSAX( $handler); 4470 } 4471 4472sub _xmldecl_toSAX 4473 { my( $t, $handler)= @_; 4474 my $decl= $t->{twig_xmldecl}; 4475 my $data= { Version => $decl->{version}, 4476 Encoding => $decl->{encoding}, 4477 Standalone => $decl->{standalone}, 4478 }; 4479 if( my $xml_decl= $handler->can( 'xml_decl')) 4480 { $xml_decl->( $handler, $data); } 4481 } 4482 4483sub _DTD_toSAX 4484 { my( $t, $handler)= @_; 4485 my $doctype= $t->{twig_doctype}; 4486 return unless( $doctype); 4487 my $data= { Name => $doctype->{name}, 4488 PublicId => $doctype->{pub}, 4489 SystemId => $doctype->{sysid}, 4490 }; 4491 4492 if( my $start_dtd= $handler->can( 'start_dtd')) 4493 { $start_dtd->( $handler, $data); } 4494 4495 # I should call code to export the internal subset here 4496 4497 if( my $end_dtd= $handler->can( 'end_dtd')) 4498 { $end_dtd->( $handler); } 4499 } 4500 4501# input/output filters 4502 4503sub latin1 4504 { local $SIG{__DIE__}; 4505 if( _use( 'Encode')) 4506 { return encode_convert( 'ISO-8859-15'); } 4507 elsif( _use( 'Text::Iconv')) 4508 { return iconv_convert( 'ISO-8859-15'); } 4509 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 4510 { return unicode_convert( 'ISO-8859-15'); } 4511 else 4512 { return \®exp2latin1; } 4513 } 4514 4515sub _encoding_filter 4516 { 4517 { local $SIG{__DIE__}; 4518 my $encoding= $_[1] || $_[0]; 4519 if( _use( 'Encode')) 4520 { my $sub= encode_convert( $encoding); 4521 return $sub; 4522 } 4523 elsif( _use( 'Text::Iconv')) 4524 { return iconv_convert( $encoding); } 4525 elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String')) 4526 { return unicode_convert( $encoding); } 4527 } 4528 _croak( "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed in order to use encoding options"); 4529 } 4530 4531# shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27) 4532sub regexp2latin1 4533 { my $text=shift; 4534 $text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1); 4535 my $lo = ord($2); 4536 chr((($hi & 0x03) <<6) | ($lo & 0x3F)) 4537 }ge; 4538 return $text; 4539 } 4540 4541 4542sub html_encode 4543 { _use( 'HTML::Entities') or croak "cannot use html_encode: missing HTML::Entities"; 4544 return HTML::Entities::encode_entities($_[0] ); 4545 } 4546 4547sub safe_encode 4548 { my $str= shift; 4549 if( $perl_version < 5.008) 4550 { # the no utf8 makes the regexp work in 5.6 4551 no utf8; # = perl 5.6 4552 $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} 4553 {_XmlUtf8Decode($1)}egs; 4554 } 4555 else 4556 { $str= encode( ascii => $str, $FB_HTMLCREF); } 4557 return $str; 4558 } 4559 4560sub safe_encode_hex 4561 { my $str= shift; 4562 if( $perl_version < 5.008) 4563 { # the no utf8 makes the regexp work in 5.6 4564 no utf8; # = perl 5.6 4565 $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)} 4566 {_XmlUtf8Decode($1, 1)}egs; 4567 } 4568 else 4569 { $str= encode( ascii => $str, $FB_XMLCREF); } 4570 return $str; 4571 } 4572 4573# this one shamelessly lifted from XML::DOM 4574# does NOT work on 5.8.0 4575sub _XmlUtf8Decode 4576 { my ($str, $hex) = @_; 4577 my $len = length ($str); 4578 my $n; 4579 4580 if ($len == 2) 4581 { my @n = unpack "C2", $str; 4582 $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f); 4583 } 4584 elsif ($len == 3) 4585 { my @n = unpack "C3", $str; 4586 $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f); 4587 } 4588 elsif ($len == 4) 4589 { my @n = unpack "C4", $str; 4590 $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12) 4591 + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f); 4592 } 4593 elsif ($len == 1) # just to be complete... 4594 { $n = ord ($str); } 4595 else 4596 { croak "bad value [$str] for _XmlUtf8Decode"; } 4597 4598 my $char= $hex ? sprintf ("&#x%x;", $n) : "&#$n;"; 4599 return $char; 4600 } 4601 4602 4603sub unicode_convert 4604 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4605 _use( 'Unicode::Map8') or croak "Unicode::Map8 not available, needed for encoding filter: $!"; 4606 _use( 'Unicode::String') or croak "Unicode::String not available, needed for encoding filter: $!"; 4607 import Unicode::String qw(utf8); 4608 my $sub= eval qq{ { $NO_WARNINGS; 4609 my \$cnv; 4610 BEGIN { \$cnv= Unicode::Map8->new(\$enc) 4611 or croak "Can't create converter to \$enc"; 4612 } 4613 sub { return \$cnv->to8 (utf8(\$_[0])->ucs2); } 4614 } 4615 }; 4616 unless( $sub) { croak $@; } 4617 return $sub; 4618 } 4619 4620sub iconv_convert 4621 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4622 _use( 'Text::Iconv') or croak "Text::Iconv not available, needed for encoding filter: $!"; 4623 my $sub= eval qq{ { $NO_WARNINGS; 4624 my \$cnv; 4625 BEGIN { \$cnv = Text::Iconv->new( 'utf8', \$enc) 4626 or croak "Can't create iconv converter to \$enc"; 4627 } 4628 sub { return \$cnv->convert( \$_[0]); } 4629 } 4630 }; 4631 unless( $sub) 4632 { if( $@=~ m{^Unsupported conversion: Invalid argument}) 4633 { croak "Unsupported encoding: $enc"; } 4634 else 4635 { croak $@; } 4636 } 4637 4638 return $sub; 4639 } 4640 4641sub encode_convert 4642 { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly 4643 my $sub= eval qq{sub { $NO_WARNINGS; return encode( "$enc", \$_[0]); } }; 4644 croak "can't create Encode-based filter: $@" unless( $sub); 4645 return $sub; 4646 } 4647 4648 4649# XML::XPath compatibility 4650sub getRootNode { return $_[0]; } 4651sub getParentNode { return undef; } 4652sub getChildNodes { my @children= ($_[0]->root); return wantarray ? @children : \@children; } 4653 4654sub _weakrefs { return $weakrefs; } 4655sub _set_weakrefs { $weakrefs=shift() || 0; XML::Twig::Elt::set_destroy()if ! $weakrefs; } # for testing purposes 4656 4657sub _dump 4658 { my $t= shift; 4659 my $dump=''; 4660 4661 $dump="document\n"; # should dump twig level data here 4662 if( $t->root) { $dump .= $t->root->_dump( @_); } 4663 4664 return $dump; 4665 4666 } 4667 4668 46691; 4670 4671###################################################################### 4672package XML::Twig::Entity_list; 4673###################################################################### 4674 4675*isa= *UNIVERSAL::isa; 4676 4677sub new 4678 { my $class = shift; 4679 my $self={ entities => {}, updated => 0}; 4680 4681 bless $self, $class; 4682 return $self; 4683 4684 } 4685 4686sub add_new_ent 4687 { my $ent_list= shift; 4688 my $ent= XML::Twig::Entity->new( @_); 4689 $ent_list->add( $ent); 4690 return $ent_list; 4691 } 4692 4693sub _add_list 4694 { my( $ent_list, $to_add)= @_; 4695 my $ents_to_add= $to_add->{entities}; 4696 return $ent_list unless( $ents_to_add && %$ents_to_add); 4697 @{$ent_list->{entities}}{keys %$ents_to_add}= values %$ents_to_add; 4698 $ent_list->{updated}=1; 4699 return $ent_list; 4700 } 4701 4702sub add 4703 { my( $ent_list, $ent)= @_; 4704 $ent_list->{entities}->{$ent->{name}}= $ent; 4705 $ent_list->{updated}=1; 4706 return $ent_list; 4707 } 4708 4709sub ent 4710 { my( $ent_list, $ent_name)= @_; 4711 return $ent_list->{entities}->{$ent_name}; 4712 } 4713 4714# can be called with an entity or with an entity name 4715sub delete 4716 { my $ent_list= shift; 4717 if( isa( ref $_[0], 'XML::Twig::Entity')) 4718 { # the second arg is an entity 4719 my $ent= shift; 4720 delete $ent_list->{entities}->{$ent->{name}}; 4721 } 4722 else 4723 { # the second arg was not entity, must be a string then 4724 my $name= shift; 4725 delete $ent_list->{entities}->{$name}; 4726 } 4727 $ent_list->{updated}=1; 4728 return $ent_list; 4729 } 4730 4731sub print 4732 { my ($ent_list, $fh)= @_; 4733 my $old_select= defined $fh ? select $fh : undef; 4734 4735 foreach my $ent_name ( sort keys %{$ent_list->{entities}}) 4736 { my $ent= $ent_list->{entities}->{$ent_name}; 4737 # we have to test what the entity is or un-defined entities can creep in 4738 if( isa( $ent, 'XML::Twig::Entity')) { $ent->print(); } 4739 } 4740 select $old_select if( defined $old_select); 4741 return $ent_list; 4742 } 4743 4744sub text 4745 { my ($ent_list)= @_; 4746 return join "\n", map { $ent_list->{entities}->{$_}->text} sort keys %{$ent_list->{entities}}; 4747 } 4748 4749# return the list of entity names 4750sub entity_names 4751 { my $ent_list= shift; 4752 return (sort keys %{$ent_list->{entities}}) ; 4753 } 4754 4755 4756sub list 4757 { my ($ent_list)= @_; 4758 return map { $ent_list->{entities}->{$_} } sort keys %{$ent_list->{entities}}; 4759 } 4760 47611; 4762 4763###################################################################### 4764package XML::Twig::Entity; 4765###################################################################### 4766 4767#*isa= *UNIVERSAL::isa; 4768 4769sub new 4770 { my( $class, $name, $val, $sysid, $pubid, $ndata, $param)= @_; 4771 $class= ref( $class) || $class; 4772 4773 my $self={}; 4774 4775 $self->{name} = $name; 4776 $self->{val} = $val if( defined $val ); 4777 $self->{sysid} = $sysid if( defined $sysid); 4778 $self->{pubid} = $pubid if( defined $pubid); 4779 $self->{ndata} = $ndata if( defined $ndata); 4780 $self->{param} = $param if( defined $param); 4781 4782 bless $self, $class; 4783 return $self; 4784 } 4785 4786 4787sub name { return $_[0]->{name}; } 4788sub val { return $_[0]->{val}; } 4789sub sysid { return defined( $_[0]->{sysid}) ? $_[0]->{sysid} : ''; } 4790sub pubid { return defined( $_[0]->{pubid}) ? $_[0]->{pubid} : ''; } 4791sub ndata { return defined( $_[0]->{ndata}) ? $_[0]->{ndata} : ''; } 4792sub param { return defined( $_[0]->{param}) ? $_[0]->{param} : ''; } 4793 4794 4795sub print 4796 { my ($ent, $fh)= @_; 4797 my $text= $ent->text; 4798 if( $fh) { print $fh $text . "\n"; } 4799 else { print $text . "\n"; } 4800 } 4801 4802sub sprint 4803 { my ($ent)= @_; 4804 return $ent->text; 4805 } 4806 4807sub text 4808 { my ($ent)= @_; 4809 #warn "text called: '", $ent->_dump, "'\n"; 4810 return '' if( !$ent->{name}); 4811 my @tokens; 4812 push @tokens, '<!ENTITY'; 4813 4814 push @tokens, '%' if( $ent->{param}); 4815 push @tokens, $ent->{name}; 4816 4817 if( defined $ent->{val} && !defined( $ent->{sysid}) && !defined($ent->{pubid}) ) 4818 { push @tokens, _quoted_val( $ent->{val}); 4819 } 4820 elsif( defined $ent->{sysid}) 4821 { push @tokens, 'PUBLIC', _quoted_val( $ent->{pubid}) if( $ent->{pubid}); 4822 push @tokens, 'SYSTEM' unless( $ent->{pubid}); 4823 push @tokens, _quoted_val( $ent->{sysid}); 4824 push @tokens, 'NDATA', $ent->{ndata} if( $ent->{ndata}); 4825 } 4826 return join( ' ', @tokens) . '>'; 4827 } 4828 4829sub _quoted_val 4830 { my $q= $_[0]=~ m{"} ? q{'} : q{"}; 4831 return qq{$q$_[0]$q}; 4832 } 4833 4834sub _dump 4835 { my( $ent)= @_; return join( " - ", map { "$_ => '$ent->{$_}'" } grep { defined $ent->{$_} } sort keys %$ent); } 4836 48371; 4838 4839###################################################################### 4840package XML::Twig::Elt; 4841###################################################################### 4842 4843use Carp; 4844*isa= *UNIVERSAL::isa; 4845 4846my $CDATA_START = "<![CDATA["; 4847my $CDATA_END = "]]>"; 4848my $PI_START = "<?"; 4849my $PI_END = "?>"; 4850my $COMMENT_START = "<!--"; 4851my $COMMENT_END = "-->"; 4852 4853my $XMLNS_URI = 'http://www.w3.org/2000/xmlns/'; 4854 4855 4856BEGIN 4857 { # set some aliases for methods 4858 *tag = *gi; 4859 *name = *gi; 4860 *set_tag = *set_gi; 4861 *set_name = *set_gi; 4862 *find_nodes = *get_xpath; # as in XML::DOM 4863 *findnodes = *get_xpath; # as in XML::LibXML 4864 *field = *first_child_text; 4865 *trimmed_field = *first_child_trimmed_text; 4866 *is_field = *contains_only_text; 4867 *is = *passes; 4868 *matches = *passes; 4869 *has_child = *first_child; 4870 *has_children = *first_child; 4871 *all_children_pass = *all_children_are; 4872 *all_children_match= *all_children_are; 4873 *getElementsByTagName= *descendants; 4874 *find_by_tag_name= *descendants_or_self; 4875 *unwrap = *erase; 4876 *inner_xml = *xml_string; 4877 *outer_xml = *sprint; 4878 *add_class = *add_to_class; 4879 4880 *first_child_is = *first_child_matches; 4881 *last_child_is = *last_child_matches; 4882 *next_sibling_is = *next_sibling_matches; 4883 *prev_sibling_is = *prev_sibling_matches; 4884 *next_elt_is = *next_elt_matches; 4885 *prev_elt_is = *prev_elt_matches; 4886 *parent_is = *parent_matches; 4887 *child_is = *child_matches; 4888 *inherited_att = *inherit_att; 4889 4890 *sort_children_by_value= *sort_children_on_value; 4891 4892 *has_atts= *att_nb; 4893 4894 # imports from XML::Twig 4895 *_is_fh= *XML::Twig::_is_fh; 4896 4897 # XML::XPath compatibility 4898 *string_value = *text; 4899 *toString = *sprint; 4900 *getName = *gi; 4901 *getRootNode = *twig; 4902 *getNextSibling = *_next_sibling; 4903 *getPreviousSibling = *_prev_sibling; 4904 *isElementNode = *is_elt; 4905 *isTextNode = *is_text; 4906 *isPI = *is_pi; 4907 *isPINode = *is_pi; 4908 *isProcessingInstructionNode= *is_pi; 4909 *isComment = *is_comment; 4910 *isCommentNode = *is_comment; 4911 *getTarget = *target; 4912 *getFirstChild = *_first_child; 4913 *getLastChild = *_last_child; 4914 4915 # try using weak references 4916 # test whether we can use weak references 4917 { local $SIG{__DIE__}; 4918 if( eval 'require Scalar::Util' && defined( &Scalar::Util::weaken) ) 4919 { import Scalar::Util qw(weaken); } 4920 elsif( eval 'require WeakRef') 4921 { import WeakRef; } 4922 } 4923} 4924 4925 4926# can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]]) 4927# - gi is an optional gi given to the element 4928# - $atts is a hashref to attributes for the element 4929# - @content is an optional list of text and elements that will 4930# be inserted under the element 4931sub new 4932 { my $class= shift; 4933 $class= ref $class || $class; 4934 my $elt = {}; 4935 bless ($elt, $class); 4936 4937 return $elt unless @_; 4938 4939 if( @_ == 1 && $_[0]=~ m{^\s*<}) { return $class->parse( @_); } 4940 4941 # if a gi is passed then use it 4942 my $gi= shift; 4943 $elt->{gi}=$XML::Twig::gi2index{$gi} or $elt->set_gi( $gi); 4944 4945 4946 my $atts= ref $_[0] eq 'HASH' ? shift : undef; 4947 4948 if( $atts && defined $atts->{$CDATA}) 4949 { delete $atts->{$CDATA}; 4950 4951 my $cdata= $class->new( $CDATA => @_); 4952 return $class->new( $gi, $atts, $cdata); 4953 } 4954 4955 if( $gi eq $PCDATA) 4956 { if( grep { ref $_ } @_) { croak "element $PCDATA can only be created from text"; } 4957 $elt->_set_pcdata( join( '', @_)); 4958 } 4959 elsif( $gi eq $ENT) 4960 { $elt->{ent}= shift; } 4961 elsif( $gi eq $CDATA) 4962 { if( grep { ref $_ } @_) { croak "element $CDATA can only be created from text"; } 4963 $elt->_set_cdata( join( '', @_)); 4964 } 4965 elsif( $gi eq $COMMENT) 4966 { if( grep { ref $_ } @_) { croak "element $COMMENT can only be created from text"; } 4967 $elt->_set_comment( join( '', @_)); 4968 } 4969 elsif( $gi eq $PI) 4970 { if( grep { ref $_ } @_) { croak "element $PI can only be created from text"; } 4971 $elt->_set_pi( shift, join( '', @_)); 4972 } 4973 else 4974 { # the rest of the arguments are the content of the element 4975 if( @_) 4976 { $elt->set_content( @_); } 4977 else 4978 { $elt->{empty}= 1; } 4979 } 4980 4981 if( $atts) 4982 { # the attribute hash can be used to pass the asis status 4983 if( defined $atts->{$ASIS}) { $elt->set_asis( $atts->{$ASIS} ); delete $atts->{$ASIS}; } 4984 if( defined $atts->{$EMPTY}) { $elt->{empty}= $atts->{$EMPTY}; delete $atts->{$EMPTY}; } 4985 if( keys %$atts) { $elt->set_atts( $atts); } 4986 $elt->_set_id( $atts->{$ID}) if( $atts->{$ID}); 4987 } 4988 4989 return $elt; 4990 } 4991 4992# optimized version of $elt->new( PCDATA, $text); 4993sub _new_pcdata 4994 { my $class= $_[0]; 4995 $class= ref $class || $class; 4996 my $elt = {}; 4997 bless $elt, $class; 4998 $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA); 4999 $elt->_set_pcdata( $_[1]); 5000 return $elt; 5001 } 5002 5003# this function creates an XM:::Twig::Elt from a string 5004# it is quite clumsy at the moment, as it just creates a 5005# new twig then returns its root 5006# there might also be memory leaks there 5007# additional arguments are passed to new XML::Twig 5008sub parse 5009 { my $class= shift; 5010 if( ref( $class)) { $class= ref( $class); } 5011 my $string= shift; 5012 my %args= @_; 5013 my $t= XML::Twig->new(%args); 5014 $t->parse( $string); 5015 my $elt= $t->root; 5016 # clean-up the node 5017 delete $elt->{twig}; # get rid of the twig data 5018 delete $elt->{twig_current}; # better get rid of this too 5019 if( $t->{twig_id_list}) { $elt->{twig_id_list}= $t->{twig_id_list}; } 5020 $elt->cut; 5021 undef $t->{twig_root}; 5022 return $elt; 5023 } 5024 5025sub set_inner_xml 5026 { my( $elt, $xml, @args)= @_; 5027 my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args); 5028 $elt->cut_children; 5029 $new_elt->paste_first_child( $elt); 5030 $new_elt->erase; 5031 return $elt; 5032 } 5033 5034sub set_outer_xml 5035 { my( $elt, $xml, @args)= @_; 5036 my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args); 5037 $elt->cut_children; 5038 $new_elt->replace( $elt); 5039 $new_elt->erase; 5040 return $new_elt; 5041 } 5042 5043 5044sub set_inner_html 5045 { my( $elt, $html)= @_; 5046 my $t= XML::Twig->new->parse_html( "<html>$html</html>"); 5047 my $new_elt= $t->root; 5048 if( $elt->tag eq 'head') 5049 { $new_elt->first_child( 'head')->unwrap; 5050 $new_elt->first_child( 'body')->cut; 5051 } 5052 elsif( $elt->tag ne 'html') 5053 { $new_elt->first_child( 'head')->cut; 5054 $new_elt->first_child( 'body')->unwrap; 5055 } 5056 $new_elt->cut; 5057 $elt->cut_children; 5058 $new_elt->paste_first_child( $elt); 5059 $new_elt->erase; 5060 return $elt; 5061 } 5062 5063sub set_gi 5064 { my ($elt, $gi)= @_; 5065 unless( defined $XML::Twig::gi2index{$gi}) 5066 { # new gi, create entries in %gi2index and @index2gi 5067 push @XML::Twig::index2gi, $gi; 5068 $XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi; 5069 } 5070 $elt->{gi}= $XML::Twig::gi2index{$gi}; 5071 return $elt; 5072 } 5073 5074sub gi { return $XML::Twig::index2gi[$_[0]->{gi}]; } 5075 5076sub local_name 5077 { my $elt= shift; 5078 return _local_name( $XML::Twig::index2gi[$elt->{'gi'}]); 5079 } 5080 5081sub ns_prefix 5082 { my $elt= shift; 5083 return _ns_prefix( $XML::Twig::index2gi[$elt->{'gi'}]); 5084 } 5085 5086# namespace prefix for any qname (can be used for elements or attributes) 5087sub _ns_prefix 5088 { my $qname= shift; 5089 if( $qname=~ m{^([^:]*):}) 5090 { return $1; } 5091 else 5092 { return( ''); } # should it be '' ? 5093 } 5094 5095# local name for any qname (can be used for elements or attributes) 5096sub _local_name 5097 { my $qname= shift; 5098 (my $local= $qname)=~ s{^[^:]*:}{}; 5099 return $local; 5100 } 5101 5102#sub get_namespace 5103sub namespace ## no critic (Subroutines::ProhibitNestedSubs); 5104 { my $elt= shift; 5105 my $prefix= defined $_[0] ? shift() : $elt->ns_prefix; 5106 my $ns_att= $prefix ? "xmlns:$prefix" : "xmlns"; 5107 my $expanded= $DEFAULT_NS{$prefix} || $elt->_inherit_att_through_cut( $ns_att) || ''; 5108 return $expanded; 5109 } 5110 5111sub declare_missing_ns ## no critic (Subroutines::ProhibitNestedSubs); 5112 { my $root= shift; 5113 my %missing_prefix; 5114 my $map= $root->_current_ns_prefix_map; 5115 5116 foreach my $prefix (keys %$map) 5117 { my $prefix_att= $prefix eq '#default' ? 'xmlns' : "xmlns:$prefix"; 5118 if( ! $root->{'att'}->{$prefix_att}) 5119 { $root->set_att( $prefix_att => $map->{$prefix}); } 5120 } 5121 return $root; 5122 } 5123 5124sub _current_ns_prefix_map 5125 { my( $elt)= shift; 5126 my $map; 5127 while( $elt) 5128 { foreach my $att ($elt->att_names) 5129 { my $prefix= $att eq 'xmlns' ? '#default' 5130 : $att=~ m{^xmlns:(.*)$} ? $1 5131 : next 5132 ; 5133 if( ! exists $map->{$prefix}) { $map->{$prefix}= $elt->{'att'}->{$att}; } 5134 } 5135 $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); 5136 } 5137 return $map; 5138 } 5139 5140sub set_ns_decl 5141 { my( $elt, $uri, $prefix)= @_; 5142 my $ns_att= $prefix ? "xmlns:$prefix" : 'xmlns'; 5143 $elt->set_att( $ns_att => $uri); 5144 return $elt; 5145 } 5146 5147sub set_ns_as_default 5148 { my( $root, $uri)= @_; 5149 my @ns_decl_to_remove; 5150 foreach my $elt ($root->descendants_or_self) 5151 { if( $elt->_ns_prefix && $elt->namespace eq $uri) 5152 { $elt->set_tag( $elt->local_name); } 5153 # store any namespace declaration for that uri 5154 foreach my $ns_decl (grep { $_=~ m{xmlns(:|$)} && $elt->{'att'}->{$_} eq $uri } $elt->att_names) 5155 { push @ns_decl_to_remove, [$elt, $ns_decl]; } 5156 } 5157 $root->set_ns_decl( $uri); 5158 # now remove the ns declarations (if done earlier then descendants of an element with the ns declaration 5159 # are not considered being in the namespace 5160 foreach my $ns_decl_to_remove ( @ns_decl_to_remove) 5161 { my( $elt, $ns_decl)= @$ns_decl_to_remove; 5162 $elt->del_att( $ns_decl); 5163 } 5164 5165 return $root; 5166 } 5167 5168 5169 5170# return #ELT for an element and #PCDATA... for others 5171sub get_type 5172 { my $gi_nb= $_[0]->{gi}; # the number, not the string 5173 return $ELT if( $gi_nb >= $XML::Twig::SPECIAL_GI); 5174 return $_[0]->gi; 5175 } 5176 5177# return the gi if it's a "real" element, 0 otherwise 5178sub is_elt 5179 { if( $_[0]->{gi} >= $XML::Twig::SPECIAL_GI) 5180 { return $_[0]->gi; } 5181 else 5182 { return 0; } 5183 } 5184 5185 5186sub is_pcdata 5187 { my $elt= shift; 5188 return (exists $elt->{'pcdata'}); 5189 } 5190 5191sub is_cdata 5192 { my $elt= shift; 5193 return (exists $elt->{'cdata'}); 5194 } 5195 5196sub is_pi 5197 { my $elt= shift; 5198 return (exists $elt->{'target'}); 5199 } 5200 5201sub is_comment 5202 { my $elt= shift; 5203 return (exists $elt->{'comment'}); 5204 } 5205 5206sub is_ent 5207 { my $elt= shift; 5208 return (exists $elt->{ent} || $elt->{ent_name}); 5209 } 5210 5211 5212sub is_text 5213 { my $elt= shift; 5214 return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'})); 5215 } 5216 5217sub is_empty 5218 { return $_[0]->{empty} || 0; } 5219 5220sub set_empty 5221 { $_[0]->{empty}= defined( $_[1]) ? $_[1] : 1; return $_[0]; } 5222 5223sub set_not_empty 5224 { delete $_[0]->{empty} if( $_[0]->{'empty'}); return $_[0]; } 5225 5226 5227sub set_asis 5228 { my $elt=shift; 5229 5230 foreach my $descendant ($elt, $elt->_descendants ) 5231 { $descendant->{asis}= 1; 5232 if( (exists $descendant->{'cdata'})) 5233 { $descendant->{gi}=$XML::Twig::gi2index{$PCDATA} or $descendant->set_gi( $PCDATA); 5234 $descendant->_set_pcdata( $descendant->{cdata}); 5235 } 5236 5237 } 5238 return $elt; 5239 } 5240 5241sub set_not_asis 5242 { my $elt=shift; 5243 foreach my $descendant ($elt, $elt->descendants) 5244 { delete $descendant->{asis} if $descendant->{asis};} 5245 return $elt; 5246 } 5247 5248sub is_asis 5249 { return $_[0]->{asis}; } 5250 5251sub closed 5252 { my $elt= shift; 5253 my $t= $elt->twig || return; 5254 my $curr_elt= $t->{twig_current}; 5255 return 1 unless( $curr_elt); 5256 return $curr_elt->in( $elt); 5257 } 5258 5259sub set_pcdata 5260 { my( $elt, $pcdata)= @_; 5261 5262 if( $elt->{extra_data_in_pcdata}) 5263 { _try_moving_extra_data( $elt, $pcdata); 5264 } 5265 $elt->{pcdata}= $pcdata; 5266 return $elt; 5267 } 5268 5269sub _extra_data_in_pcdata { return $_[0]->{extra_data_in_pcdata}; } 5270sub _set_extra_data_in_pcdata { $_[0]->{extra_data_in_pcdata}= $_[1]; return $_[0]; } 5271sub _del_extra_data_in_pcdata { delete $_[0]->{extra_data_in_pcdata}; return $_[0]; } 5272sub _unshift_extra_data_in_pcdata 5273 { my $e= shift; 5274 $e->{extra_data_in_pcdata}||=[]; 5275 unshift @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() }; 5276 } 5277sub _push_extra_data_in_pcdata 5278 { my $e= shift; 5279 $e->{extra_data_in_pcdata}||=[]; 5280 push @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() }; 5281 } 5282 5283sub _extra_data_before_end_tag { return $_[0]->{extra_data_before_end_tag} || ''; } 5284sub _set_extra_data_before_end_tag { $_[0]->{extra_data_before_end_tag}= $_[1]; return $_[0]} 5285sub _del_extra_data_before_end_tag { delete $_[0]->{extra_data_before_end_tag}; return $_[0]} 5286sub _prefix_extra_data_before_end_tag 5287 { my( $elt, $data)= @_; 5288 if($elt->{extra_data_before_end_tag}) 5289 { $elt->{extra_data_before_end_tag}= $data . $elt->{extra_data_before_end_tag}; } 5290 else 5291 { $elt->{extra_data_before_end_tag}= $data; } 5292 return $elt; 5293 } 5294 5295# internal, in cases where we know there is no extra_data (inlined anyway!) 5296sub _set_pcdata { $_[0]->{pcdata}= $_[1]; } 5297 5298# try to figure out if we can keep the extra_data around 5299sub _try_moving_extra_data 5300 { my( $elt, $modified)=@_; 5301 my $initial= $elt->{pcdata}; 5302 my $cpis= $elt->{extra_data_in_pcdata}; 5303 5304 if( (my $offset= index( $modified, $initial)) != -1) 5305 { # text has been added 5306 foreach (@$cpis) { $_->{offset}+= $offset; } 5307 } 5308 elsif( ($offset= index( $initial, $modified)) != -1) 5309 { # text has been cut 5310 my $len= length( $modified); 5311 foreach my $cpi (@$cpis) { $cpi->{offset} -= $offset; } 5312 $elt->_set_extra_data_in_pcdata( [ grep { $_->{offset} >= 0 && $_->{offset} < $len } @$cpis ]); 5313 } 5314 else 5315 { _match_extra_data_words( $elt, $initial, $modified) 5316 || _match_extra_data_chars( $elt, $initial, $modified) 5317 || $elt->_del_extra_data_in_pcdata; 5318 } 5319 } 5320 5321sub _match_extra_data_words 5322 { my( $elt, $initial, $modified)= @_; 5323 my @initial= split /\b/, $initial; 5324 my @modified= split /\b/, $modified; 5325 5326 return _match_extra_data( $elt, length( $initial), \@initial, \@modified); 5327 } 5328 5329sub _match_extra_data_chars 5330 { my( $elt, $initial, $modified)= @_; 5331 my @initial= split //, $initial; 5332 my @modified= split //, $modified; 5333 5334 return _match_extra_data( $elt, length( $initial), \@initial, \@modified); 5335 } 5336 5337sub _match_extra_data 5338 { my( $elt, $length, $initial, $modified)= @_; 5339 5340 my $cpis= $elt->{extra_data_in_pcdata}; 5341 5342 if( @$initial <= @$modified) 5343 { 5344 my( $ok, $positions, $offsets)= _pos_offset( $initial, $modified); 5345 if( $ok) 5346 { my $offset=0; 5347 my $pos= shift @$positions; 5348 foreach my $cpi (@$cpis) 5349 { while( $cpi->{offset} >= $pos) 5350 { $offset= shift @$offsets; 5351 $pos= shift @$positions || $length +1; 5352 } 5353 $cpi->{offset} += $offset; 5354 } 5355 return 1; 5356 } 5357 } 5358 else 5359 { my( $ok, $positions, $offsets)= _pos_offset( $modified, $initial); 5360 if( $ok) 5361 { #print STDERR "pos: ", join( ':', @$positions), "\n", 5362 # "offset: ", join( ':', @$offsets), "\n"; 5363 my $offset=0; 5364 my $pos= shift @$positions; 5365 my $prev_pos= 0; 5366 5367 foreach my $cpi (@$cpis) 5368 { while( $cpi->{offset} >= $pos) 5369 { $offset= shift @$offsets; 5370 $prev_pos= $pos; 5371 $pos= shift @$positions || $length +1; 5372 } 5373 $cpi->{offset} -= $offset; 5374 if( $cpi->{offset} < $prev_pos) { delete $cpi->{text}; } 5375 } 5376 $elt->_set_extra_data_in_pcdata( [ grep { exists $_->{text} } @$cpis ]); 5377 return 1; 5378 } 5379 } 5380 return 0; 5381 } 5382 5383 5384sub _pos_offset 5385 { my( $short, $long)= @_; 5386 my( @pos, @offset); 5387 my( $s_length, $l_length)=(0,0); 5388 while (@$short) 5389 { my $s_word= shift @$short; 5390 my $l_word= shift @$long; 5391 if( $s_word ne $l_word) 5392 { while( @$long && $s_word ne $l_word) 5393 { $l_length += length( $l_word); 5394 $l_word= shift @$long; 5395 } 5396 if( !@$long && $s_word ne $l_word) { return 0; } 5397 push @pos, $s_length; 5398 push @offset, $l_length - $s_length; 5399 } 5400 my $length= length( $s_word); 5401 $s_length += $length; 5402 $l_length += $length; 5403 } 5404 return( 1, \@pos, \@offset); 5405 } 5406 5407sub append_pcdata 5408 { $_[0]->{'pcdata'}.= $_[1]; 5409 return $_[0]; 5410 } 5411 5412sub pcdata { return $_[0]->{pcdata}; } 5413 5414 5415sub append_extra_data 5416 { $_[0]->{extra_data}.= $_[1]; 5417 return $_[0]; 5418 } 5419 5420sub set_extra_data 5421 { $_[0]->{extra_data}= $_[1]; 5422 return $_[0]; 5423 } 5424sub extra_data { return $_[0]->{extra_data} || ''; } 5425 5426sub set_target 5427 { my( $elt, $target)= @_; 5428 $elt->{target}= $target; 5429 return $elt; 5430 } 5431sub target { return $_[0]->{target}; } 5432 5433sub set_data 5434 { $_[0]->{'data'}= $_[1]; 5435 return $_[0]; 5436 } 5437sub data { return $_[0]->{data}; } 5438 5439sub set_pi 5440 { my $elt= shift; 5441 unless( $elt->{gi} == $XML::Twig::gi2index{$PI}) 5442 { $elt->cut_children; 5443 $elt->{gi}=$XML::Twig::gi2index{$PI} or $elt->set_gi( $PI); 5444 } 5445 return $elt->_set_pi( @_); 5446 } 5447 5448sub _set_pi 5449 { $_[0]->set_target( $_[1]); 5450 $_[0]->{data}= $_[2]; 5451 return $_[0]; 5452 } 5453 5454sub pi_string { my $string= $PI_START . $_[0]->{target}; 5455 my $data= $_[0]->{data}; 5456 if( defined( $data) && $data ne '') { $string .= " $data"; } 5457 $string .= $PI_END ; 5458 return $string; 5459 } 5460 5461sub set_comment 5462 { my $elt= shift; 5463 unless( $elt->{gi} == $XML::Twig::gi2index{$COMMENT}) 5464 { $elt->cut_children; 5465 $elt->{gi}=$XML::Twig::gi2index{$COMMENT} or $elt->set_gi( $COMMENT); 5466 } 5467 return $elt->_set_comment( @_); 5468 } 5469 5470sub _set_comment { $_[0]->{comment}= $_[1]; return $_[0]; } 5471sub comment { return $_[0]->{comment}; } 5472sub comment_string { return $COMMENT_START . _comment_escaped_string( $_[0]->{comment}) . $COMMENT_END; } 5473# comments cannot start or end with 5474sub _comment_escaped_string 5475 { my( $c)= @_; 5476 $c=~ s{^-}{ -}; 5477 $c=~ s{-$}{- }; 5478 $c=~ s{--}{- -}g; 5479 return $c; 5480 } 5481 5482sub set_ent { $_[0]->{ent}= $_[1]; return $_[0]; } 5483sub ent { return $_[0]->{ent}; } 5484sub ent_name { return substr( $_[0]->{ent}, 1, -1);} 5485 5486sub set_cdata 5487 { my $elt= shift; 5488 unless( $elt->{gi} == $XML::Twig::gi2index{$CDATA}) 5489 { $elt->cut_children; 5490 $elt->insert_new_elt( first_child => $CDATA, @_); 5491 return $elt; 5492 } 5493 return $elt->_set_cdata( @_); 5494 } 5495 5496sub _set_cdata 5497 { $_[0]->{cdata}= $_[1]; 5498 return $_[0]; 5499 } 5500 5501sub append_cdata 5502 { $_[0]->{cdata}.= $_[1]; 5503 return $_[0]; 5504 } 5505sub cdata { return $_[0]->{cdata}; } 5506 5507 5508sub contains_only_text 5509 { my $elt= shift; 5510 return 0 unless $elt->is_elt; 5511 foreach my $child ($elt->_children) 5512 { return 0 if $child->is_elt; } 5513 return $elt; 5514 } 5515 5516sub contains_only 5517 { my( $elt, $exp)= @_; 5518 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 5519 foreach my $child (@children) 5520 { return 0 unless $child->is( $exp); } 5521 return @children || 1; 5522 } 5523 5524sub contains_a_single 5525 { my( $elt, $exp)= @_; 5526 my $child= $elt->{first_child} or return 0; 5527 return 0 unless $child->passes( $exp); 5528 return 0 if( $child->{next_sibling}); 5529 return $child; 5530 } 5531 5532 5533sub root 5534 { my $elt= shift; 5535 while( $elt->{parent}) { $elt= $elt->{parent}; } 5536 return $elt; 5537 } 5538 5539sub _root_through_cut 5540 { my $elt= shift; 5541 while( $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})) { $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); } 5542 return $elt; 5543 } 5544 5545sub twig 5546 { my $elt= shift; 5547 my $root= $elt->root; 5548 return $root->{twig}; 5549 } 5550 5551sub _twig_through_cut 5552 { my $elt= shift; 5553 my $root= $elt->_root_through_cut; 5554 return $root->{twig}; 5555 } 5556 5557 5558# used for navigation 5559# returns undef or the element, depending on whether $elt passes $cond 5560# $cond can be 5561# - empty: the element passes the condition 5562# - ELT ('#ELT'): the element passes the condition if it is a "real" element 5563# - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element 5564# - a string with an XPath condition (only a subset of XPath is actually 5565# supported). 5566# - a regexp: the element passes if its gi matches the regexp 5567# - a code ref: the element passes if the code, applied on the element, 5568# returns true 5569 5570my %cond_cache; # expression => coderef 5571 5572sub reset_cond_cache { %cond_cache=(); } 5573 5574{ 5575 sub _install_cond 5576 { my $cond= shift; 5577 my $test; 5578 my $init=''; 5579 5580 my $original_cond= $cond; 5581 5582 my $not= ($cond=~ s{^\s*!}{}) ? '!' : ''; 5583 5584 if( ref $cond eq 'CODE') { return $cond; } 5585 5586 if( ref $cond eq 'Regexp') 5587 { $test = qq{(\$_[0]->gi=~ /$cond/)}; } 5588 else 5589 { my @tests; 5590 while( $cond) 5591 { 5592 # the condition is a string 5593 if( $cond=~ s{$ELT$SEP}{}) 5594 { push @tests, qq{\$_[0]->is_elt}; } 5595 elsif( $cond=~ s{$TEXT$SEP}{}) 5596 { push @tests, qq{\$_[0]->is_text}; } 5597 elsif( $cond=~ s{^\s*($REG_TAG_PART)$SEP}{}) 5598 { push @tests, _gi_test( $1); } 5599 elsif( $cond=~ s{^\s*($REG_REGEXP)$SEP}{}) 5600 { # /regexp/ 5601 push @tests, qq{ \$_[0]->gi=~ $1 }; 5602 } 5603 elsif( $cond=~ s{^\s*($REG_TAG_PART)?\s* # $1 5604 \[\s*(-?)\s*(\d+)\s*\] # [$2] 5605 $SEP}{}xo 5606 ) 5607 { my( $gi, $neg, $index)= ($1, $2, $3); 5608 my $siblings= $neg ? q{$_[0]->_next_siblings} : q{$_[0]->_prev_siblings}; 5609 if( $gi && ($gi ne '*')) 5610 #{ $test= qq{((\$_[0]->gi eq "$gi") && (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index))}; } 5611 { push @tests, _and( _gi_test( $gi), qq{ (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index)}); } 5612 else 5613 { push @tests, qq{(scalar( $siblings) + 1 == $index)}; } 5614 } 5615 elsif( $cond=~ s{^\s*($REG_TAG_PART?)\s*($REG_PREDICATE)$SEP}{}) 5616 { my( $gi, $predicate)= ( $1, $2); 5617 push @tests, _and( _gi_test( $gi), _parse_predicate_in_step( $predicate)); 5618 } 5619 elsif( $cond=~ s{^\s*($REG_NAKED_PREDICATE)$SEP}{}) 5620 { push @tests, _parse_predicate_in_step( $1); } 5621 else 5622 { croak "wrong navigation condition '$original_cond' ($@)"; } 5623 } 5624 $test= @tests > 1 ? '(' . join( '||', map { "($_)" } @tests) . ')' : $tests[0]; 5625 } 5626 5627 #warn "init: '$init' - test: '$test'\n"; 5628 5629 my $sub= qq{sub { $NO_WARNINGS; $init; return $not($test) ? \$_[0] : undef; } }; 5630 my $s= eval $sub; 5631 #warn "cond: $cond\n$sub\n"; 5632 if( $@) 5633 { croak "wrong navigation condition '$original_cond' ($@);" } 5634 return $s; 5635 } 5636 5637 sub _gi_test 5638 { my( $full_gi)= @_; 5639 5640 # optimize if the gi exists, including the case where the gi includes a dot 5641 my $index= $XML::Twig::gi2index{$full_gi}; 5642 if( $index) { return qq{\$_[0]->{gi} == $index}; } 5643 5644 my( $gi, $class, $id)= $full_gi=~ m{^(.*?)(?:[.]([^.]*)|[#](.*))?$}; 5645 5646 my $gi_test=''; 5647 if( $gi && $gi ne '*' ) 5648 { # 2 options, depending on whether the gi exists in gi2index 5649 # start optimization 5650 my $index= $XML::Twig::gi2index{$gi}; 5651 if( $index) 5652 { # the gi exists, use its index as a faster shortcut 5653 $gi_test = qq{\$_[0]->{gi} == $index}; 5654 } 5655 else 5656 # end optimization 5657 { # it does not exist (but might be created later), compare the strings 5658 $gi_test = qq{ \$_[0]->gi eq "$gi"}; 5659 } 5660 } 5661 else 5662 { $gi_test= 1; } 5663 5664 my $class_test=''; 5665 #warn "class: '$class'"; 5666 if( $class) 5667 { $class_test = qq{ defined( \$_[0]->{att}->{class}) && \$_[0]->{att}->{class}=~ m{\\b$class\\b} }; } 5668 5669 my $id_test=''; 5670 #warn "id: '$id'"; 5671 if( $id) 5672 { $id_test = qq{ defined( \$_[0]->{att}->{$ID}) && \$_[0]->{att}->{$ID} eq '$id' }; } 5673 5674 5675 #warn "gi_test: '$gi_test' - class_test: '$class_test' returning ", _and( $gi_test, $class_test); 5676 return _and( $gi_test, $class_test, $id_test); 5677 } 5678 5679 5680 # input: the original predicate 5681 sub _parse_predicate_in_step 5682 { my $cond= shift; 5683 my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le '); 5684 5685 $cond=~ s{^\s*\[\s*}{}; 5686 $cond=~ s{\s*\]\s*$}{}; 5687 $cond=~ s{( ($REG_STRING|$REG_REGEXP) # strings or regexps 5688 |\@($REG_TAG_NAME)(?=\s*(?:[><=!]|!~|=~)) # @att (followed by a comparison operator) 5689 |\@($REG_TAG_NAME) # @att (not followed by a comparison operator) 5690 |=~|!~ # matching operators 5691 |([><]=?|=|!=)(?=\s*[\d+-]) # test before a number 5692 |([><]=?|=|!=) # test, other cases 5693 |($REG_FUNCTION) # no arg functions 5694 # this bit is a mess, but it is the only solution with this half-baked parser 5695 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child) =~ /regexp/ 5696 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*!?=\s*$REG_VALUE) # string( child) = "value" (or !=) 5697 |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*[<>]=?\s*$REG_VALUE) # string( child) > "value" 5698 |(and|or) 5699 )} 5700 { my( $token, $string, $att, $bare_att, $num_test, $alpha_test, $func, $string_regexp, $string_eq, $string_test, $and_or) 5701 = ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11); 5702 5703 if( defined $string) { $token } 5704 elsif( $att) { "( \$_[0]->{att} && exists( \$_[0]->{att}->{'$att'}) && \$_[0]->{att}->{'$att'})"; } 5705 elsif( $bare_att) { "(\$_[0]->{att} && defined( \$_[0]->{att}->{'$bare_att'}))"; } 5706 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged 5707 elsif( $alpha_test) { $PERL_ALPHA_TEST{$alpha_test} } 5708 elsif( $func && $func=~ m{^(?:string|text)}) 5709 { "\$_[0]->text"; } 5710 elsif( $string_regexp && $string_regexp =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)}) 5711 { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; } 5712 elsif( $string_eq && $string_eq =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*(!?=)\s*($REG_VALUE)}) 5713 {"(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $PERL_ALPHA_TEST{$2} $3) } 1, \$_[0]->_children)"; } 5714 elsif( $string_test && $string_test =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*([<>]=?)\s*($REG_VALUE)}) 5715 { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; } 5716 elsif( $and_or) { $and_or eq 'and' ? '&&' : '||' ; } 5717 else { $token; } 5718 }gexs; 5719 return "($cond)"; 5720 } 5721 5722 5723 sub _op 5724 { my $op= shift; 5725 if( $op eq '=') { $op= 'eq'; } 5726 elsif( $op eq '!=') { $op= 'ne'; } 5727 return $op; 5728 } 5729 5730 sub passes 5731 { my( $elt, $cond)= @_; 5732 return $elt unless $cond; 5733 my $sub= ($cond_cache{$cond} ||= _install_cond( $cond)); 5734 return $sub->( $elt); 5735 } 5736} 5737 5738sub set_parent 5739 { $_[0]->{parent}= $_[1]; 5740 if( $XML::Twig::weakrefs) { weaken( $_[0]->{parent}); } 5741 } 5742 5743sub parent 5744 { my $elt= shift; 5745 my $cond= shift || return $elt->{parent}; 5746 do { $elt= $elt->{parent} || return; } until ( $elt->passes( $cond)); 5747 return $elt; 5748 } 5749 5750sub set_first_child 5751 { $_[0]->{'first_child'}= $_[1]; 5752 } 5753 5754sub first_child 5755 { my $elt= shift; 5756 my $cond= shift || return $elt->{first_child}; 5757 my $child= $elt->{first_child}; 5758 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 5759 while( $child && !$test_cond->( $child)) 5760 { $child= $child->{next_sibling}; } 5761 return $child; 5762 } 5763 5764sub _first_child { return $_[0]->{first_child}; } 5765sub _last_child { return $_[0]->{last_child}; } 5766sub _next_sibling { return $_[0]->{next_sibling}; } 5767sub _prev_sibling { return $_[0]->{prev_sibling}; } 5768sub _parent { return $_[0]->{parent}; } 5769sub _next_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{next_sibling}) { push @siblings, $elt; } return @siblings; } 5770sub _prev_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{prev_sibling}) { push @siblings, $elt; } return @siblings; } 5771 5772# sets a field 5773# arguments $record, $cond, @content 5774sub set_field 5775 { my $record = shift; 5776 my $cond = shift; 5777 my $child= $record->first_child( $cond); 5778 if( $child) 5779 { $child->set_content( @_); } 5780 else 5781 { if( $cond=~ m{^\s*($REG_TAG_NAME)}) 5782 { my $gi= $1; 5783 $child= $record->insert_new_elt( last_child => $gi, @_); 5784 } 5785 else 5786 { croak "can't create a field name from $cond"; } 5787 } 5788 return $child; 5789 } 5790 5791sub set_last_child 5792 { $_[0]->{'last_child'}= $_[1]; 5793 if( $XML::Twig::weakrefs) { weaken( $_[0]->{'last_child'}); } 5794 } 5795 5796sub last_child 5797 { my $elt= shift; 5798 my $cond= shift || return $elt->{last_child}; 5799 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 5800 my $child= $elt->{last_child}; 5801 while( $child && !$test_cond->( $child) ) 5802 { $child= $child->{prev_sibling}; } 5803 return $child 5804 } 5805 5806 5807sub set_prev_sibling 5808 { $_[0]->{'prev_sibling'}= $_[1]; 5809 if( $XML::Twig::weakrefs) { weaken( $_[0]->{'prev_sibling'}); } 5810 } 5811 5812sub prev_sibling 5813 { my $elt= shift; 5814 my $cond= shift || return $elt->{prev_sibling}; 5815 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 5816 my $sibling= $elt->{prev_sibling}; 5817 while( $sibling && !$test_cond->( $sibling) ) 5818 { $sibling= $sibling->{prev_sibling}; } 5819 return $sibling; 5820 } 5821 5822sub set_next_sibling { $_[0]->{'next_sibling'}= $_[1]; } 5823 5824sub next_sibling 5825 { my $elt= shift; 5826 my $cond= shift || return $elt->{next_sibling}; 5827 my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); 5828 my $sibling= $elt->{next_sibling}; 5829 while( $sibling && !$test_cond->( $sibling) ) 5830 { $sibling= $sibling->{next_sibling}; } 5831 return $sibling; 5832 } 5833 5834# methods dealing with the class attribute, convenient if you work with xhtml 5835sub class { $_[0]->{att}->{class}; } 5836# lvalue version of class. separate from class to avoid problem like RT# 5837sub lclass 5838 :lvalue # > perl 5.5 5839 { $_[0]->{att}->{class}; } 5840 5841sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); } 5842 5843# adds a class to an element 5844sub add_to_class 5845 { my( $elt, $new_class)= @_; 5846 return $elt unless $new_class; 5847 my $class= $elt->class; 5848 my %class= $class ? map { $_ => 1 } split /\s+/, $class : (); 5849 $class{$new_class}= 1; 5850 $elt->set_class( join( ' ', sort keys %class)); 5851 } 5852 5853sub remove_class 5854 { my( $elt, $class_to_remove)= @_; 5855 return $elt unless $class_to_remove; 5856 my $class= $elt->class; 5857 my %class= $class ? map { $_ => 1 } split /\s+/, $class : (); 5858 delete $class{$class_to_remove}; 5859 $elt->set_class( join( ' ', sort keys %class)); 5860 } 5861 5862sub att_to_class { my( $elt, $att)= @_; $elt->set_class( $elt->{'att'}->{$att}); } 5863sub add_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); } 5864sub move_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); 5865 $elt->del_att( $att); 5866 } 5867sub tag_to_class { my( $elt)= @_; $elt->set_class( $elt->tag); } 5868sub add_tag_to_class { my( $elt)= @_; $elt->add_to_class( $elt->tag); } 5869sub set_tag_class { my( $elt, $new_tag)= @_; $elt->add_tag_to_class; $elt->set_tag( $new_tag); } 5870 5871sub tag_to_span 5872 { my( $elt)= @_; 5873 $elt->set_class( $elt->tag) unless( $elt->tag eq 'span' && $elt->class); # set class to span unless it would mean replacing it with span 5874 $elt->set_tag( 'span'); 5875 } 5876 5877sub tag_to_div 5878 { my( $elt)= @_; 5879 $elt->set_class( $elt->tag) unless( $elt->tag eq 'div' && $elt->class); # set class to div unless it would mean replacing it with div 5880 $elt->set_tag( 'div'); 5881 } 5882 5883sub in_class 5884 { my( $elt, $class)= @_; 5885 my $elt_class= $elt->class; 5886 return unless( defined $elt_class); 5887 return $elt->class=~ m{(?:^|\s)\Q$class\E(?:\s|$)} ? $elt : 0; 5888 } 5889 5890 5891# get or set all attributes 5892# argument can be a hash or a hashref 5893sub set_atts 5894 { my $elt= shift; 5895 my %atts; 5896 tie %atts, 'Tie::IxHash' if( keep_atts_order()); 5897 %atts= ( (ref( $_[0] || '') eq 'HASH') || isa( $_[0] || '', 'HASH')) ? %{$_[0]} : @_; 5898 $elt->{att}= \%atts; 5899 if( exists $atts{$ID}) { $elt->_set_id( $atts{$ID}); } 5900 return $elt; 5901 } 5902 5903sub atts { return $_[0]->{att}; } 5904sub att_names { return (sort keys %{$_[0]->{att}}); } 5905sub del_atts { $_[0]->{att}={}; return $_[0]; } 5906 5907# get or set a single attribute (set works for several atts) 5908sub set_att 5909 { my $elt= shift; 5910 5911 if( $_[0] && ref( $_[0]) && !$_[1]) 5912 { croak "improper call to set_att, usage is \$elt->set_att( att1 => 'val1', att2 => 'val2',...)"; } 5913 5914 unless( $elt->{att}) 5915 { $elt->{att}={}; 5916 tie %{$elt->{att}}, 'Tie::IxHash' if( keep_atts_order()); 5917 } 5918 5919 while(@_) 5920 { my( $att, $val)= (shift, shift); 5921 $elt->{att}->{$att}= $val; 5922 if( $att eq $ID) { $elt->_set_id( $val); } 5923 } 5924 return $elt; 5925 } 5926 5927sub att { $_[0]->{att}->{$_[1]}; } 5928# lvalue version of att. separate from class to avoid problem like RT# 5929sub latt 5930 :lvalue # > perl 5.5 5931 { $_[0]->{att}->{$_[1]}; } 5932 5933sub del_att 5934 { my $elt= shift; 5935 while( @_) { delete $elt->{'att'}->{shift()}; } 5936 return $elt; 5937 } 5938 5939sub att_exists { return exists $_[0]->{att}->{$_[1]}; } 5940 5941# delete an attribute from all descendants of an element 5942sub strip_att 5943 { my( $elt, $att)= @_; 5944 $_->del_att( $att) foreach ($elt->descendants_or_self( qq{*[\@$att]})); 5945 return $elt; 5946 } 5947 5948sub change_att_name 5949 { my( $elt, $old_name, $new_name)= @_; 5950 my $value= $elt->{'att'}->{$old_name}; 5951 return $elt unless( defined $value); 5952 $elt->del_att( $old_name) 5953 ->set_att( $new_name => $value); 5954 return $elt; 5955 } 5956 5957sub lc_attnames 5958 { my $elt= shift; 5959 foreach my $att ($elt->att_names) 5960 { if( $att ne lc $att) { $elt->change_att_name( $att, lc $att); } } 5961 return $elt; 5962 } 5963 5964sub set_twig_current { $_[0]->{twig_current}=1; } 5965sub del_twig_current { delete $_[0]->{twig_current}; } 5966 5967 5968# get or set the id attribute 5969sub set_id 5970 { my( $elt, $id)= @_; 5971 $elt->del_id() if( exists $elt->{att}->{$ID}); 5972 $elt->set_att($ID, $id); 5973 $elt->_set_id( $id); 5974 return $elt; 5975 } 5976 5977# only set id, does not update the attribute value 5978sub _set_id 5979 { my( $elt, $id)= @_; 5980 my $t= $elt->twig || $elt; 5981 $t->{twig_id_list}->{$id}= $elt; 5982 if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 5983 return $elt; 5984 } 5985 5986sub id { return $_[0]->{att}->{$ID}; } 5987 5988# methods used to add ids to elements that don't have one 5989BEGIN 5990{ my $id_nb = "0001"; 5991 my $id_seed = "twig_id_"; 5992 5993 sub set_id_seed ## no critic (Subroutines::ProhibitNestedSubs); 5994 { $id_seed= $_[1]; $id_nb=1; } 5995 5996 sub add_id ## no critic (Subroutines::ProhibitNestedSubs); 5997 { my $elt= shift; 5998 if( defined $elt->{'att'}->{$ID}) 5999 { return $elt->{'att'}->{$ID}; } 6000 else 6001 { my $id= $_[0] && ref( $_[0]) && isa( $_[0], 'CODE') ? $_[0]->( $elt) : $id_seed . $id_nb++; 6002 $elt->set_id( $id); 6003 return $id; 6004 } 6005 } 6006} 6007 6008 6009 6010# delete the id attribute and remove the element from the id list 6011sub del_id 6012 { my $elt= shift; 6013 if( ! exists $elt->{att}->{$ID}) { return $elt }; 6014 my $id= $elt->{att}->{$ID}; 6015 6016 delete $elt->{att}->{$ID}; 6017 6018 my $t= shift || $elt->twig; 6019 unless( $t) { return $elt; } 6020 if( exists $t->{twig_id_list}->{$id}) { delete $t->{twig_id_list}->{$id}; } 6021 6022 return $elt; 6023 } 6024 6025# return the list of children 6026sub children 6027 { my $elt= shift; 6028 my @children; 6029 my $child= $elt->first_child( @_); 6030 while( $child) 6031 { push @children, $child; 6032 $child= $child->next_sibling( @_); 6033 } 6034 return @children; 6035 } 6036 6037sub _children 6038 { my $elt= shift; 6039 my @children=(); 6040 my $child= $elt->{first_child}; 6041 while( $child) 6042 { push @children, $child; 6043 $child= $child->{next_sibling}; 6044 } 6045 return @children; 6046 } 6047 6048sub children_copy 6049 { my $elt= shift; 6050 my @children; 6051 my $child= $elt->first_child( @_); 6052 while( $child) 6053 { push @children, $child->copy; 6054 $child= $child->next_sibling( @_); 6055 } 6056 return @children; 6057 } 6058 6059 6060sub children_count 6061 { my $elt= shift; 6062 my $cond= shift; 6063 my $count=0; 6064 my $child= $elt->{first_child}; 6065 while( $child) 6066 { $count++ if( $child->passes( $cond)); 6067 $child= $child->{next_sibling}; 6068 } 6069 return $count; 6070 } 6071 6072sub children_text 6073 { my $elt= shift; 6074 return wantarray() ? map { $_->text} $elt->children( @_) 6075 : join( '', map { $_->text} $elt->children( @_) ) 6076 ; 6077 } 6078 6079sub children_trimmed_text 6080 { my $elt= shift; 6081 return wantarray() ? map { $_->trimmed_text} $elt->children( @_) 6082 : join( '', map { $_->trimmed_text} $elt->children( @_) ) 6083 ; 6084 } 6085 6086sub all_children_are 6087 { my( $parent, $cond)= @_; 6088 foreach my $child ($parent->_children) 6089 { return 0 unless( $child->passes( $cond)); } 6090 return $parent; 6091 } 6092 6093 6094sub ancestors 6095 { my( $elt, $cond)= @_; 6096 my @ancestors; 6097 while( $elt->{parent}) 6098 { $elt= $elt->{parent}; 6099 push @ancestors, $elt if( $elt->passes( $cond)); 6100 } 6101 return @ancestors; 6102 } 6103 6104sub ancestors_or_self 6105 { my( $elt, $cond)= @_; 6106 my @ancestors; 6107 while( $elt) 6108 { push @ancestors, $elt if( $elt->passes( $cond)); 6109 $elt= $elt->{parent}; 6110 } 6111 return @ancestors; 6112 } 6113 6114 6115sub _ancestors 6116 { my( $elt, $include_self)= @_; 6117 my @ancestors= $include_self ? ($elt) : (); 6118 while( $elt= $elt->{parent}) { push @ancestors, $elt; } 6119 return @ancestors; 6120 } 6121 6122 6123sub inherit_att 6124 { my $elt= shift; 6125 my $att= shift; 6126 my %tags= map { ($_, 1) } @_; 6127 6128 do 6129 { if( (defined $elt->{'att'}->{$att}) 6130 && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]}) 6131 ) 6132 { return $elt->{'att'}->{$att}; } 6133 } while( $elt= $elt->{parent}); 6134 return undef; 6135 } 6136 6137sub _inherit_att_through_cut 6138 { my $elt= shift; 6139 my $att= shift; 6140 my %tags= map { ($_, 1) } @_; 6141 6142 do 6143 { if( (defined $elt->{'att'}->{$att}) 6144 && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]}) 6145 ) 6146 { return $elt->{'att'}->{$att}; } 6147 } while( $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})); 6148 return undef; 6149 } 6150 6151 6152sub current_ns_prefixes 6153 { my $elt= shift; 6154 my %prefix; 6155 $prefix{''}=1 if( $elt->namespace( '')); 6156 while( $elt) 6157 { my @ns= grep { !m{^xml} } map { m{^([^:]+):} } ($XML::Twig::index2gi[$elt->{'gi'}], $elt->att_names); 6158 $prefix{$_}=1 foreach (@ns); 6159 $elt= $elt->{parent}; 6160 } 6161 6162 return (sort keys %prefix); 6163 } 6164 6165# kinda counter-intuitive actually: 6166# the next element is found by looking for the next open tag after from the 6167# current one, which is the first child, if it exists, or the next sibling 6168# or the first next sibling of an ancestor 6169# optional arguments are: 6170# - $subtree_root: a reference to an element, when the next element is not 6171# within $subtree_root anymore then next_elt returns undef 6172# - $cond: a condition, next_elt returns the next element matching the condition 6173 6174sub next_elt 6175 { my $elt= shift; 6176 my $subtree_root= 0; 6177 $subtree_root= shift if( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')); 6178 my $cond= shift; 6179 my $next_elt; 6180 6181 my $ind; # optimization 6182 my $test_cond; 6183 if( $cond) # optimization 6184 { unless( defined( $ind= $XML::Twig::gi2index{$cond}) ) # optimization 6185 { $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); } # optimization 6186 } # optimization 6187 6188 do 6189 { if( $next_elt= $elt->{first_child}) 6190 { # simplest case: the elt has a child 6191 } 6192 elsif( $next_elt= $elt->{next_sibling}) 6193 { # no child but a next sibling (just check we stay within the subtree) 6194 6195 # case where elt is subtree_root, is empty and has a sibling 6196 return undef if( $subtree_root && ($elt == $subtree_root)); 6197 6198 } 6199 else 6200 { # case where the element has no child and no next sibling: 6201 # get the first next sibling of an ancestor, checking subtree_root 6202 6203 # case where elt is subtree_root, is empty and has no sibling 6204 return undef if( $subtree_root && ($elt == $subtree_root)); 6205 6206 $next_elt= $elt->{parent}; 6207 6208 until( $next_elt->{next_sibling}) 6209 { return undef if( $subtree_root && ($subtree_root == $next_elt)); 6210 $next_elt= $next_elt->{parent} || return undef; 6211 } 6212 return undef if( $subtree_root && ($subtree_root == $next_elt)); 6213 $next_elt= $next_elt->{next_sibling}; 6214 } 6215 $elt= $next_elt; # just in case we need to loop 6216 } until( ! defined $elt 6217 || ! defined $cond 6218 || (defined $ind && ($elt->{gi} eq $ind)) # optimization 6219 || (defined $test_cond && ($test_cond->( $elt))) 6220 ); 6221 6222 return $elt; 6223 } 6224 6225# return the next_elt within the element 6226# just call next_elt with the element as first and second argument 6227sub first_descendant { return $_[0]->next_elt( @_); } 6228 6229# get the last descendant, # then return the element found or call prev_elt with the condition 6230sub last_descendant 6231 { my( $elt, $cond)= @_; 6232 my $last_descendant= $elt->_last_descendant; 6233 if( !$cond || $last_descendant->matches( $cond)) 6234 { return $last_descendant; } 6235 else 6236 { return $last_descendant->prev_elt( $elt, $cond); } 6237 } 6238 6239# no argument allowed here, just go down the last_child recursively 6240sub _last_descendant 6241 { my $elt= shift; 6242 while( my $child= $elt->{last_child}) { $elt= $child; } 6243 return $elt; 6244 } 6245 6246# counter-intuitive too: 6247# the previous element is found by looking 6248# for the first open tag backwards from the current one 6249# it's the last descendant of the previous sibling 6250# if it exists, otherwise it's simply the parent 6251sub prev_elt 6252 { my $elt= shift; 6253 my $subtree_root= 0; 6254 if( defined $_[0] and (ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'))) 6255 { $subtree_root= shift ; 6256 return undef if( $elt == $subtree_root); 6257 } 6258 my $cond= shift; 6259 # get prev elt 6260 my $prev_elt; 6261 do 6262 { return undef if( $elt == $subtree_root); 6263 if( $prev_elt= $elt->{prev_sibling}) 6264 { while( $prev_elt->{last_child}) 6265 { $prev_elt= $prev_elt->{last_child}; } 6266 } 6267 else 6268 { $prev_elt= $elt->{parent} || return undef; } 6269 $elt= $prev_elt; # in case we need to loop 6270 } until( $elt->passes( $cond)); 6271 6272 return $elt; 6273 } 6274 6275sub _following_elt 6276 { my( $elt)= @_; 6277 while( $elt && !$elt->{next_sibling}) 6278 { $elt= $elt->{parent}; } 6279 return $elt ? $elt->{next_sibling} : undef; 6280 } 6281 6282sub following_elt 6283 { my( $elt, $cond)= @_; 6284 $elt= $elt->_following_elt || return undef; 6285 return $elt if( !$cond || $elt->matches( $cond)); 6286 return $elt->next_elt( $cond); 6287 } 6288 6289sub following_elts 6290 { my( $elt, $cond)= @_; 6291 if( !$cond) { undef $cond; } 6292 my $following= $elt->following_elt( $cond); 6293 if( $following) 6294 { my @followings= $following; 6295 while( $following= $following->next_elt( $cond)) 6296 { push @followings, $following; } 6297 return( @followings); 6298 } 6299 else 6300 { return (); } 6301 } 6302 6303sub _preceding_elt 6304 { my( $elt)= @_; 6305 while( $elt && !$elt->{prev_sibling}) 6306 { $elt= $elt->{parent}; } 6307 return $elt ? $elt->{prev_sibling}->_last_descendant : undef; 6308 } 6309 6310sub preceding_elt 6311 { my( $elt, $cond)= @_; 6312 $elt= $elt->_preceding_elt || return undef; 6313 return $elt if( !$cond || $elt->matches( $cond)); 6314 return $elt->prev_elt( $cond); 6315 } 6316 6317sub preceding_elts 6318 { my( $elt, $cond)= @_; 6319 if( !$cond) { undef $cond; } 6320 my $preceding= $elt->preceding_elt( $cond); 6321 if( $preceding) 6322 { my @precedings= $preceding; 6323 while( $preceding= $preceding->prev_elt( $cond)) 6324 { push @precedings, $preceding; } 6325 return( @precedings); 6326 } 6327 else 6328 { return (); } 6329 } 6330 6331# used in get_xpath 6332sub _self 6333 { my( $elt, $cond)= @_; 6334 return $cond ? $elt->matches( $cond) : $elt; 6335 } 6336 6337sub next_n_elt 6338 { my $elt= shift; 6339 my $offset= shift || return undef; 6340 foreach (1..$offset) 6341 { $elt= $elt->next_elt( @_) || return undef; } 6342 return $elt; 6343 } 6344 6345# checks whether $elt is included in $ancestor, returns 1 in that case 6346sub in 6347 { my ($elt, $ancestor)= @_; 6348 if( ref( $ancestor) && isa( $ancestor, 'XML::Twig::Elt')) 6349 { # element 6350 while( $elt= $elt->{parent}) { return $elt if( $elt == $ancestor); } 6351 } 6352 else 6353 { # condition 6354 while( $elt= $elt->{parent}) { return $elt if( $elt->matches( $ancestor)); } 6355 } 6356 return 0; 6357 } 6358 6359sub first_child_text 6360 { my $elt= shift; 6361 my $dest=$elt->first_child(@_) or return ''; 6362 return $dest->text; 6363 } 6364 6365sub fields 6366 { my $elt= shift; 6367 return map { $elt->field( $_) } @_; 6368 } 6369 6370sub first_child_trimmed_text 6371 { my $elt= shift; 6372 my $dest=$elt->first_child(@_) or return ''; 6373 return $dest->trimmed_text; 6374 } 6375 6376sub first_child_matches 6377 { my $elt= shift; 6378 my $dest= $elt->{first_child} or return undef; 6379 return $dest->passes( @_); 6380 } 6381 6382sub last_child_text 6383 { my $elt= shift; 6384 my $dest=$elt->last_child(@_) or return ''; 6385 return $dest->text; 6386 } 6387 6388sub last_child_trimmed_text 6389 { my $elt= shift; 6390 my $dest=$elt->last_child(@_) or return ''; 6391 return $dest->trimmed_text; 6392 } 6393 6394sub last_child_matches 6395 { my $elt= shift; 6396 my $dest= $elt->{last_child} or return undef; 6397 return $dest->passes( @_); 6398 } 6399 6400sub child_text 6401 { my $elt= shift; 6402 my $dest=$elt->child(@_) or return ''; 6403 return $dest->text; 6404 } 6405 6406sub child_trimmed_text 6407 { my $elt= shift; 6408 my $dest=$elt->child(@_) or return ''; 6409 return $dest->trimmed_text; 6410 } 6411 6412sub child_matches 6413 { my $elt= shift; 6414 my $nb= shift; 6415 my $dest= $elt->child( $nb) or return undef; 6416 return $dest->passes( @_); 6417 } 6418 6419sub prev_sibling_text 6420 { my $elt= shift; 6421 my $dest=$elt->_prev_sibling(@_) or return ''; 6422 return $dest->text; 6423 } 6424 6425sub prev_sibling_trimmed_text 6426 { my $elt= shift; 6427 my $dest=$elt->_prev_sibling(@_) or return ''; 6428 return $dest->trimmed_text; 6429 } 6430 6431sub prev_sibling_matches 6432 { my $elt= shift; 6433 my $dest= $elt->{prev_sibling} or return undef; 6434 return $dest->passes( @_); 6435 } 6436 6437sub next_sibling_text 6438 { my $elt= shift; 6439 my $dest=$elt->next_sibling(@_) or return ''; 6440 return $dest->text; 6441 } 6442 6443sub next_sibling_trimmed_text 6444 { my $elt= shift; 6445 my $dest=$elt->next_sibling(@_) or return ''; 6446 return $dest->trimmed_text; 6447 } 6448 6449sub next_sibling_matches 6450 { my $elt= shift; 6451 my $dest= $elt->{next_sibling} or return undef; 6452 return $dest->passes( @_); 6453 } 6454 6455sub prev_elt_text 6456 { my $elt= shift; 6457 my $dest=$elt->prev_elt(@_) or return ''; 6458 return $dest->text; 6459 } 6460 6461sub prev_elt_trimmed_text 6462 { my $elt= shift; 6463 my $dest=$elt->prev_elt(@_) or return ''; 6464 return $dest->trimmed_text; 6465 } 6466 6467sub prev_elt_matches 6468 { my $elt= shift; 6469 my $dest= $elt->prev_elt or return undef; 6470 return $dest->passes( @_); 6471 } 6472 6473sub next_elt_text 6474 { my $elt= shift; 6475 my $dest=$elt->next_elt(@_) or return ''; 6476 return $dest->text; 6477 } 6478 6479sub next_elt_trimmed_text 6480 { my $elt= shift; 6481 my $dest=$elt->next_elt(@_) or return ''; 6482 return $dest->trimmed_text; 6483 } 6484 6485sub next_elt_matches 6486 { my $elt= shift; 6487 my $dest= $elt->next_elt or return undef; 6488 return $dest->passes( @_); 6489 } 6490 6491sub parent_text 6492 { my $elt= shift; 6493 my $dest=$elt->parent(@_) or return ''; 6494 return $dest->text; 6495 } 6496 6497sub parent_trimmed_text 6498 { my $elt= shift; 6499 my $dest=$elt->parent(@_) or return ''; 6500 return $dest->trimmed_text; 6501 } 6502 6503sub parent_matches 6504 { my $elt= shift; 6505 my $dest= $elt->{parent} or return undef; 6506 return $dest->passes( @_); 6507 } 6508 6509sub is_first_child 6510 { my $elt= shift; 6511 my $parent= $elt->{parent} or return 0; 6512 my $first_child= $parent->first_child( @_) or return 0; 6513 return ($first_child == $elt) ? $elt : 0; 6514 } 6515 6516sub is_last_child 6517 { my $elt= shift; 6518 my $parent= $elt->{parent} or return 0; 6519 my $last_child= $parent->last_child( @_) or return 0; 6520 return ($last_child == $elt) ? $elt : 0; 6521 } 6522 6523# returns the depth level of the element 6524# if 2 parameter are used then counts the 2cd element name in the 6525# ancestors list 6526sub level 6527 { my( $elt, $cond)= @_; 6528 my $level=0; 6529 my $name=shift || ''; 6530 while( $elt= $elt->{parent}) { $level++ if( !$cond || $elt->matches( $cond)); } 6531 return $level; 6532 } 6533 6534# checks whether $elt has an ancestor that satisfies $cond, returns the ancestor 6535sub in_context 6536 { my ($elt, $cond, $level)= @_; 6537 $level= -1 unless( $level) ; # $level-- will never hit 0 6538 6539 while( $level) 6540 { $elt= $elt->{parent} or return 0; 6541 if( $elt->matches( $cond)) { return $elt; } 6542 $level--; 6543 } 6544 return 0; 6545 } 6546 6547sub _descendants 6548 { my( $subtree_root, $include_self)= @_; 6549 my @descendants= $include_self ? ($subtree_root) : (); 6550 6551 my $elt= $subtree_root; 6552 my $next_elt; 6553 6554 MAIN: while( 1) 6555 { if( $next_elt= $elt->{first_child}) 6556 { # simplest case: the elt has a child 6557 } 6558 elsif( $next_elt= $elt->{next_sibling}) 6559 { # no child but a next sibling (just check we stay within the subtree) 6560 6561 # case where elt is subtree_root, is empty and has a sibling 6562 last MAIN if( $elt == $subtree_root); 6563 } 6564 else 6565 { # case where the element has no child and no next sibling: 6566 # get the first next sibling of an ancestor, checking subtree_root 6567 6568 # case where elt is subtree_root, is empty and has no sibling 6569 last MAIN if( $elt == $subtree_root); 6570 6571 # backtrack until we find a parent with a next sibling 6572 $next_elt= $elt->{parent} || last; 6573 until( $next_elt->{next_sibling}) 6574 { last MAIN if( $subtree_root == $next_elt); 6575 $next_elt= $next_elt->{parent} || last MAIN; 6576 } 6577 last MAIN if( $subtree_root == $next_elt); 6578 $next_elt= $next_elt->{next_sibling}; 6579 } 6580 $elt= $next_elt || last MAIN; 6581 push @descendants, $elt; 6582 } 6583 return @descendants; 6584 } 6585 6586 6587sub descendants 6588 { my( $subtree_root, $cond)= @_; 6589 my @descendants=(); 6590 my $elt= $subtree_root; 6591 6592 # this branch is pure optimization for speed: if $cond is a gi replace it 6593 # by the index of the gi and loop here 6594 # start optimization 6595 my $ind; 6596 if( !$cond || ( defined ( $ind= $XML::Twig::gi2index{$cond})) ) 6597 { 6598 my $next_elt; 6599 6600 while( 1) 6601 { if( $next_elt= $elt->{first_child}) 6602 { # simplest case: the elt has a child 6603 } 6604 elsif( $next_elt= $elt->{next_sibling}) 6605 { # no child but a next sibling (just check we stay within the subtree) 6606 6607 # case where elt is subtree_root, is empty and has a sibling 6608 last if( $subtree_root && ($elt == $subtree_root)); 6609 } 6610 else 6611 { # case where the element has no child and no next sibling: 6612 # get the first next sibling of an ancestor, checking subtree_root 6613 6614 # case where elt is subtree_root, is empty and has no sibling 6615 last if( $subtree_root && ($elt == $subtree_root)); 6616 6617 # backtrack until we find a parent with a next sibling 6618 $next_elt= $elt->{parent} || last undef; 6619 until( $next_elt->{next_sibling}) 6620 { last if( $subtree_root && ($subtree_root == $next_elt)); 6621 $next_elt= $next_elt->{parent} || last; 6622 } 6623 last if( $subtree_root && ($subtree_root == $next_elt)); 6624 $next_elt= $next_elt->{next_sibling}; 6625 } 6626 $elt= $next_elt || last; 6627 push @descendants, $elt if( !$cond || ($elt->{gi} eq $ind)); 6628 } 6629 } 6630 else 6631 # end optimization 6632 { # branch for a complex condition: use the regular (slow but simple) way 6633 while( $elt= $elt->next_elt( $subtree_root, $cond)) 6634 { push @descendants, $elt; } 6635 } 6636 return @descendants; 6637 } 6638 6639 6640sub descendants_or_self 6641 { my( $elt, $cond)= @_; 6642 my @descendants= $elt->passes( $cond) ? ($elt) : (); 6643 push @descendants, $elt->descendants( $cond); 6644 return @descendants; 6645 } 6646 6647sub sibling 6648 { my $elt= shift; 6649 my $nb= shift; 6650 if( $nb > 0) 6651 { foreach( 1..$nb) 6652 { $elt= $elt->next_sibling( @_) or return undef; } 6653 } 6654 elsif( $nb < 0) 6655 { foreach( 1..(-$nb)) 6656 { $elt= $elt->prev_sibling( @_) or return undef; } 6657 } 6658 else # $nb == 0 6659 { return $elt->passes( $_[0]); } 6660 return $elt; 6661 } 6662 6663sub sibling_text 6664 { my $elt= sibling( @_); 6665 return $elt ? $elt->text : undef; 6666 } 6667 6668 6669sub child 6670 { my $elt= shift; 6671 my $nb= shift; 6672 if( $nb >= 0) 6673 { $elt= $elt->first_child( @_) or return undef; 6674 foreach( 1..$nb) 6675 { $elt= $elt->next_sibling( @_) or return undef; } 6676 } 6677 else 6678 { $elt= $elt->last_child( @_) or return undef; 6679 foreach( 2..(-$nb)) 6680 { $elt= $elt->prev_sibling( @_) or return undef; } 6681 } 6682 return $elt; 6683 } 6684 6685sub prev_siblings 6686 { my $elt= shift; 6687 my @siblings=(); 6688 while( $elt= $elt->prev_sibling( @_)) 6689 { unshift @siblings, $elt; } 6690 return @siblings; 6691 } 6692 6693sub siblings 6694 { my $elt= shift; 6695 return grep { $_ ne $elt } $elt->{parent}->children( @_); 6696 } 6697 6698sub pos 6699 { my $elt= shift; 6700 return 0 if ($_[0] && !$elt->matches( @_)); 6701 my $pos=1; 6702 $pos++ while( $elt= $elt->prev_sibling( @_)); 6703 return $pos; 6704 } 6705 6706 6707sub next_siblings 6708 { my $elt= shift; 6709 my @siblings=(); 6710 while( $elt= $elt->next_sibling( @_)) 6711 { push @siblings, $elt; } 6712 return @siblings; 6713 } 6714 6715 6716# used by get_xpath: parses the xpath expression and generates a sub that performs the 6717# search 6718{ my %axis2method; 6719 BEGIN { %axis2method= ( child => 'children', 6720 descendant => 'descendants', 6721 'descendant-or-self' => 'descendants_or_self', 6722 parent => 'parent_is', 6723 ancestor => 'ancestors', 6724 'ancestor-or-self' => 'ancestors_or_self', 6725 'following-sibling' => 'next_siblings', 6726 'preceding-sibling' => 'prev_siblings', 6727 following => 'following_elts', 6728 preceding => 'preceding_elts', 6729 self => '_self', 6730 ); 6731 } 6732 6733 sub _install_xpath 6734 { my( $xpath_exp, $type)= @_; 6735 my $original_exp= $xpath_exp; 6736 my $sub= 'my $elt= shift; my @results;'; 6737 6738 # grab the root if expression starts with a / 6739 if( $xpath_exp=~ s{^/}{}) 6740 { $sub .= '@results= ($elt->twig) || croak "cannot use an XPath query starting with a / on a node not attached to a whole twig";'; } 6741 elsif( $xpath_exp=~ s{^\./}{}) 6742 { $sub .= '@results= ($elt);'; } 6743 else 6744 { $sub .= '@results= ($elt);'; } 6745 6746 6747 #warn "xpath_exp= '$xpath_exp'\n"; 6748 6749 while( $xpath_exp && 6750 $xpath_exp=~s{^\s*(/?) 6751 # the xxx=~/regexp/ is a pain as it includes / 6752 (\s*(?:(?:($REG_AXIS)::)?(\*|$REG_TAG_PART|\.\.|\.)\s*)?($REG_PREDICATE_ALT*) 6753 ) 6754 (/|$)}{}xo) 6755 6756 { my( $wildcard, $sub_exp, $axis, $gi, $predicates)= ($1, $2, $3, $4, $5); 6757 if( $axis && ! $gi) 6758 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp"); } 6759 6760 # grab a parent 6761 if( $sub_exp eq '..') 6762 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp") if( $wildcard); 6763 $sub .= '@results= map { $_->{parent}} @results;'; 6764 } 6765 # test the element itself 6766 elsif( $sub_exp=~ m{^\.(.*)$}s) 6767 { $sub .= "\@results= grep { \$_->matches( q{$1}) } \@results;" } 6768 # grab children 6769 else 6770 { 6771 if( !$axis) 6772 { $axis= $wildcard ? 'descendant' : 'child'; } 6773 if( !$gi or $gi eq '*') { $gi=''; } 6774 my $function; 6775 6776 # "special" predicates, that return just one element 6777 if( $predicates && ($predicates =~ m{^\s*\[\s*((-\s*)?\d+)\s*\]\s*$})) 6778 { # [<nb>] 6779 my $offset= $1; 6780 $offset-- if( $offset > 0); 6781 $function= $axis eq 'descendant' ? "next_n_elt( $offset, '$gi')" 6782 : $axis eq 'child' ? "child( $offset, '$gi')" 6783 : _croak_and_doublecheck_xpath( $original_exp, "error [$1] not supported along axis '$axis'") 6784 ; 6785 $sub .= "\@results= grep { \$_ } map { \$_->$function } \@results;" 6786 } 6787 elsif( $predicates && ($predicates =~ m{^\s*\[\s*last\s*\(\s*\)\s*\]\s*$}) ) 6788 { # last() 6789 _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp, usage of // and last() not supported") if( $wildcard); 6790 $sub .= "\@results= map { \$_->last_child( '$gi') } \@results;"; 6791 } 6792 else 6793 { # follow the axis 6794 #warn "axis: '$axis' - method: '$axis2method{$axis}' - gi: '$gi'\n"; 6795 6796 my $follow_axis= " \$_->$axis2method{$axis}( '$gi')"; 6797 my $step= $follow_axis; 6798 6799 # now filter using the predicate 6800 while( $predicates=~ s{^\s*($REG_PREDICATE_ALT)\s*}{}o) 6801 { my $pred= $1; 6802 $pred=~ s{^\s*\[\s*}{}; 6803 $pred=~ s{\s*\]\s*$}{}; 6804 my $test=""; 6805 my $pos; 6806 if( $pred=~ m{^(-?\s*\d+)$}) 6807 { my $pos= $1; 6808 if( $step=~ m{^\s*grep(.*) (\$_->\w+\(\s*'[^']*'\s*\))}) 6809 { $step= "XML::Twig::_first_n $1 $pos, $2"; } 6810 else 6811 { if( $pos > 0) { $pos--; } 6812 $step= "($step)[$pos]"; 6813 } 6814 #warn "number predicate '$pos' - generated step '$step'\n"; 6815 } 6816 else 6817 { my $syntax_error=0; 6818 do 6819 { if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_STRING)\s*}{}o) # string()="string" pred 6820 { $test .= "\$_->text eq $1"; } 6821 elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_STRING)\s*}{}o) # string()!="string" pred 6822 { $test .= "\$_->text ne $1"; } 6823 if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_NUMBER)\s*}{}o) # string()=<number> pred 6824 { $test .= "\$_->text eq $1"; } 6825 elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_NUMBER)\s*}{}o) # string()!=<number> pred 6826 { $test .= "\$_->text ne $1"; } 6827 elsif( $pred =~ s{^string\(\s*\)\s*(>|<|>=|<=)\s*($REG_NUMBER)\s*}{}o) # string()!=<number> pred 6828 { $test .= "\$_->text $1 $2"; } 6829 6830 elsif( $pred =~ s{^string\(\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # string()=~/regex/ pred 6831 { my( $match, $regexp)= ($1, $2); 6832 $test .= "\$_->text $match $regexp"; 6833 } 6834 elsif( $pred =~ s{^string\(\s*\)\s*}{}o) # string() pred 6835 { $test .= "\$_->text"; } 6836 elsif( $pred=~ s{^@($REG_TAG_NAME)\s*($REG_OP)\s*($REG_STRING|$REG_NUMBER)}{}o) # @att="val" pred 6837 { my( $att, $oper, $val)= ($1, _op( $2), $3); 6838 $test .= qq{((defined \$_->{'att'}->{"$att"}) && (\$_->{'att'}->{"$att"} $oper $val))}; 6839 } 6840 elsif( $pred =~ s{^@($REG_TAG_NAME)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o) # @att=~/regex/ pred XXX 6841 { my( $att, $match, $regexp)= ($1, $2, $3); 6842 $test .= qq{((defined \$_->{'att'}->{"$att"}) && (\$_->{'att'}->{"$att"} $match $regexp))};; 6843 } 6844 elsif( $pred=~ s{^@($REG_TAG_NAME)\s*}{}o) # @att pred 6845 { $test .= qq{(defined \$_->{'att'}->{"$1"})}; } 6846 elsif( $pred=~ s{^\s*(?:not|!)\s*@($REG_TAG_NAME)\s*}{}o) # not @att pred 6847 { $test .= qq{((\$_->is_elt) && (not defined \$_->{'att'}->{"$1"}))}; } 6848 elsif( $pred=~ s{^\s*([()])}{}) # ( or ) (just add to the test) 6849 { $test .= qq{$1}; } 6850 elsif( $pred=~ s{^\s*(and|or)\s*}{}) 6851 { $test .= lc " $1 "; } 6852 else 6853 { $syntax_error=1; } 6854 6855 } while( !$syntax_error && $pred); 6856 _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp at $pred") if( $pred); 6857 $step= " grep { $test } $step "; 6858 } 6859 } 6860 #warn "step: '$step'"; 6861 $sub .= "\@results= grep { \$_ } map { $step } \@results;"; 6862 } 6863 } 6864 } 6865 6866 if( $xpath_exp) 6867 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp around $xpath_exp"); } 6868 6869 $sub .= q{return XML::Twig::_unique_elts( @results); }; 6870 #warn "generated: '$sub'\n"; 6871 my $s= eval "sub { $NO_WARNINGS; $sub }"; 6872 if( $@) 6873 { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp ($@);") } 6874 return( $s); 6875 } 6876} 6877 6878sub _croak_and_doublecheck_xpath 6879 { my $xpath_expression= shift; 6880 my $mess= join( "\n", @_); 6881 if( $XML::Twig::XPath::VERSION || 0) 6882 { my $check_twig= XML::Twig::XPath->new; 6883 if( eval { $check_twig->{twig_xp}->_parse( $xpath_expression) }) 6884 { $mess .= "\nthe expression is a valid XPath statement, and you are using XML::Twig::XPath, but" 6885 . "\nyou are using either 'find_nodes' or 'get_xpath' where the method you likely wanted" 6886 . "\nto use is 'findnodes', which is the only one that uses the full XPath engine\n"; 6887 } 6888 } 6889 croak $mess; 6890 } 6891 6892 6893 6894{ # extremely elaborate caching mechanism 6895 my %xpath; # xpath_expression => subroutine_code; 6896 sub get_xpath 6897 { my( $elt, $xpath_exp, $offset)= @_; 6898 my $sub= ($xpath{$xpath_exp} ||= _install_xpath( $xpath_exp)); 6899 return $sub->( $elt) unless( defined $offset); 6900 my @res= $sub->( $elt); 6901 return $res[$offset]; 6902 } 6903} 6904 6905 6906sub findvalues 6907 { my $elt= shift; 6908 return map { $_->text } $elt->get_xpath( @_); 6909 } 6910 6911sub findvalue 6912 { my $elt= shift; 6913 return join '', map { $_->text } $elt->get_xpath( @_); 6914 } 6915 6916 6917# XML::XPath compatibility 6918sub getElementById { return $_[0]->twig->elt_id( $_[1]); } 6919sub getChildNodes { my @children= do { my $elt= $_[0]; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; return wantarray ? @children : \@children; } 6920 6921sub _flushed { return $_[0]->{flushed}; } 6922sub _set_flushed { $_[0]->{flushed}=1; } 6923sub _del_flushed { delete $_[0]->{flushed}; } 6924 6925sub cut 6926 { my $elt= shift; 6927 my( $parent, $prev_sibling, $next_sibling); 6928 $parent= $elt->{parent}; 6929 my $a= $elt->{'att'}->{'a'} || 'na'; 6930 if( ! $parent && $elt->is_elt) 6931 { # are we cutting the root? 6932 my $t= $elt->{twig}; 6933 if( $t && ! $t->{twig_parsing}) 6934 { delete $t->{twig_root}; 6935 delete $elt->{twig}; 6936 return $elt; 6937 } # cutt`ing the root 6938 else 6939 { return; } # cutting an orphan, returning $elt would break backward compatibility 6940 } 6941 6942 # save the old links, that'll make it easier for some loops 6943 foreach my $link ( qw(parent prev_sibling next_sibling) ) 6944 { $elt->{former}->{$link}= $elt->{$link}; 6945 if( $XML::Twig::weakrefs) { weaken( $elt->{former}->{$link}); } 6946 } 6947 6948 # if we cut the current element then its parent becomes the current elt 6949 if( $elt->{twig_current}) 6950 { my $twig_current= $elt->{parent}; 6951 $elt->twig->{twig_current}= $twig_current; 6952 $twig_current->{'twig_current'}=1; 6953 delete $elt->{'twig_current'}; 6954 } 6955 6956 if( $parent->{first_child} && $parent->{first_child} == $elt) 6957 { $parent->{first_child}= $elt->{next_sibling}; 6958 # cutting can make the parent empty 6959 if( ! $parent->{first_child}) { $parent->{empty}= 1; } 6960 } 6961 6962 if( $parent->{last_child} && $parent->{last_child} == $elt) 6963 { $parent->{empty}=0; $parent->{last_child}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 6964 } 6965 6966 if( $prev_sibling= $elt->{prev_sibling}) 6967 { $prev_sibling->{next_sibling}= $elt->{next_sibling}; } 6968 if( $next_sibling= $elt->{next_sibling}) 6969 { $next_sibling->{prev_sibling}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 6970 6971 6972 $elt->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 6973 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 6974 $elt->{next_sibling}= undef; 6975 6976 # merge 2 (now) consecutive text nodes if they are of the same type 6977 # (type can be PCDATA or CDATA) 6978 if( $prev_sibling && $next_sibling && $prev_sibling->is_text && ( $XML::Twig::index2gi[$prev_sibling->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}])) 6979 { $prev_sibling->merge_text( $next_sibling); } 6980 6981 return $elt; 6982 } 6983 6984 6985sub former_next_sibling { return $_[0]->{former}->{next_sibling}; } 6986sub former_prev_sibling { return $_[0]->{former}->{prev_sibling}; } 6987sub former_parent { return $_[0]->{former}->{parent}; } 6988 6989sub cut_children 6990 { my( $elt, $exp)= @_; 6991 my @children= $elt->children( $exp); 6992 foreach (@children) { $_->cut; } 6993 if( ! $elt->has_children) { $elt->{empty}= 1; } 6994 return @children; 6995 } 6996 6997sub cut_descendants 6998 { my( $elt, $exp)= @_; 6999 my @descendants= $elt->descendants( $exp); 7000 foreach ($elt->descendants( $exp)) { $_->cut; } 7001 if( ! $elt->has_children) { $elt->{empty}= 1; } 7002 return @descendants; 7003 } 7004 7005 7006 7007sub erase 7008 { my $elt= shift; 7009 #you cannot erase the current element 7010 if( $elt->{twig_current}) 7011 { croak "trying to erase an element before it has been completely parsed"; } 7012 unless( $elt->{parent}) 7013 { # trying to erase the root (of a twig or of a cut/new element) 7014 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7015 unless( @children == 1) 7016 { croak "can only erase an element with no parent if it has a single child"; } 7017 $elt->_move_extra_data_after_erase; 7018 my $child= shift @children; 7019 $child->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ; 7020 my $twig= $elt->twig; 7021 $twig->set_root( $child); 7022 } 7023 else 7024 { # normal case 7025 $elt->_move_extra_data_after_erase; 7026 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7027 if( @children) 7028 { # elt has children, move them up 7029 7030 my $first_child= $elt->{first_child}; 7031 my $prev_sibling=$elt->{prev_sibling}; 7032 if( $prev_sibling) 7033 { # connect first child to previous sibling 7034 $first_child->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $first_child->{prev_sibling});} ; 7035 $prev_sibling->{next_sibling}= $first_child; 7036 } 7037 else 7038 { # elt was the first child 7039 $elt->{parent}->set_first_child( $first_child); 7040 } 7041 7042 my $last_child= $elt->{last_child}; 7043 my $next_sibling= $elt->{next_sibling}; 7044 if( $next_sibling) 7045 { # connect last child to next sibling 7046 $last_child->{next_sibling}= $next_sibling; 7047 $next_sibling->{prev_sibling}=$last_child; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 7048 } 7049 else 7050 { # elt was the last child 7051 $elt->{parent}->set_last_child( $last_child); 7052 } 7053 # update parent for all siblings 7054 foreach my $child (@children) 7055 { $child->{parent}=$elt->{parent}; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ; } 7056 7057 # merge consecutive text elements if need be 7058 if( $prev_sibling && $prev_sibling->is_text && ($XML::Twig::index2gi[$first_child->{'gi'}] eq $XML::Twig::index2gi[$prev_sibling->{'gi'}]) ) 7059 { $prev_sibling->merge_text( $first_child); } 7060 if( $next_sibling && $next_sibling->is_text && ($XML::Twig::index2gi[$last_child->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}]) ) 7061 { $last_child->merge_text( $next_sibling); } 7062 7063 # if parsing and have now a PCDATA text, mark so we can normalize later on if need be 7064 if( $elt->{parent}->{twig_current} && $elt->{last_child}->is_text) { $elt->{parent}->{twig_to_be_normalized}=1; } 7065 7066 # elt is not referenced any more, so it will be DESTROYed 7067 # so we'd better break the links to its children ## FIX 7068 undef $elt->{first_child}; 7069 undef $elt->{last_child}; 7070 undef $elt->{parent}; 7071 undef $elt->{next_sibling}; 7072 undef $elt->{prev_sibling}; 7073 7074 } 7075 { # elt had no child, delete it 7076 $elt->delete; 7077 } 7078 7079 } 7080 return $elt; 7081 7082 } 7083 7084sub _move_extra_data_after_erase 7085 { my( $elt)= @_; 7086 # extra_data 7087 if( my $extra_data= $elt->{extra_data}) 7088 { my $target= $elt->{first_child} || $elt->{next_sibling}; 7089 if( $target) 7090 { 7091 if( $target->is( $ELT)) 7092 { $target->set_extra_data( $extra_data . ($target->extra_data || '')); } 7093 elsif( $target->is( $TEXT)) 7094 { $target->_unshift_extra_data_in_pcdata( $extra_data, 0); } # TO CHECK 7095 } 7096 else 7097 { my $parent= $elt->{parent}; # always exists or the erase cannot be performed 7098 $parent->_prefix_extra_data_before_end_tag( $extra_data); 7099 } 7100 } 7101 7102 # extra_data_before_end_tag 7103 if( my $extra_data= $elt->{extra_data_before_end_tag}) 7104 { if( my $target= $elt->{next_sibling}) 7105 { if( $target->is( $ELT)) 7106 { $target->set_extra_data( $extra_data . ($target->extra_data || '')); } 7107 elsif( $target->is( $TEXT)) 7108 { 7109 $target->_unshift_extra_data_in_pcdata( $extra_data, 0); 7110 } 7111 } 7112 elsif( my $parent= $elt->{parent}) 7113 { $parent->_prefix_extra_data_before_end_tag( $extra_data); } 7114 } 7115 7116 return $elt; 7117 7118 } 7119BEGIN 7120 { my %method= ( before => \&paste_before, 7121 after => \&paste_after, 7122 first_child => \&paste_first_child, 7123 last_child => \&paste_last_child, 7124 within => \&paste_within, 7125 ); 7126 7127 # paste elt somewhere around ref 7128 # pos can be first_child (default), last_child, before, after or within 7129 sub paste ## no critic (Subroutines::ProhibitNestedSubs); 7130 { my $elt= shift; 7131 if( $elt->{parent}) 7132 { croak "cannot paste an element that belongs to a tree"; } 7133 my $pos; 7134 my $ref; 7135 if( ref $_[0]) 7136 { $pos= 'first_child'; 7137 croak "wrong argument order in paste, should be $_[1] first" if($_[1]); 7138 } 7139 else 7140 { $pos= shift; } 7141 7142 if( my $method= $method{$pos}) 7143 { 7144 unless( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')) 7145 { if( ! defined( $_[0])) 7146 { croak "missing target in paste"; } 7147 elsif( ! ref( $_[0])) 7148 { croak "wrong target type in paste (not a reference), should be XML::Twig::Elt or a subclass"; } 7149 else 7150 { my $ref= ref $_[0]; 7151 croak "wrong target type in paste: '$ref', should be XML::Twig::Elt or a subclass"; 7152 } 7153 } 7154 $ref= $_[0]; 7155 # check here so error message lists the caller file/line 7156 if( !$ref->{parent} && ($pos=~ m{^(before|after)$}) && !(exists $elt->{'target'}) && !(exists $elt->{'comment'})) 7157 { croak "cannot paste $1 root"; } 7158 $elt->$method( @_); 7159 } 7160 else 7161 { croak "tried to paste in wrong position '$pos', allowed positions " . 7162 " are 'first_child', 'last_child', 'before', 'after' and " . 7163 "'within'"; 7164 } 7165 if( (my $ids= $elt->{twig_id_list}) && (my $t= $ref->twig) ) 7166 { $t->{twig_id_list}||={}; 7167 foreach my $id (keys %$ids) 7168 { $t->{twig_id_list}->{$id}= $ids->{$id}; 7169 if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); } 7170 } 7171 } 7172 return $elt; 7173 } 7174 7175 7176 sub paste_before 7177 { my( $elt, $ref)= @_; 7178 my( $parent, $prev_sibling, $next_sibling ); 7179 7180 # trying to paste before an orphan (root or detached wlt) 7181 unless( $ref->{parent}) 7182 { if( my $t= $ref->twig) 7183 { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this 7184 { $t->_add_cpi_outside_of_root( leading_cpi => $elt); return; } 7185 else 7186 { croak "cannot paste before root"; } 7187 } 7188 else 7189 { croak "cannot paste before an orphan element"; } 7190 } 7191 $parent= $ref->{parent}; 7192 $prev_sibling= $ref->{prev_sibling}; 7193 $next_sibling= $ref; 7194 7195 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7196 if( $parent->{first_child} == $ref) { $parent->{first_child}= $elt; } 7197 7198 if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; } 7199 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7200 7201 $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 7202 $elt->{next_sibling}= $ref; 7203 return $elt; 7204 } 7205 7206 sub paste_after 7207 { my( $elt, $ref)= @_; 7208 my( $parent, $prev_sibling, $next_sibling ); 7209 7210 # trying to paste after an orphan (root or detached wlt) 7211 unless( $ref->{parent}) 7212 { if( my $t= $ref->twig) 7213 { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this 7214 { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); return; } 7215 else 7216 { croak "cannot paste after root"; } 7217 } 7218 else 7219 { croak "cannot paste after an orphan element"; } 7220 } 7221 $parent= $ref->{parent}; 7222 $prev_sibling= $ref; 7223 $next_sibling= $ref->{next_sibling}; 7224 7225 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7226 if( $parent->{last_child}== $ref) { $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 7227 7228 $prev_sibling->{next_sibling}= $elt; 7229 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7230 7231 if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 7232 $elt->{next_sibling}= $next_sibling; 7233 return $elt; 7234 7235 } 7236 7237 sub paste_first_child 7238 { my( $elt, $ref)= @_; 7239 my( $parent, $prev_sibling, $next_sibling ); 7240 $parent= $ref; 7241 $next_sibling= $ref->{first_child}; 7242 7243 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7244 $parent->{first_child}= $elt; 7245 unless( $parent->{last_child}) { $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 7246 7247 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7248 7249 if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; } 7250 $elt->{next_sibling}= $next_sibling; 7251 return $elt; 7252 } 7253 7254 sub paste_last_child 7255 { my( $elt, $ref)= @_; 7256 my( $parent, $prev_sibling, $next_sibling ); 7257 $parent= $ref; 7258 $prev_sibling= $ref->{last_child}; 7259 7260 $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 7261 $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; 7262 unless( $parent->{first_child}) { $parent->{first_child}= $elt; } 7263 7264 $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 7265 if( $prev_sibling) { $prev_sibling->{next_sibling}= $elt; } 7266 7267 $elt->{next_sibling}= undef; 7268 return $elt; 7269 } 7270 7271 sub paste_within 7272 { my( $elt, $ref, $offset)= @_; 7273 my $text= $ref->is_text ? $ref : $ref->next_elt( $TEXT, $ref); 7274 my $new= $text->split_at( $offset); 7275 $elt->paste_before( $new); 7276 return $elt; 7277 } 7278 } 7279 7280# load an element into a structure similar to XML::Simple's 7281sub simplify 7282 { my $elt= shift; 7283 7284 # normalize option names 7285 my %options= @_; 7286 %options= map { my ($key, $val)= ($_, $options{$_}); 7287 $key=~ s{(\w)([A-Z])}{$1_\L$2}g; 7288 $key => $val 7289 } keys %options; 7290 7291 # check options 7292 my @allowed_options= qw( keyattr forcearray noattr content_key 7293 var var_regexp variables var_attr 7294 group_tags forcecontent 7295 normalise_space normalize_space 7296 ); 7297 my %allowed_options= map { $_ => 1 } @allowed_options; 7298 foreach my $option (keys %options) 7299 { carp "invalid option $option\n" unless( $allowed_options{$option}); } 7300 7301 $options{normalise_space} ||= $options{normalize_space} || 0; 7302 7303 $options{content_key} ||= 'content'; 7304 if( $options{content_key}=~ m{^-}) 7305 { # need to remove the - and to activate extra folding 7306 $options{content_key}=~ s{^-}{}; 7307 $options{extra_folding}= 1; 7308 } 7309 else 7310 { $options{extra_folding}= 0; } 7311 7312 $options{forcearray} ||=0; 7313 if( isa( $options{forcearray}, 'ARRAY')) 7314 { my %forcearray_tags= map { $_ => 1 } @{$options{forcearray}}; 7315 $options{forcearray_tags}= \%forcearray_tags; 7316 $options{forcearray}= 0; 7317 } 7318 7319 $options{keyattr} ||= ['name', 'key', 'id']; 7320 if( ref $options{keyattr} eq 'ARRAY') 7321 { foreach my $keyattr (@{$options{keyattr}}) 7322 { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)}); 7323 $prefix ||= ''; 7324 $options{key_for_all}->{$att}= 1; 7325 $options{remove_key_for_all}->{$att}=1 unless( $prefix eq '+'); 7326 $options{prefix_key_for_all}->{$att}=1 if( $prefix eq '-'); 7327 } 7328 } 7329 elsif( ref $options{keyattr} eq 'HASH') 7330 { while( my( $elt, $keyattr)= each %{$options{keyattr}}) 7331 { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)}); 7332 $prefix ||=''; 7333 $options{key_for_elt}->{$elt}= $att; 7334 $options{remove_key_for_elt}->{"$elt#$att"}=1 unless( $prefix); 7335 $options{prefix_key_for_elt}->{"$elt#$att"}=1 if( $prefix eq '-'); 7336 } 7337 } 7338 7339 7340 $options{var}||= $options{var_attr}; # for compat with XML::Simple 7341 if( $options{var}) { $options{var_values}= {}; } 7342 else { $options{var}=''; } 7343 7344 if( $options{variables}) 7345 { $options{var}||= 1; 7346 $options{var_values}= $options{variables}; 7347 } 7348 7349 if( $options{var_regexp} and !$options{var}) 7350 { warn "var option not used, var_regexp option ignored\n"; } 7351 $options{var_regexp} ||= '\$\{?(\w+)\}?'; 7352 7353 $elt->_simplify( \%options); 7354 7355 } 7356 7357sub _simplify 7358 { my( $elt, $options)= @_; 7359 7360 my $data; 7361 7362 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 7363 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7364 my %atts= $options->{noattr} || !$elt->{att} ? () : %{$elt->{att}}; 7365 my $nb_atts= keys %atts; 7366 my $nb_children= $elt->children_count + $nb_atts; 7367 7368 my %nb_children; 7369 foreach (@children) { $nb_children{$_->tag}++; } 7370 foreach (keys %atts) { $nb_children{$_}++; } 7371 7372 my $arrays; # tag => array where elements are stored 7373 7374 7375 # store children 7376 foreach my $child (@children) 7377 { if( $child->is_text) 7378 { # generate with a content key 7379 my $text= $elt->_text_with_vars( $options); 7380 if( $options->{normalise_space} >= 2) { $text= _normalize_space( $text); } 7381 if( $options->{force_content} 7382 || $nb_atts 7383 || (scalar @children > 1) 7384 ) 7385 { $data->{$options->{content_key}}= $text; } 7386 else 7387 { $data= $text; } 7388 } 7389 else 7390 { # element with sub-elements 7391 my $child_gi= $XML::Twig::index2gi[$child->{'gi'}]; 7392 7393 my $child_data= $child->_simplify( $options); 7394 7395 # first see if we need to simplify further the child data 7396 # simplify because of grouped tags 7397 if( my $grouped_tag= $options->{group_tags}->{$child_gi}) 7398 { # check that the child data is a hash with a single field 7399 unless( (ref( $child_data) eq 'HASH') 7400 && (keys %$child_data == 1) 7401 && defined ( my $grouped_child_data= $child_data->{$grouped_tag}) 7402 ) 7403 { croak "error in grouped tag $child_gi"; } 7404 else 7405 { $child_data= $grouped_child_data; } 7406 } 7407 # simplify because of extra folding 7408 if( $options->{extra_folding}) 7409 { if( (ref( $child_data) eq 'HASH') 7410 && (keys %$child_data == 1) 7411 && defined( my $content= $child_data->{$options->{content_key}}) 7412 ) 7413 { $child_data= $content; } 7414 } 7415 7416 if( my $keyatt= $child->_key_attr( $options)) 7417 { # simplify element with key 7418 my $key= $child->{'att'}->{$keyatt}; 7419 if( $options->{normalise_space} >= 1) { $key= _normalize_space( $key); } 7420 $data->{$child_gi}->{$key}= $child_data; 7421 } 7422 elsif( $options->{forcearray} 7423 || $options->{forcearray_tags}->{$child_gi} 7424 || ( $nb_children{$child_gi} > 1) 7425 ) 7426 { # simplify element to store in an array 7427 $data->{$child_gi} ||= []; 7428 push @{$data->{$child_gi}}, $child_data; 7429 } 7430 else 7431 { # simplify element to store as a hash field 7432 $data->{$child_gi}= $child_data; 7433 } 7434 } 7435 } 7436 7437 # store atts 7438 # TODO: deal with att that already have an element by that name 7439 foreach my $att (keys %atts) 7440 { # do not store if the att is a key that needs to be removed 7441 if( $options->{remove_key_for_all}->{$att} 7442 || $options->{remove_key_for_elt}->{"$gi#$att"} 7443 ) 7444 { next; } 7445 7446 my $att_text= $options->{var} ? _replace_vars_in_text( $atts{$att}, $options) : $atts{$att} ; 7447 if( $options->{normalise_space} >= 2) { $att_text= _normalize_space( $att_text); } 7448 7449 if( $options->{prefix_key_for_all}->{$att} 7450 || $options->{prefix_key_for_elt}->{"$gi#$att"} 7451 ) 7452 { # prefix the att 7453 $data->{"-$att"}= $att_text; 7454 } 7455 else 7456 { # normal case 7457 $data->{$att}= $att_text; 7458 } 7459 } 7460 7461 return $data; 7462 } 7463 7464sub _key_attr 7465 { my( $elt, $options)=@_; 7466 return if( $options->{noattr}); 7467 if( $options->{key_for_all}) 7468 { foreach my $att ($elt->att_names) 7469 { if( $options->{key_for_all}->{$att}) 7470 { return $att; } 7471 } 7472 } 7473 elsif( $options->{key_for_elt}) 7474 { if( my $key_for_elt= $options->{key_for_elt}->{$XML::Twig::index2gi[$elt->{'gi'}]} ) 7475 { return $key_for_elt if( defined( $elt->{'att'}->{$key_for_elt})); } 7476 } 7477 return; 7478 } 7479 7480sub _text_with_vars 7481 { my( $elt, $options)= @_; 7482 my $text; 7483 if( $options->{var}) 7484 { $text= _replace_vars_in_text( $elt->text, $options); 7485 $elt->_store_var( $options); 7486 } 7487 else 7488 { $text= $elt->text; } 7489 return $text; 7490 } 7491 7492 7493sub _normalize_space 7494 { my $text= shift; 7495 $text=~ s{\s+}{ }sg; 7496 $text=~ s{^\s}{}; 7497 $text=~ s{\s$}{}; 7498 return $text; 7499 } 7500 7501 7502sub att_nb 7503 { return 0 unless( my $atts= $_[0]->{att}); 7504 return scalar keys %$atts; 7505 } 7506 7507sub has_no_atts 7508 { return 1 unless( my $atts= $_[0]->{att}); 7509 return scalar keys %$atts ? 0 : 1; 7510 } 7511 7512sub _replace_vars_in_text 7513 { my( $text, $options)= @_; 7514 7515 $text=~ s{($options->{var_regexp})} 7516 { if( defined( my $value= $options->{var_values}->{$2})) 7517 { $value } 7518 else 7519 { warn "unknown variable $2\n"; 7520 $1 7521 } 7522 }gex; 7523 return $text; 7524 } 7525 7526sub _store_var 7527 { my( $elt, $options)= @_; 7528 if( defined (my $var_name= $elt->{'att'}->{$options->{var}})) 7529 { $options->{var_values}->{$var_name}= $elt->text; 7530 } 7531 } 7532 7533 7534# split a text element at a given offset 7535sub split_at 7536 { my( $elt, $offset)= @_; 7537 my $text_elt= $elt->is_text ? $elt : $elt->first_child( $TEXT) || return ''; 7538 my $string= $text_elt->text; 7539 my $left_string= substr( $string, 0, $offset); 7540 my $right_string= substr( $string, $offset); 7541 $text_elt->{pcdata}= (delete $text_elt->{empty} || 1) && $left_string; 7542 my $new_elt= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}], $right_string); 7543 $new_elt->paste( after => $elt); 7544 return $new_elt; 7545 } 7546 7547 7548# split an element or its text descendants into several, in place 7549# all elements (new and untouched) are returned 7550sub split 7551 { my $elt= shift; 7552 my @text_chunks; 7553 my @result; 7554 if( $elt->is_text) { @text_chunks= ($elt); } 7555 else { @text_chunks= $elt->descendants( $TEXT); } 7556 foreach my $text_chunk (@text_chunks) 7557 { push @result, $text_chunk->_split( 1, @_); } 7558 return @result; 7559 } 7560 7561# split an element or its text descendants into several, in place 7562# created elements (those which match the regexp) are returned 7563sub mark 7564 { my $elt= shift; 7565 my @text_chunks; 7566 my @result; 7567 if( $elt->is_text) { @text_chunks= ($elt); } 7568 else { @text_chunks= $elt->descendants( $TEXT); } 7569 foreach my $text_chunk (@text_chunks) 7570 { push @result, $text_chunk->_split( 0, @_); } 7571 return @result; 7572 } 7573 7574# split a single text element 7575# return_all defines what is returned: if it is true 7576# only returns the elements created by matches in the split regexp 7577# otherwise all elements (new and untouched) are returned 7578 7579 7580{ 7581 7582 sub _split 7583 { my $elt= shift; 7584 my $return_all= shift; 7585 my $regexp= shift; 7586 my @tags; 7587 7588 while( @_) 7589 { my $tag= shift(); 7590 if( ref $_[0]) 7591 { push @tags, { tag => $tag, atts => shift }; } 7592 else 7593 { push @tags, { tag => $tag }; } 7594 } 7595 7596 unless( @tags) { @tags= { tag => $elt->{parent}->gi }; } 7597 7598 my @result; # the returned list of elements 7599 my $text= $elt->text; 7600 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 7601 7602 # 2 uses: if split matches then the first substring reuses $elt 7603 # once a split has occurred then the last match needs to be put in 7604 # a new element 7605 my $previous_match= 0; 7606 7607 while( my( $pre_match, @matches)= $text=~ /^(.*?)$regexp(.*)$/gcs) 7608 { $text= pop @matches; 7609 if( $previous_match) 7610 { # match, not the first one, create a new text ($gi) element 7611 _utf8_ify( $pre_match) if( $] < 5.010); 7612 $elt= $elt->insert_new_elt( after => $gi, $pre_match); 7613 push @result, $elt if( $return_all); 7614 } 7615 else 7616 { # first match in $elt, re-use $elt for the first sub-string 7617 _utf8_ify( $pre_match) if( $] < 5.010); 7618 $elt->set_text( $pre_match); 7619 $previous_match++; # store the fact that there was a match 7620 push @result, $elt if( $return_all); 7621 } 7622 7623 # now deal with matches captured in the regexp 7624 if( @matches) 7625 { # match, with capture 7626 my $i=0; 7627 foreach my $match (@matches) 7628 { # create new element, text is the match 7629 _utf8_ify( $match) if( $] < 5.010); 7630 my $tag = _repl_match( $tags[$i]->{tag}, @matches) || '#PCDATA'; 7631 my $atts = \%{$tags[$i]->{atts}} || {}; 7632 my %atts= map { _repl_match( $_, @matches) => _repl_match( $atts->{$_}, @matches) } keys %$atts; 7633 $elt= $elt->insert_new_elt( after => $tag, \%atts, $match); 7634 push @result, $elt; 7635 $i= ($i + 1) % @tags; 7636 } 7637 } 7638 else 7639 { # match, no captures 7640 my $tag = $tags[0]->{tag}; 7641 my $atts = \%{$tags[0]->{atts}} || {}; 7642 $elt= $elt->insert_new_elt( after => $tag, $atts); 7643 push @result, $elt; 7644 } 7645 } 7646 if( $previous_match && $text) 7647 { # there was at least 1 match, and there is text left after the match 7648 $elt= $elt->insert_new_elt( after => $gi, $text); 7649 } 7650 7651 push @result, $elt if( $return_all); 7652 7653 return @result; # return all elements 7654 } 7655 7656sub _repl_match 7657 { my( $val, @matches)= @_; 7658 $val=~ s{\$(\d+)}{$matches[$1-1]}g; 7659 return $val; 7660 } 7661 7662 # evil hack needed as sometimes 7663 my $encode_is_loaded=0; # so we only load Encode once 7664 sub _utf8_ify 7665 { 7666 if( $perl_version >= 5.008 and $perl_version < 5.010 and !_keep_encoding()) 7667 { unless( $encode_is_loaded) { require Encode; import Encode; $encode_is_loaded++; } 7668 Encode::_utf8_on( $_[0]); # the flag should be set but is not 7669 } 7670 } 7671 7672 7673} 7674 7675{ my %replace_sub; # cache for complex expressions (expression => sub) 7676 7677 sub subs_text 7678 { my( $elt, $regexp, $replace)= @_; 7679 7680 my $replacement_string; 7681 my $is_string= _is_string( $replace); 7682 7683 my @parents; 7684 7685 foreach my $text_elt ($elt->descendants_or_self( $TEXT)) 7686 { 7687 if( $is_string) 7688 { my $text= $text_elt->text; 7689 $text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx; 7690 $text_elt->set_text( $text); 7691 } 7692 else 7693 { 7694 no utf8; # = perl 5.6 7695 my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace)); 7696 my $text= $text_elt->text; 7697 my $pos=0; # used to skip text that was previously matched 7698 my $found_hit; 7699 while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}sg)) 7700 { $found_hit=1; 7701 my $match_start = length( $pre_match_string); 7702 my $match = $match_start ? $text_elt->split_at( $match_start + $pos) : $text_elt; 7703 my $match_length = length( $match_string); 7704 my $post_match = $match->split_at( $match_length); 7705 $replace_sub->( $match, @var); 7706 7707 # go to next 7708 $text_elt= $post_match; 7709 $text= $post_match->text; 7710 7711 if( $found_hit) { push @parents, $text_elt->{parent} unless $parents[-1] && $parents[-1]== $text_elt->{parent}; } 7712 7713 } 7714 } 7715 } 7716 7717 foreach my $parent (@parents) { $parent->normalize; } 7718 7719 return $elt; 7720 } 7721 7722 7723 sub _is_string 7724 { return ($_[0]=~ m{&e[ln]t}) ? 0: 1 } 7725 7726 sub _replace_var 7727 { my( $string, @var)= @_; 7728 unshift @var, undef; 7729 $string=~ s{\$(\d)}{$var[$1]}g; 7730 return $string; 7731 } 7732 7733 sub _install_replace_sub 7734 { my $replace_exp= shift; 7735 my @item= split m{(&e[ln]t\s*\([^)]*\))}, $replace_exp; 7736 my $sub= q{ my( $match, @var)= @_; my $new; my $last_inserted=$match;}; 7737 my( $gi, $exp); 7738 foreach my $item (@item) 7739 { next if ! length $item; 7740 if( $item=~ m{^&elt\s*\(([^)]*)\)}) 7741 { $exp= $1; } 7742 elsif( $item=~ m{^&ent\s*\(\s*([^\s)]*)\s*\)}) 7743 { $exp= " '#ENT' => $1"; } 7744 else 7745 { $exp= qq{ '#PCDATA' => "$item"}; } 7746 $exp=~ s{\$(\d)}{my $i= $1-1; "\$var[$i]"}eg; # replace references to matches 7747 $sub.= qq{ \$new= \$match->new( $exp); }; 7748 $sub .= q{ $new->paste( after => $last_inserted); $last_inserted=$new;}; 7749 } 7750 $sub .= q{ $match->delete; }; 7751 #$sub=~ s/;/;\n/g; warn "subs: $sub"; 7752 my $coderef= eval "sub { $NO_WARNINGS; $sub }"; 7753 if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); } 7754 return $coderef; 7755 } 7756 7757 } 7758 7759 7760sub merge_text 7761 { my( $e1, $e2)= @_; 7762 croak "invalid merge: can only merge 2 elements" 7763 unless( isa( $e2, 'XML::Twig::Elt')); 7764 croak "invalid merge: can only merge 2 text elements" 7765 unless( $e1->is_text && $e2->is_text && ($e1->gi eq $e2->gi)); 7766 7767 my $t1_length= length( $e1->text); 7768 7769 $e1->set_text( $e1->text . $e2->text); 7770 7771 if( my $extra_data_in_pcdata= $e2->_extra_data_in_pcdata) 7772 { foreach my $data (@$extra_data_in_pcdata) { $e1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } } 7773 7774 $e2->delete; 7775 7776 return $e1; 7777 } 7778 7779sub merge 7780 { my( $e1, $e2)= @_; 7781 my @e2_children= $e2->_children; 7782 if( $e1->_last_child && $e1->_last_child->is_pcdata 7783 && @e2_children && $e2_children[0]->is_pcdata 7784 ) 7785 { my $t1_length= length( $e1->_last_child->{pcdata}); 7786 my $child1= $e1->_last_child; 7787 my $child2= shift @e2_children; 7788 $child1->{pcdata} .= $child2->{pcdata}; 7789 7790 my $extra_data= $e1->_extra_data_before_end_tag . $e2->extra_data; 7791 7792 if( $extra_data) 7793 { $e1->_del_extra_data_before_end_tag; 7794 $child1->_push_extra_data_in_pcdata( $extra_data, $t1_length); 7795 } 7796 7797 if( my $extra_data_in_pcdata= $child2->_extra_data_in_pcdata) 7798 { foreach my $data (@$extra_data_in_pcdata) { $child1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } } 7799 7800 if( my $extra_data_before_end_tag= $e2->_extra_data_before_end_tag) 7801 { $e1->_set_extra_data_before_end_tag( $extra_data_before_end_tag); } 7802 } 7803 7804 foreach my $e (@e2_children) { $e->move( last_child => $e1); } 7805 7806 $e2->delete; 7807 return $e1; 7808 } 7809 7810 7811# recursively copy an element and returns the copy (can be huge and long) 7812sub copy 7813 { my $elt= shift; 7814 my $copy= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}]); 7815 7816 if( $elt->extra_data) { $copy->set_extra_data( $elt->extra_data); } 7817 if( $elt->{extra_data_before_end_tag}) { $copy->_set_extra_data_before_end_tag( $elt->{extra_data_before_end_tag}); } 7818 7819 if( $elt->is_asis) { $copy->set_asis; } 7820 7821 if( (exists $elt->{'pcdata'})) 7822 { $copy->{pcdata}= (delete $copy->{empty} || 1) && $elt->{pcdata}; 7823 if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); } 7824 } 7825 elsif( (exists $elt->{'cdata'})) 7826 { $copy->_set_cdata( $elt->{cdata}); 7827 if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); } 7828 } 7829 elsif( (exists $elt->{'target'})) 7830 { $copy->_set_pi( $elt->{target}, $elt->{data}); } 7831 elsif( (exists $elt->{'comment'})) 7832 { $copy->_set_comment( $elt->{comment}); } 7833 elsif( (exists $elt->{'ent'})) 7834 { $copy->{ent}= $elt->{ent}; } 7835 else 7836 { my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 7837 if( my $atts= $elt->{att}) 7838 { my %atts; 7839 tie %atts, 'Tie::IxHash' if (keep_atts_order()); 7840 %atts= %{$atts}; # we want to do a real copy of the attributes 7841 $copy->set_atts( \%atts); 7842 } 7843 foreach my $child (@children) 7844 { my $child_copy= $child->copy; 7845 $child_copy->paste( 'last_child', $copy); 7846 } 7847 } 7848 # save links to the original location, which can be convenient and is used for namespace resolution 7849 foreach my $link ( qw(parent prev_sibling next_sibling) ) 7850 { $copy->{former}->{$link}= $elt->{$link}; 7851 if( $XML::Twig::weakrefs) { weaken( $copy->{former}->{$link}); } 7852 } 7853 7854 $copy->{empty}= $elt->{'empty'}; 7855 7856 return $copy; 7857 } 7858 7859 7860sub delete 7861 { my $elt= shift; 7862 $elt->cut; 7863 $elt->DESTROY unless $XML::Twig::weakrefs; 7864 return undef; 7865 } 7866 7867sub __destroy 7868 { my $elt= shift; 7869 return if( $XML::Twig::weakrefs); 7870 my $t= shift || $elt->twig; # optional argument, passed in recursive calls 7871 7872 foreach( @{[$elt->_children]}) { $_->DESTROY( $t); } 7873 7874 # the id reference needs to be destroyed 7875 # lots of tests to avoid warnings during the cleanup phase 7876 $elt->del_id( $t) if( $ID && $t && defined( $elt->{att}) && exists( $elt->{att}->{$ID})); 7877 if( $elt->{former}) { foreach (keys %{$elt->{former}}) { delete $elt->{former}->{$_}; } delete $elt->{former}; } 7878 foreach (qw( keys %$elt)) { delete $elt->{$_}; } 7879 undef $elt; 7880 } 7881 7882BEGIN 7883{ sub set_destroy { if( $XML::Twig::weakrefs) { undef *DESTROY } else { *DESTROY= *__destroy; } } 7884 set_destroy(); 7885} 7886 7887# ignores the element 7888sub ignore 7889 { my $elt= shift; 7890 my $t= $elt->twig; 7891 $t->ignore( $elt, @_); 7892 } 7893 7894BEGIN { 7895 my $pretty = 0; 7896 my $quote = '"'; 7897 my $INDENT = ' '; 7898 my $empty_tag_style = 0; 7899 my $remove_cdata = 0; 7900 my $keep_encoding = 0; 7901 my $expand_external_entities = 0; 7902 my $keep_atts_order = 0; 7903 my $do_not_escape_amp_in_atts = 0; 7904 my $WRAP = '80'; 7905 my $REPLACED_ENTS = qq{&<}; 7906 7907 my ($NSGMLS, $NICE, $INDENTED, $INDENTEDCT, $INDENTEDC, $WRAPPED, $RECORD1, $RECORD2, $INDENTEDA)= (1..9); 7908 my %KEEP_TEXT_TAG_ON_ONE_LINE= map { $_ => 1 } ( $INDENTED, $INDENTEDCT, $INDENTEDC, $INDENTEDA, $WRAPPED); 7909 my %WRAPPED = map { $_ => 1 } ( $WRAPPED, $INDENTEDA, $INDENTEDC); 7910 7911 my %pretty_print_style= 7912 ( none => 0, # no added \n 7913 nsgmls => $NSGMLS, # nsgmls-style, \n in tags 7914 # below this line styles are UNSAFE (the generated XML can be well-formed but invalid) 7915 nice => $NICE, # \n after open/close tags except when the 7916 # element starts with text 7917 indented => $INDENTED, # nice plus idented 7918 indented_close_tag => $INDENTEDCT, # nice plus idented 7919 indented_c => $INDENTEDC, # slightly more compact than indented (closing 7920 # tags are on the same line) 7921 wrapped => $WRAPPED, # text is wrapped at column 7922 record_c => $RECORD1, # for record-like data (compact) 7923 record => $RECORD2, # for record-like data (not so compact) 7924 indented_a => $INDENTEDA, # nice, indented, and with attributes on separate 7925 # lines as the nsgmls style, as well as wrapped 7926 # lines - to make the xml friendly to line-oriented tools 7927 cvs => $INDENTEDA, # alias for indented_a 7928 ); 7929 7930 my ($HTML, $EXPAND)= (1..2); 7931 my %empty_tag_style= 7932 ( normal => 0, # <tag/> 7933 html => $HTML, # <tag /> 7934 xhtml => $HTML, # <tag /> 7935 expand => $EXPAND, # <tag></tag> 7936 ); 7937 7938 my %quote_style= 7939 ( double => '"', 7940 single => "'", 7941 # smart => "smart", 7942 ); 7943 7944 my $xml_space_preserve; # set when an element includes xml:space="preserve" 7945 7946 my $output_filter; # filters the entire output (including < and >) 7947 my $output_text_filter; # filters only the text part (tag names, attributes, pcdata) 7948 7949 my $replaced_ents= $REPLACED_ENTS; 7950 7951 7952 # returns those pesky "global" variables so you can switch between twigs 7953 sub global_state ## no critic (Subroutines::ProhibitNestedSubs); 7954 { return 7955 { pretty => $pretty, 7956 quote => $quote, 7957 indent => $INDENT, 7958 empty_tag_style => $empty_tag_style, 7959 remove_cdata => $remove_cdata, 7960 keep_encoding => $keep_encoding, 7961 expand_external_entities => $expand_external_entities, 7962 output_filter => $output_filter, 7963 output_text_filter => $output_text_filter, 7964 keep_atts_order => $keep_atts_order, 7965 do_not_escape_amp_in_atts => $do_not_escape_amp_in_atts, 7966 wrap => $WRAP, 7967 replaced_ents => $replaced_ents, 7968 }; 7969 } 7970 7971 # restores the global variables 7972 sub set_global_state 7973 { my $state= shift; 7974 $pretty = $state->{pretty}; 7975 $quote = $state->{quote}; 7976 $INDENT = $state->{indent}; 7977 $empty_tag_style = $state->{empty_tag_style}; 7978 $remove_cdata = $state->{remove_cdata}; 7979 $keep_encoding = $state->{keep_encoding}; 7980 $expand_external_entities = $state->{expand_external_entities}; 7981 $output_filter = $state->{output_filter}; 7982 $output_text_filter = $state->{output_text_filter}; 7983 $keep_atts_order = $state->{keep_atts_order}; 7984 $do_not_escape_amp_in_atts = $state->{do_not_escape_amp_in_atts}; 7985 $WRAP = $state->{wrap}; 7986 $replaced_ents = $state->{replaced_ents}, 7987 } 7988 7989 # sets global state to defaults 7990 sub init_global_state 7991 { set_global_state( 7992 { pretty => 0, 7993 quote => '"', 7994 indent => $INDENT, 7995 empty_tag_style => 0, 7996 remove_cdata => 0, 7997 keep_encoding => 0, 7998 expand_external_entities => 0, 7999 output_filter => undef, 8000 output_text_filter => undef, 8001 keep_atts_order => undef, 8002 do_not_escape_amp_in_atts => 0, 8003 wrap => $WRAP, 8004 replaced_ents => $REPLACED_ENTS, 8005 }); 8006 } 8007 8008 8009 # set the pretty_print style (in $pretty) and returns the old one 8010 # can be called from outside the package with 2 arguments (elt, style) 8011 # or from inside with only one argument (style) 8012 # the style can be either a string (one of the keys of %pretty_print_style 8013 # or a number (presumably an old value saved) 8014 sub set_pretty_print 8015 { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases 8016 my $old_pretty= $pretty; 8017 if( $style=~ /^\d+$/) 8018 { croak "invalid pretty print style $style" unless( $style < keys %pretty_print_style); 8019 $pretty= $style; 8020 } 8021 else 8022 { croak "invalid pretty print style '$style'" unless( exists $pretty_print_style{$style}); 8023 $pretty= $pretty_print_style{$style}; 8024 } 8025 if( $WRAPPED{$pretty} ) 8026 { XML::Twig::_use( 'Text::Wrap') or croak( "Text::Wrap not available, cannot use style $style"); } 8027 return $old_pretty; 8028 } 8029 8030 sub _pretty_print { return $pretty; } 8031 8032 # set the empty tag style (in $empty_tag_style) and returns the old one 8033 # can be called from outside the package with 2 arguments (elt, style) 8034 # or from inside with only one argument (style) 8035 # the style can be either a string (one of the keys of %empty_tag_style 8036 # or a number (presumably an old value saved) 8037 sub set_empty_tag_style 8038 { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases 8039 my $old_style= $empty_tag_style; 8040 if( $style=~ /^\d+$/) 8041 { croak "invalid empty tag style $style" 8042 unless( $style < keys %empty_tag_style); 8043 $empty_tag_style= $style; 8044 } 8045 else 8046 { croak "invalid empty tag style '$style'" 8047 unless( exists $empty_tag_style{$style}); 8048 $empty_tag_style= $empty_tag_style{$style}; 8049 } 8050 return $old_style; 8051 } 8052 8053 sub _pretty_print_styles 8054 { return (sort { $pretty_print_style{$a} <=> $pretty_print_style{$b} || $a cmp $b } keys %pretty_print_style); } 8055 8056 sub set_quote 8057 { my $style= $_[1] || $_[0]; 8058 my $old_quote= $quote; 8059 croak "invalid quote '$style'" unless( exists $quote_style{$style}); 8060 $quote= $quote_style{$style}; 8061 return $old_quote; 8062 } 8063 8064 sub set_remove_cdata 8065 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8066 my $old_value= $remove_cdata; 8067 $remove_cdata= $new_value; 8068 return $old_value; 8069 } 8070 8071 8072 sub set_indent 8073 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8074 my $old_value= $INDENT; 8075 $INDENT= $new_value; 8076 return $old_value; 8077 } 8078 8079 sub set_wrap 8080 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8081 my $old_value= $WRAP; 8082 $WRAP= $new_value; 8083 return $old_value; 8084 } 8085 8086 8087 sub set_keep_encoding 8088 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8089 my $old_value= $keep_encoding; 8090 $keep_encoding= $new_value; 8091 return $old_value; 8092 } 8093 8094 sub set_replaced_ents 8095 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8096 my $old_value= $replaced_ents; 8097 $replaced_ents= $new_value; 8098 return $old_value; 8099 } 8100 8101 sub do_not_escape_gt 8102 { my $old_value= $replaced_ents; 8103 $replaced_ents= q{&<}; # & needs to be first 8104 return $old_value; 8105 } 8106 8107 sub escape_gt 8108 { my $old_value= $replaced_ents; 8109 $replaced_ents= qq{&<>}; # & needs to be first 8110 return $old_value; 8111 } 8112 8113 sub _keep_encoding { return $keep_encoding; } # so I can use elsewhere in the module 8114 8115 sub set_do_not_escape_amp_in_atts 8116 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8117 my $old_value= $do_not_escape_amp_in_atts; 8118 $do_not_escape_amp_in_atts= $new_value; 8119 return $old_value; 8120 } 8121 8122 sub output_filter { return $output_filter; } 8123 sub output_text_filter { return $output_text_filter; } 8124 8125 sub set_output_filter 8126 { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode 8127 # if called in object mode with no argument, the filter is undefined 8128 if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; } 8129 my $old_value= $output_filter; 8130 if( !$new_value || isa( $new_value, 'CODE') ) 8131 { $output_filter= $new_value; } 8132 elsif( $new_value eq 'latin1') 8133 { $output_filter= XML::Twig::latin1(); 8134 } 8135 elsif( $XML::Twig::filter{$new_value}) 8136 { $output_filter= $XML::Twig::filter{$new_value}; } 8137 else 8138 { croak "invalid output filter '$new_value'"; } 8139 8140 return $old_value; 8141 } 8142 8143 sub set_output_text_filter 8144 { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode 8145 # if called in object mode with no argument, the filter is undefined 8146 if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; } 8147 my $old_value= $output_text_filter; 8148 if( !$new_value || isa( $new_value, 'CODE') ) 8149 { $output_text_filter= $new_value; } 8150 elsif( $new_value eq 'latin1') 8151 { $output_text_filter= XML::Twig::latin1(); 8152 } 8153 elsif( $XML::Twig::filter{$new_value}) 8154 { $output_text_filter= $XML::Twig::filter{$new_value}; } 8155 else 8156 { croak "invalid output text filter '$new_value'"; } 8157 8158 return $old_value; 8159 } 8160 8161 sub set_expand_external_entities 8162 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8163 my $old_value= $expand_external_entities; 8164 $expand_external_entities= $new_value; 8165 return $old_value; 8166 } 8167 8168 sub set_keep_atts_order 8169 { my $new_value= defined $_[1] ? $_[1] : $_[0]; 8170 my $old_value= $keep_atts_order; 8171 $keep_atts_order= $new_value; 8172 return $old_value; 8173 8174 } 8175 8176 sub keep_atts_order { return $keep_atts_order; } # so I can use elsewhere in the module 8177 8178 my %html_empty_elt; 8179 BEGIN { %html_empty_elt= map { $_ => 1} qw( base meta link hr br param img area input col); } 8180 8181 sub start_tag 8182 { my( $elt, $option)= @_; 8183 8184 8185 return if( $elt->{gi} < $XML::Twig::SPECIAL_GI); 8186 8187 my $extra_data= $elt->{extra_data} || ''; 8188 8189 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 8190 my $att= $elt->{att}; # should be $elt->{att}, optimized into a pure hash look-up 8191 8192 my $ns_map= $att ? $att->{'#original_gi'} : ''; 8193 if( $ns_map) { $gi= _restore_original_prefix( $ns_map, $gi); } 8194 $gi=~ s{^#default:}{}; # remove default prefix 8195 8196 if( $output_text_filter) { $gi= $output_text_filter->( $gi); } 8197 8198 # get the attribute and their values 8199 my $att_sep = $pretty==$NSGMLS ? "\n" 8200 : $pretty==$INDENTEDA ? "\n" . $INDENT x ($elt->level+1) . ' ' 8201 : ' ' 8202 ; 8203 8204 my $replace_in_att_value= $replaced_ents . "$quote\t\r\n"; 8205 if( $option->{escape_gt} && $replaced_ents !~ m{>}) { $replace_in_att_value.= '>'; } 8206 8207 my $tag; 8208 my @att_names= grep { !( $_=~ m{^#(?!default:)} ) } $keep_atts_order ? keys %{$att} : sort keys %{$att}; 8209 if( @att_names) 8210 { my $atts= join $att_sep, map { my $output_att_name= $ns_map ? _restore_original_prefix( $ns_map, $_) : $_; 8211 if( $output_text_filter) 8212 { $output_att_name= $output_text_filter->( $output_att_name); } 8213 $output_att_name . '=' . $quote . _att_xml_string( $att->{$_}, $replace_in_att_value) . $quote 8214 8215 } 8216 @att_names 8217 ; 8218 if( $pretty==$INDENTEDA && @att_names == 1) { $att_sep= ' '; } 8219 $tag= "<$gi$att_sep$atts"; 8220 } 8221 else 8222 { $tag= "<$gi"; } 8223 8224 $tag .= "\n" if($pretty==$NSGMLS); 8225 8226 8227 # force empty if suitable HTML tag, otherwise use the value from the input tree 8228 if( ($empty_tag_style eq $HTML) && !$elt->{first_child} && !$elt->{extra_data_before_end_tag} && $html_empty_elt{$gi}) 8229 { $elt->{empty}= 1; } 8230 my $empty= defined $elt->{empty} ? $elt->{empty} 8231 : $elt->{first_child} ? 0 8232 : 1; 8233 8234 $tag .= (!$elt->{empty} || $elt->{extra_data_before_end_tag}) ? '>' # element has content 8235 : (($empty_tag_style eq $HTML) && $html_empty_elt{$gi}) ? ' />' # html empty element 8236 # cvs-friendly format 8237 : ( $pretty == $INDENTEDA && @att_names > 1) ? "\n" . $INDENT x $elt->level . "/>" 8238 : ( $pretty == $INDENTEDA && @att_names == 1) ? " />" 8239 : $empty_tag_style ? "></" . $XML::Twig::index2gi[$elt->{'gi'}] . ">" # $empty_tag_style is $HTML or $EXPAND 8240 : '/>' 8241 ; 8242 8243 if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; } 8244 8245#warn "TRACE: ", $tag,": ", Encode::is_utf8( $tag) ? "has flag" : "FLAG NOT SET"; 8246 8247 unless( $pretty) { return defined( $extra_data) ? $extra_data . $tag : $tag; } 8248 8249 my $prefix=''; 8250 my $return=''; # '' or \n is to be printed before the tag 8251 my $indent=0; # number of indents before the tag 8252 8253 if( $pretty==$RECORD1) 8254 { my $level= $elt->level; 8255 $return= "\n" if( $level < 2); 8256 $indent= 1 if( $level == 1); 8257 } 8258 8259 elsif( $pretty==$RECORD2) 8260 { $return= "\n"; 8261 $indent= $elt->level; 8262 } 8263 8264 elsif( $pretty==$NICE) 8265 { my $parent= $elt->{parent}; 8266 unless( !$parent || $parent->{contains_text}) 8267 { $return= "\n"; } 8268 $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text}) 8269 || $elt->contains_text); 8270 } 8271 8272 elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty}) 8273 { my $parent= $elt->{parent}; 8274 unless( !$parent || $parent->{contains_text}) 8275 { $return= "\n"; 8276 $indent= $elt->level; 8277 } 8278 $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text}) 8279 || $elt->contains_text); 8280 } 8281 8282 if( $return || $indent) 8283 { # check for elements in which spaces should be kept 8284 my $t= $elt->twig; 8285 return $extra_data . $tag if( $xml_space_preserve); 8286 if( $t && $t->{twig_keep_spaces_in}) 8287 { foreach my $ancestor ($elt->ancestors) 8288 { return $extra_data . $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) } 8289 } 8290 8291 $prefix= $INDENT x $indent; 8292 if( $extra_data) 8293 { $extra_data=~ s{\s+$}{}; 8294 $extra_data=~ s{^\s+}{}; 8295 $extra_data= $prefix . $extra_data . $return; 8296 } 8297 } 8298 8299 8300 return $return . $extra_data . $prefix . $tag; 8301 } 8302 8303 sub end_tag 8304 { my $elt= shift; 8305 return '' if( ($elt->{gi}<$XML::Twig::SPECIAL_GI) 8306 || ($elt->{'empty'} && !$elt->{extra_data_before_end_tag}) 8307 ); 8308 my $tag= "<"; 8309 my $gi= $XML::Twig::index2gi[$elt->{'gi'}]; 8310 8311 if( my $map= $elt->{'att'}->{'#original_gi'}) { $gi= _restore_original_prefix( $map, $gi); } 8312 $gi=~ s{^#default:}{}; # remove default prefix 8313 8314 if( $output_text_filter) { $gi= $output_text_filter->( $XML::Twig::index2gi[$elt->{'gi'}]); } 8315 $tag .= "/$gi>"; 8316 8317 $tag = ($elt->{extra_data_before_end_tag} || '') . $tag; 8318 8319 if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; } 8320 8321 return $tag unless $pretty; 8322 8323 my $prefix=''; 8324 my $return=0; # 1 if a \n is to be printed before the tag 8325 my $indent=0; # number of indents before the tag 8326 8327 if( $pretty==$RECORD1) 8328 { $return= 1 if( $elt->level == 0); 8329 } 8330 8331 elsif( $pretty==$RECORD2) 8332 { unless( $elt->contains_text) 8333 { $return= 1 ; 8334 $indent= $elt->level; 8335 } 8336 } 8337 8338 elsif( $pretty==$NICE) 8339 { my $parent= $elt->{parent}; 8340 if( ( ($parent && !$parent->{contains_text}) || !$parent ) 8341 && ( !$elt->{contains_text} 8342 && ($elt->{has_flushed_child} || $elt->{first_child}) 8343 ) 8344 ) 8345 { $return= 1; } 8346 } 8347 8348 elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty}) 8349 { my $parent= $elt->{parent}; 8350 if( ( ($parent && !$parent->{contains_text}) || !$parent ) 8351 && ( !$elt->{contains_text} 8352 && ($elt->{has_flushed_child} || $elt->{first_child}) 8353 ) 8354 ) 8355 { $return= 1; 8356 $indent= $elt->level; 8357 } 8358 } 8359 8360 if( $return || $indent) 8361 { # check for elements in which spaces should be kept 8362 my $t= $elt->twig; 8363 return $tag if( $xml_space_preserve); 8364 if( $t && $t->{twig_keep_spaces_in}) 8365 { foreach my $ancestor ($elt, $elt->ancestors) 8366 { return $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) } 8367 } 8368 8369 if( $return) { $prefix= ($pretty== $INDENTEDCT) ? "\n$INDENT" : "\n"; } 8370 $prefix.= $INDENT x $indent; 8371 } 8372 8373 # add a \n at the end of the document (after the root element) 8374 $tag .= "\n" unless( $elt->{parent}); 8375 8376 return $prefix . $tag; 8377 } 8378 8379 sub _restore_original_prefix 8380 { my( $map, $name)= @_; 8381 my $prefix= _ns_prefix( $name); 8382 if( my $original_prefix= $map->{$prefix}) 8383 { if( $original_prefix eq '#default') 8384 { $name=~ s{^$prefix:}{}; } 8385 else 8386 { $name=~ s{^$prefix(?=:)}{$original_prefix}; } 8387 } 8388 return $name; 8389 } 8390 8391 # buffer used to hold the text to print/sprint, to avoid passing it back and forth between methods 8392 my @sprint; 8393 8394 # $elt is an element to print 8395 # $fh is an optional filehandle to print to 8396 # $pretty is an optional value, if true a \n is printed after the < of the 8397 # opening tag 8398 sub print 8399 { my $elt= shift; 8400 8401 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 8402 my $old_select= defined $fh ? select $fh : undef; 8403 print $elt->sprint( @_); 8404 select $old_select if( defined $old_select); 8405 } 8406 8407 8408# those next 2 methods need to be refactored, they are copies of the same methods in XML::Twig 8409sub print_to_file 8410 { my( $elt, $filename)= (shift, shift); 8411 my $out_fh; 8412# open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!"); # < perl 5.8 8413 my $mode= $keep_encoding ? '>' : '>:utf8'; # >= perl 5.8 8414 open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8 8415 $elt->print( $out_fh, @_); 8416 close $out_fh; 8417 return $elt; 8418 } 8419 8420# probably only works on *nix (at least the chmod bit) 8421# first print to a temporary file, then rename that file to the desired file name, then change permissions 8422# to the original file permissions (or to the current umask) 8423sub safe_print_to_file 8424 { my( $elt, $filename)= (shift, shift); 8425 my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ; 8426 XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n"; 8427 XML::Twig::_use( 'File::Basename') || croak "need File::Basename to use safe_print_to_file\n"; 8428 my $tmpdir= File::Basename::dirname( $filename); 8429 my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir); 8430 $elt->print_to_file( $tmpfilename, @_); 8431 rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!"); 8432 chmod $perm, $filename; 8433 return $elt; 8434 } 8435 8436 8437 # same as print but does not output the start tag if the element 8438 # is marked as flushed 8439 sub flush 8440 { my $elt= shift; 8441 my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt; 8442 $elt->twig->flush_up_to( $up_to, @_); 8443 } 8444 sub purge 8445 { my $elt= shift; 8446 my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt; 8447 $elt->twig->purge_up_to( $up_to, @_); 8448 } 8449 8450 sub _flush 8451 { my $elt= shift; 8452 8453 my $pretty; 8454 my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef; 8455 my $old_select= defined $fh ? select $fh : undef; 8456 my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef; 8457 8458 $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve'); 8459 8460 $elt->__flush(); 8461 8462 $xml_space_preserve= 0; 8463 8464 select $old_select if( defined $old_select); 8465 set_pretty_print( $old_pretty) if( defined $old_pretty); 8466 } 8467 8468 sub __flush 8469 { my $elt= shift; 8470 8471 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 8472 { my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve'; 8473 $xml_space_preserve++ if $preserve; 8474 unless( $elt->_flushed) 8475 { print $elt->start_tag(); 8476 } 8477 8478 # flush the children 8479 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 8480 foreach my $child (@children) 8481 { $child->_flush( $pretty); } 8482 unless( $elt->{end_tag_flushed}) { print $elt->end_tag; } 8483 $xml_space_preserve-- if $preserve; 8484 # used for pretty printing 8485 if( my $parent= $elt->{parent}) { $parent->{has_flushed_child}= 1; } 8486 } 8487 else # text or special element 8488 { my $text; 8489 if( (exists $elt->{'pcdata'})) { $text= $elt->pcdata_xml_string; 8490 if( my $parent= $elt->{parent}) 8491 { $parent->{contains_text}= 1; } 8492 } 8493 elsif( (exists $elt->{'cdata'})) { $text= $elt->cdata_string; 8494 if( my $parent= $elt->{parent}) 8495 { $parent->{contains_text}= 1; } 8496 } 8497 elsif( (exists $elt->{'target'})) { $text= $elt->pi_string; } 8498 elsif( (exists $elt->{'comment'})) { $text= $elt->comment_string; } 8499 elsif( (exists $elt->{'ent'})) { $text= $elt->ent_string; } 8500 8501 print $output_filter ? $output_filter->( $text) : $text; 8502 } 8503 } 8504 8505 8506 sub xml_text 8507 { my( $elt, @options)= @_; 8508 8509 if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->xml_text_only; } 8510 8511 my $string=''; 8512 8513 if( ($elt->{gi} >= $XML::Twig::SPECIAL_GI) ) 8514 { # sprint the children 8515 my $child= $elt->{first_child} || ''; 8516 while( $child) 8517 { $string.= $child->xml_text; 8518 } continue { $child= $child->{next_sibling}; } 8519 } 8520 elsif( (exists $elt->{'pcdata'})) { $string .= $output_filter ? $output_filter->($elt->pcdata_xml_string) 8521 : $elt->pcdata_xml_string; 8522 } 8523 elsif( (exists $elt->{'cdata'})) { $string .= $output_filter ? $output_filter->($elt->cdata_string) 8524 : $elt->cdata_string; 8525 } 8526 elsif( (exists $elt->{'ent'})) { $string .= $elt->ent_string; } 8527 8528 return $string; 8529 } 8530 8531 sub xml_text_only 8532 { return join '', map { $_->xml_text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; } 8533 8534 # same as print but except... it does not print but rather returns the string 8535 # if the second parameter is set then only the content is returned, not the 8536 # start and end tags of the element (but the tags of the included elements are 8537 # returned) 8538 8539 sub sprint 8540 { my $elt= shift; 8541 my( $old_pretty, $old_empty_tag_style); 8542 8543 if( $_[0] && isa( $_[0], 'HASH')) 8544 { my %args= XML::Twig::_normalize_args( %{shift()}); 8545 if( defined $args{PrettyPrint}) { $old_pretty = set_pretty_print( $args{PrettyPrint}); } 8546 if( defined $args{EmptyTags}) { $old_empty_tag_style = set_empty_tag_style( $args{EmptyTags}); } 8547 } 8548 8549 $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve'); 8550 8551 @sprint=(); 8552 $elt->_sprint( @_); 8553 my $sprint= join( '', @sprint); 8554 if( $output_filter) { $sprint= $output_filter->( $sprint); } 8555 8556 if( ( ($pretty== $WRAPPED) || ($pretty==$INDENTEDC)) && !$xml_space_preserve) 8557 { $sprint= _wrap_text( $sprint); } 8558 $xml_space_preserve= 0; 8559 8560 8561 if( defined $old_pretty) { set_pretty_print( $old_pretty); } 8562 if( defined $old_empty_tag_style) { set_empty_tag_style( $old_empty_tag_style); } 8563 8564 return $sprint; 8565 } 8566 8567 sub _wrap_text 8568 { my( $string)= @_; 8569 my $wrapped; 8570 foreach my $line (split /\n/, $string) 8571 { my( $initial_indent)= $line=~ m{^(\s*)}; 8572 my $wrapped_line= Text::Wrap::wrap( '', $initial_indent . $INDENT, $line) . "\n"; 8573 8574 # fix glitch with Text::wrap when the first line is long and does not include spaces 8575 # the first line ends up being too short by 2 chars, but we'll have to live with it! 8576 $wrapped_line=~ s{^ +\n }{}s; # this prefix needs to be removed 8577 8578 $wrapped .= $wrapped_line; 8579 } 8580 8581 return $wrapped; 8582 } 8583 8584 8585 sub _sprint 8586 { my $elt= shift; 8587 my $no_tag= shift || 0; 8588 # in case there's some comments or PI's piggybacking 8589 8590 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 8591 { 8592 my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve'; 8593 $xml_space_preserve++ if $preserve; 8594 8595 push @sprint, $elt->start_tag unless( $no_tag); 8596 8597 # sprint the children 8598 my $child= $elt->{first_child}; 8599 while( $child) 8600 { $child->_sprint; 8601 $child= $child->{next_sibling}; 8602 } 8603 push @sprint, $elt->end_tag unless( $no_tag); 8604 $xml_space_preserve-- if $preserve; 8605 } 8606 else 8607 { push @sprint, $elt->{extra_data} if( $elt->{extra_data}) ; 8608 if( (exists $elt->{'pcdata'})) { push @sprint, $elt->pcdata_xml_string; } 8609 elsif( (exists $elt->{'cdata'})) { push @sprint, $elt->cdata_string; } 8610 elsif( (exists $elt->{'target'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; } 8611 push @sprint, $elt->pi_string; 8612 } 8613 elsif( (exists $elt->{'comment'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; } 8614 push @sprint, $elt->comment_string; 8615 } 8616 elsif( (exists $elt->{'ent'})) { push @sprint, $elt->ent_string; } 8617 } 8618 8619 return; 8620 } 8621 8622 # just a shortcut to $elt->sprint( 1) 8623 sub xml_string 8624 { my $elt= shift; 8625 isa( $_[0], 'HASH') ? $elt->sprint( shift(), 1) : $elt->sprint( 1); 8626 } 8627 8628 sub pcdata_xml_string 8629 { my $elt= shift; 8630 if( defined( my $string= $elt->{pcdata}) ) 8631 { 8632 if( ! $elt->{extra_data_in_pcdata}) 8633 { 8634 $string=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g unless( !$replaced_ents || $keep_encoding || $elt->{asis}); 8635 $string=~ s{\Q]]>}{]]>}g; 8636 } 8637 else 8638 { _gen_mark( $string); # used by _(un)?protect_extra_data 8639 foreach my $data (reverse @{$elt->{extra_data_in_pcdata}}) 8640 { my $substr= substr( $string, $data->{offset}); 8641 if( $keep_encoding || $elt->{asis}) 8642 { substr( $string, $data->{offset}, 0, $data->{text}); } 8643 else 8644 { substr( $string, $data->{offset}, 0, _protect_extra_data( $data->{text})); } 8645 } 8646 unless( $keep_encoding || $elt->{asis}) 8647 { 8648 $string=~ s{([$replaced_ents])}{$XML::Twig::base_ent{$1}}g ; 8649 $string=~ s{\Q]]>}{]]>}g; 8650 _unprotect_extra_data( $string); 8651 } 8652 } 8653 return $output_text_filter ? $output_text_filter->( $string) : $string; 8654 } 8655 else 8656 { return ''; } 8657 } 8658 8659 { my $mark; 8660 my( %char2ent, %ent2char); 8661 BEGIN 8662 { %char2ent= ( '<' => 'lt', '&' => 'amp', '>' => 'gt'); 8663 %ent2char= map { $char2ent{$_} => $_ } keys %char2ent; 8664 } 8665 8666 # generate a unique mark (a string) not found in the string, 8667 # used to mark < and & in the extra data 8668 sub _gen_mark 8669 { $mark="AAAA"; 8670 $mark++ while( index( $_[0], $mark) > -1); 8671 return $mark; 8672 } 8673 8674 sub _protect_extra_data 8675 { my( $extra_data)= @_; 8676 $extra_data=~ s{([<&>])}{:$mark:$char2ent{$1}:}g; 8677 return $extra_data; 8678 } 8679 8680 sub _unprotect_extra_data 8681 { $_[0]=~ s{:$mark:(\w+):}{$ent2char{$1}}g; } 8682 8683 } 8684 8685 sub cdata_string 8686 { my $cdata= $_[0]->{cdata}; 8687 unless( defined $cdata) { return ''; } 8688 if( $remove_cdata) 8689 { $cdata=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g; } 8690 else 8691 { $cdata= $CDATA_START . $cdata . $CDATA_END; } 8692 return $cdata; 8693 } 8694 8695 sub att_xml_string 8696 { my $elt= shift; 8697 my $att= shift; 8698 8699 my $replace= $replaced_ents . "$quote\n\r\t"; 8700 if($_[0] && $_[0]->{escape_gt} && ($replace!~ m{>}) ) { $replace .='>'; } 8701 8702 if( defined (my $string= $elt->{att}->{$att})) 8703 { return _att_xml_string( $string, $replace); } 8704 else 8705 { return ''; } 8706 } 8707 8708 # escaped xml string for an attribute value 8709 sub _att_xml_string 8710 { my( $string, $escape)= @_; 8711 if( !defined( $string)) { return ''; } 8712 if( $keep_encoding) 8713 { $string=~ s{$quote}{$XML::Twig::base_ent{$quote}}g; 8714 } 8715 else 8716 { 8717 if( $do_not_escape_amp_in_atts) 8718 { $escape=~ s{^.}{}; # seems like the most backward compatible way to remove & from the list 8719 $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g; 8720 $string=~ s{&(?!(\w+|#\d+|[xX][0-9a-fA-F]+);)}{&}g; # dodgy: escape & that do not start an entity 8721 } 8722 else 8723 { $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g; 8724 $string=~ s{\Q]]>}{]]>}g; 8725 } 8726 } 8727 8728 return $output_text_filter ? $output_text_filter->( $string) : $string; 8729 } 8730 8731 sub ent_string 8732 { my $ent= shift; 8733 my $ent_text= $ent->{ent}; 8734 my( $t, $el, $ent_string); 8735 if( $expand_external_entities 8736 && ($t= $ent->twig) 8737 && ($el= $t->entity_list) 8738 && ($ent_string= $el->{entities}->{$ent->ent_name}->{val}) 8739 ) 8740 { return $ent_string; } 8741 else 8742 { return $ent_text; } 8743 } 8744 8745 # returns just the text, no tags, for an element 8746 sub text 8747 { my( $elt, @options)= @_; 8748 8749 if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->text_only; } 8750 8751 my $string; 8752 8753 if( (exists $elt->{'pcdata'})) { return $elt->{pcdata}; } 8754 elsif( (exists $elt->{'cdata'})) { return $elt->{cdata}; } 8755 elsif( (exists $elt->{'target'})) { return $elt->pi_string;} 8756 elsif( (exists $elt->{'comment'})) { return $elt->{comment}; } 8757 elsif( (exists $elt->{'ent'})) { return $elt->{ent} ; } 8758 8759 my $child= $elt->{first_child} ||''; 8760 while( $child) 8761 { 8762 my $child_text= $child->text; 8763 $string.= defined( $child_text) ? $child_text : ''; 8764 } continue { $child= $child->{next_sibling}; } 8765 8766 unless( defined $string) { $string=''; } 8767 8768 return $output_text_filter ? $output_text_filter->( $string) : $string; 8769 } 8770 8771 sub text_only 8772 { return join '', map { $_->text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; } 8773 8774 sub trimmed_text 8775 { my $elt= shift; 8776 my $text= $elt->text( @_); 8777 $text=~ s{\s+}{ }sg; 8778 $text=~ s{^\s*}{}; 8779 $text=~ s{\s*$}{}; 8780 return $text; 8781 } 8782 8783 sub trim 8784 { my( $elt)= @_; 8785 my $pcdata= $elt->first_descendant( $TEXT); 8786 (my $pcdata_text= $pcdata->text)=~ s{^\s+}{}s; 8787 $pcdata->set_text( $pcdata_text); 8788 $pcdata= $elt->last_descendant( $TEXT); 8789 ($pcdata_text= $pcdata->text)=~ s{\s+$}{}; 8790 $pcdata->set_text( $pcdata_text); 8791 foreach my $pcdata ($elt->descendants( $TEXT)) 8792 { ($pcdata_text= $pcdata->text)=~ s{\s+}{ }g; 8793 $pcdata->set_text( $pcdata_text); 8794 } 8795 return $elt; 8796 } 8797 8798 8799 # remove cdata sections (turns them into regular pcdata) in an element 8800 sub remove_cdata 8801 { my $elt= shift; 8802 foreach my $cdata ($elt->descendants_or_self( $CDATA)) 8803 { if( $keep_encoding) 8804 { my $data= $cdata->{cdata}; 8805 $data=~ s{([&<"'])}{$XML::Twig::base_ent{$1}}g; 8806 $cdata->{pcdata}= (delete $cdata->{empty} || 1) && $data; 8807 } 8808 else 8809 { $cdata->{pcdata}= (delete $cdata->{empty} || 1) && $cdata->{cdata}; } 8810 $cdata->{gi}=$XML::Twig::gi2index{$PCDATA} or $cdata->set_gi( $PCDATA); 8811 undef $cdata->{cdata}; 8812 } 8813 } 8814 8815sub _is_private { return _is_private_name( $_[0]->gi); } 8816sub _is_private_name { return $_[0]=~ m{^#(?!default:)}; } 8817 8818 8819} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...) 8820 8821# merges consecutive #PCDATAs in am element 8822sub normalize 8823 { my( $elt)= @_; 8824 my @descendants= $elt->descendants( $PCDATA); 8825 while( my $desc= shift @descendants) 8826 { if( ! length $desc->{pcdata}) { $desc->delete; next; } 8827 while( @descendants && $desc->{next_sibling} && $desc->{next_sibling}== $descendants[0]) 8828 { my $to_merge= shift @descendants; 8829 $desc->merge_text( $to_merge); 8830 } 8831 } 8832 return $elt; 8833 } 8834 8835# SAX export methods 8836sub toSAX1 8837 { _toSAX(@_, \&_start_tag_data_SAX1, \&_end_tag_data_SAX1); } 8838 8839sub toSAX2 8840 { _toSAX(@_, \&_start_tag_data_SAX2, \&_end_tag_data_SAX2); } 8841 8842sub _toSAX 8843 { my( $elt, $handler, $start_tag_data, $end_tag_data)= @_; 8844 if( $elt->{gi} >= $XML::Twig::SPECIAL_GI) 8845 { my $data= $start_tag_data->( $elt); 8846 _start_prefix_mapping( $elt, $handler, $data); 8847 if( $data && (my $start_element = $handler->can( 'start_element'))) 8848 { unless( $elt->_flushed) { $start_element->( $handler, $data); } } 8849 8850 foreach my $child ($elt->_children) 8851 { $child->_toSAX( $handler, $start_tag_data, $end_tag_data); } 8852 8853 if( (my $data= $end_tag_data->( $elt)) && (my $end_element = $handler->can( 'end_element')) ) 8854 { $end_element->( $handler, $data); } 8855 _end_prefix_mapping( $elt, $handler); 8856 } 8857 else # text or special element 8858 { if( (exists $elt->{'pcdata'}) && (my $characters= $handler->can( 'characters'))) 8859 { $characters->( $handler, { Data => $elt->{pcdata} }); } 8860 elsif( (exists $elt->{'cdata'})) 8861 { if( my $start_cdata= $handler->can( 'start_cdata')) 8862 { $start_cdata->( $handler); } 8863 if( my $characters= $handler->can( 'characters')) 8864 { $characters->( $handler, {Data => $elt->{cdata} }); } 8865 if( my $end_cdata= $handler->can( 'end_cdata')) 8866 { $end_cdata->( $handler); } 8867 } 8868 elsif( ((exists $elt->{'target'})) && (my $pi= $handler->can( 'processing_instruction'))) 8869 { $pi->( $handler, { Target =>$elt->{target}, Data => $elt->{data} }); } 8870 elsif( ((exists $elt->{'comment'})) && (my $comment= $handler->can( 'comment'))) 8871 { $comment->( $handler, { Data => $elt->{comment} }); } 8872 elsif( ((exists $elt->{'ent'}))) 8873 { 8874 if( my $se= $handler->can( 'skipped_entity')) 8875 { $se->( $handler, { Name => $elt->ent_name }); } 8876 elsif( my $characters= $handler->can( 'characters')) 8877 { if( defined $elt->ent_string) 8878 { $characters->( $handler, {Data => $elt->ent_string}); } 8879 else 8880 { $characters->( $handler, {Data => $elt->ent_name}); } 8881 } 8882 } 8883 8884 } 8885 } 8886 8887sub _start_tag_data_SAX1 8888 { my( $elt)= @_; 8889 my $name= $XML::Twig::index2gi[$elt->{'gi'}]; 8890 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 8891 my $attributes={}; 8892 my $atts= $elt->{att}; 8893 while( my( $att, $value)= each %$atts) 8894 { $attributes->{$att}= $value unless( ( $att=~ m{^#(?!default:)} )); } 8895 my $data= { Name => $name, Attributes => $attributes}; 8896 return $data; 8897 } 8898 8899sub _end_tag_data_SAX1 8900 { my( $elt)= @_; 8901 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 8902 return { Name => $XML::Twig::index2gi[$elt->{'gi'}] }; 8903 } 8904 8905sub _start_tag_data_SAX2 8906 { my( $elt)= @_; 8907 my $data={}; 8908 8909 my $name= $XML::Twig::index2gi[$elt->{'gi'}]; 8910 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 8911 $data->{Name} = $name; 8912 $data->{Prefix} = $elt->ns_prefix; 8913 $data->{LocalName} = $elt->local_name; 8914 $data->{NamespaceURI} = $elt->namespace; 8915 8916 # save a copy of the data so we can re-use it for the end tag 8917 my %sax2_data= %$data; 8918 $elt->{twig_elt_SAX2_data}= \%sax2_data; 8919 8920 # add the attributes 8921 $data->{Attributes}= $elt->_atts_to_SAX2; 8922 8923 return $data; 8924 } 8925 8926sub _atts_to_SAX2 8927 { my $elt= shift; 8928 my $SAX2_atts= {}; 8929 foreach my $att (keys %{$elt->{att}}) 8930 { 8931 next if( ( $att=~ m{^#(?!default:)} )); 8932 my $SAX2_att={}; 8933 $SAX2_att->{Name} = $att; 8934 $SAX2_att->{Prefix} = _ns_prefix( $att); 8935 $SAX2_att->{LocalName} = _local_name( $att); 8936 $SAX2_att->{NamespaceURI} = $elt->namespace( $SAX2_att->{Prefix}); 8937 $SAX2_att->{Value} = $elt->{'att'}->{$att}; 8938 my $SAX2_att_name= "{$SAX2_att->{NamespaceURI}}$SAX2_att->{LocalName}"; 8939 8940 $SAX2_atts->{$SAX2_att_name}= $SAX2_att; 8941 } 8942 return $SAX2_atts; 8943 } 8944 8945sub _start_prefix_mapping 8946 { my( $elt, $handler, $data)= @_; 8947 if( my $start_prefix_mapping= $handler->can( 'start_prefix_mapping') 8948 and my @new_prefix_mappings= grep { /^\{[^}]*\}xmlns/ || /^\{$XMLNS_URI\}/ } keys %{$data->{Attributes}} 8949 ) 8950 { foreach my $prefix (@new_prefix_mappings) 8951 { my $prefix_string= $data->{Attributes}->{$prefix}->{LocalName}; 8952 if( $prefix_string eq 'xmlns') { $prefix_string=''; } 8953 my $prefix_data= 8954 { Prefix => $prefix_string, 8955 NamespaceURI => $data->{Attributes}->{$prefix}->{Value} 8956 }; 8957 $start_prefix_mapping->( $handler, $prefix_data); 8958 $elt->{twig_end_prefix_mapping} ||= []; 8959 push @{$elt->{twig_end_prefix_mapping}}, $prefix_string; 8960 } 8961 } 8962 } 8963 8964sub _end_prefix_mapping 8965 { my( $elt, $handler)= @_; 8966 if( my $end_prefix_mapping= $handler->can( 'end_prefix_mapping')) 8967 { foreach my $prefix (@{$elt->{twig_end_prefix_mapping}}) 8968 { $end_prefix_mapping->( $handler, { Prefix => $prefix} ); } 8969 } 8970 } 8971 8972sub _end_tag_data_SAX2 8973 { my( $elt)= @_; 8974 return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )); 8975 return $elt->{twig_elt_SAX2_data}; 8976 } 8977 8978sub contains_text 8979 { my $elt= shift; 8980 my $child= $elt->{first_child}; 8981 while ($child) 8982 { return 1 if( $child->is_text || (exists $child->{'ent'})); 8983 $child= $child->{next_sibling}; 8984 } 8985 return 0; 8986 } 8987 8988# creates a single pcdata element containing the text as child of the element 8989# options: 8990# - force_pcdata: when set to a true value forces the text to be in a #PCDATA 8991# even if the original element was a #CDATA 8992sub set_text 8993 { my( $elt, $string, %option)= @_; 8994 8995 if( $XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA) 8996 { return $elt->{pcdata}= (delete $elt->{empty} || 1) && $string; } 8997 elsif( $XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) 8998 { if( $option{force_pcdata}) 8999 { $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA); 9000 $elt->_set_cdata(''); 9001 return $elt->{pcdata}= (delete $elt->{empty} || 1) && $string; 9002 } 9003 else 9004 { return $elt->_set_cdata( $string); } 9005 } 9006 elsif( $elt->contains_a_single( $PCDATA) ) 9007 { # optimized so we have a slight chance of not loosing embedded comments and pi's 9008 $elt->{first_child}->set_pcdata( $string); 9009 return $elt; 9010 } 9011 9012 foreach my $child (@{[$elt->_children]}) 9013 { $child->delete; } 9014 9015 my $pcdata= $elt->_new_pcdata( $string); 9016 $pcdata->paste( $elt); 9017 9018 $elt->{empty}=0; 9019 9020 return $elt; 9021 } 9022 9023# set the content of an element from a list of strings and elements 9024sub set_content 9025 { my $elt= shift; 9026 9027 return $elt unless defined $_[0]; 9028 9029 # attributes can be given as a hash (passed by ref) 9030 if( ref $_[0] eq 'HASH') 9031 { my $atts= shift; 9032 $elt->del_atts; # usually useless but better safe than sorry 9033 $elt->set_atts( $atts); 9034 return $elt unless defined $_[0]; 9035 } 9036 9037 # check next argument for #EMPTY 9038 if( !(ref $_[0]) && ($_[0] eq $EMPTY) ) 9039 { $elt->{empty}= 1; return $elt; } 9040 9041 # case where we really want to do a set_text, the element is '#PCDATA' 9042 # or contains a single PCDATA and we only want to add text in it 9043 if( ($XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA || $elt->contains_a_single( $PCDATA)) 9044 && (@_ == 1) && !( ref $_[0])) 9045 { $elt->set_text( $_[0]); 9046 return $elt; 9047 } 9048 elsif( ($XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) && (@_ == 1) && !( ref $_[0])) 9049 { $elt->_set_cdata( $_[0]); 9050 return $elt; 9051 } 9052 9053 # delete the children 9054 foreach my $child (@{[$elt->_children]}) 9055 { $child->delete; } 9056 9057 if( @_) { $elt->{empty}=0; } 9058 9059 foreach my $child (@_) 9060 { if( ref( $child) && isa( $child, 'XML::Twig::Elt')) 9061 { # argument is an element 9062 $child->paste( 'last_child', $elt); 9063 } 9064 else 9065 { # argument is a string 9066 if( (my $pcdata= $elt->{last_child}) && $elt->{last_child}->is_pcdata) 9067 { # previous child is also pcdata: just concatenate 9068 $pcdata->{pcdata}= (delete $pcdata->{empty} || 1) && $pcdata->{pcdata} . $child 9069 } 9070 else 9071 { # previous child is not a string: create a new pcdata element 9072 $pcdata= $elt->_new_pcdata( $child); 9073 $pcdata->paste( 'last_child', $elt); 9074 } 9075 } 9076 } 9077 9078 9079 return $elt; 9080 } 9081 9082# inserts an element (whose gi is given) as child of the element 9083# all children of the element are now children of the new element 9084# returns the new element 9085sub insert 9086 { my ($elt, @args)= @_; 9087 # first cut the children 9088 my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; 9089 foreach my $child (@children) 9090 { $child->cut; } 9091 # insert elements 9092 while( my $gi= shift @args) 9093 { my $new_elt= $elt->new( $gi); 9094 # add attributes if needed 9095 if( defined( $args[0]) && ( isa( $args[0], 'HASH')) ) 9096 { $new_elt->set_atts( shift @args); } 9097 # paste the element 9098 $new_elt->paste( $elt); 9099 $elt->{empty}=0; 9100 $elt= $new_elt; 9101 } 9102 # paste back the children 9103 foreach my $child (@children) 9104 { $child->paste( 'last_child', $elt); } 9105 return $elt; 9106 } 9107 9108# insert a new element 9109# $elt->insert_new_element( $opt_position, $gi, $opt_atts_hash, @opt_content); 9110# the element is created with the same syntax as new 9111# position is the same as in paste, first_child by default 9112sub insert_new_elt 9113 { my $elt= shift; 9114 my $position= $_[0]; 9115 if( ($position eq 'before') || ($position eq 'after') 9116 || ($position eq 'first_child') || ($position eq 'last_child')) 9117 { shift; } 9118 else 9119 { $position= 'first_child'; } 9120 9121 my $new_elt= $elt->new( @_); 9122 $new_elt->paste( $position, $elt); 9123 9124 #if( defined $new_elt->{'att'}->{$ID}) { $new_elt->set_id( $new_elt->{'att'}->{$ID}); } 9125 9126 return $new_elt; 9127 } 9128 9129# wraps an element in elements which gi's are given as arguments 9130# $elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single 9131# cell in a table for example 9132# returns the new element 9133sub wrap_in 9134 { my $elt= shift; 9135 while( my $gi = shift @_) 9136 { my $new_elt = $elt->new( $gi); 9137 if( $elt->{twig_current}) 9138 { my $t= $elt->twig; 9139 $t->{twig_current}= $new_elt; 9140 delete $elt->{'twig_current'}; 9141 $new_elt->{'twig_current'}=1; 9142 } 9143 9144 if( my $parent= $elt->{parent}) 9145 { $new_elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $new_elt->{parent});} ; 9146 if( $parent->{first_child} == $elt) { $parent->{first_child}= $new_elt; } 9147 if( $parent->{last_child} == $elt) { $parent->{empty}=0; $parent->{last_child}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 9148 } 9149 else 9150 { # wrapping the root 9151 my $twig= $elt->twig; 9152 if( $twig && $twig->root && ($twig->root eq $elt) ) 9153 { $twig->set_root( $new_elt); 9154 } 9155 } 9156 9157 if( my $prev_sibling= $elt->{prev_sibling}) 9158 { $new_elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $new_elt->{prev_sibling});} ; 9159 $prev_sibling->{next_sibling}= $new_elt; 9160 } 9161 9162 if( my $next_sibling= $elt->{next_sibling}) 9163 { $new_elt->{next_sibling}= $next_sibling; 9164 $next_sibling->{prev_sibling}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 9165 } 9166 $new_elt->{first_child}= $elt; 9167 $new_elt->{empty}=0; $new_elt->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $new_elt->{last_child});} ; 9168 9169 $elt->{parent}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 9170 $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 9171 $elt->{next_sibling}= undef; 9172 9173 # add the attributes if the next argument is a hash ref 9174 if( defined( $_[0]) && (isa( $_[0], 'HASH')) ) 9175 { $new_elt->set_atts( shift @_); } 9176 9177 $elt= $new_elt; 9178 } 9179 9180 return $elt; 9181 } 9182 9183sub replace 9184 { my( $elt, $ref)= @_; 9185 9186 if( $elt->{parent}) { $elt->cut; } 9187 9188 if( my $parent= $ref->{parent}) 9189 { $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ; 9190 if( $parent->{first_child} == $ref) { $parent->{first_child}= $elt; } 9191 if( $parent->{last_child} == $ref) { $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; } 9192 } 9193 elsif( $ref->twig && $ref == $ref->twig->root) 9194 { $ref->twig->set_root( $elt); } 9195 9196 if( my $prev_sibling= $ref->{prev_sibling}) 9197 { $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ; 9198 $prev_sibling->{next_sibling}= $elt; 9199 } 9200 if( my $next_sibling= $ref->{next_sibling}) 9201 { $elt->{next_sibling}= $next_sibling; 9202 $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; 9203 } 9204 9205 $ref->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{parent});} ; 9206 $ref->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{prev_sibling});} ; 9207 $ref->{next_sibling}= undef; 9208 return $ref; 9209 } 9210 9211sub replace_with 9212 { my $ref= shift; 9213 my $elt= shift; 9214 $elt->replace( $ref); 9215 foreach my $new_elt (reverse @_) 9216 { $new_elt->paste( after => $elt); } 9217 return $elt; 9218 } 9219 9220 9221# move an element, same syntax as paste, except the element is first cut 9222sub move 9223 { my $elt= shift; 9224 $elt->cut; 9225 $elt->paste( @_); 9226 return $elt; 9227 } 9228 9229 9230# adds a prefix to an element, creating a pcdata child if needed 9231sub prefix 9232 { my ($elt, $prefix, $option)= @_; 9233 my $asis= ($option && ($option eq 'asis')) ? 1 : 0; 9234 if( (exists $elt->{'pcdata'}) 9235 && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis})) 9236 ) 9237 { $elt->{pcdata}= (delete $elt->{empty} || 1) && $prefix . $elt->{pcdata}; } 9238 elsif( $elt->{first_child} && $elt->{first_child}->is_pcdata 9239 && ( ($asis && $elt->{first_child}->{asis}) 9240 || (!$asis && ! $elt->{first_child}->{asis})) 9241 ) 9242 { 9243 $elt->{first_child}->set_pcdata( $prefix . $elt->{first_child}->pcdata); 9244 } 9245 else 9246 { my $new_elt= $elt->_new_pcdata( $prefix); 9247 my $pos= (exists $elt->{'pcdata'}) ? 'before' : 'first_child'; 9248 $new_elt->paste( $pos => $elt); 9249 if( $asis) { $new_elt->set_asis; } 9250 } 9251 return $elt; 9252 } 9253 9254# adds a suffix to an element, creating a pcdata child if needed 9255sub suffix 9256 { my ($elt, $suffix, $option)= @_; 9257 my $asis= ($option && ($option eq 'asis')) ? 1 : 0; 9258 if( (exists $elt->{'pcdata'}) 9259 && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis})) 9260 ) 9261 { $elt->{pcdata}= (delete $elt->{empty} || 1) && $elt->{pcdata} . $suffix; } 9262 elsif( $elt->{last_child} && $elt->{last_child}->is_pcdata 9263 && ( ($asis && $elt->{last_child}->{asis}) 9264 || (!$asis && ! $elt->{last_child}->{asis})) 9265 ) 9266 { $elt->{last_child}->set_pcdata( $elt->{last_child}->pcdata . $suffix); } 9267 else 9268 { my $new_elt= $elt->_new_pcdata( $suffix); 9269 my $pos= (exists $elt->{'pcdata'}) ? 'after' : 'last_child'; 9270 $new_elt->paste( $pos => $elt); 9271 if( $asis) { $new_elt->set_asis; } 9272 } 9273 return $elt; 9274 } 9275 9276# create a path to an element ('/root/.../gi) 9277sub path 9278 { my $elt= shift; 9279 my @context= ( $elt, $elt->ancestors); 9280 return "/" . join( "/", reverse map {$_->gi} @context); 9281 } 9282 9283sub xpath 9284 { my $elt= shift; 9285 my $xpath; 9286 foreach my $ancestor (reverse $elt->ancestors_or_self) 9287 { my $gi= $XML::Twig::index2gi[$ancestor->{'gi'}]; 9288 $xpath.= "/$gi"; 9289 my $index= $ancestor->prev_siblings( $gi) + 1; 9290 unless( ($index == 1) && !$ancestor->next_sibling( $gi)) 9291 { $xpath.= "[$index]"; } 9292 } 9293 return $xpath; 9294 } 9295 9296# methods used mainly by wrap_children 9297 9298# return a string with the 9299# for an element <foo><elt att="val">...</elt><elt2/><elt>...</elt></foo> 9300# returns '<elt att="val"><elt2><elt>' 9301sub _stringify_struct 9302 { my( $elt, %opt)= @_; 9303 my $string=''; 9304 my $pretty_print= set_pretty_print( 'none'); 9305 foreach my $child ($elt->_children) 9306 { $child->add_id; $string .= $child->start_tag( { escape_gt => 1 }) ||''; } 9307 set_pretty_print( $pretty_print); 9308 return $string; 9309 } 9310 9311# wrap a series of elements in a new one 9312sub _wrap_range 9313 { my $elt= shift; 9314 my $gi= shift; 9315 my $atts= isa( $_[0], 'HASH') ? shift : undef; 9316 my $range= shift; # the string with the tags to wrap 9317 9318 my $t= $elt->twig; 9319 9320 # get the tags to wrap 9321 my @to_wrap; 9322 while( $range=~ m{<\w+\s+[^>]*id=("[^"]*"|'[^']*')[^>]*>}g) 9323 { push @to_wrap, $t->elt_id( substr( $1, 1, -1)); } 9324 9325 return '' unless @to_wrap; 9326 9327 my $to_wrap= shift @to_wrap; 9328 my %atts= %$atts; 9329 my $new_elt= $to_wrap->wrap_in( $gi, \%atts); 9330 $_->move( last_child => $new_elt) foreach (@to_wrap); 9331 9332 return ''; 9333 } 9334 9335# wrap children matching a regexp in a new element 9336sub wrap_children 9337 { my( $elt, $regexp, $gi, $atts)= @_; 9338 9339 $atts ||={}; 9340 9341 my $elt_as_string= $elt->_stringify_struct; # stringify the elt structure 9342 $regexp=~ s{(<[^>]*>)}{_match_expr( $1)}eg; # in the regexp, replace gi's by the proper regexp 9343 $elt_as_string=~ s{($regexp)}{$elt->_wrap_range( $gi, $atts, $1)}eg; # then do the actual replace 9344 9345 return $elt; 9346 } 9347 9348sub _match_expr 9349 { my $tag= shift; 9350 my( $gi, %atts)= XML::Twig::_parse_start_tag( $tag); 9351 return _match_tag( $gi, %atts); 9352 } 9353 9354 9355sub _match_tag 9356 { my( $elt, %atts)= @_; 9357 my $string= "<$elt\\b"; 9358 foreach my $key (sort keys %atts) 9359 { my $val= qq{\Q$atts{$key}\E}; 9360 $string.= qq{[^>]*$key=(?:"$val"|'$val')}; 9361 } 9362 $string.= qq{[^>]*>}; 9363 return "(?:$string)"; 9364 } 9365 9366sub field_to_att 9367 { my( $elt, $cond, $att)= @_; 9368 $att ||= $cond; 9369 my $child= $elt->first_child( $cond) or return undef; 9370 $elt->set_att( $att => $child->text); 9371 $child->cut; 9372 return $elt; 9373 } 9374 9375sub att_to_field 9376 { my( $elt, $att, $tag)= @_; 9377 $tag ||= $att; 9378 my $child= $elt->insert_new_elt( first_child => $tag, $elt->{'att'}->{$att}); 9379 $elt->del_att( $att); 9380 return $elt; 9381 } 9382 9383# sort children methods 9384 9385sub sort_children_on_field 9386 { my $elt = shift; 9387 my $field = shift; 9388 my $get_key= sub { return $_[0]->field( $field) }; 9389 return $elt->sort_children( $get_key, @_); 9390 } 9391 9392sub sort_children_on_att 9393 { my $elt = shift; 9394 my $att = shift; 9395 my $get_key= sub { return $_[0]->{'att'}->{$att} }; 9396 return $elt->sort_children( $get_key, @_); 9397 } 9398 9399sub sort_children_on_value 9400 { my $elt = shift; 9401 #my $get_key= eval qq{ sub { $NO_WARNINGS; return \$_[0]->text } }; 9402 my $get_key= \&text; 9403 return $elt->sort_children( $get_key, @_); 9404 } 9405 9406sub sort_children 9407 { my( $elt, $get_key, %opt)=@_; 9408 $opt{order} ||= 'normal'; 9409 $opt{type} ||= 'alpha'; 9410 my( $par_a, $par_b)= ($opt{order} eq 'reverse') ? qw( b a) : qw ( a b) ; 9411 my $op= ($opt{type} eq 'numeric') ? '<=>' : 'cmp' ; 9412 my @children= $elt->cut_children; 9413 if( $opt{type} eq 'numeric') 9414 { @children= map { $_->[1] } 9415 sort { $a->[0] <=> $b->[0] } 9416 map { [ $get_key->( $_), $_] } @children; 9417 } 9418 elsif( $opt{type} eq 'alpha') 9419 { @children= map { $_->[1] } 9420 sort { $a->[0] cmp $b->[0] } 9421 map { [ $get_key->( $_), $_] } @children; 9422 } 9423 else 9424 { croak "wrong sort type '$opt{type}', should be either 'alpha' or 'numeric'"; } 9425 9426 @children= reverse @children if( $opt{order} eq 'reverse'); 9427 $elt->set_content( @children); 9428 } 9429 9430 9431# comparison methods 9432 9433sub before 9434 { my( $a, $b)=@_; 9435 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 9436 } 9437 9438sub after 9439 { my( $a, $b)=@_; 9440 if( $a->cmp( $b) == 1) { return 1; } else { return 0; } 9441 } 9442 9443sub lt 9444 { my( $a, $b)=@_; 9445 return 1 if( $a->cmp( $b) == -1); 9446 return 0; 9447 } 9448 9449sub le 9450 { my( $a, $b)=@_; 9451 return 1 unless( $a->cmp( $b) == 1); 9452 return 0; 9453 } 9454 9455sub gt 9456 { my( $a, $b)=@_; 9457 return 1 if( $a->cmp( $b) == 1); 9458 return 0; 9459 } 9460 9461sub ge 9462 { my( $a, $b)=@_; 9463 return 1 unless( $a->cmp( $b) == -1); 9464 return 0; 9465 } 9466 9467 9468sub cmp 9469 { my( $a, $b)=@_; 9470 9471 # easy cases 9472 return 0 if( $a == $b); 9473 return 1 if( $a->in($b)); # a in b => a starts after b 9474 return -1 if( $b->in($a)); # b in a => a starts before b 9475 9476 # ancestors does not include the element itself 9477 my @a_pile= ($a, $a->ancestors); 9478 my @b_pile= ($b, $b->ancestors); 9479 9480 # the 2 elements are not in the same twig 9481 return undef unless( $a_pile[-1] == $b_pile[-1]); 9482 9483 # find the first non common ancestors (they are siblings) 9484 my $a_anc= pop @a_pile; 9485 my $b_anc= pop @b_pile; 9486 9487 while( $a_anc == $b_anc) 9488 { $a_anc= pop @a_pile; 9489 $b_anc= pop @b_pile; 9490 } 9491 9492 # from there move left and right and figure out the order 9493 my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc); 9494 while() 9495 { $a_prev= $a_prev->{prev_sibling} || return( -1); 9496 return 1 if( $a_prev == $b_next); 9497 $a_next= $a_next->{next_sibling} || return( 1); 9498 return -1 if( $a_next == $b_prev); 9499 $b_prev= $b_prev->{prev_sibling} || return( 1); 9500 return -1 if( $b_prev == $a_next); 9501 $b_next= $b_next->{next_sibling} || return( -1); 9502 return 1 if( $b_next == $a_prev); 9503 } 9504 } 9505 9506sub _dump 9507 { my( $elt, $option)= @_; 9508 9509 my $atts = defined $option->{atts} ? $option->{atts} : 1; 9510 my $extra = defined $option->{extra} ? $option->{extra} : 0; 9511 my $short_text = defined $option->{short_text} ? $option->{short_text} : 40; 9512 9513 my $sp= '| '; 9514 my $indent= $sp x $elt->level; 9515 my $indent_sp= ' ' x $elt->level; 9516 9517 my $dump=''; 9518 if( $elt->is_elt) 9519 { 9520 $dump .= $indent . '|-' . $XML::Twig::index2gi[$elt->{'gi'}]; 9521 9522 if( $atts && (my @atts= $elt->att_names) ) 9523 { $dump .= ' ' . join( ' ', map { qq{$_="} . $elt->{'att'}->{$_} . qq{"} } @atts); } 9524 9525 $dump .= "\n"; 9526 if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); } 9527 $dump .= join( "", map { $_->_dump( $option) } do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }); 9528 } 9529 else 9530 { 9531 if( (exists $elt->{'pcdata'})) 9532 { $dump .= "$indent|-PCDATA: '" . _short_text( $elt->{pcdata}, $short_text) . "'\n" } 9533 elsif( (exists $elt->{'ent'})) 9534 { $dump .= "$indent|-ENTITY: '" . _short_text( $elt->{ent}, $short_text) . "'\n" } 9535 elsif( (exists $elt->{'cdata'})) 9536 { $dump .= "$indent|-CDATA: '" . _short_text( $elt->{cdata}, $short_text) . "'\n" } 9537 elsif( (exists $elt->{'comment'})) 9538 { $dump .= "$indent|-COMMENT: '" . _short_text( $elt->comment_string, $short_text) . "'\n" } 9539 elsif( (exists $elt->{'target'})) 9540 { $dump .= "$indent|-PI: '" . $elt->{target} . "' - '" . _short_text( $elt->{data}, $short_text) . "'\n" } 9541 if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); } 9542 } 9543 return $dump; 9544 } 9545 9546sub _dump_extra_data 9547 { my( $elt, $indent, $indent_sp, $short_text)= @_; 9548 my $dump=''; 9549 if( $elt->extra_data) 9550 { my $extra_data = $indent . "|-- (cpi before) '" . _short_text( $elt->extra_data, $short_text) . "'"; 9551 $extra_data=~ s{\n}{$indent_sp}g; 9552 $dump .= $extra_data . "\n"; 9553 } 9554 if( $elt->{extra_data_in_pcdata}) 9555 { foreach my $data ( @{$elt->{extra_data_in_pcdata}}) 9556 { my $extra_data = $indent . "|-- (cpi offset $data->{offset}) '" . _short_text( $data->{text}, $short_text) . "'"; 9557 $extra_data=~ s{\n}{$indent_sp}g; 9558 $dump .= $extra_data . "\n"; 9559 } 9560 } 9561 if( $elt->{extra_data_before_end_tag}) 9562 { my $extra_data = $indent . "|-- (cpi end) '" . _short_text( $elt->{extra_data_before_end_tag}, $short_text) . "'"; 9563 $extra_data=~ s{\n}{$indent_sp}g; 9564 $dump .= $extra_data . "\n"; 9565 } 9566 return $dump; 9567 } 9568 9569 9570sub _short_text 9571 { my( $string, $length)= @_; 9572 if( !$length || (length( $string) < $length) ) { return $string; } 9573 my $l1= (length( $string) -5) /2; 9574 my $l2= length( $string) - ($l1 + 5); 9575 return substr( $string, 0, $l1) . ' ... ' . substr( $string, -$l2); 9576 } 9577 9578 9579sub _and { return _join_defined( ' && ', @_); } 9580sub _join_defined { return join( shift(), grep { $_ } @_); } 9581 95821; 9583__END__ 9584 9585=head1 NAME 9586 9587XML::Twig - A perl module for processing huge XML documents in tree mode. 9588 9589=head1 SYNOPSIS 9590 9591Note that this documentation is intended as a reference to the module. 9592 9593Complete docs, including a tutorial, examples, an easier to use HTML version, 9594a quick reference card and a FAQ are available at L<http://www.xmltwig.org/xmltwig> 9595 9596Small documents (loaded in memory as a tree): 9597 9598 my $twig=XML::Twig->new(); # create the twig 9599 $twig->parsefile( 'doc.xml'); # build it 9600 my_process( $twig); # use twig methods to process it 9601 $twig->print; # output the twig 9602 9603Huge documents (processed in combined stream/tree mode): 9604 9605 # at most one div will be loaded in memory 9606 my $twig=XML::Twig->new( 9607 twig_handlers => 9608 { title => sub { $_->set_tag( 'h2') }, # change title tags to h2 9609 para => sub { $_->set_tag( 'p') }, # change para to p 9610 hidden => sub { $_->delete; }, # remove hidden elements 9611 list => \&my_list_process, # process list elements 9612 div => sub { $_[0]->flush; }, # output and free memory 9613 }, 9614 pretty_print => 'indented', # output will be nicely formatted 9615 empty_tags => 'html', # outputs <empty_tag /> 9616 ); 9617 $twig->parsefile( 'my_big.xml'); 9618 9619 sub my_list_process 9620 { my( $twig, $list)= @_; 9621 # ... 9622 } 9623 9624See L<XML::Twig 101|/XML::Twig 101> for other ways to use the module, as a 9625filter for example. 9626 9627=encoding utf8 9628 9629=head1 DESCRIPTION 9630 9631This module provides a way to process XML documents. It is build on top 9632of C<XML::Parser>. 9633 9634The module offers a tree interface to the document, while allowing you 9635to output the parts of it that have been completely processed. 9636 9637It allows minimal resource (CPU and memory) usage by building the tree 9638only for the parts of the documents that need actual processing, through the 9639use of the C<L<twig_roots> > and 9640C<L<twig_print_outside_roots> > options. The 9641C<L<finish> > and C<L<finish_print> > methods also help 9642to increase performances. 9643 9644XML::Twig tries to make simple things easy so it tries its best to takes care 9645of a lot of the (usually) annoying (but sometimes necessary) features that 9646come with XML and XML::Parser. 9647 9648=head1 TOOLS 9649 9650XML::Twig comes with a few command-line utilities: 9651 9652=head2 xml_pp - xml pretty-printer 9653 9654XML pretty printer using XML::Twig 9655 9656=head2 xml_grep - grep XML files looking for specific elements 9657 9658C<xml_grep> does a grep on XML files. Instead of using regular expressions 9659it uses XPath expressions (in fact the subset of XPath supported by 9660XML::Twig). 9661 9662=head2 xml_split - cut a big XML file into smaller chunks 9663 9664C<xml_split> takes a (presumably big) XML file and split it in several smaller 9665files, based on various criteria (level in the tree, size or an XPath 9666expression) 9667 9668=head2 xml_merge - merge back XML files split with xml_split 9669 9670C<xml_merge> takes several xml files that have been split using C<xml_split> 9671and recreates a single file. 9672 9673=head2 xml_spellcheck - spellcheck XML files 9674 9675C<xml_spellcheck> lets you spell check the content of an XML file. It extracts 9676the text (the content of elements and optionally of attributes), call a spell 9677checker on it and then recreates the XML document. 9678 9679 9680=head1 XML::Twig 101 9681 9682XML::Twig can be used either on "small" XML documents (that fit in memory) 9683or on huge ones, by processing parts of the document and outputting or 9684discarding them once they are processed. 9685 9686 9687=head2 Loading an XML document and processing it 9688 9689 my $t= XML::Twig->new(); 9690 $t->parse( '<d><title>title</title><para>p 1</para><para>p 2</para></d>'); 9691 my $root= $t->root; 9692 $root->set_tag( 'html'); # change doc to html 9693 $title= $root->first_child( 'title'); # get the title 9694 $title->set_tag( 'h1'); # turn it into h1 9695 my @para= $root->children( 'para'); # get the para children 9696 foreach my $para (@para) 9697 { $para->set_tag( 'p'); } # turn them into p 9698 $t->print; # output the document 9699 9700Other useful methods include: 9701 9702L<att>: C<< $elt->{'att'}->{'foo'} >> return the C<foo> attribute for an 9703element, 9704 9705L<set_att> : C<< $elt->set_att( foo => "bar") >> sets the C<foo> 9706attribute to the C<bar> value, 9707 9708L<next_sibling>: C<< $elt->{next_sibling} >> return the next sibling 9709in the document (in the example C<< $title->{next_sibling} >> is the first 9710C<para>, you can also (and actually should) use 9711C<< $elt->next_sibling( 'para') >> to get it 9712 9713The document can also be transformed through the use of the L<cut>, 9714L<copy>, L<paste> and L<move> methods: 9715C<< $title->cut; $title->paste( after => $p); >> for example 9716 9717And much, much more, see L<XML::Twig::Elt|/XML::Twig::Elt>. 9718 9719=head2 Processing an XML document chunk by chunk 9720 9721One of the strengths of XML::Twig is that it let you work with files that do 9722not fit in memory (BTW storing an XML document in memory as a tree is quite 9723memory-expensive, the expansion factor being often around 10). 9724 9725To do this you can define handlers, that will be called once a specific 9726element has been completely parsed. In these handlers you can access the 9727element and process it as you see fit, using the navigation and the 9728cut-n-paste methods, plus lots of convenient ones like C<L<prefix> >. 9729Once the element is completely processed you can then C<L<flush> > it, 9730which will output it and free the memory. You can also C<L<purge> > it 9731if you don't need to output it (if you are just extracting some data from 9732the document for example). The handler will be called again once the next 9733relevant element has been parsed. 9734 9735 my $t= XML::Twig->new( twig_handlers => 9736 { section => \§ion, 9737 para => sub { $_->set_tag( 'p'); } 9738 }, 9739 ); 9740 $t->parsefile( 'doc.xml'); 9741 9742 # the handler is called once a section is completely parsed, ie when 9743 # the end tag for section is found, it receives the twig itself and 9744 # the element (including all its sub-elements) as arguments 9745 sub section 9746 { my( $t, $section)= @_; # arguments for all twig_handlers 9747 $section->set_tag( 'div'); # change the tag name.4, my favourite method... 9748 # let's use the attribute nb as a prefix to the title 9749 my $title= $section->first_child( 'title'); # find the title 9750 my $nb= $title->{'att'}->{'nb'}; # get the attribute 9751 $title->prefix( "$nb - "); # easy isn't it? 9752 $section->flush; # outputs the section and frees memory 9753 } 9754 9755 9756There is of course more to it: you can trigger handlers on more elaborate 9757conditions than just the name of the element, C<section/title> for example. 9758 9759 my $t= XML::Twig->new( twig_handlers => 9760 { 'section/title' => sub { $_->print } } 9761 ) 9762 ->parsefile( 'doc.xml'); 9763 9764Here C<< sub { $_->print } >> simply prints the current element (C<$_> is aliased 9765to the element in the handler). 9766 9767You can also trigger a handler on a test on an attribute: 9768 9769 my $t= XML::Twig->new( twig_handlers => 9770 { 'section[@level="1"]' => sub { $_->print } } 9771 ); 9772 ->parsefile( 'doc.xml'); 9773 9774You can also use C<L<start_tag_handlers> > to process an 9775element as soon as the start tag is found. Besides C<L<prefix> > you 9776can also use C<L<suffix> >, 9777 9778=head2 Processing just parts of an XML document 9779 9780The twig_roots mode builds only the required sub-trees from the document 9781Anything outside of the twig roots will just be ignored: 9782 9783 my $t= XML::Twig->new( 9784 # the twig will include just the root and selected titles 9785 twig_roots => { 'section/title' => \&print_n_purge, 9786 'annex/title' => \&print_n_purge 9787 } 9788 ); 9789 $t->parsefile( 'doc.xml'); 9790 9791 sub print_n_purge 9792 { my( $t, $elt)= @_; 9793 print $elt->text; # print the text (including sub-element texts) 9794 $t->purge; # frees the memory 9795 } 9796 9797You can use that mode when you want to process parts of a documents but are 9798not interested in the rest and you don't want to pay the price, either in 9799time or memory, to build the tree for the it. 9800 9801 9802=head2 Building an XML filter 9803 9804You can combine the C<twig_roots> and the C<twig_print_outside_roots> options to 9805build filters, which let you modify selected elements and will output the rest 9806of the document as is. 9807 9808This would convert prices in $ to prices in Euro in a document: 9809 9810 my $t= XML::Twig->new( 9811 twig_roots => { 'price' => \&convert, }, # process prices 9812 twig_print_outside_roots => 1, # print the rest 9813 ); 9814 $t->parsefile( 'doc.xml'); 9815 9816 sub convert 9817 { my( $t, $price)= @_; 9818 my $currency= $price->{'att'}->{'currency'}; # get the currency 9819 if( $currency eq 'USD') 9820 { $usd_price= $price->text; # get the price 9821 # %rate is just a conversion table 9822 my $euro_price= $usd_price * $rate{usd2euro}; 9823 $price->set_text( $euro_price); # set the new price 9824 $price->set_att( currency => 'EUR'); # don't forget this! 9825 } 9826 $price->print; # output the price 9827 } 9828 9829=head2 XML::Twig and various versions of Perl, XML::Parser and expat: 9830 9831XML::Twig is a lot more sensitive to variations in versions of perl, 9832XML::Parser and expat than to the OS, so this should cover some 9833reasonable configurations. 9834 9835The "recommended configuration" is perl 5.8.3+ (for good Unicode 9836support), XML::Parser 2.31+ and expat 1.95.5+ 9837 9838See L<http://testers.cpan.org/search?request=dist&dist=XML-Twig> for the 9839CPAN testers reports on XML::Twig, which list all tested configurations. 9840 9841An Atom feed of the CPAN Testers results is available at 9842L<http://xmltwig.org/rss/twig_testers.rss> 9843 9844Finally: 9845 9846=over 4 9847 9848=item XML::Twig does B<NOT> work with expat 1.95.4 9849 9850=item XML::Twig only works with XML::Parser 2.27 in perl 5.6.* 9851 9852Note that I can't compile XML::Parser 2.27 anymore, so I can't guarantee 9853that it still works 9854 9855=item XML::Parser 2.28 does not really work 9856 9857=back 9858 9859When in doubt, upgrade expat, XML::Parser and Scalar::Util 9860 9861Finally, for some optional features, XML::Twig depends on some additional 9862modules. The complete list, which depends somewhat on the version of Perl 9863that you are running, is given by running C<t/zz_dump_config.t> 9864 9865=head1 Simplifying XML processing 9866 9867=over 4 9868 9869=item Whitespaces 9870 9871Whitespaces that look non-significant are discarded, this behaviour can be 9872controlled using the C<L<keep_spaces> >, 9873C<L<keep_spaces_in> > and 9874C<L<discard_spaces_in> > options. 9875 9876=item Encoding 9877 9878You can specify that you want the output in the same encoding as the input 9879(provided you have valid XML, which means you have to specify the encoding 9880either in the document or when you create the Twig object) using the 9881C<L<keep_encoding> > option 9882 9883You can also use C<L<output_encoding>> to convert the internal UTF-8 format 9884to the required encoding. 9885 9886=item Comments and Processing Instructions (PI) 9887 9888Comments and PI's can be hidden from the processing, but still appear in the 9889output (they are carried by the "real" element closer to them) 9890 9891=item Pretty Printing 9892 9893XML::Twig can output the document pretty printed so it is easier to read for 9894us humans. 9895 9896=item Surviving an untimely death 9897 9898XML parsers are supposed to react violently when fed improper XML. 9899XML::Parser just dies. 9900 9901XML::Twig provides the C<L<safe_parse> > and the 9902C<L<safe_parsefile> > methods which wrap the parse in an eval 9903and return either the parsed twig or 0 in case of failure. 9904 9905=item Private attributes 9906 9907Attributes with a name starting with # (illegal in XML) will not be 9908output, so you can safely use them to store temporary values during 9909processing. Note that you can store anything in a private attribute, 9910not just text, it's just a regular Perl variable, so a reference to 9911an object or a huge data structure is perfectly fine. 9912 9913=back 9914 9915=head1 CLASSES 9916 9917XML::Twig uses a very limited number of classes. The ones you are most likely to use 9918are C<L<XML::Twig>> of course, which represents a complete XML document, including the 9919document itself (the root of the document itself is C<L<root>>), its handlers, its 9920input or output filters... The other main class is C<L<XML::Twig::Elt>>, which models 9921an XML element. Element here has a very wide definition: it can be a regular element, or 9922but also text, with an element C<L<tag>> of C<#PCDATA> (or C<#CDATA>), an entity (tag is 9923C<#ENT>), a Processing Instruction (C<#PI>), a comment (C<#COMMENT>). 9924 9925Those are the 2 commonly used classes. 9926 9927You might want to look the C<L<elt_class>> option if you want to subclass C<XML::Twig::Elt>. 9928 9929Attributes are just attached to their parent element, they are not objects per se. (Please 9930use the provided methods C<L<att>> and C<L<set_att>> to access them, if you access them 9931as a hash, then your code becomes implementation dependent and might break in the future). 9932 9933Other classes that are seldom used are C<L<XML::Twig::Entity_list>> and C<L<XML::Twig::Entity>>. 9934 9935If you use C<L<XML::Twig::XPath>> instead of C<XML::Twig>, elements are then created as 9936C<L<XML::Twig::XPath::Elt>> 9937 9938 9939=head1 METHODS 9940 9941=head2 XML::Twig 9942 9943A twig is a subclass of XML::Parser, so all XML::Parser methods can be 9944called on a twig object, including parse and parsefile. 9945C<setHandlers> on the other hand cannot be used, see C<L<BUGS> > 9946 9947 9948=over 4 9949 9950=item new 9951 9952This is a class method, the constructor for XML::Twig. Options are passed 9953as keyword value pairs. Recognized options are the same as XML::Parser, 9954plus some (in fact a lot!) XML::Twig specifics. 9955 9956New Options: 9957 9958=over 4 9959 9960=item twig_handlers 9961 9962This argument consists of a hash C<{ expression => \&handler}> where 9963expression is a an I<XPath-like expression> (+ some others). 9964 9965XPath expressions are limited to using the child and descendant axis 9966(indeed you can't specify an axis), and predicates cannot be nested. 9967You can use the C<string>, or C<< string(<tag>) >> function (except 9968in C<twig_roots> triggers). 9969 9970Additionally you can use regexps (/ delimited) to match attribute 9971and string values. 9972 9973Examples: 9974 9975 foo 9976 foo/bar 9977 foo//bar 9978 /foo/bar 9979 /foo//bar 9980 /foo/bar[@att1 = "val1" and @att2 = "val2"]/baz[@a >= 1] 9981 foo[string()=~ /^duh!+/] 9982 /foo[string(bar)=~ /\d+/]/baz[@att != 3] 9983 9984#CDATA can be used to call a handler for a CDATA section. 9985#COMMENT can be used to call a handler for comments 9986 9987Some additional (non-XPath) expressions are also provided for convenience: 9988 9989=over 4 9990 9991=item processing instructions 9992 9993C<'?'> or C<'#PI'> triggers the handler for any processing instruction, 9994and C<< '?<target>' >> or C<< '#PI <target>' >> triggers a handler for processing 9995instruction with the given target( ex: C<'#PI xml-stylesheet'>). 9996 9997=item level(<level>) 9998 9999Triggers the handler on any element at that level in the tree (root is level 1) 10000 10001=item _all_ 10002 10003Triggers the handler for B<all> elements in the tree 10004 10005=item _default_ 10006 10007Triggers the handler for each element that does NOT have any other handler. 10008 10009=back 10010 10011Expressions are evaluated against the input document. 10012Which means that even if you have changed the tag of an element (changing the 10013tag of a parent element from a handler for example) the change will not impact 10014the expression evaluation. There is an exception to this: "private" attributes 10015(which name start with a '#', and can only be created during the parsing, as 10016they are not valid XML) are checked against the current twig. 10017 10018Handlers are triggered in fixed order, sorted by their type (xpath expressions 10019first, then regexps, then level), then by whether they specify a full path 10020(starting at the root element) or 10021not, then by number of steps in the expression , then number of 10022predicates, then number of tests in predicates. Handlers where the last 10023step does not specify a step (C<foo/bar/*>) are triggered after other XPath 10024handlers. Finally C<_all_> handlers are triggered last. 10025 10026B<Important>: once a handler has been triggered if it returns 0 then no other 10027handler is called, except a C<_all_> handler which will be called anyway. 10028 10029If a handler returns a true value and other handlers apply, then the next 10030applicable handler will be called. Repeat, rinse, lather..; The exception 10031to that rule is when the C<L<do_not_chain_handlers>> 10032option is set, in which case only the first handler will be called. 10033 10034Note that it might be a good idea to explicitly return a short true value 10035(like 1) from handlers: this ensures that other applicable handlers are 10036called even if the last statement for the handler happens to evaluate to 10037false. This might also speedup the code by avoiding the result of the last 10038statement of the code to be copied and passed to the code managing handlers. 10039It can really pay to have 1 instead of a long string returned. 10040 10041When the closing tag for an element is parsed the corresponding handler is 10042called, with 2 arguments: the twig and the C<L<Element> >. The twig includes 10043the document tree that has been built so far, the element is the complete 10044sub-tree for the element. The fact that the handler is called only when the 10045closing tag for the element is found means that handlers for inner elements 10046are called before handlers for outer elements. 10047 10048C<$_> is also set to the element, so it is easy to write inline handlers like 10049 10050 para => sub { $_->set_tag( 'p'); } 10051 10052Text is stored in elements whose tag name is #PCDATA (due to mixed content, 10053text and sub-element in an element there is no way to store the text as just 10054an attribute of the enclosing element). 10055 10056B<Warning>: if you have used purge or flush on the twig the element might not 10057be complete, some of its children might have been entirely flushed or purged, 10058and the start tag might even have been printed (by C<flush>) already, so changing 10059its tag might not give the expected result. 10060 10061 10062=item twig_roots 10063 10064This argument let's you build the tree only for those elements you are 10065interested in. 10066 10067 Example: my $t= XML::Twig->new( twig_roots => { title => 1, subtitle => 1}); 10068 $t->parsefile( file); 10069 my $t= XML::Twig->new( twig_roots => { 'section/title' => 1}); 10070 $t->parsefile( file); 10071 10072 10073return a twig containing a document including only C<title> and C<subtitle> 10074elements, as children of the root element. 10075 10076You can use I<generic_attribute_condition>, I<attribute_condition>, 10077I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and 10078I<_all_> to trigger the building of the twig. 10079I<string_condition> and I<regexp_condition> cannot be used as the content 10080of the element, and the string, have not yet been parsed when the condition 10081is checked. 10082 10083B<WARNING>: path are checked for the document. Even if the C<twig_roots> option 10084is used they will be checked against the full document tree, not the virtual 10085tree created by XML::Twig 10086 10087 10088B<WARNING>: twig_roots elements should NOT be nested, that would hopelessly 10089confuse XML::Twig ;--( 10090 10091Note: you can set handlers (twig_handlers) using twig_roots 10092 Example: my $t= XML::Twig->new( twig_roots => 10093 { title => sub { $_[1]->print;}, 10094 subtitle => \&process_subtitle 10095 } 10096 ); 10097 $t->parsefile( file); 10098 10099 10100=item twig_print_outside_roots 10101 10102To be used in conjunction with the C<twig_roots> argument. When set to a true 10103value this will print the document outside of the C<twig_roots> elements. 10104 10105 Example: my $t= XML::Twig->new( twig_roots => { title => \&number_title }, 10106 twig_print_outside_roots => 1, 10107 ); 10108 $t->parsefile( file); 10109 { my $nb; 10110 sub number_title 10111 { my( $twig, $title); 10112 $nb++; 10113 $title->prefix( "$nb "); 10114 $title->print; 10115 } 10116 } 10117 10118 10119This example prints the document outside of the title element, calls 10120C<number_title> for each C<title> element, prints it, and then resumes printing 10121the document. The twig is built only for the C<title> elements. 10122 10123If the value is a reference to a file handle then the document outside the 10124C<twig_roots> elements will be output to this file handle: 10125 10126 open( my $out, '>', 'out_file.xml') or die "cannot open out file.xml out_file:$!"; 10127 my $t= XML::Twig->new( twig_roots => { title => \&number_title }, 10128 # default output to $out 10129 twig_print_outside_roots => $out, 10130 ); 10131 10132 { my $nb; 10133 sub number_title 10134 { my( $twig, $title); 10135 $nb++; 10136 $title->prefix( "$nb "); 10137 $title->print( $out); # you have to print to \*OUT here 10138 } 10139 } 10140 10141 10142=item start_tag_handlers 10143 10144A hash C<{ expression => \&handler}>. Sets element handlers that are called when 10145the element is open (at the end of the XML::Parser C<Start> handler). The handlers 10146are called with 2 params: the twig and the element. The element is empty at 10147that point, its attributes are created though. 10148 10149You can use I<generic_attribute_condition>, I<attribute_condition>, 10150I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and I<_all_> 10151to trigger the handler. 10152 10153I<string_condition> and I<regexp_condition> cannot be used as the content of 10154the element, and the string, have not yet been parsed when the condition is 10155checked. 10156 10157The main uses for those handlers are to change the tag name (you might have to 10158do it as soon as you find the open tag if you plan to C<flush> the twig at some 10159point in the element, and to create temporary attributes that will be used 10160when processing sub-element with C<twig_hanlders>. 10161 10162You should also use it to change tags if you use C<flush>. If you change the tag 10163in a regular C<twig_handler> then the start tag might already have been flushed. 10164 10165B<Note>: C<start_tag> handlers can be called outside of C<twig_roots> if this 10166argument is used, in this case handlers are called with the following arguments: 10167C<$t> (the twig), C<$tag> (the tag of the element) and C<%att> (a hash of the 10168attributes of the element). 10169 10170If the C<twig_print_outside_roots> argument is also used, if the last handler 10171called returns a C<true> value, then the start tag will be output as it 10172appeared in the original document, if the handler returns a C<false> value 10173then the start tag will B<not> be printed (so you can print a modified string 10174yourself for example). 10175 10176Note that you can use the L<ignore> method in C<start_tag_handlers> 10177(and only there). 10178 10179=item end_tag_handlers 10180 10181A hash C<{ expression => \&handler}>. Sets element handlers that are called when 10182the element is closed (at the end of the XML::Parser C<End> handler). The handlers 10183are called with 2 params: the twig and the tag of the element. 10184 10185I<twig_handlers> are called when an element is completely parsed, so why have 10186this redundant option? There is only one use for C<end_tag_handlers>: when using 10187the C<twig_roots> option, to trigger a handler for an element B<outside> the roots. 10188It is for example very useful to number titles in a document using nested 10189sections: 10190 10191 my @no= (0); 10192 my $no; 10193 my $t= XML::Twig->new( 10194 start_tag_handlers => 10195 { section => sub { $no[$#no]++; $no= join '.', @no; push @no, 0; } }, 10196 twig_roots => 10197 { title => sub { $_[1]->prefix( $no); $_[1]->print; } }, 10198 end_tag_handlers => { section => sub { pop @no; } }, 10199 twig_print_outside_roots => 1 10200 ); 10201 $t->parsefile( $file); 10202 10203Using the C<end_tag_handlers> argument without C<twig_roots> will result in an 10204error. 10205 10206=item do_not_chain_handlers 10207 10208If this option is set to a true value, then only one handler will be called for 10209each element, even if several satisfy the condition 10210 10211Note that the C<_all_> handler will still be called regardless 10212 10213=item ignore_elts 10214 10215This option lets you ignore elements when building the twig. This is useful 10216in cases where you cannot use C<twig_roots> to ignore elements, for example if 10217the element to ignore is a sibling of elements you are interested in. 10218 10219Example: 10220 10221 my $twig= XML::Twig->new( ignore_elts => { elt => 'discard' }); 10222 $twig->parsefile( 'doc.xml'); 10223 10224This will build the complete twig for the document, except that all C<elt> 10225elements (and their children) will be left out. 10226 10227The keys in the hash are triggers, limited to the same subset as 10228C<L<start_tag_handlers>>. The values can be C<discard>, to discard 10229the element, C<print>, to output the element as-is, C<string> to 10230store the text of the ignored element(s), including markup, in a field of 10231the twig: C<< $t->{twig_buffered_string} >> or a reference to a scalar, in 10232which case the text of the ignored element(s), including markup, will be 10233stored in the scalar. Any other value will be treated as C<discard>. 10234 10235 10236=item char_handler 10237 10238A reference to a subroutine that will be called every time C<PCDATA> is found. 10239 10240The subroutine receives the string as argument, and returns the modified string: 10241 10242 # we want all strings in upper case 10243 sub my_char_handler 10244 { my( $text)= @_; 10245 $text= uc( $text); 10246 return $text; 10247 } 10248 10249=item elt_class 10250 10251The name of a class used to store elements. this class should inherit from 10252C<XML::Twig::Elt> (and by default it is C<XML::Twig::Elt>). This option is used 10253to subclass the element class and extend it with new methods. 10254 10255This option is needed because during the parsing of the XML, elements are created 10256by C<XML::Twig>, without any control from the user code. 10257 10258=item keep_atts_order 10259 10260Setting this option to a true value causes the attribute hash to be tied to 10261a C<Tie::IxHash> object. 10262This means that C<Tie::IxHash> needs to be installed for this option to be 10263available. It also means that the hash keeps its order, so you will get 10264the attributes in order. This allows outputting the attributes in the same 10265order as they were in the original document. 10266 10267=item keep_encoding 10268 10269This is a (slightly?) evil option: if the XML document is not UTF-8 encoded and 10270you want to keep it that way, then setting keep_encoding will use theC<Expat> 10271original_string method for character, thus keeping the original encoding, as 10272well as the original entities in the strings. 10273 10274See the C<t/test6.t> test file to see what results you can expect from the 10275various encoding options. 10276 10277B<WARNING>: if the original encoding is multi-byte then attribute parsing will 10278be EXTREMELY unsafe under any Perl before 5.6, as it uses regular expressions 10279which do not deal properly with multi-byte characters. You can specify an 10280alternate function to parse the start tags with the C<parse_start_tag> option 10281(see below) 10282 10283B<WARNING>: this option is NOT used when parsing with the non-blocking parser 10284(C<parse_start>, C<parse_more>, parse_done methods) which you probably should 10285not use with XML::Twig anyway as they are totally untested! 10286 10287=item output_encoding 10288 10289This option generates an output_filter using C<Encode>, C<Text::Iconv> or 10290C<Unicode::Map8> and C<Unicode::Strings>, and sets the encoding in the XML 10291declaration. This is the easiest way to deal with encodings, if you need 10292more sophisticated features, look at C<output_filter> below 10293 10294 10295=item output_filter 10296 10297This option is used to convert the character encoding of the output document. 10298It is passed either a string corresponding to a predefined filter or 10299a subroutine reference. The filter will be called every time a document or 10300element is processed by the "print" functions (C<print>, C<sprint>, C<flush>). 10301 10302Pre-defined filters: 10303 10304=over 4 10305 10306=item latin1 10307 10308uses either C<Encode>, C<Text::Iconv> or C<Unicode::Map8> and C<Unicode::String> 10309or a regexp (which works only with XML::Parser 2.27), in this order, to convert 10310all characters to ISO-8859-15 (usually latin1 is synonym to ISO-8859-1, but 10311in practice it seems that ISO-8859-15, which includes the euro sign, is more 10312useful and probably what most people want). 10313 10314=item html 10315 10316does the same conversion as C<latin1>, plus encodes entities using 10317C<HTML::Entities> (oddly enough you will need to have HTML::Entities installed 10318for it to be available). This should only be used if the tags and attribute 10319names themselves are in US-ASCII, or they will be converted and the output will 10320not be valid XML any more 10321 10322=item safe 10323 10324converts the output to ASCII (US) only plus I<character entities> (C<&#nnn;>) 10325this should be used only if the tags and attribute names themselves are in 10326US-ASCII, or they will be converted and the output will not be valid XML any 10327more 10328 10329=item safe_hex 10330 10331same as C<safe> except that the character entities are in hex (C<&#xnnn;>) 10332 10333=item encode_convert ($encoding) 10334 10335Return a subref that can be used to convert utf8 strings to C<$encoding>). 10336Uses C<Encode>. 10337 10338 my $conv = XML::Twig::encode_convert( 'latin1'); 10339 my $t = XML::Twig->new(output_filter => $conv); 10340 10341=item iconv_convert ($encoding) 10342 10343this function is used to create a filter subroutine that will be used to 10344convert the characters to the target encoding using C<Text::Iconv> (which needs 10345to be installed, look at the documentation for the module and for the 10346C<iconv> library to find out which encodings are available on your system) 10347 10348 my $conv = XML::Twig::iconv_convert( 'latin1'); 10349 my $t = XML::Twig->new(output_filter => $conv); 10350 10351=item unicode_convert ($encoding) 10352 10353this function is used to create a filter subroutine that will be used to 10354convert the characters to the target encoding using C<Unicode::Strings> 10355and C<Unicode::Map8> (which need to be installed, look at the documentation 10356for the modules to find out which encodings are available on your system) 10357 10358 my $conv = XML::Twig::unicode_convert( 'latin1'); 10359 my $t = XML::Twig->new(output_filter => $conv); 10360 10361=back 10362 10363The C<text> and C<att> methods do not use the filter, so their 10364result are always in unicode. 10365 10366Those predeclared filters are based on subroutines that can be used 10367by themselves (as C<XML::Twig::foo>). 10368 10369=over 4 10370 10371=item html_encode ($string) 10372 10373Use C<HTML::Entities> to encode a utf8 string 10374 10375=item safe_encode ($string) 10376 10377Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters 10378in the string in C<< &#<nnnn>; >> format 10379 10380=item safe_encode_hex ($string) 10381 10382Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters 10383in the string in C<< &#x<nnnn>; >> format 10384 10385=item regexp2latin1 ($string) 10386 10387Use a regexp to encode a utf8 string into latin 1 (ISO-8859-1). Does not 10388work with Perl 5.8.0! 10389 10390=back 10391 10392=item output_text_filter 10393 10394same as output_filter, except it doesn't apply to the brackets and quotes 10395around attribute values. This is useful for all filters that could change 10396the tagging, basically anything that does not just change the encoding of 10397the output. C<html>, C<safe> and C<safe_hex> are better used with this option. 10398 10399=item input_filter 10400 10401This option is similar to C<output_filter> except the filter is applied to 10402the characters before they are stored in the twig, at parsing time. 10403 10404=item remove_cdata 10405 10406Setting this option to a true value will force the twig to output CDATA 10407sections as regular (escaped) PCDATA 10408 10409=item parse_start_tag 10410 10411If you use the C<keep_encoding> option then this option can be used to replace 10412the default parsing function. You should provide a coderef (a reference to a 10413subroutine) as the argument, this subroutine takes the original tag (given 10414by XML::Parser::Expat C<original_string()> method) and returns a tag and the 10415attributes in a hash (or in a list attribute_name/attribute value). 10416 10417=item expand_external_ents 10418 10419When this option is used external entities (that are defined) are expanded 10420when the document is output using "print" functions such as C<L<print> >, 10421C<L<sprint> >, C<L<flush> > and C<L<xml_string> >. 10422Note that in the twig the entity will be stored as an element with a 10423tag 'C<#ENT>', the entity will not be expanded there, so you might want to 10424process the entities before outputting it. 10425 10426If an external entity is not available, then the parse will fail. 10427 10428A special case is when the value of this option is -1. In that case a missing 10429entity will not cause the parser to die, but its C<name>, C<sysid> and C<pubid> 10430will be stored in the twig as C<< $twig->{twig_missing_system_entities} >> 10431(a reference to an array of hashes { name => <name>, sysid => <sysid>, 10432pubid => <pubid> }). Yes, this is a bit of a hack, but it's useful in some 10433cases. 10434 10435=item load_DTD 10436 10437If this argument is set to a true value, C<parse> or C<parsefile> on the twig 10438will load the DTD information. This information can then be accessed through 10439the twig, in a C<DTD_handler> for example. This will load even an external DTD. 10440 10441Default and fixed values for attributes will also be filled, based on the DTD. 10442 10443Note that to do this the module will generate a temporary file in the current 10444directory. If this is a problem let me know and I will add an option to 10445specify an alternate directory. 10446 10447See L<DTD Handling> for more information 10448 10449=item DTD_handler 10450 10451Set a handler that will be called once the doctype (and the DTD) have been 10452loaded, with 2 arguments, the twig and the DTD. 10453 10454=item no_prolog 10455 10456Does not output a prolog (XML declaration and DTD) 10457 10458=item id 10459 10460This optional argument gives the name of an attribute that can be used as 10461an ID in the document. Elements whose ID is known can be accessed through 10462the elt_id method. id defaults to 'id'. 10463See C<L<BUGS> > 10464 10465=item discard_spaces 10466 10467If this optional argument is set to a true value then spaces are discarded 10468when they look non-significant: strings containing only spaces and at least 10469one line feed are discarded. This argument is set to true by default. 10470 10471The exact algorithm to drop spaces is: strings including only spaces (perl \s) 10472and at least one \n right before an open or close tag are dropped. 10473 10474=item discard_all_spaces 10475 10476If this argument is set to a true value, spaces are discarded more 10477aggressively than with C<discard_spaces>: strings not including a \n are also 10478dropped. This option is appropriate for data-oriented XML. 10479 10480 10481=item keep_spaces 10482 10483If this optional argument is set to a true value then all spaces in the 10484document are kept, and stored as C<PCDATA>. 10485 10486B<Warning>: adding this option can result in changes in the twig generated: 10487space that was previously discarded might end up in a new text element. see 10488the difference by calling the following code with 0 and 1 as arguments: 10489 10490 perl -MXML::Twig -e'print XML::Twig->new( keep_spaces => shift)->parse( "<d> \n<e/></d>")->_dump' 10491 10492 10493C<keep_spaces> and C<discard_spaces> cannot be both set. 10494 10495=item discard_spaces_in 10496 10497This argument sets C<keep_spaces> to true but will cause the twig builder to 10498discard spaces in the elements listed. 10499 10500The syntax for using this argument is: 10501 10502 XML::Twig->new( discard_spaces_in => [ 'elt1', 'elt2']); 10503 10504=item keep_spaces_in 10505 10506This argument sets C<discard_spaces> to true but will cause the twig builder to 10507keep spaces in the elements listed. 10508 10509The syntax for using this argument is: 10510 10511 XML::Twig->new( keep_spaces_in => [ 'elt1', 'elt2']); 10512 10513B<Warning>: adding this option can result in changes in the twig generated: 10514space that was previously discarded might end up in a new text element. 10515 10516=item pretty_print 10517 10518Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 10519'C<nice>', 'C<indented>', 'C<indented_c>', 'C<indented_a>', 10520'C<indented_close_tag>', 'C<cvs>', 'C<wrapped>', 'C<record>' and 'C<record_c>' 10521 10522pretty_print formats: 10523 10524=over 4 10525 10526=item none 10527 10528The document is output as one ling string, with no line breaks except those 10529found within text elements 10530 10531=item nsgmls 10532 10533Line breaks are inserted in safe places: that is within tags, between a tag 10534and an attribute, between attributes and before the > at the end of a tag. 10535 10536This is quite ugly but better than C<none>, and it is very safe, the document 10537will still be valid (conforming to its DTD). 10538 10539This is how the SGML parser C<sgmls> splits documents, hence the name. 10540 10541=item nice 10542 10543This option inserts line breaks before any tag that does not contain text (so 10544element with textual content are not broken as the \n is the significant). 10545 10546B<WARNING>: this option leaves the document well-formed but might make it 10547invalid (not conformant to its DTD). If you have elements declared as 10548 10549 <!ELEMENT foo (#PCDATA|bar)> 10550 10551then a C<foo> element including a C<bar> one will be printed as 10552 10553 <foo> 10554 <bar>bar is just pcdata</bar> 10555 </foo> 10556 10557This is invalid, as the parser will take the line break after the C<foo> tag 10558as a sign that the element contains PCDATA, it will then die when it finds the 10559C<bar> tag. This may or may not be important for you, but be aware of it! 10560 10561=item indented 10562 10563Same as C<nice> (and with the same warning) but indents elements according to 10564their level 10565 10566=item indented_c 10567 10568Same as C<indented> but a little more compact: the closing tags are on the 10569same line as the preceding text 10570 10571=item indented_close_tag 10572 10573Same as C<indented> except that the closing tag is also indented, to line up 10574with the tags within the element 10575 10576=item idented_a 10577 10578This formats XML files in a line-oriented version control friendly way. 10579The format is described in L<http://tinyurl.com/2kwscq> (that's an Oracle 10580document with an insanely long URL). 10581 10582Note that to be totaly conformant to the "spec", the order of attributes 10583should not be changed, so if they are not already in alphabetical order 10584you will need to use the C<L<keep_atts_order>> option. 10585 10586=item cvs 10587 10588Same as C<L<idented_a>>. 10589 10590=item wrapped 10591 10592Same as C<indented_c> but lines are wrapped using L<Text::Wrap::wrap>. The 10593default length for lines is the default for C<$Text::Wrap::columns>, and can 10594be changed by changing that variable. 10595 10596=item record 10597 10598This is a record-oriented pretty print, that display data in records, one field 10599per line (which looks a LOT like C<indented>) 10600 10601=item record_c 10602 10603Stands for record compact, one record per line 10604 10605=back 10606 10607 10608=item empty_tags 10609 10610Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). 10611 10612C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 10613'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 10614'C<< <tag></tag> >>' 10615 10616=item quote 10617 10618Set the quote character for attributes ('C<single>' or 'C<double>'). 10619 10620=item escape_gt 10621 10622By default XML::Twig does not escape the character > in its output, as it is not 10623mandated by the XML spec. With this option on, > will be replaced by C<>> 10624 10625=item comments 10626 10627Set the way comments are processed: 'C<drop>' (default), 'C<keep>' or 10628'C<process>' 10629 10630Comments processing options: 10631 10632=over 4 10633 10634=item drop 10635 10636drops the comments, they are not read, nor printed to the output 10637 10638=item keep 10639 10640comments are loaded and will appear on the output, they are not 10641accessible within the twig and will not interfere with processing 10642though 10643 10644B<Note>: comments in the middle of a text element such as 10645 10646 <p>text <!-- comment --> more text --></p> 10647 10648are kept at their original position in the text. Using ˝"print" 10649methods like C<print> or C<sprint> will return the comments in the 10650text. Using C<text> or C<field> on the other hand will not. 10651 10652Any use of C<set_pcdata> on the C<#PCDATA> element (directly or 10653through other methods like C<set_content>) will delete the comment(s). 10654 10655=item process 10656 10657comments are loaded in the twig and will be treated as regular elements 10658(their C<tag> is C<#COMMENT>) this can interfere with processing if you 10659expect C<< $elt->{first_child} >> to be an element but find a comment there. 10660Validation will not protect you from this as comments can happen anywhere. 10661You can use C<< $elt->first_child( 'tag') >> (which is a good habit anyway) 10662to get where you want. 10663 10664Consider using C<process> if you are outputting SAX events from XML::Twig. 10665 10666=back 10667 10668=item pi 10669 10670Set the way processing instructions are processed: 'C<drop>', 'C<keep>' 10671(default) or 'C<process>' 10672 10673Note that you can also set PI handlers in the C<twig_handlers> option: 10674 10675 '?' => \&handler 10676 '?target' => \&handler 2 10677 10678The handlers will be called with 2 parameters, the twig and the PI element if 10679C<pi> is set to C<process>, and with 3, the twig, the target and the data if 10680C<pi> is set to C<keep>. Of course they will not be called if C<pi> is set to 10681C<drop>. 10682 10683If C<pi> is set to C<keep> the handler should return a string that will be used 10684as-is as the PI text (it should look like "C< <?target data?> >" or '' if you 10685want to remove the PI), 10686 10687Only one handler will be called, C<?target> or C<?> if no specific handler for 10688that target is available. 10689 10690=item map_xmlns 10691 10692This option is passed a hashref that maps uri's to prefixes. The prefixes in 10693the document will be replaced by the ones in the map. The mapped prefixes can 10694(actually have to) be used to trigger handlers, navigate or query the document. 10695 10696Here is an example: 10697 10698 my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"}, 10699 twig_handlers => 10700 { 'svg:circle' => sub { $_->set_att( r => 20) } }, 10701 pretty_print => 'indented', 10702 ) 10703 ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg"> 10704 <gr:circle cx="10" cy="90" r="10"/> 10705 </doc>' 10706 ) 10707 ->print; 10708 10709This will output: 10710 10711 <doc xmlns:svg="http://www.w3.org/2000/svg"> 10712 <svg:circle cx="10" cy="90" r="20"/> 10713 </doc> 10714 10715=item keep_original_prefix 10716 10717When used with C<L<map_xmlns>> this option will make C<XML::Twig> use the original 10718namespace prefixes when outputting a document. The mapped prefix will still be used 10719for triggering handlers and in navigation and query methods. 10720 10721 my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"}, 10722 twig_handlers => 10723 { 'svg:circle' => sub { $_->set_att( r => 20) } }, 10724 keep_original_prefix => 1, 10725 pretty_print => 'indented', 10726 ) 10727 ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg"> 10728 <gr:circle cx="10" cy="90" r="10"/> 10729 </doc>' 10730 ) 10731 ->print; 10732 10733This will output: 10734 10735 <doc xmlns:gr="http://www.w3.org/2000/svg"> 10736 <gr:circle cx="10" cy="90" r="20"/> 10737 </doc> 10738 10739=item original_uri ($prefix) 10740 10741called within a handler, this will return the uri bound to the namespace prefix 10742in the original document. 10743 10744=item index ($arrayref or $hashref) 10745 10746This option creates lists of specific elements during the parsing of the XML. 10747It takes a reference to either a list of triggering expressions or to a hash 10748name => expression, and for each one generates the list of elements that 10749match the expression. The list can be accessed through the C<L<index>> method. 10750 10751example: 10752 10753 # using an array ref 10754 my $t= XML::Twig->new( index => [ 'div', 'table' ]) 10755 ->parsefile( "foo.xml"); 10756 my $divs= $t->index( 'div'); 10757 my $first_div= $divs->[0]; 10758 my $last_table= $t->index( table => -1); 10759 10760 # using a hashref to name the indexes 10761 my $t= XML::Twig->new( index => { email => 'a[@href=~/^ \s*mailto:/]'}) 10762 ->parsefile( "foo.xml"); 10763 my $last_emails= $t->index( email => -1); 10764 10765Note that the index is not maintained after the parsing. If elements are 10766deleted, renamed or otherwise hurt during processing, the index is NOT updated. 10767(changing the id element OTOH will update the index) 10768 10769=item att_accessors <list of attribute names> 10770 10771creates methods that give direct access to attribute: 10772 10773 my $t= XML::Twig->new( att_accessors => [ 'href', 'src']) 10774 ->parsefile( $file); 10775 my $first_href= $t->first_elt( 'img')->src; # same as ->att( 'src') 10776 $t->first_elt( 'img')->src( 'new_logo.png') # changes the attribute value 10777 10778=item elt_accessors 10779 10780creates methods that give direct access to the first child element (in scalar context) 10781or the list of elements (in list context): 10782 10783the list of accessors to create can be given 1 2 different ways: in an array, 10784or in a hash alias => expression 10785 my $t= XML::Twig->new( elt_accessors => [ 'head']) 10786 ->parsefile( $file); 10787 my $title_text= $t->root->head->field( 'title'); 10788 # same as $title_text= $t->root->first_child( 'head')->field( 'title'); 10789 10790 my $t= XML::Twig->new( elt_accessors => { warnings => 'p[@class="warning"]', d2 => 'div[2]'}, ) 10791 ->parsefile( $file); 10792 my $body= $t->first_elt( 'body'); 10793 my @warnings= $body->warnings; # same as $body->children( 'p[@class="warning"]'); 10794 my $s2= $body->d2; # same as $body->first_child( 'div[2]') 10795 10796=item field_accessors 10797 10798creates methods that give direct access to the first child element text: 10799 10800 my $t= XML::Twig->new( field_accessors => [ 'h1']) 10801 ->parsefile( $file); 10802 my $div_title_text= $t->first_elt( 'div')->title; 10803 # same as $title_text= $t->first_elt( 'div')->field( 'title'); 10804 10805=item use_tidy 10806 10807set this option to use HTML::Tidy instead of HTML::TreeBuilder to convert 10808HTML to XML. HTML, especially real (real "crap") HTML found in the wild, 10809so depending on the data, one module or the other does a better job at 10810the conversion. Also, HTML::Tidy can be a bit difficult to install, so 10811XML::Twig offers both option. TIMTOWTDI 10812 10813=item output_html_doctype 10814 10815when using HTML::TreeBuilder to convert HTML, this option causes the DOCTYPE 10816declaration to be output, which may be important for some legacy browsers. 10817Without that option the DOCTYPE definition is NOT output. Also if the definition 10818is completely wrong (ie not easily parsable), it is not output either. 10819 10820=back 10821 10822B<Note>: I _HATE_ the Java-like name of arguments used by most XML modules. 10823So in pure TIMTOWTDI fashion all arguments can be written either as 10824C<UglyJavaLikeName> or as C<readable_perl_name>: C<twig_print_outside_roots> 10825or C<TwigPrintOutsideRoots> (or even C<twigPrintOutsideRoots> {shudder}). 10826XML::Twig normalizes them before processing them. 10827 10828=item parse ( $source) 10829 10830The C<$source> parameter should either be a string containing the whole XML 10831document, or it should be an open C<IO::Handle> (aka a filehandle). 10832 10833A die call is thrown if a parse error occurs. Otherwise it will return 10834the twig built by the parse. Use C<safe_parse> if you want the parsing 10835to return even when an error occurs. 10836 10837If this method is called as a class method 10838(C<< XML::Twig->parse( $some_xml_or_html) >>) then an XML::Twig object is 10839created, using the parameters except the last one (eg 10840C<< XML::Twig->parse( pretty_print => 'indented', $some_xml_or_html) >>) 10841and C<L<xparse>> is called on it. 10842 10843Note that when parsing a filehandle, the handle should NOT be open with an 10844encoding (ie open with C<open( my $in, '<', $filename)>. The file will be 10845parsed by C<expat>, so specifying the encoding actually causes problems 10846for the parser (as in: it can crash it, see 10847https://rt.cpan.org/Ticket/Display.html?id=78877). For parsing a file it 10848is actually recommended to use C<parsefile> on the file name, instead of 10849<parse> on the open file. 10850 10851=item parsestring 10852 10853This is just an alias for C<parse> for backwards compatibility. 10854 10855=item parsefile (FILE [, OPT => OPT_VALUE [...]]) 10856 10857Open C<FILE> for reading, then call C<parse> with the open handle. The file 10858is closed no matter how C<parse> returns. 10859 10860A C<die> call is thrown if a parse error occurs. Otherwise it will return 10861the twig built by the parse. Use C<safe_parsefile> if you want the parsing 10862to return even when an error occurs. 10863 10864=item parsefile_inplace ( $file, $optional_extension) 10865 10866Parse and update a file "in place". It does this by creating a temp file, 10867selecting it as the default for print() statements (and methods), then parsing 10868the input file. If the parsing is successful, then the temp file is 10869moved to replace the input file. 10870 10871If an extension is given then the original file is backed-up (the rules for 10872the extension are the same as the rule for the -i option in perl). 10873 10874=item parsefile_html_inplace ( $file, $optional_extension) 10875 10876Same as parsefile_inplace, except that it parses HTML instead of XML 10877 10878=item parseurl ($url $optional_user_agent) 10879 10880Gets the data from C<$url> and parse it. The data is piped to the parser in 10881chunks the size of the XML::Parser::Expat buffer, so memory consumption and 10882hopefully speed are optimal. 10883 10884For most (read "small") XML it is probably as efficient (and easier to debug) 10885to just C<get> the XML file and then parse it as a string. 10886 10887 use XML::Twig; 10888 use LWP::Simple; 10889 my $twig= XML::Twig->new(); 10890 $twig->parse( LWP::Simple::get( $URL )); 10891 10892or 10893 10894 use XML::Twig; 10895 my $twig= XML::Twig->nparse( $URL); 10896 10897 10898If the C<$optional_user_agent> argument is used then it is used, otherwise a 10899new one is created. 10900 10901=item safe_parse ( SOURCE [, OPT => OPT_VALUE [...]]) 10902 10903This method is similar to C<parse> except that it wraps the parsing in an 10904C<eval> block. It returns the twig on success and 0 on failure (the twig object 10905also contains the parsed twig). C<$@> contains the error message on failure. 10906 10907Note that the parsing still stops as soon as an error is detected, there is 10908no way to keep going after an error. 10909 10910=item safe_parsefile (FILE [, OPT => OPT_VALUE [...]]) 10911 10912This method is similar to C<parsefile> except that it wraps the parsing in an 10913C<eval> block. It returns the twig on success and 0 on failure (the twig object 10914also contains the parsed twig) . C<$@> contains the error message on failure 10915 10916Note that the parsing still stops as soon as an error is detected, there is 10917no way to keep going after an error. 10918 10919=item safe_parseurl ($url $optional_user_agent) 10920 10921Same as C<parseurl> except that it wraps the parsing in an C<eval> block. It 10922returns the twig on success and 0 on failure (the twig object also contains 10923the parsed twig) . C<$@> contains the error message on failure 10924 10925=item parse_html ($string_or_fh) 10926 10927parse an HTML string or file handle (by converting it to XML using 10928HTML::TreeBuilder, which needs to be available). 10929 10930This works nicely, but some information gets lost in the process: 10931newlines are removed, and (at least on the version I use), comments 10932get an extra CDATA section inside ( <!-- foo --> becomes 10933<!-- <![CDATA[ foo ]]> --> 10934 10935=item parsefile_html ($file) 10936 10937parse an HTML file (by converting it to XML using HTML::TreeBuilder, which 10938needs to be available, or HTML::Tidy if the C<use_tidy> option was used). 10939The file is loaded completely in memory and converted to XML before being parsed. 10940 10941this method is to be used with caution though, as it doesn't know about the 10942file encoding, it is usually better to use C<L<parse_html>>, which gives you 10943a chance to open the file with the proper encoding layer. 10944 10945=item parseurl_html ($url $optional_user_agent) 10946 10947parse an URL as html the same way C<L<parse_html>> does 10948 10949=item safe_parseurl_html ($url $optional_user_agent) 10950 10951Same as C<L<parseurl_html>>> except that it wraps the parsing in an C<eval> 10952block. It returns the twig on success and 0 on failure (the twig object also 10953contains the parsed twig) . C<$@> contains the error message on failure 10954 10955=item safe_parsefile_html ($file $optional_user_agent) 10956 10957Same as C<L<parsefile_html>>> except that it wraps the parsing in an C<eval> 10958block. It returns the twig on success and 0 on failure (the twig object also 10959contains the parsed twig) . C<$@> contains the error message on failure 10960 10961=item safe_parse_html ($string_or_fh) 10962 10963Same as C<L<parse_html>> except that it wraps the parsing in an C<eval> block. 10964It returns the twig on success and 0 on failure (the twig object also contains 10965the parsed twig) . C<$@> contains the error message on failure 10966 10967=item xparse ($thing_to_parse) 10968 10969parse the C<$thing_to_parse>, whether it is a filehandle, a string, an HTML 10970file, an HTML URL, an URL or a file. 10971 10972Note that this is mostly a convenience method for one-off scripts. For example 10973files that end in '.htm' or '.html' are parsed first as XML, and if this fails 10974as HTML. This is certainly not the most efficient way to do this in general. 10975 10976=item nparse ($optional_twig_options, $thing_to_parse) 10977 10978create a twig with the C<$optional_options>, and parse the C<$thing_to_parse>, 10979whether it is a filehandle, a string, an HTML file, an HTML URL, an URL or a 10980file. 10981 10982Examples: 10983 10984 XML::Twig->nparse( "file.xml"); 10985 XML::Twig->nparse( error_context => 1, "file://file.xml"); 10986 10987=item nparse_pp ($optional_twig_options, $thing_to_parse) 10988 10989same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>. 10990 10991=item nparse_e ($optional_twig_options, $thing_to_parse) 10992 10993same as C<L<nparse>> but also sets the C<error_context> option to 1. 10994 10995=item nparse_ppe ($optional_twig_options, $thing_to_parse) 10996 10997same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented> 10998and the C<error_context> option to 1. 10999 11000=item parser 11001 11002This method returns the C<expat> object (actually the XML::Parser::Expat object) 11003used during parsing. It is useful for example to call XML::Parser::Expat methods 11004on it. To get the line of a tag for example use C<< $t->parser->current_line >>. 11005 11006=item setTwigHandlers ($handlers) 11007 11008Set the twig_handlers. C<$handlers> is a reference to a hash similar to the 11009one in the C<twig_handlers> option of new. All previous handlers are unset. 11010The method returns the reference to the previous handlers. 11011 11012=item setTwigHandler ($exp $handler) 11013 11014Set a single twig_handler for elements matching C<$exp>. C<$handler> is a 11015reference to a subroutine. If the handler was previously set then the reference 11016to the previous handler is returned. 11017 11018=item setStartTagHandlers ($handlers) 11019 11020Set the start_tag handlers. C<$handlers> is a reference to a hash similar to the 11021one in the C<start_tag_handlers> option of new. All previous handlers are unset. 11022The method returns the reference to the previous handlers. 11023 11024=item setStartTagHandler ($exp $handler) 11025 11026Set a single start_tag handlers for elements matching C<$exp>. C<$handler> is a 11027reference to a subroutine. If the handler was previously set then the reference 11028to the previous handler is returned. 11029 11030=item setEndTagHandlers ($handlers) 11031 11032Set the end_tag handlers. C<$handlers> is a reference to a hash similar to the 11033one in the C<end_tag_handlers> option of new. All previous handlers are unset. 11034The method returns the reference to the previous handlers. 11035 11036=item setEndTagHandler ($exp $handler) 11037 11038Set a single end_tag handlers for elements matching C<$exp>. C<$handler> is a 11039reference to a subroutine. If the handler was previously set then the 11040reference to the previous handler is returned. 11041 11042=item setTwigRoots ($handlers) 11043 11044Same as using the C<L<twig_roots>> option when creating the twig 11045 11046=item setCharHandler ($exp $handler) 11047 11048Set a C<char_handler> 11049 11050=item setIgnoreEltsHandler ($exp) 11051 11052Set a C<ignore_elt> handler (elements that match C<$exp> will be ignored 11053 11054=item setIgnoreEltsHandlers ($exp) 11055 11056Set all C<ignore_elt> handlers (previous handlers are replaced) 11057 11058=item dtd 11059 11060Return the dtd (an L<XML::Twig::DTD> object) of a twig 11061 11062=item xmldecl 11063 11064Return the XML declaration for the document, or a default one if it doesn't 11065have one 11066 11067=item doctype 11068 11069Return the doctype for the document 11070 11071=item doctype_name 11072 11073returns the doctype of the document from the doctype declaration 11074 11075=item system_id 11076 11077returns the system value of the DTD of the document from the doctype declaration 11078 11079=item public_id 11080 11081returns the public doctype of the document from the doctype declaration 11082 11083=item internal_subset 11084 11085returns the internal subset of the DTD 11086 11087=item dtd_text 11088 11089Return the DTD text 11090 11091=item dtd_print 11092 11093Print the DTD 11094 11095=item model ($tag) 11096 11097Return the model (in the DTD) for the element C<$tag> 11098 11099=item root 11100 11101Return the root element of a twig 11102 11103=item set_root ($elt) 11104 11105Set the root of a twig 11106 11107=item first_elt ($optional_condition) 11108 11109Return the first element matching C<$optional_condition> of a twig, if 11110no condition is given then the root is returned 11111 11112=item last_elt ($optional_condition) 11113 11114Return the last element matching C<$optional_condition> of a twig, if 11115no condition is given then the last element of the twig is returned 11116 11117=item elt_id ($id) 11118 11119Return the element whose C<id> attribute is $id 11120 11121=item getEltById 11122 11123Same as C<L<elt_id>> 11124 11125=item index ($index_name, $optional_index) 11126 11127If the C<$optional_index> argument is present, return the corresponding element 11128in the index (created using the C<index> option for C<XML::Twig->new>) 11129 11130If the argument is not present, return an arrayref to the index 11131 11132=item normalize 11133 11134merge together all consecutive pcdata elements in the document (if for example 11135you have turned some elements into pcdata using C<L<erase>>, this will give you 11136a "clean" document in which there all text elements are as long as possible). 11137 11138=item encoding 11139 11140This method returns the encoding of the XML document, as defined by the 11141C<encoding> attribute in the XML declaration (ie it is C<undef> if the attribute 11142is not defined) 11143 11144=item set_encoding 11145 11146This method sets the value of the C<encoding> attribute in the XML declaration. 11147Note that if the document did not have a declaration it is generated (with 11148an XML version of 1.0) 11149 11150=item xml_version 11151 11152This method returns the XML version, as defined by the C<version> attribute in 11153the XML declaration (ie it is C<undef> if the attribute is not defined) 11154 11155=item set_xml_version 11156 11157This method sets the value of the C<version> attribute in the XML declaration. 11158If the declaration did not exist it is created. 11159 11160=item standalone 11161 11162This method returns the value of the C<standalone> declaration for the document 11163 11164=item set_standalone 11165 11166This method sets the value of the C<standalone> attribute in the XML 11167declaration. Note that if the document did not have a declaration it is 11168generated (with an XML version of 1.0) 11169 11170=item set_output_encoding 11171 11172Set the C<encoding> "attribute" in the XML declaration 11173 11174=item set_doctype ($name, $system, $public, $internal) 11175 11176Set the doctype of the element. If an argument is C<undef> (or not present) 11177then its former value is retained, if a false ('' or 0) value is passed then 11178the former value is deleted; 11179 11180=item entity_list 11181 11182Return the entity list of a twig 11183 11184=item entity_names 11185 11186Return the list of all defined entities 11187 11188=item entity ($entity_name) 11189 11190Return the entity 11191 11192=item change_gi ($old_gi, $new_gi) 11193 11194Performs a (very fast) global change. All elements C<$old_gi> are now 11195C<$new_gi>. This is a bit dangerous though and should be avoided if 11196< possible, as the new tag might be ignored in subsequent processing. 11197 11198See C<L<BUGS> > 11199 11200=item flush ($optional_filehandle, %options) 11201 11202Flushes a twig up to (and including) the current element, then deletes 11203all unnecessary elements from the tree that's kept in memory. 11204C<flush> keeps track of which elements need to be open/closed, so if you 11205flush from handlers you don't have to worry about anything. Just keep 11206flushing the twig every time you're done with a sub-tree and it will 11207come out well-formed. After the whole parsing don't forget toC<flush> 11208one more time to print the end of the document. 11209The doctype and entity declarations are also printed. 11210 11211flush take an optional filehandle as an argument. 11212 11213If you use C<flush> at any point during parsing, the document will be flushed 11214one last time at the end of the parsing, to the proper filehandle. 11215 11216options: use the C<update_DTD> option if you have updated the (internal) DTD 11217and/or the entity list and you want the updated DTD to be output 11218 11219The C<pretty_print> option sets the pretty printing of the document. 11220 11221 Example: $t->flush( Update_DTD => 1); 11222 $t->flush( $filehandle, pretty_print => 'indented'); 11223 $t->flush( \*FILE); 11224 11225 11226=item flush_up_to ($elt, $optional_filehandle, %options) 11227 11228Flushes up to the C<$elt> element. This allows you to keep part of the 11229tree in memory when you C<flush>. 11230 11231options: see flush. 11232 11233=item purge 11234 11235Does the same as a C<flush> except it does not print the twig. It just deletes 11236all elements that have been completely parsed so far. 11237 11238=item purge_up_to ($elt) 11239 11240Purges up to the C<$elt> element. This allows you to keep part of the tree in 11241memory when you C<purge>. 11242 11243=item print ($optional_filehandle, %options) 11244 11245Prints the whole document associated with the twig. To be used only AFTER the 11246parse. 11247 11248options: see C<flush>. 11249 11250=item print_to_file ($filename, %options) 11251 11252Prints the whole document associated with the twig to file C<$filename>. 11253To be used only AFTER the parse. 11254 11255options: see C<flush>. 11256 11257=item safe_print_to_file ($filename, %options) 11258 11259Prints the whole document associated with the twig to file C<$filename>. 11260This variant, which probably only works on *nix prints to a temp file, 11261then move the temp file to overwrite the original file. 11262 11263This is a bit safer when 2 processes an potentiallywrite the same file: 11264only the last one will succeed, but the file won't be corruted. I often 11265use this for cron jobs, so testing the code doesn't interfere with the 11266cron job running at the same time. 11267 11268options: see C<flush>. 11269 11270=item sprint 11271 11272Return the text of the whole document associated with the twig. To be used only 11273AFTER the parse. 11274 11275options: see C<flush>. 11276 11277=item trim 11278 11279Trim the document: gets rid of initial and trailing spaces, and replaces multiple spaces 11280by a single one. 11281 11282=item toSAX1 ($handler) 11283 11284Send SAX events for the twig to the SAX1 handler C<$handler> 11285 11286=item toSAX2 ($handler) 11287 11288Send SAX events for the twig to the SAX2 handler C<$handler> 11289 11290=item flush_toSAX1 ($handler) 11291 11292Same as flush, except that SAX events are sent to the SAX1 handler 11293C<$handler> instead of the twig being printed 11294 11295=item flush_toSAX2 ($handler) 11296 11297Same as flush, except that SAX events are sent to the SAX2 handler 11298C<$handler> instead of the twig being printed 11299 11300=item ignore 11301 11302This method should be called during parsing, usually in C<start_tag_handlers>. 11303It causes the element to be skipped during the parsing: the twig is not built 11304for this element, it will not be accessible during parsing or after it. The 11305element will not take up any memory and parsing will be faster. 11306 11307Note that this method can also be called on an element. If the element is a 11308parent of the current element then this element will be ignored (the twig will 11309not be built any more for it and what has already been built will be deleted). 11310 11311=item set_pretty_print ($style) 11312 11313Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 11314'C<nice>', 'C<indented>', C<indented_c>, 'C<wrapped>', 'C<record>' and 11315'C<record_c>' 11316 11317B<WARNING:> the pretty print style is a B<GLOBAL> variable, so once set it's 11318applied to B<ALL> C<print>'s (and C<sprint>'s). Same goes if you use XML::Twig 11319with C<mod_perl> . This should not be a problem as the XML that's generated 11320is valid anyway, and XML processors (as well as HTML processors, including 11321browsers) should not care. Let me know if this is a big problem, but at the 11322moment the performance/cleanliness trade-off clearly favors the global 11323approach. 11324 11325=item set_empty_tag_style ($style) 11326 11327Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). As 11328with C<L<set_pretty_print>> this sets a global flag. 11329 11330C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 11331'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 11332'C<< <tag></tag> >>' 11333 11334=item set_remove_cdata ($flag) 11335 11336set (or unset) the flag that forces the twig to output CDATA sections as 11337regular (escaped) PCDATA 11338 11339=item print_prolog ($optional_filehandle, %options) 11340 11341Prints the prolog (XML declaration + DTD + entity declarations) of a document. 11342 11343options: see C<L<flush>>. 11344 11345=item prolog ($optional_filehandle, %options) 11346 11347Return the prolog (XML declaration + DTD + entity declarations) of a document. 11348 11349options: see C<L<flush>>. 11350 11351=item finish 11352 11353Call Expat C<finish> method. 11354Unsets all handlers (including internal ones that set context), but expat 11355continues parsing to the end of the document or until it finds an error. 11356It should finish up a lot faster than with the handlers set. 11357 11358=item finish_print 11359 11360Stops twig processing, flush the twig and proceed to finish printing the 11361document as fast as possible. Use this method when modifying a document and 11362the modification is done. 11363 11364=item finish_now 11365 11366Stops twig processing, does not finish parsing the document (which could 11367actually be not well-formed after the point where C<finish_now> is called). 11368Execution resumes after the C<Lparse>> or C<L<parsefile>> call. The content 11369of the twig is what has been parsed so far (all open elements at the time 11370C<finish_now> is called are considered closed). 11371 11372=item set_expand_external_entities 11373 11374Same as using the C<L<expand_external_ents>> option when creating the twig 11375 11376=item set_input_filter 11377 11378Same as using the C<L<input_filter>> option when creating the twig 11379 11380=item set_keep_atts_order 11381 11382Same as using the C<L<keep_atts_order>> option when creating the twig 11383 11384=item set_keep_encoding 11385 11386Same as using the C<L<keep_encoding>> option when creating the twig 11387 11388=item escape_gt 11389 11390usually XML::Twig does not escape > in its output. Using this option 11391makes it replace > by > 11392 11393=item do_not_escape_gt 11394 11395reverts XML::Twig behavior to its default of not escaping > in its output. 11396 11397=item set_output_filter 11398 11399Same as using the C<L<output_filter>> option when creating the twig 11400 11401=item set_output_text_filter 11402 11403Same as using the C<L<output_text_filter>> option when creating the twig 11404 11405=item add_stylesheet ($type, @options) 11406 11407Adds an external stylesheet to an XML document. 11408 11409Supported types and options: 11410 11411=over 4 11412 11413=item xsl 11414 11415option: the url of the stylesheet 11416 11417Example: 11418 11419 $t->add_stylesheet( xsl => "xsl_style.xsl"); 11420 11421will generate the following PI at the beginning of the document: 11422 11423 <?xml-stylesheet type="text/xsl" href="xsl_style.xsl"?> 11424 11425=item css 11426 11427option: the url of the stylesheet 11428 11429=item active_twig 11430 11431a class method that returns the last processed twig, so you don't necessarily 11432need the object to call methods on it. 11433 11434=back 11435 11436=item Methods inherited from XML::Parser::Expat 11437 11438A twig inherits all the relevant methods from XML::Parser::Expat. These 11439methods can only be used during the parsing phase (they will generate 11440a fatal error otherwise). 11441 11442Inherited methods are: 11443 11444=over 4 11445 11446=item depth 11447 11448Returns the size of the context list. 11449 11450=item in_element 11451 11452Returns true if NAME is equal to the name of the innermost cur‐ 11453rently opened element. If namespace processing is being used and 11454you want to check against a name that may be in a namespace, then 11455use the generate_ns_name method to create the NAME argument. 11456 11457=item within_element 11458 11459Returns the number of times the given name appears in the context 11460list. If namespace processing is being used and you want to check 11461against a name that may be in a namespace, then use the gener‐ 11462ate_ns_name method to create the NAME argument. 11463 11464=item context 11465 11466Returns a list of element names that represent open elements, with 11467the last one being the innermost. Inside start and end tag han‐ 11468dlers, this will be the tag of the parent element. 11469 11470=item current_line 11471 11472Returns the line number of the current position of the parse. 11473 11474=item current_column 11475 11476Returns the column number of the current position of the parse. 11477 11478=item current_byte 11479 11480Returns the current position of the parse. 11481 11482=item position_in_context 11483 11484Returns a string that shows the current parse position. LINES 11485should be an integer >= 0 that represents the number of lines on 11486either side of the current parse line to place into the returned 11487string. 11488 11489=item base ([NEWBASE]) 11490 11491Returns the current value of the base for resolving relative URIs. 11492If NEWBASE is supplied, changes the base to that value. 11493 11494=item current_element 11495 11496Returns the name of the innermost currently opened element. Inside 11497start or end handlers, returns the parent of the element associated 11498with those tags. 11499 11500=item element_index 11501 11502Returns an integer that is the depth-first visit order of the cur‐ 11503rent element. This will be zero outside of the root element. For 11504example, this will return 1 when called from the start handler for 11505the root element start tag. 11506 11507=item recognized_string 11508 11509Returns the string from the document that was recognized in order 11510to call the current handler. For instance, when called from a start 11511handler, it will give us the start-tag string. The string is 11512encoded in UTF-8. This method doesn't return a meaningful string 11513inside declaration handlers. 11514 11515=item original_string 11516 11517Returns the verbatim string from the document that was recognized 11518in order to call the current handler. The string is in the original 11519document encoding. This method doesn't return a meaningful string 11520inside declaration handlers. 11521 11522=item xpcroak 11523 11524Concatenate onto the given message the current line number within 11525the XML document plus the message implied by ErrorContext. Then 11526croak with the formed message. 11527 11528=item xpcarp 11529 11530Concatenate onto the given message the current line number within 11531the XML document plus the message implied by ErrorContext. Then 11532carp with the formed message. 11533 11534=item xml_escape(TEXT [, CHAR [, CHAR ...]]) 11535 11536Returns TEXT with markup characters turned into character entities. 11537Any additional characters provided as arguments are also turned 11538into character references where found in TEXT. 11539 11540(this method is broken on some versions of expat/XML::Parser) 11541 11542=back 11543 11544=item path ( $optional_tag) 11545 11546Return the element context in a form similar to XPath's short 11547form: 'C</root/tag1/../tag>' 11548 11549=item get_xpath ( $optional_array_ref, $xpath, $optional_offset) 11550 11551Performs a C<get_xpath> on the document root (see <Elt|"Elt">) 11552 11553If the C<$optional_array_ref> argument is used the array must contain 11554elements. The C<$xpath> expression is applied to each element in turn 11555and the result is union of all results. This way a first query can be 11556refined in further steps. 11557 11558 11559=item find_nodes ( $optional_array_ref, $xpath, $optional_offset) 11560 11561same as C<get_xpath> 11562 11563=item findnodes ( $optional_array_ref, $xpath, $optional_offset) 11564 11565same as C<get_xpath> (similar to the XML::LibXML method) 11566 11567=item findvalue ( $optional_array_ref, $xpath, $optional_offset) 11568 11569Return the C<join> of all texts of the results of applying C<L<get_xpath>> 11570to the node (similar to the XML::LibXML method) 11571 11572=item findvalues ( $optional_array_ref, $xpath, $optional_offset) 11573 11574Return an array of all texts of the results of applying C<L<get_xpath>> 11575to the node 11576 11577=item subs_text ($regexp, $replace) 11578 11579subs_text does text substitution on the whole document, similar to perl's 11580C< s///> operator. 11581 11582=item dispose 11583 11584Useful only if you don't have C<Scalar::Util> or C<WeakRef> installed. 11585 11586Reclaims properly the memory used by an XML::Twig object. As the object has 11587circular references it never goes out of scope, so if you want to parse lots 11588of XML documents then the memory leak becomes a problem. Use 11589C<< $twig->dispose >> to clear this problem. 11590 11591=item att_accessors (list_of_attribute_names) 11592 11593A convenience method that creates l-valued accessors for attributes. 11594So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11595that can be called on elements: 11596 11597 $elt->foo; # equivalent to $elt->{'att'}->{'foo'}; 11598 $elt->foo( 'bar'); # equivalent to $elt->set_att( foo => 'bar'); 11599 11600The methods are l-valued only under those perl's that support this 11601feature (5.6 and above) 11602 11603=item create_accessors (list_of_attribute_names) 11604 11605Same as att_accessors 11606 11607=item elt_accessors (list_of_attribute_names) 11608 11609A convenience method that creates accessors for elements. 11610So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11611that can be called on elements: 11612 11613 $elt->foo; # equivalent to $elt->first_child( 'foo'); 11614 11615=item field_accessors (list_of_attribute_names) 11616 11617A convenience method that creates accessors for element values (C<field>). 11618So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method 11619that can be called on elements: 11620 11621 $elt->foo; # equivalent to $elt->field( 'foo'); 11622 11623=item set_do_not_escape_amp_in_atts 11624 11625An evil method, that I only document because Test::Pod::Coverage complaints otherwise, 11626but really, you don't want to know about it. 11627 11628=back 11629 11630=head2 XML::Twig::Elt 11631 11632=over 4 11633 11634=item new ($optional_tag, $optional_atts, @optional_content) 11635 11636The C<tag> is optional (but then you can't have a content ), the C<$optional_atts> 11637argument is a reference to a hash of attributes, the content can be just a 11638string or a list of strings and element. A content of 'C<#EMPTY>' creates an empty 11639element; 11640 11641 Examples: my $elt= XML::Twig::Elt->new(); 11642 my $elt= XML::Twig::Elt->new( para => { align => 'center' }); 11643 my $elt= XML::Twig::Elt->new( para => { align => 'center' }, 'foo'); 11644 my $elt= XML::Twig::Elt->new( br => '#EMPTY'); 11645 my $elt= XML::Twig::Elt->new( 'para'); 11646 my $elt= XML::Twig::Elt->new( para => 'this is a para'); 11647 my $elt= XML::Twig::Elt->new( para => $elt3, 'another para'); 11648 11649The strings are not parsed, the element is not attached to any twig. 11650 11651B<WARNING>: if you rely on ID's then you will have to set the id yourself. At 11652this point the element does not belong to a twig yet, so the ID attribute 11653is not known so it won't be stored in the ID list. 11654 11655Note that C<#COMMENT>, C<#PCDATA> or C<#CDATA> are valid tag names, that will 11656create text elements. 11657 11658To create an element C<foo> containing a CDATA section: 11659 11660 my $foo= XML::Twig::Elt->new( '#CDATA' => "content of the CDATA section") 11661 ->wrap_in( 'foo'); 11662 11663An attribute of '#CDATA', will create the content of the element as CDATA: 11664 11665 my $elt= XML::Twig::Elt->new( 'p' => { '#CDATA' => 1}, 'foo < bar'); 11666 11667creates an element 11668 11669 <p><![CDATA[foo < bar]]></> 11670 11671=item parse ($string, %args) 11672 11673Creates an element from an XML string. The string is actually 11674parsed as a new twig, then the root of that twig is returned. 11675The arguments in C<%args> are passed to the twig. 11676As always if the parse fails the parser will die, so use an 11677eval if you want to trap syntax errors. 11678 11679As obviously the element does not exist beforehand this method has to be 11680called on the class: 11681 11682 my $elt= parse XML::Twig::Elt( "<a> string to parse, with <sub/> 11683 <elements>, actually tons of </elements> 11684 h</a>"); 11685 11686=item set_inner_xml ($string) 11687 11688Sets the content of the element to be the tree created from the string 11689 11690=item set_inner_html ($string) 11691 11692Sets the content of the element, after parsing the string with an HTML 11693parser (HTML::Parser) 11694 11695=item set_outer_xml ($string) 11696 11697Replaces the element with the tree created from the string 11698 11699=item print ($optional_filehandle, $optional_pretty_print_style) 11700 11701Prints an entire element, including the tags, optionally to a 11702C<$optional_filehandle>, optionally with a C<$pretty_print_style>. 11703 11704The print outputs XML data so base entities are escaped. 11705 11706=item print_to_file ($filename, %options) 11707 11708Prints the element to file C<$filename>. 11709 11710options: see C<flush>. 11711=item sprint ($elt, $optional_no_enclosing_tag) 11712 11713Return the xml string for an entire element, including the tags. 11714If the optional second argument is true then only the string inside the 11715element is returned (the start and end tag for $elt are not). 11716The text is XML-escaped: base entities (& and < in text, & < and " in 11717attribute values) are turned into entities. 11718 11719=item gi 11720 11721Return the gi of the element (the gi is the C<generic identifier> the tag 11722name in SGML parlance). 11723 11724C<tag> and C<name> are synonyms of C<gi>. 11725 11726=item tag 11727 11728Same as C<L<gi>> 11729 11730=item name 11731 11732Same as C<L<tag>> 11733 11734=item set_gi ($tag) 11735 11736Set the gi (tag) of an element 11737 11738=item set_tag ($tag) 11739 11740Set the tag (=C<L<tag>>) of an element 11741 11742=item set_name ($name) 11743 11744Set the name (=C<L<tag>>) of an element 11745 11746=item root 11747 11748Return the root of the twig in which the element is contained. 11749 11750=item twig 11751 11752Return the twig containing the element. 11753 11754=item parent ($optional_condition) 11755 11756Return the parent of the element, or the first ancestor matching the 11757C<$optional_condition> 11758 11759=item first_child ($optional_condition) 11760 11761Return the first child of the element, or the first child matching the 11762C<$optional_condition> 11763 11764=item has_child ($optional_condition) 11765 11766Return the first child of the element, or the first child matching the 11767C<$optional_condition> (same as L<first_child>) 11768 11769=item has_children ($optional_condition) 11770 11771Return the first child of the element, or the first child matching the 11772C<$optional_condition> (same as L<first_child>) 11773 11774 11775=item first_child_text ($optional_condition) 11776 11777Return the text of the first child of the element, or the first child 11778 matching the C<$optional_condition> 11779If there is no first_child then returns ''. This avoids getting the 11780child, checking for its existence then getting the text for trivial cases. 11781 11782Similar methods are available for the other navigation methods: 11783 11784=over 4 11785 11786=item last_child_text 11787 11788=item prev_sibling_text 11789 11790=item next_sibling_text 11791 11792=item prev_elt_text 11793 11794=item next_elt_text 11795 11796=item child_text 11797 11798=item parent_text 11799 11800=back 11801 11802All this methods also exist in "trimmed" variant: 11803 11804=over 4 11805 11806=item first_child_trimmed_text 11807 11808=item last_child_trimmed_text 11809 11810=item prev_sibling_trimmed_text 11811 11812=item next_sibling_trimmed_text 11813 11814=item prev_elt_trimmed_text 11815 11816=item next_elt_trimmed_text 11817 11818=item child_trimmed_text 11819 11820=item parent_trimmed_text 11821 11822=back 11823 11824=item field ($condition) 11825 11826Same method as C<first_child_text> with a different name 11827 11828=item fields ($condition_list) 11829 11830Return the list of field (text of first child matching the conditions), 11831missing fields are returned as the empty string. 11832 11833Same method as C<first_child_text> with a different name 11834 11835=item trimmed_field ($optional_condition) 11836 11837Same method as C<first_child_trimmed_text> with a different name 11838 11839=item set_field ($condition, $optional_atts, @list_of_elt_and_strings) 11840 11841Set the content of the first child of the element that matches 11842C<$condition>, the rest of the arguments is the same as for C<L<set_content>> 11843 11844If no child matches C<$condition> _and_ if C<$condition> is a valid 11845XML element name, then a new element by that name is created and 11846inserted as the last child. 11847 11848=item first_child_matches ($optional_condition) 11849 11850Return the element if the first child of the element (if it exists) passes 11851the C<$optional_condition> C<undef> otherwise 11852 11853 if( $elt->first_child_matches( 'title')) ... 11854 11855is equivalent to 11856 11857 if( $elt->{first_child} && $elt->{first_child}->passes( 'title')) 11858 11859C<first_child_is> is an other name for this method 11860 11861Similar methods are available for the other navigation methods: 11862 11863=over 4 11864 11865=item last_child_matches 11866 11867=item prev_sibling_matches 11868 11869=item next_sibling_matches 11870 11871=item prev_elt_matches 11872 11873=item next_elt_matches 11874 11875=item child_matches 11876 11877=item parent_matches 11878 11879=back 11880 11881=item is_first_child ($optional_condition) 11882 11883returns true (the element) if the element is the first child of its parent 11884(optionally that satisfies the C<$optional_condition>) 11885 11886=item is_last_child ($optional_condition) 11887 11888returns true (the element) if the element is the last child of its parent 11889(optionally that satisfies the C<$optional_condition>) 11890 11891=item prev_sibling ($optional_condition) 11892 11893Return the previous sibling of the element, or the previous sibling matching 11894C<$optional_condition> 11895 11896=item next_sibling ($optional_condition) 11897 11898Return the next sibling of the element, or the first one matching 11899C<$optional_condition>. 11900 11901=item next_elt ($optional_elt, $optional_condition) 11902 11903Return the next elt (optionally matching C<$optional_condition>) of the element. This 11904is defined as the next element which opens after the current element opens. 11905Which usually means the first child of the element. 11906Counter-intuitive as it might look this allows you to loop through the 11907whole document by starting from the root. 11908 11909The C<$optional_elt> is the root of a subtree. When the C<next_elt> is out of the 11910subtree then the method returns undef. You can then walk a sub-tree with: 11911 11912 my $elt= $subtree_root; 11913 while( $elt= $elt->next_elt( $subtree_root)) 11914 { # insert processing code here 11915 } 11916 11917=item prev_elt ($optional_condition) 11918 11919Return the previous elt (optionally matching C<$optional_condition>) of the 11920element. This is the first element which opens before the current one. 11921It is usually either the last descendant of the previous sibling or 11922simply the parent 11923 11924=item next_n_elt ($offset, $optional_condition) 11925 11926Return the C<$offset>-th element that matches the C<$optional_condition> 11927 11928=item following_elt 11929 11930Return the following element (as per the XPath following axis) 11931 11932=item preceding_elt 11933 11934Return the preceding element (as per the XPath preceding axis) 11935 11936=item following_elts 11937 11938Return the list of following elements (as per the XPath following axis) 11939 11940=item preceding_elts 11941 11942Return the list of preceding elements (as per the XPath preceding axis) 11943 11944=item children ($optional_condition) 11945 11946Return the list of children (optionally which matches C<$optional_condition>) of 11947the element. The list is in document order. 11948 11949=item children_count ($optional_condition) 11950 11951Return the number of children of the element (optionally which matches 11952C<$optional_condition>) 11953 11954=item children_text ($optional_condition) 11955 11956In array context, returns an array containing the text of children of the 11957element (optionally which matches C<$optional_condition>) 11958 11959In scalar context, returns the concatenation of the text of children of 11960the element 11961 11962=item children_trimmed_text ($optional_condition) 11963 11964In array context, returns an array containing the trimmed text of children 11965of the element (optionally which matches C<$optional_condition>) 11966 11967In scalar context, returns the concatenation of the trimmed text of children of 11968the element 11969 11970 11971=item children_copy ($optional_condition) 11972 11973Return a list of elements that are copies of the children of the element, 11974optionally which matches C<$optional_condition> 11975 11976=item descendants ($optional_condition) 11977 11978Return the list of all descendants (optionally which matches 11979C<$optional_condition>) of the element. This is the equivalent of the 11980C<getElementsByTagName> of the DOM (by the way, if you are really a DOM 11981addict, you can use C<getElementsByTagName> instead) 11982 11983=item getElementsByTagName ($optional_condition) 11984 11985Same as C<L<descendants>> 11986 11987=item find_by_tag_name ($optional_condition) 11988 11989Same as C<L<descendants>> 11990 11991=item descendants_or_self ($optional_condition) 11992 11993Same as C<L<descendants>> except that the element itself is included in the list 11994if it matches the C<$optional_condition> 11995 11996=item first_descendant ($optional_condition) 11997 11998Return the first descendant of the element that matches the condition 11999 12000=item last_descendant ($optional_condition) 12001 12002Return the last descendant of the element that matches the condition 12003 12004=item ancestors ($optional_condition) 12005 12006Return the list of ancestors (optionally matching C<$optional_condition>) of the 12007element. The list is ordered from the innermost ancestor to the outermost one 12008 12009NOTE: the element itself is not part of the list, in order to include it 12010you will have to use ancestors_or_self 12011 12012=item ancestors_or_self ($optional_condition) 12013 12014Return the list of ancestors (optionally matching C<$optional_condition>) of the 12015element, including the element (if it matches the condition>). 12016The list is ordered from the innermost ancestor to the outermost one 12017 12018=item passes ($condition) 12019 12020Return the element if it passes the C<$condition> 12021 12022=item att ($att) 12023 12024Return the value of attribute C<$att> or C<undef> 12025 12026=item latt ($att) 12027 12028Return the value of attribute C<$att> or C<undef> 12029 12030this method is an lvalue, so you can do C<< $elt->latt( 'foo')= 'bar' >> or C<< $elt->latt( 'foo')++; >> 12031 12032=item set_att ($att, $att_value) 12033 12034Set the attribute of the element to the given value 12035 12036You can actually set several attributes this way: 12037 12038 $elt->set_att( att1 => "val1", att2 => "val2"); 12039 12040=item del_att ($att) 12041 12042Delete the attribute for the element 12043 12044You can actually delete several attributes at once: 12045 12046 $elt->del_att( 'att1', 'att2', 'att3'); 12047 12048=item att_exists ($att) 12049 12050Returns true if the attribute C<$att> exists for the element, false 12051otherwise 12052 12053=item cut 12054 12055Cut the element from the tree. The element still exists, it can be copied 12056or pasted somewhere else, it is just not attached to the tree anymore. 12057 12058Note that the "old" links to the parent, previous and next siblings can 12059still be accessed using the former_* methods 12060 12061=item former_next_sibling 12062 12063Returns the former next sibling of a cut node (or undef if the node has not been cut) 12064 12065This makes it easier to write loops where you cut elements: 12066 12067 my $child= $parent->first_child( 'achild'); 12068 while( $child->{'att'}->{'cut'}) 12069 { $child->cut; $child= ($child->{former} && $child->{former}->{next_sibling}); } 12070 12071=item former_prev_sibling 12072 12073Returns the former previous sibling of a cut node (or undef if the node has not been cut) 12074 12075=item former_parent 12076 12077Returns the former parent of a cut node (or undef if the node has not been cut) 12078 12079=item cut_children ($optional_condition) 12080 12081Cut all the children of the element (or all of those which satisfy the 12082C<$optional_condition>). 12083 12084Return the list of children 12085 12086=item cut_descendants ($optional_condition) 12087 12088Cut all the descendants of the element (or all of those which satisfy the 12089C<$optional_condition>). 12090 12091Return the list of descendants 12092 12093=item copy ($elt) 12094 12095Return a copy of the element. The copy is a "deep" copy: all sub-elements of 12096the element are duplicated. 12097 12098=item paste ($optional_position, $ref) 12099 12100Paste a (previously C<cut> or newly generated) element. Die if the element 12101already belongs to a tree. 12102 12103Note that the calling element is pasted: 12104 12105 $child->paste( first_child => $existing_parent); 12106 $new_sibling->paste( after => $this_sibling_is_already_in_the_tree); 12107 12108or 12109 12110 my $new_elt= XML::Twig::Elt->new( tag => $content); 12111 $new_elt->paste( $position => $existing_elt); 12112 12113Example: 12114 12115 my $t= XML::Twig->new->parse( 'doc.xml') 12116 my $toc= $t->root->new( 'toc'); 12117 $toc->paste( $t->root); # $toc is pasted as first child of the root 12118 foreach my $title ($t->findnodes( '/doc/section/title')) 12119 { my $title_toc= $title->copy; 12120 # paste $title_toc as the last child of toc 12121 $title_toc->paste( last_child => $toc) 12122 } 12123 12124Position options: 12125 12126=over 4 12127 12128=item first_child (default) 12129 12130The element is pasted as the first child of C<$ref> 12131 12132=item last_child 12133 12134The element is pasted as the last child of C<$ref> 12135 12136=item before 12137 12138The element is pasted before C<$ref>, as its previous sibling. 12139 12140=item after 12141 12142The element is pasted after C<$ref>, as its next sibling. 12143 12144=item within 12145 12146In this case an extra argument, C<$offset>, should be supplied. The element 12147will be pasted in the reference element (or in its first text child) at the 12148given offset. To achieve this the reference element will be split at the 12149offset. 12150 12151=back 12152 12153Note that you can call directly the underlying method: 12154 12155=over 4 12156 12157=item paste_before 12158 12159=item paste_after 12160 12161=item paste_first_child 12162 12163=item paste_last_child 12164 12165=item paste_within 12166 12167=back 12168 12169=item move ($optional_position, $ref) 12170 12171Move an element in the tree. 12172This is just a C<cut> then a C<paste>. The syntax is the same as C<paste>. 12173 12174=item replace ($ref) 12175 12176Replaces an element in the tree. Sometimes it is just not possible toC<cut> 12177an element then C<paste> another in its place, so C<replace> comes in handy. 12178The calling element replaces C<$ref>. 12179 12180=item replace_with (@elts) 12181 12182Replaces the calling element with one or more elements 12183 12184=item delete 12185 12186Cut the element and frees the memory. 12187 12188=item prefix ($text, $optional_option) 12189 12190Add a prefix to an element. If the element is a C<PCDATA> element the text 12191is added to the pcdata, if the elements first child is a C<PCDATA> then the 12192text is added to it's pcdata, otherwise a new C<PCDATA> element is created 12193and pasted as the first child of the element. 12194 12195If the option is C<asis> then the prefix is added asis: it is created in 12196a separate C<PCDATA> element with an C<asis> property. You can then write: 12197 12198 $elt1->prefix( '<b>', 'asis'); 12199 12200to create a C<< <b> >> in the output of C<print>. 12201 12202=item suffix ($text, $optional_option) 12203 12204Add a suffix to an element. If the element is a C<PCDATA> element the text 12205is added to the pcdata, if the elements last child is a C<PCDATA> then the 12206text is added to it's pcdata, otherwise a new PCDATA element is created 12207and pasted as the last child of the element. 12208 12209If the option is C<asis> then the suffix is added asis: it is created in 12210a separate C<PCDATA> element with an C<asis> property. You can then write: 12211 12212 $elt2->suffix( '</b>', 'asis'); 12213 12214=item trim 12215 12216Trim the element in-place: spaces at the beginning and at the end of the element 12217are discarded and multiple spaces within the element (or its descendants) are 12218replaced by a single space. 12219 12220Note that in some cases you can still end up with multiple spaces, if they are 12221split between several elements: 12222 12223 <doc> text <b> hah! </b> yep</doc> 12224 12225gets trimmed to 12226 12227 <doc>text <b> hah! </b> yep</doc> 12228 12229This is somewhere in between a bug and a feature. 12230 12231=item normalize 12232 12233merge together all consecutive pcdata elements in the element (if for example 12234you have turned some elements into pcdata using C<L<erase>>, this will give you 12235a "clean" element in which there all text fragments are as long as possible). 12236 12237 12238=item simplify (%options) 12239 12240Return a data structure suspiciously similar to XML::Simple's. Options are 12241identical to XMLin options, see XML::Simple doc for more details (or use 12242DATA::dumper or YAML to dump the data structure) 12243 12244B<Note>: there is no magic here, if you write 12245C<< $twig->parsefile( $file )->simplify(); >> then it will load the entire 12246document in memory. I am afraid you will have to put some work into it to 12247get just the bits you want and discard the rest. Look at the synopsis or 12248the XML::Twig 101 section at the top of the docs for more information. 12249 12250=over 4 12251 12252=item content_key 12253 12254=item forcearray 12255 12256=item keyattr 12257 12258=item noattr 12259 12260=item normalize_space 12261 12262aka normalise_space 12263 12264=item variables (%var_hash) 12265 12266%var_hash is a hash { name => value } 12267 12268This option allows variables in the XML to be expanded when the file is read. (there is no facility for putting the variable names back if you regenerate XML using XMLout). 12269 12270A 'variable' is any text of the form ${name} (or $name) which occurs in an attribute value or in the text content of an element. If 'name' matches a key in the supplied hashref, ${name} will be replaced with the corresponding value from the hashref. If no matching key is found, the variable will not be replaced. 12271 12272=item var_att ($attribute_name) 12273 12274This option gives the name of an attribute that will be used to create 12275variables in the XML: 12276 12277 <dirs> 12278 <dir name="prefix">/usr/local</dir> 12279 <dir name="exec_prefix">$prefix/bin</dir> 12280 </dirs> 12281 12282use C<< var => 'name' >> to get $prefix replaced by /usr/local in the 12283generated data structure 12284 12285By default variables are captured by the following regexp: /$(\w+)/ 12286 12287=item var_regexp (regexp) 12288 12289This option changes the regexp used to capture variables. The variable 12290name should be in $1 12291 12292=item group_tags { grouping tag => grouped tag, grouping tag 2 => grouped tag 2...} 12293 12294Option used to simplify the structure: elements listed will not be used. 12295Their children will be, they will be considered children of the element 12296parent. 12297 12298If the element is: 12299 12300 <config host="laptop.xmltwig.org"> 12301 <server>localhost</server> 12302 <dirs> 12303 <dir name="base">/home/mrodrigu/standards</dir> 12304 <dir name="tools">$base/tools</dir> 12305 </dirs> 12306 <templates> 12307 <template name="std_def">std_def.templ</template> 12308 <template name="dummy">dummy</template> 12309 </templates> 12310 </config> 12311 12312Then calling simplify with C<< group_tags => { dirs => 'dir', 12313templates => 'template'} >> 12314makes the data structure be exactly as if the start and end tags for C<dirs> and 12315C<templates> were not there. 12316 12317A YAML dump of the structure 12318 12319 base: '/home/mrodrigu/standards' 12320 host: laptop.xmltwig.org 12321 server: localhost 12322 template: 12323 - std_def.templ 12324 - dummy.templ 12325 tools: '$base/tools' 12326 12327 12328=back 12329 12330=item split_at ($offset) 12331 12332Split a text (C<PCDATA> or C<CDATA>) element in 2 at C<$offset>, the original 12333element now holds the first part of the string and a new element holds the 12334right part. The new element is returned 12335 12336If the element is not a text element then the first text child of the element 12337is split 12338 12339=item split ( $optional_regexp, $tag1, $atts1, $tag2, $atts2...) 12340 12341Split the text descendants of an element in place, the text is split using 12342the C<$regexp>, if the regexp includes () then the matched separators will be 12343wrapped in elements. C<$1> is wrapped in $tag1, with attributes C<$atts1> if 12344C<$atts1> is given (as a hashref), C<$2> is wrapped in $tag2... 12345 12346if $elt is C<< <p>tati tata <b>tutu tati titi</b> tata tati tata</p> >> 12347 12348 $elt->split( qr/(ta)ti/, 'foo', {type => 'toto'} ) 12349 12350will change $elt to 12351 12352 <p><foo type="toto">ta</foo> tata <b>tutu <foo type="toto">ta</foo> 12353 titi</b> tata <foo type="toto">ta</foo> tata</p> 12354 12355The regexp can be passed either as a string or as C<qr//> (perl 5.005 and 12356later), it defaults to \s+ just as the C<split> built-in (but this would be 12357quite a useless behaviour without the C<$optional_tag> parameter) 12358 12359C<$optional_tag> defaults to PCDATA or CDATA, depending on the initial element 12360type 12361 12362The list of descendants is returned (including un-touched original elements 12363and newly created ones) 12364 12365=item mark ( $regexp, $optional_tag, $optional_attribute_ref) 12366 12367This method behaves exactly as L<split>, except only the newly created 12368elements are returned 12369 12370=item wrap_children ( $regexp_string, $tag, $optional_attribute_hashref) 12371 12372Wrap the children of the element that match the regexp in an element C<$tag>. 12373If $optional_attribute_hashref is passed then the new element will 12374have these attributes. 12375 12376The $regexp_string includes tags, within pointy brackets, as in 12377C<< <title><para>+ >> and the usual Perl modifiers (+*?...). 12378Tags can be further qualified with attributes: 12379C<< <para type="warning" classif="cosmic_secret">+ >>. The values 12380for attributes should be xml-escaped: C<< <candy type="M&Ms">* >> 12381(C<E<lt>>, C<&> B<C<E<gt>>> and C<"> should be escaped). 12382 12383Note that elements might get extra C<id> attributes in the process. See L<add_id>. 12384Use L<strip_att> to remove unwanted id's. 12385 12386Here is an example: 12387 12388If the element C<$elt> has the following content: 12389 12390 <elt> 12391 <p>para 1</p> 12392 <l_l1_1>list 1 item 1 para 1</l_l1_1> 12393 <l_l1>list 1 item 1 para 2</l_l1> 12394 <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n> 12395 <l_l1_n>list 1 item 3 para 1</l_l1_n> 12396 <l_l1>list 1 item 3 para 2</l_l1> 12397 <l_l1>list 1 item 3 para 3</l_l1> 12398 <l_l1_1>list 2 item 1 para 1</l_l1_1> 12399 <l_l1>list 2 item 1 para 2</l_l1> 12400 <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n> 12401 <l_l1_n>list 2 item 3 para 1</l_l1_n> 12402 <l_l1>list 2 item 3 para 2</l_l1> 12403 <l_l1>list 2 item 3 para 3</l_l1> 12404 </elt> 12405 12406Then the code 12407 12408 $elt->wrap_children( q{<l_l1_1><l_l1>*} , li => { type => "ul1" }); 12409 $elt->wrap_children( q{<l_l1_n><l_l1>*} , li => { type => "ul" }); 12410 12411 $elt->wrap_children( q{<li type="ul1"><li type="ul">+}, "ul"); 12412 $elt->strip_att( 'id'); 12413 $elt->strip_att( 'type'); 12414 $elt->print; 12415 12416will output: 12417 12418 <elt> 12419 <p>para 1</p> 12420 <ul> 12421 <li> 12422 <l_l1_1>list 1 item 1 para 1</l_l1_1> 12423 <l_l1>list 1 item 1 para 2</l_l1> 12424 </li> 12425 <li> 12426 <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n> 12427 </li> 12428 <li> 12429 <l_l1_n>list 1 item 3 para 1</l_l1_n> 12430 <l_l1>list 1 item 3 para 2</l_l1> 12431 <l_l1>list 1 item 3 para 3</l_l1> 12432 </li> 12433 </ul> 12434 <ul> 12435 <li> 12436 <l_l1_1>list 2 item 1 para 1</l_l1_1> 12437 <l_l1>list 2 item 1 para 2</l_l1> 12438 </li> 12439 <li> 12440 <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n> 12441 </li> 12442 <li> 12443 <l_l1_n>list 2 item 3 para 1</l_l1_n> 12444 <l_l1>list 2 item 3 para 2</l_l1> 12445 <l_l1>list 2 item 3 para 3</l_l1> 12446 </li> 12447 </ul> 12448 </elt> 12449 12450=item subs_text ($regexp, $replace) 12451 12452subs_text does text substitution, similar to perl's C< s///> operator. 12453 12454C<$regexp> must be a perl regexp, created with the C<qr> operator. 12455 12456C<$replace> can include C<$1, $2>... from the C<$regexp>. It can also be 12457used to create element and entities, by using 12458C<< &elt( tag => { att => val }, text) >> (similar syntax as C<L<new>>) and 12459C<< &ent( name) >>. 12460 12461Here is a rather complex example: 12462 12463 $elt->subs_text( qr{(?<!do not )link to (http://([^\s,]*))}, 12464 'see &elt( a =>{ href => $1 }, $2)' 12465 ); 12466 12467This will replace text like I<link to http://www.xmltwig.org> by 12468I<< see <a href="www.xmltwig.org">www.xmltwig.org</a> >>, but not 12469I<do not link to...> 12470 12471Generating entities (here replacing spaces with ): 12472 12473 $elt->subs_text( qr{ }, '&ent( " ")'); 12474 12475or, using a variable: 12476 12477 my $ent=" "; 12478 $elt->subs_text( qr{ }, "&ent( '$ent')"); 12479 12480Note that the substitution is always global, as in using the C<g> modifier 12481in a perl substitution, and that it is performed on all text descendants 12482of the element. 12483 12484B<Bug>: in the C<$regexp>, you can only use C<\1>, C<\2>... if the replacement 12485expression does not include elements or attributes. eg 12486 12487 $t->subs_text( qr/((t[aiou])\2)/, '$2'); # ok, replaces toto, tata, titi, tutu by to, ta, ti, tu 12488 $t->subs_text( qr/((t[aiou])\2)/, '&elt(p => $1)' ); # NOK, does not find toto... 12489 12490=item add_id ($optional_coderef) 12491 12492Add an id to the element. 12493 12494The id is an attribute, C<id> by default, see the C<id> option for XML::Twig 12495C<new> to change it. Use an id starting with C<#> to get an id that's not 12496output by L<print>, L<flush> or L<sprint>, yet that allows you to use the 12497L<elt_id> method to get the element easily. 12498 12499If the element already has an id, no new id is generated. 12500 12501By default the method create an id of the form C<< twig_id_<nnnn> >>, 12502where C<< <nnnn> >> is a number, incremented each time the method is called 12503successfully. 12504 12505=item set_id_seed ($prefix) 12506 12507by default the id generated by C<L<add_id>> is C<< twig_id_<nnnn> >>, 12508C<set_id_seed> changes the prefix to C<$prefix> and resets the number 12509to 1 12510 12511=item strip_att ($att) 12512 12513Remove the attribute C<$att> from all descendants of the element (including 12514the element) 12515 12516Return the element 12517 12518=item change_att_name ($old_name, $new_name) 12519 12520Change the name of the attribute from C<$old_name> to C<$new_name>. If there is no 12521attribute C<$old_name> nothing happens. 12522 12523=item lc_attnames 12524 12525Lower cases the name all the attributes of the element. 12526 12527=item sort_children_on_value( %options) 12528 12529Sort the children of the element in place according to their text. 12530All children are sorted. 12531 12532Return the element, with its children sorted. 12533 12534 12535C<%options> are 12536 12537 type : numeric | alpha (default: alpha) 12538 order : normal | reverse (default: normal) 12539 12540Return the element, with its children sorted 12541 12542 12543=item sort_children_on_att ($att, %options) 12544 12545Sort the children of the element in place according to attribute C<$att>. 12546C<%options> are the same as for C<sort_children_on_value> 12547 12548Return the element. 12549 12550 12551=item sort_children_on_field ($tag, %options) 12552 12553Sort the children of the element in place, according to the field C<$tag> (the 12554text of the first child of the child with this tag). C<%options> are the same 12555as for C<sort_children_on_value>. 12556 12557Return the element, with its children sorted 12558 12559 12560=item sort_children( $get_key, %options) 12561 12562Sort the children of the element in place. The C<$get_key> argument is 12563a reference to a function that returns the sort key when passed an element. 12564 12565For example: 12566 12567 $elt->sort_children( sub { $_[0]->{'att'}->{"nb"} + $_[0]->text }, 12568 type => 'numeric', order => 'reverse' 12569 ); 12570 12571=item field_to_att ($cond, $att) 12572 12573Turn the text of the first sub-element matched by C<$cond> into the value of 12574attribute C<$att> of the element. If C<$att> is omitted then C<$cond> is used 12575as the name of the attribute, which makes sense only if C<$cond> is a valid 12576element (and attribute) name. 12577 12578The sub-element is then cut. 12579 12580=item att_to_field ($att, $tag) 12581 12582Take the value of attribute C<$att> and create a sub-element C<$tag> as first 12583child of the element. If C<$tag> is omitted then C<$att> is used as the name of 12584the sub-element. 12585 12586 12587=item get_xpath ($xpath, $optional_offset) 12588 12589Return a list of elements satisfying the C<$xpath>. C<$xpath> is an XPATH-like 12590expression. 12591 12592A subset of the XPATH abbreviated syntax is covered: 12593 12594 tag 12595 tag[1] (or any other positive number) 12596 tag[last()] 12597 tag[@att] (the attribute exists for the element) 12598 tag[@att="val"] 12599 tag[@att=~ /regexp/] 12600 tag[att1="val1" and att2="val2"] 12601 tag[att1="val1" or att2="val2"] 12602 tag[string()="toto"] (returns tag elements which text (as per the text method) 12603 is toto) 12604 tag[string()=~/regexp/] (returns tag elements which text (as per the text 12605 method) matches regexp) 12606 expressions can start with / (search starts at the document root) 12607 expressions can start with . (search starts at the current element) 12608 // can be used to get all descendants instead of just direct children 12609 * matches any tag 12610 12611So the following examples from the 12612F<XPath recommendationL<http://www.w3.org/TR/xpath.html#path-abbrev>> work: 12613 12614 para selects the para element children of the context node 12615 * selects all element children of the context node 12616 para[1] selects the first para child of the context node 12617 para[last()] selects the last para child of the context node 12618 */para selects all para grandchildren of the context node 12619 /doc/chapter[5]/section[2] selects the second section of the fifth chapter 12620 of the doc 12621 chapter//para selects the para element descendants of the chapter element 12622 children of the context node 12623 //para selects all the para descendants of the document root and thus selects 12624 all para elements in the same document as the context node 12625 //olist/item selects all the item elements in the same document as the 12626 context node that have an olist parent 12627 .//para selects the para element descendants of the context node 12628 .. selects the parent of the context node 12629 para[@type="warning"] selects all para children of the context node that have 12630 a type attribute with value warning 12631 employee[@secretary and @assistant] selects all the employee children of the 12632 context node that have both a secretary attribute and an assistant 12633 attribute 12634 12635 12636The elements will be returned in the document order. 12637 12638If C<$optional_offset> is used then only one element will be returned, the one 12639with the appropriate offset in the list, starting at 0 12640 12641Quoting and interpolating variables can be a pain when the Perl syntax and the 12642XPATH syntax collide, so use alternate quoting mechanisms like q or qq 12643(I like q{} and qq{} myself). 12644 12645Here are some more examples to get you started: 12646 12647 my $p1= "p1"; 12648 my $p2= "p2"; 12649 my @res= $t->get_xpath( qq{p[string( "$p1") or string( "$p2")]}); 12650 12651 my $a= "a1"; 12652 my @res= $t->get_xpath( qq{//*[@att="$a"]}); 12653 12654 my $val= "a1"; 12655 my $exp= qq{//p[ \@att='$val']}; # you need to use \@ or you will get a warning 12656 my @res= $t->get_xpath( $exp); 12657 12658Note that the only supported regexps delimiters are / and that you must 12659backslash all / in regexps AND in regular strings. 12660 12661XML::Twig does not provide natively full XPATH support, but you can use 12662C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the 12663XPath engine, with full coverage of the spec. 12664 12665C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the 12666XPath engine, with full coverage of the spec. 12667 12668=item find_nodes 12669 12670same asC<get_xpath> 12671 12672=item findnodes 12673 12674same as C<get_xpath> 12675 12676 12677=item text @optional_options 12678 12679Return a string consisting of all the C<PCDATA> and C<CDATA> in an element, 12680without any tags. The text is not XML-escaped: base entities such as C<&> 12681and C<< < >> are not escaped. 12682 12683The 'C<no_recurse>' option will only return the text of the element, not 12684of any included sub-elements (same as C<L<text_only>>). 12685 12686=item text_only 12687 12688Same as C<L<text>> except that the text returned doesn't include 12689the text of sub-elements. 12690 12691=item trimmed_text 12692 12693Same as C<text> except that the text is trimmed: leading and trailing spaces 12694are discarded, consecutive spaces are collapsed 12695 12696=item set_text ($string) 12697 12698Set the text for the element: if the element is a C<PCDATA>, just set its 12699text, otherwise cut all the children of the element and create a single 12700C<PCDATA> child for it, which holds the text. 12701 12702=item merge ($elt2) 12703 12704Move the content of C<$elt2> within the element 12705 12706=item insert ($tag1, [$optional_atts1], $tag2, [$optional_atts2],...) 12707 12708For each tag in the list inserts an element C<$tag> as the only child of the 12709element. The element gets the optional attributes inC<< $optional_atts<n>. >> 12710All children of the element are set as children of the new element. 12711The upper level element is returned. 12712 12713 $p->insert( table => { border=> 1}, 'tr', 'td') 12714 12715put C<$p> in a table with a visible border, a single C<tr> and a single C<td> 12716and return the C<table> element: 12717 12718 <p><table border="1"><tr><td>original content of p</td></tr></table></p> 12719 12720=item wrap_in (@tag) 12721 12722Wrap elements in C<@tag> as the successive ancestors of the element, returns the 12723new element. 12724C<< $elt->wrap_in( 'td', 'tr', 'table') >> wraps the element as a single cell in a 12725table for example. 12726 12727Optionally each tag can be followed by a hashref of attributes, that will be 12728set on the wrapping element: 12729 12730 $elt->wrap_in( p => { class => "advisory" }, div => { class => "intro", id => "div_intro" }); 12731 12732=item insert_new_elt ($opt_position, $tag, $opt_atts_hashref, @opt_content) 12733 12734Combines a C<L<new> > and a C<L<paste> >: creates a new element using 12735C<$tag>, C<$opt_atts_hashref >and C<@opt_content> which are arguments similar 12736to those for C<new>, then paste it, using C<$opt_position> or C<'first_child'>, 12737relative to C<$elt>. 12738 12739Return the newly created element 12740 12741=item erase 12742 12743Erase the element: the element is deleted and all of its children are 12744pasted in its place. 12745 12746=item set_content ( $optional_atts, @list_of_elt_and_strings) 12747 ( $optional_atts, '#EMPTY') 12748 12749Set the content for the element, from a list of strings and 12750elements. Cuts all the element children, then pastes the list 12751elements as the children. This method will create a C<PCDATA> element 12752for any strings in the list. 12753 12754The C<$optional_atts> argument is the ref of a hash of attributes. If this 12755argument is used then the previous attributes are deleted, otherwise they 12756are left untouched. 12757 12758B<WARNING>: if you rely on ID's then you will have to set the id yourself. At 12759this point the element does not belong to a twig yet, so the ID attribute 12760is not known so it won't be stored in the ID list. 12761 12762A content of 'C<#EMPTY>' creates an empty element; 12763 12764=item namespace ($optional_prefix) 12765 12766Return the URI of the namespace that C<$optional_prefix> or the element name 12767belongs to. If the name doesn't belong to any namespace, C<undef> is returned. 12768 12769=item local_name 12770 12771Return the local name (without the prefix) for the element 12772 12773=item ns_prefix 12774 12775Return the namespace prefix for the element 12776 12777=item current_ns_prefixes 12778 12779Return a list of namespace prefixes valid for the element. The order of the 12780prefixes in the list has no meaning. If the default namespace is currently 12781bound, '' appears in the list. 12782 12783 12784=item inherit_att ($att, @optional_tag_list) 12785 12786Return the value of an attribute inherited from parent tags. The value 12787returned is found by looking for the attribute in the element then in turn 12788in each of its ancestors. If the C<@optional_tag_list> is supplied only those 12789ancestors whose tag is in the list will be checked. 12790 12791=item all_children_are ($optional_condition) 12792 12793return 1 if all children of the element pass the C<$optional_condition>, 127940 otherwise 12795 12796=item level ($optional_condition) 12797 12798Return the depth of the element in the twig (root is 0). 12799If C<$optional_condition> is given then only ancestors that match the condition are 12800counted. 12801 12802B<WARNING>: in a tree created using the C<twig_roots> option this will not return 12803the level in the document tree, level 0 will be the document root, level 1 12804will be the C<twig_roots> elements. During the parsing (in a C<twig_handler>) 12805you can use the C<depth> method on the twig object to get the real parsing depth. 12806 12807=item in ($potential_parent) 12808 12809Return true if the element is in the potential_parent (C<$potential_parent> is 12810an element) 12811 12812=item in_context ($cond, $optional_level) 12813 12814Return true if the element is included in an element which passes C<$cond> 12815optionally within C<$optional_level> levels. The returned value is the 12816including element. 12817 12818=item pcdata 12819 12820Return the text of a C<PCDATA> element or C<undef> if the element is not 12821C<PCDATA>. 12822 12823=item pcdata_xml_string 12824 12825Return the text of a C<PCDATA> element or undef if the element is not C<PCDATA>. 12826The text is "XML-escaped" ('&' and '<' are replaced by '&' and '<') 12827 12828=item set_pcdata ($text) 12829 12830Set the text of a C<PCDATA> element. This method does not check that the element is 12831indeed a C<PCDATA> so usually you should use C<L<set_text>> instead. 12832 12833=item append_pcdata ($text) 12834 12835Add the text at the end of a C<PCDATA> element. 12836 12837=item is_cdata 12838 12839Return 1 if the element is a C<CDATA> element, returns 0 otherwise. 12840 12841=item is_text 12842 12843Return 1 if the element is a C<CDATA> or C<PCDATA> element, returns 0 otherwise. 12844 12845=item cdata 12846 12847Return the text of a C<CDATA> element or C<undef> if the element is not 12848C<CDATA>. 12849 12850=item cdata_string 12851 12852Return the XML string of a C<CDATA> element, including the opening and 12853closing markers. 12854 12855=item set_cdata ($text) 12856 12857Set the text of a C<CDATA> element. 12858 12859=item append_cdata ($text) 12860 12861Add the text at the end of a C<CDATA> element. 12862 12863=item remove_cdata 12864 12865Turns all C<CDATA> sections in the element into regular C<PCDATA> elements. This is useful 12866when converting XML to HTML, as browsers do not support CDATA sections. 12867 12868=item extra_data 12869 12870Return the extra_data (comments and PI's) attached to an element 12871 12872=item set_extra_data ($extra_data) 12873 12874Set the extra_data (comments and PI's) attached to an element 12875 12876=item append_extra_data ($extra_data) 12877 12878Append extra_data to the existing extra_data before the element (if no 12879previous extra_data exists then it is created) 12880 12881=item set_asis 12882 12883Set a property of the element that causes it to be output without being XML 12884escaped by the print functions: if it contains C<< a < b >> it will be output 12885as such and not as C<< a < b >>. This can be useful to create text elements 12886that will be output as markup. Note that all C<PCDATA> descendants of the 12887element are also marked as having the property (they are the ones that are 12888actually impacted by the change). 12889 12890If the element is a C<CDATA> element it will also be output asis, without the 12891C<CDATA> markers. The same goes for any C<CDATA> descendant of the element 12892 12893=item set_not_asis 12894 12895Unsets the C<asis> property for the element and its text descendants. 12896 12897=item is_asis 12898 12899Return the C<asis> property status of the element ( 1 or C<undef>) 12900 12901=item closed 12902 12903Return true if the element has been closed. Might be useful if you are 12904somewhere in the tree, during the parse, and have no idea whether a parent 12905element is completely loaded or not. 12906 12907=item get_type 12908 12909Return the type of the element: 'C<#ELT>' for "real" elements, or 'C<#PCDATA>', 12910'C<#CDATA>', 'C<#COMMENT>', 'C<#ENT>', 'C<#PI>' 12911 12912=item is_elt 12913 12914Return the tag if the element is a "real" element, or 0 if it is C<PCDATA>, 12915C<CDATA>... 12916 12917=item contains_only_text 12918 12919Return 1 if the element does not contain any other "real" element 12920 12921=item contains_only ($exp) 12922 12923Return the list of children if all children of the element match 12924the expression C<$exp> 12925 12926 if( $para->contains_only( 'tt')) { ... } 12927 12928=item contains_a_single ($exp) 12929 12930If the element contains a single child that matches the expression C<$exp> 12931returns that element. Otherwise returns 0. 12932 12933=item is_field 12934 12935same as C<contains_only_text> 12936 12937=item is_pcdata 12938 12939Return 1 if the element is a C<PCDATA> element, returns 0 otherwise. 12940 12941=item is_ent 12942 12943Return 1 if the element is an entity (an unexpanded entity) element, 12944return 0 otherwise. 12945 12946=item is_empty 12947 12948Return 1 if the element is empty, 0 otherwise 12949 12950=item set_empty 12951 12952Flags the element as empty. No further check is made, so if the element 12953is actually not empty the output will be messed. The only effect of this 12954method is that the output will be C<< <tag att="value""/> >>. 12955 12956=item set_not_empty 12957 12958Flags the element as not empty. if it is actually empty then the element will 12959be output as C<< <tag att="value""></tag> >> 12960 12961=item is_pi 12962 12963Return 1 if the element is a processing instruction (C<#PI>) element, 12964return 0 otherwise. 12965 12966=item target 12967 12968Return the target of a processing instruction 12969 12970=item set_target ($target) 12971 12972Set the target of a processing instruction 12973 12974=item data 12975 12976Return the data part of a processing instruction 12977 12978=item set_data ($data) 12979 12980Set the data of a processing instruction 12981 12982=item set_pi ($target, $data) 12983 12984Set the target and data of a processing instruction 12985 12986=item pi_string 12987 12988Return the string form of a processing instruction 12989(C<< <?target data?> >>) 12990 12991=item is_comment 12992 12993Return 1 if the element is a comment (C<#COMMENT>) element, 12994return 0 otherwise. 12995 12996=item set_comment ($comment_text) 12997 12998Set the text for a comment 12999 13000=item comment 13001 13002Return the content of a comment (just the text, not the C<< <!-- >> 13003and C<< --> >>) 13004 13005=item comment_string 13006 13007Return the XML string for a comment (C<< <!-- comment --> >>) 13008 13009Note that an XML comment cannot start or end with a '-', or include '--' 13010(http://www.w3.org/TR/2008/REC-xml-20081126/#sec-comments), 13011if that is the case (because you have created the comment yourself presumably, 13012as it could not be in the input XML), then a space will be inserted before 13013an initial '-', after a trailing one or between two '-' in the comment 13014(which could presumably mangle javascript "hidden" in an XHTML comment); 13015 13016=item set_ent ($entity) 13017 13018Set an (non-expanded) entity (C<#ENT>). C<$entity>) is the entity 13019text (C<&ent;>) 13020 13021=item ent 13022 13023Return the entity for an entity (C<#ENT>) element (C<&ent;>) 13024 13025=item ent_name 13026 13027Return the entity name for an entity (C<#ENT>) element (C<ent>) 13028 13029=item ent_string 13030 13031Return the entity, either expanded if the expanded version is available, 13032or non-expanded (C<&ent;>) otherwise 13033 13034=item child ($offset, $optional_condition) 13035 13036Return the C<$offset>-th child of the element, optionally the C<$offset>-th 13037child that matches C<$optional_condition>. The children are treated as a list, so 13038C<< $elt->child( 0) >> is the first child, while C<< $elt->child( -1) >> is 13039the last child. 13040 13041=item child_text ($offset, $optional_condition) 13042 13043Return the text of a child or C<undef> if the sibling does not exist. Arguments 13044are the same as child. 13045 13046=item last_child ($optional_condition) 13047 13048Return the last child of the element, or the last child matching 13049C<$optional_condition> (ie the last of the element children matching 13050the condition). 13051 13052=item last_child_text ($optional_condition) 13053 13054Same as C<first_child_text> but for the last child. 13055 13056=item sibling ($offset, $optional_condition) 13057 13058Return the next or previous C<$offset>-th sibling of the element, or the 13059C<$offset>-th one matching C<$optional_condition>. If C<$offset> is negative then a 13060previous sibling is returned, if $offset is positive then a next sibling is 13061returned. C<$offset=0> returns the element if there is no condition or 13062if the element matches the condition>, C<undef> otherwise. 13063 13064=item sibling_text ($offset, $optional_condition) 13065 13066Return the text of a sibling or C<undef> if the sibling does not exist. 13067Arguments are the same as C<sibling>. 13068 13069=item prev_siblings ($optional_condition) 13070 13071Return the list of previous siblings (optionally matching C<$optional_condition>) 13072for the element. The elements are ordered in document order. 13073 13074=item next_siblings ($optional_condition) 13075 13076Return the list of siblings (optionally matching C<$optional_condition>) 13077following the element. The elements are ordered in document order. 13078 13079=item siblings ($optional_condition) 13080 13081Return the list of siblings (optionally matching C<$optional_condition>) 13082of the element (excluding the element itself). The elements are ordered 13083in document order. 13084 13085=item pos ($optional_condition) 13086 13087Return the position of the element in the children list. The first child has a 13088position of 1 (as in XPath). 13089 13090If the C<$optional_condition> is given then only siblings that match the condition 13091are counted. If the element itself does not match the condition then 130920 is returned. 13093 13094=item atts 13095 13096Return a hash ref containing the element attributes 13097 13098=item set_atts ({ att1=>$att1_val, att2=> $att2_val... }) 13099 13100Set the element attributes with the hash ref supplied as the argument. The previous 13101attributes are lost (ie the attributes set by C<set_atts> replace all of the 13102attributes of the element). 13103 13104You can also pass a list instead of a hashref: C<< $elt->set_atts( att1 => 'val1',...) >> 13105 13106=item del_atts 13107 13108Deletes all the element attributes. 13109 13110=item att_nb 13111 13112Return the number of attributes for the element 13113 13114=item has_atts 13115 13116Return true if the element has attributes (in fact return the number of 13117attributes, thus being an alias to C<L<att_nb>> 13118 13119=item has_no_atts 13120 13121Return true if the element has no attributes, false (0) otherwise 13122 13123=item att_names 13124 13125return a list of the attribute names for the element 13126 13127=item att_xml_string ($att, $options) 13128 13129Return the attribute value, where '&', '<' and quote (" or the value of the quote option 13130at twig creation) are XML-escaped. 13131 13132The options are passed as a hashref, setting C<escape_gt> to a true value will also escape 13133'>' ($elt( 'myatt', { escape_gt => 1 }); 13134 13135=item set_id ($id) 13136 13137Set the C<id> attribute of the element to the value. 13138See C<L<elt_id> > to change the id attribute name 13139 13140=item id 13141 13142Gets the id attribute value 13143 13144=item del_id ($id) 13145 13146Deletes the C<id> attribute of the element and remove it from the id list 13147for the document 13148 13149=item class 13150 13151Return the C<class> attribute for the element (methods on the C<class> 13152attribute are quite convenient when dealing with XHTML, or plain XML that 13153will eventually be displayed using CSS) 13154 13155=item lclass 13156 13157same as class, except that 13158this method is an lvalue, so you can do C<< $elt->lclass= "foo" >> 13159 13160=item set_class ($class) 13161 13162Set the C<class> attribute for the element to C<$class> 13163 13164=item add_class ($class) 13165 13166Add C<$class> to the element C<class> attribute: the new class is added 13167only if it is not already present. 13168 13169Note that classes are then sorted alphabetically, so the C<class> attribute 13170can be changed even if the class is already there 13171 13172=item remove_class ($class) 13173 13174Remove C<$class> from the element C<class> attribute. 13175 13176Note that classes are then sorted alphabetically, so the C<class> attribute can be 13177changed even if the class is already there 13178 13179 13180=item add_to_class ($class) 13181 13182alias for add_class 13183 13184=item att_to_class ($att) 13185 13186Set the C<class> attribute to the value of attribute C<$att> 13187 13188=item add_att_to_class ($att) 13189 13190Add the value of attribute C<$att> to the C<class> attribute of the element 13191 13192=item move_att_to_class ($att) 13193 13194Add the value of attribute C<$att> to the C<class> attribute of the element 13195and delete the attribute 13196 13197=item tag_to_class 13198 13199Set the C<class> attribute of the element to the element tag 13200 13201=item add_tag_to_class 13202 13203Add the element tag to its C<class> attribute 13204 13205=item set_tag_class ($new_tag) 13206 13207Add the element tag to its C<class> attribute and sets the tag to C<$new_tag> 13208 13209=item in_class ($class) 13210 13211Return true (C<1>) if the element is in the class C<$class> (if C<$class> is 13212one of the tokens in the element C<class> attribute) 13213 13214=item tag_to_span 13215 13216Change the element tag tp C<span> and set its class to the old tag 13217 13218=item tag_to_div 13219 13220Change the element tag tp C<div> and set its class to the old tag 13221 13222=item DESTROY 13223 13224Frees the element from memory. 13225 13226=item start_tag 13227 13228Return the string for the start tag for the element, including 13229the C<< /> >> at the end of an empty element tag 13230 13231=item end_tag 13232 13233Return the string for the end tag of an element. For an empty 13234element, this returns the empty string (''). 13235 13236=item xml_string @optional_options 13237 13238Equivalent to C<< $elt->sprint( 1) >>, returns the string for the entire 13239element, excluding the element's tags (but nested element tags are present) 13240 13241The 'C<no_recurse>' option will only return the text of the element, not 13242of any included sub-elements (same as C<L<xml_text_only>>). 13243 13244=item inner_xml 13245 13246Another synonym for xml_string 13247 13248=item outer_xml 13249 13250An other synonym for sprint 13251 13252=item xml_text 13253 13254Return the text of the element, encoded (and processed by the current 13255C<L<output_filter>> or C<L<output_encoding>> options, without any tag. 13256 13257=item xml_text_only 13258 13259Same as C<L<xml_text>> except that the text returned doesn't include 13260the text of sub-elements. 13261 13262=item set_pretty_print ($style) 13263 13264Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>', 13265'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>' 13266 13267pretty_print styles: 13268 13269=over 4 13270 13271=item none 13272 13273the default, no C<\n> is used 13274 13275=item nsgmls 13276 13277nsgmls style, with C<\n> added within tags 13278 13279=item nice 13280 13281adds C<\n> wherever possible (NOT SAFE, can lead to invalid XML) 13282 13283=item indented 13284 13285same as C<nice> plus indents elements (NOT SAFE, can lead to invalid XML) 13286 13287=item record 13288 13289table-oriented pretty print, one field per line 13290 13291=item record_c 13292 13293table-oriented pretty print, more compact than C<record>, one record per line 13294 13295=back 13296 13297=item set_empty_tag_style ($style) 13298 13299Set the method to output empty tags, amongst 'C<normal>' (default), 'C<html>', 13300and 'C<expand>', 13301 13302C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space 13303'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs 13304'C<< <tag></tag> >>' 13305 13306=item set_remove_cdata ($flag) 13307 13308set (or unset) the flag that forces the twig to output CDATA sections as 13309regular (escaped) PCDATA 13310 13311 13312=item set_indent ($string) 13313 13314Set the indentation for the indented pretty print style (default is 2 spaces) 13315 13316=item set_quote ($quote) 13317 13318Set the quotes used for attributes. can be 'C<double>' (default) or 'C<single>' 13319 13320=item cmp ($elt) 13321 13322 Compare the order of the 2 elements in a twig. 13323 13324 C<$a> is the <A>..</A> element, C<$b> is the <B>...</B> element 13325 13326 document $a->cmp( $b) 13327 <A> ... </A> ... <B> ... </B> -1 13328 <A> ... <B> ... </B> ... </A> -1 13329 <B> ... </B> ... <A> ... </A> 1 13330 <B> ... <A> ... </A> ... </B> 1 13331 $a == $b 0 13332 $a and $b not in the same tree undef 13333 13334=item before ($elt) 13335 13336Return 1 if C<$elt> starts before the element, 0 otherwise. If the 2 elements 13337are not in the same twig then return C<undef>. 13338 13339 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 13340 13341=item after ($elt) 13342 13343Return 1 if $elt starts after the element, 0 otherwise. If the 2 elements 13344are not in the same twig then return C<undef>. 13345 13346 if( $a->cmp( $b) == -1) { return 1; } else { return 0; } 13347 13348=item other comparison methods 13349 13350=over 4 13351 13352=item lt 13353 13354=item le 13355 13356=item gt 13357 13358=item ge 13359 13360=back 13361 13362=item path 13363 13364Return the element context in a form similar to XPath's short 13365form: 'C</root/tag1/../tag>' 13366 13367=item xpath 13368 13369Return a unique XPath expression that can be used to find the element 13370again. 13371 13372It looks like C</doc/sect[3]/title>: unique elements do not have an index, 13373the others do. 13374 13375=item flush 13376 13377flushes the twig up to the current element (strictly equivalent to 13378C<< $elt->root->flush >>) 13379 13380=item private methods 13381 13382Low-level methods on the twig: 13383 13384=over 4 13385 13386=item set_parent ($parent) 13387 13388=item set_first_child ($first_child) 13389 13390=item set_last_child ($last_child) 13391 13392=item set_prev_sibling ($prev_sibling) 13393 13394=item set_next_sibling ($next_sibling) 13395 13396=item set_twig_current 13397 13398=item del_twig_current 13399 13400=item twig_current 13401 13402=item contains_text 13403 13404=back 13405 13406Those methods should not be used, unless of course you find some creative 13407and interesting, not to mention useful, ways to do it. 13408 13409=back 13410 13411=head2 cond 13412 13413Most of the navigation functions accept a condition as an optional argument 13414The first element (or all elements for C<L<children> > or 13415C<L<ancestors> >) that passes the condition is returned. 13416 13417The condition is a single step of an XPath expression using the XPath subset 13418defined by C<L<get_xpath>>. Additional conditions are: 13419 13420The condition can be 13421 13422=over 4 13423 13424=item #ELT 13425 13426return a "real" element (not a PCDATA, CDATA, comment or pi element) 13427 13428=item #TEXT 13429 13430return a PCDATA or CDATA element 13431 13432=item regular expression 13433 13434return an element whose tag matches the regexp. The regexp has to be created 13435with C<qr//> (hence this is available only on perl 5.005 and above) 13436 13437=item code reference 13438 13439applies the code, passing the current element as argument, if the code returns 13440true then the element is returned, if it returns false then the code is applied 13441to the next candidate. 13442 13443=back 13444 13445=head2 XML::Twig::XPath 13446 13447XML::Twig implements a subset of XPath through the C<L<get_xpath>> method. 13448 13449If you want to use the whole XPath power, then you can use C<XML::Twig::XPath> 13450instead. In this case C<XML::Twig> uses C<XML::XPath> to execute XPath queries. 13451You will of course need C<XML::XPath> installed to be able to use C<XML::Twig::XPath>. 13452 13453See L<XML::XPath> for more information. 13454 13455The methods you can use are: 13456 13457=over 4 13458 13459=item findnodes ($path) 13460 13461return a list of nodes found by C<$path>. 13462 13463=item findnodes_as_string ($path) 13464 13465return the nodes found reproduced as XML. The result is not guaranteed 13466to be valid XML though. 13467 13468=item findvalue ($path) 13469 13470return the concatenation of the text content of the result nodes 13471 13472=back 13473 13474In order for C<XML::XPath> to be used as the XPath engine the following methods 13475are included in C<XML::Twig>: 13476 13477in XML::Twig 13478 13479=over 4 13480 13481=item getRootNode 13482 13483=item getParentNode 13484 13485=item getChildNodes 13486 13487=back 13488 13489in XML::Twig::Elt 13490 13491=over 4 13492 13493=item string_value 13494 13495=item toString 13496 13497=item getName 13498 13499=item getRootNode 13500 13501=item getNextSibling 13502 13503=item getPreviousSibling 13504 13505=item isElementNode 13506 13507=item isTextNode 13508 13509=item isPI 13510 13511=item isPINode 13512 13513=item isProcessingInstructionNode 13514 13515=item isComment 13516 13517=item isCommentNode 13518 13519=item getTarget 13520 13521=item getChildNodes 13522 13523=item getElementById 13524 13525=back 13526 13527=head2 XML::Twig::XPath::Elt 13528 13529The methods you can use are the same as on C<XML::Twig::XPath> elements: 13530 13531=over 4 13532 13533=item findnodes ($path) 13534 13535return a list of nodes found by C<$path>. 13536 13537=item findnodes_as_string ($path) 13538 13539return the nodes found reproduced as XML. The result is not guaranteed 13540to be valid XML though. 13541 13542=item findvalue ($path) 13543 13544return the concatenation of the text content of the result nodes 13545 13546=back 13547 13548 13549=head2 XML::Twig::Entity_list 13550 13551=over 4 13552 13553=item new 13554 13555Create an entity list. 13556 13557=item add ($ent) 13558 13559Add an entity to an entity list. 13560 13561=item add_new_ent ($name, $val, $sysid, $pubid, $ndata, $param) 13562 13563Create a new entity and add it to the entity list 13564 13565=item delete ($ent or $tag). 13566 13567Delete an entity (defined by its name or by the Entity object) 13568from the list. 13569 13570=item print ($optional_filehandle) 13571 13572Print the entity list. 13573 13574=item list 13575 13576Return the list as an array 13577 13578=back 13579 13580 13581=head2 XML::Twig::Entity 13582 13583=over 4 13584 13585=item new ($name, $val, $sysid, $pubid, $ndata, $param) 13586 13587Same arguments as the Entity handler for XML::Parser. 13588 13589=item print ($optional_filehandle) 13590 13591Print an entity declaration. 13592 13593=item name 13594 13595Return the name of the entity 13596 13597=item val 13598 13599Return the value of the entity 13600 13601=item sysid 13602 13603Return the system id for the entity (for NDATA entities) 13604 13605=item pubid 13606 13607Return the public id for the entity (for NDATA entities) 13608 13609=item ndata 13610 13611Return true if the entity is an NDATA entity 13612 13613=item param 13614 13615Return true if the entity is a parameter entity 13616 13617 13618=item text 13619 13620Return the entity declaration text. 13621 13622=back 13623 13624 13625=head1 EXAMPLES 13626 13627Additional examples (and a complete tutorial) can be found on the 13628F<XML::Twig PageL<http://www.xmltwig.org/xmltwig/>> 13629 13630To figure out what flush does call the following script with an 13631XML file and an element name as arguments 13632 13633 use XML::Twig; 13634 13635 my ($file, $elt)= @ARGV; 13636 my $t= XML::Twig->new( twig_handlers => 13637 { $elt => sub {$_[0]->flush; print "\n[flushed here]\n";} }); 13638 $t->parsefile( $file, ErrorContext => 2); 13639 $t->flush; 13640 print "\n"; 13641 13642 13643=head1 NOTES 13644 13645=head2 Subclassing XML::Twig 13646 13647Useful methods: 13648 13649=over 4 13650 13651=item elt_class 13652 13653In order to subclass C<XML::Twig> you will probably need to subclass also 13654C<L<XML::Twig::Elt>>. Use the C<elt_class> option when you create the 13655C<XML::Twig> object to get the elements created in a different class 13656(which should be a subclass of C<XML::Twig::Elt>. 13657 13658=item add_options 13659 13660If you inherit C<XML::Twig> new method but want to add more options to it 13661you can use this method to prevent XML::Twig to issue warnings for those 13662additional options. 13663 13664=back 13665 13666=head2 DTD Handling 13667 13668There are 3 possibilities here. They are: 13669 13670=over 4 13671 13672=item No DTD 13673 13674No doctype, no DTD information, no entity information, the world is simple... 13675 13676=item Internal DTD 13677 13678The XML document includes an internal DTD, and maybe entity declarations. 13679 13680If you use the load_DTD option when creating the twig the DTD information and 13681the entity declarations can be accessed. 13682 13683The DTD and the entity declarations will be C<flush>'ed (or C<print>'ed) either 13684as is (if they have not been modified) or as reconstructed (poorly, comments 13685are lost, order is not kept, due to it's content this DTD should not be viewed 13686by anyone) if they have been modified. You can also modify them directly by 13687changing the C<< $twig->{twig_doctype}->{internal} >> field (straight from 13688XML::Parser, see the C<Doctype> handler doc) 13689 13690=item External DTD 13691 13692The XML document includes a reference to an external DTD, and maybe entity 13693declarations. 13694 13695If you use the C<load_DTD> when creating the twig the DTD information and the 13696entity declarations can be accessed. The entity declarations will be 13697C<flush>'ed (or C<print>'ed) either as is (if they have not been modified) or 13698as reconstructed (badly, comments are lost, order is not kept). 13699 13700You can change the doctype through the C<< $twig->set_doctype >> method and 13701print the dtd through the C<< $twig->dtd_text >> or C<< $twig->dtd_print >> 13702 methods. 13703 13704If you need to modify the entity list this is probably the easiest way to do it. 13705 13706=back 13707 13708 13709=head2 Flush 13710 13711Remember that element handlers are called when the element is CLOSED, so 13712if you have handlers for nested elements the inner handlers will be called 13713first. It makes it for example trickier than it would seem to number nested 13714sections (or clauses, or divs), as the titles in the inner sections are handled 13715before the outer sections. 13716 13717 13718=head1 BUGS 13719 13720=over 4 13721 13722=item segfault during parsing 13723 13724This happens when parsing huge documents, or lots of small ones, with a version 13725of Perl before 5.16. 13726 13727This is due to a bug in the way weak references are handled in Perl itself. 13728 13729The fix is either to upgrade to Perl 5.16 or later (C<perlbrew> is a great 13730tool to manage several installations of perl on the same machine). 13731 13732An other, NOT RECOMMENDED, way of fixing the problem, is to switch off weak 13733references by writing C<XML::Twig::_set_weakrefs( 0);> at the top of the code. 13734This is totally unsupported, and may lead to other problems though, 13735 13736=item entity handling 13737 13738Due to XML::Parser behaviour, non-base entities in attribute values disappear if 13739they are not declared in the document: 13740C<att="val&ent;"> will be turned into C<< att => val >>, unless you use the 13741C<keep_encoding> argument to C<< XML::Twig->new >> 13742 13743=item DTD handling 13744 13745The DTD handling methods are quite bugged. No one uses them and 13746it seems very difficult to get them to work in all cases, including with 13747several slightly incompatible versions of XML::Parser and of libexpat. 13748 13749Basically you can read the DTD, output it back properly, and update entities, 13750but not much more. 13751 13752So use XML::Twig with standalone documents, or with documents referring to an 13753external DTD, but don't expect it to properly parse and even output back the 13754DTD. 13755 13756=item memory leak 13757 13758If you use a REALLY old Perl (5.005!) and 13759a lot of twigs you might find that you leak quite a lot of memory 13760(about 2Ks per twig). You can use the C<L<dispose> > method to free 13761that memory after you are done. 13762 13763If you create elements the same thing might happen, use the C<L<delete>> 13764method to get rid of them. 13765 13766Alternatively installing the C<Scalar::Util> (or C<WeakRef>) module on a version 13767of Perl that supports it (>5.6.0) will get rid of the memory leaks automagically. 13768 13769=item ID list 13770 13771The ID list is NOT updated when elements are cut or deleted. 13772 13773=item change_gi 13774 13775This method will not function properly if you do: 13776 13777 $twig->change_gi( $old1, $new); 13778 $twig->change_gi( $old2, $new); 13779 $twig->change_gi( $new, $even_newer); 13780 13781=item sanity check on XML::Parser method calls 13782 13783XML::Twig should really prevent calls to some XML::Parser methods, especially 13784the C<setHandlers> method. 13785 13786=item pretty printing 13787 13788Pretty printing (at least using the 'C<indented>' style) is hard to get right! 13789Only elements that belong to the document will be properly indented. Printing 13790elements that do not belong to the twig makes it impossible for XML::Twig to 13791figure out their depth, and thus their indentation level. 13792 13793Also there is an unavoidable bug when using C<flush> and pretty printing for 13794elements with mixed content that start with an embedded element: 13795 13796 <elt><b>b</b>toto<b>bold</b></elt> 13797 13798 will be output as 13799 13800 <elt> 13801 <b>b</b>toto<b>bold</b></elt> 13802 13803if you flush the twig when you find the C<< <b> >> element 13804 13805 13806=back 13807 13808=head1 Globals 13809 13810These are the things that can mess up calling code, especially if threaded. 13811They might also cause problem under mod_perl. 13812 13813=over 4 13814 13815=item Exported constants 13816 13817Whether you want them or not you get them! These are subroutines to use 13818as constant when creating or testing elements 13819 13820 PCDATA return '#PCDATA' 13821 CDATA return '#CDATA' 13822 PI return '#PI', I had the choice between PROC and PI :--( 13823 13824=item Module scoped values: constants 13825 13826these should cause no trouble: 13827 13828 %base_ent= ( '>' => '>', 13829 '<' => '<', 13830 '&' => '&', 13831 "'" => ''', 13832 '"' => '"', 13833 ); 13834 CDATA_START = "<![CDATA["; 13835 CDATA_END = "]]>"; 13836 PI_START = "<?"; 13837 PI_END = "?>"; 13838 COMMENT_START = "<!--"; 13839 COMMENT_END = "-->"; 13840 13841pretty print styles 13842 13843 ( $NSGMLS, $NICE, $INDENTED, $INDENTED_C, $WRAPPED, $RECORD1, $RECORD2)= (1..7); 13844 13845empty tag output style 13846 13847 ( $HTML, $EXPAND)= (1..2); 13848 13849=item Module scoped values: might be changed 13850 13851Most of these deal with pretty printing, so the worst that can 13852happen is probably that XML output does not look right, but is 13853still valid and processed identically by XML processors. 13854 13855C<$empty_tag_style> can mess up HTML bowsers though and changing C<$ID> 13856would most likely create problems. 13857 13858 $pretty=0; # pretty print style 13859 $quote='"'; # quote for attributes 13860 $INDENT= ' '; # indent for indented pretty print 13861 $empty_tag_style= 0; # how to display empty tags 13862 $ID # attribute used as an id ('id' by default) 13863 13864=item Module scoped values: definitely changed 13865 13866These 2 variables are used to replace tags by an index, thus 13867saving some space when creating a twig. If they really cause 13868you too much trouble, let me know, it is probably possible to 13869create either a switch or at least a version of XML::Twig that 13870does not perform this optimization. 13871 13872 %gi2index; # tag => index 13873 @index2gi; # list of tags 13874 13875=back 13876 13877If you need to manipulate all those values, you can use the following methods on the 13878XML::Twig object: 13879 13880=over 4 13881 13882=item global_state 13883 13884Return a hashref with all the global variables used by XML::Twig 13885 13886The hash has the following fields: C<pretty>, C<quote>, C<indent>, 13887C<empty_tag_style>, C<keep_encoding>, C<expand_external_entities>, 13888C<output_filter>, C<output_text_filter>, C<keep_atts_order> 13889 13890=item set_global_state ($state) 13891 13892Set the global state, C<$state> is a hashref 13893 13894=item save_global_state 13895 13896Save the current global state 13897 13898=item restore_global_state 13899 13900Restore the previously saved (using C<Lsave_global_state>> state 13901 13902=back 13903 13904=head1 TODO 13905 13906=over 4 13907 13908=item SAX handlers 13909 13910Allowing XML::Twig to work on top of any SAX parser 13911 13912=item multiple twigs are not well supported 13913 13914A number of twig features are just global at the moment. These include 13915the ID list and the "tag pool" (if you use C<change_gi> then you change the tag 13916for ALL twigs). 13917 13918A future version will try to support this while trying not to be to 13919hard on performance (at least when a single twig is used!). 13920 13921=back 13922 13923=head1 AUTHOR 13924 13925Michel Rodriguez <mirod@cpan.org> 13926 13927=head1 LICENSE 13928 13929This library is free software; you can redistribute it and/or modify 13930it under the same terms as Perl itself. 13931 13932Bug reports should be sent using: 13933F<RT L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>> 13934 13935Comments can be sent to mirod@cpan.org 13936 13937The XML::Twig page is at L<http://www.xmltwig.org/xmltwig/> 13938It includes the development version of the module, a slightly better version 13939of the documentation, examples, a tutorial and a: 13940F<Processing XML efficiently with Perl and XML::Twig: 13941L<http://www.xmltwig.org/xmltwig/tutorial/index.html>> 13942 13943=head1 SEE ALSO 13944 13945Complete docs, including a tutorial, examples, an easier to use HTML version of 13946the docs, a quick reference card and a FAQ are available at 13947L<http://www.xmltwig.org/xmltwig/> 13948 13949git repository at L<http://github.com/mirod/xmltwig> 13950 13951L<XML::Parser>, L<XML::Parser::Expat>, L<XML::XPath>, L<Encode>, 13952L<Text::Iconv>, L<Scalar::Utils> 13953 13954 13955=head2 Alternative Modules 13956 13957XML::Twig is not the only XML::Processing module available on CPAN (far from 13958it!). 13959 13960The main alternative I would recommend is L<XML::LibXML>. 13961 13962Here is a quick comparison of the 2 modules: 13963 13964XML::LibXML, actually C<libxml2> on which it is based, sticks to the standards, 13965and implements a good number of them in a rather strict way: XML, XPath, DOM, 13966RelaxNG, I must be forgetting a couple (XInclude?). It is fast and rather 13967frugal memory-wise. 13968 13969XML::Twig is older: when I started writing it XML::Parser/expat was the only 13970game in town. It implements XML and that's about it (plus a subset of XPath, 13971and you can use XML::Twig::XPath if you have XML::XPathEngine installed for full 13972support). It is slower and requires more memory for a full tree than 13973XML::LibXML. On the plus side (yes, there is a plus side!) it lets you process 13974a big document in chunks, and thus let you tackle documents that couldn't be 13975loaded in memory by XML::LibXML, and it offers a lot (and I mean a LOT!) of 13976higher-level methods, for everything, from adding structure to "low-level" XML, 13977to shortcuts for XHTML conversions and more. It also DWIMs quite a bit, getting 13978comments and non-significant whitespaces out of the way but preserving them in 13979the output for example. As it does not stick to the DOM, is also usually leads 13980to shorter code than in XML::LibXML. 13981 13982Beyond the pure features of the 2 modules, XML::LibXML seems to be preferred by 13983"XML-purists", while XML::Twig seems to be more used by Perl Hackers who have 13984to deal with XML. As you have noted, XML::Twig also comes with quite a lot of 13985docs, but I am sure if you ask for help about XML::LibXML here or on Perlmonks 13986you will get answers. 13987 13988Note that it is actually quite hard for me to compare the 2 modules: on one hand 13989I know XML::Twig inside-out and I can get it to do pretty much anything I need 13990to (or I improve it ;--), while I have a very basic knowledge of XML::LibXML. 13991So feature-wise, I'd rather use XML::Twig ;--). On the other hand, I am 13992painfully aware of some of the deficiencies, potential bugs and plain ugly code 13993that lurk in XML::Twig, even though you are unlikely to be affected by them 13994(unless for example you need to change the DTD of a document programmatically), 13995while I haven't looked much into XML::LibXML so it still looks shinny and clean 13996to me. 13997 13998That said, if you need to process a document that is too big to fit memory 13999and XML::Twig is too slow for you, my reluctant advice would be to use "bare" 14000XML::Parser. It won't be as easy to use as XML::Twig: basically with XML::Twig 14001you trade some speed (depending on what you do from a factor 3 to... none) 14002for ease-of-use, but it will be easier IMHO than using SAX (albeit not 14003standard), and at this point a LOT faster (see the last test in 14004L<http://www.xmltwig.org/article/simple_benchmark/>). 14005 14006=cut 14007 14008 14009