1use strict;
2use warnings; # > perl 5.5
3
4# This is created in the caller's space
5# I realize (now!) that it's not clean, but it's been there for 10+ years...
6BEGIN
7{ sub ::PCDATA { '#PCDATA' }  ## no critic (Subroutines::ProhibitNestedSubs);
8  sub ::CDATA  { '#CDATA'  }  ## no critic (Subroutines::ProhibitNestedSubs);
9}
10
11use UNIVERSAL();
12
13## if a sub returns a scalar, it better not bloody disappear in list context
14## no critic (Subroutines::ProhibitExplicitReturnUndef);
15
16my $perl_version;
17my $parser_version;
18
19######################################################################
20package XML::Twig;
21######################################################################
22
23require 5.004;
24
25use utf8; # > perl 5.5
26
27use vars qw($VERSION @ISA %valid_option);
28
29use Carp;
30use File::Spec;
31use File::Basename;
32
33use Config; # to get perl's path name in case we need to know if perlio is available
34
35*isa= *UNIVERSAL::isa;
36
37# flag, set to true if the weaken sub is available
38use vars qw( $weakrefs);
39
40# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs
41# wrt doctype handling. This is global for performance reasons.
42my $expat_1_95_2=0;
43
44# a slight non-xml mod: # is allowed as a first character
45my $REG_TAG_FIRST_LETTER;
46#$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])};  # < perl 5.6 - does not work for leading non-ascii letters
47$REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6
48
49my $REG_TAG_LETTER= q{(?:[\w_.-]*)};
50
51# a simple name (no colon)
52my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)};
53
54# a tag name, possibly including namespace
55my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)};
56
57# tag name (leading # allowed)
58# first line is for perl 5.005, second line for modern perl, that accept character classes
59my $REG_TAG_NAME=$REG_NAME;
60
61# name or wildcard (* or '') (leading # allowed)
62my $REG_NAME_W = qq{(?:$REG_NAME|[*])};
63
64# class and ids are deliberately permissive
65my $REG_NTOKEN_FIRST_LETTER;
66#$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])};  # < perl 5.6 - does not work for leading non-ascii letters
67$REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6
68
69my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)};
70
71my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)};
72my $REG_CLASS = $REG_NTOKEN;
73my $REG_ID    = $REG_NTOKEN;
74
75# allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id>
76my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)};
77
78my $REG_REGEXP     = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)};               # regexp
79my $REG_MATCH      = q{[!=]~};                                        # match (or not)
80my $REG_STRING     = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')};      # string (simple or double quoted)
81my $REG_NUMBER     = q{(?:\d+(?:\.\d*)?|\.\d+)};                      # number
82my $REG_VALUE      = qq{(?:$REG_STRING|$REG_NUMBER)};                 # value
83my $REG_OP         = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=};          # op
84my $REG_FUNCTION   = q{(?:string|text)\(\s*\)};
85my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
86my $REG_COMP       = q{(?:>=|<=|!=|<|>|=)};
87
88my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))};
89
90# keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones
91my $ST_TAG = '##tag';
92my $ST_ELT = '##elt';
93my $ST_NS  = '##ns' ;
94
95# used in the handler trigger code
96my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)};
97my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]};
98
99# not all axis, only supported ones (in get_xpath)
100my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
101                      'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
102                    );
103my $REG_AXIS       = "(?:" . join( '|', @supported_axis) .")";
104
105# only used in the "xpath"engine (for get_xpath/findnodes) for now
106my $REG_PREDICATE_ALT  = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
107
108# used to convert XPath tests on strings to the perl equivalent
109my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
110
111my( $FB_HTMLCREF, $FB_XMLCREF);
112
113my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0';
114
115# default namespaces, both ways
116my %DEFAULT_NS= ( xml   => "http://www.w3.org/XML/1998/namespace",
117                  xmlns => "http://www.w3.org/2000/xmlns/",
118                );
119my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS;
120
121# constants
122my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $NOTATION, $TEXT, $ASIS, $EMPTY, $BUFSIZE);
123
124# used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one
125# this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't
126# the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration
127my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN"  => "http://www.w3.org/TR/REC-html40/loose.dtd",
128                 "-//W3C//DTD HTML 4.01//EN"              => "http://www.w3.org/TR/html4/strict.dtd",
129                 "-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd",
130                 "-//W3C//DTD HTML 4.01 Frameset//EN"     => "http://www.w3.org/TR/html4/frameset.dtd",
131                 "-//W3C//DTD XHTML 1.0 Strict//EN"       => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
132                 "-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
133                 "-//W3C//DTD XHTML 1.0 Frameset//EN"     => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
134                 "-//W3C//DTD XHTML 1.1//EN"              => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
135                 "-//W3C//DTD XHTML Basic 1.0//EN"        => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd",
136                 "-//W3C//DTD XHTML Basic 1.1//EN"        => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd",
137                 "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"  => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd",
138                 "-//WAPFORUM//DTD XHTML Mobile 1.1//EN"  => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd",
139                 "-//WAPFORUM//DTD XHTML Mobile 1.2//EN"  => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd",
140                 "-//W3C//DTD XHTML+RDFa 1.0//EN"         => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd",
141               );
142
143my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN";
144
145my $SEP= qr/\s*(?:$|\|)/;
146
147BEGIN
148{
149$VERSION = '3.52';
150
151use XML::Parser;
152my $needVersion = '2.23';
153($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn
154croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
155
156($perl_version= $])=~ s{_\d+}{};
157
158if( $perl_version >= 5.008)
159  { eval "use Encode qw( :all)"; ## no critic ProhibitStringyEval
160    $FB_XMLCREF  = 0x0400; # Encode::FB_XMLCREF;
161    $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
162  }
163
164# test whether we can use weak references
165# set local empty signal handler to trap error messages
166{ local $SIG{__DIE__};
167  if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken))
168    { import Scalar::Util( 'weaken'); $weakrefs= 1; }
169  elsif( eval( 'require WeakRef'))
170    { import WeakRef; $weakrefs= 1;                 }
171  else
172    { $weakrefs= 0;                                 }
173}
174
175import XML::Twig::Elt;
176import XML::Twig::Entity;
177import XML::Twig::Entity_list;
178
179# used to store the gi's
180# should be set for each twig really, at least when there are several
181# the init ensures that special gi's are always the same
182
183# constants: element types
184$PCDATA    = '#PCDATA';
185$CDATA     = '#CDATA';
186$PI        = '#PI';
187$COMMENT   = '#COMMENT';
188$ENT       = '#ENT';
189$NOTATION  = '#NOTATION';
190
191# element classes
192$ELT     = '#ELT';
193$TEXT    = '#TEXT';
194
195# element properties
196$ASIS    = '#ASIS';
197$EMPTY   = '#EMPTY';
198
199# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
200$BUFSIZE = 32768;
201
202
203# gi => index
204%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5);
205# list of gi's
206@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT);
207
208# gi's under this value are special
209$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
210
211%XML::Twig::base_ent= ( '>' => '&gt;', '<' => '&lt;', '&' => '&amp;', "'" => '&apos;', '"' => '&quot;',);
212foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); }
213
214# now set some aliases
215*find_nodes           = *get_xpath;               # same as XML::XPath
216*findnodes            = *get_xpath;               # same as XML::LibXML
217*getElementsByTagName = *descendants;
218*descendants_or_self  = *descendants;             # valid in XML::Twig, not in XML::Twig::Elt
219*find_by_tag_name     = *descendants;
220*getElementById       = *elt_id;
221*getEltById           = *elt_id;
222*toString             = *sprint;
223*create_accessors     = *att_accessors;
224
225}
226
227@ISA = qw(XML::Parser);
228
229# fake gi's used in twig_handlers and start_tag_handlers
230my $ALL    = '_all_';     # the associated function is always called
231my $DEFAULT= '_default_'; # the function is called if no other handler has been
232
233# some defaults
234my $COMMENTS_DEFAULT= 'keep';
235my $PI_DEFAULT      = 'keep';
236
237
238# handlers used in regular mode
239my %twig_handlers=( Start      => \&_twig_start,
240                    End        => \&_twig_end,
241                    Char       => \&_twig_char,
242                    Entity     => \&_twig_entity,
243                    Notation   => \&_twig_notation,
244                    XMLDecl    => \&_twig_xmldecl,
245                    Doctype    => \&_twig_doctype,
246                    Element    => \&_twig_element,
247                    Attlist    => \&_twig_attlist,
248                    CdataStart => \&_twig_cdatastart,
249                    CdataEnd   => \&_twig_cdataend,
250                    Proc       => \&_twig_pi,
251                    Comment    => \&_twig_comment,
252                    Default    => \&_twig_default,
253                    ExternEnt  => \&_twig_extern_ent,
254      );
255
256# handlers used when twig_roots is used and we are outside of the roots
257my %twig_handlers_roots=
258  ( Start      => \&_twig_start_check_roots,
259    End        => \&_twig_end_check_roots,
260    Doctype    => \&_twig_doctype,
261    Char       => undef, Entity     => undef, XMLDecl    => \&_twig_xmldecl,
262    Element    => undef, Attlist    => undef, CdataStart => undef,
263    CdataEnd   => undef, Proc       => undef, Comment    => undef,
264    Proc       => \&_twig_pi_check_roots,
265    Default    =>  sub {}, # hack needed for XML::Parser 2.27
266    ExternEnt  => \&_twig_extern_ent,
267  );
268
269# handlers used when twig_roots and print_outside_roots are used and we are
270# outside of the roots
271my %twig_handlers_roots_print_2_30=
272  ( Start      => \&_twig_start_check_roots,
273    End        => \&_twig_end_check_roots,
274    Char       => \&_twig_print,
275    Entity     => \&_twig_print_entity,
276    ExternEnt  => \&_twig_print_entity,
277    DoctypeFin => \&_twig_doctype_fin_print,
278    XMLDecl    => sub { _twig_xmldecl( @_); _twig_print( @_); },
279    Doctype   =>  \&_twig_print_doctype, # because recognized_string is broken here
280    # Element    => \&_twig_print, Attlist    => \&_twig_print,
281    CdataStart => \&_twig_print, CdataEnd   => \&_twig_print,
282    Proc       => \&_twig_pi_check_roots, Comment    => \&_twig_print,
283    Default    => \&_twig_print_check_doctype,
284    ExternEnt  => \&_twig_extern_ent,
285  );
286
287# handlers used when twig_roots, print_outside_roots and keep_encoding are used
288# and we are outside of the roots
289my %twig_handlers_roots_print_original_2_30=
290  ( Start      => \&_twig_start_check_roots,
291    End        => \&_twig_end_check_roots,
292    Char       => \&_twig_print_original,
293    # I have no idea why I should not be using this handler!
294    Entity     => \&_twig_print_entity,
295    ExternEnt  => \&_twig_print_entity,
296    DoctypeFin => \&_twig_doctype_fin_print,
297    XMLDecl    => sub { _twig_xmldecl( @_); _twig_print_original( @_) },
298    Doctype    => \&_twig_print_original_doctype,  # because original_string is broken here
299    Element    => \&_twig_print_original, Attlist   => \&_twig_print_original,
300    CdataStart => \&_twig_print_original, CdataEnd  => \&_twig_print_original,
301    Proc       => \&_twig_pi_check_roots, Comment   => \&_twig_print_original,
302    Default    => \&_twig_print_original_check_doctype,
303  );
304
305# handlers used when twig_roots and print_outside_roots are used and we are
306# outside of the roots
307my %twig_handlers_roots_print_2_27=
308  ( Start      => \&_twig_start_check_roots,
309    End        => \&_twig_end_check_roots,
310    Char       => \&_twig_print,
311    # if the Entity handler is set then it prints the entity declaration
312    # before the entire internal subset (including the declaration!) is output
313    Entity     => sub {},
314    XMLDecl    => \&_twig_print, Doctype    => \&_twig_print,
315    CdataStart => \&_twig_print, CdataEnd   => \&_twig_print,
316    Proc       => \&_twig_pi_check_roots, Comment    => \&_twig_print,
317    Default    => \&_twig_print,
318    ExternEnt  => \&_twig_extern_ent,
319  );
320
321# handlers used when twig_roots, print_outside_roots and keep_encoding are used
322# and we are outside of the roots
323my %twig_handlers_roots_print_original_2_27=
324  ( Start      => \&_twig_start_check_roots,
325    End        => \&_twig_end_check_roots,
326    Char       => \&_twig_print_original,
327    # for some reason original_string is wrong here
328    # this can be a problem if the doctype includes non ascii characters
329    XMLDecl    => \&_twig_print, Doctype    => \&_twig_print,
330    # if the Entity handler is set then it prints the entity declaration
331    # before the entire internal subset (including the declaration!) is output
332    Entity     => sub {},
333    #Element    => undef, Attlist   => undef,
334    CdataStart => \&_twig_print_original, CdataEnd  => \&_twig_print_original,
335    Proc       => \&_twig_pi_check_roots, Comment   => \&_twig_print_original,
336    Default    => \&_twig_print, #  _twig_print_original does not work
337    ExternEnt  => \&_twig_extern_ent,
338  );
339
340
341my %twig_handlers_roots_print= $parser_version > 2.27
342                               ? %twig_handlers_roots_print_2_30
343                               : %twig_handlers_roots_print_2_27;
344my %twig_handlers_roots_print_original= $parser_version > 2.27
345                               ? %twig_handlers_roots_print_original_2_30
346                               : %twig_handlers_roots_print_original_2_27;
347
348
349# handlers used when the finish_print method has been called
350my %twig_handlers_finish_print=
351  ( Start      => \&_twig_print,
352    End        => \&_twig_print, Char       => \&_twig_print,
353    Entity     => \&_twig_print, XMLDecl    => \&_twig_print,
354    Doctype    => \&_twig_print, Element    => \&_twig_print,
355    Attlist    => \&_twig_print, CdataStart => \&_twig_print,
356    CdataEnd   => \&_twig_print, Proc       => \&_twig_print,
357    Comment    => \&_twig_print, Default    => \&_twig_print,
358    ExternEnt  => \&_twig_extern_ent,
359  );
360
361# handlers used when the finish_print method has been called and the keep_encoding
362# option is used
363my %twig_handlers_finish_print_original=
364  ( Start      => \&_twig_print_original, End      => \&_twig_print_end_original,
365    Char       => \&_twig_print_original, Entity   => \&_twig_print_original,
366    XMLDecl    => \&_twig_print_original, Doctype  => \&_twig_print_original,
367    Element    => \&_twig_print_original, Attlist  => \&_twig_print_original,
368    CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
369    Proc       => \&_twig_print_original, Comment  => \&_twig_print_original,
370    Default    => \&_twig_print_original,
371  );
372
373# handlers used within ignored elements
374my %twig_handlers_ignore=
375  ( Start      => \&_twig_ignore_start,
376    End        => \&_twig_ignore_end,
377    Char       => undef, Entity     => undef, XMLDecl    => undef,
378    Doctype    => undef, Element    => undef, Attlist    => undef,
379    CdataStart => undef, CdataEnd   => undef, Proc       => undef,
380    Comment    => undef, Default    => undef,
381    ExternEnt  => undef,
382  );
383
384
385# those handlers are only used if the entities are NOT to be expanded
386my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default );
387
388my @saved_default_handler;
389
390my $ID= 'id';  # default value, set by the Id argument
391my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers
392
393# all allowed options
394%valid_option=
395    ( # XML::Twig options
396      TwigHandlers          => 1, Id                    => 1,
397      TwigRoots             => 1, TwigPrintOutsideRoots => 1,
398      StartTagHandlers      => 1, EndTagHandlers        => 1,
399      ForceEndTagHandlersUsage => 1,
400      DoNotChainHandlers    => 1,
401      IgnoreElts            => 1,
402      Index                 => 1,
403      AttAccessors          => 1,
404      EltAccessors          => 1,
405      FieldAccessors        => 1,
406      CharHandler           => 1,
407      TopDownHandlers       => 1,
408      KeepEncoding          => 1, DoNotEscapeAmpInAtts  => 1,
409      ParseStartTag         => 1, KeepAttsOrder         => 1,
410      LoadDTD               => 1, DTDHandler            => 1, DTDBase => 1, NoXxe => 1,
411      DoNotOutputDTD        => 1, NoProlog              => 1,
412      ExpandExternalEnts    => 1,
413      DiscardSpaces         => 1, KeepSpaces            => 1, DiscardAllSpaces => 1,
414      DiscardSpacesIn       => 1, KeepSpacesIn          => 1,
415      PrettyPrint           => 1, EmptyTags             => 1,
416      EscapeGt              => 1,
417      Quote                 => 1,
418      Comments              => 1, Pi                    => 1,
419      OutputFilter          => 1, InputFilter           => 1,
420      OutputTextFilter      => 1,
421      OutputEncoding        => 1,
422      RemoveCdata           => 1,
423      EltClass              => 1,
424      MapXmlns              => 1, KeepOriginalPrefix    => 1,
425      SkipMissingEnts       => 1,
426      # XML::Parser options
427      ErrorContext          => 1, ProtocolEncoding      => 1,
428      Namespaces            => 1, NoExpand              => 1,
429      Stream_Delimiter      => 1, ParseParamEnt         => 1,
430      NoLWP                 => 1, Non_Expat_Options     => 1,
431      Xmlns                 => 1, CssSel                => 1,
432      UseTidy               => 1, TidyOptions           => 1,
433      OutputHtmlDoctype     => 1,
434    );
435
436my $active_twig; # last active twig,for XML::Twig::s
437
438# predefined input and output filters
439use vars qw( %filter);
440%filter= ( html       => \&html_encode,
441           safe       => \&safe_encode,
442           safe_hex   => \&safe_encode_hex,
443         );
444
445
446# trigger types (used to sort them)
447my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3);
448
449sub new
450  { my ($class, %args) = @_;
451    my $handlers;
452
453    # change all nice_perlish_names into nicePerlishNames
454    %args= _normalize_args( %args);
455
456    # check options
457    unless( $args{MoreOptions})
458      { foreach my $arg (keys %args)
459        { carp "invalid option $arg" unless $valid_option{$arg}; }
460      }
461
462    # a twig is really an XML::Parser
463    # my $self= XML::Parser->new(%args);
464    my $self;
465    $self= XML::Parser->new(%args);
466
467    bless $self, $class;
468
469    $self->{_twig_context_stack}= [];
470
471    # allow tag.class selectors in handler triggers
472    $css_sel= $args{CssSel} || 0;
473
474
475    if( exists $args{TwigHandlers})
476      { $handlers= $args{TwigHandlers};
477        $self->setTwigHandlers( $handlers);
478        delete $args{TwigHandlers};
479      }
480
481    # take care of twig-specific arguments
482    if( exists $args{StartTagHandlers})
483      { $self->setStartTagHandlers( $args{StartTagHandlers});
484        delete $args{StartTagHandlers};
485      }
486
487    if( exists $args{DoNotChainHandlers})
488      { $self->{twig_do_not_chain_handlers}=  $args{DoNotChainHandlers}; }
489
490    if( exists $args{IgnoreElts})
491      { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)]
492        if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; }
493        $self->setIgnoreEltsHandlers( $args{IgnoreElts});
494        delete $args{IgnoreElts};
495      }
496
497    if( exists $args{Index})
498      { my $index= $args{Index};
499        # we really want a hash name => path, we turn an array into a hash if necessary
500        if( ref( $index) eq 'ARRAY')
501          { my %index= map { $_ => $_ } @$index;
502            $index= \%index;
503          }
504        while( my( $name, $exp)= each %$index)
505          { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
506      }
507
508    $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
509    if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; }
510    if( exists( $args{EltClass})) { delete $args{EltClass}; }
511
512    if( exists( $args{MapXmlns}))
513      { $self->{twig_map_xmlns}=  $args{MapXmlns};
514        $self->{Namespaces}=1;
515        delete $args{MapXmlns};
516      }
517
518    if( exists( $args{KeepOriginalPrefix}))
519      { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
520        delete $args{KeepOriginalPrefix};
521      }
522
523    $self->{twig_dtd_handler}= $args{DTDHandler};
524    delete $args{DTDHandler};
525
526    if( $args{ExpandExternalEnts})
527      { $self->set_expand_external_entities( 1);
528        $self->{twig_expand_external_ents}= $args{ExpandExternalEnts};
529        $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
530        if( $args{ExpandExternalEnts} == -1)
531          { $self->{twig_extern_ent_nofail}= 1;
532            $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail);
533          }
534        delete $args{LoadDTD};
535        delete $args{ExpandExternalEnts};
536      }
537    else
538      { $self->set_expand_external_entities( 0); }
539
540    if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP'))
541      { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler }
542    elsif( $args{NoXxe})
543      { $self->{twig_ext_ent_handler}=
544          sub { my($xp, $base, $path) = @_; $xp->{ErrorMessage}.= "cannot use entities in document when the no_xxe option is on"; return undef; };
545      }
546    else
547      { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler }
548
549    if( $args{DoNotEscapeAmpInAtts})
550      { $self->set_do_not_escape_amp_in_atts( 1);
551        $self->{twig_do_not_escape_amp_in_atts}=1;
552      }
553    else
554      { $self->set_do_not_escape_amp_in_atts( 0);
555        $self->{twig_do_not_escape_amp_in_atts}=0;
556      }
557
558    # deal with TwigRoots argument, a hash of elements for which
559    # subtrees will be built (and associated handlers)
560
561    if( $args{TwigRoots})
562      { $self->setTwigRoots( $args{TwigRoots});
563        delete $args{TwigRoots};
564      }
565
566    if( $args{EndTagHandlers})
567      { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
568          { croak "you should not use EndTagHandlers without TwigRoots\n",
569                  "if you want to use it anyway, normally because you have ",
570                  "a start_tag_handlers that calls 'ignore' and you want to ",
571                  "call an ent_tag_handlers at the end of the element, then ",
572                  "pass 'force_end_tag_handlers_usage => 1' as an argument ",
573                  "to new";
574          }
575
576        $self->setEndTagHandlers( $args{EndTagHandlers});
577        delete $args{EndTagHandlers};
578      }
579
580    if( $args{TwigPrintOutsideRoots})
581      { croak "cannot use twig_print_outside_roots without twig_roots"
582          unless( $self->{twig_roots});
583        # if the arg is a filehandle then store it
584        if( _is_fh( $args{TwigPrintOutsideRoots}) )
585          { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
586        $self->{twig_default_print}= $args{TwigPrintOutsideRoots};
587      }
588
589    # space policy
590    if( $args{KeepSpaces})
591      { croak "cannot use both keep_spaces and discard_spaces"        if( $args{DiscardSpaces});
592        croak "cannot use both keep_spaces and discard_all_spaces"    if( $args{DiscardAllSpaces});
593        croak "cannot use both keep_spaces and keep_spaces_in"        if( $args{KeepSpacesIn});
594        $self->{twig_keep_spaces}=1;
595        delete $args{KeepSpaces};
596      }
597    if( $args{DiscardSpaces})
598      {
599        croak "cannot use both discard_spaces and keep_spaces_in"     if( $args{KeepSpacesIn});
600        croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
601        croak "cannot use both discard_spaces and discard_spaces_in"  if( $args{DiscardSpacesIn});
602        $self->{twig_discard_spaces}=1;
603        delete $args{DiscardSpaces};
604      }
605    if( $args{KeepSpacesIn})
606      { croak "cannot use both keep_spaces_in and discard_spaces_in"  if( $args{DiscardSpacesIn});
607        croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces});
608        $self->{twig_discard_spaces}=1;
609        $self->{twig_keep_spaces_in}={};
610        my @tags= @{$args{KeepSpacesIn}};
611        foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
612        delete $args{KeepSpacesIn};
613      }
614
615    if( $args{DiscardAllSpaces})
616      {
617        croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
618        $self->{twig_discard_all_spaces}=1;
619        delete $args{DiscardAllSpaces};
620      }
621
622    if( $args{DiscardSpacesIn})
623      { $self->{twig_keep_spaces}=1;
624        $self->{twig_discard_spaces_in}={};
625        my @tags= @{$args{DiscardSpacesIn}};
626        foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
627        delete $args{DiscardSpacesIn};
628      }
629    # discard spaces by default
630    $self->{twig_discard_spaces}= 1 unless(  $self->{twig_keep_spaces});
631
632    $args{Comments}||= $COMMENTS_DEFAULT;
633    if( $args{Comments} eq 'drop')       { $self->{twig_keep_comments}= 0;    }
634    elsif( $args{Comments} eq 'keep')    { $self->{twig_keep_comments}= 1;    }
635    elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
636    else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
637    delete $args{Comments};
638
639    $args{Pi}||= $PI_DEFAULT;
640    if( $args{Pi} eq 'drop')       { $self->{twig_keep_pi}= 0;    }
641    elsif( $args{Pi} eq 'keep')    { $self->{twig_keep_pi}= 1;    }
642    elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
643    else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
644    delete $args{Pi};
645
646    if( $args{KeepEncoding})
647      {
648        # set it in XML::Twig::Elt so print functions know what to do
649        $self->set_keep_encoding( 1);
650        $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
651        delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
652        delete $args{KeepEncoding};
653      }
654    else
655      { $self->set_keep_encoding( 0);
656        if( $args{ParseStartTag})
657          { $self->{parse_start_tag}= $args{ParseStartTag}; }
658        else
659          { delete $self->{parse_start_tag}; }
660        delete $args{ParseStartTag};
661      }
662
663    if( $args{OutputFilter})
664      { $self->set_output_filter( $args{OutputFilter});
665        delete $args{OutputFilter};
666      }
667    else
668      { $self->set_output_filter( 0); }
669
670    if( $args{RemoveCdata})
671      { $self->set_remove_cdata( $args{RemoveCdata});
672        delete $args{RemoveCdata};
673      }
674    else
675      { $self->set_remove_cdata( 0); }
676
677    if( $args{OutputTextFilter})
678      { $self->set_output_text_filter( $args{OutputTextFilter});
679        delete $args{OutputTextFilter};
680      }
681    else
682      { $self->set_output_text_filter( 0); }
683
684    if( $args{KeepAttsOrder})
685      { $self->{keep_atts_order}= $args{KeepAttsOrder};
686        if( _use( 'Tie::IxHash'))
687          { $self->set_keep_atts_order(  $self->{keep_atts_order}); }
688        else
689          { croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
690      }
691    else
692      { $self->set_keep_atts_order( 0); }
693
694
695    if( $args{PrettyPrint})    { $self->set_pretty_print( $args{PrettyPrint}); }
696    if( $args{EscapeGt})       { $self->escape_gt( $args{EscapeGt});           }
697    if( $args{EmptyTags})      { $self->set_empty_tag_style( $args{EmptyTags}) }
698
699    if( exists $args{Id})      { $ID= $args{Id};                     delete $args{ID};             }
700    if( $args{NoProlog})       { $self->{no_prolog}= 1;              delete $args{NoProlog};       }
701    if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1;          delete $args{DoNotOutputDTD}; }
702    if( $args{LoadDTD})        { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD};        }
703    if( $args{CharHandler})    { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
704
705    if( $args{InputFilter})    { $self->set_input_filter(  $args{InputFilter}); delete  $args{InputFilter}; }
706    if( $args{NoExpand})       { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
707    if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
708
709    if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
710
711    if( my $acc_a= $args{AttAccessors})   { $self->att_accessors( @$acc_a);  }
712    if( my $acc_e= $args{EltAccessors})   { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e);   }
713    if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); }
714
715    if( $args{UseTidy}) { $self->{use_tidy}= 1; }
716    $self->{tidy_options}= $args{TidyOptions} || {};
717
718    if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; }
719
720    $self->set_quote( $args{Quote} || 'double');
721
722    # set handlers
723    if( $self->{twig_roots})
724      { if( $self->{twig_default_print})
725          { if( $self->{twig_keep_encoding})
726              { $self->setHandlers( %twig_handlers_roots_print_original); }
727            else
728              { $self->setHandlers( %twig_handlers_roots_print);  }
729          }
730        else
731          { $self->setHandlers( %twig_handlers_roots); }
732      }
733    else
734      { $self->setHandlers( %twig_handlers); }
735
736    # XML::Parser::Expat does not like these handler to be set. So in order to
737    # use the various sets of handlers on XML::Parser or XML::Parser::Expat
738    # objects when needed, these ones have to be set only once, here, at
739    # XML::Parser level
740    $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
741
742    $self->{twig_entity_list}= XML::Twig::Entity_list->new;
743    $self->{twig_notation_list}= XML::Twig::Notation_list->new;
744
745    $self->{twig_id}= $ID;
746    $self->{twig_stored_spaces}='';
747
748    $self->{twig_autoflush}= 1; # auto flush by default
749
750    $self->{twig}= $self;
751    if( $weakrefs) { weaken( $self->{twig}); }
752
753    return $self;
754  }
755
756sub parse
757  {
758    my $t= shift;
759    # if called as a class method, calls nparse, which creates the twig then parses it
760    if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); }
761
762    # requires 5.006 at least (or the ${^UNICODE} causes a problem)                                       # > perl 5.5
763    # trap underlying bug in IO::Handle (see RT #17500)                                                   # > perl 5.5
764    # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe                               # > perl 5.5
765    if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] )               # > perl 5.5
766      { croak   "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n"       # > perl 5.5
767              . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n"  # > perl 5.5
768              . "not to include 'D'";                                                                     # > perl 5.5
769      }                                                                                                   # > perl 5.5
770    $t= eval { $t->SUPER::parse( @_); };
771
772    if(    !$t
773        && $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)}
774        && -f $_[0]
775        && ( ! ref( $_[0]) || ref( $_[0])) ne 'GLOB' # -f works on a filehandle, so this make sure $_[0] is a real file
776      )
777      { croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; }
778    return _checked_parse_result( $t, $@);
779  }
780
781sub parsefile
782  { my $t= shift;
783    if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); }
784    $t= eval { $t->SUPER::parsefile( @_); };
785    return _checked_parse_result( $t, $@);
786  }
787
788sub _checked_parse_result
789  { my( $t, $returned)= @_;
790    if( !$t)
791      { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now})
792          { $t= $returned;
793            delete $t->{twig_finish_now};
794            return $t->_twig_final;
795          }
796        else
797          { _croak( $returned, 0); }
798      }
799
800    $active_twig= $t;
801    return $t;
802  }
803
804sub active_twig { return $active_twig; }
805
806sub finish_now
807  { my $t= shift;
808    $t->{twig_finish_now}=1;
809    # XML::Parser 2.43 changed xpcroak in a way that caused test failures for XML::Twig
810    # the change was reverted in 2.44, but this is here to ensure that tests pass with 2.43
811    if( $XML::Parser::VERSION == 2.43)
812      { no warnings;
813        $t->parser->{twig_error}= $t;
814        *XML::Parser::Expat::xpcroak= sub { die $_[0]->{twig_error}; };
815        die $t;
816      }
817    else
818      { die $t; }
819  }
820
821
822sub parsefile_inplace      { shift->_parse_inplace( parsefile      => @_); }
823sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); }
824
825sub _parse_inplace
826  { my( $t, $method, $file, $suffix)= @_;
827    _use( 'File::Temp') || croak "need File::Temp to use inplace methods\n";
828    _use( 'File::Basename');
829
830
831    my $tmpdir= dirname( $file);
832    my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir);
833    my $original_fh= select $tmpfh;
834
835    # we can only use binmode :utf8 if perl was compiled with useperlio
836    # might be a problem if keep_encoding used but the file is already in utf8
837    if( $perl_version > 5.006 && !$t->{twig_keep_encoding} && _use_perlio()) {  binmode( $tmpfh, ":utf8" ); }
838
839    $t->$method( $file);
840
841    select $original_fh;
842    close $tmpfh;
843    my $mode= (stat( $file))[2] & oct(7777);
844    chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!";
845
846    if( $suffix)
847      { my $backup;
848        if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; }
849        else                 { $backup= $file . $suffix; }
850
851        rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!";
852      }
853    rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!";
854
855    return $t;
856  }
857
858
859sub parseurl
860  { my $t= shift;
861    $t->_parseurl( 0, @_);
862  }
863
864sub safe_parseurl
865  { my $t= shift;
866    $t->_parseurl( 1, @_);
867  }
868
869sub safe_parsefile_html
870  { my $t= shift;
871    eval { $t->parsefile_html( @_); };
872    return $@ ? $t->_reset_twig_after_error : $t;
873  }
874
875sub safe_parseurl_html
876  { my $t= shift;
877    _use( 'LWP::Simple') or croak "missing LWP::Simple";
878    eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ;
879    return $@ ? $t->_reset_twig_after_error : $t;
880  }
881
882sub parseurl_html
883  { my $t= shift;
884    _use( 'LWP::Simple') or croak "missing LWP::Simple";
885    $t->parse_html( LWP::Simple::get( shift()), @_);
886  }
887
888
889# uses eval to catch the parser's death
890sub safe_parse_html
891  { my $t= shift;
892    eval { $t->parse_html( @_); } ;
893    return $@ ? $t->_reset_twig_after_error : $t;
894  }
895
896sub parsefile_html
897  { my $t= shift;
898    my $file= shift;
899    my $indent= $t->{ErrorContext} ? 1 : 0;
900    $t->set_empty_tag_style( 'html');
901    my $html2xml=  $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
902    my $options= $t->{use_tidy} ? $t->{tidy_options} || {} :  { indent => $indent, html_doctype => $t->{html_doctype} };
903    $t->parse( $html2xml->( _slurp( $file), $options), @_);
904    return $t;
905  }
906
907sub parse_html
908  { my $t= shift;
909    my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {};
910    my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy};
911    my $content= shift;
912    my $indent= $t->{ErrorContext} ? 1 : 0;
913    $t->set_empty_tag_style( 'html');
914    my $html2xml=  $use_tidy ? \&_tidy_html : \&_html2xml;
915    my $conv_options= $use_tidy ? $t->{tidy_options} || {} :  { indent => $indent, html_doctype => $t->{html_doctype} };
916    $t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_);
917    return $t;
918  }
919
920sub xparse
921  { my $t= shift;
922    my $to_parse= $_[0];
923    if( isa( $to_parse, 'GLOB'))             { $t->parse( @_);                 }
924    elsif( $to_parse=~ m{^\s*<})             { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_)
925                                                                     : $t->parse( @_);
926                                             }
927    elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
928                                               $t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_);
929                                             }
930    elsif( $to_parse=~ m{^\w+://})           { _use( 'LWP::Simple') or croak "missing LWP::Simple";
931                                               my $doc= LWP::Simple::get( shift);
932                                               if( ! defined $doc) { $doc=''; }
933                                               my $xml_parse_ok= $t->safe_parse( $doc, @_);
934                                               if( $xml_parse_ok)
935                                                 { return $xml_parse_ok; }
936                                               else
937                                                 { my $diag= $@;
938                                                   if( $doc=~ m{<html}i)
939                                                     { $t->parse_html( $doc, @_); }
940                                                    else
941                                                      { croak $diag; }
942                                                 }
943                                             }
944    elsif( $to_parse=~ m{\.html?$})          { my $content= _slurp( shift);
945                                               $t->_parse_as_xml_or_html( $content, @_);
946                                             }
947    else                                     { $t->parsefile( @_);             }
948  }
949
950sub _parse_as_xml_or_html
951  { my $t= shift;
952    if( _is_well_formed_xml( $_[0]))
953      { $t->parse( @_) }
954    else
955      { my $html2xml=  $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
956        my $options= $t->{use_tidy} ? $t->{tidy_options} || {} :  { indent => 0, html_doctype => $t->{html_doctype} };
957        my $html= $html2xml->( $_[0], $options, @_);
958        if( _is_well_formed_xml( $html))
959          { $t->parse( $html); }
960        else
961          { croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions
962      }
963  }
964
965{ my $parser;
966  sub _is_well_formed_xml
967    { $parser ||= XML::Parser->new;
968      eval { $parser->parse( $_[0]); };
969      return $@ ? 0 : 1;
970    }
971}
972
973sub nparse
974  { my $class= shift;
975    my $to_parse= pop;
976    $class->new( @_)->xparse( $to_parse);
977  }
978
979sub nparse_pp   { shift()->nparse( pretty_print => 'indented', @_); }
980sub nparse_e    { shift()->nparse( error_context => 1,         @_); }
981sub nparse_ppe  { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); }
982
983
984sub _html2xml
985  { my( $html, $options)= @_;
986    _use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n";
987    my $tree= HTML::TreeBuilder->new;
988    $tree->ignore_ignorable_whitespace( 0);
989    $tree->ignore_unknown( 0);
990    $tree->no_space_compacting( 1);
991    $tree->store_comments( 1);
992    $tree->store_pis(1);
993    $tree->parse( $html);
994    $tree->eof;
995
996    my $xml='';
997    if( $options->{html_doctype} && exists $tree->{_decl} )
998      { my $decl= $tree->{_decl}->as_XML;
999
1000        # first try to fix declarations that are missing the SYSTEM part
1001        $decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >}
1002                  { my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE};
1003                    qq{<!DOCTYPE $1 PUBLIC "$2" "$system">}
1004
1005                  }xe;
1006
1007        # then check that the declaration looks OK (so it parses), if not remove it,
1008        # better to parse without the declaration than to die stupidly
1009        if(    $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM
1010            || $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x                             # just SYSTEM
1011          )
1012          { $xml= $decl; }
1013      }
1014
1015    $xml.= _as_XML( $tree);
1016
1017
1018    _fix_xml( $tree, \$xml);
1019
1020    if( $options->{indent}) { _indent_xhtml( \$xml); }
1021    $tree->delete;
1022    $xml=~ s{\s+$}{}s; # trim end
1023    return $xml;
1024  }
1025
1026sub _tidy_html
1027  { my( $html, $options)= @_;
1028   _use( 'HTML::Tidy') or croak "cannot cleanup HTML using HTML::Tidy (required by the use_tidy option): $@\n"; ;
1029    my $TIDY_DEFAULTS= { output_xhtml => 1, # duh!
1030                         tidy_mark => 0,    # do not add the "generated by tidy" comment
1031                         numeric_entities => 1,
1032                         char_encoding =>  'utf8',
1033                         bare => 1,
1034                         clean => 1,
1035                         doctype => 'transitional',
1036                         fix_backslash => 1,
1037                         merge_divs => 0,
1038                         merge_spans => 0,
1039                         sort_attributes => 'alpha',
1040                         indent => 0,
1041                         wrap => 0,
1042                         break_before_br => 0,
1043                       };
1044    $options ||= {};
1045    my $tidy_options= { %$TIDY_DEFAULTS, %$options};
1046    my $tidy = HTML::Tidy->new( $tidy_options);
1047    $tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean
1048    my $xml= $tidy->clean( $html );
1049    return $xml;
1050  }
1051
1052
1053{ my %xml_parser_encoding;
1054  sub _fix_xml
1055    { my( $tree, $xml)= @_; # $xml is a ref to the xml string
1056
1057      my $max_tries=5;
1058      my $add_decl;
1059
1060      while( ! _check_xml( $xml) && $max_tries--)
1061        {
1062          # a couple of fixes for weird HTML::TreeBuilder errors
1063          if( $@=~ m{^\s*xml (or text )?declaration not at start of (external )?entity}i)
1064            { $$xml=~ s{<\?xml.*?\?>}{}g;
1065              #warn " fixed xml declaration in the wrong place\n";
1066            }
1067          elsif( $@=~ m{undefined entity})
1068            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1069              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1070              $$xml=~ s{&(\w+);}{ my $ent= $1; if( $ent !~ m{^(amp|lt|gt|apos|quote)$}) { "&amp;$ent;" } }eg;
1071            }
1072          elsif( $@=~ m{&Amp; used in html})
1073            # if $Amp; is used instead of &amp; then HTML::TreeBuilder's as_xml is tripped (old version)
1074            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1075            }
1076          elsif( $@=~ m{^\s*not well-formed \(invalid token\)})
1077            { if( $HTML::TreeBuilder::VERSION < 4.00)
1078                { $$xml=~ s{&(amp;)?Amp;}{&amp;}g;
1079                  $$xml=~  s{(<[^>]* )(\d+=)"}{$1a$2"}g; # <table 1> comes out as <table 1="1">, "fix the attribute
1080                }
1081              my $q= '<img "="&#34;" '; # extracted so vim doesn't get confused
1082              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1083              if( $$xml=~ m{$q})
1084                { $$xml=~ s{$q}{<img }g; # happens with <img src="foo.png"" ...
1085                }
1086              else
1087                { my $encoding= _encoding_from_meta( $tree);
1088                  unless( keys %xml_parser_encoding) { %xml_parser_encoding= _xml_parser_encodings(); }
1089
1090                  if( ! $add_decl)
1091                    { if( $xml_parser_encoding{$encoding})
1092                        { $add_decl=1; }
1093                      elsif( $encoding eq 'euc-jp' && $xml_parser_encoding{'x-euc-jp-jisx0221'})
1094                        { $encoding="x-euc-jp-jisx0221"; $add_decl=1;}
1095                      elsif( $encoding eq 'shift-jis' && $xml_parser_encoding{'x-sjis-jisx0221'})
1096                        { $encoding="x-sjis-jisx0221";   $add_decl=1;}
1097
1098                      if( $add_decl)
1099                        { $$xml=~ s{^(<\?xml.*?\?>)?}{<?xml version="1.0" encoding="$encoding"?>}s;
1100                          #warn "  added decl (encoding $encoding)\n";
1101                        }
1102                      else
1103                        { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1104                          #warn "  converting to utf8 from $encoding\n";
1105                          $$xml= _to_utf8( $encoding, $$xml);
1106                        }
1107                    }
1108                  else
1109                    { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1110                      #warn "  converting to utf8 from $encoding\n";
1111                      $$xml= _to_utf8( $encoding, $$xml);
1112                    }
1113                }
1114            }
1115        }
1116
1117      # some versions of HTML::TreeBuilder escape CDATA sections
1118      $$xml=~ s{(&lt;!\[CDATA\[.*?\]\]&gt;)}{_unescape_cdata( $1)}eg;
1119
1120  }
1121
1122  sub _xml_parser_encodings
1123    { my @encodings=( 'iso-8859-1'); # this one is included by default, there is no map for it in @INC
1124      foreach my $inc (@INC)
1125        { push @encodings, map { basename( $_, '.enc') } glob( File::Spec->catdir( $inc => XML => Parser => Encodings => '*.enc')); }
1126      return map { $_ => 1 } @encodings;
1127    }
1128}
1129
1130
1131sub _unescape_cdata
1132  { my( $cdata)= @_;
1133    $cdata=~s{&lt;}{<}g;
1134    $cdata=~s{&gt;}{>}g;
1135    $cdata=~s{&amp;}{&}g;
1136    return $cdata;
1137  }
1138
1139sub _as_XML {
1140
1141    # fork of HTML::Element::as_XML, which is a little too buggy and inconsistent between versions for my liking
1142    my ($elt) = @_;
1143    my $xml= '';
1144    my $empty_element_map = $elt->_empty_element_map;
1145
1146    my ( $tag, $node, $start );    # per-iteration scratch
1147    $elt->traverse(
1148        sub {
1149            ( $node, $start ) = @_;
1150            if ( ref $node )
1151              { # it's an element
1152                $tag = $node->{'_tag'};
1153                if ($start)
1154                  { # on the way in
1155                    foreach my $att ( grep { ! m{^(_|/$)} } keys %$node )
1156                       { # fix attribute names instead of dying
1157                         my $new_att= $att;
1158                         if( $att=~ m{^\d}) { $new_att= "a$att"; }
1159                         $new_att=~ s{[^\w\d:_-]}{}g;
1160                         $new_att ||= 'a';
1161                         if( $new_att ne $att) { $node->{$new_att}= delete $node->{$att}; }
1162                       }
1163
1164                    if ( $empty_element_map->{$tag} && (!@{ $node->{'_content'} || []}) )
1165                      { $xml.= $node->starttag_XML( undef, 1 ); }
1166                    else
1167                      { $xml.= $node->starttag_XML(undef); }
1168                  }
1169                else
1170                 { # on the way out
1171                   unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || [] } )
1172                    { $xml.= $node->endtag_XML();
1173                    }     # otherwise it will have been an <... /> tag.
1174                  }
1175              }
1176            elsif( $node=~ /<!\[CDATA\[/)  # the content includes CDATA
1177              {  foreach my $chunk (split /(<!\[CDATA\[.*?\]\]>)/s, $node) # chunks are CDATA sections or normal text
1178                  { $xml.= $chunk =~ m{<!\[CDATA\[} ? $chunk : _xml_escape( $chunk); }
1179              }
1180            else   # it's just text
1181              { $xml .= _xml_escape($node); }
1182            1;            # keep traversing
1183        }
1184    );
1185  return $xml;
1186}
1187
1188sub _xml_escape
1189  { my( $html)= @_;
1190    $html =~ s{&(?!                     # An ampersand that isn't followed by...
1191                  (  \#[0-9]+;       |  #   A hash mark, digits and semicolon, or
1192                    \#x[0-9a-fA-F]+; |  #   A hash mark, "x", hex digits and semicolon, or
1193                    [\w]+;              #   A valid unicode entity name and semicolon
1194                  )
1195                )
1196              }
1197              {&amp;}gx if 0;    # Needs to be escaped to amp
1198
1199    $html=~ s{&}{&amp;}g;
1200
1201    # in old versions of HTML::TreeBuilder &amp; can come out as &Amp;
1202    if( $HTML::TreeBuilder::VERSION && $HTML::TreeBuilder::VERSION <= 3.23) { $html=~ s{&Amp;}{&amp;}g; }
1203
1204    # simple character escapes
1205    $html =~ s/</&lt;/g;
1206    $html =~ s/>/&gt;/g;
1207    $html =~ s/"/&quot;/g;
1208    $html =~ s/'/&apos;/g;
1209
1210    return $html;
1211  }
1212
1213
1214
1215
1216sub _check_xml
1217  { my( $xml)= @_; # $xml is a ref to the xml string
1218    my $ok= eval { XML::Parser->new->parse( $$xml); };
1219    #if( $ok) { warn "  parse OK\n"; }
1220    return $ok;
1221  }
1222
1223sub _encoding_from_meta
1224  { my( $tree)= @_;
1225    my $enc="iso-8859-1";
1226    my @meta= $tree->find( 'meta');
1227    foreach my $meta (@meta)
1228      { if(    $meta->{'http-equiv'} && ($meta->{'http-equiv'} =~ m{^\s*content-type\s*}i)
1229            && $meta->{content}      && ($meta->{content}      =~ m{^\s*text/html\s*;\s*charset\s*=\s*(\S*)\s*}i)
1230          )
1231          { $enc= lc $1;
1232            #warn "  encoding from meta tag is '$enc'\n";
1233            last;
1234          }
1235      }
1236    return $enc;
1237  }
1238
1239{ sub _to_utf8
1240    { my( $encoding, $string)= @_;
1241      local $SIG{__DIE__};
1242      if( _use(  'Encode'))
1243        { Encode::from_to( $string, $encoding => 'utf8', 0x0400); } # 0x0400 is Encode::FB_XMLCREF
1244      elsif( _use( 'Text::Iconv'))
1245        { my $converter =  eval { Text::Iconv->new( $encoding => "utf8") };
1246          if( $converter) {  $string= $converter->convert( $string); }
1247        }
1248      elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
1249        { my $map= Unicode::Map8->new( $encoding);
1250          $string= $map->tou( $string)->utf8;
1251        }
1252      $string=~ s{[\x00-\x08\x0B\x0C\x0E-\x1F]}{}g; # get rid of control chars, portable in 5.6
1253    return $string;
1254  }
1255}
1256
1257
1258sub _indent_xhtml
1259  { my( $xhtml)= @_; # $xhtml is a ref
1260    my %block_tag= map { $_ => 1 } qw( html
1261                                         head
1262                                           meta title link script base
1263                                         body
1264                                           h1 h2 h3 h4 h5 h6
1265                                           p br address  blockquote pre
1266                                           ol ul li  dd dl dt
1267                                           table tr td th tbody tfoot thead  col colgroup caption
1268                                           div frame frameset hr
1269                                     );
1270
1271    my $level=0;
1272    $$xhtml=~ s{( (?:<!(?:--.*?-->|[CDATA[.*?]]>)) # ignore comments and CDATA sections
1273                  | <(\w+)((?:\s+\w+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*/>) # empty tag
1274                  | <(\w+)                         # start tag
1275                  |</(\w+)                         # end tag
1276                )
1277               }
1278               { if(    $2 && $block_tag{$2})  { my $indent= "  " x $level;
1279                                                 "\n$indent<$2$3";
1280                                               }
1281                 elsif( $4 && $block_tag{$4})  { my $indent= "  " x $level;
1282                                                 $level++ unless( $4=~ m{/>});
1283                                                 my $nl= $4 eq 'html' ? '' : "\n";
1284                                                 "$nl$indent<$4";
1285                                               }
1286                 elsif( $5  && $block_tag{$5}) { $level--; "</$5"; }
1287                 else                          { $1; }
1288               }xesg;
1289  }
1290
1291
1292sub add_stylesheet
1293  { my( $t, $type, $href)= @_;
1294    my %text_type= map { $_ => 1 } qw( xsl css);
1295    my $ss= $t->{twig_elt_class}->new( $PI);
1296    if( $text_type{$type})
1297      { $ss->_set_pi( 'xml-stylesheet', qq{type="text/$type" href="$href"}); }
1298    else
1299      { croak "unsupported style sheet type '$type'"; }
1300
1301    $t->_add_cpi_outside_of_root( leading_cpi => $ss);
1302    return $t;
1303  }
1304
1305{ my %used;       # module => 1 if require ok, 0 otherwise
1306  my %disallowed; # for testing, refuses to _use modules in this hash
1307
1308  sub _disallow_use ## no critic (Subroutines::ProhibitNestedSubs);
1309    { my( @modules)= @_;
1310      $disallowed{$_}= 1 foreach (@modules);
1311    }
1312
1313  sub _allow_use  ## no critic (Subroutines::ProhibitNestedSubs);
1314    { my( @modules)= @_;
1315      $disallowed{$_}= 0 foreach (@modules);
1316    }
1317
1318  sub _use  ## no critic (Subroutines::ProhibitNestedSubs);
1319    { my( $module, $version)= @_;
1320      $version ||= 0;
1321      if( $disallowed{$module})   { return 0; }
1322      if( $used{$module})         { return 1; }
1323      if( eval "require $module") { import $module; $used{$module}= 1;  # no critic ProhibitStringyEval
1324                                    if( $version)
1325                                      {
1326                                        ## no critic (TestingAndDebugging::ProhibitNoStrict);
1327                                        no strict 'refs';
1328                                        if( ${"${module}::VERSION"} >= $version ) { return 1; }
1329                                        else                                      { return 0; }
1330                                      }
1331                                    else
1332                                      { return 1; }
1333                                  }
1334      else                        {                          $used{$module}= 0; return 0; }
1335    }
1336}
1337
1338# used to solve the [n] predicates while avoiding getting the entire list
1339# needs a prototype to accept passing bare blocks
1340sub _first_n(&$@)       ## no critic (Subroutines::ProhibitSubroutinePrototypes);
1341  { my $coderef= shift;
1342    my $n= shift;
1343    my $i=0;
1344    if( $n > 0)
1345      { foreach (@_)         { if( &$coderef) { $i++; return $_ if( $i == $n); } } }
1346    elsif( $n < 0)
1347      { foreach (reverse @_) { if( &$coderef) { $i--; return $_ if( $i == $n); } } }
1348    else
1349      { croak "illegal position number 0"; }
1350    return undef;
1351  }
1352
1353sub _slurp_uri
1354  { my( $uri, $base)= @_;
1355    if( $uri=~ m{^\w+://}) { _use( 'LWP::Simple'); return LWP::Simple::get( $uri); }
1356    else                   { return _slurp( _based_filename( $uri, $base));        }
1357  }
1358
1359sub _based_filename
1360  { my( $filename, $base)= @_;
1361    # cf. XML/Parser.pm's file_ext_ent_handler
1362    if (defined($base) and not ($filename =~ m{^(?:[\\/]|\w+:)}))
1363          { my $newpath = $base;
1364            $newpath =~ s{[^\\/:]*$}{$filename};
1365            $filename = $newpath;
1366          }
1367    return $filename;
1368  }
1369
1370sub _slurp
1371  { my( $filename)= @_;
1372    my $to_slurp;
1373    open( $to_slurp, "<$filename") or croak "cannot open '$filename': $!";
1374    local $/= undef;
1375    my $content= <$to_slurp>;
1376    close $to_slurp;
1377    return $content;
1378  }
1379
1380sub _slurp_fh
1381  { my( $fh)= @_;
1382    local $/= undef;
1383    my $content= <$fh>;
1384    return $content;
1385  }
1386
1387# I should really add extra options to allow better configuration of the
1388# LWP::UserAgent object
1389# this method forks (except on VMS!)
1390#   - the child gets the data and copies it to the pipe,
1391#   - the parent reads the stream and sends it to XML::Parser
1392# the data is cut it chunks the size of the XML::Parser::Expat buffer
1393# the method returns the twig and the status
1394sub _parseurl
1395  { my( $t, $safe, $url, $agent)= @_;
1396    _use( 'LWP') || croak "LWP not available, needed to use parseurl methods";
1397    if( $^O ne 'VMS')
1398      { pipe( README, WRITEME) or croak  "cannot create connected pipes: $!";
1399        if( my $pid= fork)
1400          { # parent code: parse the incoming file
1401            close WRITEME; # no need to write
1402            my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README);
1403            close README;
1404            return $@ ? 0 : $t;
1405          }
1406        else
1407         { # child
1408            close README; # no need to read
1409            local $|=1;
1410            $agent    ||= LWP::UserAgent->new;
1411            my $request  = HTTP::Request->new( GET => $url);
1412            # _pass_url_content is called with chunks of data the same size as
1413            # the XML::Parser buffer
1414            my $response = $agent->request( $request,
1415                             sub { _pass_url_content( \*WRITEME, @_); }, $BUFSIZE);
1416            $response->is_success or croak "$url ", $response->message;
1417            close WRITEME;
1418            CORE::exit(); # CORE is there for mod_perl (which redefines exit)
1419          }
1420      }
1421    else
1422      { # VMS branch (hard to test!)
1423        local $|=1;
1424        $agent    ||= LWP::UserAgent->new;
1425        my $request  = HTTP::Request->new( GET => $url);
1426        my $response = $agent->request( $request);
1427        $response->is_success or croak "$url ", $response->message;
1428        my $result= $safe ? $t->safe_parse($response->content) : $t->parse($response->content);
1429        return $@ ? 0 : $t;
1430     }
1431
1432  }
1433
1434# get the (hopefully!) XML data from the URL and
1435sub _pass_url_content
1436  { my( $fh, $data, $response, $protocol)= @_;
1437    print {$fh} $data;
1438  }
1439
1440sub add_options
1441  { my %args= map { $_, 1 } @_;
1442    %args= _normalize_args( %args);
1443    foreach (keys %args) { $valid_option{$_}++; }
1444  }
1445
1446sub _pretty_print_styles { return XML::Twig::Elt::_pretty_print_styles(); }
1447
1448sub _twig_store_internal_dtd
1449 {
1450   # warn " in _twig_store_internal_dtd...\n"; # DEBUG handler
1451    my( $p, $string)= @_;
1452    my $t= $p->{twig};
1453    if( $t->{twig_keep_encoding}) { $string= $p->original_string(); }
1454    $t->{twig_doctype}->{internal} .= $string;
1455    return;
1456  }
1457
1458sub _twig_stop_storing_internal_dtd
1459   { # warn " in _twig_stop_storing_internal_dtd...\n"; # DEBUG handler
1460    my $p= shift;
1461    if( @saved_default_handler && defined $saved_default_handler[1])
1462      { $p->setHandlers( @saved_default_handler); }
1463    else
1464      {
1465        $p->setHandlers( Default => undef);
1466      }
1467    $p->{twig}->{twig_doctype}->{internal}=~ s{^\s*\[}{};
1468    $p->{twig}->{twig_doctype}->{internal}=~ s{\]\s*$}{};
1469    return;
1470  }
1471
1472sub _twig_doctype_fin_print
1473  { # warn " in _twig_doctype_fin_print...\n"; # DEBUG handler
1474    my( $p)= shift;
1475    if( $p->{twig}->{twig_doctype}->{has_internal} && !$expat_1_95_2) { print ' ]>'; }
1476    return;
1477  }
1478
1479
1480sub _normalize_args
1481  { my %normalized_args;
1482    while( my $key= shift )
1483      { $key= join '', map { ucfirst } split /_/, $key;
1484        #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig');
1485        $normalized_args{$key}= shift ;
1486      }
1487    return %normalized_args;
1488  }
1489
1490sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); }
1491
1492sub _set_handler
1493  { my( $handlers, $whole_path, $handler)= @_;
1494
1495    my $H_SPECIAL = qr{($ALL|$DEFAULT|$COMMENT|$TEXT)};
1496    my $H_PI      = qr{(\?|$PI)\s*(([^\s]*)\s*)};
1497    my $H_LEVEL   = qr{level \s* \( \s* ([0-9]+) \s* \)}x;
1498    my $H_REGEXP  = qr{\(\?([\^xism]*)(-[\^xism]*)?:(.*)\)}x;
1499    my $H_XPATH   = qr{(/?/?$REG_TAG_PART? \s* ($REG_PREDICATE\s*)?)+}x;
1500
1501    my $prev_handler;
1502
1503    my $cpath= $whole_path;
1504    #warn "\$cpath: '$cpath\n";
1505    while( $cpath && $cpath=~ s{^\s*($H_SPECIAL|$H_PI|$H_LEVEL|$H_REGEXP|$H_XPATH)\s*($|\|)}{})
1506      { my $path= $1;
1507        #warn "\$cpath: '$cpath' - $path: '$path'\n";
1508        $prev_handler ||= $handlers->{handlers}->{string}->{$path} || undef; # $prev_handler gets the first found handler
1509
1510           _set_special_handler         ( $handlers, $path, $handler, $prev_handler)
1511        || _set_pi_handler              ( $handlers, $path, $handler, $prev_handler)
1512        || _set_level_handler           ( $handlers, $path, $handler, $prev_handler)
1513        || _set_regexp_handler          ( $handlers, $path, $handler, $prev_handler)
1514        || _set_xpath_handler           ( $handlers, $path, $handler, $prev_handler)
1515        || croak "unrecognized expression in handler: '$whole_path'";
1516
1517        # this both takes care of the simple (gi) handlers and store
1518        # the handler code reference for other handlers
1519        $handlers->{handlers}->{string}->{$path}= $handler;
1520      }
1521
1522    if( $cpath) { croak "unrecognized expression in handler: '$whole_path'"; }
1523
1524    return $prev_handler;
1525  }
1526
1527
1528sub _set_special_handler
1529  { my( $handlers, $path, $handler, $prev_handler)= @_;
1530    if( $path =~ m{^\s*($ALL|$DEFAULT|$COMMENT|$TEXT)\s*$}io )
1531      { $handlers->{handlers}->{$1}= $handler;
1532        return 1;
1533      }
1534    else
1535      { return 0; }
1536  }
1537
1538sub _set_xpath_handler
1539  { my( $handlers, $path, $handler, $prev_handler)= @_;
1540    if( my $handler_data= _parse_xpath_handler( $path, $handler))
1541      { _add_handler( $handlers, $handler_data, $path, $prev_handler);
1542        return 1;
1543      }
1544    else
1545      { return 0; }
1546  }
1547
1548sub _add_handler
1549  { my( $handlers, $handler_data, $path, $prev_handler)= @_;
1550
1551    my $tag= $handler_data->{tag};
1552    my @handlers= $handlers->{xpath_handler}->{$tag} ? @{$handlers->{xpath_handler}->{$tag}} : ();
1553
1554    if( $prev_handler) { @handlers= grep { $_->{path} ne $path } @handlers; }
1555
1556    push @handlers, $handler_data if( $handler_data->{handler});
1557
1558    if( @handlers > 1)
1559      { @handlers= sort {    (($b->{score}->{type}        || 0)  <=>  ($a->{score}->{type}        || 0))
1560                          || (($b->{score}->{anchored}    || 0)  <=>  ($a->{score}->{anchored}    || 0))
1561                          || (($b->{score}->{steps}       || 0)  <=>  ($a->{score}->{steps}       || 0))
1562                          || (($b->{score}->{predicates}  || 0)  <=>  ($a->{score}->{predicates}  || 0))
1563                          || (($b->{score}->{tests}       || 0)  <=>  ($a->{score}->{tests}       || 0))
1564                          || ($a->{path} cmp $b->{path})
1565                        } @handlers;
1566      }
1567
1568    $handlers->{xpath_handler}->{$tag}= \@handlers;
1569  }
1570
1571sub _set_pi_handler
1572  { my( $handlers, $path, $handler, $prev_handler)= @_;
1573    # PI conditions ( '?target' => \&handler or '?' => \&handler
1574    #             or '#PItarget' => \&handler or '#PI' => \&handler)
1575    if( $path=~ /^\s*(?:\?|$PI)\s*(?:([^\s]*)\s*)$/)
1576      { my $target= $1 || '';
1577        # update the path_handlers count, knowing that
1578        # either the previous or the new handler can be undef
1579        $handlers->{pi_handlers}->{$1}= $handler;
1580        return 1;
1581      }
1582    else
1583      { return 0;
1584      }
1585  }
1586
1587sub _set_level_handler
1588  { my( $handlers, $path, $handler, $prev_handler)= @_;
1589    if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox )
1590      { my $level= $1;
1591        my $sub= sub { my( $stack)= @_; return( ($stack->[-1]->{$ST_TAG} !~ m{^#}) && (scalar @$stack == $level + 1) ) };
1592        my $handler_data=  { tag=> '*', score => { type => $LEVEL_TRIGGER}, trigger => $sub,
1593                             path => $path, handler => $handler, test_on_text => 0
1594                           };
1595        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1596        return 1;
1597      }
1598    else
1599      { return 0; }
1600  }
1601
1602sub _set_regexp_handler
1603  { my( $handlers, $path, $handler, $prev_handler)= @_;
1604    # if the expression was a regexp it is now a string (it was stringified when it became a hash key)
1605    if( $path=~ m{^\(\?([\^xism]*)(?:-[\^xism]*)?:(.*)\)$})
1606      { my $regexp= qr/(?$1:$2)/; # convert it back into a regexp
1607        my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{$ST_TAG} =~ $regexp ) };
1608        my $handler_data=  { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub,
1609                             path => $path, handler => $handler, test_on_text => 0
1610                           };
1611        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1612        return 1;
1613      }
1614    else
1615      { return 0; }
1616  }
1617
1618my $DEBUG_HANDLER= 0; # 0 or 1 (output the handler checking code) or 2 (super verbose)
1619my $handler_string;   # store the handler itself
1620sub _set_debug_handler    { $DEBUG_HANDLER= shift; }
1621sub _warn_debug_handler   { if( $DEBUG_HANDLER < 3) { warn @_; } else { $handler_string .= join( '', @_); } }
1622sub _return_debug_handler { my $string=  $handler_string; $handler_string=''; return $string; }
1623
1624sub _parse_xpath_handler
1625  { my( $xpath, $handler)= @_;
1626    my $xpath_original= $xpath;
1627
1628
1629    if( $DEBUG_HANDLER >=1) { _warn_debug_handler( "\n\nparsing path '$xpath'\n"); }
1630
1631    my $path_to_check= $xpath;
1632    $path_to_check=~ s{/?/?$REG_TAG_PART?\s*(?:$REG_PREDICATE\s*)?}{}g;
1633    if( $DEBUG_HANDLER && $path_to_check=~ /\S/) { _warn_debug_handler( "left: $path_to_check\n"); }
1634    return if( $path_to_check=~ /\S/);
1635
1636    (my $xpath_to_display= $xpath)=~ s{(["{}'\[\]\@\$])}{\\$1}g;
1637
1638    my @xpath_steps;
1639    my $last_token_is_sep;
1640
1641    while( $xpath=~ s{^\s*
1642                       ( (//?)                                      # separator
1643                        | (?:$REG_TAG_PART\s*(?:$REG_PREDICATE\s*)?) # tag name and optional predicate
1644                        | (?:$REG_PREDICATE)                        # just a predicate
1645                       )
1646                     }
1647                     {}x
1648         )
1649      { # check that we have alternating separators and steps
1650        if( $2) # found a separator
1651          { if(  $last_token_is_sep) { return 0; }                                 # 2 separators in a row
1652            $last_token_is_sep= 1;
1653          }
1654        else
1655          { if( defined( $last_token_is_sep) && !$last_token_is_sep) { return 0; } # 2 steps in a row
1656            $last_token_is_sep= 0;
1657          }
1658
1659        push @xpath_steps, $1;
1660      }
1661    if( $last_token_is_sep) { return 0; } # expression cannot end with a separator
1662
1663    my $i=-1;
1664
1665    my $perlfunc= _join_n( $NO_WARNINGS . ';',
1666                           q|my( $stack)= @_;                    |,
1667                           q|my @current_elts= (scalar @$stack); |,
1668                           q|my @new_current_elts;               |,
1669                           q|my $elt;                            |,
1670                           ($DEBUG_HANDLER >= 1) && (qq#warn q{checking path '$xpath_to_display'\n};#),
1671                         );
1672
1673
1674    my $last_tag='';
1675    my $anchored= $xpath_original=~ m{^\s*/(?!/)} ? 1 : 0;
1676    my $score={ type => $XPATH_TRIGGER, anchored => $anchored };
1677    my $flag= { test_on_text => 0 };
1678    my $sep='/';  # '/' or '//'
1679    while( my $xpath_step= pop @xpath_steps)
1680      { my( $tag, $predicate)= $xpath_step =~ m{^($REG_TAG_PART)?(?:\[(.*)\])?\s*$};
1681        $score->{steps}++;
1682        $tag||='*';
1683
1684        my $warn_empty_stack= $DEBUG_HANDLER >= 2 ? qq{warn "return with empty stack\\n";} : '';
1685
1686        if( $predicate)
1687          { if( $DEBUG_HANDLER >= 2)  { _warn_debug_handler( "predicate is: '$predicate'\n"); }
1688            # changes $predicate (from an XPath expression to a Perl one)
1689            if( $predicate=~ m{^\s*$REG_NUMBER\s*$}) { croak "position selector [$predicate] not supported on twig_handlers"; }
1690            _parse_predicate_in_handler( $predicate, $flag, $score);
1691            if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate becomes: '$predicate'\n"); }
1692          }
1693
1694       my $tag_cond=  _tag_cond( $tag);
1695       my $cond= join( " && ", grep { $_ } $tag_cond, $predicate) || 1;
1696
1697       if( $css_sel && $tag=~ m{\.}) { $tag=~s{\.[^.]*$}{}; $tag ||='*'; }
1698       $tag=~ s{(.)#.+$}{$1};
1699
1700       $last_tag ||= $tag;
1701
1702       if( $sep eq '/')
1703         {
1704           $perlfunc .= sprintf( _join_n(  q#foreach my $current_elt (@current_elts)              #,
1705                                           q#  { next if( !$current_elt);                         #,
1706                                           q#    $current_elt--;                                  #,
1707                                           q#    $elt= $stack->[$current_elt];                    #,
1708                                           q#    if( %s) { push @new_current_elts, $current_elt;} #,
1709                                           q#  }                                                  #,
1710                                        ),
1711                                 $cond
1712                               );
1713         }
1714       elsif( $sep eq '//')
1715         {
1716           $perlfunc .= sprintf( _join_n(  q#foreach my $current_elt (@current_elts)                #,
1717                                           q#  { next if( !$current_elt);                           #,
1718                                           q#    $current_elt--;                                    #,
1719                                           q#    my $candidate= $current_elt;                       #,
1720                                           q#    while( $candidate >=0)                             #,
1721                                           q#      { $elt= $stack->[$candidate];                    #,
1722                                           q#        if( %s) { push @new_current_elts, $candidate;} #,
1723                                           q#        $candidate--;                                  #,
1724                                           q#      }                                                #,
1725                                           q#  }                                                    #,
1726                                        ),
1727                                 $cond
1728                               );
1729         }
1730       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq%fail at cond '$cond'%;#) : '';
1731       $perlfunc .= sprintf( _join_n( q#unless( @new_current_elts) { %s return 0; } #,
1732                                      q#@current_elts= @new_current_elts;           #,
1733                                      q#@new_current_elts=();                       #,
1734                                    ),
1735                             $warn
1736                           );
1737
1738        $sep= pop @xpath_steps;
1739     }
1740
1741    if( $anchored) # there should be a better way, but this works
1742      {
1743       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq{fail, stack not empty};#) : '';
1744       $perlfunc .= sprintf( _join_n( q#if( ! grep { $_ == 0 } @current_elts) { %s return 0;}#), $warn);
1745      }
1746
1747    $perlfunc.= qq{warn "handler for '$xpath_to_display' triggered\\n";\n} if( $DEBUG_HANDLER >=2);
1748    $perlfunc.= qq{return q{$xpath_original};\n};
1749    _warn_debug_handler( "\nperlfunc:\n$perlfunc\n") if( $DEBUG_HANDLER>=1);
1750    my $s= eval "sub { $perlfunc }";
1751      if( $@)
1752        { croak "wrong handler condition '$xpath' ($@);" }
1753
1754      _warn_debug_handler( "last tag: '$last_tag', test_on_text: '$flag->{test_on_text}'\n") if( $DEBUG_HANDLER >=1);
1755      _warn_debug_handler( "score: ", join( ' ', map { "$_: $score->{$_}" } sort keys %$score), "\n") if( $DEBUG_HANDLER >=1);
1756      return { tag=> $last_tag, score => $score, trigger => $s, path => $xpath_original, handler => $handler, test_on_text => $flag->{test_on_text} };
1757    }
1758
1759sub _join_n { return join( "\n", @_, ''); }
1760
1761# the "tag" part can be <tag>, <tag>.<class> or <tag>#<id> (where tag can be *, or start with # for hidden tags)
1762sub _tag_cond
1763  { my( $full_tag)= @_;
1764
1765    my( $tag, $class, $id);
1766    if( $full_tag=~ m{^(.+)#(.+)$})
1767      { ($tag, $id)= ($1, $2); } # <tag>#<id>
1768    else
1769      { ( $tag, $class)= $css_sel ? $full_tag=~ m{^(.*?)(?:\.([^.]*))?$} : ($full_tag, undef); }
1770
1771    my $tag_cond   = $tag && $tag ne '*' ? qq#(\$elt->{'$ST_TAG'} eq "$tag")# : '';
1772    my $id_cond    = defined $id         ? qq#(\$elt->{id} eq "$id")#  : '';
1773    my $class_cond = defined $class      ? qq#(\$elt->{class}=~ m{(^| )$class( |\$)})# : '';
1774
1775    my $full_cond= join( ' && ', grep { $_ } ( $tag_cond, $class_cond, $id_cond));
1776
1777    return $full_cond;
1778  }
1779
1780# input: the predicate ($_[0]) which will be changed in place
1781#        flags, a hashref with various flags (like test_on_text)
1782#        the score
1783sub _parse_predicate_in_handler
1784  { my( $flag, $score)= @_[1..2];
1785    $_[0]=~ s{(   ($REG_STRING)                            # strings
1786                 |\@($REG_TAG_NAME)(\s* $REG_MATCH \s* $REG_REGEXP) # @att and regexp
1787                 |\@($REG_TAG_NAME)(?=\s*(?:[><=!]))       # @att followed by a comparison operator
1788                 |\@($REG_TAG_NAME)                        # @att (not followed by a comparison operator)
1789                 |=~|!~                                    # matching operators
1790                 |([><]=?|=|!=)(?=\s*[\d+-])               # test before a number
1791                 |([><]=?|=|!=)                            # test, other cases
1792                 |($REG_FUNCTION)                          # no arg functions
1793                 # this bit is a mess, but it is the only solution with this half-baked parser
1794                 |(string\(\s*$REG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP)  # string( child)=~ /regexp/
1795                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_STRING)   # string( child) = "value" (or other test)
1796                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_NUMBER)   # string( child) = nb (or other test)
1797                 |(and|or)
1798                # |($REG_NAME(?=\s*(and|or|$)))            # nested tag name (needs to be after all other unquoted strings)
1799                 |($REG_TAG_IN_PREDICATE)                  # nested tag name (needs to be after all other unquoted strings)
1800
1801              )}
1802             { my( $token, $str, $att_re_name, $att_re_regexp, $att, $bare_att, $num_test, $alpha_test, $func, $str_regexp, $str_test_alpha, $str_test_num, $and_or, $tag)
1803               = ( $1,     $2,   $3,           $4,             $5,   $6,        $7,        $8,          $9,    $10,         $11,             $12,           $13,     $14);
1804
1805               $score->{predicates}++;
1806
1807               # store tests on text (they are not always allowed)
1808               if( $func || $str_regexp || $str_test_num || $str_test_alpha ) { $flag->{test_on_text}= 1;   }
1809
1810               if( defined $str)      { $token }
1811               elsif( $tag)           { qq{(\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->has_child( '$tag'))} }
1812               elsif( $att)           { $att=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att'})}
1813                                                     : qq{\$elt->{'$att'}}
1814                                      }
1815               elsif( $att_re_name)   { $att_re_name=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att_re_name'}$att_re_regexp)}
1816                                                     : qq{\$elt->{'$att_re_name'}$att_re_regexp}
1817                                      }
1818                                        # for some reason Devel::Cover flags the following lines as not tested. They are though.
1819               elsif( $bare_att)      { $bare_att=~ m{^#} ? qq{(\$elt->{'$ST_ELT'} && defined(\$elt->{'$ST_ELT'}->{att}->{'$bare_att'}))}
1820                                                          : qq{defined( \$elt->{'$bare_att'})}
1821                                      }
1822               elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged
1823               elsif( $alpha_test)    { $PERL_ALPHA_TEST{$alpha_test} }
1824               elsif( $func && $func=~ m{^string})
1825                                      { "\$elt->{'$ST_ELT'}->text"; }
1826               elsif( $str_regexp     && $str_regexp     =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)})
1827                                      { "defined( _first_n {  \$_->text $2 $3 } 1, \$elt->{'$ST_ELT'}->_children( '$1'))"; }
1828               elsif( $str_test_alpha && $str_test_alpha =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_STRING)})
1829                                      { my( $tag, $op, $str)= ($1, $2, $3);
1830                                        $str=~ s{(?<=.)'(?=.)}{\\'}g; # escape a quote within the string
1831                                        $str=~ s{^"}{'};
1832                                        $str=~ s{"$}{'};
1833                                        "defined( _first_n { \$_->text $PERL_ALPHA_TEST{$op} $str } 1, \$elt->{'$ST_ELT'}->children( '$tag'))"; }
1834               elsif( $str_test_num   && $str_test_num   =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_NUMBER)})
1835                                      { my $test= ($2 eq '=') ? '==' : $2;
1836                                        "defined( _first_n { \$_->text $test $3 } 1, \$elt->{'$ST_ELT'}->children( '$1'))";
1837                                      }
1838               elsif( $and_or)        { $score->{tests}++; $and_or eq 'and' ? '&&' : '||' ; }
1839               else                   { $token; }
1840             }gexs;
1841  }
1842
1843
1844sub setCharHandler
1845  { my( $t, $handler)= @_;
1846    $t->{twig_char_handler}= $handler;
1847  }
1848
1849
1850sub _reset_handlers
1851  { my $handlers= shift;
1852    delete $handlers->{handlers};
1853    delete $handlers->{path_handlers};
1854    delete $handlers->{subpath_handlers};
1855    $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers});
1856    delete $handlers->{attcond_handlers};
1857  }
1858
1859sub _set_handlers
1860  { my $handlers= shift || return;
1861    my $set_handlers= {};
1862    foreach my $path (keys %{$handlers})
1863      { _set_handler( $set_handlers, $path, $handlers->{$path}); }
1864
1865    return $set_handlers;
1866  }
1867
1868
1869sub setTwigHandler
1870  { my( $t, $path, $handler)= @_;
1871    $t->{twig_handlers} ||={};
1872    return _set_handler( $t->{twig_handlers}, $path, $handler);
1873  }
1874
1875sub setTwigHandlers
1876  { my( $t, $handlers)= @_;
1877    my $previous_handlers= $t->{twig_handlers} || undef;
1878    _reset_handlers( $t->{twig_handlers});
1879    $t->{twig_handlers}= _set_handlers( $handlers);
1880    return $previous_handlers;
1881  }
1882
1883sub setStartTagHandler
1884  { my( $t, $path, $handler)= @_;
1885    $t->{twig_starttag_handlers}||={};
1886    return _set_handler( $t->{twig_starttag_handlers}, $path, $handler);
1887  }
1888
1889sub setStartTagHandlers
1890  { my( $t, $handlers)= @_;
1891    my $previous_handlers= $t->{twig_starttag_handlers} || undef;
1892    _reset_handlers( $t->{twig_starttag_handlers});
1893    $t->{twig_starttag_handlers}= _set_handlers( $handlers);
1894    return $previous_handlers;
1895   }
1896
1897sub setIgnoreEltsHandler
1898  { my( $t, $path, $action)= @_;
1899    $t->{twig_ignore_elts_handlers}||={};
1900    return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action );
1901  }
1902
1903sub setIgnoreEltsHandlers
1904  { my( $t, $handlers)= @_;
1905    my $previous_handlers= $t->{twig_ignore_elts_handlers};
1906    _reset_handlers( $t->{twig_ignore_elts_handlers});
1907    $t->{twig_ignore_elts_handlers}= _set_handlers( $handlers);
1908    return $previous_handlers;
1909   }
1910
1911sub setEndTagHandler
1912  { my( $t, $path, $handler)= @_;
1913    $t->{twig_endtag_handlers}||={};
1914    return _set_handler( $t->{twig_endtag_handlers}, $path,$handler);
1915  }
1916
1917sub setEndTagHandlers
1918  { my( $t, $handlers)= @_;
1919    my $previous_handlers= $t->{twig_endtag_handlers};
1920    _reset_handlers( $t->{twig_endtag_handlers});
1921    $t->{twig_endtag_handlers}= _set_handlers( $handlers);
1922    return $previous_handlers;
1923   }
1924
1925# a little more complex: set the twig_handlers only if a code ref is given
1926sub setTwigRoots
1927  { my( $t, $handlers)= @_;
1928    my $previous_roots= $t->{twig_roots};
1929    _reset_handlers($t->{twig_roots});
1930    $t->{twig_roots}= _set_handlers( $handlers);
1931
1932    _check_illegal_twig_roots_handlers( $t->{twig_roots});
1933
1934    foreach my $path (keys %{$handlers})
1935      { $t->{twig_handlers}||= {};
1936        _set_handler( $t->{twig_handlers}, $path, $handlers->{$path})
1937          if( ref($handlers->{$path}) && isa( $handlers->{$path}, 'CODE'));
1938      }
1939    return $previous_roots;
1940  }
1941
1942sub _check_illegal_twig_roots_handlers
1943  { my( $handlers)= @_;
1944    foreach my $tag_handlers (values %{$handlers->{xpath_handler}})
1945      { foreach my $handler_data (@$tag_handlers)
1946          { if( my $type= $handler_data->{test_on_text})
1947              { croak "string() condition not supported on twig_roots option"; }
1948          }
1949      }
1950    return;
1951  }
1952
1953
1954# just store the reference to the expat object in the twig
1955sub _twig_init
1956   { # warn " in _twig_init...\n"; # DEBUG handler
1957
1958    my $p= shift;
1959    my $t=$p->{twig};
1960
1961    if( $t->{twig_parsing} ) { croak "cannot reuse a twig that is already parsing"; }
1962    $t->{twig_parsing}=1;
1963
1964    $t->{twig_parser}= $p;
1965    if( $weakrefs) { weaken( $t->{twig_parser}); }
1966
1967    # in case they had been created by a previous parse
1968    delete $t->{twig_dtd};
1969    delete $t->{twig_doctype};
1970    delete $t->{twig_xmldecl};
1971    delete $t->{twig_root};
1972
1973    # if needed set the output filehandle
1974    $t->_set_fh_to_twig_output_fh();
1975    return;
1976  }
1977
1978# uses eval to catch the parser's death
1979sub safe_parse
1980  { my $t= shift;
1981    eval { $t->parse( @_); } ;
1982    return $@ ? $t->_reset_twig_after_error : $t;
1983  }
1984
1985sub safe_parsefile
1986  { my $t= shift;
1987    eval { $t->parsefile( @_); } ;
1988    return $@ ? $t->_reset_twig_after_error : $t;
1989  }
1990
1991# restore a twig in a proper state so it can be reused for a new parse
1992sub _reset_twig
1993  { my $t= shift;
1994    $t->{twig_parsing}= 0;
1995    delete $t->{twig_current};
1996    delete $t->{extra_data};
1997    delete $t->{twig_dtd};
1998    delete $t->{twig_in_pcdata};
1999    delete $t->{twig_in_cdata};
2000    delete $t->{twig_stored_space};
2001    delete $t->{twig_entity_list};
2002    $t->root->delete if( $t->root);
2003    delete $t->{twig_root};
2004    return $t;
2005  }
2006
2007sub _reset_twig_after_error
2008  { my $t= shift;
2009    $t->_reset_twig;
2010    return undef;
2011  }
2012
2013
2014sub _add_or_discard_stored_spaces
2015  { my $t= shift;
2016
2017    $t->{twig_right_after_root}=0; #XX
2018
2019    my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear
2020    return unless length $t->{twig_stored_spaces};
2021    my $current_gi= $XML::Twig::index2gi[$current->{'gi'}];
2022
2023    if( ! $t->{twig_discard_all_spaces})
2024      { if( ! defined( $t->{twig_space_policy}->{$current_gi}))
2025          { $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); }
2026        if(    $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $t->{twig_preserve_space})
2027          { _insert_pcdata( $t, $t->{twig_stored_spaces} ); }
2028      }
2029
2030    $t->{twig_stored_spaces}='';
2031
2032    return;
2033  }
2034
2035# the default twig handlers, which build the tree
2036sub _twig_start
2037   { # warn " in _twig_start...\n"; # DEBUG handler
2038
2039    #foreach my $s (@_) { next if ref $s; warn "$s: ", is_utf8( $s) ? "has flag" : "FLAG NOT SET"; } # YYY
2040
2041    my ($p, $gi, @att)= @_;
2042    my $t=$p->{twig};
2043
2044    # empty the stored pcdata (space stored in case they are really part of
2045    # a pcdata element) or stored it if the space policy dictates so
2046    # create a pcdata element with the spaces if need be
2047    _add_or_discard_stored_spaces( $t);
2048    my $parent= $t->{twig_current};
2049
2050    # if we were parsing PCDATA then we exit the pcdata
2051    if( $t->{twig_in_pcdata})
2052      { $t->{twig_in_pcdata}= 0;
2053        delete $parent->{'twig_current'};
2054        $parent= $parent->{parent};
2055      }
2056
2057    # if we choose to keep the encoding then we need to parse the tag
2058    if( my $func = $t->{parse_start_tag})
2059      { ($gi, @att)= &$func($p->original_string); }
2060    elsif( $t->{twig_entities_in_attribute})
2061      {
2062       ($gi,@att)= _parse_start_tag( $p->recognized_string);
2063         $t->{twig_entities_in_attribute}=0;
2064      }
2065
2066    # if we are using an external DTD, we need to fill the default attributes
2067    if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); }
2068
2069    # filter the input data if need be
2070    if( my $filter= $t->{twig_input_filter})
2071      { $gi= $filter->( $gi);
2072        foreach my $att (@att) { $att= $filter->($att); }
2073      }
2074
2075    my $ns_decl;
2076    if( $t->{twig_map_xmlns})
2077      { $ns_decl= _replace_ns( $t, \$gi, \@att); }
2078
2079    my $elt= $t->{twig_elt_class}->new( $gi);
2080    $elt->set_atts( @att);
2081
2082    # now we can store the tag and atts
2083    my $context= { $ST_TAG => $gi, $ST_ELT => $elt, @att};
2084    $context->{$ST_NS}= $ns_decl if $ns_decl;
2085    if( $weakrefs) { weaken( $context->{$ST_ELT}); }
2086    push @{$t->{_twig_context_stack}}, $context;
2087
2088    delete $parent->{'twig_current'} if( $parent);
2089    $t->{twig_current}= $elt;
2090    $elt->{'twig_current'}=1;
2091
2092    if( $parent)
2093      { my $prev_sibling= $parent->{last_child};
2094        if( $prev_sibling)
2095          { $prev_sibling->{next_sibling}=  $elt;
2096            $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2097          }
2098
2099        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2100        unless( $parent->{first_child}) { $parent->{first_child}=  $elt; }
2101         delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2102      }
2103    else
2104      { # processing root
2105        $t->set_root( $elt);
2106        # call dtd handler if need be
2107        $t->{twig_dtd_handler}->($t, $t->{twig_dtd})
2108          if( defined $t->{twig_dtd_handler});
2109
2110        # set this so we can catch external entities
2111        # (the handler was modified during DTD processing)
2112        if( $t->{twig_default_print})
2113          { $p->setHandlers( Default => \&_twig_print); }
2114        elsif( $t->{twig_roots})
2115          { $p->setHandlers( Default => sub { return }); }
2116        else
2117          { $p->setHandlers( Default => \&_twig_default); }
2118      }
2119
2120    $elt->{empty}=  $p->recognized_string=~ m{/\s*>$}s ? 1 : 0;
2121
2122    $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data});
2123    $t->{extra_data}='';
2124
2125    # if the element is ID-ed then store that info
2126    my $id= $elt->{'att'}->{$ID};
2127    if( defined $id)
2128      { $t->{twig_id_list}->{$id}= $elt;
2129        if( $weakrefs) { weaken( $t->{twig_id_list}->{$id}); }
2130      }
2131
2132    # call user handler if need be
2133    if( $t->{twig_starttag_handlers})
2134      { # call all appropriate handlers
2135        my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi);
2136
2137        local $_= $elt;
2138
2139        foreach my $handler ( @handlers)
2140          { $handler->($t, $elt) || last; }
2141        # call _all_ handler if needed
2142        if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL})
2143          { $all->($t, $elt); }
2144      }
2145
2146    # check if the tag is in the list of tags to be ignored
2147    if( $t->{twig_ignore_elts_handlers})
2148      { my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi);
2149        # only the first handler counts, it contains the action (discard/print/string)
2150        if( @handlers) { my $action= shift @handlers; $t->ignore( $elt, $action); }
2151      }
2152
2153    if( $elt->{'att'}->{'xml:space'} && (  $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}++; }
2154
2155
2156    return;
2157  }
2158
2159sub _replace_ns
2160  { my( $t, $gi, $atts)= @_;
2161    my $decls;
2162    foreach my $new_prefix ( $t->parser->new_ns_prefixes)
2163      { my $uri= $t->parser->expand_ns_prefix( $new_prefix);
2164        # replace the prefix if it is mapped
2165        $decls->{$new_prefix}= $uri;
2166        if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri}))
2167          { $new_prefix= $mapped_prefix; }
2168        # now put the namespace declaration back in the element
2169        if( $new_prefix eq '#default')
2170          { push @$atts, "xmlns" =>  $uri; }
2171        else
2172          { push @$atts, "xmlns:$new_prefix" =>  $uri; }
2173      }
2174
2175    if( $t->{twig_keep_original_prefix})
2176      { # things become more complex: we need to find the original prefix
2177        # and store both prefixes
2178        my $ns_info= $t->_ns_info( $$gi);
2179        my $map_att;
2180        if( $ns_info->{mapped_prefix})
2181          { $$gi= "$ns_info->{mapped_prefix}:$$gi";
2182            $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2183          }
2184        my $att_name=1;
2185        foreach( @$atts)
2186          { if( $att_name)
2187              {
2188                my $ns_info= $t->_ns_info( $_);
2189                if( $ns_info->{mapped_prefix})
2190                  { $_= "$ns_info->{mapped_prefix}:$_";
2191                    $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2192                  }
2193                $att_name=0;
2194              }
2195            else
2196              {  $att_name=1; }
2197          }
2198        push @$atts, '#original_gi', $map_att if( $map_att);
2199      }
2200    else
2201      { $$gi= $t->_replace_prefix( $$gi);
2202        my $att_name=1;
2203        foreach( @$atts)
2204          { if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; }
2205            else           {  $att_name=1; }
2206          }
2207      }
2208    return $decls;
2209  }
2210
2211
2212# extract prefix, local_name, uri, mapped_prefix from a name
2213# will only work if called from a start or end tag handler
2214sub _ns_info
2215  { my( $t, $name)= @_;
2216    my $ns_info={};
2217    my $p= $t->parser;
2218    $ns_info->{uri}= $p->namespace( $name);
2219    return $ns_info unless( $ns_info->{uri});
2220
2221    $ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri});
2222    $ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix};
2223
2224    return $ns_info;
2225  }
2226
2227sub _a_proper_ns_prefix
2228  { my( $p, $uri)= @_;
2229    foreach my $prefix ($p->current_ns_prefixes)
2230      { if( $p->expand_ns_prefix( $prefix) eq $uri)
2231          { return $prefix; }
2232      }
2233    return;
2234  }
2235
2236# returns the uri bound to a prefix in the original document
2237# only works in a handler
2238# can be used to deal with xsi:type attributes
2239sub original_uri
2240  { my( $t, $prefix)= @_;
2241    my $ST_NS  = '##ns' ;
2242    foreach my $ns (map { $_->{$ST_NS} if  $_->{$ST_NS} } reverse @{$t->{_twig_context_stack}})
2243      { return $ns->{$prefix} || next; }
2244    return;
2245  }
2246
2247
2248sub _fill_default_atts
2249  { my( $t, $gi, $atts)= @_;
2250    my $dtd= $t->{twig_dtd};
2251    my $attlist= $dtd->{att}->{$gi};
2252    my %value= @$atts;
2253    foreach my $att (keys %$attlist)
2254      { if(   !exists( $value{$att})
2255            && exists( $attlist->{$att}->{default})
2256            && ( $attlist->{$att}->{default} ne '#IMPLIED')
2257          )
2258          { # the quotes are included in the default, so we need to remove them
2259            my $default_value= substr( $attlist->{$att}->{default}, 1, -1);
2260            push @$atts, $att, $default_value;
2261          }
2262      }
2263    return;
2264  }
2265
2266
2267# the default function to parse a start tag (in keep_encoding mode)
2268# can be overridden with the parse_start_tag method
2269# only works for 1-byte character sets
2270sub _parse_start_tag
2271  { my $string= shift;
2272    my( $gi, @atts);
2273
2274    # get the gi (between < and the first space, / or > character)
2275    #if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s)
2276    if( $string=~ s{^<\s*($REG_TAG_NAME)\s*[\s>/]}{}s)
2277      { $gi= $1; }
2278    else
2279      { croak "error parsing tag '$string'"; }
2280    while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s)
2281      { push @atts, $1, $3; }
2282    return $gi, @atts;
2283  }
2284
2285sub set_root
2286  { my( $t, $elt)= @_;
2287    $t->{twig_root}= $elt;
2288    if( $elt)
2289      { $elt->{twig}= $t;
2290        if( $weakrefs) { weaken(  $elt->{twig}); }
2291      }
2292    return $t;
2293  }
2294
2295sub _twig_end
2296   { # warn " in _twig_end...\n"; # DEBUG handler
2297    my ($p, $gi)  = @_;
2298
2299    my $t=$p->{twig};
2300
2301    if( $t->{twig_in_pcdata} && (my $text_handler= $t->{TwigHandlers}->{$TEXT}) )
2302      { local $_= $t->{twig_current}; $text_handler->( $t, $_) if $_;
2303      }
2304
2305    if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); }
2306
2307    _add_or_discard_stored_spaces( $t);
2308
2309    # the new twig_current is the parent
2310    my $elt= $t->{twig_current};
2311    delete $elt->{'twig_current'};
2312
2313    # if we were parsing PCDATA then we exit the pcdata too
2314    if( $t->{twig_in_pcdata})
2315      {
2316        $t->{twig_in_pcdata}= 0;
2317        $elt= $elt->{parent} if($elt->{parent});
2318        delete $elt->{'twig_current'};
2319      }
2320
2321    # parent is the new current element
2322    my $parent= $elt->{parent};
2323    $t->{twig_current}= $parent;
2324
2325    if( $parent)
2326      { $parent->{'twig_current'}=1;
2327        # twig_to_be_normalized
2328        if( $parent->{twig_to_be_normalized}) { $parent->normalize; $parent->{twig_to_be_normalized}=0; }
2329      }
2330
2331    if( $t->{extra_data})
2332      { $elt->_set_extra_data_before_end_tag( $t->{extra_data});
2333        $t->{extra_data}='';
2334      }
2335
2336    if( $t->{twig_handlers})
2337      { # look for handlers
2338        my @handlers= _handler( $t, $t->{twig_handlers}, $gi);
2339
2340        if( $t->{twig_tdh})
2341          { if( @handlers) { push @{$t->{twig_handlers_to_trigger}}, [ $elt, \@handlers ]; }
2342            if( my $all= $t->{twig_handlers}->{handlers}->{$ALL})
2343              { push @{$t->{twig_handlers_to_trigger}}, [ $elt, [$all] ]; }
2344          }
2345        else
2346          {
2347            local $_= $elt; # so we can use $_ in the handlers
2348
2349            foreach my $handler ( @handlers)
2350              { $handler->($t, $elt) || last; }
2351            # call _all_ handler if needed
2352            my $all= $t->{twig_handlers}->{handlers}->{$ALL};
2353            if( $all)
2354              { $all->($t, $elt); }
2355            if( @handlers || $all) { $t->{twig_right_after_root}=0; }
2356          }
2357      }
2358
2359    # if twig_roots is set for the element then set appropriate handler
2360    if(  $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) )
2361      { if( $t->{twig_default_print})
2362          { # select the proper fh (and store the currently selected one)
2363            $t->_set_fh_to_twig_output_fh();
2364            if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX
2365            if( $t->{twig_keep_encoding})
2366              { $p->setHandlers( %twig_handlers_roots_print_original); }
2367            else
2368              { $p->setHandlers( %twig_handlers_roots_print); }
2369          }
2370        else
2371          { $p->setHandlers( %twig_handlers_roots); }
2372      }
2373
2374    if( $elt->{'att'}->{'xml:space'} && (  $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}--; }
2375
2376    pop @{$t->{_twig_context_stack}};
2377    return;
2378  }
2379
2380sub _trigger_tdh
2381  { my( $t)= @_;
2382
2383    if( @{$t->{twig_handlers_to_trigger}})
2384      { my @handlers_to_trigger_now= sort { $a->[0]->cmp( $b->[0]) } @{$t->{twig_handlers_to_trigger}};
2385        foreach my $elt_handlers (@handlers_to_trigger_now)
2386          { my( $handled_elt, $handlers_to_trigger)= @$elt_handlers;
2387            foreach my $handler ( @$handlers_to_trigger)
2388              { local $_= $handled_elt; $handler->($t, $handled_elt) || last; }
2389          }
2390      }
2391    return;
2392  }
2393
2394# return the list of handler that can be activated for an element
2395# (either of CODE ref's or 1's for twig_roots)
2396
2397sub _handler
2398  { my( $t, $handlers, $gi)= @_;
2399
2400    my @found_handlers=();
2401    my $found_handler;
2402
2403    foreach my $handler ( map { @$_ } grep { $_ } $handlers->{xpath_handler}->{$gi}, $handlers->{xpath_handler}->{'*'})
2404      {  my $trigger= $handler->{trigger};
2405         if( my $found_path= $trigger->( $t->{_twig_context_stack}))
2406          { my $found_handler= $handler->{handler};
2407            push @found_handlers, $found_handler;
2408          }
2409      }
2410
2411    # if no handler found call default handler if defined
2412    if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT})
2413      { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; }
2414
2415    if( @found_handlers and $t->{twig_do_not_chain_handlers})
2416      { @found_handlers= ($found_handlers[0]); }
2417
2418    return @found_handlers; # empty if no handler found
2419
2420  }
2421
2422
2423sub _replace_prefix
2424  { my( $t, $name)= @_;
2425    my $p= $t->parser;
2426    my $uri= $p->namespace( $name);
2427    # try to get the namespace from default if none is found (for attributes)
2428    # this should probably be an option
2429    if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); }
2430    if( $uri)
2431      { if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri} || $DEFAULT_URI2NS{$uri})
2432          { return "$mapped_prefix:$name"; }
2433        else
2434          { my $prefix= _a_proper_ns_prefix( $p, $uri);
2435            if( $prefix eq '#default') { $prefix=''; }
2436            return $prefix ? "$prefix:$name" : $name;
2437          }
2438      }
2439    else
2440      { return $name; }
2441  }
2442
2443
2444sub _twig_char
2445   { # warn " in _twig_char...\n"; # DEBUG handler
2446
2447    my ($p, $string)= @_;
2448    my $t=$p->{twig};
2449
2450    if( $t->{twig_keep_encoding})
2451      { if( !$t->{twig_in_cdata})
2452          { $string= $p->original_string(); }
2453        else
2454          {
2455            use bytes; # > perl 5.5
2456            if( length( $string) < 1024)
2457              { $string= $p->original_string(); }
2458            else
2459              { #warn "dodgy case";
2460                # TODO original_string does not hold the entire string, but $string is wrong
2461                # I believe due to a bug in XML::Parser
2462                # for now, we use the original string, even if it means that it's been converted to utf8
2463              }
2464          }
2465      }
2466
2467    if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); }
2468    if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); }
2469
2470    my $elt= $t->{twig_current};
2471
2472    if(    $t->{twig_in_cdata})
2473      { # text is the continuation of a previously created cdata
2474        $elt->{cdata}.=  $t->{twig_stored_spaces} . $string;
2475      }
2476    elsif( $t->{twig_in_pcdata})
2477      { # text is the continuation of a previously created pcdata
2478        if( $t->{extra_data})
2479          { $elt->_push_extra_data_in_pcdata( $t->{extra_data}, length( $elt->{pcdata}));
2480            $t->{extra_data}='';
2481          }
2482        $elt->{pcdata}.=  $string;
2483      }
2484    else
2485      {
2486        # text is just space, which might be discarded later
2487        if( $string=~/\A\s*\Z/s)
2488          {
2489            if( $t->{extra_data})
2490              { # we got extra data (comment, pi), lets add the spaces to it
2491                $t->{extra_data} .= $string;
2492              }
2493            else
2494              { # no extra data, just store the spaces
2495                $t->{twig_stored_spaces}.= $string;
2496              }
2497          }
2498        else
2499          { my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string);
2500            delete $elt->{'twig_current'};
2501            $new_elt->{'twig_current'}=1;
2502            $t->{twig_current}= $new_elt;
2503            $t->{twig_in_pcdata}=1;
2504            if( $t->{extra_data})
2505              { $new_elt->_push_extra_data_in_pcdata( $t->{extra_data}, 0);
2506                $t->{extra_data}='';
2507              }
2508          }
2509      }
2510    return;
2511  }
2512
2513sub _twig_cdatastart
2514   { # warn " in _twig_cdatastart...\n"; # DEBUG handler
2515
2516    my $p= shift;
2517    my $t=$p->{twig};
2518
2519    $t->{twig_in_cdata}=1;
2520    my $cdata=  $t->{twig_elt_class}->new( $CDATA);
2521    my $twig_current= $t->{twig_current};
2522
2523    if( $t->{twig_in_pcdata})
2524      { # create the node as a sibling of the PCDATA
2525        $cdata->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2526        $twig_current->{next_sibling}=  $cdata;
2527        my $parent= $twig_current->{parent};
2528        $cdata->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2529         delete $parent->{empty}; $parent->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2530        $t->{twig_in_pcdata}=0;
2531      }
2532    else
2533      { # we have to create a PCDATA element if we need to store spaces
2534        if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2535          { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2536        $t->{twig_stored_spaces}='';
2537
2538        # create the node as a child of the current element
2539        $cdata->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2540        if( my $prev_sibling= $twig_current->{last_child})
2541          { $cdata->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2542            $prev_sibling->{next_sibling}=  $cdata;
2543          }
2544        else
2545          { $twig_current->{first_child}=  $cdata; }
2546         delete $twig_current->{empty}; $twig_current->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ;
2547
2548      }
2549
2550    delete $twig_current->{'twig_current'};
2551    $t->{twig_current}= $cdata;
2552    $cdata->{'twig_current'}=1;
2553    if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' };
2554    return;
2555  }
2556
2557sub _twig_cdataend
2558   { # warn " in _twig_cdataend...\n"; # DEBUG handler
2559
2560    my $p= shift;
2561    my $t=$p->{twig};
2562
2563    $t->{twig_in_cdata}=0;
2564
2565    my $elt= $t->{twig_current};
2566    delete $elt->{'twig_current'};
2567    my $cdata= $elt->{cdata};
2568    $elt->{cdata}=  $cdata;
2569
2570    push @{$t->{_twig_context_stack}}, { $ST_TAG => $CDATA };
2571
2572    if( $t->{twig_handlers})
2573      { # look for handlers
2574        my @handlers= _handler( $t, $t->{twig_handlers}, $CDATA);
2575        local $_= $elt; # so we can use $_ in the handlers
2576        foreach my $handler ( @handlers) { $handler->($t, $elt) || last; }
2577      }
2578
2579    pop @{$t->{_twig_context_stack}};
2580
2581    $elt= $elt->{parent};
2582    $t->{twig_current}= $elt;
2583    $elt->{'twig_current'}=1;
2584
2585    $t->{twig_long_cdata}=0;
2586    return;
2587  }
2588
2589sub _pi_elt_handlers
2590  { my( $t, $pi)= @_;
2591    my $pi_handlers= $t->{twig_handlers}->{pi_handlers} || return;
2592    foreach my $handler ( $pi_handlers->{$pi->{target}}, $pi_handlers->{''})
2593      { if( $handler) { local $_= $pi; $handler->( $t, $pi) || last; } }
2594  }
2595
2596sub _pi_text_handler
2597  { my( $t, $target, $data)= @_;
2598    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target})
2599      { return $handler->( $t, $target, $data); }
2600    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{''})
2601      { return $handler->( $t, $target, $data); }
2602    return defined( $data) && $data ne ''  ? "<?$target $data?>" : "<?$target?>" ;
2603  }
2604
2605sub _comment_elt_handler
2606  { my( $t, $comment)= @_;
2607    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2608      { local $_= $comment; $handler->($t, $comment); }
2609  }
2610
2611sub _comment_text_handler
2612  { my( $t, $comment)= @_;
2613    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2614      { $comment= $handler->($t, $comment);
2615        if( !defined $comment || $comment eq '') { return ''; }
2616      }
2617    return "<!--$comment-->";
2618  }
2619
2620
2621
2622sub _twig_comment
2623   { # warn " in _twig_comment...\n"; # DEBUG handler
2624
2625    my( $p, $comment_text)= @_;
2626    my $t=$p->{twig};
2627
2628    if( $t->{twig_keep_encoding}) { $comment_text= substr( $p->original_string(), 4, -3); }
2629
2630    $t->_twig_pi_comment( $p, $COMMENT, $t->{twig_keep_comments}, $t->{twig_process_comments},
2631                          '_set_comment', '_comment_elt_handler', '_comment_text_handler', $comment_text
2632                        );
2633    return;
2634  }
2635
2636sub _twig_pi
2637   { # warn " in _twig_pi...\n"; # DEBUG handler
2638
2639    my( $p, $target, $data)= @_;
2640    my $t=$p->{twig};
2641
2642    if( $t->{twig_keep_encoding})
2643      { my $pi_text= substr( $p->original_string(), 2, -2);
2644        ($target, $data)= split( /\s+/, $pi_text, 2);
2645      }
2646
2647    $t->_twig_pi_comment( $p, $PI, $t->{twig_keep_pi}, $t->{twig_process_pi},
2648                          '_set_pi', '_pi_elt_handlers', '_pi_text_handler', $target, $data
2649                        );
2650    return;
2651  }
2652
2653sub _twig_pi_comment
2654  { my( $t, $p, $type, $keep, $process, $set, $elt_handler, $text_handler, @parser_args)= @_;
2655
2656    if( $t->{twig_input_filter})
2657          { foreach my $arg (@parser_args) { $arg= $t->{twig_input_filter}->( $arg); } }
2658
2659    # if pi/comments are to be kept then we piggyback them to the current element
2660    if( $keep)
2661      { # first add spaces
2662        if( $t->{twig_stored_spaces})
2663              { $t->{extra_data}.= $t->{twig_stored_spaces};
2664                $t->{twig_stored_spaces}= '';
2665              }
2666
2667        my $extra_data= $t->$text_handler( @parser_args);
2668        $t->{extra_data}.= $extra_data;
2669
2670      }
2671    elsif( $process)
2672      {
2673        my $twig_current= $t->{twig_current}; # defined unless we are outside of the root
2674
2675        my $elt= $t->{twig_elt_class}->new( $type);
2676        $elt->$set( @parser_args);
2677        if( $t->{extra_data})
2678          { $elt->set_extra_data( $t->{extra_data});
2679            $t->{extra_data}='';
2680          }
2681
2682        unless( $t->root)
2683          { $t->_add_cpi_outside_of_root( leading_cpi => $elt);
2684          }
2685        elsif( $t->{twig_in_pcdata})
2686          { # create the node as a sibling of the PCDATA
2687            $elt->paste_after( $twig_current);
2688            $t->{twig_in_pcdata}=0;
2689          }
2690        elsif( $twig_current)
2691          { # we have to create a PCDATA element if we need to store spaces
2692            if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2693              { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2694            $t->{twig_stored_spaces}='';
2695            # create the node as a child of the current element
2696            $elt->paste_last_child( $twig_current);
2697          }
2698        else
2699          { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); }
2700
2701        if( $twig_current)
2702          { delete $twig_current->{'twig_current'};
2703            my $parent= $elt->{parent};
2704            $t->{twig_current}= $parent;
2705            $parent->{'twig_current'}=1;
2706          }
2707
2708        $t->$elt_handler( $elt);
2709      }
2710
2711  }
2712
2713
2714# add a comment or pi before the first element
2715sub _add_cpi_outside_of_root
2716  { my($t, $type, $elt)= @_; # $type is 'leading_cpi' or 'trailing_cpi'
2717    $t->{$type} ||= $t->{twig_elt_class}->new( '#CPI');
2718    # create the node as a child of the current element
2719    $elt->paste_last_child( $t->{$type});
2720    return $t;
2721  }
2722
2723sub _twig_final
2724   { # warn " in _twig_final...\n"; # DEBUG handler
2725
2726    my $p= shift;
2727    my $t= $p->isa( 'XML::Twig') ? $p : $p->{twig};
2728
2729    # store trailing data
2730    if( $t->{extra_data}) { $t->{trailing_cpi_text} = $t->{extra_data}; $t->{extra_data}=''; }
2731    $t->{trailing_spaces}= $t->{twig_stored_spaces} || '';
2732    my $s=  $t->{twig_stored_spaces}; $s=~s{\n}{\\n}g;
2733    if( $t->{twig_stored_spaces}) { my $s=  $t->{twig_stored_spaces}; }
2734
2735    # restore the selected filehandle if needed
2736    $t->_set_fh_to_selected_fh();
2737
2738    $t->_trigger_tdh if( $t->{twig_tdh});
2739
2740    select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy
2741
2742    if( exists $t->{twig_autoflush_data})
2743      { my @args;
2744        push @args,  $t->{twig_autoflush_data}->{fh}      if( $t->{twig_autoflush_data}->{fh});
2745        push @args,  @{$t->{twig_autoflush_data}->{args}} if( $t->{twig_autoflush_data}->{args});
2746        $t->flush( @args);
2747        delete $t->{twig_autoflush_data};
2748        $t->root->delete if $t->root;
2749      }
2750
2751    # tries to clean-up (probably not very well at the moment)
2752    #undef $p->{twig};
2753    undef $t->{twig_parser};
2754    delete $t->{twig_parsing};
2755    @{$t}{ qw( twig_parser twig_parsing _twig_context_stack twig_current) }=();
2756
2757    return $t;
2758  }
2759
2760sub _insert_pcdata
2761  { my( $t, $string)= @_;
2762    # create a new PCDATA element
2763    my $parent= $t->{twig_current};    # always defined
2764    my $elt;
2765    if( exists $t->{twig_alt_elt_class})
2766      { $elt=  $t->{twig_elt_class}->new( $PCDATA);
2767        $elt->{pcdata}=  $string;
2768      }
2769    else
2770      { $elt= bless( { gi => $XML::Twig::gi2index{$PCDATA}, pcdata => $string }, 'XML::Twig::Elt'); }
2771
2772    my $prev_sibling= $parent->{last_child};
2773    if( $prev_sibling)
2774      { $prev_sibling->{next_sibling}=  $elt;
2775        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2776      }
2777    else
2778      { $parent->{first_child}=  $elt; }
2779
2780    $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2781     delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2782    $t->{twig_stored_spaces}='';
2783    return $elt;
2784  }
2785
2786sub _space_policy
2787  { my( $t, $gi)= @_;
2788    my $policy;
2789    $policy=0 if( $t->{twig_discard_spaces});
2790    $policy=1 if( $t->{twig_keep_spaces});
2791    $policy=1 if( $t->{twig_keep_spaces_in}
2792               && $t->{twig_keep_spaces_in}->{$gi});
2793    $policy=0 if( $t->{twig_discard_spaces_in}
2794               && $t->{twig_discard_spaces_in}->{$gi});
2795    return $policy;
2796  }
2797
2798
2799sub _twig_entity
2800   { # warn " in _twig_entity...\n"; # DEBUG handler
2801    my( $p, $name, $val, $sysid, $pubid, $ndata, $param)= @_;
2802    my $t=$p->{twig};
2803
2804    #{ no warnings; my $base= $p->base; warn "_twig_entity called: expand: '$t->{twig_expand_external_ents}', base: '$base', name: '$name', val: '$val', sysid: '$sysid', pubid: '$pubid', ndata: '$ndata', param: '$param'\n";}
2805
2806    my $missing_entity=0;
2807
2808    if( $sysid)
2809      { if($ndata)
2810          { if( ! -f _based_filename( $sysid, $p->base)) { $missing_entity= 1; }
2811          }
2812        else
2813          { if( $t->{twig_expand_external_ents})
2814              { $val= eval { _slurp_uri( $sysid, $p->base) };
2815                if( ! defined $val)
2816                  { if( $t->{twig_extern_ent_nofail})
2817                      { $missing_entity= 1; }
2818                    else
2819                      { _croak( "cannot load SYSTEM entity '$name' from '$sysid': $@", 3); }
2820                  }
2821              }
2822          }
2823      }
2824
2825    my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata, $param);
2826    if( $missing_entity) { $t->{twig_missing_system_entities}->{$name}= $ent; }
2827
2828    my $entity_list= $t->entity_list;
2829    if( $entity_list) { $entity_list->add( $ent); }
2830
2831    if( $parser_version > 2.27)
2832      { # this is really ugly, but with some versions of XML::Parser the value
2833        # of the entity is not properly returned by the default handler
2834        my $ent_decl= $ent->text;
2835        if( $t->{twig_keep_encoding})
2836          { if( defined $ent->{val} && ($ent_decl !~ /["']/))
2837              { my $val=  $ent->{val};
2838                $ent_decl .= $val =~ /"/ ? qq{'$val' } : qq{"$val" };
2839              }
2840            # for my solaris box (perl 5.6.1, XML::Parser 2.31, expat?)
2841            $t->{twig_doctype}->{internal}=~ s{<!ENTITY\s+$name\s+$}{substr( $ent_decl, 0, -1)}e;
2842          }
2843        $t->{twig_doctype}->{internal} .= $ent_decl
2844          unless( $t->{twig_doctype}->{internal}=~ m{<!ENTITY\s+$name\s+});
2845      }
2846
2847    return;
2848  }
2849
2850sub _twig_notation
2851   { my( $p, $name, $base, $sysid, $pubid ) = @_;
2852     my $t = $p->{twig};
2853
2854     my $notation = XML::Twig::Notation->new( $name, $base, $sysid, $pubid );
2855     my $notation_list = $t->notation_list();
2856     if( $notation_list ) { $notation_list->add( $notation ); }
2857
2858     # internal should get the recognized_string, but XML::Parser does not provide it
2859     # so we need to re-create it ( $notation->text) and stick it there.
2860     $t->{twig_doctype}->{internal} .= $notation->text;
2861
2862     return;
2863   }
2864
2865
2866sub _twig_extern_ent
2867   { # warn " in _twig_extern_ent...I (", $_[0]->original_string, ")\n"; # DEBUG handler
2868    my( $p, $base, $sysid, $pubid)= @_;
2869    my $t= $p->{twig};
2870    if( $t->{twig_no_expand})
2871      { my $ent_name= $t->{twig_keep_encoding} ? $p->original_string : $p->recognized_string;
2872        _twig_insert_ent( $t, $ent_name);
2873        return '';
2874      }
2875    my $ent_content= eval { $t->{twig_ext_ent_handler}->( $p, $base, $sysid) };
2876    if( ! defined $ent_content)
2877      {
2878        my $ent_name = $p->recognized_string;
2879        my $file     =  _based_filename( $sysid, $base);
2880        my $error_message= "cannot expand $ent_name - cannot load '$file'";
2881        if( $t->{twig_extern_ent_nofail}) { return "<!-- $error_message -->"; }
2882        else                              { _croak( $error_message);   }
2883      }
2884    return $ent_content;
2885  }
2886
2887# I use this so I can change the $Carp::CarpLevel (which determines how many call frames to skip when reporting an error)
2888sub _croak
2889  { my( $message, $level)= @_;
2890    $Carp::CarpLevel= $level || 0;
2891    croak $message;
2892  }
2893
2894sub _twig_xmldecl
2895   { # warn " in _twig_xmldecl...\n"; # DEBUG handler
2896
2897    my $p= shift;
2898    my $t=$p->{twig};
2899    $t->{twig_xmldecl}||={};                 # could have been set by set_output_encoding
2900    $t->{twig_xmldecl}->{version}= shift;
2901    $t->{twig_xmldecl}->{encoding}= shift;
2902    $t->{twig_xmldecl}->{standalone}= shift;
2903    return;
2904  }
2905
2906sub _twig_doctype
2907   { # warn " in _twig_doctype...\n"; # DEBUG handler
2908    my( $p, $name, $sysid, $pub, $internal)= @_;
2909    my $t=$p->{twig};
2910    $t->{twig_doctype}||= {};                   # create
2911    $t->{twig_doctype}->{name}= $name;          # always there
2912    $t->{twig_doctype}->{sysid}= $sysid;        #
2913    $t->{twig_doctype}->{pub}= $pub;            #
2914
2915    # now let's try to cope with XML::Parser 2.28 and above
2916    if( $parser_version > 2.27)
2917      { @saved_default_handler= $p->setHandlers( Default     => \&_twig_store_internal_dtd,
2918                                                 Entity      => \&_twig_entity,
2919                                               );
2920      $p->setHandlers( DoctypeFin  => \&_twig_stop_storing_internal_dtd);
2921      $t->{twig_doctype}->{internal}='';
2922      }
2923    else
2924      # for XML::Parser before 2.28
2925      { $internal||='';
2926        $internal=~ s{^\s*\[}{};
2927        $internal=~ s{]\s*$}{};
2928        $t->{twig_doctype}->{internal}=$internal;
2929      }
2930
2931    # now check if we want to get the DTD info
2932    if( $t->{twig_read_external_dtd} && $sysid)
2933      { # let's build a fake document with an internal DTD
2934        if( $t->{DTDBase})
2935          { _use( 'File::Spec');
2936            $sysid=File::Spec->catfile($t->{DTDBase}, $sysid);
2937          }
2938        my $dtd= _slurp_uri( $sysid);
2939        # if the DTD includes an XML declaration, it needs to be moved before the DOCTYPE bit
2940        if( $dtd=~ s{^(\s*<\?xml(\s+\w+\s*=\s*("[^"]*"|'[^']*'))*\s*\?>)}{})
2941          { $dtd= "$1<!DOCTYPE $name [$dtd]><$name/>"; }
2942        else
2943          { $dtd= "<!DOCTYPE $name [$dtd]><$name/>"; }
2944
2945        $t->save_global_state();            # save the globals (they will be reset by the following new)
2946        my $t_dtd= XML::Twig->new( load_DTD => 1, ParseParamEnt => 1, error_context => $t->{ErrorContext} || 0);          # create a temp twig
2947        $t_dtd->parse( $dtd);               # parse it
2948        $t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info
2949        #$t->{twig_dtd_is_external}=1;
2950        $t->entity_list->_add_list( $t_dtd->entity_list)     if( $t_dtd->entity_list);   # grab the entity info
2951        $t->notation_list->_add_list( $t_dtd->notation_list) if( $t_dtd->notation_list); # grab the notation info
2952        $t->restore_global_state();
2953      }
2954    return;
2955  }
2956
2957sub _twig_element
2958   { # warn " in _twig_element...\n"; # DEBUG handler
2959
2960    my( $p, $name, $model)= @_;
2961    my $t=$p->{twig};
2962    $t->{twig_dtd}||= {};                      # may create the dtd
2963    $t->{twig_dtd}->{model}||= {};             # may create the model hash
2964    $t->{twig_dtd}->{elt_list}||= [];          # ordered list of elements
2965    push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt
2966    $t->{twig_dtd}->{model}->{$name}= $model;  # store the model
2967    if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2968      { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2969        unless( $text)
2970          { # this version of XML::Parser does not return the text in the *_string method
2971            # we need to rebuild it
2972            $text= "<!ELEMENT $name $model>";
2973          }
2974        $t->{twig_doctype}->{internal} .= $text;
2975      }
2976    return;
2977  }
2978
2979sub _twig_attlist
2980   { # warn " in _twig_attlist...\n"; # DEBUG handler
2981
2982    my( $p, $gi, $att, $type, $default, $fixed)= @_;
2983    #warn "in attlist: gi: '$gi', att: '$att', type: '$type', default: '$default', fixed: '$fixed'\n";
2984    my $t=$p->{twig};
2985    $t->{twig_dtd}||= {};                      # create dtd if need be
2986    $t->{twig_dtd}->{$gi}||= {};               # create elt if need be
2987    #$t->{twig_dtd}->{$gi}->{att}||= {};        # create att if need be
2988    if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2989      { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2990        unless( $text)
2991          { # this version of XML::Parser does not return the text in the *_string method
2992            # we need to rebuild it
2993            my $att_decl="$att $type";
2994            $att_decl .= " #FIXED"   if( $fixed);
2995            $att_decl .= " $default" if( defined $default);
2996            # 2 cases: there is already an attlist on that element or not
2997            if( $t->{twig_dtd}->{att}->{$gi})
2998              { # there is already an attlist, add to it
2999                $t->{twig_doctype}->{internal}=~ s{(<!ATTLIST\s*$gi )(.*?)\n?>}
3000                                                  { "$1$2\n" . ' ' x length( $1) . "$att_decl\n>"}es;
3001              }
3002            else
3003              { # create the attlist
3004                 $t->{twig_doctype}->{internal}.= "<!ATTLIST $gi $att_decl>"
3005              }
3006          }
3007      }
3008    $t->{twig_dtd}->{att}->{$gi}->{$att}= {} ;
3009    $t->{twig_dtd}->{att}->{$gi}->{$att}->{type}= $type;
3010    $t->{twig_dtd}->{att}->{$gi}->{$att}->{default}= $default if( defined $default);
3011    $t->{twig_dtd}->{att}->{$gi}->{$att}->{fixed}= $fixed;
3012    return;
3013  }
3014
3015sub _twig_default
3016   { # warn " in _twig_default...\n"; # DEBUG handler
3017
3018    my( $p, $string)= @_;
3019
3020    my $t= $p->{twig};
3021
3022    # we need to process the data in 2 cases: entity, or spaces after the closing tag
3023
3024    # after the closing tag (no twig_current and root has been created)
3025    if(  ! $t->{twig_current} && $t->{twig_root} && $string=~ m{^\s+$}m) { $t->{twig_stored_spaces} .= $string; }
3026
3027    # process only if we have an entity
3028    if( $string=~ m{^&([^;]*);$})
3029      { # the entity has to be pure pcdata, or we have a problem
3030        if( ($p->original_string=~ m{^<}) && ($p->original_string=~ m{>$}) )
3031          { # string is a tag, entity is in an attribute
3032            $t->{twig_entities_in_attribute}=1 if( $t->{twig_do_not_escape_amp_in_atts});
3033          }
3034        else
3035          { my $ent;
3036            if( $t->{twig_keep_encoding})
3037              { _twig_char( $p, $string);
3038                $ent= substr( $string, 1, -1);
3039              }
3040            else
3041              { $ent= _twig_insert_ent( $t, $string);
3042              }
3043
3044            return $ent;
3045          }
3046      }
3047  }
3048
3049sub _twig_insert_ent
3050  {
3051    my( $t, $string)=@_;
3052
3053    my $twig_current= $t->{twig_current};
3054
3055    my $ent=  $t->{twig_elt_class}->new( $ENT);
3056    $ent->{ent}=  $string;
3057
3058    _add_or_discard_stored_spaces( $t);
3059
3060    if( $t->{twig_in_pcdata})
3061      { # create the node as a sibling of the #PCDATA
3062
3063        $ent->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ;
3064        $twig_current->{next_sibling}=  $ent;
3065        my $parent= $twig_current->{parent};
3066        $ent->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ;
3067         delete $parent->{empty}; $parent->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
3068        # the twig_current is now the parent
3069        delete $twig_current->{'twig_current'};
3070        $t->{twig_current}= $parent;
3071        # we left pcdata
3072        $t->{twig_in_pcdata}=0;
3073      }
3074    else
3075      { # create the node as a child of the current element
3076        $ent->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ;
3077        if( my $prev_sibling= $twig_current->{last_child})
3078          { $ent->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ;
3079            $prev_sibling->{next_sibling}=  $ent;
3080          }
3081        else
3082          { if( $twig_current) { $twig_current->{first_child}=  $ent; } }
3083        if( $twig_current) {  delete $twig_current->{empty}; $twig_current->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; }
3084      }
3085
3086    # meant to trigger entity handler, does not seem to be activated at this time
3087    #if( my $handler= $t->{twig_handlers}->{gi}->{$ENT})
3088    #  { local $_= $ent; $handler->( $t, $ent); }
3089
3090    return $ent;
3091  }
3092
3093sub parser
3094  { return $_[0]->{twig_parser}; }
3095
3096# returns the declaration text (or a default one)
3097sub xmldecl
3098  { my $t= shift;
3099    return '' unless( $t->{twig_xmldecl} || $t->{output_encoding});
3100    my $decl_string;
3101    my $decl= $t->{twig_xmldecl};
3102    if( $decl)
3103      { my $version= $decl->{version};
3104        $decl_string= q{<?xml};
3105        $decl_string .= qq{ version="$version"};
3106
3107        # encoding can either have been set (in $decl->{output_encoding})
3108        # or come from the document (in $decl->{encoding})
3109        if( $t->{output_encoding})
3110          { my $encoding= $t->{output_encoding};
3111            $decl_string .= qq{ encoding="$encoding"};
3112          }
3113        elsif( $decl->{encoding})
3114          { my $encoding= $decl->{encoding};
3115            $decl_string .= qq{ encoding="$encoding"};
3116          }
3117
3118        if( defined( $decl->{standalone}))
3119          { $decl_string .= q{ standalone="};
3120            $decl_string .= $decl->{standalone} ? "yes" : "no";
3121            $decl_string .= q{"};
3122          }
3123
3124        $decl_string .= "?>\n";
3125      }
3126    else
3127      { my $encoding= $t->{output_encoding};
3128        $decl_string= qq{<?xml version="1.0" encoding="$encoding"?>};
3129      }
3130
3131    my $output_filter= XML::Twig::Elt::output_filter();
3132    return $output_filter ? $output_filter->( $decl_string) : $decl_string;
3133  }
3134
3135sub set_doctype
3136  { my( $t, $name, $system, $public, $internal)= @_;
3137    $t->{twig_doctype}= {} unless defined $t->{twig_doctype};
3138    my $doctype= $t->{twig_doctype};
3139    $doctype->{name}     = $name     if( defined $name);
3140    $doctype->{sysid}    = $system   if( defined $system);
3141    $doctype->{pub}      = $public   if( defined $public);
3142    $doctype->{internal} = $internal if( defined $internal);
3143  }
3144
3145sub doctype_name
3146  { my $t= shift;
3147    my $doctype= $t->{twig_doctype} or return '';
3148    return $doctype->{name} || '';
3149  }
3150
3151sub system_id
3152  { my $t= shift;
3153    my $doctype= $t->{twig_doctype} or return '';
3154    return $doctype->{sysid} || '';
3155  }
3156
3157sub public_id
3158  { my $t= shift;
3159    my $doctype= $t->{twig_doctype} or return '';
3160    return $doctype->{pub} || '';
3161  }
3162
3163sub internal_subset
3164  { my $t= shift;
3165    my $doctype= $t->{twig_doctype} or return '';
3166    return $doctype->{internal} || '';
3167  }
3168
3169# return the dtd object
3170sub dtd
3171  { my $t= shift;
3172    return $t->{twig_dtd};
3173  }
3174
3175# return an element model, or the list of element models
3176sub model
3177  { my $t= shift;
3178    my $elt= shift;
3179    return $t->dtd->{model}->{$elt} if( $elt);
3180    return (sort keys %{$t->dtd->{model}});
3181  }
3182
3183
3184# return the entity_list object
3185sub entity_list
3186  { my $t= shift;
3187    return $t->{twig_entity_list};
3188  }
3189
3190# return the list of entity names
3191sub entity_names
3192  { my $t= shift;
3193    return $t->entity_list->entity_names;
3194  }
3195
3196# return the entity object
3197sub entity
3198  { my $t= shift;
3199    my $entity_name= shift;
3200    return $t->entity_list->ent( $entity_name);
3201  }
3202
3203# return the notation_list object
3204sub notation_list
3205  { my $t= shift;
3206    return $t->{twig_notation_list};
3207  }
3208
3209# return the list of notation names
3210sub notation_names
3211  { my $t= shift;
3212    return $t->notation_list->notation_names;
3213  }
3214
3215# return the notation object
3216sub notation
3217  { my $t= shift;
3218    my $notation_name= shift;
3219    return $t->notation_list->notation( $notation_name);
3220  }
3221
3222
3223
3224
3225sub print_prolog
3226  { my $t= shift;
3227    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : $t->{twig_output_fh} || select() || \*STDOUT;
3228    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3229    no strict 'refs';
3230    print {$fh} $t->prolog( @_);
3231  }
3232
3233sub prolog
3234  { my $t= shift;
3235    if( $t->{no_prolog}){ return ''; }
3236
3237    return   $t->{no_prolog}             ? ''
3238           : defined $t->{no_dtd_output} ? $t->xmldecl
3239           :                               $t->xmldecl . $t->doctype( @_);
3240  }
3241
3242sub doctype
3243  { my $t= shift;
3244    my %args= _normalize_args( @_);
3245    my $update_dtd = $args{UpdateDTD} || '';
3246    my $doctype_text='';
3247
3248    my $doctype= $t->{twig_doctype};
3249
3250    if( $doctype)
3251      { $doctype_text .= qq{<!DOCTYPE $doctype->{name}} if( $doctype->{name});
3252        $doctype_text .= qq{ PUBLIC "$doctype->{pub}"}  if( $doctype->{pub});
3253        $doctype_text .= qq{ SYSTEM}                    if( $doctype->{sysid} && !$doctype->{pub});
3254        $doctype_text .= qq{ "$doctype->{sysid}"}       if( $doctype->{sysid});
3255      }
3256
3257    if( $update_dtd)
3258      { if( $doctype)
3259          { my $internal=$doctype->{internal};
3260            # awful hack, but at least it works a little better that what was there before
3261            if( $internal)
3262              { # remove entity and notation declarations (they will be re-generated from the updated entity list)
3263                $internal=~ s{<! \s* ENTITY \s+ $REG_TAG_NAME \s+ ( ("[^"]*"|'[^']*') \s* | SYSTEM [^>]*) >\s*}{}xg;
3264                $internal=~ s{<! \s* NOTATION .*? >\s*}{}sxg;
3265                $internal=~ s{^\n}{};
3266              }
3267            $internal .= $t->entity_list->text   ||'' if( $t->entity_list);
3268            $internal .= $t->notation_list->text ||'' if( $t->notation_list);
3269            if( $internal) { $doctype_text .= "[\n$internal]>\n"; }
3270          }
3271        elsif( !$t->{'twig_dtd'} && ( keys %{$t->entity_list} || keys %{$t->notation_list} ) )
3272          { $doctype_text .= "<!DOCTYPE " . $t->root->gi . " [\n" . $t->entity_list->text . $t->notation_list->text . "\n]>";}
3273        else
3274          { $doctype_text= $t->{twig_dtd};
3275            $doctype_text .= $t->dtd_text;
3276          }
3277      }
3278    elsif( $doctype)
3279      { if( my $internal= $doctype->{internal})
3280          { # add opening and closing brackets if not already there
3281            # plus some spaces and newlines for a nice formating
3282            # I test it here because I can't remember which version of
3283            # XML::Parser need it or not, nor guess which one will in the
3284            # future, so this about the best I can do
3285            $internal=~ s{^\s*(\[\s*)?}{ [\n};
3286            $internal=~ s{\s*(\]\s*(>\s*)?)?\s*$}{\n]>\n};
3287
3288            # XML::Parser does not include the NOTATION declarations in the DTD
3289            # at least in the current version. So put them back
3290            #if( $t->notation_list && $internal !~ m{<!\s*NOTATION})
3291            #  { $internal=~ s{(\n]>\n)$}{ "\n" . $t->notation_list->text . $1}es; }
3292
3293            $doctype_text .=  $internal;
3294          }
3295      }
3296
3297    if( $doctype_text)
3298      {
3299        # terrible hack, as I can't figure out in which case the darn prolog
3300        # should get an extra > (depends on XML::Parser and expat versions)
3301        $doctype_text=~ s/(>\s*)*$/>\n/; # if($doctype_text);
3302
3303        my $output_filter= XML::Twig::Elt::output_filter();
3304        return $output_filter ? $output_filter->( $doctype_text) : $doctype_text;
3305      }
3306    else
3307      { return $doctype_text; }
3308  }
3309
3310sub _leading_cpi
3311  { my $t= shift;
3312    my $leading_cpi= $t->{leading_cpi} || return '';
3313    return $leading_cpi->sprint( 1);
3314  }
3315
3316sub _trailing_cpi
3317  { my $t= shift;
3318    my $trailing_cpi= $t->{trailing_cpi} || return '';
3319    return $trailing_cpi->sprint( 1);
3320  }
3321
3322sub _trailing_cpi_text
3323  { my $t= shift;
3324    return $t->{trailing_cpi_text} || '';
3325  }
3326
3327sub print_to_file
3328  { my( $t, $filename)= (shift, shift);
3329    my $out_fh;
3330#    open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!");     # < perl 5.8
3331    my $mode= $t->{twig_keep_encoding} && ! _use_perlio() ? '>' : '>:utf8';                             # >= perl 5.8
3332    open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8
3333    $t->print( $out_fh, @_);
3334    close $out_fh;
3335    return $t;
3336  }
3337
3338# probably only works on *nix (at least the chmod bit)
3339# first print to a temporary file, then rename that file to the desired file name, then change permissions
3340# to the original file permissions (or to the current umask)
3341sub safe_print_to_file
3342  { my( $t, $filename)= (shift, shift);
3343    my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ;
3344    XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n";
3345    my $tmpdir= dirname( $filename);
3346    my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir);
3347    $t->print_to_file( $tmpfilename, @_);
3348    rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!");
3349    chmod $perm, $filename;
3350    return $t;
3351  }
3352
3353
3354sub print
3355  { my $t= shift;
3356    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : undef;
3357    my %args= _normalize_args( @_);
3358
3359    my $old_select    = defined $fh                  ? select $fh                                 : undef;
3360    my $old_pretty    = defined ($args{PrettyPrint}) ? $t->set_pretty_print( $args{PrettyPrint})  : undef;
3361    my $old_empty_tag = defined ($args{EmptyTags})   ? $t->set_empty_tag_style( $args{EmptyTags}) : undef;
3362
3363    #if( !$t->{encoding} || lc( $t->{encoding}) eq 'utf-8') { my $out= $fh || \*STDOUT; binmode $out, ':utf8'; }
3364
3365    if( $perl_version > 5.006 && ! $t->{twig_keep_encoding} && _use_perlio() ) { binmode( $fh || \*STDOUT, ":utf8" ); }
3366
3367     print  $t->prolog( %args) . $t->_leading_cpi( %args);
3368     $t->{twig_root}->print;
3369     print $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3370         . $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3371         . ( ($t->{twig_keep_spaces}||'') && ($t->{trailing_spaces} || ''))
3372         ;
3373
3374
3375    $t->set_pretty_print( $old_pretty)       if( defined $old_pretty);
3376    $t->set_empty_tag_style( $old_empty_tag) if( defined $old_empty_tag);
3377    if( $fh) { select $old_select; }
3378
3379    return $t;
3380  }
3381
3382
3383sub flush
3384  { my $t= shift;
3385
3386    $t->_trigger_tdh if $t->{twig_tdh};
3387
3388    return if( $t->{twig_completely_flushed});
3389
3390    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
3391    my $old_select= defined $fh ? select $fh : undef;
3392    my $up_to= ref $_[0] ? shift : undef;
3393    my %args= _normalize_args( @_);
3394
3395    my $old_pretty;
3396    if( defined $args{PrettyPrint})
3397      { $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
3398        delete $args{PrettyPrint};
3399      }
3400
3401     my $old_empty_tag_style;
3402     if( $args{EmptyTags})
3403      { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
3404        delete $args{EmptyTags};
3405      }
3406
3407
3408    # the "real" last element processed, as _twig_end has closed it
3409    my $last_elt;
3410    my $flush_trailing_data=0;
3411    if( $up_to)
3412      { $last_elt= $up_to; }
3413    elsif( $t->{twig_current})
3414      { $last_elt= $t->{twig_current}->{last_child}; }
3415    else
3416      { $last_elt= $t->{twig_root};
3417        $flush_trailing_data=1;
3418        $t->{twig_completely_flushed}=1;
3419      }
3420
3421    # flush the DTD unless it has ready flushed (ie root has been flushed)
3422    my $elt= $t->{twig_root};
3423    unless( $elt->{'flushed'})
3424      { # store flush info so we can auto-flush later
3425        if( $t->{twig_autoflush})
3426          { $t->{twig_autoflush_data}={};
3427            $t->{twig_autoflush_data}->{fh}   = $fh  if( $fh);
3428            $t->{twig_autoflush_data}->{args} = \@_  if( @_);
3429          }
3430        $t->print_prolog( %args);
3431        print $t->_leading_cpi;
3432      }
3433
3434    while( $elt)
3435      { my $next_elt;
3436        if( $last_elt && $last_elt->in( $elt))
3437          {
3438            unless( $elt->{'flushed'})
3439              { # just output the front tag
3440                print $elt->start_tag();
3441                $elt->{'flushed'}=1;
3442              }
3443            $next_elt= $elt->{first_child};
3444          }
3445        else
3446          { # an element before the last one or the last one,
3447            $next_elt= $elt->{next_sibling};
3448            $elt->_flush();
3449            $elt->delete;
3450            last if( $last_elt && ($elt == $last_elt));
3451          }
3452        $elt= $next_elt;
3453      }
3454
3455    if( $flush_trailing_data)
3456      { print $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3457            , $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3458      }
3459
3460    select $old_select if( defined $old_select);
3461    $t->set_pretty_print( $old_pretty) if( defined $old_pretty);
3462    $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
3463
3464    if( my $ids= $t->{twig_id_list})
3465      { while( my ($id, $elt)= each %$ids)
3466          { if( ! defined $elt)
3467             { delete $t->{twig_id_list}->{$id} }
3468          }
3469      }
3470
3471    return $t;
3472  }
3473
3474
3475# flushes up to an element
3476# this method just reorders the arguments and calls flush
3477sub flush_up_to
3478  { my $t= shift;
3479    my $up_to= shift;
3480    if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar'))
3481      { my $fh=  shift;
3482        $t->flush( $fh, $up_to, @_);
3483      }
3484    else
3485      { $t->flush( $up_to, @_); }
3486
3487    return $t;
3488  }
3489
3490
3491# same as print except the entire document text is returned as a string
3492sub sprint
3493  { my $t= shift;
3494    my %args= _normalize_args( @_);
3495
3496    my $old_pretty;
3497    if( defined $args{PrettyPrint})
3498      { $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
3499        delete $args{PrettyPrint};
3500      }
3501
3502     my $old_empty_tag_style;
3503     if( defined $args{EmptyTags})
3504      { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
3505        delete $args{EmptyTags};
3506      }
3507
3508    my $string=   $t->prolog( %args)       # xml declaration and doctype
3509                . $t->_leading_cpi( %args) # leading comments and pi's in 'process' mode
3510                . ( ($t->{twig_root} && $t->{twig_root}->sprint) || '')
3511                . $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3512                . $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3513                ;
3514    if( $t->{twig_keep_spaces} && $t->{trailing_spaces}) { $string .= $t->{trailing_spaces}; }
3515
3516    $t->set_pretty_print( $old_pretty) if( defined $old_pretty);
3517    $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
3518
3519    return $string;
3520  }
3521
3522
3523# this method discards useless elements in a tree
3524# it does the same thing as a flush except it does not print it
3525# the second argument is an element, the last purged element
3526# (this argument is usually set through the purge_up_to method)
3527sub purge
3528  { my $t= shift;
3529    my $up_to= shift;
3530
3531    $t->_trigger_tdh if $t->{twig_tdh};
3532
3533    # the "real" last element processed, as _twig_end has closed it
3534    my $last_elt;
3535    if( $up_to)
3536      { $last_elt= $up_to; }
3537    elsif( $t->{twig_current})
3538      { $last_elt= $t->{twig_current}->{last_child}; }
3539    else
3540      { $last_elt= $t->{twig_root}; }
3541
3542    my $elt= $t->{twig_root};
3543
3544    while( $elt)
3545      { my $next_elt;
3546        if( $last_elt && $last_elt->in( $elt))
3547          { $elt->{'flushed'}=1;
3548            $next_elt= $elt->{first_child};
3549          }
3550        else
3551          { # an element before the last one or the last one,
3552            $next_elt= $elt->{next_sibling};
3553            $elt->delete;
3554            last if( $last_elt && ($elt == $last_elt) );
3555          }
3556        $elt= $next_elt;
3557      }
3558
3559    if( my $ids= $t->{twig_id_list})
3560      { while( my ($id, $elt)= each %$ids) { if( ! defined $elt) { delete $t->{twig_id_list}->{$id} } } }
3561
3562    return $t;
3563  }
3564
3565# flushes up to an element. This method just calls purge
3566sub purge_up_to
3567  { my $t= shift;
3568    return $t->purge( @_);
3569  }
3570
3571sub root
3572  { return $_[0]->{twig_root}; }
3573
3574sub normalize
3575  { return $_[0]->root->normalize; }
3576
3577
3578# create accessor methods on attribute names
3579{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3580sub att_accessors
3581  {
3582    my $twig_or_class= shift;
3583    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3584                                      : 'XML::Twig::Elt'
3585                                      ;
3586    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3587    no strict 'refs';
3588    foreach my $att (@_)
3589      { _croak( "attempt to redefine existing method $att using att_accessors")
3590          if( $elt_class->can( $att) && !$accessor{$att});
3591
3592        if( !$accessor{$att})
3593          { *{"$elt_class\::$att"}=
3594                sub
3595                    :lvalue                                  # > perl 5.5
3596                  { my $elt= shift;
3597                    if( @_) { $elt->{att}->{$att}= $_[0]; }
3598                    $elt->{att}->{$att};
3599                  };
3600            $accessor{$att}=1;
3601          }
3602      }
3603    return $twig_or_class;
3604  }
3605}
3606
3607{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3608sub elt_accessors
3609  {
3610    my $twig_or_class= shift;
3611    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3612                                      : 'XML::Twig::Elt'
3613                                      ;
3614
3615    # if arg is a hash ref, it's exp => name, otherwise it's a list of tags
3616    my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]}
3617                                                         : map { $_ => $_ } @_;
3618    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3619    no strict 'refs';
3620    while( my( $alias, $exp)= each %exp_to_alias )
3621      { if( $elt_class->can( $alias) && !$accessor{$alias})
3622          { _croak( "attempt to redefine existing method $alias using elt_accessors"); }
3623
3624        if( !$accessor{$alias})
3625          { *{"$elt_class\::$alias"}=
3626                sub
3627                  { my $elt= shift;
3628                    return wantarray ? $elt->children( $exp) : $elt->first_child( $exp);
3629                  };
3630            $accessor{$alias}=1;
3631          }
3632      }
3633    return $twig_or_class;
3634  }
3635}
3636
3637{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3638sub field_accessors
3639  {
3640    my $twig_or_class= shift;
3641    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3642                                      : 'XML::Twig::Elt'
3643                                      ;
3644    my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]}
3645                                                         : map { $_ => $_ } @_;
3646
3647    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3648    no strict 'refs';
3649    while( my( $alias, $exp)= each %exp_to_alias )
3650      { if( $elt_class->can( $alias) && !$accessor{$alias})
3651          { _croak( "attempt to redefine existing method $exp using field_accessors"); }
3652        if( !$accessor{$alias})
3653          { *{"$elt_class\::$alias"}=
3654                sub
3655                  { my $elt= shift;
3656                    $elt->field( $exp)
3657                  };
3658            $accessor{$alias}=1;
3659          }
3660      }
3661    return $twig_or_class;
3662  }
3663}
3664
3665sub first_elt
3666  { my( $t, $cond)= @_;
3667    my $root= $t->root || return undef;
3668    return $root if( $root->passes( $cond));
3669    return $root->next_elt( $cond);
3670  }
3671
3672sub last_elt
3673  { my( $t, $cond)= @_;
3674    my $root= $t->root || return undef;
3675    return $root->last_descendant( $cond);
3676  }
3677
3678sub next_n_elt
3679  { my( $t, $offset, $cond)= @_;
3680    $offset -- if( $t->root->matches( $cond) );
3681    return $t->root->next_n_elt( $offset, $cond);
3682  }
3683
3684sub get_xpath
3685  { my $twig= shift;
3686    if( isa( $_[0], 'ARRAY'))
3687      { my $elt_array= shift;
3688        return _unique_elts( map { $_->get_xpath( @_) } @$elt_array);
3689      }
3690    else
3691      { return $twig->root->get_xpath( @_); }
3692  }
3693
3694# get a list of elts and return a sorted list of unique elts
3695sub _unique_elts
3696  { my @sorted= sort { $a ->cmp( $b) } @_;
3697    my @unique;
3698    while( my $current= shift @sorted)
3699      { push @unique, $current unless( @unique && ($unique[-1] == $current)); }
3700    return @unique;
3701  }
3702
3703sub findvalue
3704  { my $twig= shift;
3705    if( isa( $_[0], 'ARRAY'))
3706      { my $elt_array= shift;
3707        return join( '', map { $_->findvalue( @_) } @$elt_array);
3708      }
3709    else
3710      { return $twig->root->findvalue( @_); }
3711  }
3712
3713sub findvalues
3714  { my $twig= shift;
3715    if( isa( $_[0], 'ARRAY'))
3716      { my $elt_array= shift;
3717        return map { $_->findvalues( @_) } @$elt_array;
3718      }
3719    else
3720      { return $twig->root->findvalues( @_); }
3721  }
3722
3723sub set_id_seed
3724  { my $t= shift;
3725    XML::Twig::Elt->set_id_seed( @_);
3726    return $t;
3727  }
3728
3729# return an array ref to an index, or undef
3730sub index
3731  { my( $twig, $name, $index)= @_;
3732    return defined( $index) ? $twig->{_twig_index}->{$name}->[$index] : $twig->{_twig_index}->{$name};
3733  }
3734
3735# return a list with just the root
3736# if a condition is given then return an empty list unless the root matches
3737sub children
3738  { my( $t, $cond)= @_;
3739    my $root= $t->root;
3740    unless( $cond && !($root->passes( $cond)) )
3741      { return ($root); }
3742    else
3743      { return (); }
3744  }
3745
3746sub _children { return ($_[0]->root); }
3747
3748# weird, but here for completude
3749# used to solve (non-sensical) /doc[1] XPath queries
3750sub child
3751  { my $t= shift;
3752    my $nb= shift;
3753    return ($t->children( @_))[$nb];
3754  }
3755
3756sub descendants
3757  { my( $t, $cond)= @_;
3758    my $root= $t->root;
3759    if( $root->passes( $cond) )
3760      { return ($root, $root->descendants( $cond)); }
3761    else
3762      { return ( $root->descendants( $cond)); }
3763  }
3764
3765sub simplify  { my $t= shift; $t->root->simplify( @_);  }
3766sub subs_text { my $t= shift; $t->root->subs_text( @_); }
3767sub trim      { my $t= shift; $t->root->trim( @_);      }
3768
3769
3770sub set_keep_encoding
3771  { my( $t, $keep)= @_;
3772    $t->{twig_keep_encoding}= $keep;
3773    $t->{NoExpand}= $keep;
3774    return XML::Twig::Elt::set_keep_encoding( $keep);
3775   }
3776
3777sub set_expand_external_entities
3778  { return XML::Twig::Elt::set_expand_external_entities( @_); }
3779
3780sub escape_gt
3781  { my $t= shift; $t->{twig_escape_gt}= 1; return XML::Twig::Elt::escape_gt( @_); }
3782
3783sub do_not_escape_gt
3784  { my $t= shift; $t->{twig_escape_gt}= 0; return XML::Twig::Elt::do_not_escape_gt( @_); }
3785
3786sub elt_id
3787  { return $_[0]->{twig_id_list}->{$_[1]}; }
3788
3789# change it in ALL twigs at the moment
3790sub change_gi
3791  { my( $twig, $old_gi, $new_gi)= @_;
3792    my $index;
3793    return unless($index= $XML::Twig::gi2index{$old_gi});
3794    $XML::Twig::index2gi[$index]= $new_gi;
3795    delete $XML::Twig::gi2index{$old_gi};
3796    $XML::Twig::gi2index{$new_gi}= $index;
3797    return $twig;
3798  }
3799
3800
3801# builds the DTD from the stored (possibly updated) data
3802sub dtd_text
3803  { my $t= shift;
3804    my $dtd= $t->{twig_dtd};
3805    my $doctype= $t->{twig_doctype} or return '';
3806    my $string= "<!DOCTYPE ".$doctype->{name};
3807
3808    $string .= " [\n";
3809
3810    foreach my $gi (@{$dtd->{elt_list}})
3811      { $string.= "<!ELEMENT $gi ".$dtd->{model}->{$gi}.">\n" ;
3812        if( $dtd->{att}->{$gi})
3813          { my $attlist= $dtd->{att}->{$gi};
3814            $string.= "<!ATTLIST $gi\n";
3815            foreach my $att ( sort keys %{$attlist})
3816              {
3817                if( $attlist->{$att}->{fixed})
3818                  { $string.= "   $att $attlist->{$att}->{type} #FIXED $attlist->{$att}->{default}"; }
3819                else
3820                  { $string.= "   $att $attlist->{$att}->{type} $attlist->{$att}->{default}"; }
3821                $string.= "\n";
3822              }
3823            $string.= ">\n";
3824          }
3825      }
3826    $string.= $t->entity_list->text if( $t->entity_list);
3827    $string.= "\n]>\n";
3828    return $string;
3829  }
3830
3831# prints the DTD from the stored (possibly updated) data
3832sub dtd_print
3833  { my $t= shift;
3834    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : undef;
3835    if( $fh) { print $fh $t->dtd_text; }
3836    else     { print $t->dtd_text;     }
3837    return $t;
3838  }
3839
3840# build the subs that call directly expat
3841BEGIN
3842  { my @expat_methods= qw( depth in_element within_element context
3843                           current_line current_column current_byte
3844                           recognized_string original_string
3845                           xpcroak xpcarp
3846                           base current_element element_index
3847                           xml_escape
3848                           position_in_context);
3849    foreach my $method (@expat_methods)
3850      {
3851        ## no critic (TestingAndDebugging::ProhibitNoStrict);
3852        no strict 'refs';
3853        *{$method}= sub { my $t= shift;
3854                          _croak( "calling $method after parsing is finished") unless( $t->{twig_parsing});
3855                          return $t->{twig_parser}->$method(@_);
3856                        };
3857      }
3858  }
3859
3860sub path
3861  { my( $t, $gi)= @_;
3862    if( $t->{twig_map_xmlns})
3863      { return "/" . join( "/", map { $t->_replace_prefix( $_)} ($t->{twig_parser}->context, $gi)); }
3864    else
3865      { return "/" . join( "/", ($t->{twig_parser}->context, $gi)); }
3866  }
3867
3868sub finish
3869  { my $t= shift;
3870    return $t->{twig_parser}->finish;
3871  }
3872
3873# just finish the parse by printing the rest of the document
3874sub finish_print
3875  { my( $t, $fh)= @_;
3876    my $old_fh;
3877    unless( defined $fh)
3878      { $t->_set_fh_to_twig_output_fh(); }
3879    elsif( defined $fh)
3880      { $old_fh= select $fh;
3881        $t->{twig_original_selected_fh}= $old_fh if( $old_fh);
3882      }
3883
3884    my $p=$t->{twig_parser};
3885    if( $t->{twig_keep_encoding})
3886      { $p->setHandlers( %twig_handlers_finish_print); }
3887    else
3888      { $p->setHandlers( %twig_handlers_finish_print_original); }
3889    return $t;
3890  }
3891
3892sub set_remove_cdata { return XML::Twig::Elt::set_remove_cdata( @_); }
3893
3894sub output_filter          { return XML::Twig::Elt::output_filter( @_);          }
3895sub set_output_filter      { return XML::Twig::Elt::set_output_filter( @_);      }
3896
3897sub output_text_filter     { return XML::Twig::Elt::output_text_filter( @_);     }
3898sub set_output_text_filter { return XML::Twig::Elt::set_output_text_filter( @_); }
3899
3900sub set_input_filter
3901  { my( $t, $input_filter)= @_;
3902    my $old_filter= $t->{twig_input_filter};
3903      if( !$input_filter || isa( $input_filter, 'CODE') )
3904        { $t->{twig_input_filter}= $input_filter; }
3905      elsif( $input_filter eq 'latin1')
3906        {  $t->{twig_input_filter}= latin1(); }
3907      elsif( $filter{$input_filter})
3908        {  $t->{twig_input_filter}= $filter{$input_filter}; }
3909      else
3910        { _croak( "invalid input filter: $input_filter"); }
3911
3912      return $old_filter;
3913    }
3914
3915sub set_empty_tag_style
3916  { return XML::Twig::Elt::set_empty_tag_style( @_); }
3917
3918sub set_pretty_print
3919  { return XML::Twig::Elt::set_pretty_print( @_); }
3920
3921sub set_quote
3922  { return XML::Twig::Elt::set_quote( @_); }
3923
3924sub set_indent
3925  { return XML::Twig::Elt::set_indent( @_); }
3926
3927sub set_keep_atts_order
3928  { shift; return XML::Twig::Elt::set_keep_atts_order( @_); }
3929
3930sub keep_atts_order
3931  { return XML::Twig::Elt::keep_atts_order( @_); }
3932
3933sub set_do_not_escape_amp_in_atts
3934  { return XML::Twig::Elt::set_do_not_escape_amp_in_atts( @_); }
3935
3936# save and restore package globals (the ones in XML::Twig::Elt)
3937# should probably return the XML::Twig object itself, but instead
3938# returns the state (as a hashref) for backward compatibility
3939sub save_global_state
3940  { my $t= shift;
3941    return $t->{twig_saved_state}= XML::Twig::Elt::global_state();
3942  }
3943
3944sub restore_global_state
3945  { my $t= shift;
3946    XML::Twig::Elt::set_global_state( $t->{twig_saved_state});
3947  }
3948
3949sub global_state
3950  { return XML::Twig::Elt::global_state(); }
3951
3952sub set_global_state
3953  {  return XML::Twig::Elt::set_global_state( $_[1]); }
3954
3955sub dispose
3956  { my $t= shift;
3957    $t->DESTROY;
3958    return;
3959  }
3960
3961sub DESTROY
3962  { my $t= shift;
3963    if( $t->{twig_root} && isa(  $t->{twig_root}, 'XML::Twig::Elt'))
3964      { $t->{twig_root}->delete }
3965
3966    # added to break circular references
3967    undef $t->{twig};
3968    undef $t->{twig_root}->{twig} if( $t->{twig_root});
3969    undef $t->{twig_parser};
3970
3971    undef %$t;# prevents memory leaks (especially when using mod_perl)
3972    undef $t;
3973  }
3974
3975# return true if perl was compiled using perlio
3976# if perl is not available return true, these days perlio should be used
3977sub _use_perlio
3978  { my $perl= _this_perl();
3979    return $perl ? grep /useperlio=define/, `$perl -V` : 1;
3980  }
3981
3982# returns the parth to the perl executable (if available)
3983sub _this_perl
3984  { # straight from perlvar
3985    my $secure_perl_path= $Config{perlpath};
3986    if ($^O ne 'VMS')
3987      { $secure_perl_path .= $Config{_exe} unless $secure_perl_path =~ m/$Config{_exe}$/i; }
3988    if( ! -f $secure_perl_path) { $secure_perl_path= ''; } # when perl is not available (PDK)
3989    return $secure_perl_path;
3990  }
3991
3992#
3993#  non standard handlers
3994#
3995
3996# kludge: expat 1.95.2 calls both Default AND Doctype handlers
3997# so if the default handler finds '<!DOCTYPE' then it must
3998# unset itself (_twig_print_doctype will reset it)
3999sub _twig_print_check_doctype
4000   { # warn " in _twig_print_check_doctype...\n"; # DEBUG handler
4001
4002    my $p= shift;
4003    my $string= $p->recognized_string();
4004    if( $string eq '<!DOCTYPE')
4005      {
4006        $p->setHandlers( Default => undef);
4007        $p->setHandlers( Entity => undef);
4008        $expat_1_95_2=1;
4009      }
4010    else
4011      { print $string; }
4012
4013    return;
4014  }
4015
4016
4017sub _twig_print
4018   { # warn " in _twig_print...\n"; # DEBUG handler
4019    my $p= shift;
4020    if( $expat_1_95_2 && ($p->recognized_string eq '[') && !$p->{twig}->{expat_1_95_2_seen_bracket})
4021      { # otherwise the opening square bracket of the doctype gets printed twice
4022        $p->{twig}->{expat_1_95_2_seen_bracket}=1;
4023      }
4024    else
4025      { if( $p->{twig}->{twig_right_after_root})
4026          { my $s= $p->recognized_string(); print $s if $s=~ m{\S}; }
4027        else
4028          { print $p->recognized_string(); }
4029      }
4030    return;
4031  }
4032# recognized_string does not seem to work for entities, go figure!
4033# so this handler is used to print them anyway
4034sub _twig_print_entity
4035   { # warn " in _twig_print_entity...\n"; # DEBUG handler
4036    my $p= shift;
4037    XML::Twig::Entity->new( @_)->print;
4038  }
4039
4040# kludge: expat 1.95.2 calls both Default AND Doctype handlers
4041# so if the default handler finds '<!DOCTYPE' then it must
4042# unset itself (_twig_print_doctype will reset it)
4043sub _twig_print_original_check_doctype
4044   { # warn " in _twig_print_original_check_doctype...\n"; # DEBUG handler
4045
4046    my $p= shift;
4047    my $string= $p->original_string();
4048    if( $string eq '<!DOCTYPE')
4049      { $p->setHandlers( Default => undef);
4050        $p->setHandlers( Entity => undef);
4051        $expat_1_95_2=1;
4052      }
4053    else
4054      { print $string; }
4055
4056    return;
4057  }
4058
4059sub _twig_print_original
4060   { # warn " in _twig_print_original...\n"; # DEBUG handler
4061    my $p= shift;
4062    print $p->original_string();
4063    return;
4064  }
4065
4066
4067sub _twig_print_original_doctype
4068   { # warn " in _twig_print_original_doctype...\n"; # DEBUG handler
4069
4070    my(  $p, $name, $sysid, $pubid, $internal)= @_;
4071    if( $name)
4072      { # with recent versions of XML::Parser original_string does not work,
4073        # hence we need to rebuild the doctype declaration
4074        my $doctype='';
4075        $doctype .= qq{<!DOCTYPE $name}    if( $name);
4076        $doctype .=  qq{ PUBLIC  "$pubid"}  if( $pubid);
4077        $doctype .=  qq{ SYSTEM}            if( $sysid && !$pubid);
4078        $doctype .=  qq{ "$sysid"}          if( $sysid);
4079        $doctype .=  ' [' if( $internal && !$expat_1_95_2) ;
4080        $doctype .=  qq{>} unless( $internal || $expat_1_95_2);
4081        $p->{twig}->{twig_doctype}->{has_internal}=$internal;
4082        print $doctype;
4083      }
4084    $p->setHandlers( Default => \&_twig_print_original);
4085    return;
4086  }
4087
4088sub _twig_print_doctype
4089   { # warn " in _twig_print_doctype...\n"; # DEBUG handler
4090    my(  $p, $name, $sysid, $pubid, $internal)= @_;
4091    if( $name)
4092      { # with recent versions of XML::Parser original_string does not work,
4093        # hence we need to rebuild the doctype declaration
4094        my $doctype='';
4095        $doctype .= qq{<!DOCTYPE $name}    if( $name);
4096        $doctype .=  qq{ PUBLIC  "$pubid"}  if( $pubid);
4097        $doctype .=  qq{ SYSTEM}            if( $sysid && !$pubid);
4098        $doctype .=  qq{ "$sysid"}          if( $sysid);
4099        $doctype .=  ' [' if( $internal) ;
4100        $doctype .=  qq{>} unless( $internal || $expat_1_95_2);
4101        $p->{twig}->{twig_doctype}->{has_internal}=$internal;
4102        print $doctype;
4103      }
4104    $p->setHandlers( Default => \&_twig_print);
4105    return;
4106  }
4107
4108
4109sub _twig_print_original_default
4110   { # warn " in _twig_print_original_default...\n"; # DEBUG handler
4111    my $p= shift;
4112    print $p->original_string();
4113    return;
4114  }
4115
4116# account for the case where the element is empty
4117sub _twig_print_end_original
4118   { # warn " in _twig_print_end_original...\n"; # DEBUG handler
4119    my $p= shift;
4120    print $p->original_string();
4121    return;
4122  }
4123
4124sub _twig_start_check_roots
4125   { # warn " in _twig_start_check_roots...\n"; # DEBUG handler
4126    my $p= shift;
4127    my $gi= shift;
4128
4129    my $t= $p->{twig};
4130
4131    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4132
4133    my $ns_decl;
4134    unless( $p->depth == 0)
4135      { if( $t->{twig_map_xmlns}) { $ns_decl= _replace_ns( $t, \$gi, \@_); }
4136      }
4137
4138    my $context= { $ST_TAG => $gi, @_};
4139    $context->{$ST_NS}= $ns_decl if $ns_decl;
4140    push @{$t->{_twig_context_stack}}, $context;
4141    my %att= @_;
4142
4143    if( _handler( $t, $t->{twig_roots}, $gi))
4144      { $p->setHandlers( %twig_handlers); # restore regular handlers
4145        $t->{twig_root_depth}= $p->depth;
4146        pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start
4147        _twig_start( $p, $gi, @_);
4148        return;
4149      }
4150
4151    # $tag will always be true if it needs to be printed (the tag string is never empty)
4152    my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4153                                                                 : $p->recognized_string
4154                                      : '';
4155
4156    if( $p->depth == 0)
4157      {
4158        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4159        no strict 'refs';
4160        print {$fh} $tag if( $tag);
4161        pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start
4162        _twig_start( $p, $gi, @_);
4163        $t->root->{'flushed'}=1; # or the root start tag gets output the first time we flush
4164      }
4165    elsif( $t->{twig_starttag_handlers})
4166      { # look for start tag handlers
4167
4168        my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi);
4169        my $last_handler_res;
4170        foreach my $handler ( @handlers)
4171          { $last_handler_res= $handler->($t, $gi, %att);
4172            last unless $last_handler_res;
4173          }
4174        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4175        no strict 'refs';
4176        print {$fh} $tag if( $tag && (!@handlers || $last_handler_res));
4177      }
4178    else
4179      {
4180        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4181        no strict 'refs';
4182        print {$fh} $tag if( $tag);
4183      }
4184    return;
4185  }
4186
4187sub _twig_end_check_roots
4188   { # warn " in _twig_end_check_roots...\n"; # DEBUG handler
4189
4190    my( $p, $gi, %att)= @_;
4191    my $t= $p->{twig};
4192    # $tag can be empty (<elt/>), hence the undef and the tests for defined
4193    my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4194                                                                 : $p->recognized_string
4195                                      : undef;
4196    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4197
4198    if( $t->{twig_endtag_handlers})
4199      { # look for end tag handlers
4200        my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi);
4201        my $last_handler_res=1;
4202        foreach my $handler ( @handlers)
4203          { $last_handler_res= $handler->($t, $gi) || last; }
4204        #if( ! $last_handler_res)
4205        #  { pop @{$t->{_twig_context_stack}}; warn "tested";
4206        #    return;
4207        #  }
4208      }
4209    {
4210      ## no critic (TestingAndDebugging::ProhibitNoStrict);
4211      no strict 'refs';
4212      print {$fh} $tag if( defined $tag);
4213    }
4214    if( $p->depth == 0)
4215      {
4216        _twig_end( $p, $gi);
4217        $t->root->{end_tag_flushed}=1;
4218      }
4219
4220    pop @{$t->{_twig_context_stack}};
4221    return;
4222  }
4223
4224sub _twig_pi_check_roots
4225   { # warn " in _twig_pi_check_roots...\n"; # DEBUG handler
4226    my( $p, $target, $data)= @_;
4227    my $t= $p->{twig};
4228    my $pi= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4229                                                                : $p->recognized_string
4230                                    : undef;
4231    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4232
4233    if( my $handler=    $t->{twig_handlers}->{pi_handlers}->{$target}
4234                     || $t->{twig_handlers}->{pi_handlers}->{''}
4235      )
4236      { # if handler is called on pi, then it needs to be processed as a regular node
4237        my @flags= qw( twig_process_pi twig_keep_pi);
4238        my @save= @{$t}{@flags}; # save pi related flags
4239        @{$t}{@flags}= (1, 0);   # override them, pi needs to be processed
4240        _twig_pi( @_);           # call handler on the pi
4241        @{$t}{@flags}= @save;;   # restore flag
4242      }
4243    else
4244      {
4245        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4246        no strict 'refs';
4247        print  {$fh} $pi if( defined( $pi));
4248      }
4249    return;
4250  }
4251
4252
4253sub _output_ignored
4254  { my( $t, $p)= @_;
4255    my $action= $t->{twig_ignore_action};
4256
4257    my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string';
4258
4259    if( $action eq 'print' ) { print $p->$get_string; }
4260    else
4261      { my $string_ref;
4262        if( $action eq 'string')
4263          { $string_ref= \$t->{twig_buffered_string}; }
4264        elsif( ref( $action) && ref( $action) eq 'SCALAR')
4265          { $string_ref= $action; }
4266        else
4267          { _croak( "wrong ignore action: $action"); }
4268
4269        $$string_ref .= $p->$get_string;
4270      }
4271  }
4272
4273
4274
4275sub _twig_ignore_start
4276   { # warn " in _twig_ignore_start...\n"; # DEBUG handler
4277
4278    my( $p, $gi)= @_;
4279    my $t= $p->{twig};
4280    $t->{twig_ignore_level}++;
4281    my $action= $t->{twig_ignore_action};
4282
4283    $t->_output_ignored( $p) unless $action eq 'discard';
4284    return;
4285  }
4286
4287sub _twig_ignore_end
4288   { # warn " in _twig_ignore_end...\n"; # DEBUG handler
4289
4290    my( $p, $gi)= @_;
4291    my $t= $p->{twig};
4292
4293    my $action= $t->{twig_ignore_action};
4294    $t->_output_ignored( $p) unless $action eq 'discard';
4295
4296    $t->{twig_ignore_level}--;
4297
4298    if( ! $t->{twig_ignore_level})
4299      {
4300        $t->{twig_current}   = $t->{twig_ignore_elt};
4301        $t->{twig_current}->{'twig_current'}=1;
4302
4303        $t->{twig_ignore_elt}->cut;  # there could possibly be a memory leak here (delete would avoid it,
4304                                     # but could also delete elements that should not be deleted)
4305
4306        # restore the saved stack to the current level
4307        splice( @{$t->{_twig_context_stack}}, $p->depth+ 1 );
4308        #warn "stack: ", _dump_stack( $t->{_twig_context_stack}), "\n";
4309
4310        $p->setHandlers( @{$t->{twig_saved_handlers}});
4311        # test for handlers
4312        if( $t->{twig_endtag_handlers})
4313          { # look for end tag handlers
4314            my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi);
4315            my $last_handler_res=1;
4316            foreach my $handler ( @handlers)
4317              { $last_handler_res= $handler->($t, $gi) || last; }
4318          }
4319        pop @{$t->{_twig_context_stack}};
4320      };
4321    return;
4322  }
4323
4324#sub _dump_stack { my( $stack)= @_; return join( ":", map { $_->{$ST_TAG} } @$stack); }
4325
4326sub ignore
4327  { my( $t, $elt, $action)= @_;
4328    my $current= $t->{twig_current};
4329
4330    if( ! ($elt && ref( $elt) && isa( $elt, 'XML::Twig::Elt'))) { $elt= $current; }
4331
4332    #warn "ignore:  current = ", $current->tag, ", elt = ", $elt->tag, ")\n";
4333
4334    # we need the ($elt == $current->{last_child}) test because the current element is set to the
4335    # parent _before_ handlers are called (and I can't figure out how to fix this)
4336    unless( ($elt == $current) || ($current->{last_child} && ($elt == $current->{last_child})) || $current->in( $elt))
4337      { _croak( "element to be ignored must be ancestor of current element"); }
4338
4339    $t->{twig_ignore_level}= $current == $elt ? 1 : $t->_level_in_stack( $current) - $t->_level_in_stack($elt) + 1;
4340    #warn "twig_ignore_level:  $t->{twig_ignore_level} (current: ", $current->tag, ", elt: ", $elt->tag, ")\n";
4341    $t->{twig_ignore_elt}  = $elt;     # save it, so we can delete it later
4342
4343    $action ||= 'discard';
4344    if( !($action eq 'print' || $action eq 'string' || ( ref( $action) && ref( $action) eq 'SCALAR')))
4345      { $action= 'discard'; }
4346
4347    $t->{twig_ignore_action}= $action;
4348
4349    my $p= $t->{twig_parser};
4350    my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers
4351
4352    my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string';
4353
4354    my $default_handler;
4355
4356    if( $action ne 'discard')
4357      { if( $action eq 'print')
4358          { $p->setHandlers( Default => sub { print $_[0]->$get_string; }); }
4359        else
4360          { my $string_ref;
4361            if( $action eq 'string')
4362              { if( ! exists $t->{twig_buffered_string}) { $t->{twig_buffered_string}=''; }
4363                $string_ref= \$t->{twig_buffered_string};
4364              }
4365            elsif( ref( $action) && ref( $action) eq 'SCALAR')
4366              { $string_ref= $action; }
4367
4368            $p->setHandlers( Default =>  sub { $$string_ref .= $_[0]->$get_string; });
4369          }
4370        $t->_output_ignored( $p, $action);
4371      }
4372
4373
4374    $t->{twig_saved_handlers}= \@saved_handlers;        # save current handlers
4375  }
4376
4377sub _level_in_stack
4378  { my( $t, $elt)= @_;
4379    my $level=1;
4380    foreach my $elt_in_stack ( @{$t->{_twig_context_stack}} )
4381      { if( $elt_in_stack->{$ST_ELT} && ($elt == $elt_in_stack->{$ST_ELT})) { return $level }
4382        $level++;
4383      }
4384  }
4385
4386
4387
4388# select $t->{twig_output_fh} and store the current selected fh
4389sub _set_fh_to_twig_output_fh
4390  { my $t= shift;
4391    my $output_fh= $t->{twig_output_fh};
4392    if( $output_fh && !$t->{twig_output_fh_selected})
4393      { # there is an output fh
4394        $t->{twig_selected_fh}= select(); # store the currently selected fh
4395        $t->{twig_output_fh_selected}=1;
4396        select $output_fh;                # select the output fh for the twig
4397      }
4398  }
4399
4400# select the fh that was stored in $t->{twig_selected_fh}
4401# (before $t->{twig_output_fh} was selected)
4402sub _set_fh_to_selected_fh
4403  { my $t= shift;
4404    return unless( $t->{twig_output_fh});
4405    my $selected_fh= $t->{twig_selected_fh};
4406    $t->{twig_output_fh_selected}=0;
4407    select $selected_fh;
4408    return;
4409  }
4410
4411
4412sub encoding
4413  { return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); }
4414
4415sub set_encoding
4416  { my( $t, $encoding)= @_;
4417    $t->{twig_xmldecl} ||={};
4418    $t->set_xml_version( "1.0") unless( $t->xml_version);
4419    $t->{twig_xmldecl}->{encoding}= $encoding;
4420    return $t;
4421  }
4422
4423sub output_encoding
4424  { return $_[0]->{output_encoding}; }
4425
4426sub set_output_encoding
4427  { my( $t, $encoding)= @_;
4428    my $output_filter= $t->output_filter || '';
4429
4430    if( ($encoding && $encoding !~ m{^utf-?8$}i) || $t->{twig_keep_encoding} || $output_filter)
4431      { $t->set_output_filter( _encoding_filter( $encoding || '')); }
4432
4433    $t->{output_encoding}= $encoding;
4434    return $t;
4435  }
4436
4437sub xml_version
4438  { return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); }
4439
4440sub set_xml_version
4441  { my( $t, $version)= @_;
4442    $t->{twig_xmldecl} ||={};
4443    $t->{twig_xmldecl}->{version}= $version;
4444    return $t;
4445  }
4446
4447sub standalone
4448  { return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); }
4449
4450sub set_standalone
4451  { my( $t, $standalone)= @_;
4452    $t->{twig_xmldecl} ||={};
4453    $t->set_xml_version( "1.0") unless( $t->xml_version);
4454    $t->{twig_xmldecl}->{standalone}= $standalone;
4455    return $t;
4456  }
4457
4458
4459# SAX methods
4460
4461sub toSAX1
4462  { _croak( "cannot use toSAX1 while parsing (use flush_toSAX1)") if (defined $_[0]->{twig_parser});
4463    shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
4464                          \&XML::Twig::Elt::_end_tag_data_SAX1
4465             );
4466  }
4467
4468sub toSAX2
4469  { _croak( "cannot use toSAX2 while parsing (use flush_toSAX2)") if (defined $_[0]->{twig_parser});
4470    shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
4471                          \&XML::Twig::Elt::_end_tag_data_SAX2
4472             );
4473  }
4474
4475
4476sub _toSAX
4477  { my( $t, $handler, $start_tag_data, $end_tag_data) = @_;
4478
4479    if( my $start_document =  $handler->can( 'start_document'))
4480      { $start_document->( $handler); }
4481
4482    $t->_prolog_toSAX( $handler);
4483
4484    if( $t->root) { $t->root->_toSAX( $handler, $start_tag_data, $end_tag_data) ; }
4485    if( my $end_document =  $handler->can( 'end_document'))
4486      { $end_document->( $handler); }
4487  }
4488
4489
4490sub flush_toSAX1
4491  { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
4492                               \&XML::Twig::Elt::_end_tag_data_SAX1
4493             );
4494  }
4495
4496sub flush_toSAX2
4497  { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
4498                               \&XML::Twig::Elt::_end_tag_data_SAX2
4499             );
4500  }
4501
4502sub _flush_toSAX
4503  { my( $t, $handler, $start_tag_data, $end_tag_data)= @_;
4504
4505    # the "real" last element processed, as _twig_end has closed it
4506    my $last_elt;
4507    if( $t->{twig_current})
4508      { $last_elt= $t->{twig_current}->{last_child}; }
4509    else
4510      { $last_elt= $t->{twig_root}; }
4511
4512    my $elt= $t->{twig_root};
4513    unless( $elt->{'flushed'})
4514      { # init unless already done (ie root has been flushed)
4515        if( my $start_document =  $handler->can( 'start_document'))
4516          { $start_document->( $handler); }
4517        # flush the DTD
4518        $t->_prolog_toSAX( $handler)
4519      }
4520
4521    while( $elt)
4522      { my $next_elt;
4523        if( $last_elt && $last_elt->in( $elt))
4524          {
4525            unless( $elt->{'flushed'})
4526              { # just output the front tag
4527                if( my $start_element = $handler->can( 'start_element'))
4528                 { if( my $tag_data= $start_tag_data->( $elt))
4529                     { $start_element->( $handler, $tag_data); }
4530                 }
4531                $elt->{'flushed'}=1;
4532              }
4533            $next_elt= $elt->{first_child};
4534          }
4535        else
4536          { # an element before the last one or the last one,
4537            $next_elt= $elt->{next_sibling};
4538            $elt->_toSAX( $handler, $start_tag_data, $end_tag_data);
4539            $elt->delete;
4540            last if( $last_elt && ($elt == $last_elt));
4541          }
4542        $elt= $next_elt;
4543      }
4544    if( !$t->{twig_parsing})
4545      { if( my $end_document =  $handler->can( 'end_document'))
4546          { $end_document->( $handler); }
4547      }
4548  }
4549
4550
4551sub _prolog_toSAX
4552  { my( $t, $handler)= @_;
4553    $t->_xmldecl_toSAX( $handler);
4554    $t->_DTD_toSAX( $handler);
4555  }
4556
4557sub _xmldecl_toSAX
4558  { my( $t, $handler)= @_;
4559    my $decl= $t->{twig_xmldecl};
4560    my $data= { Version    => $decl->{version},
4561                Encoding   => $decl->{encoding},
4562                Standalone => $decl->{standalone},
4563          };
4564    if( my $xml_decl= $handler->can( 'xml_decl'))
4565      { $xml_decl->( $handler, $data); }
4566  }
4567
4568sub _DTD_toSAX
4569  { my( $t, $handler)= @_;
4570    my $doctype= $t->{twig_doctype};
4571    return unless( $doctype);
4572    my $data= { Name     => $doctype->{name},
4573                PublicId => $doctype->{pub},
4574                SystemId => $doctype->{sysid},
4575              };
4576
4577    if( my $start_dtd= $handler->can( 'start_dtd'))
4578      { $start_dtd->( $handler, $data); }
4579
4580    # I should call code to export the internal subset here
4581
4582    if( my $end_dtd= $handler->can( 'end_dtd'))
4583      { $end_dtd->( $handler); }
4584  }
4585
4586# input/output filters
4587
4588sub latin1
4589  { local $SIG{__DIE__};
4590    if( _use(  'Encode'))
4591      { return encode_convert( 'ISO-8859-15'); }
4592    elsif( _use( 'Text::Iconv'))
4593      { return iconv_convert( 'ISO-8859-15'); }
4594    elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
4595      { return unicode_convert( 'ISO-8859-15'); }
4596    else
4597      { return \&regexp2latin1; }
4598  }
4599
4600sub _encoding_filter
4601  {
4602      { local $SIG{__DIE__};
4603        my $encoding= $_[1] || $_[0];
4604        if( _use( 'Encode'))
4605          { my $sub= encode_convert( $encoding);
4606            return $sub;
4607          }
4608        elsif( _use( 'Text::Iconv'))
4609          { return iconv_convert( $encoding); }
4610        elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
4611          { return unicode_convert( $encoding); }
4612        }
4613    _croak( "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed in order to use encoding options");
4614  }
4615
4616# shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27)
4617sub regexp2latin1
4618  { my $text=shift;
4619    $text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1);
4620                                my $lo = ord($2);
4621                                chr((($hi & 0x03) <<6) | ($lo & 0x3F))
4622                              }ge;
4623    return $text;
4624  }
4625
4626
4627sub html_encode
4628  { _use( 'HTML::Entities') or croak "cannot use html_encode: missing HTML::Entities";
4629    return HTML::Entities::encode_entities($_[0] );
4630  }
4631
4632sub safe_encode
4633  {   my $str= shift;
4634      if( $perl_version < 5.008)
4635        { # the no utf8 makes the regexp work in 5.6
4636          no utf8; # = perl 5.6
4637          $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
4638                   {_XmlUtf8Decode($1)}egs;
4639        }
4640      else
4641        { $str= encode( ascii => $str, $FB_HTMLCREF); }
4642      return $str;
4643  }
4644
4645sub safe_encode_hex
4646  {   my $str= shift;
4647      if( $perl_version < 5.008)
4648        { # the no utf8 makes the regexp work in 5.6
4649          no utf8; # = perl 5.6
4650          $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
4651                   {_XmlUtf8Decode($1, 1)}egs;
4652        }
4653      else
4654        { $str= encode( ascii => $str, $FB_XMLCREF); }
4655      return $str;
4656  }
4657
4658# this one shamelessly lifted from XML::DOM
4659# does NOT work on 5.8.0
4660sub _XmlUtf8Decode
4661  { my ($str, $hex) = @_;
4662    my $len = length ($str);
4663    my $n;
4664
4665    if ($len == 2)
4666      { my @n = unpack "C2", $str;
4667        $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
4668      }
4669    elsif ($len == 3)
4670      { my @n = unpack "C3", $str;
4671        $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f);
4672      }
4673    elsif ($len == 4)
4674      { my @n = unpack "C4", $str;
4675        $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12)
4676           + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
4677      }
4678    elsif ($len == 1)    # just to be complete...
4679      { $n = ord ($str); }
4680    else
4681      { croak "bad value [$str] for _XmlUtf8Decode"; }
4682
4683    my $char= $hex ? sprintf ("&#x%x;", $n) : "&#$n;";
4684    return $char;
4685  }
4686
4687
4688sub unicode_convert
4689  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4690    _use( 'Unicode::Map8') or croak "Unicode::Map8 not available, needed for encoding filter: $!";
4691    _use( 'Unicode::String') or croak "Unicode::String not available, needed for encoding filter: $!";
4692    import Unicode::String qw(utf8);
4693    my $sub= eval qq{ { $NO_WARNINGS;
4694                        my \$cnv;
4695                        BEGIN {  \$cnv= Unicode::Map8->new(\$enc)
4696                                     or croak "Can't create converter to \$enc";
4697                              }
4698                        sub { return  \$cnv->to8 (utf8(\$_[0])->ucs2); }
4699                      }
4700                    };
4701    unless( $sub) { croak $@; }
4702    return $sub;
4703  }
4704
4705sub iconv_convert
4706  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4707    _use( 'Text::Iconv') or croak "Text::Iconv not available, needed for encoding filter: $!";
4708    my $sub= eval qq{ { $NO_WARNINGS;
4709                        my \$cnv;
4710                        BEGIN { \$cnv = Text::Iconv->new( 'utf8', \$enc)
4711                                     or croak "Can't create iconv converter to \$enc";
4712                              }
4713                        sub { return  \$cnv->convert( \$_[0]); }
4714                      }
4715                    };
4716    unless( $sub)
4717      { if( $@=~ m{^Unsupported conversion: Invalid argument})
4718          { croak "Unsupported encoding: $enc"; }
4719        else
4720          { croak $@; }
4721      }
4722
4723    return $sub;
4724  }
4725
4726sub encode_convert
4727  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4728    my $sub=  eval qq{sub { $NO_WARNINGS; return encode( "$enc", \$_[0]); } };
4729    croak "can't create Encode-based filter: $@" unless( $sub);
4730    return $sub;
4731  }
4732
4733
4734# XML::XPath compatibility
4735sub getRootNode        { return $_[0]; }
4736sub getParentNode      { return undef; }
4737sub getChildNodes      { my @children= ($_[0]->root); return wantarray ? @children : \@children; }
4738
4739sub _weakrefs     { return $weakrefs;       }
4740sub _set_weakrefs { $weakrefs=shift() || 0; XML::Twig::Elt::set_destroy()if ! $weakrefs; } # for testing purposes
4741
4742sub _dump
4743  { my $t= shift;
4744    my $dump='';
4745
4746    $dump="document\n"; # should dump twig level data here
4747    if( $t->root) { $dump .= $t->root->_dump( @_); }
4748
4749    return $dump;
4750
4751  }
4752
4753
47541;
4755
4756######################################################################
4757package XML::Twig::Entity_list;
4758######################################################################
4759
4760*isa= *UNIVERSAL::isa;
4761
4762sub new
4763  { my $class = shift;
4764    my $self={ entities => {}, updated => 0};
4765
4766    bless $self, $class;
4767    return $self;
4768
4769  }
4770
4771sub add_new_ent
4772  { my $ent_list= shift;
4773    my $ent= XML::Twig::Entity->new( @_);
4774    $ent_list->add( $ent);
4775    return $ent_list;
4776  }
4777
4778sub _add_list
4779  { my( $ent_list, $to_add)= @_;
4780    my $ents_to_add= $to_add->{entities};
4781    return $ent_list unless( $ents_to_add && %$ents_to_add);
4782    @{$ent_list->{entities}}{keys %$ents_to_add}= values %$ents_to_add;
4783    $ent_list->{updated}=1;
4784    return $ent_list;
4785  }
4786
4787sub add
4788  { my( $ent_list, $ent)= @_;
4789    $ent_list->{entities}->{$ent->{name}}= $ent;
4790    $ent_list->{updated}=1;
4791    return $ent_list;
4792  }
4793
4794sub ent
4795  { my( $ent_list, $ent_name)= @_;
4796    return $ent_list->{entities}->{$ent_name};
4797  }
4798
4799# can be called with an entity or with an entity name
4800sub delete
4801  { my $ent_list= shift;
4802    if( isa( ref $_[0], 'XML::Twig::Entity'))
4803      { # the second arg is an entity
4804        my $ent= shift;
4805        delete $ent_list->{entities}->{$ent->{name}};
4806      }
4807    else
4808      { # the second arg was not entity, must be a string then
4809        my $name= shift;
4810        delete $ent_list->{entities}->{$name};
4811      }
4812    $ent_list->{updated}=1;
4813    return $ent_list;
4814  }
4815
4816sub print
4817  { my ($ent_list, $fh)= @_;
4818    my $old_select= defined $fh ? select $fh : undef;
4819
4820    foreach my $ent_name ( sort keys %{$ent_list->{entities}})
4821      { my $ent= $ent_list->{entities}->{$ent_name};
4822        # we have to test what the entity is or un-defined entities can creep in
4823        if( isa( $ent, 'XML::Twig::Entity')) { $ent->print(); }
4824      }
4825    select $old_select if( defined $old_select);
4826    return $ent_list;
4827  }
4828
4829sub text
4830  { my ($ent_list)= @_;
4831    return join "\n", map { $ent_list->{entities}->{$_}->text} sort keys %{$ent_list->{entities}};
4832  }
4833
4834# return the list of entity names
4835sub entity_names
4836  { my $ent_list= shift;
4837    return (sort keys %{$ent_list->{entities}}) ;
4838  }
4839
4840
4841sub list
4842  { my ($ent_list)= @_;
4843    return map { $ent_list->{entities}->{$_} } sort keys %{$ent_list->{entities}};
4844  }
4845
48461;
4847
4848######################################################################
4849package XML::Twig::Entity;
4850######################################################################
4851
4852#*isa= *UNIVERSAL::isa;
4853
4854sub new
4855  { my( $class, $name, $val, $sysid, $pubid, $ndata, $param)= @_;
4856    $class= ref( $class) || $class;
4857
4858    my $self={};
4859
4860    $self->{name}  = $name;
4861    $self->{val}   = $val   if( defined $val  );
4862    $self->{sysid} = $sysid if( defined $sysid);
4863    $self->{pubid} = $pubid if( defined $pubid);
4864    $self->{ndata} = $ndata if( defined $ndata);
4865    $self->{param} = $param if( defined $param);
4866
4867    bless $self, $class;
4868    return $self;
4869  }
4870
4871
4872sub name  { return $_[0]->{name}; }
4873sub val   { return $_[0]->{val}; }
4874sub sysid { return defined( $_[0]->{sysid}) ? $_[0]->{sysid} : ''; }
4875sub pubid { return defined( $_[0]->{pubid}) ? $_[0]->{pubid} : ''; }
4876sub ndata { return defined( $_[0]->{ndata}) ? $_[0]->{ndata} : ''; }
4877sub param { return defined( $_[0]->{param}) ? $_[0]->{param} : ''; }
4878
4879
4880sub print
4881  { my ($ent, $fh)= @_;
4882    my $text= $ent->text;
4883    if( $fh) { print $fh $text . "\n"; }
4884    else     { print $text . "\n"; }
4885  }
4886
4887sub sprint
4888  { my ($ent)= @_;
4889    return $ent->text;
4890  }
4891
4892sub text
4893  { my ($ent)= @_;
4894    #warn "text called: '", $ent->_dump, "'\n";
4895    return '' if( !$ent->{name});
4896    my @tokens;
4897    push @tokens, '<!ENTITY';
4898
4899    push @tokens, '%' if( $ent->{param});
4900    push @tokens, $ent->{name};
4901
4902    if( defined $ent->{val} && !defined( $ent->{sysid}) && !defined($ent->{pubid}) )
4903      { push @tokens, _quoted_val( $ent->{val});
4904      }
4905    elsif( defined $ent->{sysid})
4906      { push @tokens, 'PUBLIC', _quoted_val( $ent->{pubid}) if( $ent->{pubid});
4907        push @tokens, 'SYSTEM' unless( $ent->{pubid});
4908        push @tokens, _quoted_val( $ent->{sysid});
4909        push @tokens, 'NDATA', $ent->{ndata} if( $ent->{ndata});
4910      }
4911    return join( ' ', @tokens) . '>';
4912  }
4913
4914sub _quoted_val
4915  { my $q= $_[0]=~ m{"} ? q{'} : q{"};
4916    return qq{$q$_[0]$q};
4917  }
4918
4919sub _dump
4920  { my( $ent)= @_; return join( " - ", map { "$_ => '$ent->{$_}'" } grep { defined $ent->{$_} } sort keys %$ent); }
4921
49221;
4923
4924######################################################################
4925package XML::Twig::Notation_list;
4926######################################################################
4927
4928*isa= *UNIVERSAL::isa;
4929
4930sub new
4931  { my $class = shift;
4932    my $self={ notations => {}, updated => 0};
4933
4934    bless $self, $class;
4935    return $self;
4936
4937  }
4938
4939sub add_new_notation
4940  { my $notation_list= shift;
4941    my $notation= XML::Twig::Notation->new( @_);
4942    $notation_list->add( $notation);
4943    return $notation_list;
4944  }
4945
4946sub _add_list
4947  { my( $notation_list, $to_add)= @_;
4948    my $notations_to_add= $to_add->{notations};
4949    return $notation_list unless( $notations_to_add && %$notations_to_add);
4950    @{$notation_list->{notations}}{keys %$notations_to_add}= values %$notations_to_add;
4951    $notation_list->{updated}=1;
4952    return $notation_list;
4953  }
4954
4955sub add
4956  { my( $notation_list, $notation)= @_;
4957    $notation_list->{notations}->{$notation->{name}}= $notation;
4958    $notation_list->{updated}=1;
4959    return $notation_list;
4960  }
4961
4962sub notation
4963  { my( $notation_list, $notation_name)= @_;
4964    return $notation_list->{notations}->{$notation_name};
4965  }
4966
4967# can be called with an notation or with an notation name
4968sub delete
4969  { my $notation_list= shift;
4970    if( isa( ref $_[0], 'XML::Twig::Notation'))
4971      { # the second arg is an notation
4972        my $notation= shift;
4973        delete $notation_list->{notations}->{$notation->{name}};
4974      }
4975    else
4976      { # the second arg was not notation, must be a string then
4977        my $name= shift;
4978        delete $notation_list->{notations}->{$name};
4979      }
4980    $notation_list->{updated}=1;
4981    return $notation_list;
4982  }
4983
4984sub print
4985  { my ($notation_list, $fh)= @_;
4986    my $old_select= defined $fh ? select $fh : undef;
4987
4988    foreach my $notation_name ( sort keys %{$notation_list->{notations}})
4989      { my $notation= $notation_list->{notations}->{$notation_name};
4990        # we have to test what the notation is or un-defined notations can creep in
4991        if( isa( $notation, 'XML::Twig::Notation')) { $notation->print(); }
4992      }
4993    select $old_select if( defined $old_select);
4994    return $notation_list;
4995  }
4996
4997sub text
4998  { my ($notation_list)= @_;
4999    return join "\n", map { $notation_list->{notations}->{$_}->text} sort keys %{$notation_list->{notations}};
5000  }
5001
5002# return the list of notation names
5003sub notation_names
5004  { my $notation_list= shift;
5005    return (sort keys %{$notation_list->{notations}}) ;
5006  }
5007
5008
5009sub list
5010  { my ($notation_list)= @_;
5011    return map { $notation_list->{notations}->{$_} } sort keys %{$notation_list->{notations}};
5012  }
5013
50141;
5015
5016######################################################################
5017package XML::Twig::Notation;
5018######################################################################
5019
5020#*isa= *UNIVERSAL::isa;
5021
5022BEGIN
5023  { *sprint= *text;
5024  }
5025
5026sub new
5027  { my( $class, $name, $base, $sysid, $pubid)= @_;
5028    $class= ref( $class) || $class;
5029
5030    my $self={};
5031
5032    $self->{name}  = $name;
5033    $self->{base}  = $base  if( defined $base  );
5034    $self->{sysid} = $sysid if( defined $sysid);
5035    $self->{pubid} = $pubid if( defined $pubid);
5036
5037    bless $self, $class;
5038    return $self;
5039  }
5040
5041
5042sub name  { return $_[0]->{name};  }
5043sub base  { return $_[0]->{base};  }
5044sub sysid { return $_[0]->{sysid}; }
5045sub pubid { return $_[0]->{pubid}; }
5046
5047
5048sub print
5049  { my ($notation, $fh)= @_;
5050    my $text= $notation->text;
5051    if( $fh) { print $fh $text . "\n"; }
5052    else     { print $text . "\n"; }
5053  }
5054
5055sub text
5056  { my ($notation)= @_;
5057    return '' if( !$notation->{name});
5058    my @tokens;
5059    push @tokens, '<!NOTATION';
5060    push @tokens, $notation->{name};
5061    push @tokens, ( 'PUBLIC', _quoted_val( $notation->{pubid} ) ) if $notation->{pubid};
5062    push @tokens, ( 'SYSTEM')                                     if ! $notation->{pubid} && $notation->{sysid};
5063    push @tokens, (_quoted_val( $notation->{sysid}) )             if $notation->{sysid};
5064
5065    return join( ' ', @tokens) . '>';
5066  }
5067
5068sub _quoted_val
5069  { my $q= $_[0]=~ m{"} ? q{'} : q{"};
5070    return qq{$q$_[0]$q};
5071  }
5072
5073sub _dump
5074  { my( $notation)= @_; return join( " - ", map { "$_ => '$notation->{$_}'" } grep { defined $notation->{$_} } sort keys %$notation); }
5075
50761;
5077
5078######################################################################
5079package XML::Twig::Elt;
5080######################################################################
5081
5082use Carp;
5083*isa= *UNIVERSAL::isa;
5084
5085my $CDATA_START    = "<![CDATA[";
5086my $CDATA_END      = "]]>";
5087my $PI_START       = "<?";
5088my $PI_END         = "?>";
5089my $COMMENT_START  = "<!--";
5090my $COMMENT_END    = "-->";
5091
5092my $XMLNS_URI      = 'http://www.w3.org/2000/xmlns/';
5093
5094
5095BEGIN
5096  { # set some aliases for methods
5097    *tag           = *gi;
5098    *name          = *gi;
5099    *set_tag       = *set_gi;
5100    *set_name      = *set_gi;
5101    *find_nodes    = *get_xpath; # as in XML::DOM
5102    *findnodes     = *get_xpath; # as in XML::LibXML
5103    *field         = *first_child_text;
5104    *trimmed_field = *first_child_trimmed_text;
5105    *is_field      = *contains_only_text;
5106    *is            = *passes;
5107    *matches       = *passes;
5108    *has_child     = *first_child;
5109    *has_children  = *first_child;
5110    *all_children_pass = *all_children_are;
5111    *all_children_match= *all_children_are;
5112    *getElementsByTagName= *descendants;
5113    *find_by_tag_name= *descendants_or_self;
5114    *unwrap          = *erase;
5115    *inner_xml       = *xml_string;
5116    *outer_xml       = *sprint;
5117    *add_class       = *add_to_class;
5118
5119    *first_child_is  = *first_child_matches;
5120    *last_child_is   = *last_child_matches;
5121    *next_sibling_is = *next_sibling_matches;
5122    *prev_sibling_is = *prev_sibling_matches;
5123    *next_elt_is     = *next_elt_matches;
5124    *prev_elt_is     = *prev_elt_matches;
5125    *parent_is       = *parent_matches;
5126    *child_is        = *child_matches;
5127    *inherited_att   = *inherit_att;
5128
5129    *sort_children_by_value= *sort_children_on_value;
5130
5131    *has_atts= *att_nb;
5132
5133    # imports from XML::Twig
5134    *_is_fh= *XML::Twig::_is_fh;
5135
5136    # XML::XPath compatibility
5137    *string_value       = *text;
5138    *toString           = *sprint;
5139    *getName            = *gi;
5140    *getRootNode        = *twig;
5141    *getNextSibling     = *_next_sibling;
5142    *getPreviousSibling = *_prev_sibling;
5143    *isElementNode      = *is_elt;
5144    *isTextNode         = *is_text;
5145    *isPI               = *is_pi;
5146    *isPINode           = *is_pi;
5147    *isProcessingInstructionNode= *is_pi;
5148    *isComment          = *is_comment;
5149    *isCommentNode      = *is_comment;
5150    *getTarget          = *target;
5151    *getFirstChild      = *_first_child;
5152    *getLastChild      = *_last_child;
5153
5154    # try using weak references
5155    # test whether we can use weak references
5156    { local $SIG{__DIE__};
5157      if( eval 'require Scalar::Util' && defined( &Scalar::Util::weaken) )
5158        { import Scalar::Util qw(weaken); }
5159      elsif( eval 'require WeakRef')
5160        { import WeakRef; }
5161    }
5162}
5163
5164
5165# can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]])
5166# - gi is an optional gi given to the element
5167# - $atts is a hashref to attributes for the element
5168# - @content is an optional list of text and elements that will
5169#   be inserted under the element
5170sub new
5171  { my $class= shift;
5172    $class= ref $class || $class;
5173    my $elt  = {};
5174    bless ($elt, $class);
5175
5176    return $elt unless @_;
5177
5178    if( @_ == 1 && $_[0]=~ m{^\s*<}) { return $class->parse( @_); }
5179
5180    # if a gi is passed then use it
5181    my $gi= shift;
5182    $elt->{gi}=$XML::Twig::gi2index{$gi} or $elt->set_gi( $gi);
5183
5184
5185    my $atts= ref $_[0] eq 'HASH' ? shift : undef;
5186
5187    if( $atts && defined $atts->{$CDATA})
5188      { delete $atts->{$CDATA};
5189
5190        my $cdata= $class->new( $CDATA => @_);
5191        return $class->new( $gi, $atts, $cdata);
5192      }
5193
5194    if( $gi eq $PCDATA)
5195      { if( grep { ref $_ } @_) { croak "element $PCDATA can only be created from text"; }
5196        $elt->{pcdata}=  join '', @_;
5197      }
5198    elsif( $gi eq $ENT)
5199      { $elt->{ent}=  shift; }
5200    elsif( $gi eq $CDATA)
5201      { if( grep { ref $_ } @_) { croak "element $CDATA can only be created from text"; }
5202        $elt->{cdata}=  join '', @_;
5203      }
5204    elsif( $gi eq $COMMENT)
5205      { if( grep { ref $_ } @_) { croak "element $COMMENT can only be created from text"; }
5206        $elt->{comment}=  join '', @_;
5207      }
5208    elsif( $gi eq $PI)
5209      { if( grep { ref $_ } @_) { croak "element $PI can only be created from text"; }
5210        $elt->_set_pi( shift, join '', @_);
5211      }
5212    else
5213      { # the rest of the arguments are the content of the element
5214        if( @_)
5215          { $elt->set_content( @_); }
5216        else
5217          { $elt->{empty}=  1;    }
5218      }
5219
5220    if( $atts)
5221      { # the attribute hash can be used to pass the asis status
5222        if( defined $atts->{$ASIS})  { $elt->set_asis(  $atts->{$ASIS} ); delete $atts->{$ASIS};  }
5223        if( defined $atts->{$EMPTY}) { $elt->{empty}=  $atts->{$EMPTY}; delete $atts->{$EMPTY}; }
5224        if( keys %$atts) { $elt->set_atts( $atts); }
5225        $elt->_set_id( $atts->{$ID}) if( $atts->{$ID});
5226      }
5227
5228    return $elt;
5229  }
5230
5231# optimized version of $elt->new( PCDATA, $text);
5232sub _new_pcdata
5233  { my $class= $_[0];
5234    $class= ref $class || $class;
5235    my $elt  = {};
5236    bless $elt, $class;
5237    $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA);
5238    $elt->{pcdata}=  $_[1];
5239    return $elt;
5240  }
5241
5242# this function creates an XM:::Twig::Elt from a string
5243# it is quite clumsy at the moment, as it just creates a
5244# new twig then returns its root
5245# there might also be memory leaks there
5246# additional arguments are passed to new XML::Twig
5247sub parse
5248  { my $class= shift;
5249    if( ref( $class)) { $class= ref( $class); }
5250    my $string= shift;
5251    my %args= @_;
5252    my $t= XML::Twig->new(%args);
5253    $t->parse( $string);
5254    my $elt= $t->root;
5255    # clean-up the node
5256    delete $elt->{twig};         # get rid of the twig data
5257    delete $elt->{twig_current}; # better get rid of this too
5258    if( $t->{twig_id_list}) { $elt->{twig_id_list}= $t->{twig_id_list}; }
5259    $elt->cut;
5260    undef $t->{twig_root};
5261    return $elt;
5262  }
5263
5264sub set_inner_xml
5265  { my( $elt, $xml, @args)= @_;
5266    my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args);
5267    $elt->cut_children;
5268    $new_elt->paste_first_child( $elt);
5269    $new_elt->erase;
5270    return $elt;
5271  }
5272
5273sub set_outer_xml
5274  { my( $elt, $xml, @args)= @_;
5275    my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args);
5276    $elt->cut_children;
5277    $new_elt->replace( $elt);
5278    $new_elt->erase;
5279    return $new_elt;
5280  }
5281
5282
5283sub set_inner_html
5284  { my( $elt, $html)= @_;
5285    my $t= XML::Twig->new->parse_html( "<html>$html</html>");
5286    my $new_elt= $t->root;
5287    if( $elt->tag eq 'head')
5288      { $new_elt->first_child( 'head')->unwrap;
5289        $new_elt->first_child( 'body')->cut;
5290      }
5291    elsif( $elt->tag ne 'html')
5292      { $new_elt->first_child( 'head')->cut;
5293        $new_elt->first_child( 'body')->unwrap;
5294      }
5295    $new_elt->cut;
5296    $elt->cut_children;
5297    $new_elt->paste_first_child( $elt);
5298    $new_elt->erase;
5299    return $elt;
5300  }
5301
5302sub set_gi
5303  { my ($elt, $gi)= @_;
5304    unless( defined $XML::Twig::gi2index{$gi})
5305      { # new gi, create entries in %gi2index and @index2gi
5306        push  @XML::Twig::index2gi, $gi;
5307        $XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi;
5308      }
5309    $elt->{gi}= $XML::Twig::gi2index{$gi};
5310    return $elt;
5311  }
5312
5313sub gi  { return $XML::Twig::index2gi[$_[0]->{gi}]; }
5314
5315sub local_name
5316  { my $elt= shift;
5317    return _local_name( $XML::Twig::index2gi[$elt->{'gi'}]);
5318  }
5319
5320sub ns_prefix
5321  { my $elt= shift;
5322    return _ns_prefix( $XML::Twig::index2gi[$elt->{'gi'}]);
5323  }
5324
5325# namespace prefix for any qname (can be used for elements or attributes)
5326sub _ns_prefix
5327  { my $qname= shift;
5328    if( $qname=~ m{^([^:]*):})
5329      { return $1; }
5330    else
5331      { return( ''); } # should it be '' ?
5332  }
5333
5334# local name for any qname (can be used for elements or attributes)
5335sub _local_name
5336  { my $qname= shift;
5337    (my $local= $qname)=~ s{^[^:]*:}{};
5338    return $local;
5339  }
5340
5341#sub get_namespace
5342sub namespace ## no critic (Subroutines::ProhibitNestedSubs);
5343  { my $elt= shift;
5344    my $prefix= defined $_[0] ? shift() : $elt->ns_prefix;
5345    my $ns_att= $prefix ? "xmlns:$prefix" : "xmlns";
5346    my $expanded= $DEFAULT_NS{$prefix} || $elt->_inherit_att_through_cut( $ns_att) || '';
5347    return $expanded;
5348  }
5349
5350sub declare_missing_ns ## no critic (Subroutines::ProhibitNestedSubs);
5351  { my $root= shift;
5352    my %missing_prefix;
5353    my $map= $root->_current_ns_prefix_map;
5354
5355    foreach my $prefix (keys %$map)
5356      { my $prefix_att= $prefix eq '#default' ? 'xmlns' : "xmlns:$prefix";
5357        if( ! $root->{'att'}->{$prefix_att})
5358          { $root->set_att( $prefix_att => $map->{$prefix}); }
5359      }
5360    return $root;
5361  }
5362
5363sub _current_ns_prefix_map
5364  { my( $elt)= shift;
5365    my $map;
5366    while( $elt)
5367      { foreach my $att ($elt->att_names)
5368          { my $prefix= $att eq 'xmlns'        ? '#default'
5369                      : $att=~ m{^xmlns:(.*)$} ? $1
5370                      : next
5371                      ;
5372            if( ! exists $map->{$prefix}) { $map->{$prefix}= $elt->{'att'}->{$att}; }
5373          }
5374        $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent});
5375      }
5376    return $map;
5377  }
5378
5379sub set_ns_decl
5380  { my( $elt, $uri, $prefix)= @_;
5381    my $ns_att=  $prefix ? "xmlns:$prefix" : 'xmlns';
5382    $elt->set_att( $ns_att => $uri);
5383    return $elt;
5384  }
5385
5386sub set_ns_as_default
5387  { my( $root, $uri)= @_;
5388    my @ns_decl_to_remove;
5389    foreach my $elt ($root->descendants_or_self)
5390      { if( $elt->_ns_prefix && $elt->namespace eq $uri)
5391          { $elt->set_tag( $elt->local_name); }
5392        # store any namespace declaration for that uri
5393        foreach my $ns_decl (grep { $_=~ m{xmlns(:|$)} && $elt->{'att'}->{$_} eq $uri } $elt->att_names)
5394          { push @ns_decl_to_remove, [$elt, $ns_decl]; }
5395      }
5396    $root->set_ns_decl( $uri);
5397    # now remove the ns declarations (if done earlier then descendants of an element with the ns declaration
5398    # are not considered being in the namespace
5399    foreach my $ns_decl_to_remove ( @ns_decl_to_remove)
5400      { my( $elt, $ns_decl)= @$ns_decl_to_remove;
5401        $elt->del_att( $ns_decl);
5402      }
5403
5404    return $root;
5405  }
5406
5407
5408
5409# return #ELT for an element and #PCDATA... for others
5410sub get_type
5411  { my $gi_nb= $_[0]->{gi}; # the number, not the string
5412    return $ELT if( $gi_nb >= $XML::Twig::SPECIAL_GI);
5413    return $_[0]->gi;
5414  }
5415
5416# return the gi if it's a "real" element, 0 otherwise
5417sub is_elt
5418  { if(  $_[0]->{gi} >=  $XML::Twig::SPECIAL_GI)
5419     { return $_[0]->gi; }
5420    else
5421      { return 0; }
5422  }
5423
5424
5425sub is_pcdata
5426  { my $elt= shift;
5427    return (exists $elt->{'pcdata'});
5428  }
5429
5430sub is_cdata
5431  { my $elt= shift;
5432    return (exists $elt->{'cdata'});
5433  }
5434
5435sub is_pi
5436  { my $elt= shift;
5437    return (exists $elt->{'target'});
5438  }
5439
5440sub is_comment
5441  { my $elt= shift;
5442    return (exists $elt->{'comment'});
5443  }
5444
5445sub is_ent
5446  { my $elt= shift;
5447    return (exists $elt->{ent} || $elt->{ent_name});
5448  }
5449
5450
5451sub is_text
5452  { my $elt= shift;
5453    return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'}));
5454  }
5455
5456sub is_empty
5457  { return $_[0]->{empty} || 0; }
5458
5459sub set_empty
5460  { $_[0]->{empty}= defined( $_[1]) ? $_[1] : 1; return $_[0]; }
5461
5462sub set_not_empty
5463  { delete $_[0]->{empty} if( $_[0]->{'empty'}); return $_[0]; }
5464
5465
5466sub set_asis
5467  { my $elt=shift;
5468
5469    foreach my $descendant ($elt, $elt->_descendants )
5470      { $descendant->{asis}= 1;
5471        if( (exists $descendant->{'cdata'}))
5472          { $descendant->{gi}=$XML::Twig::gi2index{$PCDATA} or $descendant->set_gi( $PCDATA);
5473            $descendant->{pcdata}=  $descendant->{cdata};
5474          }
5475
5476      }
5477    return $elt;
5478  }
5479
5480sub set_not_asis
5481  { my $elt=shift;
5482    foreach my $descendant ($elt, $elt->descendants)
5483      { delete $descendant->{asis} if $descendant->{asis};}
5484    return $elt;
5485  }
5486
5487sub is_asis
5488  { return $_[0]->{asis}; }
5489
5490sub closed
5491  { my $elt= shift;
5492    my $t= $elt->twig || return;
5493    my $curr_elt= $t->{twig_current};
5494    return 1 unless( $curr_elt);
5495    return $curr_elt->in( $elt);
5496  }
5497
5498sub set_pcdata
5499  { my( $elt, $pcdata)= @_;
5500
5501    if( $elt->{extra_data_in_pcdata})
5502      { _try_moving_extra_data( $elt, $pcdata);
5503      }
5504    $elt->{pcdata}= $pcdata;
5505    return $elt;
5506  }
5507
5508sub _extra_data_in_pcdata      { return $_[0]->{extra_data_in_pcdata}; }
5509sub _set_extra_data_in_pcdata  { $_[0]->{extra_data_in_pcdata}= $_[1]; return $_[0]; }
5510sub _del_extra_data_in_pcdata  { delete $_[0]->{extra_data_in_pcdata}; return $_[0]; }
5511sub _unshift_extra_data_in_pcdata
5512    { my $e= shift;
5513      $e->{extra_data_in_pcdata}||=[];
5514      unshift @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() };
5515    }
5516sub _push_extra_data_in_pcdata
5517  { my $e= shift;
5518    $e->{extra_data_in_pcdata}||=[];
5519    push @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() };
5520  }
5521
5522sub _extra_data_before_end_tag     { return $_[0]->{extra_data_before_end_tag} || ''; }
5523sub _set_extra_data_before_end_tag { $_[0]->{extra_data_before_end_tag}= $_[1]; return $_[0]}
5524sub _del_extra_data_before_end_tag { delete $_[0]->{extra_data_before_end_tag}; return $_[0]}
5525sub _prefix_extra_data_before_end_tag
5526  { my( $elt, $data)= @_;
5527    if($elt->{extra_data_before_end_tag})
5528      { $elt->{extra_data_before_end_tag}= $data . $elt->{extra_data_before_end_tag}; }
5529    else
5530      { $elt->{extra_data_before_end_tag}= $data; }
5531    return $elt;
5532  }
5533
5534# internal, in cases where we know there is no extra_data (inlined anyway!)
5535sub _set_pcdata { $_[0]->{pcdata}= $_[1]; }
5536
5537# try to figure out if we can keep the extra_data around
5538sub _try_moving_extra_data
5539  { my( $elt, $modified)=@_;
5540    my $initial= $elt->{pcdata};
5541    my $cpis= $elt->{extra_data_in_pcdata};
5542
5543    if( (my $offset= index( $modified, $initial)) != -1)
5544      { # text has been added
5545        foreach (@$cpis) { $_->{offset}+= $offset; }
5546      }
5547    elsif( ($offset= index( $initial, $modified)) != -1)
5548      { # text has been cut
5549        my $len= length( $modified);
5550        foreach my $cpi (@$cpis) { $cpi->{offset} -= $offset; }
5551        $elt->_set_extra_data_in_pcdata( [ grep { $_->{offset} >= 0 && $_->{offset} < $len } @$cpis ]);
5552      }
5553    else
5554      {    _match_extra_data_words( $elt, $initial, $modified)
5555        || _match_extra_data_chars( $elt, $initial, $modified)
5556        || $elt->_del_extra_data_in_pcdata;
5557      }
5558  }
5559
5560sub _match_extra_data_words
5561  { my( $elt, $initial, $modified)= @_;
5562    my @initial= split /\b/, $initial;
5563    my @modified= split /\b/, $modified;
5564
5565    return _match_extra_data( $elt, length( $initial), \@initial, \@modified);
5566  }
5567
5568sub _match_extra_data_chars
5569  { my( $elt, $initial, $modified)= @_;
5570    my @initial= split //, $initial;
5571    my @modified= split //, $modified;
5572
5573    return _match_extra_data( $elt, length( $initial), \@initial, \@modified);
5574  }
5575
5576sub _match_extra_data
5577  { my( $elt, $length, $initial, $modified)= @_;
5578
5579    my $cpis= $elt->{extra_data_in_pcdata};
5580
5581    if( @$initial <= @$modified)
5582      {
5583        my( $ok, $positions, $offsets)= _pos_offset( $initial, $modified);
5584        if( $ok)
5585          { my $offset=0;
5586            my $pos= shift @$positions;
5587            foreach my $cpi (@$cpis)
5588              { while( $cpi->{offset} >= $pos)
5589                  { $offset= shift @$offsets;
5590                    $pos= shift @$positions || $length +1;
5591                  }
5592                $cpi->{offset} += $offset;
5593              }
5594            return 1;
5595          }
5596      }
5597    else
5598      { my( $ok, $positions, $offsets)= _pos_offset( $modified, $initial);
5599        if( $ok)
5600          { #print STDERR "pos:    ", join( ':', @$positions), "\n",
5601            #             "offset: ", join( ':', @$offsets), "\n";
5602            my $offset=0;
5603            my $pos= shift @$positions;
5604            my $prev_pos= 0;
5605
5606            foreach my $cpi (@$cpis)
5607              { while( $cpi->{offset} >= $pos)
5608                  { $offset= shift @$offsets;
5609                    $prev_pos= $pos;
5610                    $pos= shift @$positions || $length +1;
5611                  }
5612                $cpi->{offset} -= $offset;
5613                if( $cpi->{offset} < $prev_pos) { delete $cpi->{text}; }
5614              }
5615            $elt->_set_extra_data_in_pcdata( [ grep { exists $_->{text} } @$cpis ]);
5616            return 1;
5617          }
5618      }
5619    return 0;
5620  }
5621
5622
5623sub _pos_offset
5624  { my( $short, $long)= @_;
5625    my( @pos, @offset);
5626    my( $s_length, $l_length)=(0,0);
5627    while (@$short)
5628      { my $s_word= shift @$short;
5629        my $l_word= shift @$long;
5630        if( $s_word ne $l_word)
5631          { while( @$long && $s_word ne $l_word)
5632              { $l_length += length( $l_word);
5633                $l_word= shift @$long;
5634              }
5635            if( !@$long && $s_word ne $l_word) { return 0; }
5636            push @pos, $s_length;
5637            push @offset, $l_length - $s_length;
5638          }
5639        my $length= length( $s_word);
5640        $s_length += $length;
5641        $l_length += $length;
5642      }
5643    return( 1, \@pos, \@offset);
5644  }
5645
5646sub append_pcdata
5647  { $_[0]->{'pcdata'}.= $_[1];
5648    return $_[0];
5649  }
5650
5651sub pcdata        { return $_[0]->{pcdata}; }
5652
5653
5654sub append_extra_data
5655  {  $_[0]->{extra_data}.= $_[1];
5656     return $_[0];
5657  }
5658
5659sub set_extra_data
5660  { $_[0]->{extra_data}= $_[1];
5661    return $_[0];
5662  }
5663sub extra_data { return $_[0]->{extra_data} || ''; }
5664
5665sub set_target
5666  { my( $elt, $target)= @_;
5667    $elt->{target}= $target;
5668    return $elt;
5669  }
5670sub target { return $_[0]->{target}; }
5671
5672sub set_data
5673  { $_[0]->{'data'}= $_[1];
5674    return $_[0];
5675  }
5676sub data { return $_[0]->{data}; }
5677
5678sub set_pi
5679  { my $elt= shift;
5680    unless( $elt->{gi} == $XML::Twig::gi2index{$PI})
5681      { $elt->cut_children;
5682        $elt->{gi}=$XML::Twig::gi2index{$PI} or $elt->set_gi( $PI);
5683      }
5684    return $elt->_set_pi( @_);
5685  }
5686
5687sub _set_pi
5688  { $_[0]->set_target( $_[1]);
5689    $_[0]->{data}=  $_[2];
5690    return $_[0];
5691  }
5692
5693sub pi_string { my $string= $PI_START . $_[0]->{target};
5694                my $data= $_[0]->{data};
5695                if( defined( $data) && $data ne '') { $string .= " $data"; }
5696                $string .= $PI_END ;
5697                return $string;
5698              }
5699
5700sub set_comment
5701  { my $elt= shift;
5702    unless( $elt->{gi} == $XML::Twig::gi2index{$COMMENT})
5703      { $elt->cut_children;
5704        $elt->{gi}=$XML::Twig::gi2index{$COMMENT} or $elt->set_gi( $COMMENT);
5705      }
5706    $elt->{comment}=  $_[0];
5707    return $elt;
5708  }
5709
5710sub _set_comment   { $_[0]->{comment}= $_[1]; return $_[0]; }
5711sub comment        { return $_[0]->{comment}; }
5712sub comment_string { return $COMMENT_START . _comment_escaped_string( $_[0]->{comment}) . $COMMENT_END; }
5713# comments cannot start or end with
5714sub _comment_escaped_string
5715  { my( $c)= @_;
5716    $c=~ s{^-}{ -};
5717    $c=~ s{-$}{- };
5718    $c=~ s{--}{- -}g;
5719    return $c;
5720  }
5721
5722sub set_ent  { $_[0]->{ent}= $_[1]; return $_[0]; }
5723sub ent      { return $_[0]->{ent}; }
5724sub ent_name { return substr( $_[0]->{ent}, 1, -1);}
5725
5726sub set_cdata
5727  { my $elt= shift;
5728    unless( $elt->{gi} == $XML::Twig::gi2index{$CDATA})
5729      { $elt->cut_children;
5730        $elt->insert_new_elt( first_child => $CDATA, @_);
5731        return $elt;
5732      }
5733    $elt->{cdata}=  $_[0];
5734    return $_[0];
5735  }
5736
5737sub _set_cdata
5738  { $_[0]->{cdata}= $_[1];
5739    return $_[0];
5740  }
5741
5742sub append_cdata
5743  { $_[0]->{cdata}.= $_[1];
5744    return $_[0];
5745  }
5746sub cdata { return $_[0]->{cdata}; }
5747
5748
5749sub contains_only_text
5750  { my $elt= shift;
5751    return 0 unless $elt->is_elt;
5752    foreach my $child ($elt->_children)
5753      { return 0 if $child->is_elt; }
5754    return $elt;
5755  }
5756
5757sub contains_only
5758  { my( $elt, $exp)= @_;
5759    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
5760    foreach my $child (@children)
5761      { return 0 unless $child->is( $exp); }
5762    return @children || 1;
5763  }
5764
5765sub contains_a_single
5766  { my( $elt, $exp)= @_;
5767    my $child= $elt->{first_child} or return 0;
5768    return 0 unless $child->passes( $exp);
5769    return 0 if( $child->{next_sibling});
5770    return $child;
5771  }
5772
5773
5774sub root
5775  { my $elt= shift;
5776    while( $elt->{parent}) { $elt= $elt->{parent}; }
5777    return $elt;
5778  }
5779
5780sub _root_through_cut
5781  { my $elt= shift;
5782    while( $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})) { $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); }
5783    return $elt;
5784  }
5785
5786sub twig
5787  { my $elt= shift;
5788    my $root= $elt->root;
5789    return $root->{twig};
5790  }
5791
5792sub _twig_through_cut
5793  { my $elt= shift;
5794    my $root= $elt->_root_through_cut;
5795    return $root->{twig};
5796  }
5797
5798
5799# used for navigation
5800# returns undef or the element, depending on whether $elt passes $cond
5801# $cond can be
5802# - empty: the element passes the condition
5803# - ELT ('#ELT'): the element passes the condition if it is a "real" element
5804# - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element
5805# - a string with an XPath condition (only a subset of XPath is actually
5806#   supported).
5807# - a regexp: the element passes if its gi matches the regexp
5808# - a code ref: the element passes if the code, applied on the element,
5809#   returns true
5810
5811my %cond_cache; # expression => coderef
5812
5813sub reset_cond_cache { %cond_cache=(); }
5814
5815{
5816   sub _install_cond
5817    { my $cond= shift;
5818      my $test;
5819      my $init='';
5820
5821      my $original_cond= $cond;
5822
5823      my $not= ($cond=~ s{^\s*!}{}) ? '!' : '';
5824
5825      if( ref $cond eq 'CODE') { return $cond; }
5826
5827      if( ref $cond eq 'Regexp')
5828        { $test = qq{(\$_[0]->gi=~ /$cond/)}; }
5829      else
5830        { my @tests;
5831          while( $cond)
5832            {
5833              # the condition is a string
5834              if( $cond=~ s{$ELT$SEP}{})
5835                { push @tests, qq{\$_[0]->is_elt}; }
5836              elsif( $cond=~ s{$TEXT$SEP}{})
5837                { push @tests, qq{\$_[0]->is_text}; }
5838              elsif( $cond=~ s{^\s*($REG_TAG_PART)$SEP}{})
5839                { push @tests, _gi_test( $1); }
5840              elsif( $cond=~ s{^\s*($REG_REGEXP)$SEP}{})
5841                { # /regexp/
5842                  push @tests, qq{ \$_[0]->gi=~ $1 };
5843                }
5844              elsif( $cond=~ s{^\s*($REG_TAG_PART)?\s*  # $1
5845                               \[\s*(-?)\s*(\d+)\s*\]  #   [$2]
5846                               $SEP}{}xo
5847                   )
5848                { my( $gi, $neg, $index)= ($1, $2, $3);
5849                  my $siblings= $neg ? q{$_[0]->_next_siblings} : q{$_[0]->_prev_siblings};
5850                  if( $gi && ($gi ne '*'))
5851                    #{ $test= qq{((\$_[0]->gi eq "$gi") && (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index))}; }
5852                    { push @tests, _and( _gi_test( $gi), qq{ (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index)}); }
5853                  else
5854                    { push @tests, qq{(scalar( $siblings) + 1 == $index)}; }
5855                }
5856              elsif( $cond=~ s{^\s*($REG_TAG_PART?)\s*($REG_PREDICATE)$SEP}{})
5857                { my( $gi, $predicate)= ( $1, $2);
5858                  push @tests, _and( _gi_test( $gi), _parse_predicate_in_step( $predicate));
5859                }
5860              elsif( $cond=~ s{^\s*($REG_NAKED_PREDICATE)$SEP}{})
5861                { push @tests,   _parse_predicate_in_step( $1); }
5862              else
5863                { croak "wrong navigation condition '$original_cond' ($@)"; }
5864            }
5865           $test= @tests > 1 ? '(' . join( '||', map { "($_)" } @tests) . ')' : $tests[0];
5866        }
5867
5868      #warn "init: '$init' - test: '$test'\n";
5869
5870      my $sub= qq{sub { $NO_WARNINGS; $init; return $not($test) ? \$_[0] : undef; } };
5871      my $s= eval $sub;
5872      #warn "cond: $cond\n$sub\n";
5873      if( $@)
5874        { croak "wrong navigation condition '$original_cond' ($@);" }
5875      return $s;
5876    }
5877
5878  sub _gi_test
5879    { my( $full_gi)= @_;
5880
5881      # optimize if the gi exists, including the case where the gi includes a dot
5882      my $index= $XML::Twig::gi2index{$full_gi};
5883      if( $index) { return qq{\$_[0]->{gi} == $index}; }
5884
5885      my( $gi, $class, $id)= $full_gi=~ m{^(.*?)(?:[.]([^.]*)|[#](.*))?$};
5886
5887      my $gi_test='';
5888      if( $gi && $gi ne '*' )
5889        { # 2 options, depending on whether the gi exists in gi2index
5890          # start optimization
5891          my $index= $XML::Twig::gi2index{$gi};
5892          if( $index)
5893            { # the gi exists, use its index as a faster shortcut
5894              $gi_test = qq{\$_[0]->{gi} == $index};
5895            }
5896          else
5897          # end optimization
5898            { # it does not exist (but might be created later), compare the strings
5899              $gi_test = qq{ \$_[0]->gi eq "$gi"};
5900            }
5901        }
5902      else
5903        { $gi_test= 1; }
5904
5905      my $class_test='';
5906      #warn "class: '$class'";
5907      if( $class)
5908        { $class_test = qq{ defined( \$_[0]->{att}->{class}) && \$_[0]->{att}->{class}=~ m{\\b$class\\b} }; }
5909
5910      my $id_test='';
5911      #warn "id: '$id'";
5912      if( $id)
5913        { $id_test = qq{ defined( \$_[0]->{att}->{$ID}) && \$_[0]->{att}->{$ID} eq '$id' }; }
5914
5915
5916      #warn "gi_test: '$gi_test' - class_test: '$class_test' returning ",  _and( $gi_test, $class_test);
5917      return _and( $gi_test, $class_test, $id_test);
5918  }
5919
5920
5921  # input: the original predicate
5922  sub _parse_predicate_in_step
5923    { my $cond= shift;
5924      my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
5925
5926      $cond=~ s{^\s*\[\s*}{};
5927      $cond=~ s{\s*\]\s*$}{};
5928      $cond=~ s{(   ($REG_STRING|$REG_REGEXP)                # strings or regexps
5929                   |\@($REG_TAG_NAME)(?=\s*(?:[><=!]|!~|=~)) # @att (followed by a comparison operator)
5930                   |\@($REG_TAG_NAME)                        # @att (not followed by a comparison operator)
5931                   |=~|!~                                    # matching operators
5932                   |([><]=?|=|!=)(?=\s*[\d+-])               # test before a number
5933                   |([><]=?|=|!=)                            # test, other cases
5934                   |($REG_FUNCTION)                          # no arg functions
5935                   # this bit is a mess, but it is the only solution with this half-baked parser
5936                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child) =~ /regexp/
5937                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*!?=\s*$REG_VALUE)         # string( child) = "value" (or !=)
5938                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*[<>]=?\s*$REG_VALUE)      # string( child) > "value"
5939                   |(and|or)
5940                )}
5941               { my( $token, $string, $att, $bare_att, $num_test, $alpha_test, $func, $string_regexp, $string_eq, $string_test, $and_or)
5942                 = ( $1,     $2,      $3,   $4,        $5,        $6,          $7,    $8,             $9,         $10,          $11);
5943
5944                 if( defined $string)   { $token }
5945                 elsif( $att)           { "( \$_[0]->{att} && exists( \$_[0]->{att}->{'$att'}) && \$_[0]->{att}->{'$att'})"; }
5946                 elsif( $bare_att)      { "(\$_[0]->{att} && defined( \$_[0]->{att}->{'$bare_att'}))"; }
5947                 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged
5948                 elsif( $alpha_test)    { $PERL_ALPHA_TEST{$alpha_test} }
5949                 elsif( $func && $func=~ m{^(?:string|text)})
5950                                        { "\$_[0]->text"; }
5951                 elsif( $string_regexp && $string_regexp =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)})
5952                                        { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; }
5953                 elsif( $string_eq     && $string_eq     =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*(!?=)\s*($REG_VALUE)})
5954                                        {"(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $PERL_ALPHA_TEST{$2} $3) } 1, \$_[0]->_children)"; }
5955                 elsif( $string_test   && $string_test   =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*([<>]=?)\s*($REG_VALUE)})
5956                                        { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; }
5957                 elsif( $and_or)        { $and_or eq 'and' ? '&&' : '||' ; }
5958                 else                   { $token; }
5959               }gexs;
5960      return "($cond)";
5961    }
5962
5963
5964  sub _op
5965    { my $op= shift;
5966      if(    $op eq '=')  { $op= 'eq'; }
5967      elsif( $op eq '!=') { $op= 'ne'; }
5968      return $op;
5969    }
5970
5971  sub passes
5972    { my( $elt, $cond)= @_;
5973      return $elt unless $cond;
5974      my $sub= ($cond_cache{$cond} ||= _install_cond( $cond));
5975      return $sub->( $elt);
5976    }
5977}
5978
5979sub set_parent
5980  { $_[0]->{parent}= $_[1];
5981    if( $XML::Twig::weakrefs) { weaken( $_[0]->{parent}); }
5982  }
5983
5984sub parent
5985  { my $elt= shift;
5986    my $cond= shift || return $elt->{parent};
5987    do { $elt= $elt->{parent} || return; } until ( $elt->passes( $cond));
5988    return $elt;
5989  }
5990
5991sub set_first_child
5992  { $_[0]->{'first_child'}= $_[1];
5993  }
5994
5995sub first_child
5996  { my $elt= shift;
5997    my $cond= shift || return $elt->{first_child};
5998    my $child= $elt->{first_child};
5999    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
6000    while( $child && !$test_cond->( $child))
6001       { $child= $child->{next_sibling}; }
6002    return $child;
6003  }
6004
6005sub _first_child   { return $_[0]->{first_child};  }
6006sub _last_child    { return $_[0]->{last_child};   }
6007sub _next_sibling  { return $_[0]->{next_sibling}; }
6008sub _prev_sibling  { return $_[0]->{prev_sibling}; }
6009sub _parent        { return $_[0]->{parent};       }
6010sub _next_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{next_sibling}) { push @siblings, $elt; } return @siblings; }
6011sub _prev_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{prev_sibling}) { push @siblings, $elt; } return @siblings; }
6012
6013# sets a field
6014# arguments $record, $cond, @content
6015sub set_field
6016  { my $record = shift;
6017    my $cond = shift;
6018    my $child= $record->first_child( $cond);
6019    if( $child)
6020      { $child->set_content( @_); }
6021    else
6022      { if( $cond=~ m{^\s*($REG_TAG_NAME)})
6023          { my $gi= $1;
6024            $child= $record->insert_new_elt( last_child => $gi, @_);
6025          }
6026        else
6027          { croak "can't create a field name from $cond"; }
6028      }
6029    return $child;
6030  }
6031
6032sub set_last_child
6033  { $_[0]->{'last_child'}= $_[1];
6034    delete $_->[0]->{empty};
6035    if( $XML::Twig::weakrefs) { weaken( $_[0]->{'last_child'}); }
6036  }
6037
6038sub last_child
6039  { my $elt= shift;
6040    my $cond= shift || return $elt->{last_child};
6041    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
6042    my $child= $elt->{last_child};
6043    while( $child && !$test_cond->( $child) )
6044      { $child= $child->{prev_sibling}; }
6045    return $child
6046  }
6047
6048
6049sub set_prev_sibling
6050  { $_[0]->{'prev_sibling'}= $_[1];
6051    if( $XML::Twig::weakrefs) { weaken( $_[0]->{'prev_sibling'}); }
6052  }
6053
6054sub prev_sibling
6055  { my $elt= shift;
6056    my $cond= shift || return $elt->{prev_sibling};
6057    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
6058    my $sibling= $elt->{prev_sibling};
6059    while( $sibling && !$test_cond->( $sibling) )
6060          { $sibling= $sibling->{prev_sibling}; }
6061    return $sibling;
6062  }
6063
6064sub set_next_sibling { $_[0]->{'next_sibling'}= $_[1]; }
6065
6066sub next_sibling
6067  { my $elt= shift;
6068    my $cond= shift || return $elt->{next_sibling};
6069    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
6070    my $sibling= $elt->{next_sibling};
6071    while( $sibling && !$test_cond->( $sibling) )
6072          { $sibling= $sibling->{next_sibling}; }
6073    return $sibling;
6074  }
6075
6076# methods dealing with the class attribute, convenient if you work with xhtml
6077sub class   {   $_[0]->{att}->{class}; }
6078# lvalue version of class. separate from class to avoid problem like RT#
6079sub lclass
6080          :lvalue    # > perl 5.5
6081  { $_[0]->{att}->{class}; }
6082
6083sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); }
6084
6085# adds a class to an element
6086sub add_to_class
6087  { my( $elt, $new_class)= @_;
6088    return $elt unless $new_class;
6089    my $class= $elt->class;
6090    my %class= $class ? map { $_ => 1 } split /\s+/, $class : ();
6091    $class{$new_class}= 1;
6092    $elt->set_class( join( ' ', sort keys %class));
6093  }
6094
6095sub remove_class
6096  { my( $elt, $class_to_remove)= @_;
6097    return $elt unless $class_to_remove;
6098    my $class= $elt->class;
6099    my %class= $class ? map { $_ => 1 } split /\s+/, $class : ();
6100    delete $class{$class_to_remove};
6101    $elt->set_class( join( ' ', sort keys %class));
6102  }
6103
6104sub att_to_class      { my( $elt, $att)= @_; $elt->set_class( $elt->{'att'}->{$att}); }
6105sub add_att_to_class  { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); }
6106sub move_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att});
6107                        $elt->del_att( $att);
6108                      }
6109sub tag_to_class      { my( $elt)= @_; $elt->set_class( $elt->tag);    }
6110sub add_tag_to_class  { my( $elt)= @_; $elt->add_to_class( $elt->tag); }
6111sub set_tag_class     { my( $elt, $new_tag)= @_; $elt->add_tag_to_class; $elt->set_tag( $new_tag); }
6112
6113sub tag_to_span
6114  { my( $elt)= @_;
6115    $elt->set_class( $elt->tag) unless( $elt->tag eq 'span' && $elt->class); # set class to span unless it would mean replacing it with span
6116    $elt->set_tag( 'span');
6117  }
6118
6119sub tag_to_div
6120  { my( $elt)= @_;
6121    $elt->set_class( $elt->tag) unless( $elt->tag eq 'div' && $elt->class); # set class to div unless it would mean replacing it with div
6122    $elt->set_tag( 'div');
6123  }
6124
6125sub in_class
6126  { my( $elt, $class)= @_;
6127    my $elt_class= $elt->class;
6128    return unless( defined $elt_class);
6129    return $elt->class=~ m{(?:^|\s)\Q$class\E(?:\s|$)} ? $elt : 0;
6130  }
6131
6132
6133# get or set all attributes
6134# argument can be a hash or a hashref
6135sub set_atts
6136  { my $elt= shift;
6137    my %atts;
6138    tie %atts, 'Tie::IxHash' if( keep_atts_order());
6139    %atts= @_ == 1 ? %{$_[0]} : @_;
6140    $elt->{att}= \%atts;
6141    if( exists $atts{$ID}) { $elt->_set_id( $atts{$ID}); }
6142    return $elt;
6143  }
6144
6145sub atts      { return $_[0]->{att};                }
6146sub att_names { return (sort keys %{$_[0]->{att}}); }
6147sub del_atts  { $_[0]->{att}={}; return $_[0];      }
6148
6149# get or set a single attribute (set works for several atts)
6150sub set_att
6151  { my $elt= shift;
6152
6153    if( $_[0] && ref( $_[0]) && !$_[1])
6154      { croak "improper call to set_att, usage is \$elt->set_att( att1 => 'val1', att2 => 'val2',...)"; }
6155
6156    unless( $elt->{att})
6157      { $elt->{att}={};
6158        tie %{$elt->{att}}, 'Tie::IxHash' if( keep_atts_order());
6159      }
6160
6161    while(@_)
6162      { my( $att, $val)= (shift, shift);
6163        $elt->{att}->{$att}= $val;
6164        if( $att eq $ID) { $elt->_set_id( $val); }
6165      }
6166    return $elt;
6167  }
6168
6169sub att {  $_[0]->{att}->{$_[1]}; }
6170# lvalue version of att. separate from class to avoid problem like RT#
6171sub latt
6172          :lvalue    # > perl 5.5
6173  { $_[0]->{att}->{$_[1]}; }
6174
6175sub del_att
6176  { my $elt= shift;
6177    while( @_) { delete $elt->{'att'}->{shift()}; }
6178    return $elt;
6179  }
6180
6181sub att_exists { return exists  $_[0]->{att}->{$_[1]}; }
6182
6183# delete an attribute from all descendants of an element
6184sub strip_att
6185  { my( $elt, $att)= @_;
6186    $_->del_att( $att) foreach ($elt->descendants_or_self( qq{*[\@$att]}));
6187    return $elt;
6188  }
6189
6190sub change_att_name
6191  { my( $elt, $old_name, $new_name)= @_;
6192    my $value= $elt->{'att'}->{$old_name};
6193    return $elt unless( defined $value);
6194    $elt->del_att( $old_name)
6195        ->set_att( $new_name => $value);
6196    return $elt;
6197  }
6198
6199sub lc_attnames
6200  { my $elt= shift;
6201    foreach my $att ($elt->att_names)
6202      { if( $att ne lc $att) { $elt->change_att_name( $att, lc $att); } }
6203    return $elt;
6204  }
6205
6206sub set_twig_current { $_[0]->{twig_current}=1; }
6207sub del_twig_current { delete $_[0]->{twig_current}; }
6208
6209
6210# get or set the id attribute
6211sub set_id
6212  { my( $elt, $id)= @_;
6213    $elt->del_id() if( exists $elt->{att}->{$ID});
6214    $elt->set_att($ID, $id);
6215    $elt->_set_id( $id);
6216    return $elt;
6217  }
6218
6219# only set id, does not update the attribute value
6220sub _set_id
6221  { my( $elt, $id)= @_;
6222    my $t= $elt->twig || $elt;
6223    $t->{twig_id_list}->{$id}= $elt;
6224    if( $XML::Twig::weakrefs) { weaken(  $t->{twig_id_list}->{$id}); }
6225    return $elt;
6226  }
6227
6228sub id { return $_[0]->{att}->{$ID}; }
6229
6230# methods used to add ids to elements that don't have one
6231BEGIN
6232{ my $id_nb   = "0001";
6233  my $id_seed = "twig_id_";
6234
6235  sub set_id_seed ## no critic (Subroutines::ProhibitNestedSubs);
6236    { $id_seed= $_[1]; $id_nb=1; }
6237
6238  sub add_id ## no critic (Subroutines::ProhibitNestedSubs);
6239    { my $elt= shift;
6240      if( defined $elt->{'att'}->{$ID})
6241        { return $elt->{'att'}->{$ID}; }
6242      else
6243        { my $id= $_[0] && ref( $_[0]) && isa( $_[0], 'CODE') ? $_[0]->( $elt) : $id_seed . $id_nb++;
6244          $elt->set_id( $id);
6245          return $id;
6246        }
6247    }
6248}
6249
6250
6251
6252# delete the id attribute and remove the element from the id list
6253sub del_id
6254  { my $elt= shift;
6255    if( ! exists $elt->{att}->{$ID}) { return $elt };
6256    my $id= $elt->{att}->{$ID};
6257
6258    delete $elt->{att}->{$ID};
6259
6260    my $t= shift || $elt->twig;
6261    unless( $t) { return $elt; }
6262    if( exists $t->{twig_id_list}->{$id}) { delete $t->{twig_id_list}->{$id}; }
6263
6264    return $elt;
6265  }
6266
6267# return the list of children
6268sub children
6269  { my $elt= shift;
6270    my @children;
6271    my $child= $elt->first_child( @_);
6272    while( $child)
6273      { push @children, $child;
6274        $child= $child->next_sibling( @_);
6275      }
6276    return @children;
6277  }
6278
6279sub _children
6280  { my $elt= shift;
6281    my @children=();
6282    my $child= $elt->{first_child};
6283    while( $child)
6284      { push @children, $child;
6285        $child= $child->{next_sibling};
6286      }
6287    return @children;
6288  }
6289
6290sub children_copy
6291  { my $elt= shift;
6292    my @children;
6293    my $child= $elt->first_child( @_);
6294    while( $child)
6295      { push @children, $child->copy;
6296        $child= $child->next_sibling( @_);
6297      }
6298    return @children;
6299  }
6300
6301
6302sub children_count
6303  { my $elt= shift;
6304    my $cond= shift;
6305    my $count=0;
6306    my $child= $elt->{first_child};
6307    while( $child)
6308      { $count++ if( $child->passes( $cond));
6309        $child= $child->{next_sibling};
6310      }
6311    return $count;
6312  }
6313
6314sub children_text
6315  { my $elt= shift;
6316    return wantarray() ? map { $_->text} $elt->children( @_)
6317                       : join( '', map { $_->text} $elt->children( @_) )
6318                       ;
6319  }
6320
6321sub children_trimmed_text
6322  { my $elt= shift;
6323    return wantarray() ? map { $_->trimmed_text} $elt->children( @_)
6324                       : join( '', map { $_->trimmed_text} $elt->children( @_) )
6325                       ;
6326  }
6327
6328sub all_children_are
6329  { my( $parent, $cond)= @_;
6330    foreach my $child ($parent->_children)
6331      { return 0 unless( $child->passes( $cond)); }
6332    return $parent;
6333  }
6334
6335
6336sub ancestors
6337  { my( $elt, $cond)= @_;
6338    my @ancestors;
6339    while( $elt->{parent})
6340      { $elt= $elt->{parent};
6341        push @ancestors, $elt if( $elt->passes( $cond));
6342      }
6343    return @ancestors;
6344  }
6345
6346sub ancestors_or_self
6347  { my( $elt, $cond)= @_;
6348    my @ancestors;
6349    while( $elt)
6350      { push @ancestors, $elt if( $elt->passes( $cond));
6351        $elt= $elt->{parent};
6352      }
6353    return @ancestors;
6354  }
6355
6356
6357sub _ancestors
6358  { my( $elt, $include_self)= @_;
6359    my @ancestors= $include_self ? ($elt) : ();
6360    while( $elt= $elt->{parent}) { push @ancestors, $elt; }
6361    return @ancestors;
6362  }
6363
6364
6365sub inherit_att
6366  { my $elt= shift;
6367    my $att= shift;
6368    my %tags= map { ($_, 1) } @_;
6369
6370    do
6371      { if(   (defined $elt->{'att'}->{$att})
6372           && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]})
6373          )
6374          { return $elt->{'att'}->{$att}; }
6375      } while( $elt= $elt->{parent});
6376    return undef;
6377  }
6378
6379sub _inherit_att_through_cut
6380  { my $elt= shift;
6381    my $att= shift;
6382    my %tags= map { ($_, 1) } @_;
6383
6384    do
6385      { if(   (defined $elt->{'att'}->{$att})
6386           && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]})
6387          )
6388          { return $elt->{'att'}->{$att}; }
6389      } while( $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}));
6390    return undef;
6391  }
6392
6393
6394sub current_ns_prefixes
6395  { my $elt= shift;
6396    my %prefix;
6397    $prefix{''}=1 if( $elt->namespace( ''));
6398    while( $elt)
6399      { my @ns= grep { !m{^xml} } map { m{^([^:]+):} } ($XML::Twig::index2gi[$elt->{'gi'}], $elt->att_names);
6400        $prefix{$_}=1 foreach (@ns);
6401        $elt= $elt->{parent};
6402      }
6403
6404    return (sort keys %prefix);
6405  }
6406
6407# kinda counter-intuitive actually:
6408# the next element is found by looking for the next open tag after from the
6409# current one, which is the first child, if it exists, or the next sibling
6410# or the first next sibling of an ancestor
6411# optional arguments are:
6412#   - $subtree_root: a reference to an element, when the next element is not
6413#                    within $subtree_root anymore then next_elt returns undef
6414#   - $cond: a condition, next_elt returns the next element matching the condition
6415
6416sub next_elt
6417  { my $elt= shift;
6418    my $subtree_root= 0;
6419    $subtree_root= shift if( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'));
6420    my $cond= shift;
6421    my $next_elt;
6422
6423    my $ind;                                                              # optimization
6424    my $test_cond;
6425    if( $cond)                                                            # optimization
6426      { unless( defined( $ind= $XML::Twig::gi2index{$cond}) )             # optimization
6427          { $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); } # optimization
6428      }                                                                   # optimization
6429
6430    do
6431      { if( $next_elt= $elt->{first_child})
6432          { # simplest case: the elt has a child
6433          }
6434         elsif( $next_elt= $elt->{next_sibling})
6435          { # no child but a next sibling (just check we stay within the subtree)
6436
6437            # case where elt is subtree_root, is empty and has a sibling
6438            return undef if( $subtree_root && ($elt == $subtree_root));
6439
6440          }
6441        else
6442          { # case where the element has no child and no next sibling:
6443            # get the first next sibling of an ancestor, checking subtree_root
6444
6445            # case where elt is subtree_root, is empty and has no sibling
6446            return undef if( $subtree_root && ($elt == $subtree_root));
6447
6448            $next_elt= $elt->{parent} || return undef;
6449
6450            until( $next_elt->{next_sibling})
6451              { return undef if( $subtree_root && ($subtree_root == $next_elt));
6452                $next_elt= $next_elt->{parent} || return undef;
6453              }
6454            return undef if( $subtree_root && ($subtree_root == $next_elt));
6455            $next_elt= $next_elt->{next_sibling};
6456          }
6457      $elt= $next_elt;                   # just in case we need to loop
6458    } until(    ! defined $elt
6459             || ! defined $cond
6460         || (defined $ind       && ($elt->{gi} eq $ind))   # optimization
6461         || (defined $test_cond && ($test_cond->( $elt)))
6462               );
6463
6464      return $elt;
6465      }
6466
6467# return the next_elt within the element
6468# just call next_elt with the element as first and second argument
6469sub first_descendant { return $_[0]->next_elt( @_); }
6470
6471# get the last descendant, # then return the element found or call prev_elt with the condition
6472sub last_descendant
6473  { my( $elt, $cond)= @_;
6474    my $last_descendant= $elt->_last_descendant;
6475    if( !$cond || $last_descendant->matches( $cond))
6476      { return $last_descendant; }
6477    else
6478      { return $last_descendant->prev_elt( $elt, $cond); }
6479  }
6480
6481# no argument allowed here, just go down the last_child recursively
6482sub _last_descendant
6483  { my $elt= shift;
6484    while( my $child= $elt->{last_child}) { $elt= $child; }
6485    return $elt;
6486  }
6487
6488# counter-intuitive too:
6489# the previous element is found by looking
6490# for the first open tag backwards from the current one
6491# it's the last descendant of the previous sibling
6492# if it exists, otherwise it's simply the parent
6493sub prev_elt
6494  { my $elt= shift;
6495    my $subtree_root= 0;
6496    if( defined $_[0] and (ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')))
6497      { $subtree_root= shift ;
6498        return undef if( $elt == $subtree_root);
6499      }
6500    my $cond= shift;
6501    # get prev elt
6502    my $prev_elt;
6503    do
6504      { return undef if( $elt == $subtree_root);
6505        if( $prev_elt= $elt->{prev_sibling})
6506          { while( $prev_elt->{last_child})
6507              { $prev_elt= $prev_elt->{last_child}; }
6508          }
6509        else
6510          { $prev_elt= $elt->{parent} || return undef; }
6511        $elt= $prev_elt;     # in case we need to loop
6512      } until( $elt->passes( $cond));
6513
6514    return $elt;
6515  }
6516
6517sub _following_elt
6518  { my( $elt)= @_;
6519    while( $elt && !$elt->{next_sibling})
6520      { $elt= $elt->{parent}; }
6521    return $elt ? $elt->{next_sibling} : undef;
6522  }
6523
6524sub following_elt
6525  { my( $elt, $cond)= @_;
6526    $elt= $elt->_following_elt || return undef;
6527    return $elt if( !$cond || $elt->matches( $cond));
6528    return $elt->next_elt( $cond);
6529  }
6530
6531sub following_elts
6532  { my( $elt, $cond)= @_;
6533    if( !$cond) { undef $cond; }
6534    my $following= $elt->following_elt( $cond);
6535    if( $following)
6536      { my @followings= $following;
6537        while( $following= $following->next_elt( $cond))
6538          { push @followings, $following; }
6539        return( @followings);
6540      }
6541    else
6542      { return (); }
6543  }
6544
6545sub _preceding_elt
6546  { my( $elt)= @_;
6547    while( $elt && !$elt->{prev_sibling})
6548      { $elt= $elt->{parent}; }
6549    return $elt ? $elt->{prev_sibling}->_last_descendant : undef;
6550  }
6551
6552sub preceding_elt
6553  { my( $elt, $cond)= @_;
6554    $elt= $elt->_preceding_elt || return undef;
6555    return $elt if( !$cond || $elt->matches( $cond));
6556    return $elt->prev_elt( $cond);
6557  }
6558
6559sub preceding_elts
6560  { my( $elt, $cond)= @_;
6561    if( !$cond) { undef $cond; }
6562    my $preceding= $elt->preceding_elt( $cond);
6563    if( $preceding)
6564      { my @precedings= $preceding;
6565        while( $preceding= $preceding->prev_elt( $cond))
6566          { push @precedings, $preceding; }
6567        return( @precedings);
6568      }
6569    else
6570      { return (); }
6571  }
6572
6573# used in get_xpath
6574sub _self
6575  { my( $elt, $cond)= @_;
6576    return $cond ? $elt->matches( $cond) : $elt;
6577  }
6578
6579sub next_n_elt
6580  { my $elt= shift;
6581    my $offset= shift || return undef;
6582    foreach (1..$offset)
6583      { $elt= $elt->next_elt( @_) || return undef; }
6584    return $elt;
6585  }
6586
6587# checks whether $elt is included in $ancestor, returns 1 in that case
6588sub in
6589  { my ($elt, $ancestor)= @_;
6590    if( ref( $ancestor) && isa( $ancestor, 'XML::Twig::Elt'))
6591      { # element
6592        while( $elt= $elt->{parent}) { return $elt if( $elt ==  $ancestor); }
6593      }
6594    else
6595      { # condition
6596        while( $elt= $elt->{parent}) { return $elt if( $elt->matches( $ancestor)); }
6597      }
6598    return 0;
6599  }
6600
6601sub first_child_text
6602  { my $elt= shift;
6603    my $dest=$elt->first_child(@_) or return '';
6604    return $dest->text;
6605  }
6606
6607sub fields
6608  { my $elt= shift;
6609    return map { $elt->field( $_) } @_;
6610  }
6611
6612sub first_child_trimmed_text
6613  { my $elt= shift;
6614    my $dest=$elt->first_child(@_) or return '';
6615    return $dest->trimmed_text;
6616  }
6617
6618sub first_child_matches
6619  { my $elt= shift;
6620    my $dest= $elt->{first_child} or return undef;
6621    return $dest->passes( @_);
6622  }
6623
6624sub last_child_text
6625  { my $elt= shift;
6626    my $dest=$elt->last_child(@_) or return '';
6627    return $dest->text;
6628  }
6629
6630sub last_child_trimmed_text
6631  { my $elt= shift;
6632    my $dest=$elt->last_child(@_) or return '';
6633    return $dest->trimmed_text;
6634  }
6635
6636sub last_child_matches
6637  { my $elt= shift;
6638    my $dest= $elt->{last_child} or return undef;
6639    return $dest->passes( @_);
6640  }
6641
6642sub child_text
6643  { my $elt= shift;
6644    my $dest=$elt->child(@_) or return '';
6645    return $dest->text;
6646  }
6647
6648sub child_trimmed_text
6649  { my $elt= shift;
6650    my $dest=$elt->child(@_) or return '';
6651    return $dest->trimmed_text;
6652  }
6653
6654sub child_matches
6655  { my $elt= shift;
6656    my $nb= shift;
6657    my $dest= $elt->child( $nb) or return undef;
6658    return $dest->passes( @_);
6659  }
6660
6661sub prev_sibling_text
6662  { my $elt= shift;
6663    my $dest= $elt->_prev_sibling(@_) or return '';
6664    return $dest->text;
6665  }
6666
6667sub prev_sibling_trimmed_text
6668  { my $elt= shift;
6669    my $dest= $elt->_prev_sibling(@_) or return '';
6670    return $dest->trimmed_text;
6671  }
6672
6673sub prev_sibling_matches
6674  { my $elt= shift;
6675    my $dest= $elt->{prev_sibling} or return undef;
6676    return $dest->passes( @_);
6677  }
6678
6679sub next_sibling_text
6680  { my $elt= shift;
6681    my $dest= $elt->next_sibling(@_) or return '';
6682    return $dest->text;
6683  }
6684
6685sub next_sibling_trimmed_text
6686  { my $elt= shift;
6687    my $dest= $elt->next_sibling(@_) or return '';
6688    return $dest->trimmed_text;
6689  }
6690
6691sub next_sibling_matches
6692  { my $elt= shift;
6693    my $dest= $elt->{next_sibling} or return undef;
6694    return $dest->passes( @_);
6695  }
6696
6697sub prev_elt_text
6698  { my $elt= shift;
6699    my $dest= $elt->prev_elt(@_) or return '';
6700    return $dest->text;
6701  }
6702
6703sub prev_elt_trimmed_text
6704  { my $elt= shift;
6705    my $dest= $elt->prev_elt(@_) or return '';
6706    return $dest->trimmed_text;
6707  }
6708
6709sub prev_elt_matches
6710  { my $elt= shift;
6711    my $dest= $elt->prev_elt or return undef;
6712    return $dest->passes( @_);
6713  }
6714
6715sub next_elt_text
6716  { my $elt= shift;
6717    my $dest= $elt->next_elt(@_) or return '';
6718    return $dest->text;
6719  }
6720
6721sub next_elt_trimmed_text
6722  { my $elt= shift;
6723    my $dest= $elt->next_elt(@_) or return '';
6724    return $dest->trimmed_text;
6725  }
6726
6727sub next_elt_matches
6728  { my $elt= shift;
6729    my $dest= $elt->next_elt or return undef;
6730    return $dest->passes( @_);
6731  }
6732
6733sub parent_text
6734  { my $elt= shift;
6735    my $dest= $elt->parent(@_) or return '';
6736    return $dest->text;
6737  }
6738
6739sub parent_trimmed_text
6740  { my $elt= shift;
6741    my $dest= $elt->parent(@_) or return '';
6742    return $dest->trimmed_text;
6743  }
6744
6745sub parent_matches
6746  { my $elt= shift;
6747    my $dest= $elt->{parent} or return undef;
6748    return $dest->passes( @_);
6749  }
6750
6751sub is_first_child
6752  { my $elt= shift;
6753    my $parent= $elt->{parent} or return 0;
6754    my $first_child= $parent->first_child( @_) or return 0;
6755    return ($first_child == $elt) ? $elt : 0;
6756  }
6757
6758sub is_last_child
6759  { my $elt= shift;
6760    my $parent= $elt->{parent} or return 0;
6761    my $last_child= $parent->last_child( @_) or return 0;
6762    return ($last_child == $elt) ? $elt : 0;
6763  }
6764
6765# returns the depth level of the element
6766# if 2 parameter are used then counts the 2cd element name in the
6767# ancestors list
6768sub level
6769  { my( $elt, $cond)= @_;
6770    my $level=0;
6771    my $name=shift || '';
6772    while( $elt= $elt->{parent}) { $level++ if( !$cond || $elt->matches( $cond)); }
6773    return $level;
6774  }
6775
6776# checks whether $elt has an ancestor that satisfies $cond, returns the ancestor
6777sub in_context
6778  { my ($elt, $cond, $level)= @_;
6779    $level= -1 unless( $level) ;  # $level-- will never hit 0
6780
6781    while( $level)
6782      { $elt= $elt->{parent} or return 0;
6783        if( $elt->matches( $cond)) { return $elt; }
6784        $level--;
6785      }
6786    return 0;
6787  }
6788
6789sub _descendants
6790  { my( $subtree_root, $include_self)= @_;
6791    my @descendants= $include_self ? ($subtree_root) : ();
6792
6793    my $elt= $subtree_root;
6794    my $next_elt;
6795
6796    MAIN: while( 1)
6797      { if( $next_elt= $elt->{first_child})
6798          { # simplest case: the elt has a child
6799          }
6800        elsif( $next_elt= $elt->{next_sibling})
6801          { # no child but a next sibling (just check we stay within the subtree)
6802
6803            # case where elt is subtree_root, is empty and has a sibling
6804            last MAIN if( $elt == $subtree_root);
6805          }
6806        else
6807          { # case where the element has no child and no next sibling:
6808            # get the first next sibling of an ancestor, checking subtree_root
6809
6810            # case where elt is subtree_root, is empty and has no sibling
6811            last MAIN if( $elt == $subtree_root);
6812
6813            # backtrack until we find a parent with a next sibling
6814            $next_elt= $elt->{parent} || last;
6815            until( $next_elt->{next_sibling})
6816              { last MAIN if( $subtree_root == $next_elt);
6817                $next_elt= $next_elt->{parent} || last MAIN;
6818              }
6819            last MAIN if( $subtree_root == $next_elt);
6820            $next_elt= $next_elt->{next_sibling};
6821          }
6822        $elt= $next_elt || last MAIN;
6823        push @descendants, $elt;
6824      }
6825    return @descendants;
6826  }
6827
6828
6829sub descendants
6830  { my( $subtree_root, $cond)= @_;
6831    my @descendants=();
6832    my $elt= $subtree_root;
6833
6834    # this branch is pure optimization for speed: if $cond is a gi replace it
6835    # by the index of the gi and loop here
6836    # start optimization
6837    my $ind;
6838    if( !$cond || ( defined ( $ind= $XML::Twig::gi2index{$cond})) )
6839      {
6840        my $next_elt;
6841
6842        while( 1)
6843          { if( $next_elt= $elt->{first_child})
6844                { # simplest case: the elt has a child
6845                }
6846             elsif( $next_elt= $elt->{next_sibling})
6847              { # no child but a next sibling (just check we stay within the subtree)
6848
6849                # case where elt is subtree_root, is empty and has a sibling
6850                last if( $subtree_root && ($elt == $subtree_root));
6851              }
6852            else
6853              { # case where the element has no child and no next sibling:
6854                # get the first next sibling of an ancestor, checking subtree_root
6855
6856                # case where elt is subtree_root, is empty and has no sibling
6857                last if( $subtree_root && ($elt == $subtree_root));
6858
6859                # backtrack until we find a parent with a next sibling
6860                $next_elt= $elt->{parent} || last undef;
6861                until( $next_elt->{next_sibling})
6862                  { last if( $subtree_root && ($subtree_root == $next_elt));
6863                    $next_elt= $next_elt->{parent} || last;
6864                  }
6865                last if( $subtree_root && ($subtree_root == $next_elt));
6866                $next_elt= $next_elt->{next_sibling};
6867              }
6868            $elt= $next_elt || last;
6869            push @descendants, $elt if( !$cond || ($elt->{gi} eq $ind));
6870          }
6871      }
6872    else
6873    # end optimization
6874      { # branch for a complex condition: use the regular (slow but simple) way
6875        while( $elt= $elt->next_elt( $subtree_root, $cond))
6876          { push @descendants, $elt; }
6877      }
6878    return @descendants;
6879  }
6880
6881
6882sub descendants_or_self
6883  { my( $elt, $cond)= @_;
6884    my @descendants= $elt->passes( $cond) ? ($elt) : ();
6885    push @descendants, $elt->descendants( $cond);
6886    return @descendants;
6887  }
6888
6889sub sibling
6890  { my $elt= shift;
6891    my $nb= shift;
6892    if( $nb > 0)
6893      { foreach( 1..$nb)
6894          { $elt= $elt->next_sibling( @_) or return undef; }
6895      }
6896    elsif( $nb < 0)
6897      { foreach( 1..(-$nb))
6898          { $elt= $elt->prev_sibling( @_) or return undef; }
6899      }
6900    else # $nb == 0
6901      { return $elt->passes( $_[0]); }
6902    return $elt;
6903  }
6904
6905sub sibling_text
6906  { my $elt= sibling( @_);
6907    return $elt ? $elt->text : undef;
6908  }
6909
6910
6911sub child
6912  { my $elt= shift;
6913    my $nb= shift;
6914    if( $nb >= 0)
6915      { $elt= $elt->first_child( @_) or return undef;
6916        foreach( 1..$nb)
6917          { $elt= $elt->next_sibling( @_) or return undef; }
6918      }
6919    else
6920      { $elt= $elt->last_child( @_) or return undef;
6921        foreach( 2..(-$nb))
6922          { $elt= $elt->prev_sibling( @_) or return undef; }
6923      }
6924    return $elt;
6925  }
6926
6927sub prev_siblings
6928  { my $elt= shift;
6929    my @siblings=();
6930    while( $elt= $elt->prev_sibling( @_))
6931      { unshift @siblings, $elt; }
6932    return @siblings;
6933  }
6934
6935sub siblings
6936  { my $elt= shift;
6937    return grep { $_ ne $elt } $elt->{parent}->children( @_);
6938  }
6939
6940sub pos
6941  { my $elt= shift;
6942    return 0 if ($_[0] && !$elt->matches( @_));
6943    my $pos=1;
6944    $pos++ while( $elt= $elt->prev_sibling( @_));
6945    return $pos;
6946  }
6947
6948
6949sub next_siblings
6950  { my $elt= shift;
6951    my @siblings=();
6952    while( $elt= $elt->next_sibling( @_))
6953      { push @siblings, $elt; }
6954    return @siblings;
6955  }
6956
6957
6958# used by get_xpath: parses the xpath expression and generates a sub that performs the
6959# search
6960{ my %axis2method;
6961  BEGIN { %axis2method= ( child               => 'children',
6962                          descendant          => 'descendants',
6963                         'descendant-or-self' => 'descendants_or_self',
6964                          parent              => 'parent_is',
6965                          ancestor            => 'ancestors',
6966                         'ancestor-or-self'   => 'ancestors_or_self',
6967                         'following-sibling'  => 'next_siblings',
6968                         'preceding-sibling'  => 'prev_siblings',
6969                          following           => 'following_elts',
6970                          preceding           => 'preceding_elts',
6971                          self                => '_self',
6972                        );
6973        }
6974
6975  sub _install_xpath
6976    { my( $xpath_exp, $type)= @_;
6977      my $original_exp= $xpath_exp;
6978      my $sub= 'my $elt= shift; my @results;';
6979
6980      # grab the root if expression starts with a /
6981      if( $xpath_exp=~ s{^/}{})
6982        { $sub .= '@results= ($elt->twig) || croak "cannot use an XPath query starting with a / on a node not attached to a whole twig";'; }
6983      elsif( $xpath_exp=~ s{^\./}{})
6984        { $sub .= '@results= ($elt);'; }
6985      else
6986        { $sub .= '@results= ($elt);'; }
6987
6988
6989     #warn "xpath_exp= '$xpath_exp'\n";
6990
6991      while( $xpath_exp &&
6992             $xpath_exp=~s{^\s*(/?)
6993                            # the xxx=~/regexp/ is a pain as it includes /
6994                            (\s*(?:(?:($REG_AXIS)::)?(\*|$REG_TAG_PART|\.\.|\.)\s*)?($REG_PREDICATE_ALT*)
6995                            )
6996                            (/|$)}{}xo)
6997
6998        { my( $wildcard, $sub_exp, $axis, $gi, $predicates)= ($1, $2, $3, $4, $5);
6999           if( $axis && ! $gi)
7000                { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp"); }
7001
7002          # grab a parent
7003          if( $sub_exp eq '..')
7004            { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp") if( $wildcard);
7005              $sub .= '@results= map { $_->{parent}} @results;';
7006            }
7007          # test the element itself
7008          elsif( $sub_exp=~ m{^\.(.*)$}s)
7009            { $sub .= "\@results= grep { \$_->matches( q{$1}) } \@results;" }
7010          # grab children
7011          else
7012            {
7013              if( !$axis)
7014                { $axis= $wildcard ? 'descendant' : 'child'; }
7015              if( !$gi or $gi eq '*') { $gi=''; }
7016              my $function;
7017
7018              # "special" predicates, that return just one element
7019              if( $predicates && ($predicates =~ m{^\s*\[\s*((-\s*)?\d+)\s*\]\s*$}))
7020                { # [<nb>]
7021                  my $offset= $1;
7022                  $offset-- if( $offset > 0);
7023                  $function=  $axis eq 'descendant' ? "next_n_elt( $offset, '$gi')"
7024                           :  $axis eq 'child'      ? "child( $offset, '$gi')"
7025                           :                          _croak_and_doublecheck_xpath( $original_exp, "error [$1] not supported along axis '$axis'")
7026                           ;
7027                  $sub .= "\@results= grep { \$_ } map { \$_->$function } \@results;"
7028                }
7029              elsif( $predicates && ($predicates =~ m{^\s*\[\s*last\s*\(\s*\)\s*\]\s*$}) )
7030                { # last()
7031                  _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp, usage of // and last() not supported") if( $wildcard);
7032                   $sub .= "\@results= map { \$_->last_child( '$gi') } \@results;";
7033                }
7034              else
7035                { # follow the axis
7036                  #warn "axis: '$axis' - method: '$axis2method{$axis}' - gi: '$gi'\n";
7037
7038                  my $follow_axis= " \$_->$axis2method{$axis}( '$gi')";
7039                  my $step= $follow_axis;
7040
7041                  # now filter using the predicate
7042                  while( $predicates=~ s{^\s*($REG_PREDICATE_ALT)\s*}{}o)
7043                    { my $pred= $1;
7044                      $pred=~ s{^\s*\[\s*}{};
7045                      $pred=~ s{\s*\]\s*$}{};
7046                      my $test="";
7047                      my $pos;
7048                      if( $pred=~ m{^(-?\s*\d+)$})
7049                        { my $pos= $1;
7050                          if( $step=~ m{^\s*grep(.*) (\$_->\w+\(\s*'[^']*'\s*\))})
7051                            { $step= "XML::Twig::_first_n $1 $pos, $2"; }
7052                          else
7053                            { if( $pos > 0) { $pos--; }
7054                              $step= "($step)[$pos]";
7055                            }
7056                          #warn "number predicate '$pos' - generated step '$step'\n";
7057                        }
7058                      else
7059                        { my $syntax_error=0;
7060                          do
7061                            { if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_STRING)\s*}{}o)  # string()="string" pred
7062                                { $test .= "\$_->text eq $1"; }
7063                              elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_STRING)\s*}{}o)  # string()!="string" pred
7064                                { $test .= "\$_->text ne $1"; }
7065                              if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_NUMBER)\s*}{}o)  # string()=<number> pred
7066                                { $test .= "\$_->text eq $1"; }
7067                              elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_NUMBER)\s*}{}o)  # string()!=<number> pred
7068                                { $test .= "\$_->text ne $1"; }
7069                              elsif( $pred =~ s{^string\(\s*\)\s*(>|<|>=|<=)\s*($REG_NUMBER)\s*}{}o)  # string()!=<number> pred
7070                                { $test .= "\$_->text $1 $2"; }
7071
7072                             elsif( $pred =~ s{^string\(\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o)  # string()=~/regex/ pred
7073                                { my( $match, $regexp)= ($1, $2);
7074                                  $test .= "\$_->text $match $regexp";
7075                                }
7076                              elsif( $pred =~ s{^string\(\s*\)\s*}{}o)  # string() pred
7077                                { $test .= "\$_->text"; }
7078                             elsif( $pred=~ s{^@($REG_TAG_NAME)\s*($REG_OP)\s*($REG_STRING|$REG_NUMBER)}{}o)  # @att="val" pred
7079                                { my( $att, $oper, $val)= ($1, _op( $2), $3);
7080                                  $test .= qq{((defined \$_->{'att'}->{"$att"})  && (\$_->{'att'}->{"$att"} $oper $val))};
7081                                }
7082                             elsif( $pred =~ s{^@($REG_TAG_NAME)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o)  # @att=~/regex/ pred XXX
7083                                { my( $att, $match, $regexp)= ($1, $2, $3);
7084                                  $test .= qq{((defined \$_->{'att'}->{"$att"})  && (\$_->{'att'}->{"$att"} $match $regexp))};;
7085                                }
7086                             elsif( $pred=~ s{^@($REG_TAG_NAME)\s*}{}o)                      # @att pred
7087                                { $test .= qq{(defined \$_->{'att'}->{"$1"})}; }
7088                             elsif( $pred=~ s{^\s*(?:not|!)\s*@($REG_TAG_NAME)\s*}{}o)       # not @att pred
7089                                { $test .= qq{((\$_->is_elt) && (not defined \$_->{'att'}->{"$1"}))}; }
7090                              elsif( $pred=~ s{^\s*([()])}{})                            # ( or ) (just add to the test)
7091                                { $test .= qq{$1};           }
7092                              elsif( $pred=~ s{^\s*(and|or)\s*}{})
7093                                { $test .= lc " $1 "; }
7094                              else
7095                                { $syntax_error=1; }
7096
7097                             } while( !$syntax_error && $pred);
7098                           _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp at $pred") if( $pred);
7099                           $step= " grep { $test } $step ";
7100                        }
7101                    }
7102                  #warn "step: '$step'";
7103                  $sub .= "\@results= grep defined, map { $step } \@results;";
7104                }
7105            }
7106        }
7107
7108      if( $xpath_exp)
7109        { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp around $xpath_exp"); }
7110
7111      $sub .= q{return XML::Twig::_unique_elts( @results); };
7112      #warn "generated: '$sub'\n";
7113      my $s= eval "sub { $NO_WARNINGS; $sub }";
7114      if( $@)
7115        { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp ($@);") }
7116      return( $s);
7117    }
7118}
7119
7120sub _croak_and_doublecheck_xpath
7121  { my $xpath_expression= shift;
7122    my $mess= join( "\n", @_);
7123    if( $XML::Twig::XPath::VERSION || 0)
7124      { my $check_twig= XML::Twig::XPath->new;
7125        if( eval { $check_twig->{twig_xp}->_parse( $xpath_expression) })
7126          { $mess .= "\nthe expression is a valid XPath statement, and you are using XML::Twig::XPath, but"
7127                   . "\nyou are using either 'find_nodes' or 'get_xpath' where the method you likely wanted"
7128                   . "\nto use is 'findnodes', which is the only one that uses the full XPath engine\n";
7129          }
7130      }
7131    croak $mess;
7132  }
7133
7134
7135
7136{ # extremely elaborate caching mechanism
7137  my %xpath; # xpath_expression => subroutine_code;
7138  sub get_xpath
7139    { my( $elt, $xpath_exp, $offset)= @_;
7140      my $sub= ($xpath{$xpath_exp} ||= _install_xpath( $xpath_exp));
7141      return $sub->( $elt) unless( defined $offset);
7142      my @res= $sub->( $elt);
7143      return $res[$offset];
7144    }
7145}
7146
7147
7148sub findvalues
7149  { my $elt= shift;
7150    return map { $_->text } $elt->get_xpath( @_);
7151  }
7152
7153sub findvalue
7154  { my $elt= shift;
7155    return join '', map { $_->text } $elt->get_xpath( @_);
7156  }
7157
7158
7159# XML::XPath compatibility
7160sub getElementById     { return $_[0]->twig->elt_id( $_[1]); }
7161sub getChildNodes      { my @children= do { my $elt= $_[0]; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; return wantarray ? @children : \@children; }
7162
7163sub _flushed     { return $_[0]->{flushed}; }
7164sub _set_flushed { $_[0]->{flushed}=1;      }
7165sub _del_flushed { delete $_[0]->{flushed}; }
7166
7167sub cut
7168  { my $elt= shift;
7169    my( $parent, $prev_sibling, $next_sibling);
7170    $parent=  $elt->{parent};
7171    if( ! $parent && $elt->is_elt)
7172      { # are we cutting the root?
7173        my $t= $elt->{twig};
7174        if( $t && ! $t->{twig_parsing})
7175          { delete $t->{twig_root};
7176            delete $elt->{twig};
7177            return $elt;
7178          }  # cutt`ing the root
7179        else
7180          { return;  }  # cutting an orphan, returning $elt would break backward compatibility
7181      }
7182
7183    # save the old links, that'll make it easier for some loops
7184    foreach my $link ( qw(parent prev_sibling next_sibling) )
7185      { $elt->{former}->{$link}= $elt->{$link};
7186         if( $XML::Twig::weakrefs) { weaken( $elt->{former}->{$link}); }
7187      }
7188
7189    # if we cut the current element then its parent becomes the current elt
7190    if( $elt->{twig_current})
7191      { my $twig_current= $elt->{parent};
7192        $elt->twig->{twig_current}= $twig_current;
7193        $twig_current->{'twig_current'}=1;
7194        delete $elt->{'twig_current'};
7195      }
7196
7197    if( $parent->{first_child} && $parent->{first_child} == $elt)
7198      { $parent->{first_child}=  $elt->{next_sibling};
7199        # cutting can make the parent empty
7200        if( ! $parent->{first_child}) { $parent->{empty}=  1; }
7201      }
7202
7203    if( $parent->{last_child} && $parent->{last_child} == $elt)
7204      {  delete $parent->{empty}; $parent->{last_child}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
7205      }
7206
7207    if( $prev_sibling= $elt->{prev_sibling})
7208      { $prev_sibling->{next_sibling}=  $elt->{next_sibling}; }
7209    if( $next_sibling= $elt->{next_sibling})
7210      { $next_sibling->{prev_sibling}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
7211
7212
7213    $elt->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7214    $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7215    $elt->{next_sibling}=  undef;
7216
7217    # merge 2 (now) consecutive text nodes if they are of the same type
7218    # (type can be PCDATA or CDATA)
7219    if( $prev_sibling && $next_sibling && $prev_sibling->is_text && ( $XML::Twig::index2gi[$prev_sibling->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}]))
7220      { $prev_sibling->merge_text( $next_sibling); }
7221
7222    return $elt;
7223  }
7224
7225
7226sub former_next_sibling { return $_[0]->{former}->{next_sibling}; }
7227sub former_prev_sibling { return $_[0]->{former}->{prev_sibling}; }
7228sub former_parent       { return $_[0]->{former}->{parent};       }
7229
7230sub cut_children
7231  { my( $elt, $exp)= @_;
7232    my @children= $elt->children( $exp);
7233    foreach (@children) { $_->cut; }
7234    if( ! $elt->has_children) { $elt->{empty}=  1; }
7235    return @children;
7236  }
7237
7238sub cut_descendants
7239  { my( $elt, $exp)= @_;
7240    my @descendants= $elt->descendants( $exp);
7241    foreach ($elt->descendants( $exp)) { $_->cut; }
7242    if( ! $elt->has_children) { $elt->{empty}=  1; }
7243    return @descendants;
7244  }
7245
7246
7247sub erase
7248  { my $elt= shift;
7249    #you cannot erase the current element
7250    if( $elt->{twig_current})
7251      { croak "trying to erase an element before it has been completely parsed"; }
7252    if( my $parent= $elt->{parent})
7253      { # normal case
7254        $elt->_move_extra_data_after_erase;
7255        my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7256        if( @children)
7257          {
7258            # elt has children, move them up
7259
7260            # the first child may need to be merged with a previous text
7261            my $first_child= shift @children;
7262            $first_child->move( before => $elt);
7263            my $prev= $first_child->{prev_sibling};
7264            if( $prev && $prev->is_text && ($XML::Twig::index2gi[$first_child->{'gi'}] eq $XML::Twig::index2gi[$prev->{'gi'}]) )
7265              { $prev->merge_text( $first_child); }
7266
7267            # move the rest of the children
7268            foreach my $child (@children)
7269              { $child->move( before => $elt); }
7270
7271            # now the elt had no child, delete it
7272            $elt->delete;
7273
7274            # now see if we need to merge the last child with the next element
7275            my $last_child= $children[-1] || $first_child; # if no last child, then it's also the first child
7276            my $next= $last_child->{next_sibling};
7277            if( $next && $next->is_text && ($XML::Twig::index2gi[$last_child->{'gi'}] eq $XML::Twig::index2gi[$next->{'gi'}]) )
7278              { $last_child->merge_text( $next); }
7279
7280            # if parsing and have now a PCDATA text, mark so we can normalize later on if need be
7281            if( $parent->{twig_current} && $last_child->is_text) {  $parent->{twig_to_be_normalized}=1; }
7282          }
7283       else
7284         { # no children, just cut the elt
7285           $elt->delete;
7286         }
7287      }
7288    else
7289      { # trying to erase the root (of a twig or of a cut/new element)
7290        my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7291        unless( @children == 1)
7292          { croak "can only erase an element with no parent if it has a single child"; }
7293        $elt->_move_extra_data_after_erase;
7294        my $child= shift @children;
7295        $child->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ;
7296        my $twig= $elt->twig;
7297        $twig->set_root( $child);
7298      }
7299
7300    return $elt;
7301
7302  }
7303
7304sub _move_extra_data_after_erase
7305  { my( $elt)= @_;
7306    # extra_data
7307    if( my $extra_data= $elt->{extra_data})
7308      { my $target= $elt->{first_child} || $elt->{next_sibling};
7309        if( $target)
7310          {
7311            if( $target->is( $ELT))
7312              { $target->set_extra_data( $extra_data . ($target->extra_data || '')); }
7313            elsif( $target->is( $TEXT))
7314              { $target->_unshift_extra_data_in_pcdata( $extra_data, 0); }  # TO CHECK
7315          }
7316        else
7317          { my $parent= $elt->{parent}; # always exists or the erase cannot be performed
7318            $parent->_prefix_extra_data_before_end_tag( $extra_data);
7319          }
7320      }
7321
7322     # extra_data_before_end_tag
7323    if( my $extra_data= $elt->{extra_data_before_end_tag})
7324      { if( my $target= $elt->{next_sibling})
7325          { if( $target->is( $ELT))
7326              { $target->set_extra_data( $extra_data . ($target->extra_data || '')); }
7327            elsif( $target->is( $TEXT))
7328              {
7329                $target->_unshift_extra_data_in_pcdata( $extra_data, 0);
7330             }
7331          }
7332        elsif( my $parent= $elt->{parent})
7333          { $parent->_prefix_extra_data_before_end_tag( $extra_data); }
7334       }
7335
7336    return $elt;
7337
7338  }
7339BEGIN
7340  { my %method= ( before      => \&paste_before,
7341                  after       => \&paste_after,
7342                  first_child => \&paste_first_child,
7343                  last_child  => \&paste_last_child,
7344                  within      => \&paste_within,
7345        );
7346
7347    # paste elt somewhere around ref
7348    # pos can be first_child (default), last_child, before, after or within
7349    sub paste ## no critic (Subroutines::ProhibitNestedSubs);
7350      { my $elt= shift;
7351        if( $elt->{parent})
7352          { croak "cannot paste an element that belongs to a tree"; }
7353        my $pos;
7354        my $ref;
7355        if( ref $_[0])
7356          { $pos= 'first_child';
7357            croak "wrong argument order in paste, should be $_[1] first" if($_[1]);
7358          }
7359        else
7360          { $pos= shift; }
7361
7362        if( my $method= $method{$pos})
7363          {
7364            unless( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'))
7365              { if( ! defined( $_[0]))
7366                  { croak "missing target in paste"; }
7367                elsif( ! ref( $_[0]))
7368                  { croak "wrong target type in paste (not a reference), should be XML::Twig::Elt or a subclass"; }
7369                else
7370                  { my $ref= ref $_[0];
7371                    croak "wrong target type in paste: '$ref', should be XML::Twig::Elt or a subclass";
7372                  }
7373              }
7374            $ref= $_[0];
7375            # check here so error message lists the caller file/line
7376            if( !$ref->{parent} && ($pos=~ m{^(before|after)$}) && !(exists $elt->{'target'}) && !(exists $elt->{'comment'}))
7377              { croak "cannot paste $1 root"; }
7378            $elt->$method( @_);
7379          }
7380        else
7381          { croak "tried to paste in wrong position '$pos', allowed positions " .
7382              " are 'first_child', 'last_child', 'before', 'after' and "    .
7383              "'within'";
7384          }
7385        if( (my $ids= $elt->{twig_id_list}) && (my $t= $ref->twig) )
7386          { $t->{twig_id_list}||={};
7387            foreach my $id (keys %$ids)
7388              { $t->{twig_id_list}->{$id}= $ids->{$id};
7389                if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); }
7390              }
7391          }
7392        return $elt;
7393      }
7394
7395
7396    sub paste_before
7397      { my( $elt, $ref)= @_;
7398        my( $parent, $prev_sibling, $next_sibling );
7399
7400        # trying to paste before an orphan (root or detached wlt)
7401        unless( $ref->{parent})
7402          { if( my $t= $ref->twig)
7403              { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this
7404                  { $t->_add_cpi_outside_of_root( leading_cpi => $elt); return; }
7405                else
7406                  { croak "cannot paste before root"; }
7407              }
7408            else
7409              { croak "cannot paste before an orphan element"; }
7410          }
7411        $parent= $ref->{parent};
7412        $prev_sibling= $ref->{prev_sibling};
7413        $next_sibling= $ref;
7414
7415        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7416        if( $parent->{first_child} == $ref) { $parent->{first_child}=  $elt; }
7417
7418        if( $prev_sibling) { $prev_sibling->{next_sibling}=  $elt; }
7419        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7420
7421        $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
7422        $elt->{next_sibling}=  $ref;
7423        return $elt;
7424      }
7425
7426     sub paste_after
7427      { my( $elt, $ref)= @_;
7428        my( $parent, $prev_sibling, $next_sibling );
7429
7430        # trying to paste after an orphan (root or detached wlt)
7431        unless( $ref->{parent})
7432            { if( my $t= $ref->twig)
7433                { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this
7434                    { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); return; }
7435                  else
7436                    { croak "cannot paste after root"; }
7437                }
7438              else
7439                { croak "cannot paste after an orphan element"; }
7440            }
7441        $parent= $ref->{parent};
7442        $prev_sibling= $ref;
7443        $next_sibling= $ref->{next_sibling};
7444
7445        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7446        if( $parent->{last_child}== $ref) {  delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; }
7447
7448        $prev_sibling->{next_sibling}=  $elt;
7449        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7450
7451        if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
7452        $elt->{next_sibling}=  $next_sibling;
7453        return $elt;
7454
7455      }
7456
7457    sub paste_first_child
7458      { my( $elt, $ref)= @_;
7459        my( $parent, $prev_sibling, $next_sibling );
7460        $parent= $ref;
7461        $next_sibling= $ref->{first_child};
7462
7463        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7464        $parent->{first_child}=  $elt;
7465        unless( $parent->{last_child}) {  delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; }
7466
7467        $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7468
7469        if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
7470        $elt->{next_sibling}=  $next_sibling;
7471        return $elt;
7472      }
7473
7474    sub paste_last_child
7475      { my( $elt, $ref)= @_;
7476        my( $parent, $prev_sibling, $next_sibling );
7477        $parent= $ref;
7478        $prev_sibling= $ref->{last_child};
7479
7480        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7481         delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
7482        unless( $parent->{first_child}) { $parent->{first_child}=  $elt; }
7483
7484        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7485        if( $prev_sibling) { $prev_sibling->{next_sibling}=  $elt; }
7486
7487        $elt->{next_sibling}=  undef;
7488        return $elt;
7489      }
7490
7491    sub paste_within
7492      { my( $elt, $ref, $offset)= @_;
7493        my $text= $ref->is_text ? $ref : $ref->next_elt( $TEXT, $ref);
7494        my $new= $text->split_at( $offset);
7495        $elt->paste_before( $new);
7496        return $elt;
7497      }
7498  }
7499
7500# load an element into a structure similar to XML::Simple's
7501sub simplify
7502  { my $elt= shift;
7503
7504    # normalize option names
7505    my %options= @_;
7506    %options= map { my ($key, $val)= ($_, $options{$_});
7507                       $key=~ s{(\w)([A-Z])}{$1_\L$2}g;
7508                       $key => $val
7509                     } keys %options;
7510
7511    # check options
7512    my @allowed_options= qw( keyattr forcearray noattr content_key
7513                             var var_regexp variables var_attr
7514                             group_tags forcecontent
7515                             normalise_space normalize_space
7516                   );
7517    my %allowed_options= map { $_ => 1 } @allowed_options;
7518    foreach my $option (keys %options)
7519      { carp "invalid option $option\n" unless( $allowed_options{$option}); }
7520
7521    $options{normalise_space} ||= $options{normalize_space} || 0;
7522
7523    $options{content_key} ||= 'content';
7524    if( $options{content_key}=~ m{^-})
7525      { # need to remove the - and to activate extra folding
7526        $options{content_key}=~ s{^-}{};
7527        $options{extra_folding}= 1;
7528      }
7529    else
7530      { $options{extra_folding}= 0; }
7531
7532    $options{forcearray} ||=0;
7533    if( isa( $options{forcearray}, 'ARRAY'))
7534      { my %forcearray_tags= map { $_ => 1 } @{$options{forcearray}};
7535        $options{forcearray_tags}= \%forcearray_tags;
7536        $options{forcearray}= 0;
7537      }
7538
7539    $options{keyattr}     ||= ['name', 'key', 'id'];
7540    if( ref $options{keyattr} eq 'ARRAY')
7541      { foreach my $keyattr (@{$options{keyattr}})
7542          { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
7543            $prefix ||= '';
7544            $options{key_for_all}->{$att}= 1;
7545            $options{remove_key_for_all}->{$att}=1 unless( $prefix eq '+');
7546            $options{prefix_key_for_all}->{$att}=1 if( $prefix eq '-');
7547          }
7548      }
7549    elsif( ref $options{keyattr} eq 'HASH')
7550      { while( my( $elt, $keyattr)= each %{$options{keyattr}})
7551         { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
7552           $prefix ||='';
7553           $options{key_for_elt}->{$elt}= $att;
7554           $options{remove_key_for_elt}->{"$elt#$att"}=1 unless( $prefix);
7555           $options{prefix_key_for_elt}->{"$elt#$att"}=1 if( $prefix eq '-');
7556         }
7557      }
7558
7559
7560    $options{var}||= $options{var_attr}; # for compat with XML::Simple
7561    if( $options{var}) { $options{var_values}= {}; }
7562    else               { $options{var}='';         }
7563
7564    if( $options{variables})
7565      { $options{var}||= 1;
7566        $options{var_values}= $options{variables};
7567      }
7568
7569    if( $options{var_regexp} and !$options{var})
7570      { warn "var option not used, var_regexp option ignored\n"; }
7571    $options{var_regexp} ||= '\$\{?(\w+)\}?';
7572
7573    $elt->_simplify( \%options);
7574
7575 }
7576
7577sub _simplify
7578  { my( $elt, $options)= @_;
7579
7580    my $data;
7581
7582    my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
7583    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7584    my %atts= $options->{noattr} || !$elt->{att} ? () : %{$elt->{att}};
7585    my $nb_atts= keys %atts;
7586    my $nb_children= $elt->children_count + $nb_atts;
7587
7588    my %nb_children;
7589    foreach (@children)   { $nb_children{$_->tag}++; }
7590    foreach (keys %atts)  { $nb_children{$_}++;      }
7591
7592    my $arrays; # tag => array where elements are stored
7593
7594
7595    # store children
7596    foreach my $child (@children)
7597      { if( $child->is_text)
7598          { # generate with a content key
7599            my $text= $elt->_text_with_vars( $options);
7600            if( $options->{normalise_space} >= 2) { $text= _normalize_space( $text); }
7601            if(    $options->{force_content}
7602                || $nb_atts
7603                || (scalar @children > 1)
7604              )
7605              { $data->{$options->{content_key}}= $text; }
7606            else
7607              { $data= $text; }
7608          }
7609        else
7610          { # element with sub-elements
7611            my $child_gi= $XML::Twig::index2gi[$child->{'gi'}];
7612
7613            my $child_data= $child->_simplify( $options);
7614
7615            # first see if we need to simplify further the child data
7616            # simplify because of grouped tags
7617            if( my $grouped_tag= $options->{group_tags}->{$child_gi})
7618              { # check that the child data is a hash with a single field
7619                unless(    (ref( $child_data) eq 'HASH')
7620                        && (keys %$child_data == 1)
7621                        && defined ( my $grouped_child_data= $child_data->{$grouped_tag})
7622                      )
7623                  { croak "error in grouped tag $child_gi"; }
7624                else
7625                  { $child_data=  $grouped_child_data; }
7626              }
7627            # simplify because of extra folding
7628            if( $options->{extra_folding})
7629              { if(    (ref( $child_data) eq 'HASH')
7630                    && (keys %$child_data == 1)
7631                    && defined( my $content= $child_data->{$options->{content_key}})
7632                  )
7633                  { $child_data= $content; }
7634              }
7635
7636            if( my $keyatt= $child->_key_attr( $options))
7637              { # simplify element with key
7638                my $key= $child->{'att'}->{$keyatt};
7639                if( $options->{normalise_space} >= 1) { $key= _normalize_space( $key); }
7640                $data->{$child_gi}->{$key}= $child_data;
7641              }
7642            elsif(      $options->{forcearray}
7643                   ||   $options->{forcearray_tags}->{$child_gi}
7644                   || ( $nb_children{$child_gi} > 1)
7645                 )
7646              { # simplify element to store in an array
7647                if( defined $child_data && $child_data ne "" )
7648                  { $data->{$child_gi} ||= [];
7649                    push @{$data->{$child_gi}}, $child_data;
7650                  }
7651                else
7652                  { $data->{$child_gi}= [{}]; }
7653              }
7654            else
7655              { # simplify element to store as a hash field
7656                $data->{$child_gi}=$child_data;
7657                $data->{$child_gi}= defined $child_data && $child_data ne "" ? $child_data : {};
7658              }
7659          }
7660    }
7661
7662    # store atts
7663    # TODO: deal with att that already have an element by that name
7664    foreach my $att (keys %atts)
7665      { # do not store if the att is a key that needs to be removed
7666        if(    $options->{remove_key_for_all}->{$att}
7667            || $options->{remove_key_for_elt}->{"$gi#$att"}
7668          )
7669          { next; }
7670
7671        my $att_text= $options->{var} ?  _replace_vars_in_text( $atts{$att}, $options) : $atts{$att} ;
7672        if( $options->{normalise_space} >= 2) { $att_text= _normalize_space( $att_text); }
7673
7674        if(    $options->{prefix_key_for_all}->{$att}
7675            || $options->{prefix_key_for_elt}->{"$gi#$att"}
7676          )
7677          { # prefix the att
7678            $data->{"-$att"}= $att_text;
7679          }
7680        else
7681          { # normal case
7682            $data->{$att}= $att_text;
7683          }
7684      }
7685
7686    return $data;
7687  }
7688
7689sub _key_attr
7690  { my( $elt, $options)=@_;
7691    return if( $options->{noattr});
7692    if( $options->{key_for_all})
7693      { foreach my $att ($elt->att_names)
7694          { if( $options->{key_for_all}->{$att})
7695              { return $att; }
7696          }
7697      }
7698    elsif( $options->{key_for_elt})
7699      { if( my $key_for_elt= $options->{key_for_elt}->{$XML::Twig::index2gi[$elt->{'gi'}]} )
7700          { return $key_for_elt if( defined( $elt->{'att'}->{$key_for_elt})); }
7701      }
7702    return;
7703  }
7704
7705sub _text_with_vars
7706  { my( $elt, $options)= @_;
7707    my $text;
7708    if( $options->{var})
7709      { $text= _replace_vars_in_text( $elt->text, $options);
7710        $elt->_store_var( $options);
7711      }
7712     else
7713      { $text= $elt->text; }
7714    return $text;
7715  }
7716
7717
7718sub _normalize_space
7719  { my $text= shift;
7720    $text=~ s{\s+}{ }sg;
7721    $text=~ s{^\s}{};
7722    $text=~ s{\s$}{};
7723    return $text;
7724  }
7725
7726
7727sub att_nb
7728  { return 0 unless( my $atts= $_[0]->{att});
7729    return scalar keys %$atts;
7730  }
7731
7732sub has_no_atts
7733  { return 1 unless( my $atts= $_[0]->{att});
7734    return scalar keys %$atts ? 0 : 1;
7735  }
7736
7737sub _replace_vars_in_text
7738  { my( $text, $options)= @_;
7739
7740    $text=~ s{($options->{var_regexp})}
7741             { if( defined( my $value= $options->{var_values}->{$2}))
7742                 { $value }
7743               else
7744                 { warn "unknown variable $2\n";
7745                   $1
7746                 }
7747             }gex;
7748    return $text;
7749  }
7750
7751sub _store_var
7752  { my( $elt, $options)= @_;
7753    if( defined (my $var_name= $elt->{'att'}->{$options->{var}}))
7754       { $options->{var_values}->{$var_name}= $elt->text;
7755       }
7756  }
7757
7758
7759# split a text element at a given offset
7760sub split_at
7761  { my( $elt, $offset)= @_;
7762    my $text_elt= $elt->is_text ? $elt : $elt->first_child( $TEXT) || return '';
7763    my $string= $text_elt->text;
7764    my $left_string= substr( $string, 0, $offset);
7765    my $right_string= substr( $string, $offset);
7766    $text_elt->{pcdata}= (delete $text_elt->{empty} || 1) &&  $left_string;
7767    my $new_elt= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}], $right_string);
7768    $new_elt->paste( after => $elt);
7769    return $new_elt;
7770  }
7771
7772
7773# split an element or its text descendants into several, in place
7774# all elements (new and untouched) are returned
7775sub split
7776  { my $elt= shift;
7777    my @text_chunks;
7778    my @result;
7779    if( $elt->is_text) { @text_chunks= ($elt); }
7780    else               { @text_chunks= $elt->descendants( $TEXT); }
7781    foreach my $text_chunk (@text_chunks)
7782      { push @result, $text_chunk->_split( 1, @_); }
7783    return @result;
7784  }
7785
7786# split an element or its text descendants into several, in place
7787# created elements (those which match the regexp) are returned
7788sub mark
7789  { my $elt= shift;
7790    my @text_chunks;
7791    my @result;
7792    if( $elt->is_text) { @text_chunks= ($elt); }
7793    else               { @text_chunks= $elt->descendants( $TEXT); }
7794    foreach my $text_chunk (@text_chunks)
7795      { push @result, $text_chunk->_split( 0, @_); }
7796    return @result;
7797  }
7798
7799# split a single text element
7800# return_all defines what is returned: if it is true
7801# only returns the elements created by matches in the split regexp
7802# otherwise all elements (new and untouched) are returned
7803
7804
7805{
7806
7807  sub _split
7808    { my $elt= shift;
7809      my $return_all= shift;
7810      my $regexp= shift;
7811      my @tags;
7812
7813      while( @_)
7814        { my $tag= shift();
7815          if( ref $_[0])
7816            { push @tags, { tag => $tag, atts => shift }; }
7817          else
7818            { push @tags, { tag => $tag }; }
7819        }
7820
7821      unless( @tags) { @tags= { tag => $elt->{parent}->gi }; }
7822
7823      my @result;                                 # the returned list of elements
7824      my $text= $elt->text;
7825      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
7826
7827      # 2 uses: if split matches then the first substring reuses $elt
7828      #         once a split has occurred then the last match needs to be put in
7829      #         a new element
7830      my $previous_match= 0;
7831
7832      while( my( $pre_match, @matches)= $text=~ /^(.*?)$regexp(.*)$/gcs)
7833        { $text= pop @matches;
7834          if( $previous_match)
7835            { # match, not the first one, create a new text ($gi) element
7836              _utf8_ify( $pre_match) if( $] < 5.010);
7837              $elt= $elt->insert_new_elt( after => $gi, $pre_match);
7838              push @result, $elt if( $return_all);
7839            }
7840          else
7841            { # first match in $elt, re-use $elt for the first sub-string
7842              _utf8_ify( $pre_match) if( $] < 5.010);
7843              $elt->set_text( $pre_match);
7844              $previous_match++;                # store the fact that there was a match
7845              push @result, $elt if( $return_all);
7846            }
7847
7848          # now deal with matches captured in the regexp
7849          if( @matches)
7850            { # match, with capture
7851              my $i=0;
7852              foreach my $match (@matches)
7853                { # create new element, text is the match
7854                  _utf8_ify( $match) if( $] < 5.010);
7855                  my $tag  = _repl_match( $tags[$i]->{tag}, @matches) || '#PCDATA';
7856                  my $atts = \%{$tags[$i]->{atts}} || {};
7857                  my %atts= map { _repl_match( $_, @matches) => _repl_match( $atts->{$_}, @matches) } keys %$atts;
7858                  $elt= $elt->insert_new_elt( after => $tag, \%atts, $match);
7859                  push @result, $elt;
7860                  $i= ($i + 1) % @tags;
7861                }
7862            }
7863          else
7864            { # match, no captures
7865              my $tag  = $tags[0]->{tag};
7866              my $atts = \%{$tags[0]->{atts}} || {};
7867              $elt=  $elt->insert_new_elt( after => $tag, $atts);
7868              push @result, $elt;
7869            }
7870        }
7871      if( $previous_match && $text)
7872        { # there was at least 1 match, and there is text left after the match
7873          $elt= $elt->insert_new_elt( after => $gi, $text);
7874        }
7875
7876      push @result, $elt if( $return_all);
7877
7878      return @result; # return all elements
7879   }
7880
7881sub _repl_match
7882  { my( $val, @matches)= @_;
7883    $val=~ s{\$(\d+)}{$matches[$1-1]}g;
7884    return $val;
7885  }
7886
7887  # evil hack needed as sometimes
7888  my $encode_is_loaded=0;   # so we only load Encode once
7889  sub _utf8_ify
7890    {
7891      if( $perl_version >= 5.008 and $perl_version < 5.010 and !_keep_encoding())
7892        { unless( $encode_is_loaded) { require Encode; import Encode; $encode_is_loaded++; }
7893          Encode::_utf8_on( $_[0]); # the flag should be set but is not
7894        }
7895    }
7896
7897
7898}
7899
7900{ my %replace_sub; # cache for complex expressions (expression => sub)
7901
7902  sub subs_text
7903    { my( $elt, $regexp, $replace)= @_;
7904
7905      my $replacement_string;
7906      my $is_string= _is_string( $replace);
7907
7908      my @parents;
7909
7910      foreach my $text_elt ($elt->descendants_or_self( $TEXT))
7911        {
7912          if( $is_string)
7913            { my $text= $text_elt->text;
7914              $text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx;
7915              $text_elt->set_text( $text);
7916           }
7917          else
7918            {
7919              no utf8; # = perl 5.6
7920              my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace));
7921              my $text= $text_elt->text;
7922              my $pos=0;  # used to skip text that was previously matched
7923              my $found_hit;
7924              while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}sg))
7925                { $found_hit=1;
7926                  my $match_start  = length( $pre_match_string);
7927                  my $match        = $match_start ? $text_elt->split_at( $match_start + $pos) : $text_elt;
7928                  my $match_length = length( $match_string);
7929                  my $post_match   = $match->split_at( $match_length);
7930                  $replace_sub->( $match, @var);
7931
7932                  # go to next
7933                  $text_elt= $post_match;
7934                  $text= $post_match->text;
7935
7936                  if( $found_hit) { push @parents, $text_elt->{parent} unless $parents[-1] && $parents[-1]== $text_elt->{parent}; }
7937
7938                }
7939            }
7940        }
7941
7942      foreach my $parent (@parents) { $parent->normalize; }
7943
7944      return $elt;
7945    }
7946
7947
7948  sub _is_string
7949    { return ($_[0]=~ m{&e[ln]t}) ? 0: 1 }
7950
7951  sub _replace_var
7952    { my( $string, @var)= @_;
7953      unshift @var, undef;
7954      $string=~ s{\$(\d)}{$var[$1]}g;
7955      return $string;
7956    }
7957
7958  sub _install_replace_sub
7959    { my $replace_exp= shift;
7960      my @item= split m{(&e[ln]t\s*\([^)]*\))}, $replace_exp;
7961      my $sub= q{ my( $match, @var)= @_; my $new; my $last_inserted=$match;};
7962      my( $gi, $exp);
7963      foreach my $item (@item)
7964        { next if ! length $item;
7965          if(    $item=~ m{^&elt\s*\(([^)]*)\)})
7966            { $exp= $1; }
7967          elsif( $item=~ m{^&ent\s*\(\s*([^\s)]*)\s*\)})
7968            { $exp= " '#ENT' => $1"; }
7969          else
7970            { $exp= qq{ '#PCDATA' => "$item"}; }
7971          $exp=~ s{\$(\d)}{my $i= $1-1; "\$var[$i]"}eg; # replace references to matches
7972          $sub.= qq{ \$new= \$match->new( $exp); };
7973          $sub .= q{ $new->paste( after => $last_inserted); $last_inserted=$new;};
7974        }
7975      $sub .= q{ $match->delete; };
7976      #$sub=~ s/;/;\n/g; warn "subs: $sub";
7977      my $coderef= eval "sub { $NO_WARNINGS; $sub }";
7978      if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); }
7979      return $coderef;
7980    }
7981
7982  }
7983
7984
7985sub merge_text
7986  { my( $e1, $e2)= @_;
7987    croak "invalid merge: can only merge 2 elements"
7988        unless( isa( $e2, 'XML::Twig::Elt'));
7989    croak "invalid merge: can only merge 2 text elements"
7990        unless( $e1->is_text && $e2->is_text && ($e1->gi eq $e2->gi));
7991
7992    my $t1_length= length( $e1->text);
7993
7994    $e1->set_text( $e1->text . $e2->text);
7995
7996    if( my $extra_data_in_pcdata= $e2->_extra_data_in_pcdata)
7997      { foreach my $data (@$extra_data_in_pcdata) { $e1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } }
7998
7999    $e2->delete;
8000
8001    return $e1;
8002  }
8003
8004sub merge
8005  { my( $e1, $e2)= @_;
8006    my @e2_children= $e2->_children;
8007    if(     $e1->_last_child && $e1->_last_child->is_pcdata
8008        &&  @e2_children && $e2_children[0]->is_pcdata
8009      )
8010      { my $t1_length= length( $e1->_last_child->{pcdata});
8011        my $child1= $e1->_last_child;
8012        my $child2= shift @e2_children;
8013        $child1->{pcdata} .= $child2->{pcdata};
8014
8015        my $extra_data= $e1->_extra_data_before_end_tag . $e2->extra_data;
8016
8017        if( $extra_data)
8018          { $e1->_del_extra_data_before_end_tag;
8019            $child1->_push_extra_data_in_pcdata( $extra_data, $t1_length);
8020          }
8021
8022        if( my $extra_data_in_pcdata= $child2->_extra_data_in_pcdata)
8023          { foreach my $data (@$extra_data_in_pcdata) { $child1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } }
8024
8025        if( my $extra_data_before_end_tag= $e2->_extra_data_before_end_tag)
8026          { $e1->_set_extra_data_before_end_tag( $extra_data_before_end_tag); }
8027      }
8028
8029    foreach my $e (@e2_children) { $e->move( last_child => $e1); }
8030
8031    $e2->delete;
8032    return $e1;
8033  }
8034
8035
8036# recursively copy an element and returns the copy (can be huge and long)
8037sub copy
8038  { my $elt= shift;
8039    my $copy= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}]);
8040
8041    if( $elt->extra_data) { $copy->set_extra_data( $elt->extra_data); }
8042    if( $elt->{extra_data_before_end_tag}) { $copy->_set_extra_data_before_end_tag( $elt->{extra_data_before_end_tag}); }
8043
8044    if( $elt->is_asis)   { $copy->set_asis; }
8045
8046    if( (exists $elt->{'pcdata'}))
8047      { $copy->{pcdata}= (delete $copy->{empty} || 1) &&  $elt->{pcdata};
8048        if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); }
8049      }
8050    elsif( (exists $elt->{'cdata'}))
8051      { $copy->{cdata}=  $elt->{cdata};
8052        if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); }
8053      }
8054    elsif( (exists $elt->{'target'}))
8055      { $copy->_set_pi( $elt->{target}, $elt->{data}); }
8056    elsif( (exists $elt->{'comment'}))
8057      { $copy->{comment}=  $elt->{comment}; }
8058    elsif( (exists $elt->{'ent'}))
8059      { $copy->{ent}=  $elt->{ent}; }
8060    else
8061      { my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
8062        if( my $atts= $elt->{att})
8063          { my %atts;
8064            tie %atts, 'Tie::IxHash' if (keep_atts_order());
8065            %atts= %{$atts}; # we want to do a real copy of the attributes
8066            $copy->set_atts( \%atts);
8067          }
8068        foreach my $child (@children)
8069          { my $child_copy= $child->copy;
8070            $child_copy->paste( 'last_child', $copy);
8071          }
8072      }
8073    # save links to the original location, which can be convenient and is used for namespace resolution
8074    foreach my $link ( qw(parent prev_sibling next_sibling) )
8075      { $copy->{former}->{$link}= $elt->{$link};
8076        if( $XML::Twig::weakrefs) { weaken( $copy->{former}->{$link}); }
8077      }
8078
8079    $copy->{empty}=  $elt->{'empty'};
8080
8081    return $copy;
8082  }
8083
8084
8085sub delete
8086  { my $elt= shift;
8087    $elt->cut;
8088    $elt->DESTROY unless $XML::Twig::weakrefs;
8089    return undef;
8090  }
8091
8092sub __destroy
8093  { my $elt= shift;
8094    return if( $XML::Twig::weakrefs);
8095    my $t= shift || $elt->twig; # optional argument, passed in recursive calls
8096
8097    foreach( @{[$elt->_children]}) { $_->DESTROY( $t); }
8098
8099    # the id reference needs to be destroyed
8100    # lots of tests to avoid warnings during the cleanup phase
8101    $elt->del_id( $t) if( $ID && $t && defined( $elt->{att}) && exists( $elt->{att}->{$ID}));
8102    if( $elt->{former}) { foreach (keys %{$elt->{former}}) { delete $elt->{former}->{$_}; } delete $elt->{former}; }
8103    foreach (qw( keys %$elt)) { delete $elt->{$_}; }
8104    undef $elt;
8105  }
8106
8107BEGIN
8108{ sub set_destroy { if( $XML::Twig::weakrefs) { undef *DESTROY } else { *DESTROY= *__destroy; } }
8109  set_destroy();
8110}
8111
8112# ignores the element
8113sub ignore
8114  { my $elt= shift;
8115    my $t= $elt->twig;
8116    $t->ignore( $elt, @_);
8117  }
8118
8119BEGIN {
8120  my $pretty                    = 0;
8121  my $quote                     = '"';
8122  my $INDENT                    = '  ';
8123  my $empty_tag_style           = 0;
8124  my $remove_cdata              = 0;
8125  my $keep_encoding             = 0;
8126  my $expand_external_entities  = 0;
8127  my $keep_atts_order           = 0;
8128  my $do_not_escape_amp_in_atts = 0;
8129  my $WRAP                      = '80';
8130  my $REPLACED_ENTS             = qq{&<};
8131
8132  my ($NSGMLS, $NICE, $INDENTED, $INDENTEDCT, $INDENTEDC, $WRAPPED, $RECORD1, $RECORD2, $INDENTEDA)= (1..9);
8133  my %KEEP_TEXT_TAG_ON_ONE_LINE= map { $_ => 1 } ( $INDENTED, $INDENTEDCT, $INDENTEDC, $INDENTEDA, $WRAPPED);
8134  my %WRAPPED =  map { $_ => 1 } ( $WRAPPED, $INDENTEDA, $INDENTEDC);
8135
8136  my %pretty_print_style=
8137    ( none       => 0,          # no added \n
8138      nsgmls     => $NSGMLS,    # nsgmls-style, \n in tags
8139      # below this line styles are UNSAFE (the generated XML can be well-formed but invalid)
8140      nice       => $NICE,      # \n after open/close tags except when the
8141                                # element starts with text
8142      indented   => $INDENTED,  # nice plus idented
8143      indented_close_tag   => $INDENTEDCT,  # nice plus idented
8144      indented_c => $INDENTEDC, # slightly more compact than indented (closing
8145                                # tags are on the same line)
8146      wrapped    => $WRAPPED,   # text is wrapped at column
8147      record_c   => $RECORD1,   # for record-like data (compact)
8148      record     => $RECORD2,   # for record-like data  (not so compact)
8149      indented_a => $INDENTEDA, # nice, indented, and with attributes on separate
8150                                # lines as the nsgmls style, as well as wrapped
8151                                # lines - to make the xml friendly to line-oriented tools
8152      cvs        => $INDENTEDA, # alias for indented_a
8153    );
8154
8155  my ($HTML, $EXPAND)= (1..2);
8156  my %empty_tag_style=
8157    ( normal => 0,        # <tag/>
8158      html   => $HTML,    # <tag />
8159      xhtml  => $HTML,    # <tag />
8160      expand => $EXPAND,  # <tag></tag>
8161    );
8162
8163  my %quote_style=
8164    ( double  => '"',
8165      single  => "'",
8166      # smart  => "smart",
8167    );
8168
8169  my $xml_space_preserve; # set when an element includes xml:space="preserve"
8170
8171  my $output_filter;      # filters the entire output (including < and >)
8172  my $output_text_filter; # filters only the text part (tag names, attributes, pcdata)
8173
8174  my $replaced_ents= $REPLACED_ENTS;
8175
8176
8177  # returns those pesky "global" variables so you can switch between twigs
8178  sub global_state ## no critic (Subroutines::ProhibitNestedSubs);
8179    { return
8180       { pretty                    => $pretty,
8181         quote                     => $quote,
8182         indent                    => $INDENT,
8183         empty_tag_style           => $empty_tag_style,
8184         remove_cdata              => $remove_cdata,
8185         keep_encoding             => $keep_encoding,
8186         expand_external_entities  => $expand_external_entities,
8187         output_filter             => $output_filter,
8188         output_text_filter        => $output_text_filter,
8189         keep_atts_order           => $keep_atts_order,
8190         do_not_escape_amp_in_atts => $do_not_escape_amp_in_atts,
8191         wrap                      => $WRAP,
8192         replaced_ents             => $replaced_ents,
8193        };
8194    }
8195
8196  # restores the global variables
8197  sub set_global_state
8198    { my $state= shift;
8199      $pretty                    = $state->{pretty};
8200      $quote                     = $state->{quote};
8201      $INDENT                    = $state->{indent};
8202      $empty_tag_style           = $state->{empty_tag_style};
8203      $remove_cdata              = $state->{remove_cdata};
8204      $keep_encoding             = $state->{keep_encoding};
8205      $expand_external_entities  = $state->{expand_external_entities};
8206      $output_filter             = $state->{output_filter};
8207      $output_text_filter        = $state->{output_text_filter};
8208      $keep_atts_order           = $state->{keep_atts_order};
8209      $do_not_escape_amp_in_atts = $state->{do_not_escape_amp_in_atts};
8210      $WRAP                      = $state->{wrap};
8211      $replaced_ents             = $state->{replaced_ents},
8212    }
8213
8214  # sets global state to defaults
8215  sub init_global_state
8216    { set_global_state(
8217       { pretty                    => 0,
8218         quote                     => '"',
8219         indent                    => $INDENT,
8220         empty_tag_style           => 0,
8221         remove_cdata              => 0,
8222         keep_encoding             => 0,
8223         expand_external_entities  => 0,
8224         output_filter             => undef,
8225         output_text_filter        => undef,
8226         keep_atts_order           => undef,
8227         do_not_escape_amp_in_atts => 0,
8228         wrap                      => $WRAP,
8229         replaced_ents             => $REPLACED_ENTS,
8230        });
8231    }
8232
8233
8234  # set the pretty_print style (in $pretty) and returns the old one
8235  # can be called from outside the package with 2 arguments (elt, style)
8236  # or from inside with only one argument (style)
8237  # the style can be either a string (one of the keys of %pretty_print_style
8238  # or a number (presumably an old value saved)
8239  sub set_pretty_print
8240    { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
8241      my $old_pretty= $pretty;
8242      if( $style=~ /^\d+$/)
8243        { croak "invalid pretty print style $style" unless( $style < keys %pretty_print_style);
8244          $pretty= $style;
8245        }
8246      else
8247        { croak "invalid pretty print style '$style'" unless( exists $pretty_print_style{$style});
8248          $pretty= $pretty_print_style{$style};
8249        }
8250      if( $WRAPPED{$pretty} )
8251        { XML::Twig::_use( 'Text::Wrap') or croak( "Text::Wrap not available, cannot use style $style"); }
8252      return $old_pretty;
8253    }
8254
8255  sub _pretty_print { return $pretty; }
8256
8257  # set the empty tag style (in $empty_tag_style) and returns the old one
8258  # can be called from outside the package with 2 arguments (elt, style)
8259  # or from inside with only one argument (style)
8260  # the style can be either a string (one of the keys of %empty_tag_style
8261  # or a number (presumably an old value saved)
8262  sub set_empty_tag_style
8263    { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
8264      my $old_style= $empty_tag_style;
8265      if( $style=~ /^\d+$/)
8266        { croak "invalid empty tag style $style"
8267        unless( $style < keys %empty_tag_style);
8268        $empty_tag_style= $style;
8269        }
8270      else
8271        { croak "invalid empty tag style '$style'"
8272            unless( exists $empty_tag_style{$style});
8273          $empty_tag_style= $empty_tag_style{$style};
8274        }
8275      return $old_style;
8276    }
8277
8278  sub _pretty_print_styles
8279    { return (sort { $pretty_print_style{$a} <=> $pretty_print_style{$b} || $a cmp $b } keys %pretty_print_style); }
8280
8281  sub set_quote
8282    { my $style= $_[1] || $_[0];
8283      my $old_quote= $quote;
8284      croak "invalid quote '$style'" unless( exists $quote_style{$style});
8285      $quote= $quote_style{$style};
8286      return $old_quote;
8287    }
8288
8289  sub set_remove_cdata
8290    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8291      my $old_value= $remove_cdata;
8292      $remove_cdata= $new_value;
8293      return $old_value;
8294    }
8295
8296
8297  sub set_indent
8298    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8299      my $old_value= $INDENT;
8300      $INDENT= $new_value;
8301      return $old_value;
8302    }
8303
8304  sub set_wrap
8305    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8306      my $old_value= $WRAP;
8307      $WRAP= $new_value;
8308      return $old_value;
8309    }
8310
8311
8312  sub set_keep_encoding
8313    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8314      my $old_value= $keep_encoding;
8315      $keep_encoding= $new_value;
8316      return $old_value;
8317   }
8318
8319  sub set_replaced_ents
8320    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8321      my $old_value= $replaced_ents;
8322      $replaced_ents= $new_value;
8323      return $old_value;
8324   }
8325
8326  sub do_not_escape_gt
8327    { my $old_value= $replaced_ents;
8328      $replaced_ents= q{&<}; # & needs to be first
8329      return $old_value;
8330    }
8331
8332  sub escape_gt
8333    { my $old_value= $replaced_ents;
8334      $replaced_ents= qq{&<>}; # & needs to be first
8335      return $old_value;
8336    }
8337
8338  sub _keep_encoding { return $keep_encoding; } # so I can use elsewhere in the module
8339
8340  sub set_do_not_escape_amp_in_atts
8341    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8342      my $old_value= $do_not_escape_amp_in_atts;
8343      $do_not_escape_amp_in_atts= $new_value;
8344      return $old_value;
8345   }
8346
8347  sub output_filter      { return $output_filter; }
8348  sub output_text_filter { return $output_text_filter; }
8349
8350  sub set_output_filter
8351    { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
8352      # if called in object mode with no argument, the filter is undefined
8353      if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
8354      my $old_value= $output_filter;
8355      if( !$new_value || isa( $new_value, 'CODE') )
8356        { $output_filter= $new_value; }
8357      elsif( $new_value eq 'latin1')
8358        { $output_filter= XML::Twig::latin1();
8359        }
8360      elsif( $XML::Twig::filter{$new_value})
8361        {  $output_filter= $XML::Twig::filter{$new_value}; }
8362      else
8363        { croak "invalid output filter '$new_value'"; }
8364
8365      return $old_value;
8366    }
8367
8368  sub set_output_text_filter
8369    { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
8370      # if called in object mode with no argument, the filter is undefined
8371      if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
8372      my $old_value= $output_text_filter;
8373      if( !$new_value || isa( $new_value, 'CODE') )
8374        { $output_text_filter= $new_value; }
8375      elsif( $new_value eq 'latin1')
8376        { $output_text_filter= XML::Twig::latin1();
8377        }
8378      elsif( $XML::Twig::filter{$new_value})
8379        {  $output_text_filter= $XML::Twig::filter{$new_value}; }
8380      else
8381        { croak "invalid output text filter '$new_value'"; }
8382
8383      return $old_value;
8384    }
8385
8386  sub set_expand_external_entities
8387    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8388      my $old_value= $expand_external_entities;
8389      $expand_external_entities= $new_value;
8390      return $old_value;
8391    }
8392
8393  sub set_keep_atts_order
8394    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8395      my $old_value= $keep_atts_order;
8396      $keep_atts_order= $new_value;
8397      return $old_value;
8398
8399   }
8400
8401  sub keep_atts_order { return $keep_atts_order; } # so I can use elsewhere in the module
8402
8403  my %html_empty_elt;
8404  BEGIN { %html_empty_elt= map { $_ => 1} qw( base meta link hr br param img area input col); }
8405
8406  sub start_tag
8407    { my( $elt, $option)= @_;
8408
8409
8410      return if( $elt->{gi} < $XML::Twig::SPECIAL_GI);
8411
8412      my $extra_data= $elt->{extra_data} || '';
8413
8414      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
8415      my $att= $elt->{att}; # should be $elt->{att}, optimized into a pure hash look-up
8416
8417      my $ns_map= $att ? $att->{'#original_gi'} : '';
8418      if( $ns_map) { $gi= _restore_original_prefix( $ns_map, $gi); }
8419      $gi=~ s{^#default:}{}; # remove default prefix
8420
8421      if( $output_text_filter) { $gi= $output_text_filter->( $gi); }
8422
8423      # get the attribute and their values
8424      my $att_sep = $pretty==$NSGMLS    ? "\n"
8425                  : $pretty==$INDENTEDA ? "\n" . $INDENT x ($elt->level+1) . '  '
8426                  :                       ' '
8427                  ;
8428
8429      my $replace_in_att_value= $replaced_ents . "$quote\t\r\n";
8430      if( $option->{escape_gt} && $replaced_ents !~ m{>}) { $replace_in_att_value.= '>'; }
8431
8432      my $tag;
8433      my @att_names= grep { !( $_=~ m{^#(?!default:)} ) } $keep_atts_order ?  keys %{$att} : sort keys %{$att};
8434      if( @att_names)
8435        { my $atts= join $att_sep, map  { my $output_att_name= $ns_map ? _restore_original_prefix( $ns_map, $_) : $_;
8436                                          if( $output_text_filter)
8437                                            { $output_att_name=  $output_text_filter->( $output_att_name); }
8438                                          $output_att_name . '=' . $quote . _att_xml_string( $att->{$_}, $replace_in_att_value) . $quote
8439
8440                                        }
8441                                        @att_names
8442                                   ;
8443           if( $pretty==$INDENTEDA && @att_names == 1) { $att_sep= ' '; }
8444           $tag= "<$gi$att_sep$atts";
8445        }
8446      else
8447        { $tag= "<$gi"; }
8448
8449      $tag .= "\n" if($pretty==$NSGMLS);
8450
8451
8452      # force empty if suitable HTML tag, otherwise use the value from the input tree
8453      if( ($empty_tag_style eq $HTML) && !$elt->{first_child} && !$elt->{extra_data_before_end_tag} && $html_empty_elt{$gi})
8454        { $elt->{empty}= 1; }
8455      my $empty= defined $elt->{empty} ? $elt->{empty}
8456               : $elt->{first_child}    ? 0
8457               :                         1;
8458
8459      $tag .= (!$elt->{empty} || $elt->{extra_data_before_end_tag})  ? '>'            # element has content
8460            : (($empty_tag_style eq $HTML) && $html_empty_elt{$gi}) ? ' />'          # html empty element
8461                                                                                     # cvs-friendly format
8462            : ( $pretty == $INDENTEDA && @att_names > 1)            ? "\n" .  $INDENT x $elt->level . "/>"
8463            : ( $pretty == $INDENTEDA && @att_names == 1)           ? " />"
8464            : $empty_tag_style                                      ? "></" . $XML::Twig::index2gi[$elt->{'gi'}] . ">" # $empty_tag_style is $HTML or $EXPAND
8465            :                                                         '/>'
8466            ;
8467
8468      if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; }
8469
8470#warn "TRACE: ", $tag,": ", Encode::is_utf8( $tag) ? "has flag" : "FLAG NOT SET";
8471
8472      unless( $pretty) { return defined( $extra_data) ? $extra_data . $tag : $tag;  }
8473
8474      my $prefix='';
8475      my $return='';   # '' or \n is to be printed before the tag
8476      my $indent=0;    # number of indents before the tag
8477
8478      if( $pretty==$RECORD1)
8479        { my $level= $elt->level;
8480          $return= "\n" if( $level < 2);
8481          $indent= 1 if( $level == 1);
8482        }
8483
8484     elsif( $pretty==$RECORD2)
8485        { $return= "\n";
8486          $indent= $elt->level;
8487        }
8488
8489      elsif( $pretty==$NICE)
8490        { my $parent= $elt->{parent};
8491          unless( !$parent || $parent->{contains_text})
8492            { $return= "\n"; }
8493          $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
8494                                     || $elt->contains_text);
8495        }
8496
8497      elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty})
8498        { my $parent= $elt->{parent};
8499          unless( !$parent || $parent->{contains_text})
8500            { $return= "\n";
8501              $indent= $elt->level;
8502            }
8503          $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
8504                                     || $elt->contains_text);
8505        }
8506
8507      if( $return || $indent)
8508        { # check for elements in which spaces should be kept
8509          my $t= $elt->twig;
8510          return $extra_data . $tag if( $xml_space_preserve);
8511          if( $t && $t->{twig_keep_spaces_in})
8512            { foreach my $ancestor ($elt->ancestors)
8513                { return $extra_data . $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) }
8514            }
8515
8516          $prefix= $INDENT x $indent;
8517          if( $extra_data)
8518            { $extra_data=~ s{\s+$}{};
8519              $extra_data=~ s{^\s+}{};
8520              $extra_data= $prefix .  $extra_data . $return;
8521            }
8522        }
8523
8524
8525      return $return . $extra_data . $prefix . $tag;
8526    }
8527
8528  sub end_tag
8529    { my $elt= shift;
8530      return  '' if(    ($elt->{gi}<$XML::Twig::SPECIAL_GI)
8531                     || ($elt->{'empty'} && !$elt->{extra_data_before_end_tag})
8532                   );
8533      my $tag= "<";
8534      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
8535
8536      if( my $map= $elt->{'att'}->{'#original_gi'}) { $gi= _restore_original_prefix( $map, $gi); }
8537      $gi=~ s{^#default:}{}; # remove default prefix
8538
8539      if( $output_text_filter) { $gi= $output_text_filter->( $XML::Twig::index2gi[$elt->{'gi'}]); }
8540      $tag .=  "/$gi>";
8541
8542      $tag = ($elt->{extra_data_before_end_tag} || '') . $tag;
8543
8544      if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; }
8545
8546      return $tag unless $pretty;
8547
8548      my $prefix='';
8549      my $return=0;    # 1 if a \n is to be printed before the tag
8550      my $indent=0;    # number of indents before the tag
8551
8552      if( $pretty==$RECORD1)
8553        { $return= 1 if( $elt->level == 0);
8554        }
8555
8556     elsif( $pretty==$RECORD2)
8557        { unless( $elt->contains_text)
8558            { $return= 1 ;
8559              $indent= $elt->level;
8560            }
8561        }
8562
8563      elsif( $pretty==$NICE)
8564        { my $parent= $elt->{parent};
8565          if( (    ($parent && !$parent->{contains_text}) || !$parent )
8566            && ( !$elt->{contains_text}
8567             && ($elt->{has_flushed_child} || $elt->{first_child})
8568           )
8569         )
8570            { $return= 1; }
8571        }
8572
8573      elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty})
8574        { my $parent= $elt->{parent};
8575          if( (    ($parent && !$parent->{contains_text}) || !$parent )
8576            && ( !$elt->{contains_text}
8577             && ($elt->{has_flushed_child} || $elt->{first_child})
8578           )
8579         )
8580            { $return= 1;
8581              $indent= $elt->level;
8582            }
8583        }
8584
8585      if( $return || $indent)
8586        { # check for elements in which spaces should be kept
8587          my $t= $elt->twig;
8588          return $tag if( $xml_space_preserve);
8589          if( $t && $t->{twig_keep_spaces_in})
8590            { foreach my $ancestor ($elt, $elt->ancestors)
8591                { return $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) }
8592            }
8593
8594          if( $return) { $prefix= ($pretty== $INDENTEDCT) ? "\n$INDENT" : "\n"; }
8595          $prefix.= $INDENT x $indent;
8596    }
8597
8598      # add a \n at the end of the document (after the root element)
8599      $tag .= "\n" unless( $elt->{parent});
8600
8601      return $prefix . $tag;
8602    }
8603
8604  sub _restore_original_prefix
8605    { my( $map, $name)= @_;
8606      my $prefix= _ns_prefix( $name);
8607      if( my $original_prefix= $map->{$prefix})
8608        { if( $original_prefix eq '#default')
8609            { $name=~ s{^$prefix:}{}; }
8610          else
8611            { $name=~ s{^$prefix(?=:)}{$original_prefix}; }
8612        }
8613      return $name;
8614    }
8615
8616  # buffer used to hold the text to print/sprint, to avoid passing it back and forth between methods
8617  my @sprint;
8618
8619  # $elt is an element to print
8620  # $fh is an optional filehandle to print to
8621  # $pretty is an optional value, if true a \n is printed after the < of the
8622  # opening tag
8623  sub print
8624    { my $elt= shift;
8625
8626      my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
8627      my $old_select= defined $fh ? select $fh : undef;
8628      print $elt->sprint( @_);
8629      select $old_select if( defined $old_select);
8630    }
8631
8632
8633# those next 2 methods need to be refactored, they are copies of the same methods in XML::Twig
8634sub print_to_file
8635  { my( $elt, $filename)= (shift, shift);
8636    my $out_fh;
8637#    open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!");     # < perl 5.8
8638    my $mode= $keep_encoding ? '>' : '>:utf8';                                       # >= perl 5.8
8639    open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8
8640    $elt->print( $out_fh, @_);
8641    close $out_fh;
8642    return $elt;
8643  }
8644
8645# probably only works on *nix (at least the chmod bit)
8646# first print to a temporary file, then rename that file to the desired file name, then change permissions
8647# to the original file permissions (or to the current umask)
8648sub safe_print_to_file
8649  { my( $elt, $filename)= (shift, shift);
8650    my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ;
8651    XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n";
8652    XML::Twig::_use( 'File::Basename') || croak "need File::Basename to use safe_print_to_file\n";
8653    my $tmpdir= File::Basename::dirname( $filename);
8654    my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir);
8655    $elt->print_to_file( $tmpfilename, @_);
8656    rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!");
8657    chmod $perm, $filename;
8658    return $elt;
8659  }
8660
8661
8662  # same as print but does not output the start tag if the element
8663  # is marked as flushed
8664  sub flush
8665    { my $elt= shift;
8666      my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt;
8667      $elt->twig->flush_up_to( $up_to, @_);
8668    }
8669  sub purge
8670    { my $elt= shift;
8671      my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt;
8672      $elt->twig->purge_up_to( $up_to, @_);
8673    }
8674
8675  sub _flush
8676    { my $elt= shift;
8677
8678      my $pretty;
8679      my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
8680      my $old_select= defined $fh ? select $fh : undef;
8681      my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef;
8682
8683      $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
8684
8685      $elt->__flush();
8686
8687      $xml_space_preserve= 0;
8688
8689      select $old_select if( defined $old_select);
8690      set_pretty_print( $old_pretty) if( defined $old_pretty);
8691    }
8692
8693  sub __flush
8694    { my $elt= shift;
8695
8696      if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
8697        { my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve';
8698          $xml_space_preserve++ if $preserve;
8699          unless( $elt->{'flushed'})
8700            { print $elt->start_tag();
8701            }
8702
8703          # flush the children
8704          my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
8705          foreach my $child (@children)
8706            { $child->_flush( $pretty);
8707              $child->{'flushed'}=1;
8708            }
8709          if( ! $elt->{end_tag_flushed})
8710            { print $elt->end_tag;
8711              $elt->{end_tag_flushed}=1;
8712              $elt->{'flushed'}=1;
8713            }
8714          $xml_space_preserve-- if $preserve;
8715          # used for pretty printing
8716          if( my $parent= $elt->{parent}) { $parent->{has_flushed_child}= 1; }
8717        }
8718      else # text or special element
8719        { my $text;
8720          if( (exists $elt->{'pcdata'}))     { $text= $elt->pcdata_xml_string;
8721                                     if( my $parent= $elt->{parent})
8722                                       { $parent->{contains_text}= 1; }
8723                                   }
8724          elsif( (exists $elt->{'cdata'}))   { $text= $elt->cdata_string;
8725                                     if( my $parent= $elt->{parent})
8726                                       { $parent->{contains_text}= 1; }
8727                                   }
8728          elsif( (exists $elt->{'target'}))      { $text= $elt->pi_string;          }
8729          elsif( (exists $elt->{'comment'})) { $text= $elt->comment_string;     }
8730          elsif( (exists $elt->{'ent'}))     { $text= $elt->ent_string;         }
8731
8732          print $output_filter ? $output_filter->( $text) : $text;
8733        }
8734    }
8735
8736
8737  sub xml_text
8738    { my( $elt, @options)= @_;
8739
8740      if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->xml_text_only; }
8741
8742      my $string='';
8743
8744      if( ($elt->{gi} >= $XML::Twig::SPECIAL_GI) )
8745        { # sprint the children
8746          my $child= $elt->{first_child} || '';
8747          while( $child)
8748            { $string.= $child->xml_text;
8749            } continue { $child= $child->{next_sibling}; }
8750        }
8751      elsif( (exists $elt->{'pcdata'}))  { $string .= $output_filter ?  $output_filter->($elt->pcdata_xml_string)
8752                                                           : $elt->pcdata_xml_string;
8753                               }
8754      elsif( (exists $elt->{'cdata'}))   { $string .= $output_filter ?  $output_filter->($elt->cdata_string)
8755                                                           : $elt->cdata_string;
8756                               }
8757      elsif( (exists $elt->{'ent'}))     { $string .= $elt->ent_string; }
8758
8759      return $string;
8760    }
8761
8762  sub xml_text_only
8763    { return join '', map { $_->xml_text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; }
8764
8765  # same as print but except... it does not print but rather returns the string
8766  # if the second parameter is set then only the content is returned, not the
8767  # start and end tags of the element (but the tags of the included elements are
8768  # returned)
8769
8770  sub sprint
8771    { my $elt= shift;
8772      my( $old_pretty, $old_empty_tag_style);
8773
8774      if( $_[0])
8775        { if( isa( $_[0], 'HASH'))
8776            { # "proper way, using a hashref for options
8777              my %args= XML::Twig::_normalize_args( %{shift()});
8778              if( defined $args{PrettyPrint}) { $old_pretty          = set_pretty_print( $args{PrettyPrint});  }
8779              if( defined $args{EmptyTags})   { $old_empty_tag_style = set_empty_tag_style( $args{EmptyTags}); }
8780            }
8781          else
8782            { # "old" way, just using the option name
8783              my @other_opt;
8784              foreach my $opt (@_)
8785                { if( exists $pretty_print_style{$opt}) { $old_pretty = set_pretty_print( $opt);             }
8786                  elsif( exists $empty_tag_style{$opt}) { $old_empty_tag_style = set_empty_tag_style( $opt); }
8787                  else                                  { push @other_opt, $opt;                             }
8788                }
8789               @_= @other_opt;
8790            }
8791        }
8792
8793      $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
8794
8795      @sprint=();
8796      $elt->_sprint( @_);
8797      my $sprint= join( '', @sprint);
8798      if( $output_filter) { $sprint= $output_filter->( $sprint); }
8799
8800      if( ( ($pretty== $WRAPPED) || ($pretty==$INDENTEDC)) && !$xml_space_preserve)
8801        { $sprint= _wrap_text( $sprint); }
8802      $xml_space_preserve= 0;
8803
8804
8805      if( defined $old_pretty)          { set_pretty_print( $old_pretty);             }
8806      if( defined $old_empty_tag_style) { set_empty_tag_style( $old_empty_tag_style); }
8807
8808      return $sprint;
8809    }
8810
8811  sub _wrap_text
8812    { my( $string)= @_;
8813      my $wrapped;
8814      foreach my $line (split /\n/, $string)
8815        { my( $initial_indent)= $line=~ m{^(\s*)};
8816          my $wrapped_line= Text::Wrap::wrap(  '',  $initial_indent . $INDENT, $line) . "\n";
8817
8818          # fix glitch with Text::wrap when the first line is long and does not include spaces
8819          # the first line ends up being too short by 2 chars, but we'll have to live with it!
8820          $wrapped_line=~ s{^ +\n  }{}s; # this prefix needs to be removed
8821
8822          $wrapped .= $wrapped_line;
8823        }
8824
8825      return $wrapped;
8826    }
8827
8828
8829  sub _sprint
8830    { my $elt= shift;
8831      my $no_tag= shift || 0;
8832      # in case there's some comments or PI's piggybacking
8833
8834      if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
8835        {
8836          my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve';
8837          $xml_space_preserve++ if $preserve;
8838
8839          push @sprint, $elt->start_tag unless( $no_tag);
8840
8841          # sprint the children
8842          my $child= $elt->{first_child};
8843          while( $child)
8844            { $child->_sprint;
8845              $child= $child->{next_sibling};
8846            }
8847          push @sprint, $elt->end_tag unless( $no_tag);
8848          $xml_space_preserve-- if $preserve;
8849        }
8850      else
8851        { push @sprint, $elt->{extra_data} if( $elt->{extra_data}) ;
8852          if(    (exists $elt->{'pcdata'}))  { push @sprint, $elt->pcdata_xml_string; }
8853          elsif( (exists $elt->{'cdata'}))   { push @sprint, $elt->cdata_string;      }
8854          elsif( (exists $elt->{'target'}))      { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; }
8855                                     push @sprint, $elt->pi_string;
8856                                   }
8857          elsif( (exists $elt->{'comment'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; }
8858                                     push @sprint, $elt->comment_string;
8859                                   }
8860          elsif( (exists $elt->{'ent'}))     { push @sprint, $elt->ent_string;        }
8861        }
8862
8863      return;
8864    }
8865
8866  # just a shortcut to $elt->sprint( 1)
8867  sub xml_string
8868    { my $elt= shift;
8869      isa( $_[0], 'HASH') ?  $elt->sprint( shift(), 1) : $elt->sprint( 1);
8870    }
8871
8872  sub pcdata_xml_string
8873    { my $elt= shift;
8874      if( defined( my $string= $elt->{pcdata}) )
8875        {
8876          if( ! $elt->{extra_data_in_pcdata})
8877            {
8878              $string=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g unless( !$replaced_ents || $keep_encoding || $elt->{asis});
8879              $string=~ s{\Q]]>}{]]&gt;}g;
8880            }
8881          else
8882            { _gen_mark( $string); # used by _(un)?protect_extra_data
8883              foreach my $data (reverse @{$elt->{extra_data_in_pcdata}})
8884                { my $substr= substr( $string, $data->{offset});
8885                  if( $keep_encoding || $elt->{asis})
8886                    { substr( $string, $data->{offset}, 0, $data->{text}); }
8887                  else
8888                    { substr( $string, $data->{offset}, 0, _protect_extra_data( $data->{text})); }
8889                }
8890              unless( $keep_encoding || $elt->{asis})
8891                {
8892                  $string=~ s{([$replaced_ents])}{$XML::Twig::base_ent{$1}}g ;
8893                  $string=~ s{\Q]]>}{]]&gt;}g;
8894                  _unprotect_extra_data( $string);
8895                }
8896            }
8897          return $output_text_filter ? $output_text_filter->( $string) : $string;
8898        }
8899      else
8900        { return ''; }
8901    }
8902
8903  { my $mark;
8904    my( %char2ent, %ent2char);
8905    BEGIN
8906      { %char2ent= ( '<' => 'lt', '&' => 'amp', '>' => 'gt');
8907        %ent2char= map { $char2ent{$_} => $_ } keys %char2ent;
8908      }
8909
8910    # generate a unique mark (a string) not found in the string,
8911    # used to mark < and & in the extra data
8912    sub _gen_mark
8913      { $mark="AAAA";
8914        $mark++ while( index( $_[0], $mark) > -1);
8915        return $mark;
8916      }
8917
8918    sub _protect_extra_data
8919      { my( $extra_data)= @_;
8920        $extra_data=~ s{([<&>])}{:$mark:$char2ent{$1}:}g;
8921        return $extra_data;
8922      }
8923
8924    sub _unprotect_extra_data
8925      { $_[0]=~ s{:$mark:(\w+):}{$ent2char{$1}}g; }
8926
8927  }
8928
8929  sub cdata_string
8930    { my $cdata= $_[0]->{cdata};
8931      unless( defined $cdata) { return ''; }
8932      if( $remove_cdata)
8933        { $cdata=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g; }
8934      else
8935        { $cdata= $CDATA_START . $cdata . $CDATA_END; }
8936      return $cdata;
8937   }
8938
8939  sub att_xml_string
8940    { my $elt= shift;
8941      my $att= shift;
8942
8943      my $replace= $replaced_ents . "$quote\n\r\t";
8944      if($_[0] && $_[0]->{escape_gt} && ($replace!~ m{>}) ) { $replace .='>'; }
8945
8946      if( defined (my $string= $elt->{att}->{$att}))
8947        { return _att_xml_string( $string, $replace); }
8948      else
8949        { return ''; }
8950    }
8951
8952  # escaped xml string for an attribute value
8953  sub _att_xml_string
8954    { my( $string, $escape)= @_;
8955      if( !defined( $string)) { return ''; }
8956      if( $keep_encoding)
8957        { $string=~ s{$quote}{$XML::Twig::base_ent{$quote}}g;
8958        }
8959      else
8960        {
8961          if( $do_not_escape_amp_in_atts)
8962            { $escape=~ s{^.}{}; # seems like the most backward compatible way to remove & from the list
8963              $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g;
8964              $string=~ s{&(?!(\w+|#\d+|[xX][0-9a-fA-F]+);)}{&amp;}g; # dodgy: escape & that do not start an entity
8965            }
8966          else
8967            { $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g;
8968              $string=~ s{\Q]]>}{]]&gt;}g;
8969            }
8970        }
8971
8972      return $output_text_filter ? $output_text_filter->( $string) : $string;
8973    }
8974
8975  sub ent_string
8976    { my $ent= shift;
8977      my $ent_text= $ent->{ent};
8978      my( $t, $el, $ent_string);
8979      if(    $expand_external_entities
8980          && ($t= $ent->twig)
8981          && ($el= $t->entity_list)
8982          && ($ent_string= $el->{entities}->{$ent->ent_name}->{val})
8983        )
8984        { return $ent_string; }
8985       else
8986         { return $ent_text;  }
8987    }
8988
8989  # returns just the text, no tags, for an element
8990  sub text
8991    { my( $elt, @options)= @_;
8992
8993      if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->text_only; }
8994      my $sep = (@options && grep { lc( $_) eq 'sep' } @options) ? ' ' : '';
8995
8996      my $string;
8997
8998      if( (exists $elt->{'pcdata'}))     { return  $elt->{pcdata}    . $sep;  }
8999      elsif( (exists $elt->{'cdata'}))   { return  $elt->{cdata}     . $sep;  }
9000      elsif( (exists $elt->{'target'}))      { return  $elt->pi_string . $sep;  }
9001      elsif( (exists $elt->{'comment'})) { return  $elt->{comment}   . $sep;  }
9002      elsif( (exists $elt->{'ent'}))     { return  $elt->{ent}       . $sep ; }
9003
9004
9005      my $child= $elt->{first_child} ||'';
9006      while( $child)
9007        {
9008          my $child_text= $child->text( @options);
9009          $string.= defined( $child_text) ? $sep . $child_text : '';
9010        } continue { $child= $child->{next_sibling}; }
9011
9012      unless( defined $string) { $string=''; }
9013
9014      return $output_text_filter ? $output_text_filter->( $string) : $string;
9015    }
9016
9017  sub text_only
9018    { return join '', map { $_->text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; }
9019
9020  sub trimmed_text
9021    { my $elt= shift;
9022      my $text= $elt->text( @_);
9023      $text=~ s{\s+}{ }sg;
9024      $text=~ s{^\s*}{};
9025      $text=~ s{\s*$}{};
9026      return $text;
9027    }
9028
9029  sub trim
9030    { my( $elt)= @_;
9031      my $pcdata= $elt->first_descendant( $TEXT);
9032      (my $pcdata_text= $pcdata->text)=~ s{^\s+}{}s;
9033      $pcdata->set_text( $pcdata_text);
9034      $pcdata= $elt->last_descendant( $TEXT);
9035      ($pcdata_text= $pcdata->text)=~ s{\s+$}{};
9036      $pcdata->set_text( $pcdata_text);
9037      foreach my $pcdata ($elt->descendants( $TEXT))
9038        { ($pcdata_text= $pcdata->text)=~ s{\s+}{ }g;
9039          $pcdata->set_text( $pcdata_text);
9040        }
9041      return $elt;
9042    }
9043
9044
9045  # remove cdata sections (turns them into regular pcdata) in an element
9046  sub remove_cdata
9047    { my $elt= shift;
9048      foreach my $cdata ($elt->descendants_or_self( $CDATA))
9049        { if( $keep_encoding)
9050            { my $data= $cdata->{cdata};
9051              $data=~ s{([&<"'])}{$XML::Twig::base_ent{$1}}g;
9052              $cdata->{pcdata}= (delete $cdata->{empty} || 1) &&  $data;
9053            }
9054          else
9055            { $cdata->{pcdata}= (delete $cdata->{empty} || 1) &&  $cdata->{cdata}; }
9056          $cdata->{gi}=$XML::Twig::gi2index{$PCDATA} or $cdata->set_gi( $PCDATA);
9057          undef $cdata->{cdata};
9058        }
9059    }
9060
9061sub _is_private      { return _is_private_name( $_[0]->gi); }
9062sub _is_private_name { return $_[0]=~ m{^#(?!default:)};                }
9063
9064
9065} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...)
9066
9067# merges consecutive #PCDATAs in am element
9068sub normalize
9069  { my( $elt)= @_;
9070    my @descendants= $elt->descendants( $PCDATA);
9071    while( my $desc= shift @descendants)
9072      { if( ! length $desc->{pcdata}) { $desc->delete; next; }
9073        while( @descendants && $desc->{next_sibling} && $desc->{next_sibling}== $descendants[0])
9074          { my $to_merge= shift @descendants;
9075            $desc->merge_text( $to_merge);
9076          }
9077      }
9078    return $elt;
9079  }
9080
9081# SAX export methods
9082sub toSAX1
9083  { _toSAX(@_, \&_start_tag_data_SAX1, \&_end_tag_data_SAX1); }
9084
9085sub toSAX2
9086  { _toSAX(@_, \&_start_tag_data_SAX2, \&_end_tag_data_SAX2); }
9087
9088sub _toSAX
9089  { my( $elt, $handler, $start_tag_data, $end_tag_data)= @_;
9090    if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
9091      { my $data= $start_tag_data->( $elt);
9092        _start_prefix_mapping( $elt, $handler, $data);
9093        if( $data && (my $start_element = $handler->can( 'start_element')))
9094          { unless( $elt->{'flushed'}) { $start_element->( $handler, $data); } }
9095
9096        foreach my $child ($elt->_children)
9097          { $child->_toSAX( $handler, $start_tag_data, $end_tag_data); }
9098
9099        if( (my $data= $end_tag_data->( $elt)) && (my $end_element = $handler->can( 'end_element')) )
9100          { $end_element->( $handler, $data); }
9101        _end_prefix_mapping( $elt, $handler);
9102      }
9103    else # text or special element
9104      { if( (exists $elt->{'pcdata'}) && (my $characters= $handler->can( 'characters')))
9105          { $characters->( $handler, { Data => $elt->{pcdata} });  }
9106        elsif( (exists $elt->{'cdata'}))
9107          { if( my $start_cdata= $handler->can( 'start_cdata'))
9108              { $start_cdata->( $handler); }
9109            if( my $characters= $handler->can( 'characters'))
9110              { $characters->( $handler, {Data => $elt->{cdata} });  }
9111            if( my $end_cdata= $handler->can( 'end_cdata'))
9112              { $end_cdata->( $handler); }
9113          }
9114        elsif( ((exists $elt->{'target'}))  && (my $pi= $handler->can( 'processing_instruction')))
9115          { $pi->( $handler, { Target =>$elt->{target}, Data => $elt->{data} });  }
9116        elsif( ((exists $elt->{'comment'}))  && (my $comment= $handler->can( 'comment')))
9117          { $comment->( $handler, { Data => $elt->{comment} });  }
9118        elsif( ((exists $elt->{'ent'})))
9119          {
9120            if( my $se=   $handler->can( 'skipped_entity'))
9121              { $se->( $handler, { Name => $elt->ent_name });  }
9122            elsif( my $characters= $handler->can( 'characters'))
9123              { if( defined $elt->ent_string)
9124                  { $characters->( $handler, {Data => $elt->ent_string});  }
9125                else
9126                  { $characters->( $handler, {Data => $elt->ent_name});  }
9127              }
9128          }
9129
9130      }
9131  }
9132
9133sub _start_tag_data_SAX1
9134  { my( $elt)= @_;
9135    my $name= $XML::Twig::index2gi[$elt->{'gi'}];
9136    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
9137    my $attributes={};
9138    my $atts= $elt->{att};
9139    while( my( $att, $value)= each %$atts)
9140      { $attributes->{$att}= $value unless( ( $att=~ m{^#(?!default:)} )); }
9141    my $data= { Name => $name, Attributes => $attributes};
9142    return $data;
9143  }
9144
9145sub _end_tag_data_SAX1
9146  { my( $elt)= @_;
9147    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
9148    return  { Name => $XML::Twig::index2gi[$elt->{'gi'}] };
9149  }
9150
9151sub _start_tag_data_SAX2
9152  { my( $elt)= @_;
9153    my $data={};
9154
9155    my $name= $XML::Twig::index2gi[$elt->{'gi'}];
9156    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
9157    $data->{Name}         = $name;
9158    $data->{Prefix}       = $elt->ns_prefix;
9159    $data->{LocalName}    = $elt->local_name;
9160    $data->{NamespaceURI} = $elt->namespace;
9161
9162    # save a copy of the data so we can re-use it for the end tag
9163    my %sax2_data= %$data;
9164    $elt->{twig_elt_SAX2_data}= \%sax2_data;
9165
9166    # add the attributes
9167    $data->{Attributes}= $elt->_atts_to_SAX2;
9168
9169    return $data;
9170  }
9171
9172sub _atts_to_SAX2
9173  { my $elt= shift;
9174    my $SAX2_atts= {};
9175    foreach my $att (keys %{$elt->{att}})
9176      {
9177        next if( ( $att=~ m{^#(?!default:)} ));
9178        my $SAX2_att={};
9179        $SAX2_att->{Name}         = $att;
9180        $SAX2_att->{Prefix}       = _ns_prefix( $att);
9181        $SAX2_att->{LocalName}    = _local_name( $att);
9182        $SAX2_att->{NamespaceURI} = $elt->namespace( $SAX2_att->{Prefix});
9183        $SAX2_att->{Value}        = $elt->{'att'}->{$att};
9184        my $SAX2_att_name= "{$SAX2_att->{NamespaceURI}}$SAX2_att->{LocalName}";
9185
9186        $SAX2_atts->{$SAX2_att_name}= $SAX2_att;
9187      }
9188    return $SAX2_atts;
9189  }
9190
9191sub _start_prefix_mapping
9192  { my( $elt, $handler, $data)= @_;
9193    if( my $start_prefix_mapping= $handler->can( 'start_prefix_mapping')
9194        and my @new_prefix_mappings= grep { /^\{[^}]*\}xmlns/ || /^\{$XMLNS_URI\}/ } keys %{$data->{Attributes}}
9195      )
9196      { foreach my $prefix (@new_prefix_mappings)
9197          { my $prefix_string= $data->{Attributes}->{$prefix}->{LocalName};
9198            if( $prefix_string eq 'xmlns') { $prefix_string=''; }
9199            my $prefix_data=
9200              {  Prefix       => $prefix_string,
9201                 NamespaceURI => $data->{Attributes}->{$prefix}->{Value}
9202              };
9203            $start_prefix_mapping->( $handler, $prefix_data);
9204            $elt->{twig_end_prefix_mapping} ||= [];
9205            push @{$elt->{twig_end_prefix_mapping}}, $prefix_string;
9206          }
9207      }
9208  }
9209
9210sub _end_prefix_mapping
9211  { my( $elt, $handler)= @_;
9212    if( my $end_prefix_mapping= $handler->can( 'end_prefix_mapping'))
9213      { foreach my $prefix (@{$elt->{twig_end_prefix_mapping}})
9214          { $end_prefix_mapping->( $handler, { Prefix => $prefix} ); }
9215      }
9216  }
9217
9218sub _end_tag_data_SAX2
9219  { my( $elt)= @_;
9220    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
9221    return $elt->{twig_elt_SAX2_data};
9222  }
9223
9224sub contains_text
9225  { my $elt= shift;
9226    my $child= $elt->{first_child};
9227    while ($child)
9228      { return 1 if( $child->is_text || (exists $child->{'ent'}));
9229        $child= $child->{next_sibling};
9230      }
9231    return 0;
9232  }
9233
9234# creates a single pcdata element containing the text as child of the element
9235# options:
9236#   - force_pcdata: when set to a true value forces the text to be in a #PCDATA
9237#                   even if the original element was a #CDATA
9238sub set_text
9239  { my( $elt, $string, %option)= @_;
9240
9241    if( $XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA)
9242      { return $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $string; }
9243    elsif( $XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA)
9244      { if( $option{force_pcdata})
9245          { $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA);
9246            $elt->{cdata}= '';
9247            return $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $string;
9248          }
9249        else
9250          { $elt->{cdata}=  $string;
9251            return $string;
9252          }
9253      }
9254    elsif( $elt->contains_a_single( $PCDATA) )
9255      { # optimized so we have a slight chance of not losing embedded comments and pi's
9256        $elt->{first_child}->set_pcdata( $string);
9257        return $elt;
9258      }
9259
9260    foreach my $child (@{[$elt->_children]})
9261      { $child->delete; }
9262
9263    my $pcdata= $elt->_new_pcdata( $string);
9264    $pcdata->paste( $elt);
9265
9266    delete $elt->{empty};
9267
9268    return $elt;
9269  }
9270
9271# set the content of an element from a list of strings and elements
9272sub set_content
9273  { my $elt= shift;
9274
9275    return $elt unless defined $_[0];
9276
9277    # attributes can be given as a hash (passed by ref)
9278    if( ref $_[0] eq 'HASH')
9279      { my $atts= shift;
9280        $elt->del_atts; # usually useless but better safe than sorry
9281        $elt->set_atts( $atts);
9282        return $elt unless defined $_[0];
9283      }
9284
9285    # check next argument for #EMPTY
9286    if( !(ref $_[0]) && ($_[0] eq $EMPTY) )
9287      { $elt->{empty}= 1; return $elt; }
9288
9289    # case where we really want to do a set_text, the element is '#PCDATA'
9290    # or contains a single PCDATA and we only want to add text in it
9291    if( ($XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA || $elt->contains_a_single( $PCDATA))
9292        && (@_ == 1) && !( ref $_[0]))
9293      { $elt->set_text( $_[0]);
9294        return $elt;
9295      }
9296    elsif( ($XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) && (@_ == 1) && !( ref $_[0]))
9297      { $elt->{cdata}=  $_[0];
9298        return $elt;
9299      }
9300
9301    # delete the children
9302    foreach my $child (@{[$elt->_children]})
9303      { $child->delete; }
9304
9305    if( @_) { delete $elt->{empty}; }
9306
9307    foreach my $child (@_)
9308      { if( ref( $child) && isa( $child, 'XML::Twig::Elt'))
9309          { # argument is an element
9310            $child->paste( 'last_child', $elt);
9311          }
9312        else
9313          { # argument is a string
9314            if( (my $pcdata= $elt->{last_child}) && $elt->{last_child}->is_pcdata)
9315              { # previous child is also pcdata: just concatenate
9316                $pcdata->{pcdata}= (delete $pcdata->{empty} || 1) &&  $pcdata->{pcdata} . $child
9317              }
9318            else
9319              { # previous child is not a string: create a new pcdata element
9320                $pcdata= $elt->_new_pcdata( $child);
9321                $pcdata->paste( 'last_child', $elt);
9322              }
9323          }
9324      }
9325
9326
9327    return $elt;
9328  }
9329
9330# inserts an element (whose gi is given) as child of the element
9331# all children of the element are now children of the new element
9332# returns the new element
9333sub insert
9334  { my ($elt, @args)= @_;
9335    # first cut the children
9336    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
9337    foreach my $child (@children)
9338      { $child->cut; }
9339    # insert elements
9340    while( my $gi= shift @args)
9341      { my $new_elt= $elt->new( $gi);
9342        # add attributes if needed
9343        if( defined( $args[0]) && ( isa( $args[0], 'HASH')) )
9344          { $new_elt->set_atts( shift @args); }
9345        # paste the element
9346        $new_elt->paste( $elt);
9347        delete $elt->{empty};
9348        $elt= $new_elt;
9349      }
9350    # paste back the children
9351    foreach my $child (@children)
9352      { $child->paste( 'last_child', $elt); }
9353    return $elt;
9354  }
9355
9356# insert a new element
9357# $elt->insert_new_element( $opt_position, $gi, $opt_atts_hash, @opt_content);
9358# the element is created with the same syntax as new
9359# position is the same as in paste, first_child by default
9360sub insert_new_elt
9361  { my $elt= shift;
9362    my $position= $_[0];
9363    if(     ($position eq 'before') || ($position eq 'after')
9364         || ($position eq 'first_child') || ($position eq 'last_child'))
9365      { shift; }
9366    else
9367      { $position= 'first_child'; }
9368
9369    my $new_elt= $elt->new( @_);
9370    $new_elt->paste( $position, $elt);
9371
9372    #if( defined $new_elt->{'att'}->{$ID}) { $new_elt->set_id( $new_elt->{'att'}->{$ID}); }
9373
9374    return $new_elt;
9375  }
9376
9377# wraps an element in elements which gi's are given as arguments
9378# $elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single
9379# cell in a table for example
9380# returns the new element
9381sub wrap_in
9382  { my $elt= shift;
9383    while( my $gi = shift @_)
9384      { my $new_elt = $elt->new( $gi);
9385        if( $elt->{twig_current})
9386          { my $t= $elt->twig;
9387            $t->{twig_current}= $new_elt;
9388            delete $elt->{'twig_current'};
9389            $new_elt->{'twig_current'}=1;
9390          }
9391
9392        if( my $parent= $elt->{parent})
9393          { $new_elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $new_elt->{parent});} ;
9394            if( $parent->{first_child} == $elt) { $parent->{first_child}=  $new_elt; }
9395             if( $parent->{last_child} == $elt) {  delete $parent->{empty}; $parent->{last_child}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;  }
9396          }
9397        else
9398          { # wrapping the root
9399            my $twig= $elt->twig;
9400            if( $twig && $twig->root && ($twig->root eq $elt) )
9401              { $twig->set_root( $new_elt);
9402              }
9403          }
9404
9405        if( my $prev_sibling= $elt->{prev_sibling})
9406          { $new_elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $new_elt->{prev_sibling});} ;
9407            $prev_sibling->{next_sibling}=  $new_elt;
9408          }
9409
9410        if( my $next_sibling= $elt->{next_sibling})
9411          { $new_elt->{next_sibling}=  $next_sibling;
9412            $next_sibling->{prev_sibling}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
9413          }
9414        $new_elt->{first_child}=  $elt;
9415         delete $new_elt->{empty}; $new_elt->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $new_elt->{last_child});} ;
9416
9417        $elt->{parent}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
9418        $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
9419        $elt->{next_sibling}=  undef;
9420
9421        # add the attributes if the next argument is a hash ref
9422        if( defined( $_[0]) && (isa( $_[0], 'HASH')) )
9423          { $new_elt->set_atts( shift @_); }
9424
9425        $elt= $new_elt;
9426      }
9427
9428    return $elt;
9429  }
9430
9431sub replace
9432  { my( $elt, $ref)= @_;
9433
9434    if( $elt->{parent}) { $elt->cut; }
9435
9436    if( my $parent= $ref->{parent})
9437      { $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
9438        if( $parent->{first_child} == $ref) { $parent->{first_child}=  $elt; }
9439        if( $parent->{last_child} == $ref)  {  delete $parent->{empty}; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});}  ; }
9440      }
9441    elsif( $ref->twig && $ref == $ref->twig->root)
9442      { $ref->twig->set_root( $elt); }
9443
9444    if( my $prev_sibling= $ref->{prev_sibling})
9445      { $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
9446        $prev_sibling->{next_sibling}=  $elt;
9447      }
9448    if( my $next_sibling= $ref->{next_sibling})
9449      { $elt->{next_sibling}=  $next_sibling;
9450        $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
9451      }
9452
9453    $ref->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{parent});} ;
9454    $ref->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{prev_sibling});} ;
9455    $ref->{next_sibling}=  undef;
9456    return $ref;
9457  }
9458
9459sub replace_with
9460  { my $ref= shift;
9461    my $elt= shift;
9462    $elt->replace( $ref);
9463    foreach my $new_elt (reverse @_)
9464      { $new_elt->paste( after => $elt); }
9465    return $elt;
9466  }
9467
9468
9469# move an element, same syntax as paste, except the element is first cut
9470sub move
9471  { my $elt= shift;
9472    $elt->cut;
9473    $elt->paste( @_);
9474    return $elt;
9475  }
9476
9477
9478# adds a prefix to an element, creating a pcdata child if needed
9479sub prefix
9480  { my ($elt, $prefix, $option)= @_;
9481    my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
9482    if( (exists $elt->{'pcdata'})
9483        && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
9484      )
9485      { $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $prefix . $elt->{pcdata}; }
9486    elsif( $elt->{first_child} && $elt->{first_child}->is_pcdata
9487        && (   ($asis && $elt->{first_child}->{asis})
9488            || (!$asis && ! $elt->{first_child}->{asis}))
9489         )
9490      {
9491        $elt->{first_child}->set_pcdata( $prefix . $elt->{first_child}->pcdata);
9492      }
9493    else
9494      { my $new_elt= $elt->_new_pcdata( $prefix);
9495        my $pos= (exists $elt->{'pcdata'}) ? 'before' : 'first_child';
9496        $new_elt->paste( $pos => $elt);
9497        if( $asis) { $new_elt->set_asis; }
9498      }
9499    return $elt;
9500  }
9501
9502# adds a suffix to an element, creating a pcdata child if needed
9503sub suffix
9504  { my ($elt, $suffix, $option)= @_;
9505    my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
9506    if( (exists $elt->{'pcdata'})
9507        && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
9508      )
9509      { $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $elt->{pcdata} . $suffix; }
9510    elsif( $elt->{last_child} && $elt->{last_child}->is_pcdata
9511        && (   ($asis && $elt->{last_child}->{asis})
9512            || (!$asis && ! $elt->{last_child}->{asis}))
9513         )
9514      { $elt->{last_child}->set_pcdata( $elt->{last_child}->pcdata . $suffix); }
9515    else
9516      { my $new_elt= $elt->_new_pcdata( $suffix);
9517        my $pos= (exists $elt->{'pcdata'}) ? 'after' : 'last_child';
9518        $new_elt->paste( $pos => $elt);
9519        if( $asis) { $new_elt->set_asis; }
9520      }
9521    return $elt;
9522  }
9523
9524# create a path to an element ('/root/.../gi)
9525sub path
9526  { my $elt= shift;
9527    my @context= ( $elt, $elt->ancestors);
9528    return "/" . join( "/", reverse map {$_->gi} @context);
9529  }
9530
9531sub xpath
9532  { my $elt= shift;
9533    my $xpath;
9534    foreach my $ancestor (reverse $elt->ancestors_or_self)
9535      { my $gi= $XML::Twig::index2gi[$ancestor->{'gi'}];
9536        $xpath.= "/$gi";
9537        my $index= $ancestor->prev_siblings( $gi) + 1;
9538        unless( ($index == 1) && !$ancestor->next_sibling( $gi))
9539          { $xpath.= "[$index]"; }
9540      }
9541    return $xpath;
9542  }
9543
9544# methods used mainly by wrap_children
9545
9546# return a string with the
9547# for an element <foo><elt att="val">...</elt><elt2/><elt>...</elt></foo>
9548# returns '<elt att="val"><elt2><elt>'
9549sub _stringify_struct
9550  { my( $elt, %opt)= @_;
9551    my $string='';
9552    my $pretty_print= set_pretty_print( 'none');
9553    foreach my $child ($elt->_children)
9554      { $child->add_id; $string .= $child->start_tag( { escape_gt => 1 }) ||''; }
9555    set_pretty_print( $pretty_print);
9556    return $string;
9557  }
9558
9559# wrap a series of elements in a new one
9560sub _wrap_range
9561  { my $elt= shift;
9562    my $gi= shift;
9563    my $atts= isa( $_[0], 'HASH') ? shift : undef;
9564    my $range= shift; # the string with the tags to wrap
9565
9566    my $t= $elt->twig;
9567
9568    # get the tags to wrap
9569    my @to_wrap;
9570    while( $range=~ m{<\w+\s+[^>]*id=("[^"]*"|'[^']*')[^>]*>}g)
9571      { push @to_wrap, $t->elt_id( substr( $1, 1, -1)); }
9572
9573    return '' unless @to_wrap;
9574
9575    my $to_wrap= shift @to_wrap;
9576    my %atts= %$atts;
9577    my $new_elt= $to_wrap->wrap_in( $gi, \%atts);
9578    $_->move( last_child => $new_elt) foreach (@to_wrap);
9579
9580    return '';
9581  }
9582
9583# wrap children matching a regexp in a new element
9584sub wrap_children
9585  { my( $elt, $regexp, $gi, $atts)= @_;
9586
9587    $atts ||={};
9588
9589    my $elt_as_string= $elt->_stringify_struct; # stringify the elt structure
9590    $regexp=~ s{(<[^>]*>)}{_match_expr( $1)}eg; # in the regexp, replace gi's by the proper regexp
9591    $elt_as_string=~ s{($regexp)}{$elt->_wrap_range( $gi, $atts, $1)}eg; # then do the actual replace
9592
9593    return $elt;
9594  }
9595
9596sub _match_expr
9597  { my $tag= shift;
9598    my( $gi, %atts)= XML::Twig::_parse_start_tag( $tag);
9599    return _match_tag( $gi, %atts);
9600  }
9601
9602
9603sub _match_tag
9604  { my( $elt, %atts)= @_;
9605    my $string= "<$elt\\b";
9606    foreach my $key (sort keys %atts)
9607      { my $val= qq{\Q$atts{$key}\E};
9608        $string.= qq{[^>]*$key=(?:"$val"|'$val')};
9609      }
9610    $string.=  qq{[^>]*>};
9611    return "(?:$string)";
9612  }
9613
9614sub field_to_att
9615  { my( $elt, $cond, $att)= @_;
9616    $att ||= $cond;
9617    my $child= $elt->first_child( $cond) or return undef;
9618    $elt->set_att( $att => $child->text);
9619    $child->cut;
9620    return $elt;
9621  }
9622
9623sub att_to_field
9624  { my( $elt, $att, $tag)= @_;
9625    $tag ||= $att;
9626    my $child= $elt->insert_new_elt( first_child => $tag, $elt->{'att'}->{$att});
9627    $elt->del_att( $att);
9628    return $elt;
9629  }
9630
9631# sort children methods
9632
9633sub sort_children_on_field
9634  { my $elt   = shift;
9635    my $field = shift;
9636    my $get_key= sub { return $_[0]->field( $field) };
9637    return $elt->sort_children( $get_key, @_);
9638  }
9639
9640sub sort_children_on_att
9641  { my $elt = shift;
9642    my $att = shift;
9643    my $get_key= sub { return $_[0]->{'att'}->{$att} };
9644    return $elt->sort_children( $get_key, @_);
9645  }
9646
9647sub sort_children_on_value
9648  { my $elt   = shift;
9649    #my $get_key= eval qq{ sub { $NO_WARNINGS; return \$_[0]->text } };
9650    my $get_key= \&text;
9651    return $elt->sort_children( $get_key, @_);
9652  }
9653
9654sub sort_children
9655  { my( $elt, $get_key, %opt)=@_;
9656    $opt{order} ||= 'normal';
9657    $opt{type}  ||= 'alpha';
9658    my( $par_a, $par_b)= ($opt{order} eq 'reverse') ? qw( b a) : qw ( a b) ;
9659    my $op= ($opt{type} eq 'numeric') ? '<=>' : 'cmp' ;
9660    my @children= $elt->cut_children;
9661    if( $opt{type} eq 'numeric')
9662      {  @children= map  { $_->[1] }
9663                    sort { $a->[0] <=> $b->[0] }
9664                    map  { [ $get_key->( $_), $_] } @children;
9665      }
9666    elsif( $opt{type} eq 'alpha')
9667      {  @children= map  { $_->[1] }
9668                    sort { $a->[0] cmp $b->[0] }
9669                    map  { [ $get_key->( $_), $_] } @children;
9670      }
9671    else
9672      { croak "wrong sort type '$opt{type}', should be either 'alpha' or 'numeric'"; }
9673
9674    @children= reverse @children if( $opt{order} eq 'reverse');
9675    $elt->set_content( @children);
9676  }
9677
9678
9679# comparison methods
9680
9681sub before
9682  { my( $a, $b)=@_;
9683    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
9684  }
9685
9686sub after
9687  { my( $a, $b)=@_;
9688    if( $a->cmp( $b) == 1) { return 1; } else { return 0; }
9689  }
9690
9691sub lt
9692  { my( $a, $b)=@_;
9693    return 1 if( $a->cmp( $b) == -1);
9694    return 0;
9695  }
9696
9697sub le
9698  { my( $a, $b)=@_;
9699    return 1 unless( $a->cmp( $b) == 1);
9700    return 0;
9701  }
9702
9703sub gt
9704  { my( $a, $b)=@_;
9705    return 1 if( $a->cmp( $b) == 1);
9706    return 0;
9707  }
9708
9709sub ge
9710  { my( $a, $b)=@_;
9711    return 1 unless( $a->cmp( $b) == -1);
9712    return 0;
9713  }
9714
9715
9716sub cmp
9717  { my( $a, $b)=@_;
9718
9719    # easy cases
9720    return  0 if( $a == $b);
9721    return  1 if( $a->in($b)); # a in b => a starts after b
9722    return -1 if( $b->in($a)); # b in a => a starts before b
9723
9724    # ancestors does not include the element itself
9725    my @a_pile= ($a, $a->ancestors);
9726    my @b_pile= ($b, $b->ancestors);
9727
9728    # the 2 elements are not in the same twig
9729    return undef unless( $a_pile[-1] == $b_pile[-1]);
9730
9731    # find the first non common ancestors (they are siblings)
9732    my $a_anc= pop @a_pile;
9733    my $b_anc= pop @b_pile;
9734
9735    while( $a_anc == $b_anc)
9736      { $a_anc= pop @a_pile;
9737        $b_anc= pop @b_pile;
9738      }
9739
9740    # from there move left and right and figure out the order
9741    my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
9742    while()
9743      { $a_prev= $a_prev->{prev_sibling} || return( -1);
9744        return 1 if( $a_prev == $b_next);
9745        $a_next= $a_next->{next_sibling} || return( 1);
9746        return -1 if( $a_next == $b_prev);
9747        $b_prev= $b_prev->{prev_sibling} || return( 1);
9748        return -1 if( $b_prev == $a_next);
9749        $b_next= $b_next->{next_sibling} || return( -1);
9750        return 1 if( $b_next == $a_prev);
9751      }
9752  }
9753
9754sub _dump
9755  { my( $elt, $option)= @_;
9756
9757    my $atts       = defined $option->{atts}       ? $option->{atts}       :  1;
9758    my $extra      = defined $option->{extra}      ? $option->{extra}      :  0;
9759    my $short_text = defined $option->{short_text} ? $option->{short_text} : 40;
9760
9761    my $sp= '| ';
9762    my $indent= $sp x $elt->level;
9763    my $indent_sp= '  ' x $elt->level;
9764
9765    my $dump='';
9766    if( $elt->is_elt)
9767      {
9768        $dump .= $indent  . '|-' . $XML::Twig::index2gi[$elt->{'gi'}];
9769
9770        if( $atts && (my @atts= $elt->att_names) )
9771          { $dump .= ' ' . join( ' ', map { qq{$_="} . $elt->{'att'}->{$_} . qq{"} } @atts); }
9772
9773        $dump .= "\n";
9774        if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); }
9775        $dump .= join( "", map { $_->_dump( $option) } do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; });
9776      }
9777    else
9778      {
9779        if( (exists $elt->{'pcdata'}))
9780          { $dump .= "$indent|-PCDATA:  '"  . _short_text( $elt->{pcdata}, $short_text) . "'\n" }
9781        elsif( (exists $elt->{'ent'}))
9782          { $dump .= "$indent|-ENTITY:  '" . _short_text( $elt->{ent}, $short_text) . "'\n" }
9783        elsif( (exists $elt->{'cdata'}))
9784          { $dump .= "$indent|-CDATA:   '" . _short_text( $elt->{cdata}, $short_text) . "'\n" }
9785        elsif( (exists $elt->{'comment'}))
9786          { $dump .= "$indent|-COMMENT: '" . _short_text( $elt->comment_string, $short_text) . "'\n" }
9787        elsif( (exists $elt->{'target'}))
9788          { $dump .= "$indent|-PI:      '"      . $elt->{target} . "' - '" . _short_text( $elt->{data}, $short_text) . "'\n" }
9789        if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); }
9790      }
9791    return $dump;
9792  }
9793
9794sub _dump_extra_data
9795  { my( $elt, $indent, $indent_sp, $short_text)= @_;
9796    my $dump='';
9797    if( $elt->extra_data)
9798      { my $extra_data = $indent . "|-- (cpi before) '" . _short_text( $elt->extra_data, $short_text) . "'";
9799        $extra_data=~ s{\n}{$indent_sp}g;
9800        $dump .= $extra_data . "\n";
9801      }
9802    if( $elt->{extra_data_in_pcdata})
9803      { foreach my $data ( @{$elt->{extra_data_in_pcdata}})
9804          { my $extra_data = $indent . "|-- (cpi offset $data->{offset}) '" . _short_text( $data->{text}, $short_text) . "'";
9805            $extra_data=~ s{\n}{$indent_sp}g;
9806            $dump .= $extra_data . "\n";
9807          }
9808      }
9809    if( $elt->{extra_data_before_end_tag})
9810      { my $extra_data = $indent . "|-- (cpi end) '" . _short_text( $elt->{extra_data_before_end_tag}, $short_text) . "'";
9811        $extra_data=~ s{\n}{$indent_sp}g;
9812        $dump .= $extra_data . "\n";
9813      }
9814    return $dump;
9815  }
9816
9817
9818sub _short_text
9819  { my( $string, $length)= @_;
9820    if( !$length || (length( $string) < $length) ) { return $string; }
9821    my $l1= (length( $string) -5) /2;
9822    my $l2= length( $string) - ($l1 + 5);
9823    return substr( $string, 0, $l1) . ' ... ' . substr( $string, -$l2);
9824  }
9825
9826
9827sub _and { return _join_defined( ' && ',  @_); }
9828sub _join_defined { return join( shift(), grep { $_ } @_); }
9829
98301;
9831__END__
9832
9833=head1 NAME
9834
9835XML::Twig - A perl module for processing huge XML documents in tree mode.
9836
9837=head1 SYNOPSIS
9838
9839Note that this documentation is intended as a reference to the module.
9840
9841Complete docs, including a tutorial, examples, an easier to use HTML version,
9842a quick reference card and a FAQ are available at L<http://www.xmltwig.org/xmltwig>
9843
9844Small documents (loaded in memory as a tree):
9845
9846  my $twig=XML::Twig->new();    # create the twig
9847  $twig->parsefile( 'doc.xml'); # build it
9848  my_process( $twig);           # use twig methods to process it
9849  $twig->print;                 # output the twig
9850
9851Huge documents (processed in combined stream/tree mode):
9852
9853  # at most one div will be loaded in memory
9854  my $twig=XML::Twig->new(
9855    twig_handlers =>
9856      { title   => sub { $_->set_tag( 'h2') }, # change title tags to h2
9857                                               # $_ is the current element
9858        para    => sub { $_->set_tag( 'p')  }, # change para to p
9859        hidden  => sub { $_->delete;       },  # remove hidden elements
9860        list    => \&my_list_process,          # process list elements
9861        div     => sub { $_[0]->flush;     },  # output and free memory
9862      },
9863    pretty_print => 'indented',                # output will be nicely formatted
9864    empty_tags   => 'html',                    # outputs <empty_tag />
9865                         );
9866  $twig->parsefile( 'my_big.xml');
9867
9868  sub my_list_process
9869    { my( $twig, $list)= @_;
9870      # ...
9871    }
9872
9873See L<XML::Twig 101|/XML::Twig 101> for other ways to use the module, as a
9874filter for example.
9875
9876=encoding utf8
9877
9878=head1 DESCRIPTION
9879
9880This module provides a way to process XML documents. It is build on top
9881of C<XML::Parser>.
9882
9883The module offers a tree interface to the document, while allowing you
9884to output the parts of it that have been completely processed.
9885
9886It allows minimal resource (CPU and memory) usage by building the tree
9887only for the parts of the documents that need actual processing, through the
9888use of the C<L<twig_roots> > and
9889C<L<twig_print_outside_roots> > options. The
9890C<L<finish> > and C<L<finish_print> > methods also help
9891to increase performances.
9892
9893XML::Twig tries to make simple things easy so it tries its best to takes care
9894of a lot of the (usually) annoying (but sometimes necessary) features that
9895come with XML and XML::Parser.
9896
9897=head1 TOOLS
9898
9899XML::Twig comes with a few command-line utilities:
9900
9901=head2 xml_pp - xml pretty-printer
9902
9903XML pretty printer using XML::Twig
9904
9905=head2 xml_grep - grep XML files looking for specific elements
9906
9907C<xml_grep> does a grep on XML files. Instead of using regular expressions
9908it uses XPath expressions (in fact the subset of XPath supported by
9909XML::Twig).
9910
9911=head2 xml_split - cut a big XML file into smaller chunks
9912
9913C<xml_split> takes a (presumably big) XML file and split it in several smaller
9914files, based on various criteria (level in the tree, size or an XPath
9915expression)
9916
9917=head2 xml_merge - merge back XML files split with xml_split
9918
9919C<xml_merge> takes several xml files that have been split using C<xml_split>
9920and recreates a single file.
9921
9922=head2 xml_spellcheck - spellcheck XML files
9923
9924C<xml_spellcheck> lets you spell check the content of an XML file. It extracts
9925the text (the content of elements and optionally of attributes), call a spell
9926checker on it and then recreates the XML document.
9927
9928
9929=head1 XML::Twig 101
9930
9931XML::Twig can be used either on "small" XML documents (that fit in memory)
9932or on huge ones, by processing parts of the document and outputting or
9933discarding them once they are processed.
9934
9935
9936=head2 Loading an XML document and processing it
9937
9938  my $t= XML::Twig->new();
9939  $t->parse( '<d><title>title</title><para>p 1</para><para>p 2</para></d>');
9940  my $root= $t->root;
9941  $root->set_tag( 'html');              # change doc to html
9942  $title= $root->first_child( 'title'); # get the title
9943  $title->set_tag( 'h1');               # turn it into h1
9944  my @para= $root->children( 'para');   # get the para children
9945  foreach my $para (@para)
9946    { $para->set_tag( 'p'); }           # turn them into p
9947  $t->print;                            # output the document
9948
9949Other useful methods include:
9950
9951L<att>: C<< $elt->{'att'}->{'foo'} >> return the C<foo> attribute for an
9952element,
9953
9954L<set_att> : C<< $elt->set_att( foo => "bar") >> sets the C<foo>
9955attribute to the C<bar> value,
9956
9957L<next_sibling>: C<< $elt->{next_sibling} >> return the next sibling
9958in the document (in the example C<< $title->{next_sibling} >> is the first
9959C<para>, you can also (and actually should) use
9960C<< $elt->next_sibling( 'para') >> to get it
9961
9962The document can also be transformed through the use of the L<cut>,
9963L<copy>, L<paste> and L<move> methods:
9964C<< $title->cut; $title->paste( after => $p); >> for example
9965
9966And much, much more, see L<XML::Twig::Elt|/XML::Twig::Elt>.
9967
9968=head2 Processing an XML document chunk by chunk
9969
9970One of the strengths of XML::Twig is that it let you work with files that do
9971not fit in memory (BTW storing an XML document in memory as a tree is quite
9972memory-expensive, the expansion factor being often around 10).
9973
9974To do this you can define handlers, that will be called once a specific
9975element has been completely parsed. In these handlers you can access the
9976element and process it as you see fit, using the navigation and the
9977cut-n-paste methods, plus lots of convenient ones like C<L<prefix> >.
9978Once the element is completely processed you can then C<L<flush> > it,
9979which will output it and free the memory. You can also C<L<purge> > it
9980if you don't need to output it (if you are just extracting some data from
9981the document for example). The handler will be called again once the next
9982relevant element has been parsed.
9983
9984  my $t= XML::Twig->new( twig_handlers =>
9985                          { section => \&section,
9986                            para   => sub { $_->set_tag( 'p'); }
9987                          },
9988                       );
9989  $t->parsefile( 'doc.xml');
9990
9991  # the handler is called once a section is completely parsed, ie when
9992  # the end tag for section is found, it receives the twig itself and
9993  # the element (including all its sub-elements) as arguments
9994  sub section
9995    { my( $t, $section)= @_;      # arguments for all twig_handlers
9996      $section->set_tag( 'div');  # change the tag name
9997      # let's use the attribute nb as a prefix to the title
9998      my $title= $section->first_child( 'title'); # find the title
9999      my $nb= $title->{'att'}->{'nb'}; # get the attribute
10000      $title->prefix( "$nb - ");  # easy isn't it?
10001      $section->flush;            # outputs the section and frees memory
10002    }
10003
10004
10005There is of course more to it: you can trigger handlers on more elaborate
10006conditions than just the name of the element, C<section/title> for example.
10007
10008  my $t= XML::Twig->new( twig_handlers =>
10009                           { 'section/title' => sub { $_->print } }
10010                       )
10011                  ->parsefile( 'doc.xml');
10012
10013Here C<< sub { $_->print } >> simply prints the current element (C<$_> is aliased
10014to the element in the handler).
10015
10016You can also trigger a handler on a test on an attribute:
10017
10018  my $t= XML::Twig->new( twig_handlers =>
10019                      { 'section[@level="1"]' => sub { $_->print } }
10020                       );
10021                  ->parsefile( 'doc.xml');
10022
10023You can also use C<L<start_tag_handlers> > to process an
10024element as soon as the start tag is found. Besides C<L<prefix> > you
10025can also use C<L<suffix> >,
10026
10027=head2 Processing just parts of an XML document
10028
10029The twig_roots mode builds only the required sub-trees from the document
10030Anything outside of the twig roots will just be ignored:
10031
10032  my $t= XML::Twig->new(
10033       # the twig will include just the root and selected titles
10034           twig_roots   => { 'section/title' => \&print_n_purge,
10035                             'annex/title'   => \&print_n_purge
10036           }
10037                      );
10038  $t->parsefile( 'doc.xml');
10039
10040  sub print_n_purge
10041    { my( $t, $elt)= @_;
10042      print $elt->text;    # print the text (including sub-element texts)
10043      $t->purge;           # frees the memory
10044    }
10045
10046You can use that mode when you want to process parts of a documents but are
10047not interested in the rest and you don't want to pay the price, either in
10048time or memory, to build the tree for the it.
10049
10050
10051=head2 Building an XML filter
10052
10053You can combine the C<twig_roots> and the C<twig_print_outside_roots> options to
10054build filters, which let you modify selected elements and will output the rest
10055of the document as is.
10056
10057This would convert prices in $ to prices in Euro in a document:
10058
10059  my $t= XML::Twig->new(
10060           twig_roots   => { 'price' => \&convert, },   # process prices
10061           twig_print_outside_roots => 1,               # print the rest
10062                      );
10063  $t->parsefile( 'doc.xml');
10064
10065  sub convert
10066    { my( $t, $price)= @_;
10067      my $currency=  $price->{'att'}->{'currency'};          # get the currency
10068      if( $currency eq 'USD')
10069        { $usd_price= $price->text;                     # get the price
10070          # %rate is just a conversion table
10071          my $euro_price= $usd_price * $rate{usd2euro};
10072          $price->set_text( $euro_price);               # set the new price
10073          $price->set_att( currency => 'EUR');          # don't forget this!
10074        }
10075      $price->print;                                    # output the price
10076    }
10077
10078=head2 XML::Twig and various versions of Perl, XML::Parser and expat:
10079
10080XML::Twig is a lot more sensitive to variations in versions of perl,
10081XML::Parser and expat than to the OS, so this should cover some
10082reasonable configurations.
10083
10084The "recommended configuration" is perl 5.8.3+ (for good Unicode
10085support), XML::Parser 2.31+ and expat 1.95.5+
10086
10087See L<http://testers.cpan.org/search?request=dist&dist=XML-Twig> for the
10088CPAN testers reports on XML::Twig, which list all tested configurations.
10089
10090An Atom feed of the CPAN Testers results is available at
10091L<http://xmltwig.org/rss/twig_testers.rss>
10092
10093Finally:
10094
10095=over 4
10096
10097=item XML::Twig does B<NOT> work with expat 1.95.4
10098
10099=item  XML::Twig only works with XML::Parser 2.27 in perl 5.6.*
10100
10101Note that I can't compile XML::Parser 2.27 anymore, so I can't guarantee
10102that it still works
10103
10104=item XML::Parser 2.28 does not really work
10105
10106=back
10107
10108When in doubt, upgrade expat, XML::Parser and Scalar::Util
10109
10110Finally, for some optional features, XML::Twig depends on some additional
10111modules. The complete list, which depends somewhat on the version of Perl
10112that you are running, is given by running C<t/zz_dump_config.t>
10113
10114=head1 Simplifying XML processing
10115
10116=over 4
10117
10118=item Whitespaces
10119
10120Whitespaces that look non-significant are discarded, this behaviour can be
10121controlled using the C<L<keep_spaces> >,
10122C<L<keep_spaces_in> > and
10123C<L<discard_spaces_in> > options.
10124
10125=item Encoding
10126
10127You can specify that you want the output in the same encoding as the input
10128(provided you have valid XML, which means you have to specify the encoding
10129either in the document or when you create the Twig object) using the
10130C<L<keep_encoding> > option
10131
10132You can also use C<L<output_encoding>> to convert the internal UTF-8 format
10133to the required encoding.
10134
10135=item Comments and Processing Instructions (PI)
10136
10137Comments and PI's can be hidden from the processing, but still appear in the
10138output (they are carried by the "real" element closer to them)
10139
10140=item Pretty Printing
10141
10142XML::Twig can output the document pretty printed so it is easier to read for
10143us humans.
10144
10145=item Surviving an untimely death
10146
10147XML parsers are supposed to react violently when fed improper XML.
10148XML::Parser just dies.
10149
10150XML::Twig provides the C<L<safe_parse> > and the
10151C<L<safe_parsefile> > methods which wrap the parse in an eval
10152and return either the parsed twig or 0 in case of failure.
10153
10154=item Private attributes
10155
10156Attributes with a name starting with # (illegal in XML) will not be
10157output, so you can safely use them to store temporary values during
10158processing. Note that you can store anything in a private attribute,
10159not just text, it's just a regular Perl variable, so a reference to
10160an object or a huge data structure is perfectly fine.
10161
10162=back
10163
10164=head1 CLASSES
10165
10166XML::Twig uses a very limited number of classes. The ones you are most likely to use
10167are C<L<XML::Twig>> of course, which represents a complete XML document, including the
10168document itself (the root of the document itself is C<L<root>>), its handlers, its
10169input or output filters... The other main class is C<L<XML::Twig::Elt>>, which models
10170an XML element. Element here has a very wide definition: it can be a regular element, or
10171but also text, with an element C<L<tag>> of C<#PCDATA> (or C<#CDATA>), an entity (tag is
10172C<#ENT>), a Processing Instruction (C<#PI>), a comment (C<#COMMENT>).
10173
10174Those are the 2 commonly used classes.
10175
10176You might want to look the C<L<elt_class>> option if you want to subclass C<XML::Twig::Elt>.
10177
10178Attributes are just attached to their parent element, they are not objects per se. (Please
10179use the provided methods C<L<att>> and C<L<set_att>> to access them, if you access them
10180as a hash, then your code becomes implementation dependent and might break in the future).
10181
10182Other classes that are seldom used are C<L<XML::Twig::Entity_list>> and C<L<XML::Twig::Entity>>.
10183
10184If you use C<L<XML::Twig::XPath>> instead of C<XML::Twig>, elements are then created as
10185C<L<XML::Twig::XPath::Elt>>
10186
10187
10188=head1 METHODS
10189
10190=head2 XML::Twig
10191
10192A twig is a subclass of XML::Parser, so all XML::Parser methods can be
10193called on a twig object, including parse and parsefile.
10194C<setHandlers> on the other hand cannot be used, see C<L<BUGS> >
10195
10196
10197=over 4
10198
10199=item new
10200
10201This is a class method, the constructor for XML::Twig. Options are passed
10202as keyword value pairs. Recognized options are the same as XML::Parser,
10203plus some (in fact a lot!) XML::Twig specifics.
10204
10205New Options:
10206
10207=over 4
10208
10209=item twig_handlers
10210
10211This argument consists of a hash C<{ expression => \&handler}> where
10212expression is a an I<XPath-like expression> (+ some others).
10213
10214XPath expressions are limited to using the child and descendant axis
10215(indeed you can't specify an axis), and predicates cannot be nested.
10216You can use the C<string>, or C<< string(<tag>) >> function (except
10217in C<twig_roots> triggers).
10218
10219Additionally you can use regexps (/ delimited) to match attribute
10220and string values.
10221
10222Examples:
10223
10224  foo
10225  foo/bar
10226  foo//bar
10227  /foo/bar
10228  /foo//bar
10229  /foo/bar[@att1 = "val1" and @att2 = "val2"]/baz[@a >= 1]
10230  foo[string()=~ /^duh!+/]
10231  /foo[string(bar)=~ /\d+/]/baz[@att != 3]
10232
10233#CDATA can be used to call a handler for a CDATA section.
10234#COMMENT can be used to call a handler for comments
10235
10236Some additional (non-XPath) expressions are also provided for convenience:
10237
10238=over 4
10239
10240=item processing instructions
10241
10242C<'?'> or C<'#PI'> triggers the handler for any processing instruction,
10243and C<< '?<target>' >> or C<< '#PI <target>' >> triggers a handler for processing
10244instruction with the given target( ex: C<'#PI xml-stylesheet'>).
10245
10246=item level(<level>)
10247
10248Triggers the handler on any element at that level in the tree (root is level 1)
10249
10250=item _all_
10251
10252Triggers the handler for B<all> elements in the tree
10253
10254=item _default_
10255
10256Triggers the handler for each element that does NOT have any other handler.
10257
10258=back
10259
10260Expressions are evaluated against the input document.
10261Which means that even if you have changed the tag of an element (changing the
10262tag of a parent element from a handler for example) the change will not impact
10263the expression evaluation. There is an exception to this: "private" attributes
10264(which name start with a '#', and can only be created during the parsing, as
10265they are not valid XML) are checked against the current twig.
10266
10267Handlers are triggered in fixed order, sorted by their type (xpath expressions
10268first, then regexps, then level), then by whether they specify a full path
10269(starting at the root element) or
10270not, then by number of steps in the expression, then number of
10271predicates, then number of tests in predicates. Handlers where the last
10272step does not specify a step (C<foo/bar/*>) are triggered after other XPath
10273handlers. Finally C<_all_> handlers are triggered last.
10274
10275B<Important>: once a handler has been triggered if it returns 0 then no other
10276handler is called, except a C<_all_> handler which will be called anyway.
10277
10278If a handler returns a true value and other handlers apply, then the next
10279applicable handler will be called. Repeat, rinse, lather..; The exception
10280to that rule is when the C<L<do_not_chain_handlers>>
10281option is set, in which case only the first handler will be called.
10282
10283Note that it might be a good idea to explicitly return a short true value
10284(like 1) from handlers: this ensures that other applicable handlers are
10285called even if the last statement for the handler happens to evaluate to
10286false. This might also speedup the code by avoiding the result of the last
10287statement of the code to be copied and passed to the code managing handlers.
10288It can really pay to have 1 instead of a long string returned.
10289
10290When the closing tag for an element is parsed the corresponding handler is
10291called, with 2 arguments: the twig and the C<L<Element> >. The twig includes
10292the document tree that has been built so far, the element is the complete
10293sub-tree for the element. B<The fact that the handler is called only when the
10294closing tag for the element is found means that handlers for inner elements
10295are called before handlers for outer elements>.
10296
10297C<$_> is also set to the element, so it is easy to write inline handlers like
10298
10299  para => sub { $_->set_tag( 'p'); }
10300
10301Text is stored in elements whose tag name is #PCDATA (due to mixed content,
10302text and sub-element in an element there is no way to store the text as just
10303an attribute of the enclosing element, this is similar to the DOM model).
10304
10305B<Warning>: if you have used purge or flush on the twig the element might not
10306be complete, some of its children might have been entirely flushed or purged,
10307and the start tag might even have been printed (by C<flush>) already, so changing
10308its tag might not give the expected result.
10309
10310
10311=item twig_roots
10312
10313This argument let's you build the tree only for those elements you are
10314interested in.
10315
10316  Example: my $t= XML::Twig->new( twig_roots => { title => 1, subtitle => 1});
10317           $t->parsefile( file);
10318           my $t= XML::Twig->new( twig_roots => { 'section/title' => 1});
10319           $t->parsefile( file);
10320
10321
10322return a twig containing a document including only C<title> and C<subtitle>
10323elements, as children of the root element.
10324
10325You can use I<generic_attribute_condition>, I<attribute_condition>,
10326I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and
10327I<_all_> to trigger the building of the twig.
10328I<string_condition> and I<regexp_condition> cannot be used as the content
10329of the element, and the string, have not yet been parsed when the condition
10330is checked.
10331
10332B<WARNING>: path are checked for the document. Even if the C<twig_roots> option
10333is used they will be checked against the full document tree, not the virtual
10334tree created by XML::Twig
10335
10336
10337B<WARNING>: twig_roots elements should NOT be nested, that would hopelessly
10338confuse XML::Twig ;--(
10339
10340Note: you can set handlers (twig_handlers) using twig_roots
10341  Example: my $t= XML::Twig->new( twig_roots =>
10342                                   { title    => sub { $_[1]->print;},
10343                                     subtitle => \&process_subtitle
10344                                   }
10345                               );
10346           $t->parsefile( file);
10347
10348
10349=item twig_print_outside_roots
10350
10351To be used in conjunction with the C<twig_roots> argument. When set to a true
10352value this will print the document outside of the C<twig_roots> elements.
10353
10354 Example: my $t= XML::Twig->new( twig_roots => { title => \&number_title },
10355                                twig_print_outside_roots => 1,
10356                               );
10357           $t->parsefile( file);
10358           { my $nb;
10359           sub number_title
10360             { my( $twig, $title);
10361               $nb++;
10362               $title->prefix( "$nb ");
10363               $title->print;
10364             }
10365           }
10366
10367
10368This example prints the document outside of the title element, calls
10369C<number_title> for each C<title> element, prints it, and then resumes printing
10370the document. The twig is built only for the C<title> elements.
10371
10372If the value is a reference to a file handle then the document outside the
10373C<twig_roots> elements will be output to this file handle:
10374
10375  open( my $out, '>', 'out_file.xml') or die "cannot open out file.xml out_file:$!";
10376  my $t= XML::Twig->new( twig_roots => { title => \&number_title },
10377                         # default output to $out
10378                         twig_print_outside_roots => $out,
10379                       );
10380
10381         { my $nb;
10382           sub number_title
10383             { my( $twig, $title);
10384               $nb++;
10385               $title->prefix( "$nb ");
10386               $title->print( $out);    # you have to print to \*OUT here
10387             }
10388           }
10389
10390
10391=item start_tag_handlers
10392
10393A hash C<{ expression => \&handler}>. Sets element handlers that are called when
10394the element is open (at the end of the XML::Parser C<Start> handler). The handlers
10395are called with 2 params: the twig and the element. The element is empty at
10396that point, its attributes are created though.
10397
10398You can use I<generic_attribute_condition>, I<attribute_condition>,
10399I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_>  and I<_all_>
10400to trigger the handler.
10401
10402I<string_condition> and I<regexp_condition> cannot be used as the content of
10403the element, and the string, have not yet been parsed when the condition is
10404checked.
10405
10406The main uses for those handlers are to change the tag name (you might have to
10407do it as soon as you find the open tag if you plan to C<flush> the twig at some
10408point in the element, and to create temporary attributes that will be used
10409when processing sub-element with C<twig_hanlders>.
10410
10411B<Note>: C<start_tag> handlers can be called outside of C<twig_roots> if this
10412argument is used. Since the element object is not built, in this case handlers
10413are called with the following arguments: C<$t> (the twig), C<$tag> (the tag of
10414the element) and C<%att> (a hash of the attributes of the element).
10415
10416If the C<twig_print_outside_roots> argument is also used, if the last handler
10417called returns  a C<true> value, then the start tag will be output as it
10418appeared in the original document, if the handler returns a C<false> value
10419then the start tag will B<not> be printed (so you can print a modified string
10420yourself for example).
10421
10422Note that you can use the L<ignore> method in C<start_tag_handlers>
10423(and only there).
10424
10425=item end_tag_handlers
10426
10427A hash C<{ expression => \&handler}>. Sets element handlers that are called when
10428the element is closed (at the end of the XML::Parser C<End> handler). The handlers
10429are called with 2 params: the twig and the tag of the element.
10430
10431I<twig_handlers> are called when an element is completely parsed, so why have
10432this redundant option? There is only one use for C<end_tag_handlers>: when using
10433the C<twig_roots> option, to trigger a handler for an element B<outside> the roots.
10434It is for example very useful to number titles in a document using nested
10435sections:
10436
10437  my @no= (0);
10438  my $no;
10439  my $t= XML::Twig->new(
10440          start_tag_handlers =>
10441           { section => sub { $no[$#no]++; $no= join '.', @no; push @no, 0; } },
10442          twig_roots         =>
10443           { title   => sub { $_->prefix( $no); $_->print; } },
10444          end_tag_handlers   => { section => sub { pop @no;  } },
10445          twig_print_outside_roots => 1
10446                      );
10447   $t->parsefile( $file);
10448
10449Using the C<end_tag_handlers> argument without C<twig_roots> will result in an
10450error.
10451
10452=item do_not_chain_handlers
10453
10454If this option is set to a true value, then only one handler will be called for
10455each element, even if several satisfy the condition
10456
10457Note that the C<_all_> handler will still be called regardless
10458
10459=item ignore_elts
10460
10461This option lets you ignore elements when building the twig. This is useful
10462in cases where you cannot use C<twig_roots> to ignore elements, for example if
10463the element to ignore is a sibling of elements you are interested in.
10464
10465Example:
10466
10467  my $twig= XML::Twig->new( ignore_elts => { elt => 'discard' });
10468  $twig->parsefile( 'doc.xml');
10469
10470This will build the complete twig for the document, except that all C<elt>
10471elements (and their children) will be left out.
10472
10473The keys in the hash are triggers, limited to the same subset as
10474C<L<start_tag_handlers>>. The values can be C<discard>, to discard
10475the element, C<print>, to output the element as-is, C<string> to
10476store the text of the ignored element(s), including markup, in a field of
10477the twig: C<< $t->{twig_buffered_string} >> or a reference to a scalar, in
10478which case the text of the ignored element(s), including markup, will be
10479stored in the scalar. Any other value will be treated as C<discard>.
10480
10481
10482=item char_handler
10483
10484A reference to a subroutine that will be called every time C<PCDATA> is found.
10485
10486The subroutine receives the string as argument, and returns the modified string:
10487
10488  # WE WANT ALL STRINGS IN UPPER CASE
10489  sub my_char_handler
10490    { my( $text)= @_;
10491      $text= uc( $text);
10492      return $text;
10493    }
10494
10495=item elt_class
10496
10497The name of a class used to store elements. this class should inherit from
10498C<XML::Twig::Elt> (and by default it is C<XML::Twig::Elt>). This option is used
10499to subclass the element class and extend it with new methods.
10500
10501This option is needed because during the parsing of the XML, elements are created
10502by C<XML::Twig>, without any control from the user code.
10503
10504=item keep_atts_order
10505
10506Setting this option to a true value causes the attribute hash to be tied to
10507a C<Tie::IxHash> object.
10508This means that C<Tie::IxHash> needs to be installed for this option to be
10509available. It also means that the hash keeps its order, so you will get
10510the attributes in order. This allows outputting the attributes in the same
10511order as they were in the original document.
10512
10513=item keep_encoding
10514
10515This is a (slightly?) evil option: if the XML document is not UTF-8 encoded and
10516you want to keep it that way, then setting keep_encoding will use theC<Expat>
10517original_string method for character, thus keeping the original encoding, as
10518well as the original entities in the strings.
10519
10520See the C<t/test6.t> test file to see what results you can expect from the
10521various encoding options.
10522
10523B<WARNING>: if the original encoding is multi-byte then attribute parsing will
10524be EXTREMELY unsafe under any Perl before 5.6, as it uses regular expressions
10525which do not deal properly with multi-byte characters. You can specify an
10526alternate function to parse the start tags with the C<parse_start_tag> option
10527(see below)
10528
10529B<WARNING>: this option is NOT used when parsing with XML::Parser non-blocking
10530parser (C<parse_start>, C<parse_more>, C<parse_done> methods) which you probably
10531should not use with XML::Twig anyway as they are totally untested!
10532
10533=item output_encoding
10534
10535This option generates an output_filter using C<Encode>,  C<Text::Iconv> or
10536C<Unicode::Map8> and C<Unicode::Strings>, and sets the encoding in the XML
10537declaration. This is the easiest way to deal with encodings, if you need
10538more sophisticated features, look at C<output_filter> below
10539
10540
10541=item output_filter
10542
10543This option is used to convert the character encoding of the output document.
10544It is passed either a string corresponding to a predefined filter or
10545a subroutine reference. The filter will be called every time a document or
10546element is processed by the "print" functions (C<print>, C<sprint>, C<flush>).
10547
10548Pre-defined filters:
10549
10550=over 4
10551
10552=item latin1
10553
10554uses either C<Encode>, C<Text::Iconv> or C<Unicode::Map8> and C<Unicode::String>
10555or a regexp (which works only with XML::Parser 2.27), in this order, to convert
10556all characters to ISO-8859-15 (usually latin1 is synonym to ISO-8859-1, but
10557in practice it seems that ISO-8859-15, which includes the euro sign, is more
10558useful and probably what most people want).
10559
10560=item html
10561
10562does the same conversion as C<latin1>, plus encodes entities using
10563C<HTML::Entities> (oddly enough you will need to have HTML::Entities installed
10564for it to be available). This should only be used if the tags and attribute
10565names themselves are in US-ASCII, or they will be converted and the output will
10566not be valid XML any more
10567
10568=item safe
10569
10570converts the output to ASCII (US) only  plus I<character entities> (C<&#nnn;>)
10571this should be used only if the tags and attribute names themselves are in
10572US-ASCII, or they will be converted and the output will not be valid XML any
10573more
10574
10575=item safe_hex
10576
10577same as C<safe> except that the character entities are in hex (C<&#xnnn;>)
10578
10579=item encode_convert ($encoding)
10580
10581Return a subref that can be used to convert utf8 strings to C<$encoding>).
10582Uses C<Encode>.
10583
10584   my $conv = XML::Twig::encode_convert( 'latin1');
10585   my $t = XML::Twig->new(output_filter => $conv);
10586
10587=item iconv_convert ($encoding)
10588
10589this function is used to create a filter subroutine that will be used to
10590convert the characters to the target encoding using C<Text::Iconv> (which needs
10591to be installed, look at the documentation for the module and for the
10592C<iconv> library to find out which encodings are available on your system,
10593C<iconv -l> should give you a list of available encodings)
10594
10595   my $conv = XML::Twig::iconv_convert( 'latin1');
10596   my $t = XML::Twig->new(output_filter => $conv);
10597
10598=item unicode_convert ($encoding)
10599
10600this function is used to create a filter subroutine that will be used to
10601convert the characters to the target encoding using  C<Unicode::Strings>
10602and C<Unicode::Map8> (which need to be installed, look at the documentation
10603for the modules to find out which encodings are available on your system)
10604
10605   my $conv = XML::Twig::unicode_convert( 'latin1');
10606   my $t = XML::Twig->new(output_filter => $conv);
10607
10608=back
10609
10610The C<text> and C<att> methods do not use the filter, so their
10611result are always in unicode.
10612
10613Those predeclared filters are based on subroutines that can be used
10614by themselves (as C<XML::Twig::foo>).
10615
10616=over 4
10617
10618=item html_encode ($string)
10619
10620Use C<HTML::Entities> to encode a utf8 string
10621
10622=item safe_encode ($string)
10623
10624Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
10625in the string in C<< &#<nnnn>; >> format
10626
10627=item safe_encode_hex ($string)
10628
10629Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
10630in the string in C<< &#x<nnnn>; >> format
10631
10632=item regexp2latin1 ($string)
10633
10634Use a regexp to encode a utf8 string into latin 1 (ISO-8859-1). Does not
10635work with Perl 5.8.0!
10636
10637=back
10638
10639=item output_text_filter
10640
10641same as output_filter, except it doesn't apply to the brackets and quotes
10642around attribute values. This is useful for all filters that could change
10643the tagging, basically anything that does not just change the encoding of
10644the output. C<html>, C<safe> and C<safe_hex> are better used with this option.
10645
10646=item input_filter
10647
10648This option is similar to C<output_filter> except the filter is applied to
10649the characters before they are stored in the twig, at parsing time.
10650
10651=item remove_cdata
10652
10653Setting this option to a true value will force the twig to output CDATA
10654sections as regular (escaped) PCDATA
10655
10656=item parse_start_tag
10657
10658If you use the C<keep_encoding> option then this option can be used to replace
10659the default parsing function. You should provide a coderef (a reference to a
10660subroutine) as the argument, this subroutine takes the original tag (given
10661by XML::Parser::Expat C<original_string()> method) and returns a tag and the
10662attributes in a hash (or in a list attribute_name/attribute value).
10663
10664=item no_xxe
10665
10666prevents external entities to be parsed.
10667
10668This is a security feature, in case the input XML cannot be trusted. With this
10669option set to a true value defining external entities in the document will cause
10670the parse to fail.
10671
10672This prevents an entity like C<< <!ENTITY xxe PUBLIC "bar" "/etc/passwd"> >> to
10673make the password fiel available in the document.
10674
10675
10676=item expand_external_ents
10677
10678When this option is used external entities (that are defined) are expanded
10679when the document is output using "print" functions such as C<L<print> >,
10680C<L<sprint> >, C<L<flush> > and C<L<xml_string> >.
10681Note that in the twig the entity will be stored as an element with a
10682tag 'C<#ENT>', the entity will not be expanded there, so you might want to
10683process the entities before outputting it.
10684
10685If an external entity is not available, then the parse will fail.
10686
10687A special case is when the value of this option is -1. In that case a missing
10688entity will not cause the parser to die, but its C<name>, C<sysid> and C<pubid>
10689will be stored in the twig as C<< $twig->{twig_missing_system_entities} >>
10690(a reference to an array of hashes { name => <name>, sysid => <sysid>,
10691pubid => <pubid> }). Yes, this is a bit of a hack, but it's useful in some
10692cases.
10693
10694=item load_DTD
10695
10696If this argument is set to a true value, C<parse> or C<parsefile> on the twig
10697will load  the DTD information. This information can then be accessed through
10698the twig, in a C<DTD_handler> for example. This will load even an external DTD.
10699
10700Default and fixed values for attributes will also be filled, based on the DTD.
10701
10702Note that to do this the module will generate a temporary file in the current
10703directory. If this is a problem let me know and I will add an option to
10704specify an alternate directory.
10705
10706See L<DTD Handling> for more information
10707
10708=item DTD_base <path_to_DTD_directory>
10709
10710If the DTD is in a different directory, looks for it there, useful to make up
10711somewhat for the lack of catalog suport in C<expat>. You still need a SYSTEM
10712declaration
10713
10714=item DTD_handler
10715
10716Set a handler that will be called once the doctype (and the DTD) have been
10717loaded, with 2 arguments, the twig and the DTD.
10718
10719=item no_prolog
10720
10721Does not output a prolog (XML declaration and DTD)
10722
10723=item id
10724
10725This optional argument gives the name of an attribute that can be used as
10726an ID in the document. Elements whose ID is known can be accessed through
10727the elt_id method. id defaults to 'id'.
10728See C<L<BUGS> >
10729
10730=item discard_spaces
10731
10732If this optional argument is set to a true value then spaces are discarded
10733when they look non-significant: strings containing only spaces and at least
10734one line feed are discarded. This argument is set to true by default.
10735
10736The exact algorithm to drop spaces is: strings including only spaces (perl \s)
10737and at least one \n right before an open or close tag are dropped.
10738
10739=item discard_all_spaces
10740
10741If this argument is set to a true value, spaces are discarded more
10742aggressively than with C<discard_spaces>: strings not including a \n are also
10743dropped. This option is appropriate for data-oriented XML.
10744
10745
10746=item keep_spaces
10747
10748If this optional argument is set to a true value then all spaces in the
10749document are kept, and stored as C<PCDATA>.
10750
10751B<Warning>: adding this option can result in changes in the twig generated:
10752space that was previously discarded might end up in a new text element. see
10753the difference by calling the following code with 0 and 1 as arguments:
10754
10755  perl -MXML::Twig -e'print XML::Twig->new( keep_spaces => shift)->parse( "<d> \n<e/></d>")->_dump'
10756
10757
10758C<keep_spaces> and C<discard_spaces> cannot be both set.
10759
10760=item discard_spaces_in
10761
10762This argument sets C<keep_spaces> to true but will cause the twig builder to
10763discard spaces in the elements listed.
10764
10765The syntax for using this argument is:
10766
10767  XML::Twig->new( discard_spaces_in => [ 'elt1', 'elt2']);
10768
10769=item keep_spaces_in
10770
10771This argument sets C<discard_spaces> to true but will cause the twig builder to
10772keep spaces in the elements listed.
10773
10774The syntax for using this argument is:
10775
10776  XML::Twig->new( keep_spaces_in => [ 'elt1', 'elt2']);
10777
10778B<Warning>: adding this option can result in changes in the twig generated:
10779space that was previously discarded might end up in a new text element.
10780
10781=item pretty_print
10782
10783Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
10784'C<nice>', 'C<indented>', 'C<indented_c>', 'C<indented_a>',
10785'C<indented_close_tag>', 'C<cvs>', 'C<wrapped>', 'C<record>' and 'C<record_c>'
10786
10787pretty_print formats:
10788
10789=over 4
10790
10791=item none
10792
10793The document is output as one ling string, with no line breaks except those
10794found within text elements
10795
10796=item nsgmls
10797
10798Line breaks are inserted in safe places: that is within tags, between a tag
10799and an attribute, between attributes and before the > at the end of a tag.
10800
10801This is quite ugly but better than C<none>, and it is very safe, the document
10802will still be valid (conforming to its DTD).
10803
10804This is how the SGML parser C<sgmls> splits documents, hence the name.
10805
10806=item nice
10807
10808This option inserts line breaks before any tag that does not contain text (so
10809element with textual content are not broken as the \n is the significant).
10810
10811B<WARNING>: this option leaves the document well-formed but might make it
10812invalid (not conformant to its DTD). If you have elements declared as
10813
10814  <!ELEMENT foo (#PCDATA|bar)>
10815
10816then a C<foo> element including a C<bar> one will be printed as
10817
10818  <foo>
10819  <bar>bar is just pcdata</bar>
10820  </foo>
10821
10822This is invalid, as the parser will take the line break after the C<foo> tag
10823as a sign that the element contains PCDATA, it will then die when it finds the
10824C<bar> tag. This may or may not be important for you, but be aware of it!
10825
10826=item indented
10827
10828Same as C<nice> (and with the same warning) but indents elements according to
10829their level
10830
10831=item indented_c
10832
10833Same as C<indented> but a little more compact: the closing tags are on the
10834same line as the preceding text
10835
10836=item indented_close_tag
10837
10838Same as C<indented> except that the closing tag is also indented, to line up
10839with the tags within the element
10840
10841=item idented_a
10842
10843This formats XML files in a line-oriented version control friendly way.
10844The format is described in L<http://tinyurl.com/2kwscq> (that's an Oracle
10845document with an insanely long URL).
10846
10847Note that to be totaly conformant to the "spec", the order of attributes
10848should not be changed, so if they are not already in alphabetical order
10849you will need to use the C<L<keep_atts_order>> option.
10850
10851=item cvs
10852
10853Same as C<L<idented_a>>.
10854
10855=item wrapped
10856
10857Same as C<indented_c> but lines are wrapped using L<Text::Wrap::wrap>. The
10858default length for lines is the default for C<$Text::Wrap::columns>, and can
10859be changed by changing that variable.
10860
10861=item record
10862
10863This is a record-oriented pretty print, that display data in records, one field
10864per line (which looks a LOT like C<indented>)
10865
10866=item record_c
10867
10868Stands for record compact, one record per line
10869
10870=back
10871
10872
10873=item empty_tags
10874
10875Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>').
10876
10877C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
10878'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
10879'C<< <tag></tag> >>'
10880
10881=item quote
10882
10883Set the quote character for attributes ('C<single>' or 'C<double>').
10884
10885=item escape_gt
10886
10887By default XML::Twig does not escape the character > in its output, as it is not
10888mandated by the XML spec. With this option on, > will be replaced by C<&gt;>
10889
10890=item comments
10891
10892Set the way comments are processed: 'C<drop>' (default), 'C<keep>' or
10893'C<process>'
10894
10895Comments processing options:
10896
10897=over 4
10898
10899=item drop
10900
10901drops the comments, they are not read, nor printed to the output
10902
10903=item keep
10904
10905comments are loaded and will appear on the output, they are not
10906accessible within the twig and will not interfere with processing
10907though
10908
10909B<Note>: comments in the middle of a text element such as
10910
10911  <p>text <!-- comment --> more text --></p>
10912
10913are kept at their original position in the text. Using ˝"print"
10914methods like C<print> or C<sprint> will return the comments in the
10915text. Using C<text> or C<field> on the other hand will not.
10916
10917Any use of C<set_pcdata> on the C<#PCDATA> element (directly or
10918through other methods like C<set_content>) will delete the comment(s).
10919
10920=item process
10921
10922comments are loaded in the twig and will be treated as regular elements
10923(their C<tag> is C<#COMMENT>) this can interfere with processing if you
10924expect C<< $elt->{first_child} >> to be an element but find a comment there.
10925Validation will not protect you from this as comments can happen anywhere.
10926You can use C<< $elt->first_child( 'tag') >> (which is a good habit anyway)
10927to get where you want.
10928
10929Consider using C<process> if you are outputting SAX events from XML::Twig.
10930
10931=back
10932
10933=item pi
10934
10935Set the way processing instructions are processed: 'C<drop>', 'C<keep>'
10936(default) or 'C<process>'
10937
10938Note that you can also set PI handlers in the C<twig_handlers> option:
10939
10940  '?'       => \&handler
10941  '?target' => \&handler 2
10942
10943The handlers will be called with 2 parameters, the twig and the PI element if
10944C<pi> is set to C<process>, and with 3, the twig, the target and the data if
10945C<pi> is set to C<keep>. Of course they will not be called if C<pi> is set to
10946C<drop>.
10947
10948If C<pi> is set to C<keep> the handler should return a string that will be used
10949as-is as the PI text (it should look like "C< <?target data?> >" or '' if you
10950want to remove the PI),
10951
10952Only one handler will be called, C<?target> or C<?> if no specific handler for
10953that target is available.
10954
10955=item map_xmlns
10956
10957This option is passed a hashref that maps uri's to prefixes. The prefixes in
10958the document will be replaced by the ones in the map. The mapped prefixes can
10959(actually have to) be used to trigger handlers, navigate or query the document.
10960
10961Here is an example:
10962
10963  my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
10964                         twig_handlers =>
10965                           { 'svg:circle' => sub { $_->set_att( r => 20) } },
10966                         pretty_print => 'indented',
10967                       )
10968                  ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
10969                              <gr:circle cx="10" cy="90" r="10"/>
10970                           </doc>'
10971                         )
10972                  ->print;
10973
10974This will output:
10975
10976  <doc xmlns:svg="http://www.w3.org/2000/svg">
10977     <svg:circle cx="10" cy="90" r="20"/>
10978  </doc>
10979
10980=item keep_original_prefix
10981
10982When used with C<L<map_xmlns>> this option will make C<XML::Twig> use the original
10983namespace prefixes when outputting a document. The mapped prefix will still be used
10984for triggering handlers and in navigation and query methods.
10985
10986  my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
10987                         twig_handlers =>
10988                           { 'svg:circle' => sub { $_->set_att( r => 20) } },
10989                         keep_original_prefix => 1,
10990                         pretty_print => 'indented',
10991                       )
10992                  ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
10993                              <gr:circle cx="10" cy="90" r="10"/>
10994                           </doc>'
10995                         )
10996                  ->print;
10997
10998This will output:
10999
11000  <doc xmlns:gr="http://www.w3.org/2000/svg">
11001     <gr:circle cx="10" cy="90" r="20"/>
11002  </doc>
11003
11004=item original_uri ($prefix)
11005
11006called within a handler, this will return the uri bound to the namespace prefix
11007in the original document.
11008
11009=item index ($arrayref or $hashref)
11010
11011This option creates lists of specific elements during the parsing of the XML.
11012It takes a reference to either a list of triggering expressions or to a hash
11013name => expression, and for each one generates the list of elements that
11014match the expression. The list can be accessed through the C<L<index>> method.
11015
11016example:
11017
11018  # using an array ref
11019  my $t= XML::Twig->new( index => [ 'div', 'table' ])
11020                  ->parsefile( "foo.xml");
11021  my $divs= $t->index( 'div');
11022  my $first_div= $divs->[0];
11023  my $last_table= $t->index( table => -1);
11024
11025  # using a hashref to name the indexes
11026  my $t= XML::Twig->new( index => { email => 'a[@href=~/^ \s*mailto:/]'})
11027                  ->parsefile( "foo.xml");
11028  my $last_emails= $t->index( email => -1);
11029
11030Note that the index is not maintained after the parsing. If elements are
11031deleted, renamed or otherwise hurt during processing, the index is NOT updated.
11032(changing the id element OTOH will update the index)
11033
11034=item att_accessors <list of attribute names>
11035
11036creates methods that give direct access to attribute:
11037
11038  my $t= XML::Twig->new( att_accessors => [ 'href', 'src'])
11039                  ->parsefile( $file);
11040  my $first_href= $t->first_elt( 'img')->src; # same as ->att( 'src')
11041  $t->first_elt( 'img')->src( 'new_logo.png') # changes the attribute value
11042
11043=item elt_accessors
11044
11045creates methods that give direct access to the first child element (in scalar context)
11046or the list of elements (in list context):
11047
11048the list of accessors to create can be given 1 2 different ways: in an array,
11049or in a hash alias => expression
11050  my $t=  XML::Twig->new( elt_accessors => [ 'head'])
11051                  ->parsefile( $file);
11052  my $title_text= $t->root->head->field( 'title');
11053  # same as $title_text= $t->root->first_child( 'head')->field( 'title');
11054
11055  my $t=  XML::Twig->new( elt_accessors => { warnings => 'p[@class="warning"]', d2 => 'div[2]'}, )
11056                  ->parsefile( $file);
11057  my $body= $t->first_elt( 'body');
11058  my @warnings= $body->warnings; # same as $body->children( 'p[@class="warning"]');
11059  my $s2= $body->d2;             # same as $body->first_child( 'div[2]')
11060
11061=item field_accessors
11062
11063creates methods that give direct access to the first child element text:
11064
11065  my $t=  XML::Twig->new( field_accessors => [ 'h1'])
11066                  ->parsefile( $file);
11067  my $div_title_text= $t->first_elt( 'div')->title;
11068  # same as $title_text= $t->first_elt( 'div')->field( 'title');
11069
11070=item use_tidy
11071
11072set this option to use HTML::Tidy instead of HTML::TreeBuilder to convert
11073HTML to XML. HTML, especially real (real "crap") HTML found in the wild,
11074so depending on the data, one module or the other does a better job at
11075the conversion. Also, HTML::Tidy can be a bit difficult to install, so
11076XML::Twig offers both option. TIMTOWTDI
11077
11078=item output_html_doctype
11079
11080when using HTML::TreeBuilder to convert HTML, this option causes the DOCTYPE
11081declaration to be output, which may be important for some legacy browsers.
11082Without that option the DOCTYPE definition is NOT output. Also if the definition
11083is completely wrong (ie not easily parsable), it is not output either.
11084
11085=back
11086
11087B<Note>: I _HATE_ the Java-like name of arguments used by most XML modules.
11088So in pure TIMTOWTDI fashion all arguments can be written either as
11089C<UglyJavaLikeName> or as C<readable_perl_name>: C<twig_print_outside_roots>
11090or C<TwigPrintOutsideRoots> (or even C<twigPrintOutsideRoots> {shudder}).
11091XML::Twig normalizes them before processing them.
11092
11093=item parse ( $source)
11094
11095The C<$source> parameter should either be a string containing the whole XML
11096document, or it should be an open C<IO::Handle> (aka a filehandle).
11097
11098A die call is thrown if a parse error occurs. Otherwise it will return
11099the twig built by the parse. Use C<safe_parse> if you want the parsing
11100to return even when an error occurs.
11101
11102If this method is called as a class method
11103(C<< XML::Twig->parse( $some_xml_or_html) >>) then an XML::Twig object is
11104created, using the parameters except the last one (eg
11105C<< XML::Twig->parse( pretty_print => 'indented', $some_xml_or_html) >>)
11106and C<L<xparse>> is called on it.
11107
11108Note that when parsing a filehandle, the handle should NOT be open with an
11109encoding (ie open with C<open( my $in, '<', $filename)>. The file will be
11110parsed by C<expat>, so specifying the encoding actually causes problems
11111for the parser (as in: it can crash it, see
11112https://rt.cpan.org/Ticket/Display.html?id=78877). For parsing a file it
11113is actually recommended to use C<parsefile> on the file name, instead of
11114<parse> on the open file.
11115
11116=item parsestring
11117
11118This is just an alias for C<parse> for backwards compatibility.
11119
11120=item parsefile (FILE [, OPT => OPT_VALUE [...]])
11121
11122Open C<FILE> for reading, then call C<parse> with the open handle. The file
11123is closed no matter how C<parse> returns.
11124
11125A C<die> call is thrown if a parse error occurs. Otherwise it will return
11126the twig built by the parse. Use C<safe_parsefile> if you want the parsing
11127to return even when an error occurs.
11128
11129=item parsefile_inplace ( $file, $optional_extension)
11130
11131Parse and update a file "in place". It does this by creating a temp file,
11132selecting it as the default for print() statements (and methods), then parsing
11133the input file. If the parsing is successful, then the temp file is
11134moved to replace the input file.
11135
11136If an extension is given then the original file is backed-up (the rules for
11137the extension are the same as the rule for the -i option in perl).
11138
11139=item parsefile_html_inplace ( $file, $optional_extension)
11140
11141Same as parsefile_inplace, except that it parses HTML instead of XML
11142
11143=item parseurl ($url $optional_user_agent)
11144
11145Gets the data from C<$url> and parse it. The data is piped to the parser in
11146chunks the size of the XML::Parser::Expat buffer, so memory consumption and
11147hopefully speed are optimal.
11148
11149For most (read "small") XML it is probably as efficient (and easier to debug)
11150to just C<get> the XML file and then parse it as a string.
11151
11152  use XML::Twig;
11153  use LWP::Simple;
11154  my $twig= XML::Twig->new();
11155  $twig->parse( LWP::Simple::get( $URL ));
11156
11157or
11158
11159  use XML::Twig;
11160  my $twig= XML::Twig->nparse( $URL);
11161
11162
11163If the C<$optional_user_agent> argument is used then it is used, otherwise a
11164new one is created.
11165
11166=item safe_parse ( SOURCE [, OPT => OPT_VALUE [...]])
11167
11168This method is similar to C<parse> except that it wraps the parsing in an
11169C<eval> block. It returns the twig on success and 0 on failure (the twig object
11170also contains the parsed twig). C<$@> contains the error message on failure.
11171
11172Note that the parsing still stops as soon as an error is detected, there is
11173no way to keep going after an error.
11174
11175=item safe_parsefile (FILE [, OPT => OPT_VALUE [...]])
11176
11177This method is similar to C<parsefile> except that it wraps the parsing in an
11178C<eval> block. It returns the twig on success and 0 on failure (the twig object
11179also contains the parsed twig) . C<$@> contains the error message on failure
11180
11181Note that the parsing still stops as soon as an error is detected, there is
11182no way to keep going after an error.
11183
11184=item safe_parseurl ($url $optional_user_agent)
11185
11186Same as C<parseurl> except that it wraps the parsing in an C<eval> block. It
11187returns the twig on success and 0 on failure (the twig object also contains
11188the parsed twig) . C<$@> contains the error message on failure
11189
11190=item parse_html ($string_or_fh)
11191
11192parse an HTML string or file handle (by converting it to XML using
11193HTML::TreeBuilder, which needs to be available).
11194
11195This works nicely, but some information gets lost in the process:
11196newlines are removed, and (at least on the version I use), comments
11197get an extra CDATA section inside ( <!-- foo --> becomes
11198<!-- <![CDATA[ foo ]]> -->
11199
11200=item parsefile_html ($file)
11201
11202parse an HTML file (by converting it to XML using HTML::TreeBuilder, which
11203needs to be available, or HTML::Tidy if the C<use_tidy> option was used).
11204The file is loaded completely in memory and converted to XML before being parsed.
11205
11206this method is to be used with caution though, as it doesn't know about the
11207file encoding, it is usually better to use C<L<parse_html>>, which gives you
11208a chance to open the file with the proper encoding layer.
11209
11210=item parseurl_html ($url $optional_user_agent)
11211
11212parse an URL as html the same way C<L<parse_html>> does
11213
11214=item safe_parseurl_html ($url $optional_user_agent)
11215
11216Same as C<L<parseurl_html>>> except that it wraps the parsing in an C<eval>
11217block.  It returns the twig on success and 0 on failure (the twig object also
11218contains the parsed twig) . C<$@> contains the error message on failure
11219
11220=item safe_parsefile_html ($file $optional_user_agent)
11221
11222Same as C<L<parsefile_html>>> except that it wraps the parsing in an C<eval>
11223block.  It returns the twig on success and 0 on failure (the twig object also
11224contains the parsed twig) . C<$@> contains the error message on failure
11225
11226=item safe_parse_html ($string_or_fh)
11227
11228Same as C<L<parse_html>> except that it wraps the parsing in an C<eval> block.
11229It returns the twig on success and 0 on failure (the twig object also contains
11230the parsed twig) . C<$@> contains the error message on failure
11231
11232=item xparse ($thing_to_parse)
11233
11234parse the C<$thing_to_parse>, whether it is a filehandle, a string, an HTML
11235file, an HTML URL, an URL or a file.
11236
11237Note that this is mostly a convenience method for one-off scripts. For example
11238files that end in '.htm' or '.html' are parsed first as XML, and if this fails
11239as HTML. This is certainly not the most efficient way to do this in general.
11240
11241=item nparse ($optional_twig_options, $thing_to_parse)
11242
11243create a twig with the C<$optional_options>, and parse the C<$thing_to_parse>,
11244whether it is a filehandle, a string, an HTML file, an HTML URL, an URL or a
11245file.
11246
11247Examples:
11248
11249   XML::Twig->nparse( "file.xml");
11250   XML::Twig->nparse( error_context => 1, "file://file.xml");
11251
11252=item nparse_pp ($optional_twig_options, $thing_to_parse)
11253
11254same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>.
11255
11256=item nparse_e ($optional_twig_options, $thing_to_parse)
11257
11258same as C<L<nparse>> but also sets the C<error_context> option to 1.
11259
11260=item nparse_ppe ($optional_twig_options, $thing_to_parse)
11261
11262same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>
11263and the C<error_context> option to 1.
11264
11265=item parser
11266
11267This method returns the C<expat> object (actually the XML::Parser::Expat object)
11268used during parsing. It is useful for example to call XML::Parser::Expat methods
11269on it. To get the line of a tag for example use C<< $t->parser->current_line >>.
11270
11271=item setTwigHandlers ($handlers)
11272
11273Set the twig_handlers. C<$handlers> is a reference to a hash similar to the
11274one in the C<twig_handlers> option of new. All previous handlers are unset.
11275The method returns the reference to the previous handlers.
11276
11277=item setTwigHandler ($exp $handler)
11278
11279Set a single twig_handler for elements matching C<$exp>. C<$handler> is a
11280reference to a subroutine. If the handler was previously set then the reference
11281to the previous handler is returned.
11282
11283=item setStartTagHandlers ($handlers)
11284
11285Set the start_tag handlers. C<$handlers> is a reference to a hash similar to the
11286one in the C<start_tag_handlers> option of new. All previous handlers are unset.
11287The method returns the reference to the previous handlers.
11288
11289=item setStartTagHandler ($exp $handler)
11290
11291Set a single start_tag handlers for elements matching C<$exp>. C<$handler> is a
11292reference to a subroutine. If the handler was previously set then the reference
11293to the previous handler is returned.
11294
11295=item setEndTagHandlers ($handlers)
11296
11297Set the end_tag handlers. C<$handlers> is a reference to a hash similar to the
11298one in the C<end_tag_handlers> option of new. All previous handlers are unset.
11299The method returns the reference to the previous handlers.
11300
11301=item setEndTagHandler ($exp $handler)
11302
11303Set a single end_tag handlers for elements matching C<$exp>. C<$handler> is a
11304reference to a subroutine. If the handler was previously set then the
11305reference to the previous handler is returned.
11306
11307=item setTwigRoots ($handlers)
11308
11309Same as using the C<L<twig_roots>> option when creating the twig
11310
11311=item setCharHandler ($exp $handler)
11312
11313Set a C<char_handler>
11314
11315=item setIgnoreEltsHandler ($exp)
11316
11317Set a C<ignore_elt> handler (elements that match C<$exp> will be ignored
11318
11319=item setIgnoreEltsHandlers ($exp)
11320
11321Set all C<ignore_elt> handlers (previous handlers are replaced)
11322
11323=item dtd
11324
11325Return the dtd (an L<XML::Twig::DTD> object) of a twig
11326
11327=item xmldecl
11328
11329Return the XML declaration for the document, or a default one if it doesn't
11330have one
11331
11332=item doctype
11333
11334Return the doctype for the document
11335
11336=item doctype_name
11337
11338returns the doctype of the document from the doctype declaration
11339
11340=item system_id
11341
11342returns the system value of the DTD of the document from the doctype declaration
11343
11344=item public_id
11345
11346returns the public doctype of the document from the doctype declaration
11347
11348=item internal_subset
11349
11350returns the internal subset of the DTD
11351
11352=item dtd_text
11353
11354Return the DTD text
11355
11356=item dtd_print
11357
11358Print the DTD
11359
11360=item model ($tag)
11361
11362Return the model (in the DTD) for the element C<$tag>
11363
11364=item root
11365
11366Return the root element of a twig
11367
11368=item set_root ($elt)
11369
11370Set the root of a twig
11371
11372=item first_elt ($optional_condition)
11373
11374Return the first element matching C<$optional_condition> of a twig, if
11375no condition is given then the root is returned
11376
11377=item last_elt ($optional_condition)
11378
11379Return the last element matching C<$optional_condition> of a twig, if
11380no condition is given then the last element of the twig is returned
11381
11382=item elt_id        ($id)
11383
11384Return the element whose C<id> attribute is $id
11385
11386=item getEltById
11387
11388Same as C<L<elt_id>>
11389
11390=item index ($index_name, $optional_index)
11391
11392If the C<$optional_index> argument is present, return the corresponding element
11393in the index (created using the C<index> option for C<XML::Twig->new>)
11394
11395If the argument is not present, return an arrayref to the index
11396
11397=item normalize
11398
11399merge together all consecutive pcdata elements in the document (if for example
11400you have turned some elements into pcdata using C<L<erase>>, this will give you
11401a "clean" document in which there all text elements are as long as possible).
11402
11403=item encoding
11404
11405This method returns the encoding of the XML document, as defined by the
11406C<encoding> attribute in the XML declaration (ie it is C<undef> if the attribute
11407is not defined)
11408
11409=item set_encoding
11410
11411This method sets the value of the C<encoding> attribute in the XML declaration.
11412Note that if the document did not have a declaration it is generated (with
11413an XML version of 1.0)
11414
11415=item xml_version
11416
11417This method returns the XML version, as defined by the C<version> attribute in
11418the XML declaration (ie it is C<undef> if the attribute is not defined)
11419
11420=item set_xml_version
11421
11422This method sets the value of the C<version> attribute in the XML declaration.
11423If the declaration did not exist it is created.
11424
11425=item standalone
11426
11427This method returns the value of the C<standalone> declaration for the document
11428
11429=item set_standalone
11430
11431This method sets the value of the C<standalone> attribute in the XML
11432declaration.  Note that if the document did not have a declaration it is
11433generated (with an XML version of 1.0)
11434
11435=item set_output_encoding
11436
11437Set the C<encoding> "attribute" in the XML declaration
11438
11439=item set_doctype ($name, $system, $public, $internal)
11440
11441Set the doctype of the element. If an argument is C<undef> (or not present)
11442then its former value is retained, if a false ('' or 0) value is passed then
11443the former value is deleted;
11444
11445=item entity_list
11446
11447Return the entity list of a twig
11448
11449=item entity_names
11450
11451Return the list of all defined entities
11452
11453=item entity ($entity_name)
11454
11455Return the entity
11456
11457=item notation_list
11458
11459Return the notation list of a twig
11460
11461=item notation_names
11462
11463Return the list of all defined notations
11464
11465=item notation ($notation_name)
11466
11467Return the notation
11468
11469=item change_gi      ($old_gi, $new_gi)
11470
11471Performs a (very fast) global change. All elements C<$old_gi> are now
11472C<$new_gi>. This is a bit dangerous though and should be avoided if
11473< possible, as the new tag might be ignored in subsequent processing.
11474
11475See C<L<BUGS> >
11476
11477=item flush            ($optional_filehandle, %options)
11478
11479Flushes a twig up to (and including) the current element, then deletes
11480all unnecessary elements from the tree that's kept in memory.
11481C<flush> keeps track of which elements need to be open/closed, so if you
11482flush from handlers you don't have to worry about anything. Just keep
11483flushing the twig every time you're done with a sub-tree and it will
11484come out well-formed. After the whole parsing don't forget toC<flush>
11485one more time to print the end of the document.
11486The doctype and entity declarations are also printed.
11487
11488flush take an optional filehandle as an argument.
11489
11490If you use C<flush> at any point during parsing, the document will be flushed
11491one last time at the end of the parsing, to the proper filehandle.
11492
11493options: use the C<update_DTD> option if you have updated the (internal) DTD
11494and/or the entity list and you want the updated DTD to be output
11495
11496The C<pretty_print> option sets the pretty printing of the document.
11497
11498   Example: $t->flush( Update_DTD => 1);
11499            $t->flush( $filehandle, pretty_print => 'indented');
11500            $t->flush( \*FILE);
11501
11502
11503=item flush_up_to ($elt, $optional_filehandle, %options)
11504
11505Flushes up to the C<$elt> element. This allows you to keep part of the
11506tree in memory when you C<flush>.
11507
11508options: see flush.
11509
11510=item purge
11511
11512Does the same as a C<flush> except it does not print the twig. It just deletes
11513all elements that have been completely parsed so far.
11514
11515=item purge_up_to ($elt)
11516
11517Purges up to the C<$elt> element. This allows you to keep part of the tree in
11518memory when you C<purge>.
11519
11520=item print            ($optional_filehandle, %options)
11521
11522Prints the whole document associated with the twig. To be used only AFTER the
11523parse.
11524
11525options: see C<flush>.
11526
11527=item print_to_file    ($filename, %options)
11528
11529Prints the whole document associated with the twig to file C<$filename>.
11530To be used only AFTER the parse.
11531
11532options: see C<flush>.
11533
11534=item safe_print_to_file    ($filename, %options)
11535
11536Prints the whole document associated with the twig to file C<$filename>.
11537This variant, which probably only works on *nix prints to a temp file,
11538then move the temp file to overwrite the original file.
11539
11540This is a bit safer when 2 processes an potentiallywrite the same file:
11541only the last one will succeed, but the file won't be corruted. I often
11542use this for cron jobs, so testing the code doesn't interfere with the
11543cron job running at the same time.
11544
11545options: see C<flush>.
11546
11547=item sprint
11548
11549Return the text of the whole document associated with the twig. To be used only
11550AFTER the parse.
11551
11552options: see C<flush>.
11553
11554=item trim
11555
11556Trim the document: gets rid of initial and trailing spaces, and replaces multiple spaces
11557by a single one.
11558
11559=item toSAX1 ($handler)
11560
11561Send SAX events for the twig to the SAX1 handler C<$handler>
11562
11563=item toSAX2 ($handler)
11564
11565Send SAX events for the twig to the SAX2 handler C<$handler>
11566
11567=item flush_toSAX1 ($handler)
11568
11569Same as flush, except that SAX events are sent to the SAX1 handler
11570C<$handler> instead of the twig being printed
11571
11572=item flush_toSAX2 ($handler)
11573
11574Same as flush, except that SAX events are sent to the SAX2 handler
11575C<$handler> instead of the twig being printed
11576
11577=item ignore
11578
11579This method should be called during parsing, usually in C<start_tag_handlers>.
11580It causes the element to be skipped during the parsing: the twig is not built
11581for this element, it will not be accessible during parsing or after it. The
11582element will not take up any memory and parsing will be faster.
11583
11584Note that this method can also be called on an element. If the element is a
11585parent of the current element then this element will be ignored (the twig will
11586not be built any more for it and what has already been built will be deleted).
11587
11588=item set_pretty_print  ($style)
11589
11590Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
11591'C<nice>', 'C<indented>', C<indented_c>, 'C<wrapped>', 'C<record>' and
11592'C<record_c>'
11593
11594B<WARNING:> the pretty print style is a B<GLOBAL> variable, so once set it's
11595applied to B<ALL> C<print>'s (and C<sprint>'s). Same goes if you use XML::Twig
11596with C<mod_perl> . This should not be a problem as the XML that's generated
11597is valid anyway, and XML processors (as well as HTML processors, including
11598browsers) should not care. Let me know if this is a big problem, but at the
11599moment the performance/cleanliness trade-off clearly favors the global
11600approach.
11601
11602=item set_empty_tag_style  ($style)
11603
11604Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). As
11605with C<L<set_pretty_print>> this sets a global flag.
11606
11607C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
11608'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
11609'C<< <tag></tag> >>'
11610
11611=item set_remove_cdata  ($flag)
11612
11613set (or unset) the flag that forces the twig to output CDATA sections as
11614regular (escaped) PCDATA
11615
11616=item print_prolog     ($optional_filehandle, %options)
11617
11618Prints the prolog (XML declaration + DTD + entity declarations) of a document.
11619
11620options: see C<L<flush>>.
11621
11622=item prolog     ($optional_filehandle, %options)
11623
11624Return the prolog (XML declaration + DTD + entity declarations) of a document.
11625
11626options: see C<L<flush>>.
11627
11628=item finish
11629
11630Call Expat C<finish> method.
11631Unsets all handlers (including internal ones that set context), but expat
11632continues parsing to the end of the document or until it finds an error.
11633It should finish up a lot faster than with the handlers set.
11634
11635=item finish_print
11636
11637Stops twig processing, flush the twig and proceed to finish printing the
11638document as fast as possible. Use this method when modifying a document and
11639the modification is done.
11640
11641=item finish_now
11642
11643Stops twig processing, does not finish parsing the document (which could
11644actually be not well-formed after the point where C<finish_now> is called).
11645Execution resumes after the C<Lparse>> or C<L<parsefile>> call. The content
11646of the twig is what has been parsed so far (all open elements at the time
11647C<finish_now> is called are considered closed).
11648
11649=item set_expand_external_entities
11650
11651Same as using the C<L<expand_external_ents>> option when creating the twig
11652
11653=item set_input_filter
11654
11655Same as using the C<L<input_filter>> option when creating the twig
11656
11657=item set_keep_atts_order
11658
11659Same as using the C<L<keep_atts_order>> option when creating the twig
11660
11661=item set_keep_encoding
11662
11663Same as using the C<L<keep_encoding>> option when creating the twig
11664
11665=item escape_gt
11666
11667usually XML::Twig does not escape > in its output. Using this option
11668makes it replace > by &gt;
11669
11670=item do_not_escape_gt
11671
11672reverts XML::Twig behavior to its default of not escaping > in its output.
11673
11674=item set_output_filter
11675
11676Same as using the C<L<output_filter>> option when creating the twig
11677
11678=item set_output_text_filter
11679
11680Same as using the C<L<output_text_filter>> option when creating the twig
11681
11682=item add_stylesheet ($type, @options)
11683
11684Adds an external stylesheet to an XML document.
11685
11686Supported types and options:
11687
11688=over 4
11689
11690=item xsl
11691
11692option: the url of the stylesheet
11693
11694Example:
11695
11696  $t->add_stylesheet( xsl => "xsl_style.xsl");
11697
11698will generate the following PI at the beginning of the document:
11699
11700  <?xml-stylesheet type="text/xsl" href="xsl_style.xsl"?>
11701
11702=item css
11703
11704option: the url of the stylesheet
11705
11706=item active_twig
11707
11708a class method that returns the last processed twig, so you don't necessarily
11709need the object to call methods on it.
11710
11711=back
11712
11713=item Methods inherited from XML::Parser::Expat
11714
11715A twig inherits all the relevant methods from XML::Parser::Expat. These
11716methods can only be used during the parsing phase (they will generate
11717a fatal error otherwise).
11718
11719Inherited methods are:
11720
11721=over 4
11722
11723=item depth
11724
11725Returns the size of the context list.
11726
11727=item in_element
11728
11729Returns true if NAME is equal to the name of the innermost cur‐
11730rently opened element. If namespace processing is being used and
11731you want to check against a name that may be in a namespace, then
11732use the generate_ns_name method to create the NAME argument.
11733
11734=item within_element
11735
11736Returns the number of times the given name appears in the context
11737list.  If namespace processing is being used and you want to check
11738against a name that may be in a namespace, then use the gener‐
11739ate_ns_name method to create the NAME argument.
11740
11741=item context
11742
11743Returns a list of element names that represent open elements, with
11744the last one being the innermost. Inside start and end tag han‐
11745dlers, this will be the tag of the parent element.
11746
11747=item current_line
11748
11749Returns the line number of the current position of the parse.
11750
11751=item current_column
11752
11753Returns the column number of the current position of the parse.
11754
11755=item current_byte
11756
11757Returns the current position of the parse.
11758
11759=item position_in_context
11760
11761Returns a string that shows the current parse position. LINES
11762should be an integer >= 0 that represents the number of lines on
11763either side of the current parse line to place into the returned
11764string.
11765
11766=item base ([NEWBASE])
11767
11768Returns the current value of the base for resolving relative URIs.
11769If NEWBASE is supplied, changes the base to that value.
11770
11771=item current_element
11772
11773Returns the name of the innermost currently opened element. Inside
11774start or end handlers, returns the parent of the element associated
11775with those tags.
11776
11777=item element_index
11778
11779Returns an integer that is the depth-first visit order of the cur‐
11780rent element. This will be zero outside of the root element. For
11781example, this will return 1 when called from the start handler for
11782the root element start tag.
11783
11784=item recognized_string
11785
11786Returns the string from the document that was recognized in order
11787to call the current handler. For instance, when called from a start
11788handler, it will give us the start-tag string. The string is
11789encoded in UTF-8.  This method doesn't return a meaningful string
11790inside declaration handlers.
11791
11792=item original_string
11793
11794Returns the verbatim string from the document that was recognized
11795in order to call the current handler. The string is in the original
11796document encoding. This method doesn't return a meaningful string
11797inside declaration handlers.
11798
11799=item xpcroak
11800
11801Concatenate onto the given message the current line number within
11802the XML document plus the message implied by ErrorContext. Then
11803croak with the formed message.
11804
11805=item xpcarp
11806
11807Concatenate onto the given message the current line number within
11808the XML document plus the message implied by ErrorContext. Then
11809carp with the formed message.
11810
11811=item xml_escape(TEXT [, CHAR [, CHAR ...]])
11812
11813Returns TEXT with markup characters turned into character entities.
11814Any additional characters provided as arguments are also turned
11815into character references where found in TEXT.
11816
11817(this method is broken on some versions of expat/XML::Parser)
11818
11819=back
11820
11821=item path ( $optional_tag)
11822
11823Return the element context in a form similar to XPath's short
11824form: 'C</root/tag1/../tag>'
11825
11826=item get_xpath  ( $optional_array_ref, $xpath, $optional_offset)
11827
11828Performs a C<get_xpath> on the document root (see <Elt|"Elt">)
11829
11830If the C<$optional_array_ref> argument is used the array must contain
11831elements. The C<$xpath> expression is applied to each element in turn
11832and the result is union of all results. This way a first query can be
11833refined in further steps.
11834
11835
11836=item find_nodes ( $optional_array_ref, $xpath, $optional_offset)
11837
11838same as C<get_xpath>
11839
11840=item findnodes ( $optional_array_ref, $xpath, $optional_offset)
11841
11842same as C<get_xpath> (similar to the XML::LibXML method)
11843
11844=item findvalue ( $optional_array_ref, $xpath, $optional_offset)
11845
11846Return the C<join> of all texts of the results of applying C<L<get_xpath>>
11847to the node (similar to the XML::LibXML method)
11848
11849=item findvalues ( $optional_array_ref, $xpath, $optional_offset)
11850
11851Return an array of all texts of the results of applying C<L<get_xpath>>
11852to the node
11853
11854=item subs_text ($regexp, $replace)
11855
11856subs_text does text substitution on the whole document, similar to perl's
11857C< s///> operator.
11858
11859=item dispose
11860
11861Useful only if you don't have C<Scalar::Util> or C<WeakRef> installed.
11862
11863Reclaims properly the memory used by an XML::Twig object. As the object has
11864circular references it never goes out of scope, so if you want to parse lots
11865of XML documents then the memory leak becomes a problem. Use
11866C<< $twig->dispose >> to clear this problem.
11867
11868=item att_accessors (list_of_attribute_names)
11869
11870A convenience method that creates l-valued accessors for attributes.
11871So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11872that can be called on elements:
11873
11874  $elt->foo;         # equivalent to $elt->{'att'}->{'foo'};
11875  $elt->foo( 'bar'); # equivalent to $elt->set_att( foo => 'bar');
11876
11877The methods are l-valued only under those perl's that support this
11878feature (5.6 and above)
11879
11880=item create_accessors (list_of_attribute_names)
11881
11882Same as att_accessors
11883
11884=item elt_accessors (list_of_attribute_names)
11885
11886A convenience method that creates accessors for elements.
11887So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11888that can be called on elements:
11889
11890  $elt->foo;         # equivalent to $elt->first_child( 'foo');
11891
11892=item field_accessors (list_of_attribute_names)
11893
11894A convenience method that creates accessors for element values (C<field>).
11895So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11896that can be called on elements:
11897
11898  $elt->foo;         # equivalent to $elt->field( 'foo');
11899
11900=item set_do_not_escape_amp_in_atts
11901
11902An evil method, that I only document because Test::Pod::Coverage complaints otherwise,
11903but really, you don't want to know about it.
11904
11905=back
11906
11907=head2 XML::Twig::Elt
11908
11909=over 4
11910
11911=item new          ($optional_tag, $optional_atts, @optional_content)
11912
11913The C<tag> is optional (but then you can't have a content ), the C<$optional_atts>
11914argument is a reference to a hash of attributes, the content can be just a
11915string or a list of strings and element. A content of 'C<#EMPTY>' creates an empty
11916element;
11917
11918 Examples: my $elt= XML::Twig::Elt->new();
11919           my $elt= XML::Twig::Elt->new( para => { align => 'center' });
11920           my $elt= XML::Twig::Elt->new( para => { align => 'center' }, 'foo');
11921           my $elt= XML::Twig::Elt->new( br   => '#EMPTY');
11922           my $elt= XML::Twig::Elt->new( 'para');
11923           my $elt= XML::Twig::Elt->new( para => 'this is a para');
11924           my $elt= XML::Twig::Elt->new( para => $elt3, 'another para');
11925
11926The strings are not parsed, the element is not attached to any twig.
11927
11928B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
11929this point the element does not belong to a twig yet, so the ID attribute
11930is not known so it won't be stored in the ID list.
11931
11932Note that C<#COMMENT>, C<#PCDATA> or C<#CDATA> are valid tag names, that will
11933create text elements.
11934
11935To create an element C<foo> containing a CDATA section:
11936
11937           my $foo= XML::Twig::Elt->new( '#CDATA' => "content of the CDATA section")
11938                                  ->wrap_in( 'foo');
11939
11940An attribute of '#CDATA', will create the content of the element as CDATA:
11941
11942  my $elt= XML::Twig::Elt->new( 'p' => { '#CDATA' => 1}, 'foo < bar');
11943
11944creates an element
11945
11946  <p><![CDATA[foo < bar]]></>
11947
11948=item parse         ($string, %args)
11949
11950Creates an element from an XML string. The string is actually
11951parsed as a new twig, then the root of that twig is returned.
11952The arguments in C<%args> are passed to the twig.
11953As always if the parse fails the parser will die, so use an
11954eval if you want to trap syntax errors.
11955
11956As obviously the element does not exist beforehand this method has to be
11957called on the class:
11958
11959  my $elt= parse XML::Twig::Elt( "<a> string to parse, with <sub/>
11960                                  <elements>, actually tons of </elements>
11961                  h</a>");
11962
11963=item set_inner_xml ($string)
11964
11965Sets the content of the element to be the tree created from the string
11966
11967=item set_inner_html ($string)
11968
11969Sets the content of the element, after parsing the string with an HTML
11970parser (HTML::Parser)
11971
11972=item set_outer_xml ($string)
11973
11974Replaces the element with the tree created from the string
11975
11976=item print         ($optional_filehandle, $optional_pretty_print_style)
11977
11978Prints an entire element, including the tags, optionally to a
11979C<$optional_filehandle>, optionally with a C<$pretty_print_style>.
11980
11981The print outputs XML data so base entities are escaped.
11982
11983=item print_to_file    ($filename, %options)
11984
11985Prints the element to file C<$filename>.
11986
11987options: see C<flush>.
11988=item sprint       ($elt, $optional_no_enclosing_tag)
11989
11990Return the xml string for an entire element, including the tags.
11991If the optional second argument is true then only the string inside the
11992element is returned (the start and end tag for $elt are not).
11993The text is XML-escaped: base entities (& and < in text, & < and " in
11994attribute values) are turned into entities.
11995
11996=item gi
11997
11998Return the gi of the element (the gi is the C<generic identifier> the tag
11999name in SGML parlance).
12000
12001C<tag> and C<name> are synonyms of C<gi>.
12002
12003=item tag
12004
12005Same as C<L<gi>>
12006
12007=item name
12008
12009Same as C<L<tag>>
12010
12011=item set_gi         ($tag)
12012
12013Set the gi (tag) of an element
12014
12015=item set_tag        ($tag)
12016
12017Set the tag (=C<L<tag>>) of an element
12018
12019=item set_name       ($name)
12020
12021Set the name (=C<L<tag>>) of an element
12022
12023=item root
12024
12025Return the root of the twig in which the element is contained.
12026
12027=item twig
12028
12029Return the twig containing the element.
12030
12031=item parent        ($optional_condition)
12032
12033Return the parent of the element, or the first ancestor matching the
12034C<$optional_condition>
12035
12036=item first_child   ($optional_condition)
12037
12038Return the first child of the element, or the first child matching the
12039C<$optional_condition>
12040
12041=item has_child ($optional_condition)
12042
12043Return the first child of the element, or the first child matching the
12044C<$optional_condition> (same as L<first_child>)
12045
12046=item has_children ($optional_condition)
12047
12048Return the first child of the element, or the first child matching the
12049C<$optional_condition> (same as L<first_child>)
12050
12051
12052=item first_child_text   ($optional_condition)
12053
12054Return the text of the first child of the element, or the first child
12055 matching the C<$optional_condition>
12056If there is no first_child then returns ''. This avoids getting the
12057child, checking for its existence then getting the text for trivial cases.
12058
12059Similar methods are available for the other navigation methods:
12060
12061=over 4
12062
12063=item last_child_text
12064
12065=item prev_sibling_text
12066
12067=item next_sibling_text
12068
12069=item prev_elt_text
12070
12071=item next_elt_text
12072
12073=item child_text
12074
12075=item parent_text
12076
12077=back
12078
12079All this methods also exist in "trimmed" variant:
12080
12081=over 4
12082
12083=item first_child_trimmed_text
12084
12085=item last_child_trimmed_text
12086
12087=item prev_sibling_trimmed_text
12088
12089=item next_sibling_trimmed_text
12090
12091=item prev_elt_trimmed_text
12092
12093=item next_elt_trimmed_text
12094
12095=item child_trimmed_text
12096
12097=item parent_trimmed_text
12098
12099=back
12100
12101=item field         ($condition)
12102
12103Same method as C<first_child_text> with a different name
12104
12105=item fields         ($condition_list)
12106
12107Return the list of field (text of first child matching the conditions),
12108missing fields are returned as the empty string.
12109
12110Same method as C<first_child_text> with a different name
12111
12112=item trimmed_field         ($optional_condition)
12113
12114Same method as C<first_child_trimmed_text> with a different name
12115
12116=item set_field ($condition, $optional_atts, @list_of_elt_and_strings)
12117
12118Set the content of the first child of the element that matches
12119C<$condition>, the rest of the arguments is the same as for C<L<set_content>>
12120
12121If no child matches C<$condition> _and_ if C<$condition> is a valid
12122XML element name, then a new element by that name is created and
12123inserted as the last child.
12124
12125=item first_child_matches   ($optional_condition)
12126
12127Return the element if the first child of the element (if it exists) passes
12128the C<$optional_condition> C<undef> otherwise
12129
12130  if( $elt->first_child_matches( 'title')) ...
12131
12132is equivalent to
12133
12134  if( $elt->{first_child} && $elt->{first_child}->passes( 'title'))
12135
12136C<first_child_is> is an other name for this method
12137
12138Similar methods are available for the other navigation methods:
12139
12140=over 4
12141
12142=item last_child_matches
12143
12144=item prev_sibling_matches
12145
12146=item next_sibling_matches
12147
12148=item prev_elt_matches
12149
12150=item next_elt_matches
12151
12152=item child_matches
12153
12154=item parent_matches
12155
12156=back
12157
12158=item is_first_child ($optional_condition)
12159
12160returns true (the element) if the element is the first child of its parent
12161(optionally that satisfies the C<$optional_condition>)
12162
12163=item is_last_child ($optional_condition)
12164
12165returns true (the element) if the element is the last child of its parent
12166(optionally that satisfies the C<$optional_condition>)
12167
12168=item prev_sibling  ($optional_condition)
12169
12170Return the previous sibling of the element, or the previous sibling matching
12171C<$optional_condition>
12172
12173=item next_sibling  ($optional_condition)
12174
12175Return the next sibling of the element, or the first one matching
12176C<$optional_condition>.
12177
12178=item next_elt     ($optional_elt, $optional_condition)
12179
12180Return the next elt (optionally matching C<$optional_condition>) of the element. This
12181is defined as the next element which opens after the current element opens.
12182Which usually means the first child of the element.
12183Counter-intuitive as it might look this allows you to loop through the
12184whole document by starting from the root.
12185
12186The C<$optional_elt> is the root of a subtree. When the C<next_elt> is out of the
12187subtree then the method returns undef. You can then walk a sub-tree with:
12188
12189  my $elt= $subtree_root;
12190  while( $elt= $elt->next_elt( $subtree_root))
12191    { # insert processing code here
12192    }
12193
12194=item prev_elt     ($optional_condition)
12195
12196Return the previous elt (optionally matching C<$optional_condition>) of the
12197element. This is the first element which opens before the current one.
12198It is usually either the last descendant of the previous sibling or
12199simply the parent
12200
12201=item next_n_elt   ($offset, $optional_condition)
12202
12203Return the C<$offset>-th element that matches the C<$optional_condition>
12204
12205=item following_elt
12206
12207Return the following element (as per the XPath following axis)
12208
12209=item preceding_elt
12210
12211Return the preceding element (as per the XPath preceding axis)
12212
12213=item following_elts
12214
12215Return the list of following elements (as per the XPath following axis)
12216
12217=item preceding_elts
12218
12219Return the list of preceding elements (as per the XPath preceding axis)
12220
12221=item children     ($optional_condition)
12222
12223Return the list of children (optionally which matches C<$optional_condition>) of
12224the element. The list is in document order.
12225
12226=item children_count ($optional_condition)
12227
12228Return the number of children of the element (optionally which matches
12229C<$optional_condition>)
12230
12231=item children_text ($optional_condition)
12232
12233In array context, returns an array containing the text of children of the
12234element (optionally which matches C<$optional_condition>)
12235
12236In scalar context, returns the concatenation of the text of children of
12237the element
12238
12239=item children_trimmed_text ($optional_condition)
12240
12241In array context, returns an array containing the trimmed text of children
12242of the element (optionally which matches C<$optional_condition>)
12243
12244In scalar context, returns the concatenation of the trimmed text of children of
12245the element
12246
12247
12248=item children_copy ($optional_condition)
12249
12250Return a list of elements that are copies of the children of the element,
12251optionally which matches C<$optional_condition>
12252
12253=item descendants     ($optional_condition)
12254
12255Return the list of all descendants (optionally which matches
12256C<$optional_condition>) of the element. This is the equivalent of the
12257C<getElementsByTagName> of the DOM (by the way, if you are really a DOM
12258addict, you can use C<getElementsByTagName> instead)
12259
12260=item getElementsByTagName ($optional_condition)
12261
12262Same as C<L<descendants>>
12263
12264=item find_by_tag_name ($optional_condition)
12265
12266Same as C<L<descendants>>
12267
12268=item descendants_or_self ($optional_condition)
12269
12270Same as C<L<descendants>> except that the element itself is included in the list
12271if it matches the C<$optional_condition>
12272
12273=item first_descendant  ($optional_condition)
12274
12275Return the first descendant of the element that matches the condition
12276
12277=item last_descendant  ($optional_condition)
12278
12279Return the last descendant of the element that matches the condition
12280
12281=item ancestors    ($optional_condition)
12282
12283Return the list of ancestors (optionally matching C<$optional_condition>) of the
12284element.  The list is ordered from the innermost ancestor to the outermost one
12285
12286NOTE: the element itself is not part of the list, in order to include it
12287you will have to use ancestors_or_self
12288
12289=item ancestors_or_self     ($optional_condition)
12290
12291Return the list of ancestors (optionally matching C<$optional_condition>) of the
12292element, including the element (if it matches the condition>).
12293The list is ordered from the innermost ancestor to the outermost one
12294
12295=item passes ($condition)
12296
12297Return the element if it passes the C<$condition>
12298
12299=item att          ($att)
12300
12301Return the value of attribute C<$att> or C<undef>
12302
12303=item latt          ($att)
12304
12305Return the value of attribute C<$att> or C<undef>
12306
12307this method is an lvalue, so you can do C<< $elt->latt( 'foo')= 'bar' >> or C<< $elt->latt( 'foo')++; >>
12308
12309=item set_att      ($att, $att_value)
12310
12311Set the attribute of the element to the given value
12312
12313You can actually set several attributes this way:
12314
12315  $elt->set_att( att1 => "val1", att2 => "val2");
12316
12317=item del_att      ($att)
12318
12319Delete the attribute for the element
12320
12321You can actually delete several attributes at once:
12322
12323  $elt->del_att( 'att1', 'att2', 'att3');
12324
12325=item att_exists ($att)
12326
12327Returns true if the attribute C<$att> exists for the element, false
12328otherwise
12329
12330=item cut
12331
12332Cut the element from the tree. The element still exists, it can be copied
12333or pasted somewhere else, it is just not attached to the tree anymore.
12334
12335Note that the "old" links to the parent, previous and next siblings can
12336still be accessed using the former_* methods
12337
12338=item former_next_sibling
12339
12340Returns the former next sibling of a cut node (or undef if the node has not been cut)
12341
12342This makes it easier to write loops where you cut elements:
12343
12344    my $child= $parent->first_child( 'achild');
12345    while( $child->{'att'}->{'cut'})
12346      { $child->cut; $child= ($child->{former} && $child->{former}->{next_sibling}); }
12347
12348=item former_prev_sibling
12349
12350Returns the former previous sibling of a cut node (or undef if the node has not been cut)
12351
12352=item former_parent
12353
12354Returns the former parent of a cut node (or undef if the node has not been cut)
12355
12356=item cut_children ($optional_condition)
12357
12358Cut all the children of the element (or all of those which satisfy the
12359C<$optional_condition>).
12360
12361Return the list of children
12362
12363=item cut_descendants ($optional_condition)
12364
12365Cut all the descendants of the element (or all of those which satisfy the
12366C<$optional_condition>).
12367
12368Return the list of descendants
12369
12370=item copy        ($elt)
12371
12372Return a copy of the element. The copy is a "deep" copy: all sub-elements of
12373the element are duplicated.
12374
12375=item paste       ($optional_position, $ref)
12376
12377Paste a (previously C<cut> or newly generated) element. Die if the element
12378already belongs to a tree.
12379
12380Note that the calling element is pasted:
12381
12382  $child->paste( first_child => $existing_parent);
12383  $new_sibling->paste( after => $this_sibling_is_already_in_the_tree);
12384
12385or
12386
12387  my $new_elt= XML::Twig::Elt->new( tag => $content);
12388  $new_elt->paste( $position => $existing_elt);
12389
12390Example:
12391
12392  my $t= XML::Twig->new->parse( 'doc.xml')
12393  my $toc= $t->root->new( 'toc');
12394  $toc->paste( $t->root); # $toc is pasted as first child of the root
12395  foreach my $title ($t->findnodes( '/doc/section/title'))
12396    { my $title_toc= $title->copy;
12397      # paste $title_toc as the last child of toc
12398      $title_toc->paste( last_child => $toc)
12399    }
12400
12401Position options:
12402
12403=over 4
12404
12405=item first_child (default)
12406
12407The element is pasted as the first child of C<$ref>
12408
12409=item last_child
12410
12411The element is pasted as the last child of C<$ref>
12412
12413=item before
12414
12415The element is pasted before C<$ref>, as its previous sibling.
12416
12417=item after
12418
12419The element is pasted after C<$ref>, as its next sibling.
12420
12421=item within
12422
12423In this case an extra argument, C<$offset>, should be supplied. The element
12424will be pasted in the reference element (or in its first text child) at the
12425given offset. To achieve this the reference element will be split at the
12426offset.
12427
12428=back
12429
12430Note that you can call directly the underlying method:
12431
12432=over 4
12433
12434=item paste_before
12435
12436=item paste_after
12437
12438=item paste_first_child
12439
12440=item paste_last_child
12441
12442=item paste_within
12443
12444=back
12445
12446=item move       ($optional_position, $ref)
12447
12448Move an element in the tree.
12449This is just a C<cut> then a C<paste>.  The syntax is the same as C<paste>.
12450
12451=item replace       ($ref)
12452
12453Replaces an element in the tree. Sometimes it is just not possible toC<cut>
12454an element then C<paste> another in its place, so C<replace> comes in handy.
12455The calling element replaces C<$ref>.
12456
12457=item replace_with   (@elts)
12458
12459Replaces the calling element with one or more elements
12460
12461=item delete
12462
12463Cut the element and frees the memory.
12464
12465=item prefix       ($text, $optional_option)
12466
12467Add a prefix to an element. If the element is a C<PCDATA> element the text
12468is added to the pcdata, if the elements first child is a C<PCDATA> then the
12469text is added to it's pcdata, otherwise a new C<PCDATA> element is created
12470and pasted as the first child of the element.
12471
12472If the option is C<asis> then the prefix is added asis: it is created in
12473a separate C<PCDATA> element with an C<asis> property. You can then write:
12474
12475  $elt1->prefix( '<b>', 'asis');
12476
12477to create a C<< <b> >> in the output of C<print>.
12478
12479=item suffix       ($text, $optional_option)
12480
12481Add a suffix to an element. If the element is a C<PCDATA> element the text
12482is added to the pcdata, if the elements last child is a C<PCDATA> then the
12483text is added to it's pcdata, otherwise a new PCDATA element is created
12484and pasted as the last child of the element.
12485
12486If the option is C<asis> then the suffix is added asis: it is created in
12487a separate C<PCDATA> element with an C<asis> property. You can then write:
12488
12489  $elt2->suffix( '</b>', 'asis');
12490
12491=item trim
12492
12493Trim the element in-place: spaces at the beginning and at the end of the element
12494are discarded and multiple spaces within the element (or its descendants) are
12495replaced by a single space.
12496
12497Note that in some cases you can still end up with multiple spaces, if they are
12498split between several elements:
12499
12500  <doc>  text <b>  hah! </b>  yep</doc>
12501
12502gets trimmed to
12503
12504  <doc>text <b> hah! </b> yep</doc>
12505
12506This is somewhere in between a bug and a feature.
12507
12508=item normalize
12509
12510merge together all consecutive pcdata elements in the element (if for example
12511you have turned some elements into pcdata using C<L<erase>>, this will give you
12512a "clean" element in which there all text fragments are as long as possible).
12513
12514
12515=item simplify (%options)
12516
12517Return a data structure suspiciously similar to XML::Simple's. Options are
12518identical to XMLin options, see XML::Simple doc for more details (or use
12519DATA::dumper or YAML to dump the data structure)
12520
12521B<Note>: there is no magic here, if you write
12522C<< $twig->parsefile( $file )->simplify(); >> then it will load the entire
12523document in memory. I am afraid you will have to put some work into it to
12524get just the bits you want and discard the rest. Look at the synopsis or
12525the XML::Twig 101 section at the top of the docs for more information.
12526
12527=over 4
12528
12529=item content_key
12530
12531=item forcearray
12532
12533=item keyattr
12534
12535=item noattr
12536
12537=item normalize_space
12538
12539aka normalise_space
12540
12541=item variables (%var_hash)
12542
12543%var_hash is a hash { name => value }
12544
12545This option allows variables in the XML to be expanded when the file is read. (there is no facility for putting the variable names back if you regenerate XML using XMLout).
12546
12547A 'variable' is any text of the form ${name} (or $name) which occurs in an attribute value or in the text content of an element. If 'name' matches a key in the supplied hashref, ${name} will be replaced with the corresponding value from the hashref. If no matching key is found, the variable will not be replaced.
12548
12549=item var_att ($attribute_name)
12550
12551This option gives the name of an attribute that will be used to create
12552variables in the XML:
12553
12554  <dirs>
12555    <dir name="prefix">/usr/local</dir>
12556    <dir name="exec_prefix">$prefix/bin</dir>
12557  </dirs>
12558
12559use C<< var => 'name' >> to get $prefix replaced by /usr/local in the
12560generated data structure
12561
12562By default variables are captured by the following regexp: /$(\w+)/
12563
12564=item var_regexp (regexp)
12565
12566This option changes the regexp used to capture variables. The variable
12567name should be in $1
12568
12569=item group_tags { grouping tag => grouped tag, grouping tag 2 => grouped tag 2...}
12570
12571Option used to simplify the structure: elements listed will not be used.
12572Their children will be, they will be considered children of the element
12573parent.
12574
12575If the element is:
12576
12577  <config host="laptop.xmltwig.org">
12578    <server>localhost</server>
12579    <dirs>
12580      <dir name="base">/home/mrodrigu/standards</dir>
12581      <dir name="tools">$base/tools</dir>
12582    </dirs>
12583    <templates>
12584      <template name="std_def">std_def.templ</template>
12585      <template name="dummy">dummy</template>
12586    </templates>
12587  </config>
12588
12589Then calling simplify with C<< group_tags => { dirs => 'dir',
12590templates => 'template'} >>
12591makes the data structure be exactly as if the start and end tags for C<dirs> and
12592C<templates> were not there.
12593
12594A YAML dump of the structure
12595
12596  base: '/home/mrodrigu/standards'
12597  host: laptop.xmltwig.org
12598  server: localhost
12599  template:
12600    - std_def.templ
12601    - dummy.templ
12602  tools: '$base/tools'
12603
12604
12605=back
12606
12607=item split_at        ($offset)
12608
12609Split a text (C<PCDATA> or C<CDATA>) element in 2 at C<$offset>, the original
12610element now holds the first part of the string and a new element holds the
12611right part. The new element is returned
12612
12613If the element is not a text element then the first text child of the element
12614is split
12615
12616=item split        ( $optional_regexp, $tag1, $atts1, $tag2, $atts2...)
12617
12618Split the text descendants of an element in place, the text is split using
12619the C<$regexp>, if the regexp includes () then the matched separators will be
12620wrapped in elements.  C<$1> is wrapped in $tag1, with attributes C<$atts1> if
12621C<$atts1> is given (as a hashref), C<$2> is wrapped in $tag2...
12622
12623if $elt is C<< <p>tati tata <b>tutu tati titi</b> tata tati tata</p> >>
12624
12625  $elt->split( qr/(ta)ti/, 'foo', {type => 'toto'} )
12626
12627will change $elt to
12628
12629  <p><foo type="toto">ta</foo> tata <b>tutu <foo type="toto">ta</foo>
12630      titi</b> tata <foo type="toto">ta</foo> tata</p>
12631
12632The regexp can be passed either as a string or as C<qr//> (perl 5.005 and
12633later), it defaults to \s+ just as the C<split> built-in (but this would be
12634quite a useless behaviour without the C<$optional_tag> parameter)
12635
12636C<$optional_tag> defaults to PCDATA or CDATA, depending on the initial element
12637type
12638
12639The list of descendants is returned (including un-touched original elements
12640and newly created ones)
12641
12642=item mark        ( $regexp, $optional_tag, $optional_attribute_ref)
12643
12644This method behaves exactly as L<split>, except only the newly created
12645elements are returned
12646
12647=item wrap_children ( $regexp_string, $tag, $optional_attribute_hashref)
12648
12649Wrap the children of the element that match the regexp in an element C<$tag>.
12650If $optional_attribute_hashref is passed then the new element will
12651have these attributes.
12652
12653The $regexp_string includes tags, within pointy brackets, as in
12654C<< <title><para>+ >> and the usual Perl modifiers (+*?...).
12655Tags can be further qualified with attributes:
12656C<< <para type="warning" classif="cosmic_secret">+ >>. The values
12657for attributes should be xml-escaped: C<< <candy type="M&amp;Ms">* >>
12658(C<E<lt>>, C<&> B<C<E<gt>>> and C<"> should be escaped).
12659
12660Note that elements might get extra C<id> attributes in the process. See L<add_id>.
12661Use L<strip_att> to remove unwanted id's.
12662
12663Here is an example:
12664
12665If the element C<$elt> has the following content:
12666
12667  <elt>
12668   <p>para 1</p>
12669   <l_l1_1>list 1 item 1 para 1</l_l1_1>
12670     <l_l1>list 1 item 1 para 2</l_l1>
12671   <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
12672   <l_l1_n>list 1 item 3 para 1</l_l1_n>
12673     <l_l1>list 1 item 3 para 2</l_l1>
12674     <l_l1>list 1 item 3 para 3</l_l1>
12675   <l_l1_1>list 2 item 1 para 1</l_l1_1>
12676     <l_l1>list 2 item 1 para 2</l_l1>
12677   <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
12678   <l_l1_n>list 2 item 3 para 1</l_l1_n>
12679     <l_l1>list 2 item 3 para 2</l_l1>
12680     <l_l1>list 2 item 3 para 3</l_l1>
12681  </elt>
12682
12683Then the code
12684
12685  $elt->wrap_children( q{<l_l1_1><l_l1>*} , li => { type => "ul1" });
12686  $elt->wrap_children( q{<l_l1_n><l_l1>*} , li => { type => "ul" });
12687
12688  $elt->wrap_children( q{<li type="ul1"><li type="ul">+}, "ul");
12689  $elt->strip_att( 'id');
12690  $elt->strip_att( 'type');
12691  $elt->print;
12692
12693will output:
12694
12695  <elt>
12696     <p>para 1</p>
12697     <ul>
12698       <li>
12699         <l_l1_1>list 1 item 1 para 1</l_l1_1>
12700         <l_l1>list 1 item 1 para 2</l_l1>
12701       </li>
12702       <li>
12703         <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
12704       </li>
12705       <li>
12706         <l_l1_n>list 1 item 3 para 1</l_l1_n>
12707         <l_l1>list 1 item 3 para 2</l_l1>
12708         <l_l1>list 1 item 3 para 3</l_l1>
12709       </li>
12710     </ul>
12711     <ul>
12712       <li>
12713         <l_l1_1>list 2 item 1 para 1</l_l1_1>
12714         <l_l1>list 2 item 1 para 2</l_l1>
12715       </li>
12716       <li>
12717         <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
12718       </li>
12719       <li>
12720         <l_l1_n>list 2 item 3 para 1</l_l1_n>
12721         <l_l1>list 2 item 3 para 2</l_l1>
12722         <l_l1>list 2 item 3 para 3</l_l1>
12723       </li>
12724     </ul>
12725  </elt>
12726
12727=item subs_text ($regexp, $replace)
12728
12729subs_text does text substitution, similar to perl's C< s///> operator.
12730
12731C<$regexp> must be a perl regexp, created with the C<qr> operator.
12732
12733C<$replace> can include C<$1, $2>... from the C<$regexp>. It can also be
12734used to create element and entities, by using
12735C<< &elt( tag => { att => val }, text) >> (similar syntax as C<L<new>>) and
12736C<< &ent( name) >>.
12737
12738Here is a rather complex example:
12739
12740  $elt->subs_text( qr{(?<!do not )link to (http://([^\s,]*))},
12741                   'see &elt( a =>{ href => $1 }, $2)'
12742                 );
12743
12744This will replace text like I<link to http://www.xmltwig.org> by
12745I<< see <a href="www.xmltwig.org">www.xmltwig.org</a> >>, but not
12746I<do not link to...>
12747
12748Generating entities (here replacing spaces with &nbsp;):
12749
12750  $elt->subs_text( qr{ }, '&ent( "&nbsp;")');
12751
12752or, using a variable:
12753
12754  my $ent="&nbsp;";
12755  $elt->subs_text( qr{ }, "&ent( '$ent')");
12756
12757Note that the substitution is always global, as in using the C<g> modifier
12758in a perl substitution, and that it is performed on all text descendants
12759of the element.
12760
12761B<Bug>: in the C<$regexp>, you can only use C<\1>, C<\2>... if the replacement
12762expression does not include elements or attributes. eg
12763
12764  $t->subs_text( qr/((t[aiou])\2)/, '$2');             # ok, replaces toto, tata, titi, tutu by to, ta, ti, tu
12765  $t->subs_text( qr/((t[aiou])\2)/, '&elt(p => $1)' ); # NOK, does not find toto...
12766
12767=item add_id ($optional_coderef)
12768
12769Add an id to the element.
12770
12771The id is an attribute, C<id> by default, see the C<id> option for XML::Twig
12772C<new> to change it. Use an id starting with C<#> to get an id that's not
12773output by L<print>, L<flush> or L<sprint>, yet that allows you to use the
12774L<elt_id> method to get the element easily.
12775
12776If the element already has an id, no new id is generated.
12777
12778By default the method create an id of the form C<< twig_id_<nnnn> >>,
12779where C<< <nnnn> >> is a number, incremented each time the method is called
12780successfully.
12781
12782=item set_id_seed ($prefix)
12783
12784by default the id generated by C<L<add_id>> is C<< twig_id_<nnnn> >>,
12785C<set_id_seed> changes the prefix to C<$prefix> and resets the number
12786to 1
12787
12788=item strip_att ($att)
12789
12790Remove the attribute C<$att> from all descendants of the element (including
12791the element)
12792
12793Return the element
12794
12795=item change_att_name ($old_name, $new_name)
12796
12797Change the name of the attribute from C<$old_name> to C<$new_name>. If there is no
12798attribute C<$old_name> nothing happens.
12799
12800=item lc_attnames
12801
12802Lower cases the name all the attributes of the element.
12803
12804=item sort_children_on_value( %options)
12805
12806Sort the children of the element in place according to their text.
12807All children are sorted.
12808
12809Return the element, with its children sorted.
12810
12811
12812C<%options> are
12813
12814  type  : numeric |  alpha     (default: alpha)
12815  order : normal  |  reverse   (default: normal)
12816
12817Return the element, with its children sorted
12818
12819
12820=item sort_children_on_att ($att, %options)
12821
12822Sort the children of the  element in place according to attribute C<$att>.
12823C<%options> are the same as for C<sort_children_on_value>
12824
12825Return the element.
12826
12827
12828=item sort_children_on_field ($tag, %options)
12829
12830Sort the children of the element in place, according to the field C<$tag> (the
12831text of the first child of the child with this tag). C<%options> are the same
12832as for C<sort_children_on_value>.
12833
12834Return the element, with its children sorted
12835
12836
12837=item sort_children( $get_key, %options)
12838
12839Sort the children of the element in place. The C<$get_key> argument is
12840a reference to a function that returns the sort key when passed an element.
12841
12842For example:
12843
12844  $elt->sort_children( sub { $_[0]->{'att'}->{"nb"} + $_[0]->text },
12845                       type => 'numeric', order => 'reverse'
12846                     );
12847
12848=item field_to_att ($cond, $att)
12849
12850Turn the text of the first sub-element matched by C<$cond> into the value of
12851attribute C<$att> of the element. If C<$att> is omitted then C<$cond> is used
12852as the name of the attribute, which makes sense only if C<$cond> is a valid
12853element (and attribute) name.
12854
12855The sub-element is then cut.
12856
12857=item att_to_field ($att, $tag)
12858
12859Take the value of attribute C<$att> and create a sub-element C<$tag> as first
12860child of the element. If C<$tag> is omitted then C<$att> is used as the name of
12861the sub-element.
12862
12863
12864=item get_xpath  ($xpath, $optional_offset)
12865
12866Return a list of elements satisfying the C<$xpath>. C<$xpath> is an XPATH-like
12867expression.
12868
12869A subset of the XPATH abbreviated syntax is covered:
12870
12871  tag
12872  tag[1] (or any other positive number)
12873  tag[last()]
12874  tag[@att] (the attribute exists for the element)
12875  tag[@att="val"]
12876  tag[@att=~ /regexp/]
12877  tag[att1="val1" and att2="val2"]
12878  tag[att1="val1" or att2="val2"]
12879  tag[string()="toto"] (returns tag elements which text (as per the text method)
12880                       is toto)
12881  tag[string()=~/regexp/] (returns tag elements which text (as per the text
12882                          method) matches regexp)
12883  expressions can start with / (search starts at the document root)
12884  expressions can start with . (search starts at the current element)
12885  // can be used to get all descendants instead of just direct children
12886  * matches any tag
12887
12888So the following examples from the
12889F<XPath recommendationL<http://www.w3.org/TR/xpath.html#path-abbrev>> work:
12890
12891  para selects the para element children of the context node
12892  * selects all element children of the context node
12893  para[1] selects the first para child of the context node
12894  para[last()] selects the last para child of the context node
12895  */para selects all para grandchildren of the context node
12896  /doc/chapter[5]/section[2] selects the second section of the fifth chapter
12897     of the doc
12898  chapter//para selects the para element descendants of the chapter element
12899     children of the context node
12900  //para selects all the para descendants of the document root and thus selects
12901     all para elements in the same document as the context node
12902  //olist/item selects all the item elements in the same document as the
12903     context node that have an olist parent
12904  .//para selects the para element descendants of the context node
12905  .. selects the parent of the context node
12906  para[@type="warning"] selects all para children of the context node that have
12907     a type attribute with value warning
12908  employee[@secretary and @assistant] selects all the employee children of the
12909     context node that have both a secretary attribute and an assistant
12910     attribute
12911
12912
12913The elements will be returned in the document order.
12914
12915If C<$optional_offset> is used then only one element will be returned, the one
12916with the appropriate offset in the list, starting at 0
12917
12918Quoting and interpolating variables can be a pain when the Perl syntax and the
12919XPATH syntax collide, so use alternate quoting mechanisms like q or qq
12920(I like q{} and qq{} myself).
12921
12922Here are some more examples to get you started:
12923
12924  my $p1= "p1";
12925  my $p2= "p2";
12926  my @res= $t->get_xpath( qq{p[string( "$p1") or string( "$p2")]});
12927
12928  my $a= "a1";
12929  my @res= $t->get_xpath( qq{//*[@att="$a"]});
12930
12931  my $val= "a1";
12932  my $exp= qq{//p[ \@att='$val']}; # you need to use \@ or you will get a warning
12933  my @res= $t->get_xpath( $exp);
12934
12935Note that the only supported regexps delimiters are / and that you must
12936backslash all / in regexps AND in regular strings.
12937
12938XML::Twig does not provide natively full XPATH support, but you can use
12939C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
12940XPath engine, with full coverage of the spec.
12941
12942C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
12943XPath engine, with full coverage of the spec.
12944
12945=item find_nodes
12946
12947same asC<get_xpath>
12948
12949=item findnodes
12950
12951same as C<get_xpath>
12952
12953
12954=item text @optional_options
12955
12956Return a string consisting of all the C<PCDATA> and C<CDATA> in an element,
12957without any tags. The text is not XML-escaped: base entities such as C<&>
12958and C<< < >> are not escaped.
12959
12960The 'C<no_recurse>' option will only return the text of the element, not
12961of any included sub-elements (same as C<L<text_only>>).
12962
12963=item text_only
12964
12965Same as C<L<text>> except that the text returned doesn't include
12966the text of sub-elements.
12967
12968=item trimmed_text
12969
12970Same as C<text> except that the text is trimmed: leading and trailing spaces
12971are discarded, consecutive spaces are collapsed
12972
12973=item set_text        ($string)
12974
12975Set the text for the element: if the element is a C<PCDATA>, just set its
12976text, otherwise cut all the children of the element and create a single
12977C<PCDATA> child for it, which holds the text.
12978
12979=item merge ($elt2)
12980
12981Move the content of C<$elt2> within the element
12982
12983=item insert         ($tag1, [$optional_atts1], $tag2, [$optional_atts2],...)
12984
12985For each tag in the list inserts an element C<$tag> as the only child of the
12986element.  The element gets the optional attributes inC<< $optional_atts<n>. >>
12987All children of the element are set as children of the new element.
12988The upper level element is returned.
12989
12990  $p->insert( table => { border=> 1}, 'tr', 'td')
12991
12992put C<$p> in a table with a visible border, a single C<tr> and a single C<td>
12993and return the C<table> element:
12994
12995  <p><table border="1"><tr><td>original content of p</td></tr></table></p>
12996
12997=item wrap_in        (@tag)
12998
12999Wrap elements in C<@tag> as the successive ancestors of the element, returns the
13000new element.
13001C<< $elt->wrap_in( 'td', 'tr', 'table') >> wraps the element as a single cell in a
13002table for example.
13003
13004Optionally each tag can be followed by a hashref of attributes, that will be
13005set on the wrapping element:
13006
13007  $elt->wrap_in( p => { class => "advisory" }, div => { class => "intro", id => "div_intro" });
13008
13009=item insert_new_elt ($opt_position, $tag, $opt_atts_hashref, @opt_content)
13010
13011Combines a C<L<new> > and a C<L<paste> >: creates a new element using
13012C<$tag>, C<$opt_atts_hashref >and C<@opt_content> which are arguments similar
13013to those for C<new>, then paste it, using C<$opt_position> or C<'first_child'>,
13014relative to C<$elt>.
13015
13016Return the newly created element
13017
13018=item erase
13019
13020Erase the element: the element is deleted and all of its children are
13021pasted in its place.
13022
13023=item set_content    ( $optional_atts, @list_of_elt_and_strings)
13024                     ( $optional_atts, '#EMPTY')
13025
13026Set the content for the element, from a list of strings and
13027elements.  Cuts all the element children, then pastes the list
13028elements as the children.  This method will create a C<PCDATA> element
13029for any strings in the list.
13030
13031The C<$optional_atts> argument is the ref of a hash of attributes. If this
13032argument is used then the previous attributes are deleted, otherwise they
13033are left untouched.
13034
13035B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
13036this point the element does not belong to a twig yet, so the ID attribute
13037is not known so it won't be stored in the ID list.
13038
13039A content of 'C<#EMPTY>' creates an empty element;
13040
13041=item namespace ($optional_prefix)
13042
13043Return the URI of the namespace that C<$optional_prefix> or the element name
13044belongs to. If the name doesn't belong to any namespace, C<undef> is returned.
13045
13046=item local_name
13047
13048Return the local name (without the prefix) for the element
13049
13050=item ns_prefix
13051
13052Return the namespace prefix for the element
13053
13054=item current_ns_prefixes
13055
13056Return a list of namespace prefixes valid for the element. The order of the
13057prefixes in the list has no meaning. If the default namespace is currently
13058bound, '' appears in the list.
13059
13060
13061=item inherit_att  ($att, @optional_tag_list)
13062
13063Return the value of an attribute inherited from parent tags. The value
13064returned is found by looking for the attribute in the element then in turn
13065in each of its ancestors. If the C<@optional_tag_list> is supplied only those
13066ancestors whose tag is in the list will be checked.
13067
13068=item all_children_are ($optional_condition)
13069
13070return 1 if all children of the element pass the C<$optional_condition>,
130710 otherwise
13072
13073=item level       ($optional_condition)
13074
13075Return the depth of the element in the twig (root is 0).
13076If C<$optional_condition> is given then only ancestors that match the condition are
13077counted.
13078
13079B<WARNING>: in a tree created using the C<twig_roots> option this will not return
13080the level in the document tree, level 0 will be the document root, level 1
13081will be the C<twig_roots> elements. During the parsing (in a C<twig_handler>)
13082you can use the C<depth> method on the twig object to get the real parsing depth.
13083
13084=item in           ($potential_parent)
13085
13086Return true if the element is in the potential_parent (C<$potential_parent> is
13087an element)
13088
13089=item in_context   ($cond, $optional_level)
13090
13091Return true if the element is included in an element which passes C<$cond>
13092optionally within C<$optional_level> levels. The returned value is the
13093including element.
13094
13095=item pcdata
13096
13097Return the text of a C<PCDATA> element or C<undef> if the element is not
13098C<PCDATA>.
13099
13100=item pcdata_xml_string
13101
13102Return the text of a C<PCDATA> element or undef if the element is not C<PCDATA>.
13103The text is "XML-escaped" ('&' and '<' are replaced by '&amp;' and '&lt;')
13104
13105=item set_pcdata     ($text)
13106
13107Set the text of a C<PCDATA> element. This method does not check that the element is
13108indeed a C<PCDATA> so usually you should use C<L<set_text>> instead.
13109
13110=item append_pcdata  ($text)
13111
13112Add the text at the end of a C<PCDATA> element.
13113
13114=item is_cdata
13115
13116Return 1 if the element is a C<CDATA> element, returns 0 otherwise.
13117
13118=item is_text
13119
13120Return 1 if the element is a C<CDATA> or C<PCDATA> element, returns 0 otherwise.
13121
13122=item cdata
13123
13124Return the text of a C<CDATA> element or C<undef> if the element is not
13125C<CDATA>.
13126
13127=item cdata_string
13128
13129Return the XML string of a C<CDATA> element, including the opening and
13130closing markers.
13131
13132=item set_cdata     ($text)
13133
13134Set the text of a C<CDATA> element.
13135
13136=item append_cdata  ($text)
13137
13138Add the text at the end of a C<CDATA> element.
13139
13140=item remove_cdata
13141
13142Turns all C<CDATA> sections in the element into regular C<PCDATA> elements. This is useful
13143when converting XML to HTML, as browsers do not support CDATA sections.
13144
13145=item extra_data
13146
13147Return the extra_data (comments and PI's) attached to an element
13148
13149=item set_extra_data     ($extra_data)
13150
13151Set the extra_data (comments and PI's) attached to an element
13152
13153=item append_extra_data  ($extra_data)
13154
13155Append extra_data to the existing extra_data before the element (if no
13156previous extra_data exists then it is created)
13157
13158=item set_asis
13159
13160Set a property of the element that causes it to be output without being XML
13161escaped by the print functions: if it contains C<< a < b >> it will be output
13162as such and not as C<< a &lt; b >>. This can be useful to create text elements
13163that will be output as markup. Note that all C<PCDATA> descendants of the
13164element are also marked as having the property (they are the ones that are
13165actually impacted by the change).
13166
13167If the element is a C<CDATA> element it will also be output asis, without the
13168C<CDATA> markers. The same goes for any C<CDATA> descendant of the element
13169
13170=item set_not_asis
13171
13172Unsets the C<asis> property for the element and its text descendants.
13173
13174=item is_asis
13175
13176Return the C<asis> property status of the element ( 1 or C<undef>)
13177
13178=item closed
13179
13180Return true if the element has been closed. Might be useful if you are
13181somewhere in the tree, during the parse, and have no idea whether a parent
13182element is completely loaded or not.
13183
13184=item get_type
13185
13186Return the type of the element: 'C<#ELT>' for "real" elements, or 'C<#PCDATA>',
13187'C<#CDATA>', 'C<#COMMENT>', 'C<#ENT>', 'C<#PI>'
13188
13189=item is_elt
13190
13191Return the tag if the element is a "real" element, or 0 if it is C<PCDATA>,
13192C<CDATA>...
13193
13194=item contains_only_text
13195
13196Return 1 if the element does not contain any other "real" element
13197
13198=item contains_only ($exp)
13199
13200Return the list of children if all children of the element match
13201the expression C<$exp>
13202
13203  if( $para->contains_only( 'tt')) { ... }
13204
13205=item contains_a_single ($exp)
13206
13207If the element contains a single child that matches the expression C<$exp>
13208returns that element. Otherwise returns 0.
13209
13210=item is_field
13211
13212same as C<contains_only_text>
13213
13214=item is_pcdata
13215
13216Return 1 if the element is a C<PCDATA> element, returns 0 otherwise.
13217
13218=item is_ent
13219
13220Return 1 if the element is an entity (an unexpanded entity) element,
13221return 0 otherwise.
13222
13223=item is_empty
13224
13225Return 1 if the element is empty, 0 otherwise
13226
13227=item set_empty
13228
13229Flags the element as empty. No further check is made, so if the element
13230is actually not empty the output will be messed. The only effect of this
13231method is that the output will be C<< <tag att="value""/> >>.
13232
13233=item set_not_empty
13234
13235Flags the element as not empty. if it is actually empty then the element will
13236be output as C<< <tag att="value""></tag> >>
13237
13238=item is_pi
13239
13240Return 1 if the element is a processing instruction (C<#PI>) element,
13241return 0 otherwise.
13242
13243=item target
13244
13245Return the target of a processing instruction
13246
13247=item set_target ($target)
13248
13249Set the target of a processing instruction
13250
13251=item data
13252
13253Return the data part of a processing instruction
13254
13255=item set_data ($data)
13256
13257Set the data of a processing instruction
13258
13259=item set_pi ($target, $data)
13260
13261Set the target and data of a processing instruction
13262
13263=item pi_string
13264
13265Return the string form of a processing instruction
13266(C<< <?target data?> >>)
13267
13268=item is_comment
13269
13270Return 1 if the element is a comment (C<#COMMENT>) element,
13271return 0 otherwise.
13272
13273=item set_comment ($comment_text)
13274
13275Set the text for a comment
13276
13277=item comment
13278
13279Return the content of a comment (just the text, not the C<< <!-- >>
13280and C<< --> >>)
13281
13282=item comment_string
13283
13284Return the XML string for a comment (C<< <!-- comment --> >>)
13285
13286Note that an XML comment cannot start or end with a '-', or include '--'
13287(http://www.w3.org/TR/2008/REC-xml-20081126/#sec-comments),
13288if that is the case (because you have created the comment yourself presumably,
13289as it could not be in the input XML), then a space will be inserted before
13290an initial '-', after a trailing one or between two '-' in the comment
13291(which could presumably mangle javascript "hidden" in an XHTML comment);
13292
13293=item set_ent ($entity)
13294
13295Set an (non-expanded) entity (C<#ENT>). C<$entity>) is the entity
13296text (C<&ent;>)
13297
13298=item ent
13299
13300Return the entity for an entity (C<#ENT>) element (C<&ent;>)
13301
13302=item ent_name
13303
13304Return the entity name for an entity (C<#ENT>) element (C<ent>)
13305
13306=item ent_string
13307
13308Return the entity, either expanded if the expanded version is available,
13309or non-expanded (C<&ent;>) otherwise
13310
13311=item child ($offset, $optional_condition)
13312
13313Return the C<$offset>-th child of the element, optionally the C<$offset>-th
13314child that matches C<$optional_condition>. The children are treated as a list, so
13315C<< $elt->child( 0) >> is the first child, while C<< $elt->child( -1) >> is
13316the last child.
13317
13318=item child_text ($offset, $optional_condition)
13319
13320Return the text of a child or C<undef> if the sibling does not exist. Arguments
13321are the same as child.
13322
13323=item last_child    ($optional_condition)
13324
13325Return the last child of the element, or the last child matching
13326C<$optional_condition> (ie the last of the element children matching
13327the condition).
13328
13329=item last_child_text   ($optional_condition)
13330
13331Same as C<first_child_text> but for the last child.
13332
13333=item sibling  ($offset, $optional_condition)
13334
13335Return the next or previous C<$offset>-th sibling of the element, or the
13336C<$offset>-th one matching C<$optional_condition>. If C<$offset> is negative then a
13337previous sibling is returned, if $offset is positive then  a next sibling is
13338returned. C<$offset=0> returns the element if there is no condition or
13339if the element matches the condition>, C<undef> otherwise.
13340
13341=item sibling_text ($offset, $optional_condition)
13342
13343Return the text of a sibling or C<undef> if the sibling does not exist.
13344Arguments are the same as C<sibling>.
13345
13346=item prev_siblings ($optional_condition)
13347
13348Return the list of previous siblings (optionally matching C<$optional_condition>)
13349for the element. The elements are ordered in document order.
13350
13351=item next_siblings ($optional_condition)
13352
13353Return the list of siblings (optionally matching C<$optional_condition>)
13354following the element. The elements are ordered in document order.
13355
13356=item siblings ($optional_condition)
13357
13358Return the list of siblings (optionally matching C<$optional_condition>)
13359of the element (excluding the element itself). The elements are ordered
13360in document order.
13361
13362=item pos ($optional_condition)
13363
13364Return the position of the element in the children list. The first child has a
13365position of 1 (as in XPath).
13366
13367If the C<$optional_condition> is given then only siblings that match the condition
13368are counted. If the element itself does not match the  condition then
133690 is returned.
13370
13371=item atts
13372
13373Return a hash ref containing the element attributes
13374
13375=item set_atts      ({ att1=>$att1_val, att2=> $att2_val... })
13376
13377Set the element attributes with the hash ref supplied as the argument. The previous
13378attributes are lost (ie the attributes set by C<set_atts> replace all of the
13379attributes of the element).
13380
13381You can also pass a list instead of a hashref: C<< $elt->set_atts( att1 => 'val1',...) >>
13382
13383=item del_atts
13384
13385Deletes all the element attributes.
13386
13387=item att_nb
13388
13389Return the number of attributes for the element
13390
13391=item has_atts
13392
13393Return true if the element has attributes (in fact return the number of
13394attributes, thus being an alias to C<L<att_nb>>
13395
13396=item has_no_atts
13397
13398Return true if the element has no attributes, false (0) otherwise
13399
13400=item att_names
13401
13402return a list of the attribute names for the element
13403
13404=item att_xml_string ($att, $options)
13405
13406Return the attribute value, where '&', '<' and quote (" or the value of the quote option
13407at twig creation) are XML-escaped.
13408
13409The options are passed as a hashref, setting C<escape_gt> to a true value will also escape
13410'>' ($elt( 'myatt', { escape_gt => 1 });
13411
13412=item set_id       ($id)
13413
13414Set the C<id> attribute of the element to the value.
13415See C<L<elt_id> > to change the id attribute name
13416
13417=item id
13418
13419Gets the id attribute value
13420
13421=item del_id       ($id)
13422
13423Deletes the C<id> attribute of the element and remove it from the id list
13424for the document
13425
13426=item class
13427
13428Return the C<class> attribute for the element (methods on the C<class>
13429attribute are quite convenient when dealing with XHTML, or plain XML that
13430will eventually be displayed using CSS)
13431
13432=item lclass
13433
13434same as class, except that
13435this method is an lvalue, so you can do C<< $elt->lclass= "foo" >>
13436
13437=item set_class ($class)
13438
13439Set the C<class> attribute for the element to C<$class>
13440
13441=item add_class ($class)
13442
13443Add C<$class> to the element C<class> attribute: the new class is added
13444only if it is not already present.
13445
13446Note that classes are then sorted alphabetically, so the C<class> attribute
13447can be changed even if the class is already there
13448
13449=item remove_class ($class)
13450
13451Remove C<$class> from the element C<class> attribute.
13452
13453Note that classes are then sorted alphabetically, so the C<class> attribute can be
13454changed even if the class is already there
13455
13456
13457=item add_to_class ($class)
13458
13459alias for add_class
13460
13461=item att_to_class ($att)
13462
13463Set the C<class> attribute to the value of attribute C<$att>
13464
13465=item add_att_to_class ($att)
13466
13467Add the value of attribute C<$att> to the C<class> attribute of the element
13468
13469=item move_att_to_class ($att)
13470
13471Add the value of attribute C<$att> to the C<class> attribute of the element
13472and delete the attribute
13473
13474=item tag_to_class
13475
13476Set the C<class> attribute of the element to the element tag
13477
13478=item add_tag_to_class
13479
13480Add the element tag to its C<class> attribute
13481
13482=item set_tag_class ($new_tag)
13483
13484Add the element tag to its C<class> attribute and sets the tag to C<$new_tag>
13485
13486=item in_class ($class)
13487
13488Return true (C<1>) if the element is in the class C<$class> (if C<$class> is
13489one of the tokens in the element C<class> attribute)
13490
13491=item tag_to_span
13492
13493Change the element tag tp C<span> and set its class to the old tag
13494
13495=item tag_to_div
13496
13497Change the element tag tp C<div> and set its class to the old tag
13498
13499=item DESTROY
13500
13501Frees the element from memory.
13502
13503=item start_tag
13504
13505Return the string for the start tag for the element, including
13506the C<< /> >> at the end of an empty element tag
13507
13508=item end_tag
13509
13510Return the string for the end tag of an element.  For an empty
13511element, this returns the empty string ('').
13512
13513=item xml_string @optional_options
13514
13515Equivalent to C<< $elt->sprint( 1) >>, returns the string for the entire
13516element, excluding the element's tags (but nested element tags are present)
13517
13518The 'C<no_recurse>' option will only return the text of the element, not
13519of any included sub-elements (same as C<L<xml_text_only>>).
13520
13521=item inner_xml
13522
13523Another synonym for xml_string
13524
13525=item outer_xml
13526
13527An other synonym for sprint
13528
13529=item xml_text
13530
13531Return the text of the element, encoded (and processed by the current
13532C<L<output_filter>> or C<L<output_encoding>> options, without any tag.
13533
13534=item xml_text_only
13535
13536Same as C<L<xml_text>> except that the text returned doesn't include
13537the text of sub-elements.
13538
13539=item set_pretty_print ($style)
13540
13541Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
13542'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>'
13543
13544pretty_print styles:
13545
13546=over 4
13547
13548=item none
13549
13550the default, no C<\n> is used
13551
13552=item nsgmls
13553
13554nsgmls style, with C<\n> added within tags
13555
13556=item nice
13557
13558adds C<\n> wherever possible (NOT SAFE, can lead to invalid XML)
13559
13560=item indented
13561
13562same as C<nice> plus indents elements (NOT SAFE, can lead to invalid XML)
13563
13564=item record
13565
13566table-oriented pretty print, one field per line
13567
13568=item record_c
13569
13570table-oriented pretty print, more compact than C<record>, one record per line
13571
13572=back
13573
13574=item set_empty_tag_style ($style)
13575
13576Set the method to output empty tags, amongst 'C<normal>' (default), 'C<html>',
13577and 'C<expand>',
13578
13579C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
13580'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
13581'C<< <tag></tag> >>'
13582
13583=item set_remove_cdata  ($flag)
13584
13585set (or unset) the flag that forces the twig to output CDATA sections as
13586regular (escaped) PCDATA
13587
13588
13589=item set_indent ($string)
13590
13591Set the indentation for the indented pretty print style (default is 2 spaces)
13592
13593=item set_quote ($quote)
13594
13595Set the quotes used for attributes. can be 'C<double>' (default) or 'C<single>'
13596
13597=item cmp       ($elt)
13598
13599  Compare the order of the 2 elements in a twig.
13600
13601  C<$a> is the <A>..</A> element, C<$b> is the <B>...</B> element
13602
13603  document                        $a->cmp( $b)
13604  <A> ... </A> ... <B>  ... </B>     -1
13605  <A> ... <B>  ... </B> ... </A>     -1
13606  <B> ... </B> ... <A>  ... </A>      1
13607  <B> ... <A>  ... </A> ... </B>      1
13608   $a == $b                           0
13609   $a and $b not in the same tree   undef
13610
13611=item before       ($elt)
13612
13613Return 1 if C<$elt> starts before the element, 0 otherwise. If the 2 elements
13614are not in the same twig then return C<undef>.
13615
13616    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
13617
13618=item after       ($elt)
13619
13620Return 1 if $elt starts after the element, 0 otherwise. If the 2 elements
13621are not in the same twig then return C<undef>.
13622
13623    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
13624
13625=item other comparison methods
13626
13627=over 4
13628
13629=item lt
13630
13631=item le
13632
13633=item gt
13634
13635=item ge
13636
13637=back
13638
13639=item path
13640
13641Return the element context in a form similar to XPath's short
13642form: 'C</root/tag1/../tag>'
13643
13644=item xpath
13645
13646Return a unique XPath expression that can be used to find the element
13647again.
13648
13649It looks like C</doc/sect[3]/title>: unique elements do not have an index,
13650the others do.
13651
13652=item flush
13653
13654flushes the twig up to the current element (strictly equivalent to
13655C<< $elt->root->flush >>)
13656
13657=item private methods
13658
13659Low-level methods on the twig:
13660
13661=over 4
13662
13663=item set_parent        ($parent)
13664
13665=item set_first_child   ($first_child)
13666
13667=item set_last_child    ($last_child)
13668
13669=item set_prev_sibling  ($prev_sibling)
13670
13671=item set_next_sibling  ($next_sibling)
13672
13673=item set_twig_current
13674
13675=item del_twig_current
13676
13677=item twig_current
13678
13679=item contains_text
13680
13681=back
13682
13683Those methods should not be used, unless of course you find some creative
13684and interesting, not to mention useful, ways to do it.
13685
13686=back
13687
13688=head2 cond
13689
13690Most of the navigation functions accept a condition as an optional argument
13691The first element (or all elements for C<L<children> > or
13692C<L<ancestors> >) that passes the condition is returned.
13693
13694The condition is a single step of an XPath expression using the XPath subset
13695defined by C<L<get_xpath>>. Additional conditions are:
13696
13697The condition can be
13698
13699=over 4
13700
13701=item #ELT
13702
13703return a "real" element (not a PCDATA, CDATA, comment or pi element)
13704
13705=item #TEXT
13706
13707return a PCDATA or CDATA element
13708
13709=item regular expression
13710
13711return an element whose tag matches the regexp. The regexp has to be created
13712with C<qr//> (hence this is available only on perl 5.005 and above)
13713
13714=item code reference
13715
13716applies the code, passing the current element as argument, if the code returns
13717true then the element is returned, if it returns false then the code is applied
13718to the next candidate.
13719
13720=back
13721
13722=head2 XML::Twig::XPath
13723
13724XML::Twig implements a subset of XPath through the C<L<get_xpath>> method.
13725
13726If you want to use the whole XPath power, then you can use C<XML::Twig::XPath>
13727instead. In this case C<XML::Twig> uses C<XML::XPath> to execute XPath queries.
13728You will of course need C<XML::XPath> installed to be able to use C<XML::Twig::XPath>.
13729
13730See L<XML::XPath> for more information.
13731
13732The methods you can use are:
13733
13734=over 4
13735
13736=item findnodes              ($path)
13737
13738return a list of nodes found by C<$path>.
13739
13740=item findnodes_as_string    ($path)
13741
13742return the nodes found reproduced as XML. The result is not guaranteed
13743to be valid XML though.
13744
13745=item findvalue              ($path)
13746
13747return the concatenation of the text content of the result nodes
13748
13749=back
13750
13751In order for C<XML::XPath> to be used as the XPath engine the following methods
13752are included in C<XML::Twig>:
13753
13754in XML::Twig
13755
13756=over 4
13757
13758=item getRootNode
13759
13760=item getParentNode
13761
13762=item getChildNodes
13763
13764=back
13765
13766in XML::Twig::Elt
13767
13768=over 4
13769
13770=item string_value
13771
13772=item toString
13773
13774=item getName
13775
13776=item getRootNode
13777
13778=item getNextSibling
13779
13780=item getPreviousSibling
13781
13782=item isElementNode
13783
13784=item isTextNode
13785
13786=item isPI
13787
13788=item isPINode
13789
13790=item isProcessingInstructionNode
13791
13792=item isComment
13793
13794=item isCommentNode
13795
13796=item getTarget
13797
13798=item getChildNodes
13799
13800=item getElementById
13801
13802=back
13803
13804=head2 XML::Twig::XPath::Elt
13805
13806The methods you can use are the same as on C<XML::Twig::XPath> elements:
13807
13808=over 4
13809
13810=item findnodes              ($path)
13811
13812return a list of nodes found by C<$path>.
13813
13814=item findnodes_as_string    ($path)
13815
13816return the nodes found reproduced as XML. The result is not guaranteed
13817to be valid XML though.
13818
13819=item findvalue              ($path)
13820
13821return the concatenation of the text content of the result nodes
13822
13823=back
13824
13825
13826=head2 XML::Twig::Entity_list
13827
13828=over 4
13829
13830=item new
13831
13832Create an entity list.
13833
13834=item add         ($ent)
13835
13836Add an entity to an entity list.
13837
13838=item add_new_ent ($name, $val, $sysid, $pubid, $ndata, $param)
13839
13840Create a new entity and add it to the entity list
13841
13842=item delete     ($ent or $tag).
13843
13844Delete an entity (defined by its name or by the Entity object)
13845from the list.
13846
13847=item print      ($optional_filehandle)
13848
13849Print the entity list.
13850
13851=item list
13852
13853Return the list as an array
13854
13855=back
13856
13857
13858=head2 XML::Twig::Entity
13859
13860=over 4
13861
13862=item new        ($name, $val, $sysid, $pubid, $ndata, $param)
13863
13864Same arguments as the Entity handler for XML::Parser.
13865
13866=item print       ($optional_filehandle)
13867
13868Print an entity declaration.
13869
13870=item name
13871
13872Return the name of the entity
13873
13874=item val
13875
13876Return the value of the entity
13877
13878=item sysid
13879
13880Return the system id for the entity (for NDATA entities)
13881
13882=item pubid
13883
13884Return the public id for the entity (for NDATA entities)
13885
13886=item ndata
13887
13888Return true if the entity is an NDATA entity
13889
13890=item param
13891
13892Return true if the entity is a parameter entity
13893
13894
13895=item text
13896
13897Return the entity declaration text.
13898
13899=back
13900
13901=head2 XML::Twig::Notation_list
13902
13903=over 4
13904
13905=item new
13906
13907Create an notation list.
13908
13909=item add         ($notation)
13910
13911Add an notation to an notation list.
13912
13913=item add_new_notation ($name, $base, $sysid, $pubid)
13914
13915Create a new notation and add it to the notation list
13916
13917=item delete     ($notation or $tag).
13918
13919Delete an notation (defined by its name or by the Notation object)
13920from the list.
13921
13922=item print      ($optional_filehandle)
13923
13924Print the notation list.
13925
13926=item list
13927
13928Return the list as an array
13929
13930=back
13931
13932
13933=head2 XML::Twig::Notation
13934
13935=over 4
13936
13937=item new        ($name, $base, $sysid, $pubid)
13938
13939Same argumnotations as the Notation handler for XML::Parser.
13940
13941=item print       ($optional_filehandle)
13942
13943Print an notation declaration.
13944
13945=item name
13946
13947Return the name of the notation
13948
13949=item base
13950
13951Return the base to be used for resolving a relative URI
13952
13953=item sysid
13954
13955Return the system id for the notation
13956
13957=item pubid
13958
13959Return the public id for the notation
13960
13961
13962=item text
13963
13964Return the notation declaration text.
13965
13966=back
13967
13968
13969=head1 EXAMPLES
13970
13971Additional examples (and a complete tutorial) can be found  on the
13972F<XML::Twig PageL<http://www.xmltwig.org/xmltwig/>>
13973
13974To figure out what flush does call the following script with an
13975XML file and an element name as arguments
13976
13977  use XML::Twig;
13978
13979  my ($file, $elt)= @ARGV;
13980  my $t= XML::Twig->new( twig_handlers =>
13981      { $elt => sub {$_[0]->flush; print "\n[flushed here]\n";} });
13982  $t->parsefile( $file, ErrorContext => 2);
13983  $t->flush;
13984  print "\n";
13985
13986
13987=head1 NOTES
13988
13989=head2 Subclassing XML::Twig
13990
13991Useful methods:
13992
13993=over 4
13994
13995=item elt_class
13996
13997In order to subclass C<XML::Twig> you will probably need to subclass also
13998C<L<XML::Twig::Elt>>. Use the C<elt_class> option when you create the
13999C<XML::Twig> object to get the elements created in a different class
14000(which should be a subclass of C<XML::Twig::Elt>.
14001
14002=item add_options
14003
14004If you inherit C<XML::Twig> new method but want to add more options to it
14005you can use this method to prevent XML::Twig to issue warnings for those
14006additional options.
14007
14008=back
14009
14010=head2 DTD Handling
14011
14012There are 3 possibilities here.  They are:
14013
14014=over 4
14015
14016=item No DTD
14017
14018No doctype, no DTD information, no entity information, the world is simple...
14019
14020=item Internal DTD
14021
14022The XML document includes an internal DTD, and maybe entity declarations.
14023
14024If you use the load_DTD option when creating the twig the DTD information and
14025the entity declarations can be accessed.
14026
14027The DTD and the entity declarations will be C<flush>'ed (or C<print>'ed) either
14028as is (if they have not been modified) or as reconstructed (poorly, comments
14029are lost, order is not kept, due to it's content this DTD should not be viewed
14030by anyone) if they have been modified. You can also modify them directly by
14031changing the C<< $twig->{twig_doctype}->{internal} >> field (straight from
14032XML::Parser, see the C<Doctype> handler doc)
14033
14034=item External DTD
14035
14036The XML document includes a reference to an external DTD, and maybe entity
14037declarations.
14038
14039If you use the C<load_DTD> when creating the twig the DTD information and the
14040entity declarations can be accessed. The entity declarations will be
14041C<flush>'ed (or C<print>'ed) either as is (if they have not been modified) or
14042as reconstructed (badly, comments are lost, order is not kept).
14043
14044You can change the doctype through the C<< $twig->set_doctype >> method and
14045print the dtd through the C<< $twig->dtd_text >> or C<< $twig->dtd_print >>
14046 methods.
14047
14048If you need to modify the entity list this is probably the easiest way to do it.
14049
14050=back
14051
14052
14053=head2 Flush
14054
14055Remember that element handlers are called when the element is CLOSED, so
14056if you have handlers for nested elements the inner handlers will be called
14057first. It makes it for example trickier than it would seem to number nested
14058sections (or clauses, or divs), as the titles in the inner sections are handled
14059before the outer sections.
14060
14061
14062=head1 BUGS
14063
14064=over 4
14065
14066=item segfault during parsing
14067
14068This happens when parsing huge documents, or lots of small ones, with a version
14069of Perl before 5.16.
14070
14071This is due to a bug in the way weak references are handled in Perl itself.
14072
14073The fix is either to upgrade to Perl 5.16 or later (C<perlbrew> is a great
14074tool to manage several installations of perl on the same machine).
14075
14076An other, NOT RECOMMENDED, way of fixing the problem, is to switch off weak
14077references by writing C<XML::Twig::_set_weakrefs( 0);> at the top of the code.
14078This is totally unsupported, and may lead to other problems though,
14079
14080=item entity handling
14081
14082Due to XML::Parser behaviour, non-base entities in attribute values disappear if
14083they are not declared in the document:
14084C<att="val&ent;"> will be turned into C<< att => val >>, unless you use the
14085C<keep_encoding> argument to C<< XML::Twig->new >>
14086
14087=item DTD handling
14088
14089The DTD handling methods are quite bugged. No one uses them and
14090it seems very difficult to get them to work in all cases, including with
14091several slightly incompatible versions of XML::Parser and of libexpat.
14092
14093Basically you can read the DTD, output it back properly, and update entities,
14094but not much more.
14095
14096So use XML::Twig with standalone documents, or with documents referring to an
14097external DTD, but don't expect it to properly parse and even output back the
14098DTD.
14099
14100=item memory leak
14101
14102If you use a REALLY old Perl (5.005!) and
14103a lot of twigs you might find that you leak quite a lot of memory
14104(about 2Ks per twig). You can use the C<L<dispose> > method to free
14105that memory after you are done.
14106
14107If you create elements the same thing might happen, use the C<L<delete>>
14108method to get rid of them.
14109
14110Alternatively installing the C<Scalar::Util> (or C<WeakRef>) module on a version
14111of Perl that supports it (>5.6.0) will get rid of the memory leaks automagically.
14112
14113=item ID list
14114
14115The ID list is NOT updated when elements are cut or deleted.
14116
14117=item change_gi
14118
14119This method will not function properly if you do:
14120
14121     $twig->change_gi( $old1, $new);
14122     $twig->change_gi( $old2, $new);
14123     $twig->change_gi( $new, $even_newer);
14124
14125=item sanity check on XML::Parser method calls
14126
14127XML::Twig should really prevent calls to some XML::Parser methods, especially
14128the C<setHandlers> method.
14129
14130=item pretty printing
14131
14132Pretty printing (at least using the 'C<indented>' style) is hard to get right!
14133Only elements that belong to the document will be properly indented. Printing
14134elements that do not belong to the twig makes it impossible for XML::Twig to
14135figure out their depth, and thus their indentation level.
14136
14137Also there is an unavoidable bug when using C<flush> and pretty printing for
14138elements with mixed content that start with an embedded element:
14139
14140  <elt><b>b</b>toto<b>bold</b></elt>
14141
14142  will be output as
14143
14144  <elt>
14145    <b>b</b>toto<b>bold</b></elt>
14146
14147if you flush the twig when you find the C<< <b> >> element
14148
14149
14150=back
14151
14152=head1 Globals
14153
14154These are the things that can mess up calling code, especially if threaded.
14155They might also cause problem under mod_perl.
14156
14157=over 4
14158
14159=item Exported constants
14160
14161Whether you want them or not you get them! These are subroutines to use
14162as constant when creating or testing elements
14163
14164  PCDATA  return '#PCDATA'
14165  CDATA   return '#CDATA'
14166  PI      return '#PI', I had the choice between PROC and PI :--(
14167
14168=item Module scoped values: constants
14169
14170these should cause no trouble:
14171
14172  %base_ent= ( '>' => '&gt;',
14173               '<' => '&lt;',
14174               '&' => '&amp;',
14175               "'" => '&apos;',
14176               '"' => '&quot;',
14177             );
14178  CDATA_START   = "<![CDATA[";
14179  CDATA_END     = "]]>";
14180  PI_START      = "<?";
14181  PI_END        = "?>";
14182  COMMENT_START = "<!--";
14183  COMMENT_END   = "-->";
14184
14185pretty print styles
14186
14187  ( $NSGMLS, $NICE, $INDENTED, $INDENTED_C, $WRAPPED, $RECORD1, $RECORD2)= (1..7);
14188
14189empty tag output style
14190
14191  ( $HTML, $EXPAND)= (1..2);
14192
14193=item Module scoped values: might be changed
14194
14195Most of these deal with pretty printing, so the worst that can
14196happen is probably that XML output does not look right, but is
14197still valid and processed identically by XML processors.
14198
14199C<$empty_tag_style> can mess up HTML bowsers though and changing C<$ID>
14200would most likely create problems.
14201
14202  $pretty=0;           # pretty print style
14203  $quote='"';          # quote for attributes
14204  $INDENT= '  ';       # indent for indented pretty print
14205  $empty_tag_style= 0; # how to display empty tags
14206  $ID                  # attribute used as an id ('id' by default)
14207
14208=item Module scoped values: definitely changed
14209
14210These 2 variables are used to replace tags by an index, thus
14211saving some space when creating a twig. If they really cause
14212you too much trouble, let me know, it is probably possible to
14213create either a switch or at least a version of XML::Twig that
14214does not perform this optimization.
14215
14216  %gi2index;     # tag => index
14217  @index2gi;     # list of tags
14218
14219=back
14220
14221If you need to manipulate all those values, you can use the following methods on the
14222XML::Twig object:
14223
14224=over 4
14225
14226=item global_state
14227
14228Return a hashref with all the global variables used by XML::Twig
14229
14230The hash has the following fields:  C<pretty>, C<quote>, C<indent>,
14231C<empty_tag_style>, C<keep_encoding>, C<expand_external_entities>,
14232C<output_filter>, C<output_text_filter>, C<keep_atts_order>
14233
14234=item set_global_state ($state)
14235
14236Set the global state, C<$state> is a hashref
14237
14238=item save_global_state
14239
14240Save the current global state
14241
14242=item restore_global_state
14243
14244Restore the previously saved (using C<Lsave_global_state>> state
14245
14246=back
14247
14248=head1 TODO
14249
14250=over 4
14251
14252=item SAX handlers
14253
14254Allowing XML::Twig to work on top of any SAX parser
14255
14256=item multiple twigs are not well supported
14257
14258A number of twig features are just global at the moment. These include
14259the ID list and the "tag pool" (if you use C<change_gi> then you change the tag
14260for ALL twigs).
14261
14262A future version will try to support this while trying not to be to
14263hard on performance (at least when a single twig is used!).
14264
14265=back
14266
14267=head1 AUTHOR
14268
14269Michel Rodriguez <mirod@cpan.org>
14270
14271=head1 LICENSE
14272
14273This library is free software; you can redistribute it and/or modify
14274it under the same terms as Perl itself.
14275
14276Bug reports should be sent using:
14277F<RT L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>>
14278
14279Comments can be sent to mirod@cpan.org
14280
14281The XML::Twig page is at L<http://www.xmltwig.org/xmltwig/>
14282It includes the development version of the module, a slightly better version
14283of the documentation, examples, a tutorial and a:
14284F<Processing XML efficiently with Perl and XML::Twig:
14285L<http://www.xmltwig.org/xmltwig/tutorial/index.html>>
14286
14287=head1 SEE ALSO
14288
14289Complete docs, including a tutorial, examples, an easier to use HTML version of
14290the docs, a quick reference card and a FAQ are available at
14291L<http://www.xmltwig.org/xmltwig/>
14292
14293git repository at L<http://github.com/mirod/xmltwig>
14294
14295L<XML::Parser>, L<XML::Parser::Expat>, L<XML::XPath>, L<Encode>,
14296L<Text::Iconv>, L<Scalar::Utils>
14297
14298
14299=head2 Alternative Modules
14300
14301XML::Twig is not the only XML::Processing module available on CPAN (far from
14302it!).
14303
14304The main alternative I would recommend is L<XML::LibXML>.
14305
14306Here is a quick comparison of the 2 modules:
14307
14308XML::LibXML, actually C<libxml2> on which it is based, sticks to the standards,
14309and implements a good number of them in a rather strict way: XML, XPath, DOM,
14310RelaxNG, I must be forgetting a couple (XInclude?). It is fast and rather
14311frugal memory-wise.
14312
14313XML::Twig is older: when I started writing it XML::Parser/expat was the only
14314game in town. It implements XML and that's about it (plus a subset of XPath,
14315and you can use XML::Twig::XPath if you have XML::XPathEngine installed for full
14316support). It is slower and requires more memory for a full tree than
14317XML::LibXML. On the plus side (yes, there is a plus side!) it lets you process
14318a big document in chunks, and thus let you tackle documents that couldn't be
14319loaded in memory by XML::LibXML, and it offers a lot (and I mean a LOT!) of
14320higher-level methods, for everything, from adding structure to "low-level" XML,
14321to shortcuts for XHTML conversions and more. It also DWIMs quite a bit, getting
14322comments and non-significant whitespaces out of the way but preserving them in
14323the output for example. As it does not stick to the DOM, is also usually leads
14324to shorter code than in XML::LibXML.
14325
14326Beyond the pure features of the 2 modules, XML::LibXML seems to be preferred by
14327"XML-purists", while XML::Twig seems to be more used by Perl Hackers who have
14328to deal with XML. As you have noted, XML::Twig also comes with quite a lot of
14329docs, but I am sure if you ask for help about XML::LibXML here or on Perlmonks
14330you will get answers.
14331
14332Note that it is actually quite hard for me to compare the 2 modules: on one hand
14333I know XML::Twig inside-out and I can get it to do pretty much anything I need
14334to (or I improve it ;--), while I have a very basic knowledge of XML::LibXML.
14335So feature-wise, I'd rather use XML::Twig ;--). On the other hand, I am
14336painfully aware of some of the deficiencies, potential bugs and plain ugly code
14337that lurk in XML::Twig, even though you are unlikely to be affected by them
14338(unless for example you need to change the DTD of a document programmatically),
14339while I haven't looked much into XML::LibXML so it still looks shinny and clean
14340to me.
14341
14342That said, if you need to process a document that is too big to fit memory
14343and XML::Twig is too slow for you, my reluctant advice would be to use "bare"
14344XML::Parser.  It won't be as easy to use as XML::Twig: basically with XML::Twig
14345you trade some speed (depending on what you do from a factor 3 to... none)
14346for ease-of-use, but it will be easier IMHO than using SAX (albeit not
14347standard), and at this point a LOT faster (see the last test in
14348L<http://www.xmltwig.org/article/simple_benchmark/>).
14349
14350=cut
14351
14352
14353