1use strict;
2use warnings; # > perl 5.5
3
4# This is created in the caller's space
5# I realize (now!) that it's not clean, but it's been there for 10+ years...
6BEGIN
7{ sub ::PCDATA { '#PCDATA' }  ## no critic (Subroutines::ProhibitNestedSubs);
8  sub ::CDATA  { '#CDATA'  }  ## no critic (Subroutines::ProhibitNestedSubs);
9}
10
11use UNIVERSAL();
12
13## if a sub returns a scalar, it better not bloody disappear in list context
14## no critic (Subroutines::ProhibitExplicitReturnUndef);
15
16my $perl_version;
17my $parser_version;
18
19######################################################################
20package XML::Twig;
21######################################################################
22
23require 5.004;
24
25use utf8; # > perl 5.5
26
27use vars qw($VERSION @ISA %valid_option);
28
29use Carp;
30use File::Spec;
31use File::Basename;
32
33*isa= *UNIVERSAL::isa;
34
35# flag, set to true if the weaken sub is available
36use vars qw( $weakrefs);
37
38# flag set to true if the version of expat seems to be 1.95.2, which has annoying bugs
39# wrt doctype handling. This is global for performance reasons.
40my $expat_1_95_2=0;
41
42# a slight non-xml mod: # is allowed as a first character
43my $REG_TAG_FIRST_LETTER;
44#$REG_TAG_FIRST_LETTER= q{(?:[^\W\d]|[:#_])};  # < perl 5.6 - does not work for leading non-ascii letters
45$REG_TAG_FIRST_LETTER= q{(?:[[:alpha:]:#_])}; # >= perl 5.6
46
47my $REG_TAG_LETTER= q{(?:[\w_.-]*)};
48
49# a simple name (no colon)
50my $REG_NAME_TOKEN= qq{(?:$REG_TAG_FIRST_LETTER$REG_TAG_LETTER*)};
51
52# a tag name, possibly including namespace
53my $REG_NAME= qq{(?:(?:$REG_NAME_TOKEN:)?$REG_NAME_TOKEN)};
54
55# tag name (leading # allowed)
56# first line is for perl 5.005, second line for modern perl, that accept character classes
57my $REG_TAG_NAME=$REG_NAME;
58
59# name or wildcard (* or '') (leading # allowed)
60my $REG_NAME_W = qq{(?:$REG_NAME|[*])};
61
62# class and ids are deliberatly permissive
63my $REG_NTOKEN_FIRST_LETTER;
64#$REG_NTOKEN_FIRST_LETTER= q{(?:[^\W\d]|[:_])};  # < perl 5.6 - does not work for leading non-ascii letters
65$REG_NTOKEN_FIRST_LETTER= q{(?:[[:alpha:]:_])}; # >= perl 5.6
66
67my $REG_NTOKEN_LETTER= q{(?:[\w_:.-]*)};
68
69my $REG_NTOKEN= qq{(?:$REG_NTOKEN_FIRST_LETTER$REG_NTOKEN_LETTER*)};
70my $REG_CLASS = $REG_NTOKEN;
71my $REG_ID    = $REG_NTOKEN;
72
73# allow <tag> #<tag> (private elt) * <tag>.<class> *.<class> <tag>#<id> *#<id>
74my $REG_TAG_PART= qq{(?:$REG_NAME_W(?:[.]$REG_CLASS|[#]$REG_ID)?|[.]$REG_CLASS)};
75
76my $REG_REGEXP     = q{(?:/(?:[^\\/]|\\.)*/[eimsox]*)};               # regexp
77my $REG_MATCH      = q{[!=]~};                                        # match (or not)
78my $REG_STRING     = q{(?:"(?:[^\\"]|\\.)*"|'(?:[^\\']|\\.)*')};      # string (simple or double quoted)
79my $REG_NUMBER     = q{(?:\d+(?:\.\d*)?|\.\d+)};                      # number
80my $REG_VALUE      = qq{(?:$REG_STRING|$REG_NUMBER)};                 # value
81my $REG_OP         = q{==|!=|>|<|>=|<=|eq|ne|lt|gt|le|ge|=};          # op
82my $REG_FUNCTION   = q{(?:string|text)\(\s*\)};
83my $REG_STRING_ARG = qq{(?:string|text)\\(\\s*$REG_NAME_W\\s*\\)};
84my $REG_COMP       = q{(?:>=|<=|!=|<|>|=)};
85
86my $REG_TAG_IN_PREDICATE= $REG_NAME_W . q{(?=\s*(?i:and\b|or\b|\]|$))};
87
88# keys in the context stack, chosen not to interfere with att names, even private (#-prefixed) ones
89my $ST_TAG = '##tag';
90my $ST_ELT = '##elt';
91my $ST_NS  = '##ns' ;
92
93# used in the handler trigger code
94my $REG_NAKED_PREDICATE= qq{((?:"[^"]*"|'[^']*'|$REG_STRING_ARG|$REG_FUNCTION|\@$REG_NAME_W|$REG_MATCH\\s*$REG_REGEXP|[\\s\\d><=!()+.-]|(?i:and)|(?i:or)|$REG_TAG_IN_PREDICATE)*)};
95my $REG_PREDICATE= qq{\\[$REG_NAKED_PREDICATE\\]};
96
97# not all axis, only supported ones (in get_xpath)
98my @supported_axis= ( 'ancestor', 'ancestor-or-self', 'child', 'descendant', 'descendant-or-self',
99                      'following', 'following-sibling', 'parent', 'preceding', 'preceding-sibling', 'self'
100                    );
101my $REG_AXIS       = "(?:" . join( '|', @supported_axis) .")";
102
103# only used in the "xpath"engine (for get_xpath/findnodes) for now
104my $REG_PREDICATE_ALT  = qr{\[(?:(?:string\(\s*\)|\@$REG_TAG_NAME)\s*$REG_MATCH\s*$REG_REGEXP\s*|[^\]]*)\]};
105
106# used to convert XPath tests on strings to the perl equivalent
107my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
108
109my( $FB_HTMLCREF, $FB_XMLCREF);
110
111my $NO_WARNINGS= $perl_version >= 5.006 ? 'no warnings' : 'local $^W=0';
112
113# default namespaces, both ways
114my %DEFAULT_NS= ( xml   => "http://www.w3.org/XML/1998/namespace",
115                  xmlns => "http://www.w3.org/2000/xmlns/",
116                );
117my %DEFAULT_URI2NS= map { $DEFAULT_NS{$_} => $_ } keys %DEFAULT_NS;
118
119# constants
120my( $PCDATA, $CDATA, $PI, $COMMENT, $ENT, $ELT, $TEXT, $ASIS, $EMPTY, $BUFSIZE);
121
122# used when an HTML doc only has a PUBLIC declaration, to generate the SYSTEM one
123# this should really be done by HTML::TreeBuilder, but as of HTML::TreeBuilder 4.2 it isn't
124# the various declarations are taken from http://en.wikipedia.org/wiki/Document_Type_Declaration
125my %HTML_DECL= ( "-//W3C//DTD HTML 4.0 Transitional//EN"  => "http://www.w3.org/TR/REC-html40/loose.dtd",
126                 "-//W3C//DTD HTML 4.01//EN"              => "http://www.w3.org/TR/html4/strict.dtd",
127                 "-//W3C//DTD HTML 4.01 Transitional//EN" => "http://www.w3.org/TR/html4/loose.dtd",
128                 "-//W3C//DTD HTML 4.01 Frameset//EN"     => "http://www.w3.org/TR/html4/frameset.dtd",
129                 "-//W3C//DTD XHTML 1.0 Strict//EN"       => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
130                 "-//W3C//DTD XHTML 1.0 Transitional//EN" => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",
131                 "-//W3C//DTD XHTML 1.0 Frameset//EN"     => "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",
132                 "-//W3C//DTD XHTML 1.1//EN"              => "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",
133                 "-//W3C//DTD XHTML Basic 1.0//EN"        => "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd",
134                 "-//W3C//DTD XHTML Basic 1.1//EN"        => "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd",
135                 "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"  => "http://www.wapforum.org/DTD/xhtml-mobile10.dtd",
136                 "-//WAPFORUM//DTD XHTML Mobile 1.1//EN"  => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile11.dtd",
137                 "-//WAPFORUM//DTD XHTML Mobile 1.2//EN"  => "http://www.openmobilealliance.org/tech/DTD/xhtml-mobile12.dtd",
138                 "-//W3C//DTD XHTML+RDFa 1.0//EN"         => "http://www.w3.org/MarkUp/DTD/xhtml-rdfa-1.dtd",
139               );
140
141my $DEFAULT_HTML_TYPE= "-//W3C//DTD HTML 4.0 Transitional//EN";
142
143my $SEP= qr/\s*(?:$|\|)/;
144
145BEGIN
146{
147$VERSION = '3.48';
148
149use XML::Parser;
150my $needVersion = '2.23';
151($parser_version= $XML::Parser::VERSION)=~ s{_\d+}{}; # remove _<n> from version so numeric tests do not warn
152croak "need at least XML::Parser version $needVersion" unless $parser_version >= $needVersion;
153
154($perl_version= $])=~ s{_\d+}{};
155
156if( $perl_version >= 5.008)
157  { eval "use Encode qw( :all)"; ## no critic ProhibitStringyEval
158    $FB_XMLCREF  = 0x0400; # Encode::FB_XMLCREF;
159    $FB_HTMLCREF = 0x0200; # Encode::FB_HTMLCREF;
160  }
161
162# test whether we can use weak references
163# set local empty signal handler to trap error messages
164{ local $SIG{__DIE__};
165  if( eval( 'require Scalar::Util') && defined( \&Scalar::Util::weaken))
166    { import Scalar::Util( 'weaken'); $weakrefs= 1; }
167  elsif( eval( 'require WeakRef'))
168    { import WeakRef; $weakrefs= 1;                 }
169  else
170    { $weakrefs= 0;                                 }
171}
172
173import XML::Twig::Elt;
174import XML::Twig::Entity;
175import XML::Twig::Entity_list;
176
177# used to store the gi's
178# should be set for each twig really, at least when there are several
179# the init ensures that special gi's are always the same
180
181# constants: element types
182$PCDATA  = '#PCDATA';
183$CDATA   = '#CDATA';
184$PI      = '#PI';
185$COMMENT = '#COMMENT';
186$ENT     = '#ENT';
187
188# element classes
189$ELT     = '#ELT';
190$TEXT    = '#TEXT';
191
192# element properties
193$ASIS    = '#ASIS';
194$EMPTY   = '#EMPTY';
195
196# used in parseurl to set the buffer size to the same size as in XML::Parser::Expat
197$BUFSIZE = 32768;
198
199
200# gi => index
201%XML::Twig::gi2index=( '', 0, $PCDATA => 1, $CDATA => 2, $PI => 3, $COMMENT => 4, $ENT => 5);
202# list of gi's
203@XML::Twig::index2gi=( '', $PCDATA, $CDATA, $PI, $COMMENT, $ENT);
204
205# gi's under this value are special
206$XML::Twig::SPECIAL_GI= @XML::Twig::index2gi;
207
208%XML::Twig::base_ent= ( '>' => '&gt;', '<' => '&lt;', '&' => '&amp;', "'" => '&apos;', '"' => '&quot;',);
209foreach my $c ( "\n", "\r", "\t") { $XML::Twig::base_ent{$c}= sprintf( "&#x%02x;", ord( $c)); }
210
211# now set some aliases
212*find_nodes           = *get_xpath;               # same as XML::XPath
213*findnodes            = *get_xpath;               # same as XML::LibXML
214*getElementsByTagName = *descendants;
215*descendants_or_self  = *descendants;             # valid in XML::Twig, not in XML::Twig::Elt
216*find_by_tag_name     = *descendants;
217*getElementById       = *elt_id;
218*getEltById           = *elt_id;
219*toString             = *sprint;
220*create_accessors     = *att_accessors;
221
222}
223
224@ISA = qw(XML::Parser);
225
226# fake gi's used in twig_handlers and start_tag_handlers
227my $ALL    = '_all_';     # the associated function is always called
228my $DEFAULT= '_default_'; # the function is called if no other handler has been
229
230# some defaults
231my $COMMENTS_DEFAULT= 'keep';
232my $PI_DEFAULT      = 'keep';
233
234
235# handlers used in regular mode
236my %twig_handlers=( Start      => \&_twig_start,
237                    End        => \&_twig_end,
238                    Char       => \&_twig_char,
239                    Entity     => \&_twig_entity,
240                    XMLDecl    => \&_twig_xmldecl,
241                    Doctype    => \&_twig_doctype,
242                    Element    => \&_twig_element,
243                    Attlist    => \&_twig_attlist,
244                    CdataStart => \&_twig_cdatastart,
245                    CdataEnd   => \&_twig_cdataend,
246                    Proc       => \&_twig_pi,
247                    Comment    => \&_twig_comment,
248                    Default    => \&_twig_default,
249                    ExternEnt  => \&_twig_extern_ent,
250      );
251
252# handlers used when twig_roots is used and we are outside of the roots
253my %twig_handlers_roots=
254  ( Start      => \&_twig_start_check_roots,
255    End        => \&_twig_end_check_roots,
256    Doctype    => \&_twig_doctype,
257    Char       => undef, Entity     => undef, XMLDecl    => \&_twig_xmldecl,
258    Element    => undef, Attlist    => undef, CdataStart => undef,
259    CdataEnd   => undef, Proc       => undef, Comment    => undef,
260    Proc       => \&_twig_pi_check_roots,
261    Default    =>  sub {}, # hack needed for XML::Parser 2.27
262    ExternEnt  => \&_twig_extern_ent,
263  );
264
265# handlers used when twig_roots and print_outside_roots are used and we are
266# outside of the roots
267my %twig_handlers_roots_print_2_30=
268  ( Start      => \&_twig_start_check_roots,
269    End        => \&_twig_end_check_roots,
270    Char       => \&_twig_print,
271    Entity     => \&_twig_print_entity,
272    ExternEnt  => \&_twig_print_entity,
273    DoctypeFin => \&_twig_doctype_fin_print,
274    XMLDecl    => sub { _twig_xmldecl( @_); _twig_print( @_); },
275    Doctype   =>  \&_twig_print_doctype, # because recognized_string is broken here
276    # Element    => \&_twig_print, Attlist    => \&_twig_print,
277    CdataStart => \&_twig_print, CdataEnd   => \&_twig_print,
278    Proc       => \&_twig_pi_check_roots, Comment    => \&_twig_print,
279    Default    => \&_twig_print_check_doctype,
280    ExternEnt  => \&_twig_extern_ent,
281  );
282
283# handlers used when twig_roots, print_outside_roots and keep_encoding are used
284# and we are outside of the roots
285my %twig_handlers_roots_print_original_2_30=
286  ( Start      => \&_twig_start_check_roots,
287    End        => \&_twig_end_check_roots,
288    Char       => \&_twig_print_original,
289    # I have no idea why I should not be using this handler!
290    Entity     => \&_twig_print_entity,
291    ExternEnt  => \&_twig_print_entity,
292    DoctypeFin => \&_twig_doctype_fin_print,
293    XMLDecl    => sub { _twig_xmldecl( @_); _twig_print_original( @_) },
294    Doctype    => \&_twig_print_original_doctype,  # because original_string is broken here
295    Element    => \&_twig_print_original, Attlist   => \&_twig_print_original,
296    CdataStart => \&_twig_print_original, CdataEnd  => \&_twig_print_original,
297    Proc       => \&_twig_pi_check_roots, Comment   => \&_twig_print_original,
298    Default    => \&_twig_print_original_check_doctype,
299  );
300
301# handlers used when twig_roots and print_outside_roots are used and we are
302# outside of the roots
303my %twig_handlers_roots_print_2_27=
304  ( Start      => \&_twig_start_check_roots,
305    End        => \&_twig_end_check_roots,
306    Char       => \&_twig_print,
307    # if the Entity handler is set then it prints the entity declaration
308    # before the entire internal subset (including the declaration!) is output
309    Entity     => sub {},
310    XMLDecl    => \&_twig_print, Doctype    => \&_twig_print,
311    CdataStart => \&_twig_print, CdataEnd   => \&_twig_print,
312    Proc       => \&_twig_pi_check_roots, Comment    => \&_twig_print,
313    Default    => \&_twig_print,
314    ExternEnt  => \&_twig_extern_ent,
315  );
316
317# handlers used when twig_roots, print_outside_roots and keep_encoding are used
318# and we are outside of the roots
319my %twig_handlers_roots_print_original_2_27=
320  ( Start      => \&_twig_start_check_roots,
321    End        => \&_twig_end_check_roots,
322    Char       => \&_twig_print_original,
323    # for some reason original_string is wrong here
324    # this can be a problem if the doctype includes non ascii characters
325    XMLDecl    => \&_twig_print, Doctype    => \&_twig_print,
326    # if the Entity handler is set then it prints the entity declaration
327    # before the entire internal subset (including the declaration!) is output
328    Entity     => sub {},
329    #Element    => undef, Attlist   => undef,
330    CdataStart => \&_twig_print_original, CdataEnd  => \&_twig_print_original,
331    Proc       => \&_twig_pi_check_roots, Comment   => \&_twig_print_original,
332    Default    => \&_twig_print, #  _twig_print_original does not work
333    ExternEnt  => \&_twig_extern_ent,
334  );
335
336
337my %twig_handlers_roots_print= $parser_version > 2.27
338                               ? %twig_handlers_roots_print_2_30
339                               : %twig_handlers_roots_print_2_27;
340my %twig_handlers_roots_print_original= $parser_version > 2.27
341                               ? %twig_handlers_roots_print_original_2_30
342                               : %twig_handlers_roots_print_original_2_27;
343
344
345# handlers used when the finish_print method has been called
346my %twig_handlers_finish_print=
347  ( Start      => \&_twig_print,
348    End        => \&_twig_print, Char       => \&_twig_print,
349    Entity     => \&_twig_print, XMLDecl    => \&_twig_print,
350    Doctype    => \&_twig_print, Element    => \&_twig_print,
351    Attlist    => \&_twig_print, CdataStart => \&_twig_print,
352    CdataEnd   => \&_twig_print, Proc       => \&_twig_print,
353    Comment    => \&_twig_print, Default    => \&_twig_print,
354    ExternEnt  => \&_twig_extern_ent,
355  );
356
357# handlers used when the finish_print method has been called and the keep_encoding
358# option is used
359my %twig_handlers_finish_print_original=
360  ( Start      => \&_twig_print_original, End      => \&_twig_print_end_original,
361    Char       => \&_twig_print_original, Entity   => \&_twig_print_original,
362    XMLDecl    => \&_twig_print_original, Doctype  => \&_twig_print_original,
363    Element    => \&_twig_print_original, Attlist  => \&_twig_print_original,
364    CdataStart => \&_twig_print_original, CdataEnd => \&_twig_print_original,
365    Proc       => \&_twig_print_original, Comment  => \&_twig_print_original,
366    Default    => \&_twig_print_original,
367  );
368
369# handlers used within ignored elements
370my %twig_handlers_ignore=
371  ( Start      => \&_twig_ignore_start,
372    End        => \&_twig_ignore_end,
373    Char       => undef, Entity     => undef, XMLDecl    => undef,
374    Doctype    => undef, Element    => undef, Attlist    => undef,
375    CdataStart => undef, CdataEnd   => undef, Proc       => undef,
376    Comment    => undef, Default    => undef,
377    ExternEnt  => undef,
378  );
379
380
381# those handlers are only used if the entities are NOT to be expanded
382my %twig_noexpand_handlers= ( ExternEnt => undef, Default => \&_twig_default );
383
384my @saved_default_handler;
385
386my $ID= 'id';  # default value, set by the Id argument
387my $css_sel=0; # set through the css_sel option to allow .class selectors in triggers
388
389# all allowed options
390%valid_option=
391    ( # XML::Twig options
392      TwigHandlers          => 1, Id                    => 1,
393      TwigRoots             => 1, TwigPrintOutsideRoots => 1,
394      StartTagHandlers      => 1, EndTagHandlers        => 1,
395      ForceEndTagHandlersUsage => 1,
396      DoNotChainHandlers    => 1,
397      IgnoreElts            => 1,
398      Index                 => 1,
399      AttAccessors          => 1,
400      EltAccessors          => 1,
401      FieldAccessors        => 1,
402      CharHandler           => 1,
403      TopDownHandlers       => 1,
404      KeepEncoding          => 1, DoNotEscapeAmpInAtts  => 1,
405      ParseStartTag         => 1, KeepAttsOrder         => 1,
406      LoadDTD               => 1, DTDHandler            => 1,
407      DoNotOutputDTD        => 1, NoProlog              => 1,
408      ExpandExternalEnts    => 1,
409      DiscardSpaces         => 1, KeepSpaces            => 1, DiscardAllSpaces => 1,
410      DiscardSpacesIn       => 1, KeepSpacesIn          => 1,
411      PrettyPrint           => 1, EmptyTags             => 1,
412      EscapeGt              => 1,
413      Quote                 => 1,
414      Comments              => 1, Pi                    => 1,
415      OutputFilter          => 1, InputFilter           => 1,
416      OutputTextFilter      => 1,
417      OutputEncoding        => 1,
418      RemoveCdata           => 1,
419      EltClass              => 1,
420      MapXmlns              => 1, KeepOriginalPrefix    => 1,
421      SkipMissingEnts       => 1,
422      # XML::Parser options
423      ErrorContext          => 1, ProtocolEncoding      => 1,
424      Namespaces            => 1, NoExpand              => 1,
425      Stream_Delimiter      => 1, ParseParamEnt         => 1,
426      NoLWP                 => 1, Non_Expat_Options     => 1,
427      Xmlns                 => 1, CssSel                => 1,
428      UseTidy               => 1, TidyOptions           => 1,
429      OutputHtmlDoctype     => 1,
430    );
431
432my $active_twig; # last active twig,for XML::Twig::s
433
434# predefined input and output filters
435use vars qw( %filter);
436%filter= ( html       => \&html_encode,
437           safe       => \&safe_encode,
438           safe_hex   => \&safe_encode_hex,
439         );
440
441
442# trigger types (used to sort them)
443my ($LEVEL_TRIGGER, $REGEXP_TRIGGER, $XPATH_TRIGGER)=(1..3);
444
445sub new
446  { my ($class, %args) = @_;
447    my $handlers;
448
449    # change all nice_perlish_names into nicePerlishNames
450    %args= _normalize_args( %args);
451
452    # check options
453    unless( $args{MoreOptions})
454      { foreach my $arg (keys %args)
455        { carp "invalid option $arg" unless $valid_option{$arg}; }
456      }
457
458    # a twig is really an XML::Parser
459    # my $self= XML::Parser->new(%args);
460    my $self;
461    $self= XML::Parser->new(%args);
462
463    bless $self, $class;
464
465    $self->{_twig_context_stack}= [];
466
467    # allow tag.class selectors in handler triggers
468    $css_sel= $args{CssSel} || 0;
469
470
471    if( exists $args{TwigHandlers})
472      { $handlers= $args{TwigHandlers};
473        $self->setTwigHandlers( $handlers);
474        delete $args{TwigHandlers};
475      }
476
477    # take care of twig-specific arguments
478    if( exists $args{StartTagHandlers})
479      { $self->setStartTagHandlers( $args{StartTagHandlers});
480        delete $args{StartTagHandlers};
481      }
482
483    if( exists $args{DoNotChainHandlers})
484      { $self->{twig_do_not_chain_handlers}=  $args{DoNotChainHandlers}; }
485
486    if( exists $args{IgnoreElts})
487      { # change array to hash so you can write ignore_elts => [ qw(foo bar baz)]
488        if( isa( $args{IgnoreElts}, 'ARRAY')) { $args{IgnoreElts}= { map { $_ => 1 } @{$args{IgnoreElts}} }; }
489        $self->setIgnoreEltsHandlers( $args{IgnoreElts});
490        delete $args{IgnoreElts};
491      }
492
493    if( exists $args{Index})
494      { my $index= $args{Index};
495        # we really want a hash name => path, we turn an array into a hash if necessary
496        if( ref( $index) eq 'ARRAY')
497          { my %index= map { $_ => $_ } @$index;
498            $index= \%index;
499          }
500        while( my( $name, $exp)= each %$index)
501          { $self->setTwigHandler( $exp, sub { push @{$_[0]->{_twig_index}->{$name}}, $_; 1; }); }
502      }
503
504    $self->{twig_elt_class}= $args{EltClass} || 'XML::Twig::Elt';
505    if( defined( $args{EltClass}) && $args{EltClass} ne 'XML::Twig::Elt') { $self->{twig_alt_elt_class}=1; }
506    if( exists( $args{EltClass})) { delete $args{EltClass}; }
507
508    if( exists( $args{MapXmlns}))
509      { $self->{twig_map_xmlns}=  $args{MapXmlns};
510        $self->{Namespaces}=1;
511        delete $args{MapXmlns};
512      }
513
514    if( exists( $args{KeepOriginalPrefix}))
515      { $self->{twig_keep_original_prefix}= $args{KeepOriginalPrefix};
516        delete $args{KeepOriginalPrefix};
517      }
518
519    $self->{twig_dtd_handler}= $args{DTDHandler};
520    delete $args{DTDHandler};
521
522    if( $args{ExpandExternalEnts})
523      { $self->set_expand_external_entities( 1);
524        $self->{twig_expand_external_ents}= $args{ExpandExternalEnts};
525        $self->{twig_read_external_dtd}= 1; # implied by ExpandExternalEnts
526        if( $args{ExpandExternalEnts} == -1)
527          { $self->{twig_extern_ent_nofail}= 1;
528            $self->setHandlers( ExternEnt => \&_twig_extern_ent_nofail);
529          }
530        delete $args{LoadDTD};
531        delete $args{ExpandExternalEnts};
532      }
533    else
534      { $self->set_expand_external_entities( 0); }
535
536    if( !$args{NoLWP} && ! _use( 'URI') && ! _use( 'URI::File') && ! _use( 'LWP'))
537      { $self->{twig_ext_ent_handler}= \&XML::Parser::initial_ext_ent_handler }
538    else
539      { $self->{twig_ext_ent_handler}= \&XML::Parser::file_ext_ent_handler }
540
541    if( $args{DoNotEscapeAmpInAtts})
542      { $self->set_do_not_escape_amp_in_atts( 1);
543        $self->{twig_do_not_escape_amp_in_atts}=1;
544      }
545    else
546      { $self->set_do_not_escape_amp_in_atts( 0);
547        $self->{twig_do_not_escape_amp_in_atts}=0;
548      }
549
550    # deal with TwigRoots argument, a hash of elements for which
551    # subtrees will be built (and associated handlers)
552
553    if( $args{TwigRoots})
554      { $self->setTwigRoots( $args{TwigRoots});
555        delete $args{TwigRoots};
556      }
557
558    if( $args{EndTagHandlers})
559      { unless ($self->{twig_roots} || $args{ForceEndTagHandlersUsage})
560          { croak "you should not use EndTagHandlers without TwigRoots\n",
561                  "if you want to use it anyway, normally because you have ",
562                  "a start_tag_handlers that calls 'ignore' and you want to ",
563                  "call an ent_tag_handlers at the end of the element, then ",
564                  "pass 'force_end_tag_handlers_usage => 1' as an argument ",
565                  "to new";
566          }
567
568        $self->setEndTagHandlers( $args{EndTagHandlers});
569        delete $args{EndTagHandlers};
570      }
571
572    if( $args{TwigPrintOutsideRoots})
573      { croak "cannot use twig_print_outside_roots without twig_roots"
574          unless( $self->{twig_roots});
575        # if the arg is a filehandle then store it
576        if( _is_fh( $args{TwigPrintOutsideRoots}) )
577          { $self->{twig_output_fh}= $args{TwigPrintOutsideRoots}; }
578        $self->{twig_default_print}= $args{TwigPrintOutsideRoots};
579      }
580
581    # space policy
582    if( $args{KeepSpaces})
583      { croak "cannot use both keep_spaces and discard_spaces"        if( $args{DiscardSpaces});
584        croak "cannot use both keep_spaces and discard_all_spaces"    if( $args{DiscardAllSpaces});
585        croak "cannot use both keep_spaces and keep_spaces_in"        if( $args{KeepSpacesIn});
586        $self->{twig_keep_spaces}=1;
587        delete $args{KeepSpaces};
588      }
589    if( $args{DiscardSpaces})
590      {
591        croak "cannot use both discard_spaces and keep_spaces_in"     if( $args{KeepSpacesIn});
592        croak "cannot use both discard_spaces and discard_all_spaces" if( $args{DiscardAllSpaces});
593        croak "cannot use both discard_spaces and discard_spaces_in"  if( $args{DiscardSpacesIn});
594        $self->{twig_discard_spaces}=1;
595        delete $args{DiscardSpaces};
596      }
597    if( $args{KeepSpacesIn})
598      { croak "cannot use both keep_spaces_in and discard_spaces_in"  if( $args{DiscardSpacesIn});
599        croak "cannot use both keep_spaces_in and discard_all_spaces" if( $args{DiscardAllSpaces});
600        $self->{twig_discard_spaces}=1;
601        $self->{twig_keep_spaces_in}={};
602        my @tags= @{$args{KeepSpacesIn}};
603        foreach my $tag (@tags) { $self->{twig_keep_spaces_in}->{$tag}=1; }
604        delete $args{KeepSpacesIn};
605      }
606
607    if( $args{DiscardAllSpaces})
608      {
609        croak "cannot use both discard_all_spaces and discard_spaces_in" if( $args{DiscardSpacesIn});
610        $self->{twig_discard_all_spaces}=1;
611        delete $args{DiscardAllSpaces};
612      }
613
614    if( $args{DiscardSpacesIn})
615      { $self->{twig_keep_spaces}=1;
616        $self->{twig_discard_spaces_in}={};
617        my @tags= @{$args{DiscardSpacesIn}};
618        foreach my $tag (@tags) { $self->{twig_discard_spaces_in}->{$tag}=1; }
619        delete $args{DiscardSpacesIn};
620      }
621    # discard spaces by default
622    $self->{twig_discard_spaces}= 1 unless(  $self->{twig_keep_spaces});
623
624    $args{Comments}||= $COMMENTS_DEFAULT;
625    if( $args{Comments} eq 'drop')       { $self->{twig_keep_comments}= 0;    }
626    elsif( $args{Comments} eq 'keep')    { $self->{twig_keep_comments}= 1;    }
627    elsif( $args{Comments} eq 'process') { $self->{twig_process_comments}= 1; }
628    else { croak "wrong value for comments argument: '$args{Comments}' (should be 'drop', 'keep' or 'process')"; }
629    delete $args{Comments};
630
631    $args{Pi}||= $PI_DEFAULT;
632    if( $args{Pi} eq 'drop')       { $self->{twig_keep_pi}= 0;    }
633    elsif( $args{Pi} eq 'keep')    { $self->{twig_keep_pi}= 1;    }
634    elsif( $args{Pi} eq 'process') { $self->{twig_process_pi}= 1; }
635    else { croak "wrong value for pi argument: '$args{Pi}' (should be 'drop', 'keep' or 'process')"; }
636    delete $args{Pi};
637
638    if( $args{KeepEncoding})
639      {
640        # set it in XML::Twig::Elt so print functions know what to do
641        $self->set_keep_encoding( 1);
642        $self->{parse_start_tag}= $args{ParseStartTag} || \&_parse_start_tag;
643        delete $args{ParseStartTag} if defined( $args{ParseStartTag}) ;
644        delete $args{KeepEncoding};
645      }
646    else
647      { $self->set_keep_encoding( 0);
648        if( $args{ParseStartTag})
649          { $self->{parse_start_tag}= $args{ParseStartTag}; }
650        else
651          { delete $self->{parse_start_tag}; }
652        delete $args{ParseStartTag};
653      }
654
655    if( $args{OutputFilter})
656      { $self->set_output_filter( $args{OutputFilter});
657        delete $args{OutputFilter};
658      }
659    else
660      { $self->set_output_filter( 0); }
661
662    if( $args{RemoveCdata})
663      { $self->set_remove_cdata( $args{RemoveCdata});
664        delete $args{RemoveCdata};
665      }
666    else
667      { $self->set_remove_cdata( 0); }
668
669    if( $args{OutputTextFilter})
670      { $self->set_output_text_filter( $args{OutputTextFilter});
671        delete $args{OutputTextFilter};
672      }
673    else
674      { $self->set_output_text_filter( 0); }
675
676    if( exists $args{KeepAttsOrder})
677      { $self->{keep_atts_order}= $args{KeepAttsOrder};
678        if( _use( 'Tie::IxHash'))
679          { $self->set_keep_atts_order(  $self->{keep_atts_order}); }
680        else
681          { croak "Tie::IxHash not available, option keep_atts_order not allowed"; }
682      }
683    else
684      { $self->set_keep_atts_order( 0); }
685
686
687    if( $args{PrettyPrint})    { $self->set_pretty_print( $args{PrettyPrint}); }
688    if( $args{EscapeGt})       { $self->escape_gt( $args{EscapeGt});           }
689    if( $args{EmptyTags})      { $self->set_empty_tag_style( $args{EmptyTags}) }
690
691    if( exists $args{Id})      { $ID= $args{Id};                     delete $args{ID};             }
692    if( $args{NoProlog})       { $self->{no_prolog}= 1;              delete $args{NoProlog};       }
693    if( $args{DoNotOutputDTD}) { $self->{no_dtd_output}= 1;          delete $args{DoNotOutputDTD}; }
694    if( $args{LoadDTD})        { $self->{twig_read_external_dtd}= 1; delete $args{LoadDTD};        }
695    if( $args{CharHandler})    { $self->setCharHandler( $args{CharHandler}); delete $args{CharHandler}; }
696
697    if( $args{InputFilter})    { $self->set_input_filter(  $args{InputFilter}); delete  $args{InputFilter}; }
698    if( $args{NoExpand})       { $self->setHandlers( %twig_noexpand_handlers); $self->{twig_no_expand}=1; }
699    if( my $output_encoding= $args{OutputEncoding}) { $self->set_output_encoding( $output_encoding); delete $args{OutputFilter}; }
700
701    if( my $tdh= $args{TopDownHandlers}) { $self->{twig_tdh}=1; delete $args{TopDownHandlers}; }
702
703    if( my $acc_a= $args{AttAccessors})   { $self->att_accessors( @$acc_a);  }
704    if( my $acc_e= $args{EltAccessors})   { $self->elt_accessors( isa( $acc_e, 'ARRAY') ? @$acc_e : $acc_e);   }
705    if( my $acc_f= $args{FieldAccessors}) { $self->field_accessors( isa( $acc_f, 'ARRAY') ? @$acc_f : $acc_f); }
706
707    if( $args{UseTidy}) { $self->{use_tidy}= 1; }
708    $self->{tidy_options}= $args{TidyOptions} || {};
709
710    if( $args{OutputHtmlDoctype}) { $self->{html_doctype}= 1; }
711
712    $self->set_quote( $args{Quote} || 'double');
713
714    # set handlers
715    if( $self->{twig_roots})
716      { if( $self->{twig_default_print})
717          { if( $self->{twig_keep_encoding})
718              { $self->setHandlers( %twig_handlers_roots_print_original); }
719            else
720              { $self->setHandlers( %twig_handlers_roots_print);  }
721          }
722        else
723          { $self->setHandlers( %twig_handlers_roots); }
724      }
725    else
726      { $self->setHandlers( %twig_handlers); }
727
728    # XML::Parser::Expat does not like these handler to be set. So in order to
729    # use the various sets of handlers on XML::Parser or XML::Parser::Expat
730    # objects when needed, these ones have to be set only once, here, at
731    # XML::Parser level
732    $self->setHandlers( Init => \&_twig_init, Final => \&_twig_final);
733
734    $self->{twig_entity_list}= XML::Twig::Entity_list->new;
735
736    $self->{twig_id}= $ID;
737    $self->{twig_stored_spaces}='';
738
739    $self->{twig_autoflush}= 1; # auto flush by default
740
741    $self->{twig}= $self;
742    if( $weakrefs) { weaken( $self->{twig}); }
743
744    return $self;
745  }
746
747sub parse
748  {
749    my $t= shift;
750    # if called as a class method, calls nparse, which creates the twig then parses it
751    if( !ref( $t) || !isa( $t, 'XML::Twig')) { return $t->nparse( @_); }
752
753    # requires 5.006 at least (or the ${^UNICODE} causes a problem)                                       # > perl 5.5
754    # trap underlying bug in IO::Handle (see RT #17500)                                                   # > perl 5.5
755    # croak if perl 5.8+, -CD (or PERL_UNICODE set to D) and parsing a pipe                               # > perl 5.5
756    if( $perl_version>=5.008 && ${^UNICODE} && (${^UNICODE} & 24) && isa( $_[0], 'GLOB') && -p $_[0] )               # > perl 5.5
757      { croak   "cannot parse the output of a pipe when perl is set to use the UTF8 perlIO layer\n"       # > perl 5.5
758              . "set the environment variable PERL_UNICODE or use the -C option (see perldoc perlrun)\n"  # > perl 5.5
759              . "not to include 'D'";                                                                     # > perl 5.5
760      }                                                                                                   # > perl 5.5
761    $t= eval { $t->SUPER::parse( @_); };
762
763    if(    !$t
764        && $@=~m{(syntax error at line 1, column 0, byte 0|not well-formed \(invalid token\) at line 1, column 1, byte 1)}
765        && -f $_[0]
766      )
767      { croak "you seem to have used the parse method on a filename ($_[0]), you probably want parsefile instead"; }
768    return _checked_parse_result( $t, $@);
769  }
770
771sub parsefile
772  { my $t= shift;
773    if( -f $_[0] && ! -s $_[0]) { return _checked_parse_result( undef, "empty file '$_[0]'"); }
774    $t= eval { $t->SUPER::parsefile( @_); };
775    return _checked_parse_result( $t, $@);
776  }
777
778sub _checked_parse_result
779  { my( $t, $returned)= @_;
780    if( !$t)
781      { if( isa( $returned, 'XML::Twig') && $returned->{twig_finish_now})
782          { $t= $returned;
783            delete $t->{twig_finish_now};
784            return $t->_twig_final;
785          }
786        else
787          { _croak( $returned, 0); }
788      }
789
790    $active_twig= $t;
791    return $t;
792  }
793
794sub active_twig { return $active_twig; }
795
796sub finish_now
797  { my $t= shift;
798    $t->{twig_finish_now}=1;
799    die $t;
800  }
801
802
803sub parsefile_inplace      { shift->_parse_inplace( parsefile      => @_); }
804sub parsefile_html_inplace { shift->_parse_inplace( parsefile_html => @_); }
805
806sub _parse_inplace
807  { my( $t, $method, $file, $suffix)= @_;
808    _use( 'File::Temp') || croak "need File::Temp to use inplace methods\n";
809    _use( 'File::Basename');
810
811
812    my $tmpdir= dirname( $file);
813    my( $tmpfh, $tmpfile)= File::Temp::tempfile( DIR => $tmpdir);
814    my $original_fh= select $tmpfh;
815
816    unless( $t->{twig_keep_encoding} || $perl_version < 5.006)
817      { if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio
818          { binmode( $tmpfh, ":utf8" ); }
819      }
820
821    $t->$method( $file);
822
823    select $original_fh;
824    close $tmpfh;
825    my $mode= (stat( $file))[2] & oct(7777);
826    chmod $mode, $tmpfile or croak "cannot change temp file mode to $mode: $!";
827
828    if( $suffix)
829      { my $backup;
830        if( $suffix=~ m{\*}) { ($backup = $suffix) =~ s/\*/$file/g; }
831        else                 { $backup= $file . $suffix; }
832
833        rename( $file, $backup) or croak "cannot backup initial file ($file) to $backup: $!";
834      }
835    rename( $tmpfile, $file) or croak "cannot rename temp file ($tmpfile) to initial file ($file): $!";
836
837    return $t;
838  }
839
840
841sub parseurl
842  { my $t= shift;
843    $t->_parseurl( 0, @_);
844  }
845
846sub safe_parseurl
847  { my $t= shift;
848    $t->_parseurl( 1, @_);
849  }
850
851sub safe_parsefile_html
852  { my $t= shift;
853    eval { $t->parsefile_html( @_); };
854    return $@ ? $t->_reset_twig_after_error : $t;
855  }
856
857sub safe_parseurl_html
858  { my $t= shift;
859    _use( 'LWP::Simple') or croak "missing LWP::Simple";
860    eval { $t->parse_html( LWP::Simple::get( shift()), @_); } ;
861    return $@ ? $t->_reset_twig_after_error : $t;
862  }
863
864sub parseurl_html
865  { my $t= shift;
866    _use( 'LWP::Simple') or croak "missing LWP::Simple";
867    $t->parse_html( LWP::Simple::get( shift()), @_);
868  }
869
870
871# uses eval to catch the parser's death
872sub safe_parse_html
873  { my $t= shift;
874    eval { $t->parse_html( @_); } ;
875    return $@ ? $t->_reset_twig_after_error : $t;
876  }
877
878sub parsefile_html
879  { my $t= shift;
880    my $file= shift;
881    my $indent= $t->{ErrorContext} ? 1 : 0;
882    $t->set_empty_tag_style( 'html');
883    my $html2xml=  $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
884    my $options= $t->{use_tidy} ? $t->{tidy_options} || {} :  { indent => $indent, html_doctype => $t->{html_doctype} };
885    $t->parse( $html2xml->( _slurp( $file), $options), @_);
886    return $t;
887  }
888
889sub parse_html
890  { my $t= shift;
891    my $options= ref $_[0] && ref $_[0] eq 'HASH' ? shift() : {};
892    my $use_tidy= exists $options->{use_tidy} ? $options->{use_tidy} : $t->{use_tidy};
893    my $content= shift;
894    my $indent= $t->{ErrorContext} ? 1 : 0;
895    $t->set_empty_tag_style( 'html');
896    my $html2xml=  $use_tidy ? \&_tidy_html : \&_html2xml;
897    my $conv_options= $use_tidy ? $t->{tidy_options} || {} :  { indent => $indent, html_doctype => $t->{html_doctype} };
898    $t->parse( $html2xml->( isa( $content, 'GLOB') ? _slurp_fh( $content) : $content, $conv_options), @_);
899    return $t;
900  }
901
902sub xparse
903  { my $t= shift;
904    my $to_parse= $_[0];
905    if( isa( $to_parse, 'GLOB'))             { $t->parse( @_);                 }
906    elsif( $to_parse=~ m{^\s*<})             { $to_parse=~ m{<html}i ? $t->_parse_as_xml_or_html( @_)
907                                                                     : $t->parse( @_);
908                                             }
909    elsif( $to_parse=~ m{^\w+://.*\.html?$}) { _use( 'LWP::Simple') or croak "missing LWP::Simple";
910                                               $t->_parse_as_xml_or_html( LWP::Simple::get( shift()), @_);
911                                             }
912    elsif( $to_parse=~ m{^\w+://})           { _use( 'LWP::Simple') or croak "missing LWP::Simple";
913                                               my $doc= LWP::Simple::get( shift);
914                                               if( ! defined $doc) { $doc=''; }
915                                               my $xml_parse_ok= $t->safe_parse( $doc, @_);
916                                               if( $xml_parse_ok)
917                                                 { return $xml_parse_ok; }
918                                               else
919                                                 { my $diag= $@;
920                                                   if( $doc=~ m{<html}i)
921                                                     { $t->parse_html( $doc, @_); }
922                                                    else
923                                                      { croak $diag; }
924                                                 }
925                                             }
926    elsif( $to_parse=~ m{\.html?$})          { my $content= _slurp( shift);
927                                               $t->_parse_as_xml_or_html( $content, @_);
928                                             }
929    else                                     { $t->parsefile( @_);             }
930  }
931
932sub _parse_as_xml_or_html
933  { my $t= shift;
934    if( _is_well_formed_xml( $_[0]))
935      { $t->parse( @_) }
936    else
937      { my $html2xml=  $t->{use_tidy} ? \&_tidy_html : \&_html2xml;
938        my $options= $t->{use_tidy} ? $t->{tidy_options} || {} :  { indent => 0, html_doctype => $t->{html_doctype} };
939        my $html= $html2xml->( $_[0], $options, @_);
940        if( _is_well_formed_xml( $html))
941          { $t->parse( $html); }
942        else
943          { croak $@; } # can't really test this because HTML::Parser or HTML::Tidy may change how they deal with bas HTML between versions
944      }
945  }
946
947{ my $parser;
948  sub _is_well_formed_xml
949    { $parser ||= XML::Parser->new;
950      eval { $parser->parse( $_[0]); };
951      return $@ ? 0 : 1;
952    }
953}
954
955sub nparse
956  { my $class= shift;
957    my $to_parse= pop;
958    $class->new( @_)->xparse( $to_parse);
959  }
960
961sub nparse_pp   { shift()->nparse( pretty_print => 'indented', @_); }
962sub nparse_e    { shift()->nparse( error_context => 1,         @_); }
963sub nparse_ppe  { shift()->nparse( pretty_print => 'indented', error_context => 1, @_); }
964
965
966sub _html2xml
967  { my( $html, $options)= @_;
968    _use( 'HTML::TreeBuilder', '3.13') or croak "cannot parse HTML: missing HTML::TreeBuilder v >= 3.13\n";
969    my $tree= HTML::TreeBuilder->new;
970    $tree->ignore_ignorable_whitespace( 0);
971    $tree->ignore_unknown( 0);
972    $tree->no_space_compacting( 1);
973    $tree->store_comments( 1);
974    $tree->store_pis(1);
975    $tree->parse( $html);
976    $tree->eof;
977
978    my $xml='';
979    if( $options->{html_doctype} && exists $tree->{_decl} )
980      { my $decl= $tree->{_decl}->as_XML;
981
982        # first try to fix declarations that are missing the SYSTEM part
983        $decl =~ s{^\s*<!DOCTYPE \s+ ((?i)html) \s+ PUBLIC \s+ "([^"]*)" \s* >}
984                  { my $system= $HTML_DECL{$2} || $HTML_DECL{$DEFAULT_HTML_TYPE};
985                    qq{<!DOCTYPE $1 PUBLIC "$2" "$system">}
986
987                  }xe;
988
989        # then check that the declaration looks OK (so it parses), if not remove it,
990        # better to parse without the declaration than to die stupidly
991        if(    $decl =~ m{<!DOCTYPE \s+ (?i:HTML) (\s+ PUBLIC \s+ "[^"]*" \s+ (SYSTEM \s+)? "[^"]*")? \s*>}x # PUBLIC then SYSTEM
992            || $decl =~ m{<!DOCTYPE \s+ (?i:HTML) \s+ SYSTEM \s+ "[^"]*" \s*>}x                             # just SYSTEM
993          )
994          { $xml= $decl; }
995      }
996
997    $xml.= _as_XML( $tree);
998
999
1000    _fix_xml( $tree, \$xml);
1001
1002    if( $options->{indent}) { _indent_xhtml( \$xml); }
1003    $tree->delete;
1004    $xml=~ s{\s+$}{}s; # trim end
1005    return $xml;
1006  }
1007
1008sub _tidy_html
1009  { my( $html, $options)= @_;
1010   _use( 'HTML::Tidy') or croak "cannot cleanup HTML using HTML::Tidy (required by the use_tidy option): $@\n"; ;
1011    my $TIDY_DEFAULTS= { output_xhtml => 1, # duh!
1012                         tidy_mark => 0,    # do not add the "generated by tidy" comment
1013                         numeric_entities => 1,
1014                         char_encoding =>  'utf8',
1015                         bare => 1,
1016                         clean => 1,
1017                         doctype => 'transitional',
1018                         fix_backslash => 1,
1019                         merge_divs => 0,
1020                         merge_spans => 0,
1021                         sort_attributes => 'alpha',
1022                         indent => 0,
1023                         wrap => 0,
1024                         break_before_br => 0,
1025                       };
1026    $options ||= {};
1027    my $tidy_options= { %$TIDY_DEFAULTS, %$options};
1028    my $tidy = HTML::Tidy->new( $tidy_options);
1029    $tidy->ignore( type => 1, type => 2 ); # 1 is TIDY_WARNING, 2 is TIDY_ERROR, not clean
1030    my $xml= $tidy->clean( $html );
1031    return $xml;
1032  }
1033
1034
1035{ my %xml_parser_encoding;
1036  sub _fix_xml
1037    { my( $tree, $xml)= @_; # $xml is a ref to the xml string
1038
1039      my $max_tries=5;
1040      my $add_decl;
1041
1042      while( ! _check_xml( $xml) && $max_tries--)
1043        {
1044          # a couple of fixes for weird HTML::TreeBuilder errors
1045          if( $@=~ m{^\s*xml (or text )?declaration not at start of (external )?entity}i)
1046            { $$xml=~ s{<\?xml.*?\?>}{}g;
1047              #warn " fixed xml declaration in the wrong place\n";
1048            }
1049          elsif( $@=~ m{undefined entity})
1050            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1051              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1052              $$xml=~ s{&(\w+);}{ my $ent= $1; if( $ent !~ m{^(amp|lt|gt|apos|quote)$}) { "&amp;$ent;" } }eg;
1053            }
1054          elsif( $@=~ m{&Amp; used in html})
1055            # if $Amp; is used instead of &amp; then HTML::TreeBuilder's as_xml is tripped (old version)
1056            { $$xml=~ s{&(amp;)?Amp;}{&amp;}g if $HTML::TreeBuilder::VERSION < 4.00;
1057            }
1058          elsif( $@=~ m{^\s*not well-formed \(invalid token\)})
1059            { if( $HTML::TreeBuilder::VERSION < 4.00)
1060                { $$xml=~ s{&(amp;)?Amp;}{&amp;}g;
1061                  $$xml=~  s{(<[^>]* )(\d+=)"}{$1a$2"}g; # <table 1> comes out as <table 1="1">, "fix the attribute
1062                }
1063              my $q= '<img "="&#34;" '; # extracted so vim doesn't get confused
1064              if( _use( 'HTML::Entities::Numbered')) { $$xml=name2hex_xml( $$xml); }
1065              if( $$xml=~ m{$q})
1066                { $$xml=~ s{$q}{<img }g; # happens with <img src="foo.png"" ...
1067                }
1068              else
1069                { my $encoding= _encoding_from_meta( $tree);
1070                  unless( keys %xml_parser_encoding) { %xml_parser_encoding= _xml_parser_encodings(); }
1071
1072                  if( ! $add_decl)
1073                    { if( $xml_parser_encoding{$encoding})
1074                        { $add_decl=1; }
1075                      elsif( $encoding eq 'euc-jp' && $xml_parser_encoding{'x-euc-jp-jisx0221'})
1076                        { $encoding="x-euc-jp-jisx0221"; $add_decl=1;}
1077                      elsif( $encoding eq 'shift-jis' && $xml_parser_encoding{'x-sjis-jisx0221'})
1078                        { $encoding="x-sjis-jisx0221";   $add_decl=1;}
1079
1080                      if( $add_decl)
1081                        { $$xml=~ s{^(<\?xml.*?\?>)?}{<?xml version="1.0" encoding="$encoding"?>}s;
1082                          #warn "  added decl (encoding $encoding)\n";
1083                        }
1084                      else
1085                        { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1086                          #warn "  converting to utf8 from $encoding\n";
1087                          $$xml= _to_utf8( $encoding, $$xml);
1088                        }
1089                    }
1090                  else
1091                    { $$xml=~ s{^(<\?xml.*?\?>)?}{}s;
1092                      #warn "  converting to utf8 from $encoding\n";
1093                      $$xml= _to_utf8( $encoding, $$xml);
1094                    }
1095                }
1096            }
1097        }
1098
1099      # some versions of HTML::TreeBuilder escape CDATA sections
1100      $$xml=~ s{(&lt;!\[CDATA\[.*?\]\]&gt;)}{_unescape_cdata( $1)}eg;
1101
1102  }
1103
1104  sub _xml_parser_encodings
1105    { my @encodings=( 'iso-8859-1'); # this one is included by default, there is no map for it in @INC
1106      foreach my $inc (@INC)
1107        { push @encodings, map { basename( $_, '.enc') } glob( File::Spec->catdir( $inc => XML => Parser => Encodings => '*.enc')); }
1108      return map { $_ => 1 } @encodings;
1109    }
1110}
1111
1112
1113sub _unescape_cdata
1114  { my( $cdata)= @_;
1115    $cdata=~s{&lt;}{<}g;
1116    $cdata=~s{&gt;}{>}g;
1117    $cdata=~s{&amp;}{&}g;
1118    return $cdata;
1119  }
1120
1121sub _as_XML {
1122
1123    # fork of HTML::Element::as_XML, which is a little too buggy and inconsistent between versions for my liking
1124    my ($elt) = @_;
1125    my $xml= '';
1126    my $empty_element_map = $elt->_empty_element_map;
1127
1128    my ( $tag, $node, $start );    # per-iteration scratch
1129    $elt->traverse(
1130        sub {
1131            ( $node, $start ) = @_;
1132            if ( ref $node )
1133              { # it's an element
1134                $tag = $node->{'_tag'};
1135                if ($start)
1136                  { # on the way in
1137                    foreach my $att ( grep { ! m{^(_|/$)} } keys %$node )
1138                       { # fix attribute names instead of dying
1139                         my $new_att= $att;
1140                         if( $att=~ m{^\d}) { $new_att= "a$att"; }
1141                         $new_att=~ s{[^\w\d:_-]}{}g;
1142                         $new_att ||= 'a';
1143                         if( $new_att ne $att) { $node->{$new_att}= delete $node->{$att}; }
1144                       }
1145
1146                    if ( $empty_element_map->{$tag} && (!@{ $node->{'_content'} || []}) )
1147                      { $xml.= $node->starttag_XML( undef, 1 ); }
1148                    else
1149                      { $xml.= $node->starttag_XML(undef); }
1150                  }
1151                else
1152                 { # on the way out
1153                   unless ( $empty_element_map->{$tag} and !@{ $node->{'_content'} || [] } )
1154                    { $xml.= $node->endtag_XML();
1155                    }     # otherwise it will have been an <... /> tag.
1156                  }
1157              }
1158            elsif( $node=~ /<!\[CDATA\[/)  # the content includes CDATA
1159              {  foreach my $chunk (split /(<!\[CDATA\[.*?\]\]>)/s, $node) # chunks are CDATA sections or normal text
1160                  { $xml.= $chunk =~ m{<!\[CDATA\[} ? $chunk : _xml_escape( $chunk); }
1161              }
1162            else   # it's just text
1163              { $xml .= _xml_escape($node); }
1164            1;            # keep traversing
1165        }
1166    );
1167  return $xml;
1168}
1169
1170sub _xml_escape
1171  { my( $html)= @_;
1172    $html =~ s{&(?!                     # An ampersand that isn't followed by...
1173                  (  \#[0-9]+;       |  #   A hash mark, digits and semicolon, or
1174                    \#x[0-9a-fA-F]+; |  #   A hash mark, "x", hex digits and semicolon, or
1175                    [\w]+;              #   A valid unicode entity name and semicolon
1176                  )
1177                )
1178              }
1179              {&amp;}gx if 0;    # Needs to be escaped to amp
1180
1181    $html=~ s{&}{&amp;}g;
1182
1183    # in old versions of HTML::TreeBuilder &amp; can come out as &Amp;
1184    if( $HTML::TreeBuilder::VERSION && $HTML::TreeBuilder::VERSION <= 3.23) { $html=~ s{&Amp;}{&amp;}g; }
1185
1186    # simple character escapes
1187    $html =~ s/</&lt;/g;
1188    $html =~ s/>/&gt;/g;
1189    $html =~ s/"/&quot;/g;
1190    $html =~ s/'/&apos;/g;
1191
1192    return $html;
1193  }
1194
1195
1196
1197
1198sub _check_xml
1199  { my( $xml)= @_; # $xml is a ref to the xml string
1200    my $ok= eval { XML::Parser->new->parse( $$xml); };
1201    #if( $ok) { warn "  parse OK\n"; }
1202    return $ok;
1203  }
1204
1205sub _encoding_from_meta
1206  { my( $tree)= @_;
1207    my $enc="iso-8859-1";
1208    my @meta= $tree->find( 'meta');
1209    foreach my $meta (@meta)
1210      { if(    $meta->{'http-equiv'} && ($meta->{'http-equiv'} =~ m{^\s*content-type\s*}i)
1211            && $meta->{content}      && ($meta->{content}      =~ m{^\s*text/html\s*;\s*charset\s*=\s*(\S*)\s*}i)
1212          )
1213          { $enc= lc $1;
1214            #warn "  encoding from meta tag is '$enc'\n";
1215            last;
1216          }
1217      }
1218    return $enc;
1219  }
1220
1221{ sub _to_utf8
1222    { my( $encoding, $string)= @_;
1223      local $SIG{__DIE__};
1224      if( _use(  'Encode'))
1225        { Encode::from_to( $string, $encoding => 'utf8', 0x0400); } # 0x0400 is Encode::FB_XMLCREF
1226      elsif( _use( 'Text::Iconv'))
1227        { my $converter =  eval { Text::Iconv->new( $encoding => "utf8") };
1228          if( $converter) {  $string= $converter->convert( $string); }
1229        }
1230      elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
1231        { my $map= Unicode::Map8->new( $encoding);
1232          $string= $map->tou( $string)->utf8;
1233        }
1234      $string=~ s{[\x00-\x08\x0B\x0C\x0E-\x1F]}{}g; # get rid of control chars, portable in 5.6
1235    return $string;
1236  }
1237}
1238
1239
1240sub _indent_xhtml
1241  { my( $xhtml)= @_; # $xhtml is a ref
1242    my %block_tag= map { $_ => 1 } qw( html
1243                                         head
1244                                           meta title link script base
1245                                         body
1246                                           h1 h2 h3 h4 h5 h6
1247                                           p br address  blockquote pre
1248                                           ol ul li  dd dl dt
1249                                           table tr td th tbody tfoot thead  col colgroup caption
1250                                           div frame frameset hr
1251                                     );
1252
1253    my $level=0;
1254    $$xhtml=~ s{( (?:<!(?:--.*?-->|[CDATA[.*?]]>)) # ignore comments and CDATA sections
1255                  | <(\w+)((?:\s+\w+\s*=\s*(?:"[^"]*"|'[^']*'))*\s*/>) # empty tag
1256                  | <(\w+)                         # start tag
1257                  |</(\w+)                         # end tag
1258                )
1259               }
1260               { if(    $2 && $block_tag{$2})  { my $indent= "  " x $level;
1261                                                 "\n$indent<$2$3";
1262                                               }
1263                 elsif( $4 && $block_tag{$4})  { my $indent= "  " x $level;
1264                                                 $level++ unless( $4=~ m{/>});
1265                                                 my $nl= $4 eq 'html' ? '' : "\n";
1266                                                 "$nl$indent<$4";
1267                                               }
1268                 elsif( $5  && $block_tag{$5}) { $level--; "</$5"; }
1269                 else                          { $1; }
1270               }xesg;
1271  }
1272
1273
1274sub add_stylesheet
1275  { my( $t, $type, $href)= @_;
1276    my %text_type= map { $_ => 1 } qw( xsl css);
1277    my $ss= $t->{twig_elt_class}->new( $PI);
1278    if( $text_type{$type})
1279      { $ss->_set_pi( 'xml-stylesheet', qq{type="text/$type" href="$href"}); }
1280    else
1281      { croak "unsupported style sheet type '$type'"; }
1282
1283    $t->_add_cpi_outside_of_root( leading_cpi => $ss);
1284    return $t;
1285  }
1286
1287{ my %used;       # module => 1 if require ok, 0 otherwise
1288  my %disallowed; # for testing, refuses to _use modules in this hash
1289
1290  sub _disallow_use ## no critic (Subroutines::ProhibitNestedSubs);
1291    { my( @modules)= @_;
1292      $disallowed{$_}= 1 foreach (@modules);
1293    }
1294
1295  sub _allow_use  ## no critic (Subroutines::ProhibitNestedSubs);
1296    { my( @modules)= @_;
1297      $disallowed{$_}= 0 foreach (@modules);
1298    }
1299
1300  sub _use  ## no critic (Subroutines::ProhibitNestedSubs);
1301    { my( $module, $version)= @_;
1302      $version ||= 0;
1303      if( $disallowed{$module})   { return 0; }
1304      if( $used{$module})         { return 1; }
1305      if( eval "require $module") { import $module; $used{$module}= 1;  # no critic ProhibitStringyEval
1306                                    if( $version)
1307                                      {
1308                                        ## no critic (TestingAndDebugging::ProhibitNoStrict);
1309                                        no strict 'refs';
1310                                        if( ${"${module}::VERSION"} >= $version ) { return 1; }
1311                                        else                                      { return 0; }
1312                                      }
1313                                    else
1314                                      { return 1; }
1315                                  }
1316      else                        {                          $used{$module}= 0; return 0; }
1317    }
1318}
1319
1320# used to solve the [n] predicates while avoiding getting the entire list
1321# needs a prototype to accept passing bare blocks
1322sub _first_n(&$@)       ## no critic (Subroutines::ProhibitSubroutinePrototypes);
1323  { my $coderef= shift;
1324    my $n= shift;
1325    my $i=0;
1326    if( $n > 0)
1327      { foreach (@_)         { if( &$coderef) { $i++; return $_ if( $i == $n); } } }
1328    elsif( $n < 0)
1329      { foreach (reverse @_) { if( &$coderef) { $i--; return $_ if( $i == $n); } } }
1330    else
1331      { croak "illegal position number 0"; }
1332    return undef;
1333  }
1334
1335sub _slurp_uri
1336  { my( $uri, $base)= @_;
1337    if( $uri=~ m{^\w+://}) { _use( 'LWP::Simple'); return LWP::Simple::get( $uri); }
1338    else                   { return _slurp( _based_filename( $uri, $base));        }
1339  }
1340
1341sub _based_filename
1342  { my( $filename, $base)= @_;
1343    # cf. XML/Parser.pm's file_ext_ent_handler
1344    if (defined($base) and not ($filename =~ m{^(?:[\\/]|\w+:)}))
1345          { my $newpath = $base;
1346            $newpath =~ s{[^\\/:]*$}{$filename};
1347            $filename = $newpath;
1348          }
1349    return $filename;
1350  }
1351
1352sub _slurp
1353  { my( $filename)= @_;
1354    my $to_slurp;
1355    open( $to_slurp, "<$filename") or croak "cannot open '$filename': $!";
1356    local $/= undef;
1357    my $content= <$to_slurp>;
1358    close $to_slurp;
1359    return $content;
1360  }
1361
1362sub _slurp_fh
1363  { my( $fh)= @_;
1364    local $/= undef;
1365    my $content= <$fh>;
1366    return $content;
1367  }
1368
1369# I should really add extra options to allow better configuration of the
1370# LWP::UserAgent object
1371# this method forks (except on VMS!)
1372#   - the child gets the data and copies it to the pipe,
1373#   - the parent reads the stream and sends it to XML::Parser
1374# the data is cut it chunks the size of the XML::Parser::Expat buffer
1375# the method returns the twig and the status
1376sub _parseurl
1377  { my( $t, $safe, $url, $agent)= @_;
1378    _use( 'LWP') || croak "LWP not available, needed to use parseurl methods";
1379    if( $^O ne 'VMS')
1380      { pipe( README, WRITEME) or croak  "cannot create connected pipes: $!";
1381        if( my $pid= fork)
1382          { # parent code: parse the incoming file
1383            close WRITEME; # no need to write
1384            my $result= $safe ? $t->safe_parse( \*README) : $t->parse( \*README);
1385            close README;
1386            return $@ ? 0 : $t;
1387          }
1388        else
1389         { # child
1390            close README; # no need to read
1391            local $|=1;
1392            $agent    ||= LWP::UserAgent->new;
1393            my $request  = HTTP::Request->new( GET => $url);
1394            # _pass_url_content is called with chunks of data the same size as
1395            # the XML::Parser buffer
1396            my $response = $agent->request( $request,
1397                             sub { _pass_url_content( \*WRITEME, @_); }, $BUFSIZE);
1398            $response->is_success or croak "$url ", $response->message;
1399            close WRITEME;
1400            CORE::exit(); # CORE is there for mod_perl (which redefines exit)
1401          }
1402      }
1403    else
1404      { # VMS branch (hard to test!)
1405        local $|=1;
1406        $agent    ||= LWP::UserAgent->new;
1407        my $request  = HTTP::Request->new( GET => $url);
1408        my $response = $agent->request( $request);
1409        $response->is_success or croak "$url ", $response->message;
1410        my $result= $safe ? $t->safe_parse($response->content) : $t->parse($response->content);
1411        return $@ ? 0 : $t;
1412     }
1413
1414  }
1415
1416# get the (hopefully!) XML data from the URL and
1417sub _pass_url_content
1418  { my( $fh, $data, $response, $protocol)= @_;
1419    print {$fh} $data;
1420  }
1421
1422sub add_options
1423  { my %args= map { $_, 1 } @_;
1424    %args= _normalize_args( %args);
1425    foreach (keys %args) { $valid_option{$_}++; }
1426  }
1427
1428sub _pretty_print_styles { return XML::Twig::Elt::_pretty_print_styles(); }
1429
1430sub _twig_store_internal_dtd
1431 {
1432   # warn " in _twig_store_internal_dtd...\n"; # DEBUG handler
1433    my( $p, $string)= @_;
1434    my $t= $p->{twig};
1435    if( $t->{twig_keep_encoding}) { $string= $p->original_string(); }
1436    $t->{twig_doctype}->{internal} .= $string;
1437    return;
1438  }
1439
1440sub _twig_stop_storing_internal_dtd
1441   { # warn " in _twig_stop_storing_internal_dtd...\n"; # DEBUG handler
1442    my $p= shift;
1443    if( @saved_default_handler && defined $saved_default_handler[1])
1444      { $p->setHandlers( @saved_default_handler); }
1445    else
1446      {
1447        $p->setHandlers( Default => undef);
1448      }
1449    $p->{twig}->{twig_doctype}->{internal}=~ s{^\s*\[}{};
1450    $p->{twig}->{twig_doctype}->{internal}=~ s{\]\s*$}{};
1451    return;
1452  }
1453
1454sub _twig_doctype_fin_print
1455  { # warn " in _twig_doctype_fin_print...\n"; # DEBUG handler
1456    my( $p)= shift;
1457    if( $p->{twig}->{twig_doctype}->{has_internal} && !$expat_1_95_2) { print ' ]>'; }
1458    return;
1459  }
1460
1461
1462sub _normalize_args
1463  { my %normalized_args;
1464    while( my $key= shift )
1465      { $key= join '', map { ucfirst } split /_/, $key;
1466        #$key= "Twig".$key unless( substr( $key, 0, 4) eq 'Twig');
1467        $normalized_args{$key}= shift ;
1468      }
1469    return %normalized_args;
1470  }
1471
1472sub _is_fh { return unless $_[0]; return $_[0] if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')); }
1473
1474sub _set_handler
1475  { my( $handlers, $whole_path, $handler)= @_;
1476
1477    my $H_SPECIAL = qr{($ALL|$DEFAULT|$COMMENT|$TEXT)};
1478    my $H_PI      = qr{(\?|$PI)\s*(([^\s]*)\s*)};
1479    my $H_LEVEL   = qr{level \s* \( \s* ([0-9]+) \s* \)}x;
1480    my $H_REGEXP  = qr{\(\?([\^xism]*)(-[\^xism]*)?:(.*)\)}x;
1481    my $H_XPATH   = qr{(/?/?$REG_TAG_PART? \s* ($REG_PREDICATE\s*)?)+}x;
1482
1483    my $prev_handler;
1484
1485    my $cpath= $whole_path;
1486    #warn "\$cpath: '$cpath\n";
1487    while( $cpath && $cpath=~ s{^\s*($H_SPECIAL|$H_PI|$H_LEVEL|$H_REGEXP|$H_XPATH)\s*($|\|)}{})
1488      { my $path= $1;
1489        #warn "\$cpath: '$cpath' - $path: '$path'\n";
1490        $prev_handler ||= $handlers->{handlers}->{string}->{$path} || undef; # $prev_handler gets the first found handler
1491
1492           _set_special_handler         ( $handlers, $path, $handler, $prev_handler)
1493        || _set_pi_handler              ( $handlers, $path, $handler, $prev_handler)
1494        || _set_level_handler           ( $handlers, $path, $handler, $prev_handler)
1495        || _set_regexp_handler          ( $handlers, $path, $handler, $prev_handler)
1496        || _set_xpath_handler           ( $handlers, $path, $handler, $prev_handler)
1497        || croak "unrecognized expression in handler: '$whole_path'";
1498
1499        # this both takes care of the simple (gi) handlers and store
1500        # the handler code reference for other handlers
1501        $handlers->{handlers}->{string}->{$path}= $handler;
1502      }
1503
1504    if( $cpath) { croak "unrecognized expression in handler: '$whole_path'"; }
1505
1506    return $prev_handler;
1507  }
1508
1509
1510sub _set_special_handler
1511  { my( $handlers, $path, $handler, $prev_handler)= @_;
1512    if( $path =~ m{^\s*($ALL|$DEFAULT|$COMMENT|$TEXT)\s*$}io )
1513      { $handlers->{handlers}->{$1}= $handler;
1514        return 1;
1515      }
1516    else
1517      { return 0; }
1518  }
1519
1520sub _set_xpath_handler
1521  { my( $handlers, $path, $handler, $prev_handler)= @_;
1522    if( my $handler_data= _parse_xpath_handler( $path, $handler))
1523      { _add_handler( $handlers, $handler_data, $path, $prev_handler);
1524        return 1;
1525      }
1526    else
1527      { return 0; }
1528  }
1529
1530sub _add_handler
1531  { my( $handlers, $handler_data, $path, $prev_handler)= @_;
1532
1533    my $tag= $handler_data->{tag};
1534    my @handlers= $handlers->{xpath_handler}->{$tag} ? @{$handlers->{xpath_handler}->{$tag}} : ();
1535
1536    if( $prev_handler) { @handlers= grep { $_->{path} ne $path } @handlers; }
1537
1538    push @handlers, $handler_data if( $handler_data->{handler});
1539
1540    if( @handlers > 1)
1541      { @handlers= sort {    (($b->{score}->{type}        || 0)  <=>  ($a->{score}->{type}        || 0))
1542                          || (($b->{score}->{anchored}    || 0)  <=>  ($a->{score}->{anchored}    || 0))
1543                          || (($b->{score}->{steps}       || 0)  <=>  ($a->{score}->{steps}       || 0))
1544                          || (($b->{score}->{predicates}  || 0)  <=>  ($a->{score}->{predicates}  || 0))
1545                          || (($b->{score}->{tests}       || 0)  <=>  ($a->{score}->{tests}       || 0))
1546                          || ($a->{path} cmp $b->{path})
1547                        } @handlers;
1548      }
1549
1550    $handlers->{xpath_handler}->{$tag}= \@handlers;
1551  }
1552
1553sub _set_pi_handler
1554  { my( $handlers, $path, $handler, $prev_handler)= @_;
1555    # PI conditions ( '?target' => \&handler or '?' => \&handler
1556    #             or '#PItarget' => \&handler or '#PI' => \&handler)
1557    if( $path=~ /^\s*(?:\?|$PI)\s*(?:([^\s]*)\s*)$/)
1558      { my $target= $1 || '';
1559        # update the path_handlers count, knowing that
1560        # either the previous or the new handler can be undef
1561        $handlers->{pi_handlers}->{$1}= $handler;
1562        return 1;
1563      }
1564    else
1565      { return 0;
1566      }
1567  }
1568
1569sub _set_level_handler
1570  { my( $handlers, $path, $handler, $prev_handler)= @_;
1571    if( $path =~ m{^ \s* level \s* \( \s* ([0-9]+) \s* \) \s* $}ox )
1572      { my $level= $1;
1573        my $sub= sub { my( $stack)= @_; return( ($stack->[-1]->{$ST_TAG} !~ m{^#}) && (scalar @$stack == $level + 1) ) };
1574        my $handler_data=  { tag=> '*', score => { type => $LEVEL_TRIGGER}, trigger => $sub,
1575                             path => $path, handler => $handler, test_on_text => 0
1576                           };
1577        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1578        return 1;
1579      }
1580    else
1581      { return 0; }
1582  }
1583
1584sub _set_regexp_handler
1585  { my( $handlers, $path, $handler, $prev_handler)= @_;
1586    # if the expression was a regexp it is now a string (it was stringified when it became a hash key)
1587    if( $path=~ m{^\(\?([\^xism]*)(?:-[\^xism]*)?:(.*)\)$})
1588      { my $regexp= qr/(?$1:$2)/; # convert it back into a regexp
1589        my $sub= sub { my( $stack)= @_; return( $stack->[-1]->{$ST_TAG} =~ $regexp ) };
1590        my $handler_data=  { tag=> '*', score => { type => $REGEXP_TRIGGER} , trigger => $sub,
1591                             path => $path, handler => $handler, test_on_text => 0
1592                           };
1593        _add_handler( $handlers, $handler_data, $path, $prev_handler);
1594        return 1;
1595      }
1596    else
1597      { return 0; }
1598  }
1599
1600my $DEBUG_HANDLER= 0; # 0 or 1 (output the handler checking code) or 2 (super verbose)
1601my $handler_string;   # store the handler itself
1602sub _set_debug_handler    { $DEBUG_HANDLER= shift; }
1603sub _warn_debug_handler   { if( $DEBUG_HANDLER < 3) { warn @_; } else { $handler_string .= join( '', @_); } }
1604sub _return_debug_handler { my $string=  $handler_string; $handler_string=''; return $string; }
1605
1606sub _parse_xpath_handler
1607  { my( $xpath, $handler)= @_;
1608    my $xpath_original= $xpath;
1609
1610
1611    if( $DEBUG_HANDLER >=1) { _warn_debug_handler( "\n\nparsing path '$xpath'\n"); }
1612
1613    my $path_to_check= $xpath;
1614    $path_to_check=~ s{/?/?$REG_TAG_PART?\s*(?:$REG_PREDICATE\s*)?}{}g;
1615    if( $DEBUG_HANDLER && $path_to_check=~ /\S/) { _warn_debug_handler( "left: $path_to_check\n"); }
1616    return if( $path_to_check=~ /\S/);
1617
1618    (my $xpath_to_display= $xpath)=~ s{(["{}'\[\]\@\$])}{\\$1}g;
1619
1620    my @xpath_steps;
1621    my $last_token_is_sep;
1622
1623    while( $xpath=~ s{^\s*
1624                       ( (//?)                                      # separator
1625                        | (?:$REG_TAG_PART\s*(?:$REG_PREDICATE\s*)?) # tag name and optional predicate
1626                        | (?:$REG_PREDICATE)                        # just a predicate
1627                       )
1628                     }
1629                     {}x
1630         )
1631      { # check that we have alternating separators and steps
1632        if( $2) # found a separator
1633          { if(  $last_token_is_sep) { return 0; }                                 # 2 separators in a row
1634            $last_token_is_sep= 1;
1635          }
1636        else
1637          { if( defined( $last_token_is_sep) && !$last_token_is_sep) { return 0; } # 2 steps in a row
1638            $last_token_is_sep= 0;
1639          }
1640
1641        push @xpath_steps, $1;
1642      }
1643    if( $last_token_is_sep) { return 0; } # expression cannot end with a separator
1644
1645    my $i=-1;
1646
1647    my $perlfunc= _join_n( $NO_WARNINGS . ';',
1648                           q|my( $stack)= @_;                    |,
1649                           q|my @current_elts= (scalar @$stack); |,
1650                           q|my @new_current_elts;               |,
1651                           q|my $elt;                            |,
1652                           ($DEBUG_HANDLER >= 1) && (qq#warn q{checking path '$xpath_to_display'\n};#),
1653                         );
1654
1655
1656    my $last_tag='';
1657    my $anchored= $xpath_original=~ m{^\s*/(?!/)} ? 1 : 0;
1658    my $score={ type => $XPATH_TRIGGER, anchored => $anchored };
1659    my $flag= { test_on_text => 0 };
1660    my $sep='/';  # '/' or '//'
1661    while( my $xpath_step= pop @xpath_steps)
1662      { my( $tag, $predicate)= $xpath_step =~ m{^($REG_TAG_PART)?(?:\[(.*)\])?\s*$};
1663        $score->{steps}++;
1664        $tag||='*';
1665
1666        my $warn_empty_stack= $DEBUG_HANDLER >= 2 ? qq{warn "return with empty stack\\n";} : '';
1667
1668        if( $predicate)
1669          { if( $DEBUG_HANDLER >= 2)  { _warn_debug_handler( "predicate is: '$predicate'\n"); }
1670            # changes $predicate (from an XPath expression to a Perl one)
1671            if( $predicate=~ m{^\s*$REG_NUMBER\s*$}) { croak "position selector [$predicate] not supported on twig_handlers"; }
1672            _parse_predicate_in_handler( $predicate, $flag, $score);
1673            if( $DEBUG_HANDLER >= 2) { _warn_debug_handler( "predicate becomes: '$predicate'\n"); }
1674          }
1675
1676       my $tag_cond=  _tag_cond( $tag);
1677       my $cond= join( " && ", grep { $_ } $tag_cond, $predicate) || 1;
1678
1679       if( $css_sel && $tag=~ m{\.}) { $tag=~s{\.[^.]*$}{}; $tag ||='*'; }
1680       $tag=~ s{(.)#.+$}{$1};
1681
1682       $last_tag ||= $tag;
1683
1684       if( $sep eq '/')
1685         {
1686           $perlfunc .= sprintf( _join_n(  q#foreach my $current_elt (@current_elts)              #,
1687                                           q#  { next if( !$current_elt);                         #,
1688                                           q#    $current_elt--;                                  #,
1689                                           q#    $elt= $stack->[$current_elt];                    #,
1690                                           q#    if( %s) { push @new_current_elts, $current_elt;} #,
1691                                           q#  }                                                  #,
1692                                        ),
1693                                 $cond
1694                               );
1695         }
1696       elsif( $sep eq '//')
1697         {
1698           $perlfunc .= sprintf( _join_n(  q#foreach my $current_elt (@current_elts)                #,
1699                                           q#  { next if( !$current_elt);                           #,
1700                                           q#    $current_elt--;                                    #,
1701                                           q#    my $candidate= $current_elt;                       #,
1702                                           q#    while( $candidate >=0)                             #,
1703                                           q#      { $elt= $stack->[$candidate];                    #,
1704                                           q#        if( %s) { push @new_current_elts, $candidate;} #,
1705                                           q#        $candidate--;                                  #,
1706                                           q#      }                                                #,
1707                                           q#  }                                                    #,
1708                                        ),
1709                                 $cond
1710                               );
1711         }
1712       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq%fail at cond '$cond'%;#) : '';
1713       $perlfunc .= sprintf( _join_n( q#unless( @new_current_elts) { %s return 0; } #,
1714                                      q#@current_elts= @new_current_elts;           #,
1715                                      q#@new_current_elts=();                       #,
1716                                    ),
1717                             $warn
1718                           );
1719
1720        $sep= pop @xpath_steps;
1721     }
1722
1723    if( $anchored) # there should be a better way, but this works
1724      {
1725       my $warn= $DEBUG_HANDLER >= 2 ? _join_n( qq#warn qq{fail, stack not empty};#) : '';
1726       $perlfunc .= sprintf( _join_n( q#if( ! grep { $_ == 0 } @current_elts) { %s return 0;}#), $warn);
1727      }
1728
1729    $perlfunc.= qq{warn "handler for '$xpath_to_display' triggered\\n";\n} if( $DEBUG_HANDLER >=2);
1730    $perlfunc.= qq{return q{$xpath_original};\n};
1731    _warn_debug_handler( "\nperlfunc:\n$perlfunc\n") if( $DEBUG_HANDLER>=1);
1732    my $s= eval "sub { $perlfunc }";
1733      if( $@)
1734        { croak "wrong handler condition '$xpath' ($@);" }
1735
1736      _warn_debug_handler( "last tag: '$last_tag', test_on_text: '$flag->{test_on_text}'\n") if( $DEBUG_HANDLER >=1);
1737      _warn_debug_handler( "score: ", join( ' ', map { "$_: $score->{$_}" } sort keys %$score), "\n") if( $DEBUG_HANDLER >=1);
1738      return { tag=> $last_tag, score => $score, trigger => $s, path => $xpath_original, handler => $handler, test_on_text => $flag->{test_on_text} };
1739    }
1740
1741sub _join_n { return join( "\n", @_, ''); }
1742
1743# the "tag" part can be <tag>, <tag>.<class> or <tag>#<id> (where tag can be *, or start with # for hidden tags)
1744sub _tag_cond
1745  { my( $full_tag)= @_;
1746
1747    my( $tag, $class, $id);
1748    if( $full_tag=~ m{^(.+)#(.+)$})
1749      { ($tag, $id)= ($1, $2); } # <tag>#<id>
1750    else
1751      { ( $tag, $class)= $css_sel ? $full_tag=~ m{^(.*?)(?:\.([^.]*))?$} : ($full_tag, undef); }
1752
1753    my $tag_cond   = $tag && $tag ne '*' ? qq#(\$elt->{'$ST_TAG'} eq "$tag")# : '';
1754    my $id_cond    = defined $id         ? qq#(\$elt->{id} eq "$id")#  : '';
1755    my $class_cond = defined $class      ? qq#(\$elt->{class}=~ m{(^| )$class( |\$)})# : '';
1756
1757    my $full_cond= join( ' && ', grep { $_ } ( $tag_cond, $class_cond, $id_cond));
1758
1759    return $full_cond;
1760  }
1761
1762# input: the predicate ($_[0]) which will be changed in place
1763#        flags, a hashref with various flags (like test_on_text)
1764#        the score
1765sub _parse_predicate_in_handler
1766  { my( $flag, $score)= @_[1..2];
1767    $_[0]=~ s{(   ($REG_STRING)                            # strings
1768                 |\@($REG_TAG_NAME)(\s* $REG_MATCH \s* $REG_REGEXP) # @att and regexp
1769                 |\@($REG_TAG_NAME)(?=\s*(?:[><=!]))       # @att followed by a comparison operator
1770                 |\@($REG_TAG_NAME)                        # @att (not followed by a comparison operator)
1771                 |=~|!~                                    # matching operators
1772                 |([><]=?|=|!=)(?=\s*[\d+-])               # test before a number
1773                 |([><]=?|=|!=)                            # test, other cases
1774                 |($REG_FUNCTION)                          # no arg functions
1775                 # this bit is a mess, but it is the only solution with this half-baked parser
1776                 |(string\(\s*$REG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP)  # string( child)=~ /regexp/
1777                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_STRING)   # string( child) = "value" (or other test)
1778                 |(string\(\s*$REG_NAME\s*\)\s*$REG_COMP\s*$REG_NUMBER)   # string( child) = nb (or other test)
1779                 |(and|or)
1780                # |($REG_NAME(?=\s*(and|or|$)))            # nested tag name (needs to be after all other unquoted strings)
1781                 |($REG_TAG_IN_PREDICATE)                  # nested tag name (needs to be after all other unquoted strings)
1782
1783              )}
1784             { my( $token, $str, $att_re_name, $att_re_regexp, $att, $bare_att, $num_test, $alpha_test, $func, $str_regexp, $str_test_alpha, $str_test_num, $and_or, $tag)
1785               = ( $1,     $2,   $3,           $4,             $5,   $6,        $7,        $8,          $9,    $10,         $11,             $12,           $13,     $14);
1786
1787               $score->{predicates}++;
1788
1789               # store tests on text (they are not always allowed)
1790               if( $func || $str_regexp || $str_test_num || $str_test_alpha ) { $flag->{test_on_text}= 1;   }
1791
1792               if( defined $str)      { $token }
1793               elsif( $tag)           { qq{(\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->has_child( '$tag'))} }
1794               elsif( $att)           { $att=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att'})}
1795                                                     : qq{\$elt->{'$att'}}
1796                                      }
1797               elsif( $att_re_name)   { $att_re_name=~ m{^#} ? qq{ (\$elt->{'$ST_ELT'} && \$elt->{'$ST_ELT'}->{att}->{'$att_re_name'}$att_re_regexp)}
1798                                                     : qq{\$elt->{'$att_re_name'}$att_re_regexp}
1799                                      }
1800                                        # for some reason Devel::Cover flags the following lines as not tested. They are though.
1801               elsif( $bare_att)      { $bare_att=~ m{^#} ? qq{(\$elt->{'$ST_ELT'} && defined(\$elt->{'$ST_ELT'}->{att}->{'$bare_att'}))}
1802                                                          : qq{defined( \$elt->{'$bare_att'})}
1803                                      }
1804               elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged
1805               elsif( $alpha_test)    { $PERL_ALPHA_TEST{$alpha_test} }
1806               elsif( $func && $func=~ m{^string})
1807                                      { "\$elt->{'$ST_ELT'}->text"; }
1808               elsif( $str_regexp     && $str_regexp     =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)})
1809                                      { "defined( _first_n {  \$_->text $2 $3 } 1, \$elt->{'$ST_ELT'}->_children( '$1'))"; }
1810               elsif( $str_test_alpha && $str_test_alpha =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_STRING)})
1811                                      { my( $tag, $op, $str)= ($1, $2, $3);
1812                                        $str=~ s{(?<=.)'(?=.)}{\\'}g; # escape a quote within the string
1813                                        $str=~ s{^"}{'};
1814                                        $str=~ s{"$}{'};
1815                                        "defined( _first_n { \$_->text $PERL_ALPHA_TEST{$op} $str } 1, \$elt->{'$ST_ELT'}->children( '$tag'))"; }
1816               elsif( $str_test_num   && $str_test_num   =~ m{string\(\s*($REG_TAG_NAME)\s*\)\s*($REG_COMP)\s*($REG_NUMBER)})
1817                                      { my $test= ($2 eq '=') ? '==' : $2;
1818                                        "defined( _first_n { \$_->text $test $3 } 1, \$elt->{'$ST_ELT'}->children( '$1'))";
1819                                      }
1820               elsif( $and_or)        { $score->{tests}++; $and_or eq 'and' ? '&&' : '||' ; }
1821               else                   { $token; }
1822             }gexs;
1823  }
1824
1825
1826sub setCharHandler
1827  { my( $t, $handler)= @_;
1828    $t->{twig_char_handler}= $handler;
1829  }
1830
1831
1832sub _reset_handlers
1833  { my $handlers= shift;
1834    delete $handlers->{handlers};
1835    delete $handlers->{path_handlers};
1836    delete $handlers->{subpath_handlers};
1837    $handlers->{attcond_handlers_exp}=[] if( $handlers->{attcond_handlers});
1838    delete $handlers->{attcond_handlers};
1839  }
1840
1841sub _set_handlers
1842  { my $handlers= shift || return;
1843    my $set_handlers= {};
1844    foreach my $path (keys %{$handlers})
1845      { _set_handler( $set_handlers, $path, $handlers->{$path}); }
1846
1847    return $set_handlers;
1848  }
1849
1850
1851sub setTwigHandler
1852  { my( $t, $path, $handler)= @_;
1853    $t->{twig_handlers} ||={};
1854    return _set_handler( $t->{twig_handlers}, $path, $handler);
1855  }
1856
1857sub setTwigHandlers
1858  { my( $t, $handlers)= @_;
1859    my $previous_handlers= $t->{twig_handlers} || undef;
1860    _reset_handlers( $t->{twig_handlers});
1861    $t->{twig_handlers}= _set_handlers( $handlers);
1862    return $previous_handlers;
1863  }
1864
1865sub setStartTagHandler
1866  { my( $t, $path, $handler)= @_;
1867    $t->{twig_starttag_handlers}||={};
1868    return _set_handler( $t->{twig_starttag_handlers}, $path, $handler);
1869  }
1870
1871sub setStartTagHandlers
1872  { my( $t, $handlers)= @_;
1873    my $previous_handlers= $t->{twig_starttag_handlers} || undef;
1874    _reset_handlers( $t->{twig_starttag_handlers});
1875    $t->{twig_starttag_handlers}= _set_handlers( $handlers);
1876    return $previous_handlers;
1877   }
1878
1879sub setIgnoreEltsHandler
1880  { my( $t, $path, $action)= @_;
1881    $t->{twig_ignore_elts_handlers}||={};
1882    return _set_handler( $t->{twig_ignore_elts_handlers}, $path, $action );
1883  }
1884
1885sub setIgnoreEltsHandlers
1886  { my( $t, $handlers)= @_;
1887    my $previous_handlers= $t->{twig_ignore_elts_handlers};
1888    _reset_handlers( $t->{twig_ignore_elts_handlers});
1889    $t->{twig_ignore_elts_handlers}= _set_handlers( $handlers);
1890    return $previous_handlers;
1891   }
1892
1893sub setEndTagHandler
1894  { my( $t, $path, $handler)= @_;
1895    $t->{twig_endtag_handlers}||={};
1896    return _set_handler( $t->{twig_endtag_handlers}, $path,$handler);
1897  }
1898
1899sub setEndTagHandlers
1900  { my( $t, $handlers)= @_;
1901    my $previous_handlers= $t->{twig_endtag_handlers};
1902    _reset_handlers( $t->{twig_endtag_handlers});
1903    $t->{twig_endtag_handlers}= _set_handlers( $handlers);
1904    return $previous_handlers;
1905   }
1906
1907# a little more complex: set the twig_handlers only if a code ref is given
1908sub setTwigRoots
1909  { my( $t, $handlers)= @_;
1910    my $previous_roots= $t->{twig_roots};
1911    _reset_handlers($t->{twig_roots});
1912    $t->{twig_roots}= _set_handlers( $handlers);
1913
1914    _check_illegal_twig_roots_handlers( $t->{twig_roots});
1915
1916    foreach my $path (keys %{$handlers})
1917      { $t->{twig_handlers}||= {};
1918        _set_handler( $t->{twig_handlers}, $path, $handlers->{$path})
1919          if( ref($handlers->{$path}) && isa( $handlers->{$path}, 'CODE'));
1920      }
1921    return $previous_roots;
1922  }
1923
1924sub _check_illegal_twig_roots_handlers
1925  { my( $handlers)= @_;
1926    foreach my $tag_handlers (values %{$handlers->{xpath_handler}})
1927      { foreach my $handler_data (@$tag_handlers)
1928          { if( my $type= $handler_data->{test_on_text})
1929              { croak "string() condition not supported on twig_roots option"; }
1930          }
1931      }
1932    return;
1933  }
1934
1935
1936# just store the reference to the expat object in the twig
1937sub _twig_init
1938   { # warn " in _twig_init...\n"; # DEBUG handler
1939
1940    my $p= shift;
1941    my $t=$p->{twig};
1942
1943    if( $t->{twig_parsing} ) { croak "cannot reuse a twig that is already parsing"; }
1944    $t->{twig_parsing}=1;
1945
1946    $t->{twig_parser}= $p;
1947    if( $weakrefs) { weaken( $t->{twig_parser}); }
1948
1949    # in case they had been created by a previous parse
1950    delete $t->{twig_dtd};
1951    delete $t->{twig_doctype};
1952    delete $t->{twig_xmldecl};
1953    delete $t->{twig_root};
1954
1955    # if needed set the output filehandle
1956    $t->_set_fh_to_twig_output_fh();
1957    return;
1958  }
1959
1960# uses eval to catch the parser's death
1961sub safe_parse
1962  { my $t= shift;
1963    eval { $t->parse( @_); } ;
1964    return $@ ? $t->_reset_twig_after_error : $t;
1965  }
1966
1967sub safe_parsefile
1968  { my $t= shift;
1969    eval { $t->parsefile( @_); } ;
1970    return $@ ? $t->_reset_twig_after_error : $t;
1971  }
1972
1973# restore a twig in a proper state so it can be reused for a new parse
1974sub _reset_twig
1975  { my $t= shift;
1976    $t->{twig_parsing}= 0;
1977    delete $t->{twig_current};
1978    delete $t->{extra_data};
1979    delete $t->{twig_dtd};
1980    delete $t->{twig_in_pcdata};
1981    delete $t->{twig_in_cdata};
1982    delete $t->{twig_stored_space};
1983    delete $t->{twig_entity_list};
1984    $t->root->delete if( $t->root);
1985    delete $t->{twig_root};
1986    return $t;
1987  }
1988
1989sub _reset_twig_after_error
1990  { my $t= shift;
1991    $t->_reset_twig;
1992    return undef;
1993  }
1994
1995
1996sub _add_or_discard_stored_spaces
1997  { my $t= shift;
1998
1999    $t->{twig_right_after_root}=0; #XX
2000
2001    my $current= $t->{twig_current} or return; # ugly hack, with ignore on, twig_current can disappear
2002    return unless length $t->{twig_stored_spaces};
2003    my $current_gi= $XML::Twig::index2gi[$current->{'gi'}];
2004
2005    if( ! $t->{twig_discard_all_spaces})
2006      { if( ! defined( $t->{twig_space_policy}->{$current_gi}))
2007          { $t->{twig_space_policy}->{$current_gi}= _space_policy( $t, $current_gi); }
2008        if(    $t->{twig_space_policy}->{$current_gi} || ($t->{twig_stored_spaces}!~ m{\n}) || $t->{twig_preserve_space})
2009          { _insert_pcdata( $t, $t->{twig_stored_spaces} ); }
2010      }
2011
2012    $t->{twig_stored_spaces}='';
2013
2014    return;
2015  }
2016
2017# the default twig handlers, which build the tree
2018sub _twig_start
2019   { # warn " in _twig_start...\n"; # DEBUG handler
2020
2021    #foreach my $s (@_) { next if ref $s; warn "$s: ", is_utf8( $s) ? "has flag" : "FLAG NOT SET"; } # YYY
2022
2023    my ($p, $gi, @att)= @_;
2024    my $t=$p->{twig};
2025
2026    # empty the stored pcdata (space stored in case they are really part of
2027    # a pcdata element) or stored it if the space policy dictates so
2028    # create a pcdata element with the spaces if need be
2029    _add_or_discard_stored_spaces( $t);
2030    my $parent= $t->{twig_current};
2031
2032    # if we were parsing PCDATA then we exit the pcdata
2033    if( $t->{twig_in_pcdata})
2034      { $t->{twig_in_pcdata}= 0;
2035        delete $parent->{'twig_current'};
2036        $parent= $parent->{parent};
2037      }
2038
2039    # if we choose to keep the encoding then we need to parse the tag
2040    if( my $func = $t->{parse_start_tag})
2041      { ($gi, @att)= &$func($p->original_string); }
2042    elsif( $t->{twig_entities_in_attribute})
2043      {
2044       ($gi,@att)= _parse_start_tag( $p->recognized_string);
2045         $t->{twig_entities_in_attribute}=0;
2046      }
2047
2048    # if we are using an external DTD, we need to fill the default attributes
2049    if( $t->{twig_read_external_dtd}) { _fill_default_atts( $t, $gi, \@att); }
2050
2051    # filter the input data if need be
2052    if( my $filter= $t->{twig_input_filter})
2053      { $gi= $filter->( $gi);
2054        foreach my $att (@att) { $att= $filter->($att); }
2055      }
2056
2057    my $ns_decl;
2058    if( $t->{twig_map_xmlns})
2059      { $ns_decl= _replace_ns( $t, \$gi, \@att); }
2060
2061    my $elt= $t->{twig_elt_class}->new( $gi);
2062    $elt->set_atts( @att);
2063
2064    # now we can store the tag and atts
2065    my $context= { $ST_TAG => $gi, $ST_ELT => $elt, @att};
2066    $context->{$ST_NS}= $ns_decl if $ns_decl;
2067    if( $weakrefs) { weaken( $context->{$ST_ELT}); }
2068    push @{$t->{_twig_context_stack}}, $context;
2069
2070    delete $parent->{'twig_current'} if( $parent);
2071    $t->{twig_current}= $elt;
2072    $elt->{'twig_current'}=1;
2073
2074    if( $parent)
2075      { my $prev_sibling= $parent->{last_child};
2076        if( $prev_sibling)
2077          { $prev_sibling->{next_sibling}=  $elt;
2078            $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2079          }
2080
2081        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2082        unless( $parent->{first_child}) { $parent->{first_child}=  $elt; }
2083         $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2084      }
2085    else
2086      { # processing root
2087        $t->set_root( $elt);
2088        # call dtd handler if need be
2089        $t->{twig_dtd_handler}->($t, $t->{twig_dtd})
2090          if( defined $t->{twig_dtd_handler});
2091
2092        # set this so we can catch external entities
2093        # (the handler was modified during DTD processing)
2094        if( $t->{twig_default_print})
2095          { $p->setHandlers( Default => \&_twig_print); }
2096        elsif( $t->{twig_roots})
2097          { $p->setHandlers( Default => sub { return }); }
2098        else
2099          { $p->setHandlers( Default => \&_twig_default); }
2100      }
2101
2102    $elt->{empty}=  $p->recognized_string=~ m{/\s*>$}s ? 1 : 0;
2103
2104    $elt->{extra_data}= $t->{extra_data} if( $t->{extra_data});
2105    $t->{extra_data}='';
2106
2107    # if the element is ID-ed then store that info
2108    my $id= $elt->{'att'}->{$ID};
2109    if( defined $id)
2110      { $t->{twig_id_list}->{$id}= $elt;
2111        if( $weakrefs) { weaken( $t->{twig_id_list}->{$id}); }
2112      }
2113
2114    # call user handler if need be
2115    if( $t->{twig_starttag_handlers})
2116      { # call all appropriate handlers
2117        my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi);
2118
2119        local $_= $elt;
2120
2121        foreach my $handler ( @handlers)
2122          { $handler->($t, $elt) || last; }
2123        # call _all_ handler if needed
2124        if( my $all= $t->{twig_starttag_handlers}->{handlers}->{$ALL})
2125          { $all->($t, $elt); }
2126      }
2127
2128    # check if the tag is in the list of tags to be ignored
2129    if( $t->{twig_ignore_elts_handlers})
2130      { my @handlers= _handler( $t, $t->{twig_ignore_elts_handlers}, $gi);
2131        # only the first handler counts, it contains the action (discard/print/string)
2132        if( @handlers) { my $action= shift @handlers; $t->ignore( $elt, $action); }
2133      }
2134
2135    if( $elt->{'att'}->{'xml:space'} && (  $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}++; }
2136
2137
2138    return;
2139  }
2140
2141sub _replace_ns
2142  { my( $t, $gi, $atts)= @_;
2143    my $decls;
2144    foreach my $new_prefix ( $t->parser->new_ns_prefixes)
2145      { my $uri= $t->parser->expand_ns_prefix( $new_prefix);
2146        # replace the prefix if it is mapped
2147        $decls->{$new_prefix}= $uri;
2148        if( !$t->{twig_keep_original_prefix} && (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri}))
2149          { $new_prefix= $mapped_prefix; }
2150        # now put the namespace declaration back in the element
2151        if( $new_prefix eq '#default')
2152          { push @$atts, "xmlns" =>  $uri; }
2153        else
2154          { push @$atts, "xmlns:$new_prefix" =>  $uri; }
2155      }
2156
2157    if( $t->{twig_keep_original_prefix})
2158      { # things become more complex: we need to find the original prefix
2159        # and store both prefixes
2160        my $ns_info= $t->_ns_info( $$gi);
2161        my $map_att;
2162        if( $ns_info->{mapped_prefix})
2163          { $$gi= "$ns_info->{mapped_prefix}:$$gi";
2164            $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2165          }
2166        my $att_name=1;
2167        foreach( @$atts)
2168          { if( $att_name)
2169              {
2170                my $ns_info= $t->_ns_info( $_);
2171                if( $ns_info->{mapped_prefix})
2172                  { $_= "$ns_info->{mapped_prefix}:$_";
2173                    $map_att->{$ns_info->{mapped_prefix}}= $ns_info->{prefix};
2174                  }
2175                $att_name=0;
2176              }
2177            else
2178              {  $att_name=1; }
2179          }
2180        push @$atts, '#original_gi', $map_att if( $map_att);
2181      }
2182    else
2183      { $$gi= $t->_replace_prefix( $$gi);
2184        my $att_name=1;
2185        foreach( @$atts)
2186          { if( $att_name) { $_= $t->_replace_prefix( $_); $att_name=0; }
2187            else           {  $att_name=1; }
2188          }
2189      }
2190    return $decls;
2191  }
2192
2193
2194# extract prefix, local_name, uri, mapped_prefix from a name
2195# will only work if called from a start or end tag handler
2196sub _ns_info
2197  { my( $t, $name)= @_;
2198    my $ns_info={};
2199    my $p= $t->parser;
2200    $ns_info->{uri}= $p->namespace( $name);
2201    return $ns_info unless( $ns_info->{uri});
2202
2203    $ns_info->{prefix}= _a_proper_ns_prefix( $p, $ns_info->{uri});
2204    $ns_info->{mapped_prefix}= $t->{twig_map_xmlns}->{$ns_info->{uri}} || $ns_info->{prefix};
2205
2206    return $ns_info;
2207  }
2208
2209sub _a_proper_ns_prefix
2210  { my( $p, $uri)= @_;
2211    foreach my $prefix ($p->current_ns_prefixes)
2212      { if( $p->expand_ns_prefix( $prefix) eq $uri)
2213          { return $prefix; }
2214      }
2215    return;
2216  }
2217
2218# returns the uri bound to a prefix in the original document
2219# only works in a handler
2220# can be used to deal with xsi:type attributes
2221sub original_uri
2222  { my( $t, $prefix)= @_;
2223    my $ST_NS  = '##ns' ;
2224    foreach my $ns (map { $_->{$ST_NS} if  $_->{$ST_NS} } reverse @{$t->{_twig_context_stack}})
2225      { return $ns->{$prefix} || next; }
2226    return;
2227  }
2228
2229
2230sub _fill_default_atts
2231  { my( $t, $gi, $atts)= @_;
2232    my $dtd= $t->{twig_dtd};
2233    my $attlist= $dtd->{att}->{$gi};
2234    my %value= @$atts;
2235    foreach my $att (keys %$attlist)
2236      { if(   !exists( $value{$att})
2237            && exists( $attlist->{$att}->{default})
2238            && ( $attlist->{$att}->{default} ne '#IMPLIED')
2239          )
2240          { # the quotes are included in the default, so we need to remove them
2241            my $default_value= substr( $attlist->{$att}->{default}, 1, -1);
2242            push @$atts, $att, $default_value;
2243          }
2244      }
2245    return;
2246  }
2247
2248
2249# the default function to parse a start tag (in keep_encoding mode)
2250# can be overridden with the parse_start_tag method
2251# only works for 1-byte character sets
2252sub _parse_start_tag
2253  { my $string= shift;
2254    my( $gi, @atts);
2255
2256    # get the gi (between < and the first space, / or > character)
2257    #if( $string=~ s{^<\s*([^\s>/]*)[\s>/]*}{}s)
2258    if( $string=~ s{^<\s*($REG_TAG_NAME)\s*[\s>/]}{}s)
2259      { $gi= $1; }
2260    else
2261      { croak "error parsing tag '$string'"; }
2262    while( $string=~ s{^([^\s=]*)\s*=\s*(["'])(.*?)\2\s*}{}s)
2263      { push @atts, $1, $3; }
2264    return $gi, @atts;
2265  }
2266
2267sub set_root
2268  { my( $t, $elt)= @_;
2269    $t->{twig_root}= $elt;
2270    if( $elt)
2271      { $elt->{twig}= $t;
2272        if( $weakrefs) { weaken(  $elt->{twig}); }
2273      }
2274    return $t;
2275  }
2276
2277sub _twig_end
2278   { # warn " in _twig_end...\n"; # DEBUG handler
2279    my ($p, $gi)  = @_;
2280
2281    my $t=$p->{twig};
2282
2283    if( $t->{twig_in_pcdata} && (my $text_handler= $t->{TwigHandlers}->{$TEXT}) )
2284      { local $_= $t->{twig_current}; $text_handler->( $t, $_) if $_;
2285      }
2286
2287    if( $t->{twig_map_xmlns}) { $gi= $t->_replace_prefix( $gi); }
2288
2289    _add_or_discard_stored_spaces( $t);
2290
2291    # the new twig_current is the parent
2292    my $elt= $t->{twig_current};
2293    delete $elt->{'twig_current'};
2294
2295    # if we were parsing PCDATA then we exit the pcdata too
2296    if( $t->{twig_in_pcdata})
2297      {
2298        $t->{twig_in_pcdata}= 0;
2299        $elt= $elt->{parent} if($elt->{parent});
2300        delete $elt->{'twig_current'};
2301      }
2302
2303    # parent is the new current element
2304    my $parent= $elt->{parent};
2305    $t->{twig_current}= $parent;
2306
2307    if( $parent)
2308      { $parent->{'twig_current'}=1;
2309        # twig_to_be_normalized
2310        if( $parent->{twig_to_be_normalized}) { $parent->normalize; $parent->{twig_to_be_normalized}=0; }
2311      }
2312
2313    if( $t->{extra_data})
2314      { $elt->_set_extra_data_before_end_tag( $t->{extra_data});
2315        $t->{extra_data}='';
2316      }
2317
2318    if( $t->{twig_handlers})
2319      { # look for handlers
2320        my @handlers= _handler( $t, $t->{twig_handlers}, $gi);
2321
2322        if( $t->{twig_tdh})
2323          { if( @handlers) { push @{$t->{twig_handlers_to_trigger}}, [ $elt, \@handlers ]; }
2324            if( my $all= $t->{twig_handlers}->{handlers}->{$ALL})
2325              { push @{$t->{twig_handlers_to_trigger}}, [ $elt, [$all] ]; }
2326          }
2327        else
2328          {
2329            local $_= $elt; # so we can use $_ in the handlers
2330
2331            foreach my $handler ( @handlers)
2332              { $handler->($t, $elt) || last; }
2333            # call _all_ handler if needed
2334            my $all= $t->{twig_handlers}->{handlers}->{$ALL};
2335            if( $all)
2336              { $all->($t, $elt); }
2337            if( @handlers || $all) { $t->{twig_right_after_root}=0; }
2338          }
2339      }
2340
2341    # if twig_roots is set for the element then set appropriate handler
2342    if(  $t->{twig_root_depth} and ($p->depth == $t->{twig_root_depth}) )
2343      { if( $t->{twig_default_print})
2344          { # select the proper fh (and store the currently selected one)
2345            $t->_set_fh_to_twig_output_fh();
2346            if( !$p->depth==1) { $t->{twig_right_after_root}=1; } #XX
2347            if( $t->{twig_keep_encoding})
2348              { $p->setHandlers( %twig_handlers_roots_print_original); }
2349            else
2350              { $p->setHandlers( %twig_handlers_roots_print); }
2351          }
2352        else
2353          { $p->setHandlers( %twig_handlers_roots); }
2354      }
2355
2356    if( $elt->{'att'}->{'xml:space'} && (  $elt->{'att'}->{'xml:space'} eq 'preserve')) { $t->{twig_preserve_space}--; }
2357
2358    pop @{$t->{_twig_context_stack}};
2359    return;
2360  }
2361
2362sub _trigger_tdh
2363  { my( $t)= @_;
2364
2365    if( @{$t->{twig_handlers_to_trigger}})
2366      { my @handlers_to_trigger_now= sort { $a->[0]->cmp( $b->[0]) } @{$t->{twig_handlers_to_trigger}};
2367        foreach my $elt_handlers (@handlers_to_trigger_now)
2368          { my( $handled_elt, $handlers_to_trigger)= @$elt_handlers;
2369            foreach my $handler ( @$handlers_to_trigger)
2370              { local $_= $handled_elt; $handler->($t, $handled_elt) || last; }
2371          }
2372      }
2373    return;
2374  }
2375
2376# return the list of handler that can be activated for an element
2377# (either of CODE ref's or 1's for twig_roots)
2378
2379sub _handler
2380  { my( $t, $handlers, $gi)= @_;
2381
2382    my @found_handlers=();
2383    my $found_handler;
2384
2385    foreach my $handler ( map { @$_ } grep { $_ } $handlers->{xpath_handler}->{$gi}, $handlers->{xpath_handler}->{'*'})
2386      {  my $trigger= $handler->{trigger};
2387         if( my $found_path= $trigger->( $t->{_twig_context_stack}))
2388          { my $found_handler= $handler->{handler};
2389            push @found_handlers, $found_handler;
2390          }
2391      }
2392
2393    # if no handler found call default handler if defined
2394    if( !@found_handlers && defined $handlers->{handlers}->{$DEFAULT})
2395      { push @found_handlers, $handlers->{handlers}->{$DEFAULT}; }
2396
2397    if( @found_handlers and $t->{twig_do_not_chain_handlers})
2398      { @found_handlers= ($found_handlers[0]); }
2399
2400    return @found_handlers; # empty if no handler found
2401
2402  }
2403
2404
2405sub _replace_prefix
2406  { my( $t, $name)= @_;
2407    my $p= $t->parser;
2408    my $uri= $p->namespace( $name);
2409    # try to get the namespace from default if none is found (for attributes)
2410    # this should probably be an option
2411    if( !$uri and( $name!~/^xml/)) { $uri= $p->expand_ns_prefix( '#default'); }
2412    if( $uri)
2413      { if (my $mapped_prefix= $t->{twig_map_xmlns}->{$uri} || $DEFAULT_URI2NS{$uri})
2414          { return "$mapped_prefix:$name"; }
2415        else
2416          { my $prefix= _a_proper_ns_prefix( $p, $uri);
2417            if( $prefix eq '#default') { $prefix=''; }
2418            return $prefix ? "$prefix:$name" : $name;
2419          }
2420      }
2421    else
2422      { return $name; }
2423  }
2424
2425
2426sub _twig_char
2427   { # warn " in _twig_char...\n"; # DEBUG handler
2428
2429    my ($p, $string)= @_;
2430    my $t=$p->{twig};
2431
2432    if( $t->{twig_keep_encoding})
2433      { if( !$t->{twig_in_cdata})
2434          { $string= $p->original_string(); }
2435        else
2436          {
2437            use bytes; # > perl 5.5
2438            if( length( $string) < 1024)
2439              { $string= $p->original_string(); }
2440            else
2441              { #warn "dodgy case";
2442                # TODO original_string does not hold the entire string, but $string is wrong
2443                # I believe due to a bug in XML::Parser
2444                # for now, we use the original string, even if it means that it's been converted to utf8
2445              }
2446          }
2447      }
2448
2449    if( $t->{twig_input_filter}) { $string= $t->{twig_input_filter}->( $string); }
2450    if( $t->{twig_char_handler}) { $string= $t->{twig_char_handler}->( $string); }
2451
2452    my $elt= $t->{twig_current};
2453
2454    if(    $t->{twig_in_cdata})
2455      { # text is the continuation of a previously created cdata
2456        $elt->{cdata}.=  $t->{twig_stored_spaces} . $string;
2457      }
2458    elsif( $t->{twig_in_pcdata})
2459      { # text is the continuation of a previously created pcdata
2460        if( $t->{extra_data})
2461          { $elt->_push_extra_data_in_pcdata( $t->{extra_data}, length( $elt->{pcdata}));
2462            $t->{extra_data}='';
2463          }
2464        $elt->{pcdata}.=  $string;
2465      }
2466    else
2467      {
2468        # text is just space, which might be discarded later
2469        if( $string=~/\A\s*\Z/s)
2470          {
2471            if( $t->{extra_data})
2472              { # we got extra data (comment, pi), lets add the spaces to it
2473                $t->{extra_data} .= $string;
2474              }
2475            else
2476              { # no extra data, just store the spaces
2477                $t->{twig_stored_spaces}.= $string;
2478              }
2479          }
2480        else
2481          { my $new_elt= _insert_pcdata( $t, $t->{twig_stored_spaces}.$string);
2482            delete $elt->{'twig_current'};
2483            $new_elt->{'twig_current'}=1;
2484            $t->{twig_current}= $new_elt;
2485            $t->{twig_in_pcdata}=1;
2486            if( $t->{extra_data})
2487              { $new_elt->_push_extra_data_in_pcdata( $t->{extra_data}, 0);
2488                $t->{extra_data}='';
2489              }
2490          }
2491      }
2492    return;
2493  }
2494
2495sub _twig_cdatastart
2496   { # warn " in _twig_cdatastart...\n"; # DEBUG handler
2497
2498    my $p= shift;
2499    my $t=$p->{twig};
2500
2501    $t->{twig_in_cdata}=1;
2502    my $cdata=  $t->{twig_elt_class}->new( $CDATA);
2503    my $twig_current= $t->{twig_current};
2504
2505    if( $t->{twig_in_pcdata})
2506      { # create the node as a sibling of the PCDATA
2507        $cdata->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2508        $twig_current->{next_sibling}=  $cdata;
2509        my $parent= $twig_current->{parent};
2510        $cdata->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2511         $parent->{empty}=0; $parent->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2512        $t->{twig_in_pcdata}=0;
2513      }
2514    else
2515      { # we have to create a PCDATA element if we need to store spaces
2516        if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2517          { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2518        $t->{twig_stored_spaces}='';
2519
2520        # create the node as a child of the current element
2521        $cdata->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $cdata->{parent});} ;
2522        if( my $prev_sibling= $twig_current->{last_child})
2523          { $cdata->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $cdata->{prev_sibling});} ;
2524            $prev_sibling->{next_sibling}=  $cdata;
2525          }
2526        else
2527          { $twig_current->{first_child}=  $cdata; }
2528         $twig_current->{empty}=0; $twig_current->{last_child}=$cdata; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ;
2529
2530      }
2531
2532    delete $twig_current->{'twig_current'};
2533    $t->{twig_current}= $cdata;
2534    $cdata->{'twig_current'}=1;
2535    if( $t->{extra_data}) { $cdata->set_extra_data( $t->{extra_data}); $t->{extra_data}='' };
2536    return;
2537  }
2538
2539sub _twig_cdataend
2540   { # warn " in _twig_cdataend...\n"; # DEBUG handler
2541
2542    my $p= shift;
2543    my $t=$p->{twig};
2544
2545    $t->{twig_in_cdata}=0;
2546
2547    my $elt= $t->{twig_current};
2548    delete $elt->{'twig_current'};
2549    my $cdata= $elt->{cdata};
2550    $elt->_set_cdata( $cdata);
2551
2552    push @{$t->{_twig_context_stack}}, { $ST_TAG => $CDATA };
2553
2554    if( $t->{twig_handlers})
2555      { # look for handlers
2556        my @handlers= _handler( $t, $t->{twig_handlers}, $CDATA);
2557        local $_= $elt; # so we can use $_ in the handlers
2558        foreach my $handler ( @handlers) { $handler->($t, $elt) || last; }
2559      }
2560
2561    pop @{$t->{_twig_context_stack}};
2562
2563    $elt= $elt->{parent};
2564    $t->{twig_current}= $elt;
2565    $elt->{'twig_current'}=1;
2566
2567    $t->{twig_long_cdata}=0;
2568    return;
2569  }
2570
2571sub _pi_elt_handlers
2572  { my( $t, $pi)= @_;
2573    my $pi_handlers= $t->{twig_handlers}->{pi_handlers} || return;
2574    foreach my $handler ( $pi_handlers->{$pi->{target}}, $pi_handlers->{''})
2575      { if( $handler) { local $_= $pi; $handler->( $t, $pi) || last; } }
2576  }
2577
2578sub _pi_text_handler
2579  { my( $t, $target, $data)= @_;
2580    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{$target})
2581      { return $handler->( $t, $target, $data); }
2582    if( my $handler= $t->{twig_handlers}->{pi_handlers}->{''})
2583      { return $handler->( $t, $target, $data); }
2584    return defined( $data) && $data ne ''  ? "<?$target $data?>" : "<?$target?>" ;
2585  }
2586
2587sub _comment_elt_handler
2588  { my( $t, $comment)= @_;
2589    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2590      { local $_= $comment; $handler->($t, $comment); }
2591  }
2592
2593sub _comment_text_handler
2594  { my( $t, $comment)= @_;
2595    if( my $handler= $t->{twig_handlers}->{handlers}->{$COMMENT})
2596      { $comment= $handler->($t, $comment);
2597        if( !defined $comment || $comment eq '') { return ''; }
2598      }
2599    return "<!--$comment-->";
2600  }
2601
2602
2603
2604sub _twig_comment
2605   { # warn " in _twig_comment...\n"; # DEBUG handler
2606
2607    my( $p, $comment_text)= @_;
2608    my $t=$p->{twig};
2609
2610    if( $t->{twig_keep_encoding}) { $comment_text= substr( $p->original_string(), 4, -3); }
2611
2612    $t->_twig_pi_comment( $p, $COMMENT, $t->{twig_keep_comments}, $t->{twig_process_comments},
2613                          '_set_comment', '_comment_elt_handler', '_comment_text_handler', $comment_text
2614                        );
2615    return;
2616  }
2617
2618sub _twig_pi
2619   { # warn " in _twig_pi...\n"; # DEBUG handler
2620
2621    my( $p, $target, $data)= @_;
2622    my $t=$p->{twig};
2623
2624    if( $t->{twig_keep_encoding})
2625      { my $pi_text= substr( $p->original_string(), 2, -2);
2626        ($target, $data)= split( /\s+/, $pi_text, 2);
2627      }
2628
2629    $t->_twig_pi_comment( $p, $PI, $t->{twig_keep_pi}, $t->{twig_process_pi},
2630                          '_set_pi', '_pi_elt_handlers', '_pi_text_handler', $target, $data
2631                        );
2632    return;
2633  }
2634
2635sub _twig_pi_comment
2636  { my( $t, $p, $type, $keep, $process, $set, $elt_handler, $text_handler, @parser_args)= @_;
2637
2638    if( $t->{twig_input_filter})
2639          { foreach my $arg (@parser_args) { $arg= $t->{twig_input_filter}->( $arg); } }
2640
2641    # if pi/comments are to be kept then we piggyback them to the current element
2642    if( $keep)
2643      { # first add spaces
2644        if( $t->{twig_stored_spaces})
2645              { $t->{extra_data}.= $t->{twig_stored_spaces};
2646                $t->{twig_stored_spaces}= '';
2647              }
2648
2649        my $extra_data= $t->$text_handler( @parser_args);
2650        $t->{extra_data}.= $extra_data;
2651
2652      }
2653    elsif( $process)
2654      {
2655        my $twig_current= $t->{twig_current}; # defined unless we are outside of the root
2656
2657        my $elt= $t->{twig_elt_class}->new( $type);
2658        $elt->$set( @parser_args);
2659        if( $t->{extra_data})
2660          { $elt->set_extra_data( $t->{extra_data});
2661            $t->{extra_data}='';
2662          }
2663
2664        unless( $t->root)
2665          { $t->_add_cpi_outside_of_root( leading_cpi => $elt);
2666          }
2667        elsif( $t->{twig_in_pcdata})
2668          { # create the node as a sibling of the PCDATA
2669            $elt->paste_after( $twig_current);
2670            $t->{twig_in_pcdata}=0;
2671          }
2672        elsif( $twig_current)
2673          { # we have to create a PCDATA element if we need to store spaces
2674            if( $t->_space_policy($XML::Twig::index2gi[$twig_current->{'gi'}]) && $t->{twig_stored_spaces})
2675              { _insert_pcdata( $t, $t->{twig_stored_spaces}); }
2676            $t->{twig_stored_spaces}='';
2677            # create the node as a child of the current element
2678            $elt->paste_last_child( $twig_current);
2679          }
2680        else
2681          { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); }
2682
2683        if( $twig_current)
2684          { delete $twig_current->{'twig_current'};
2685            my $parent= $elt->{parent};
2686            $t->{twig_current}= $parent;
2687            $parent->{'twig_current'}=1;
2688          }
2689
2690        $t->$elt_handler( $elt);
2691      }
2692
2693  }
2694
2695
2696# add a comment or pi before the first element
2697sub _add_cpi_outside_of_root
2698  { my($t, $type, $elt)= @_; # $type is 'leading_cpi' or 'trailing_cpi'
2699    $t->{$type} ||= $t->{twig_elt_class}->new( '#CPI');
2700    # create the node as a child of the current element
2701    $elt->paste_last_child( $t->{$type});
2702    return $t;
2703  }
2704
2705sub _twig_final
2706   { # warn " in _twig_final...\n"; # DEBUG handler
2707
2708    my $p= shift;
2709    my $t= $p->isa( 'XML::Twig') ? $p : $p->{twig};
2710
2711    # store trailing data
2712    if( $t->{extra_data}) { $t->{trailing_cpi_text} = $t->{extra_data}; $t->{extra_data}=''; }
2713    $t->{trailing_spaces}= $t->{twig_stored_spaces} || '';
2714    my $s=  $t->{twig_stored_spaces}; $s=~s{\n}{\\n}g;
2715    if( $t->{twig_stored_spaces}) { my $s=  $t->{twig_stored_spaces}; }
2716
2717    # restore the selected filehandle if needed
2718    $t->_set_fh_to_selected_fh();
2719
2720    $t->_trigger_tdh if( $t->{twig_tdh});
2721
2722    select $t->{twig_original_selected_fh} if($t->{twig_original_selected_fh}); # probably dodgy
2723
2724    if( exists $t->{twig_autoflush_data})
2725      { my @args;
2726        push @args,  $t->{twig_autoflush_data}->{fh}      if( $t->{twig_autoflush_data}->{fh});
2727        push @args,  @{$t->{twig_autoflush_data}->{args}} if( $t->{twig_autoflush_data}->{args});
2728        $t->flush( @args);
2729        delete $t->{twig_autoflush_data};
2730        $t->root->delete if $t->root;
2731      }
2732
2733    # tries to clean-up (probably not very well at the moment)
2734    #undef $p->{twig};
2735    undef $t->{twig_parser};
2736    delete $t->{twig_parsing};
2737    @{$t}{ qw( twig_parser twig_parsing _twig_context_stack twig_current) }=();
2738
2739    return $t;
2740  }
2741
2742sub _insert_pcdata
2743  { my( $t, $string)= @_;
2744    # create a new PCDATA element
2745    my $parent= $t->{twig_current};    # always defined
2746    my $elt;
2747    if( exists $t->{twig_alt_elt_class})
2748      { $elt=  $t->{twig_elt_class}->new( $PCDATA);
2749        $elt->_set_pcdata( $string);
2750      }
2751    else
2752      { $elt= bless( { gi => $XML::Twig::gi2index{$PCDATA}, pcdata => $string }, 'XML::Twig::Elt'); }
2753
2754    my $prev_sibling= $parent->{last_child};
2755    if( $prev_sibling)
2756      { $prev_sibling->{next_sibling}=  $elt;
2757        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
2758      }
2759    else
2760      { $parent->{first_child}=  $elt; }
2761
2762    $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
2763     $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
2764    $t->{twig_stored_spaces}='';
2765    return $elt;
2766  }
2767
2768sub _space_policy
2769  { my( $t, $gi)= @_;
2770    my $policy;
2771    $policy=0 if( $t->{twig_discard_spaces});
2772    $policy=1 if( $t->{twig_keep_spaces});
2773    $policy=1 if( $t->{twig_keep_spaces_in}
2774               && $t->{twig_keep_spaces_in}->{$gi});
2775    $policy=0 if( $t->{twig_discard_spaces_in}
2776               && $t->{twig_discard_spaces_in}->{$gi});
2777    return $policy;
2778  }
2779
2780
2781sub _twig_entity
2782   { # warn " in _twig_entity...\n"; # DEBUG handler
2783    my( $p, $name, $val, $sysid, $pubid, $ndata, $param)= @_;
2784    my $t=$p->{twig};
2785
2786    #{ no warnings; my $base= $p->base; warn "_twig_entity called: expand: '$t->{twig_expand_external_ents}', base: '$base', name: '$name', val: '$val', sysid: '$sysid', pubid: '$pubid', ndata: '$ndata', param: '$param'\n";}
2787
2788    my $missing_entity=0;
2789
2790    if( $sysid)
2791      { if($ndata)
2792          { if( ! -f _based_filename( $sysid, $p->base)) { $missing_entity= 1; }
2793          }
2794        else
2795          { if( $t->{twig_expand_external_ents})
2796              { $val= eval { _slurp_uri( $sysid, $p->base) };
2797                if( ! defined $val)
2798                  { if( $t->{twig_extern_ent_nofail})
2799                      { $missing_entity= 1; }
2800                    else
2801                      { _croak( "cannot load SYSTEM entity '$name' from '$sysid': $@", 3); }
2802                  }
2803              }
2804          }
2805      }
2806
2807    my $ent=XML::Twig::Entity->new( $name, $val, $sysid, $pubid, $ndata, $param);
2808    if( $missing_entity) { $t->{twig_missing_system_entities}->{$name}= $ent; }
2809
2810    my $entity_list= $t->entity_list;
2811    if( $entity_list) { $entity_list->add( $ent); }
2812
2813    if( $parser_version > 2.27)
2814      { # this is really ugly, but with some versions of XML::Parser the value
2815        # of the entity is not properly returned by the default handler
2816        my $ent_decl= $ent->text;
2817        if( $t->{twig_keep_encoding})
2818          { if( defined $ent->{val} && ($ent_decl !~ /["']/))
2819              { my $val=  $ent->{val};
2820                $ent_decl .= $val =~ /"/ ? qq{'$val' } : qq{"$val" };
2821              }
2822            # for my solaris box (perl 5.6.1, XML::Parser 2.31, expat?)
2823            $t->{twig_doctype}->{internal}=~ s{<!ENTITY\s+$name\s+$}{substr( $ent_decl, 0, -1)}e;
2824          }
2825        $t->{twig_doctype}->{internal} .= $ent_decl
2826          unless( $t->{twig_doctype}->{internal}=~ m{<!ENTITY\s+$name\s+});
2827      }
2828
2829    return;
2830  }
2831
2832
2833sub _twig_extern_ent
2834   { # warn " in _twig_extern_ent...I (", $_[0]->original_string, ")\n"; # DEBUG handler
2835    my( $p, $base, $sysid, $pubid)= @_;
2836    my $t= $p->{twig};
2837    if( $t->{twig_no_expand})
2838      { my $ent_name= $t->{twig_keep_encoding} ? $p->original_string : $p->recognized_string;
2839        _twig_insert_ent( $t, $ent_name);
2840        return '';
2841      }
2842    my $ent_content= eval { $t->{twig_ext_ent_handler}->( $p, $base, $sysid) };
2843    if( ! defined $ent_content)
2844      {
2845        my $ent_name = $p->recognized_string;
2846        my $file     =  _based_filename( $sysid, $base);
2847        my $error_message= "cannot expand $ent_name - cannot load '$file'";
2848        if( $t->{twig_extern_ent_nofail}) { return "<!-- $error_message -->"; }
2849        else                              { _croak( $error_message);   }
2850      }
2851    return $ent_content;
2852  }
2853
2854# I use this so I can change the $Carp::CarpLevel (which determines how many call frames to skip when reporting an error)
2855sub _croak
2856  { my( $message, $level)= @_;
2857    $Carp::CarpLevel= $level || 0;
2858    croak $message;
2859  }
2860
2861sub _twig_xmldecl
2862   { # warn " in _twig_xmldecl...\n"; # DEBUG handler
2863
2864    my $p= shift;
2865    my $t=$p->{twig};
2866    $t->{twig_xmldecl}||={};                 # could have been set by set_output_encoding
2867    $t->{twig_xmldecl}->{version}= shift;
2868    $t->{twig_xmldecl}->{encoding}= shift;
2869    $t->{twig_xmldecl}->{standalone}= shift;
2870    return;
2871  }
2872
2873sub _twig_doctype
2874   { # warn " in _twig_doctype...\n"; # DEBUG handler
2875    my( $p, $name, $sysid, $pub, $internal)= @_;
2876    my $t=$p->{twig};
2877    $t->{twig_doctype}||= {};                   # create
2878    $t->{twig_doctype}->{name}= $name;          # always there
2879    $t->{twig_doctype}->{sysid}= $sysid;        #
2880    $t->{twig_doctype}->{pub}= $pub;            #
2881
2882    # now let's try to cope with XML::Parser 2.28 and above
2883    if( $parser_version > 2.27)
2884      { @saved_default_handler= $p->setHandlers( Default     => \&_twig_store_internal_dtd,
2885                                                 Entity      => \&_twig_entity,
2886                                               );
2887      $p->setHandlers( DoctypeFin  => \&_twig_stop_storing_internal_dtd);
2888      $t->{twig_doctype}->{internal}='';
2889      }
2890    else
2891      # for XML::Parser before 2.28
2892      { $internal||='';
2893        $internal=~ s{^\s*\[}{};
2894        $internal=~ s{]\s*$}{};
2895        $t->{twig_doctype}->{internal}=$internal;
2896      }
2897
2898    # now check if we want to get the DTD info
2899    if( $t->{twig_read_external_dtd} && $sysid)
2900      { # let's build a fake document with an internal DTD
2901        my $dtd=  "<!DOCTYPE $name [" . _slurp_uri( $sysid) .  "]><$name/>";
2902
2903        $t->save_global_state();            # save the globals (they will be reset by the following new)
2904        my $t_dtd= XML::Twig->new( load_DTD => 1, ParseParamEnt => 1, error_context => $t->{ErrorContext} || 0);          # create a temp twig
2905        $t_dtd->parse( $dtd);               # parse it
2906        $t->{twig_dtd}= $t_dtd->{twig_dtd}; # grab the dtd info
2907        #$t->{twig_dtd_is_external}=1;
2908        $t->entity_list->_add_list( $t_dtd->entity_list) if( $t_dtd->entity_list); # grab the entity info
2909        $t->restore_global_state();
2910      }
2911    return;
2912  }
2913
2914sub _twig_element
2915   { # warn " in _twig_element...\n"; # DEBUG handler
2916
2917    my( $p, $name, $model)= @_;
2918    my $t=$p->{twig};
2919    $t->{twig_dtd}||= {};                      # may create the dtd
2920    $t->{twig_dtd}->{model}||= {};             # may create the model hash
2921    $t->{twig_dtd}->{elt_list}||= [];          # ordered list of elements
2922    push @{$t->{twig_dtd}->{elt_list}}, $name; # store the elt
2923    $t->{twig_dtd}->{model}->{$name}= $model;  # store the model
2924    if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2925      { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2926        unless( $text)
2927          { # this version of XML::Parser does not return the text in the *_string method
2928            # we need to rebuild it
2929            $text= "<!ELEMENT $name $model>";
2930          }
2931        $t->{twig_doctype}->{internal} .= $text;
2932      }
2933    return;
2934  }
2935
2936sub _twig_attlist
2937   { # warn " in _twig_attlist...\n"; # DEBUG handler
2938
2939    my( $p, $gi, $att, $type, $default, $fixed)= @_;
2940    #warn "in attlist: gi: '$gi', att: '$att', type: '$type', default: '$default', fixed: '$fixed'\n";
2941    my $t=$p->{twig};
2942    $t->{twig_dtd}||= {};                      # create dtd if need be
2943    $t->{twig_dtd}->{$gi}||= {};               # create elt if need be
2944    #$t->{twig_dtd}->{$gi}->{att}||= {};        # create att if need be
2945    if( ($parser_version > 2.27) && ($t->{twig_doctype}->{internal}=~ m{(^|>)\s*$}) )
2946      { my $text= $XML::Twig::Elt::keep_encoding ? $p->original_string : $p->recognized_string;
2947        unless( $text)
2948          { # this version of XML::Parser does not return the text in the *_string method
2949            # we need to rebuild it
2950            my $att_decl="$att $type";
2951            $att_decl .= " #FIXED"   if( $fixed);
2952            $att_decl .= " $default" if( defined $default);
2953            # 2 cases: there is already an attlist on that element or not
2954            if( $t->{twig_dtd}->{att}->{$gi})
2955              { # there is already an attlist, add to it
2956                $t->{twig_doctype}->{internal}=~ s{(<!ATTLIST\s*$gi )(.*?)\n?>}
2957                                                  { "$1$2\n" . ' ' x length( $1) . "$att_decl\n>"}es;
2958              }
2959            else
2960              { # create the attlist
2961                 $t->{twig_doctype}->{internal}.= "<!ATTLIST $gi $att_decl>"
2962              }
2963          }
2964      }
2965    $t->{twig_dtd}->{att}->{$gi}->{$att}= {} ;
2966    $t->{twig_dtd}->{att}->{$gi}->{$att}->{type}= $type;
2967    $t->{twig_dtd}->{att}->{$gi}->{$att}->{default}= $default if( defined $default);
2968    $t->{twig_dtd}->{att}->{$gi}->{$att}->{fixed}= $fixed;
2969    return;
2970  }
2971
2972sub _twig_default
2973   { # warn " in _twig_default...\n"; # DEBUG handler
2974
2975    my( $p, $string)= @_;
2976
2977    my $t= $p->{twig};
2978
2979    # we need to process the data in 2 cases: entity, or spaces after the closing tag
2980
2981    # after the closing tag (no twig_current and root has been created)
2982    if(  ! $t->{twig_current} && $t->{twig_root} && $string=~ m{^\s+$}m) { $t->{twig_stored_spaces} .= $string; }
2983
2984    # process only if we have an entity
2985    if( $string=~ m{^&([^;]*);$})
2986      { # the entity has to be pure pcdata, or we have a problem
2987        if( ($p->original_string=~ m{^<}) && ($p->original_string=~ m{>$}) )
2988          { # string is a tag, entity is in an attribute
2989            $t->{twig_entities_in_attribute}=1 if( $t->{twig_do_not_escape_amp_in_atts});
2990          }
2991        else
2992          { my $ent;
2993            if( $t->{twig_keep_encoding})
2994              { _twig_char( $p, $string);
2995                $ent= substr( $string, 1, -1);
2996              }
2997            else
2998              { $ent= _twig_insert_ent( $t, $string);
2999              }
3000
3001            return $ent;
3002          }
3003      }
3004  }
3005
3006sub _twig_insert_ent
3007  {
3008    my( $t, $string)=@_;
3009
3010    my $twig_current= $t->{twig_current};
3011
3012    my $ent=  $t->{twig_elt_class}->new( $ENT);
3013    $ent->{ent}=  $string;
3014
3015    _add_or_discard_stored_spaces( $t);
3016
3017    if( $t->{twig_in_pcdata})
3018      { # create the node as a sibling of the #PCDATA
3019
3020        $ent->{prev_sibling}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ;
3021        $twig_current->{next_sibling}=  $ent;
3022        my $parent= $twig_current->{parent};
3023        $ent->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ;
3024         $parent->{empty}=0; $parent->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
3025        # the twig_current is now the parent
3026        delete $twig_current->{'twig_current'};
3027        $t->{twig_current}= $parent;
3028        # we left pcdata
3029        $t->{twig_in_pcdata}=0;
3030      }
3031    else
3032      { # create the node as a child of the current element
3033        $ent->{parent}=$twig_current; if( $XML::Twig::weakrefs) { weaken( $ent->{parent});} ;
3034        if( my $prev_sibling= $twig_current->{last_child})
3035          { $ent->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $ent->{prev_sibling});} ;
3036            $prev_sibling->{next_sibling}=  $ent;
3037          }
3038        else
3039          { if( $twig_current) { $twig_current->{first_child}=  $ent; } }
3040        if( $twig_current) {  $twig_current->{empty}=0; $twig_current->{last_child}=$ent; if( $XML::Twig::weakrefs) { weaken( $twig_current->{last_child});} ; }
3041      }
3042
3043    # meant to trigger entity handler, does not seem to be activated at this time
3044    #if( my $handler= $t->{twig_handlers}->{gi}->{$ENT})
3045    #  { local $_= $ent; $handler->( $t, $ent); }
3046
3047    return $ent;
3048  }
3049
3050sub parser
3051  { return $_[0]->{twig_parser}; }
3052
3053# returns the declaration text (or a default one)
3054sub xmldecl
3055  { my $t= shift;
3056    return '' unless( $t->{twig_xmldecl} || $t->{output_encoding});
3057    my $decl_string;
3058    my $decl= $t->{twig_xmldecl};
3059    if( $decl)
3060      { my $version= $decl->{version};
3061        $decl_string= q{<?xml};
3062        $decl_string .= qq{ version="$version"};
3063
3064        # encoding can either have been set (in $decl->{output_encoding})
3065        # or come from the document (in $decl->{encoding})
3066        if( $t->{output_encoding})
3067          { my $encoding= $t->{output_encoding};
3068            $decl_string .= qq{ encoding="$encoding"};
3069          }
3070        elsif( $decl->{encoding})
3071          { my $encoding= $decl->{encoding};
3072            $decl_string .= qq{ encoding="$encoding"};
3073          }
3074
3075        if( defined( $decl->{standalone}))
3076          { $decl_string .= q{ standalone="};
3077            $decl_string .= $decl->{standalone} ? "yes" : "no";
3078            $decl_string .= q{"};
3079          }
3080
3081        $decl_string .= "?>\n";
3082      }
3083    else
3084      { my $encoding= $t->{output_encoding};
3085        $decl_string= qq{<?xml version="1.0" encoding="$encoding"?>};
3086      }
3087
3088    my $output_filter= XML::Twig::Elt::output_filter();
3089    return $output_filter ? $output_filter->( $decl_string) : $decl_string;
3090  }
3091
3092sub set_doctype
3093  { my( $t, $name, $system, $public, $internal)= @_;
3094    $t->{twig_doctype}= {} unless defined $t->{twig_doctype};
3095    my $doctype= $t->{twig_doctype};
3096    $doctype->{name}     = $name     if( defined $name);
3097    $doctype->{sysid}    = $system   if( defined $system);
3098    $doctype->{pub}      = $public   if( defined $public);
3099    $doctype->{internal} = $internal if( defined $internal);
3100  }
3101
3102sub doctype_name
3103  { my $t= shift;
3104    my $doctype= $t->{twig_doctype} or return '';
3105    return $doctype->{name} || '';
3106  }
3107
3108sub system_id
3109  { my $t= shift;
3110    my $doctype= $t->{twig_doctype} or return '';
3111    return $doctype->{sysid} || '';
3112  }
3113
3114sub public_id
3115  { my $t= shift;
3116    my $doctype= $t->{twig_doctype} or return '';
3117    return $doctype->{pub} || '';
3118  }
3119
3120sub internal_subset
3121  { my $t= shift;
3122    my $doctype= $t->{twig_doctype} or return '';
3123    return $doctype->{internal} || '';
3124  }
3125
3126# return the dtd object
3127sub dtd
3128  { my $t= shift;
3129    return $t->{twig_dtd};
3130  }
3131
3132# return an element model, or the list of element models
3133sub model
3134  { my $t= shift;
3135    my $elt= shift;
3136    return $t->dtd->{model}->{$elt} if( $elt);
3137    return (sort keys %{$t->dtd->{model}});
3138  }
3139
3140
3141# return the entity_list object
3142sub entity_list
3143  { my $t= shift;
3144    return $t->{twig_entity_list};
3145  }
3146
3147# return the list of entity names
3148sub entity_names
3149  { my $t= shift;
3150    return $t->entity_list->entity_names;
3151  }
3152
3153# return the entity object
3154sub entity
3155  { my $t= shift;
3156    my $entity_name= shift;
3157    return $t->entity_list->ent( $entity_name);
3158  }
3159
3160
3161sub print_prolog
3162  { my $t= shift;
3163    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : $t->{twig_output_fh} || select() || \*STDOUT;
3164    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3165    no strict 'refs';
3166    print {$fh} $t->prolog( @_);
3167  }
3168
3169sub prolog
3170  { my $t= shift;
3171    if( $t->{no_prolog}){ return ''; }
3172
3173    return   $t->{no_prolog}             ? ''
3174           : defined $t->{no_dtd_output} ? $t->xmldecl
3175           :                               $t->xmldecl . $t->doctype( @_);
3176  }
3177
3178sub doctype
3179  { my $t= shift;
3180    my %args= _normalize_args( @_);
3181    my $update_dtd = $args{UpdateDTD} || '';
3182    my $doctype_text='';
3183
3184    my $doctype= $t->{twig_doctype};
3185
3186    if( $doctype)
3187      { $doctype_text .= qq{<!DOCTYPE $doctype->{name}} if( $doctype->{name});
3188        $doctype_text .= qq{ PUBLIC "$doctype->{pub}"}  if( $doctype->{pub});
3189        $doctype_text .= qq{ SYSTEM}                    if( $doctype->{sysid} && !$doctype->{pub});
3190        $doctype_text .= qq{ "$doctype->{sysid}"}       if( $doctype->{sysid});
3191      }
3192
3193    if( $update_dtd)
3194      { if( $doctype)
3195          { my $internal=$doctype->{internal};
3196            # awful hack, but at least it works a little better that what was there before
3197            if( $internal)
3198              { # remove entity declarations (they will be re-generated from the updated entity list)
3199                $internal=~ s{<! \s* ENTITY \s+ $REG_TAG_NAME \s+ ( ("[^"]*"|'[^']*') \s* | SYSTEM [^>]*) >\s*}{}xg;
3200                $internal=~ s{^\n}{};
3201              }
3202            $internal .= $t->entity_list->text ||'' if( $t->entity_list);
3203            if( $internal) { $doctype_text .= "[\n$internal]>\n"; }
3204          }
3205        elsif( !$t->{'twig_dtd'} && keys %{$t->entity_list})
3206          { $doctype_text .= "<!DOCTYPE " . $t->root->gi . " [\n" . $t->entity_list->text . "\n]>";;}
3207        else
3208          { $doctype_text= $t->{twig_dtd};
3209            $doctype_text .= $t->dtd_text;
3210          }
3211      }
3212    elsif( $doctype)
3213      { if( my $internal= $doctype->{internal})
3214          { # add opening and closing brackets if not already there
3215            # plus some spaces and newlines for a nice formating
3216            # I test it here because I can't remember which version of
3217            # XML::Parser need it or not, nor guess which one will in the
3218            # future, so this about the best I can do
3219            $internal=~ s{^\s*(\[\s*)?}{ [\n};
3220            $internal=~ s{\s*(\]\s*(>\s*)?)?\s*$}{\n]>\n};
3221            $doctype_text .=  $internal;
3222          }
3223      }
3224
3225    if( $doctype_text)
3226      {
3227        # terrible hack, as I can't figure out in which case the darn prolog
3228        # should get an extra > (depends on XML::Parser and expat versions)
3229        $doctype_text=~ s/(>\s*)*$/>\n/; # if($doctype_text);
3230
3231        my $output_filter= XML::Twig::Elt::output_filter();
3232        return $output_filter ? $output_filter->( $doctype_text) : $doctype_text;
3233      }
3234    else
3235      { return $doctype_text; }
3236  }
3237
3238sub _leading_cpi
3239  { my $t= shift;
3240    my $leading_cpi= $t->{leading_cpi} || return '';
3241    return $leading_cpi->sprint( 1);
3242  }
3243
3244sub _trailing_cpi
3245  { my $t= shift;
3246    my $trailing_cpi= $t->{trailing_cpi} || return '';
3247    return $trailing_cpi->sprint( 1);
3248  }
3249
3250sub _trailing_cpi_text
3251  { my $t= shift;
3252    return $t->{trailing_cpi_text} || '';
3253  }
3254
3255sub print_to_file
3256  { my( $t, $filename)= (shift, shift);
3257    my $out_fh;
3258#    open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!");     # < perl 5.8
3259    my $mode= $t->{twig_keep_encoding} ? '>' : '>:utf8';                             # >= perl 5.8
3260    open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8
3261    $t->print( $out_fh, @_);
3262    close $out_fh;
3263    return $t;
3264  }
3265
3266# probably only works on *nix (at least the chmod bit)
3267# first print to a temporary file, then rename that file to the desired file name, then change permissions
3268# to the original file permissions (or to the current umask)
3269sub safe_print_to_file
3270  { my( $t, $filename)= (shift, shift);
3271    my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ;
3272    XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n";
3273    my $tmpdir= dirname( $filename);
3274    my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir);
3275    $t->print_to_file( $tmpfilename, @_);
3276    rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!");
3277    chmod $perm, $filename;
3278    return $t;
3279  }
3280
3281
3282sub print
3283  { my $t= shift;
3284    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : undef;
3285    my %args= _normalize_args( @_);
3286
3287    my $old_select    = defined $fh                  ? select $fh                                 : undef;
3288    my $old_pretty    = defined ($args{PrettyPrint}) ? $t->set_pretty_print( $args{PrettyPrint})  : undef;
3289    my $old_empty_tag = defined ($args{EmptyTags})   ? $t->set_empty_tag_style( $args{EmptyTags}) : undef;
3290
3291    #if( !$t->{encoding} || lc( $t->{encoding}) eq 'utf-8') { my $out= $fh || \*STDOUT; binmode $out, ':utf8'; }
3292
3293    if( $perl_version > 5.006 && ! $t->{twig_keep_encoding})
3294      { if( grep /useperlio=define/, `$^X -V`) # we can only use binmode :utf8 if perl was compiled with useperlio
3295          { binmode( $fh || \*STDOUT, ":utf8" ); }
3296      }
3297
3298     print  $t->prolog( %args) . $t->_leading_cpi( %args);
3299     $t->{twig_root}->print;
3300     print $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3301         . $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3302         . ( ($t->{twig_keep_spaces}||'') && ($t->{trailing_spaces} || ''))
3303         ;
3304
3305
3306    $t->set_pretty_print( $old_pretty)       if( defined $old_pretty);
3307    $t->set_empty_tag_style( $old_empty_tag) if( defined $old_empty_tag);
3308    if( $fh) { select $old_select; }
3309
3310    return $t;
3311  }
3312
3313
3314sub flush
3315  { my $t= shift;
3316
3317    $t->_trigger_tdh if $t->{twig_tdh};
3318
3319    return if( $t->{twig_completely_flushed});
3320
3321    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
3322    my $old_select= defined $fh ? select $fh : undef;
3323    my $up_to= ref $_[0] ? shift : undef;
3324    my %args= _normalize_args( @_);
3325
3326    my $old_pretty;
3327    if( defined $args{PrettyPrint})
3328      { $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
3329        delete $args{PrettyPrint};
3330      }
3331
3332     my $old_empty_tag_style;
3333     if( $args{EmptyTags})
3334      { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
3335        delete $args{EmptyTags};
3336      }
3337
3338
3339    # the "real" last element processed, as _twig_end has closed it
3340    my $last_elt;
3341    my $flush_trailing_data=0;
3342    if( $up_to)
3343      { $last_elt= $up_to; }
3344    elsif( $t->{twig_current})
3345      { $last_elt= $t->{twig_current}->_last_child; }
3346    else
3347      { $last_elt= $t->{twig_root};
3348        $flush_trailing_data=1;
3349        $t->{twig_completely_flushed}=1;
3350      }
3351
3352    # flush the DTD unless it has ready flushed (ie root has been flushed)
3353    my $elt= $t->{twig_root};
3354    unless( $elt->_flushed)
3355      { # store flush info so we can auto-flush later
3356        if( $t->{twig_autoflush})
3357          { $t->{twig_autoflush_data}={};
3358            $t->{twig_autoflush_data}->{fh}   = $fh  if( $fh);
3359            $t->{twig_autoflush_data}->{args} = \@_  if( @_);
3360          }
3361        $t->print_prolog( %args);
3362        print $t->_leading_cpi;
3363      }
3364
3365    while( $elt)
3366      { my $next_elt;
3367        if( $last_elt && $last_elt->in( $elt))
3368          {
3369            unless( $elt->_flushed)
3370              { # just output the front tag
3371                print $elt->start_tag();
3372                $elt->_set_flushed;
3373              }
3374            $next_elt= $elt->{first_child};
3375          }
3376        else
3377          { # an element before the last one or the last one,
3378            $next_elt= $elt->{next_sibling};
3379            $elt->_flush();
3380            $elt->delete;
3381            last if( $last_elt && ($elt == $last_elt));
3382          }
3383        $elt= $next_elt;
3384      }
3385
3386    if( $flush_trailing_data)
3387      { print $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3388            , $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3389      }
3390
3391    select $old_select if( defined $old_select);
3392    $t->set_pretty_print( $old_pretty) if( defined $old_pretty);
3393    $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
3394
3395    if( my $ids= $t->{twig_id_list})
3396      { while( my ($id, $elt)= each %$ids)
3397          { if( ! defined $elt)
3398             { delete $t->{twig_id_list}->{$id} }
3399          }
3400      }
3401
3402    return $t;
3403  }
3404
3405
3406# flushes up to an element
3407# this method just reorders the arguments and calls flush
3408sub flush_up_to
3409  { my $t= shift;
3410    my $up_to= shift;
3411    if( isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar'))
3412      { my $fh=  shift;
3413        $t->flush( $fh, $up_to, @_);
3414      }
3415    else
3416      { $t->flush( $up_to, @_); }
3417
3418    return $t;
3419  }
3420
3421
3422# same as print except the entire document text is returned as a string
3423sub sprint
3424  { my $t= shift;
3425    my %args= _normalize_args( @_);
3426
3427    my $old_pretty;
3428    if( defined $args{PrettyPrint})
3429      { $old_pretty= $t->set_pretty_print( $args{PrettyPrint});
3430        delete $args{PrettyPrint};
3431      }
3432
3433     my $old_empty_tag_style;
3434     if( defined $args{EmptyTags})
3435      { $old_empty_tag_style= $t->set_empty_tag_style( $args{EmptyTags});
3436        delete $args{EmptyTags};
3437      }
3438
3439    my $string=   $t->prolog( %args)       # xml declaration and doctype
3440                . $t->_leading_cpi( %args) # leading comments and pi's in 'process' mode
3441                . ( ($t->{twig_root} && $t->{twig_root}->sprint) || '')
3442                . $t->_trailing_cpi        # trailing comments and pi's (elements, in 'process' mode)
3443                . $t->_trailing_cpi_text   # trailing comments and pi's (in 'keep' mode)
3444                ;
3445    if( $t->{twig_keep_spaces} && $t->{trailing_spaces}) { $string .= $t->{trailing_spaces}; }
3446
3447    $t->set_pretty_print( $old_pretty) if( defined $old_pretty);
3448    $t->set_empty_tag_style( $old_empty_tag_style) if( defined $old_empty_tag_style);
3449
3450    return $string;
3451  }
3452
3453
3454# this method discards useless elements in a tree
3455# it does the same thing as a flush except it does not print it
3456# the second argument is an element, the last purged element
3457# (this argument is usually set through the purge_up_to method)
3458sub purge
3459  { my $t= shift;
3460    my $up_to= shift;
3461
3462    $t->_trigger_tdh if $t->{twig_tdh};
3463
3464    # the "real" last element processed, as _twig_end has closed it
3465    my $last_elt;
3466    if( $up_to)
3467      { $last_elt= $up_to; }
3468    elsif( $t->{twig_current})
3469      { $last_elt= $t->{twig_current}->_last_child; }
3470    else
3471      { $last_elt= $t->{twig_root}; }
3472
3473    my $elt= $t->{twig_root};
3474
3475    while( $elt)
3476      { my $next_elt;
3477        if( $last_elt && $last_elt->in( $elt))
3478          { $elt->_set_flushed;
3479            $next_elt= $elt->{first_child};
3480          }
3481        else
3482          { # an element before the last one or the last one,
3483            $next_elt= $elt->{next_sibling};
3484            $elt->delete;
3485            last if( $last_elt && ($elt == $last_elt) );
3486          }
3487        $elt= $next_elt;
3488      }
3489
3490    if( my $ids= $t->{twig_id_list})
3491      { while( my ($id, $elt)= each %$ids) { if( ! defined $elt) { delete $t->{twig_id_list}->{$id} } } }
3492
3493    return $t;
3494  }
3495
3496# flushes up to an element. This method just calls purge
3497sub purge_up_to
3498  { my $t= shift;
3499    return $t->purge( @_);
3500  }
3501
3502sub root
3503  { return $_[0]->{twig_root}; }
3504
3505sub normalize
3506  { return $_[0]->root->normalize; }
3507
3508
3509# create accessor methods on attribute names
3510{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3511sub att_accessors
3512  {
3513    my $twig_or_class= shift;
3514    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3515                                      : 'XML::Twig::Elt'
3516                                      ;
3517    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3518    no strict 'refs';
3519    foreach my $att (@_)
3520      { _croak( "attempt to redefine existing method $att using att_accessors")
3521          if( $elt_class->can( $att) && !$accessor{$att});
3522
3523        if( !$accessor{$att})
3524          { *{"$elt_class\::$att"}=
3525                sub
3526                    :lvalue                                  # > perl 5.5
3527                  { my $elt= shift;
3528                    if( @_) { $elt->{att}->{$att}= $_[0]; }
3529                    $elt->{att}->{$att};
3530                  };
3531            $accessor{$att}=1;
3532          }
3533      }
3534    return $twig_or_class;
3535  }
3536}
3537
3538{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3539sub elt_accessors
3540  {
3541    my $twig_or_class= shift;
3542    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3543                                      : 'XML::Twig::Elt'
3544                                      ;
3545
3546    # if arg is a hash ref, it's exp => name, otherwise it's a list of tags
3547    my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]}
3548                                                         : map { $_ => $_ } @_;
3549    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3550    no strict 'refs';
3551    while( my( $alias, $exp)= each %exp_to_alias )
3552      { if( $elt_class->can( $alias) && !$accessor{$alias})
3553          { _croak( "attempt to redefine existing method $alias using elt_accessors"); }
3554
3555        if( !$accessor{$alias})
3556          { *{"$elt_class\::$alias"}=
3557                sub
3558                  { my $elt= shift;
3559                    return wantarray ? $elt->children( $exp) : $elt->first_child( $exp);
3560                  };
3561            $accessor{$alias}=1;
3562          }
3563      }
3564    return $twig_or_class;
3565  }
3566}
3567
3568{ my %accessor; # memorize accessor names so re-creating them won't trigger an error
3569sub field_accessors
3570  {
3571    my $twig_or_class= shift;
3572    my $elt_class= ref $twig_or_class ? $twig_or_class->{twig_elt_class}
3573                                      : 'XML::Twig::Elt'
3574                                      ;
3575    my %exp_to_alias= ref( $_[0]) && isa( $_[0], 'HASH') ? %{$_[0]}
3576                                                         : map { $_ => $_ } @_;
3577
3578    ## no critic (TestingAndDebugging::ProhibitNoStrict);
3579    no strict 'refs';
3580    while( my( $alias, $exp)= each %exp_to_alias )
3581      { if( $elt_class->can( $alias) && !$accessor{$alias})
3582          { _croak( "attempt to redefine existing method $exp using field_accessors"); }
3583        if( !$accessor{$alias})
3584          { *{"$elt_class\::$alias"}=
3585                sub
3586                  { my $elt= shift;
3587                    $elt->field( $exp)
3588                  };
3589            $accessor{$alias}=1;
3590          }
3591      }
3592    return $twig_or_class;
3593  }
3594}
3595
3596sub first_elt
3597  { my( $t, $cond)= @_;
3598    my $root= $t->root || return undef;
3599    return $root if( $root->passes( $cond));
3600    return $root->next_elt( $cond);
3601  }
3602
3603sub last_elt
3604  { my( $t, $cond)= @_;
3605    my $root= $t->root || return undef;
3606    return $root->last_descendant( $cond);
3607  }
3608
3609sub next_n_elt
3610  { my( $t, $offset, $cond)= @_;
3611    $offset -- if( $t->root->matches( $cond) );
3612    return $t->root->next_n_elt( $offset, $cond);
3613  }
3614
3615sub get_xpath
3616  { my $twig= shift;
3617    if( isa( $_[0], 'ARRAY'))
3618      { my $elt_array= shift;
3619        return _unique_elts( map { $_->get_xpath( @_) } @$elt_array);
3620      }
3621    else
3622      { return $twig->root->get_xpath( @_); }
3623  }
3624
3625# get a list of elts and return a sorted list of unique elts
3626sub _unique_elts
3627  { my @sorted= sort { $a ->cmp( $b) } @_;
3628    my @unique;
3629    while( my $current= shift @sorted)
3630      { push @unique, $current unless( @unique && ($unique[-1] == $current)); }
3631    return @unique;
3632  }
3633
3634sub findvalue
3635  { my $twig= shift;
3636    if( isa( $_[0], 'ARRAY'))
3637      { my $elt_array= shift;
3638        return join( '', map { $_->findvalue( @_) } @$elt_array);
3639      }
3640    else
3641      { return $twig->root->findvalue( @_); }
3642  }
3643
3644sub findvalues
3645  { my $twig= shift;
3646    if( isa( $_[0], 'ARRAY'))
3647      { my $elt_array= shift;
3648        return map { $_->findvalues( @_) } @$elt_array;
3649      }
3650    else
3651      { return $twig->root->findvalues( @_); }
3652  }
3653
3654sub set_id_seed
3655  { my $t= shift;
3656    XML::Twig::Elt->set_id_seed( @_);
3657    return $t;
3658  }
3659
3660# return an array ref to an index, or undef
3661sub index
3662  { my( $twig, $name, $index)= @_;
3663    return defined( $index) ? $twig->{_twig_index}->{$name}->[$index] : $twig->{_twig_index}->{$name};
3664  }
3665
3666# return a list with just the root
3667# if a condition is given then return an empty list unless the root matches
3668sub children
3669  { my( $t, $cond)= @_;
3670    my $root= $t->root;
3671    unless( $cond && !($root->passes( $cond)) )
3672      { return ($root); }
3673    else
3674      { return (); }
3675  }
3676
3677sub _children { return ($_[0]->root); }
3678
3679# weird, but here for completude
3680# used to solve (non-sensical) /doc[1] XPath queries
3681sub child
3682  { my $t= shift;
3683    my $nb= shift;
3684    return ($t->children( @_))[$nb];
3685  }
3686
3687sub descendants
3688  { my( $t, $cond)= @_;
3689    my $root= $t->root;
3690    if( $root->passes( $cond) )
3691      { return ($root, $root->descendants( $cond)); }
3692    else
3693      { return ( $root->descendants( $cond)); }
3694  }
3695
3696sub simplify  { my $t= shift; $t->root->simplify( @_);  }
3697sub subs_text { my $t= shift; $t->root->subs_text( @_); }
3698sub trim      { my $t= shift; $t->root->trim( @_);      }
3699
3700
3701sub set_keep_encoding
3702  { my( $t, $keep)= @_;
3703    $t->{twig_keep_encoding}= $keep;
3704    $t->{NoExpand}= $keep;
3705    return XML::Twig::Elt::set_keep_encoding( $keep);
3706   }
3707
3708sub set_expand_external_entities
3709  { return XML::Twig::Elt::set_expand_external_entities( @_); }
3710
3711sub escape_gt
3712  { my $t= shift; $t->{twig_escape_gt}= 1; return XML::Twig::Elt::escape_gt( @_); }
3713
3714sub do_not_escape_gt
3715  { my $t= shift; $t->{twig_escape_gt}= 0; return XML::Twig::Elt::do_not_escape_gt( @_); }
3716
3717sub elt_id
3718  { return $_[0]->{twig_id_list}->{$_[1]}; }
3719
3720# change it in ALL twigs at the moment
3721sub change_gi
3722  { my( $twig, $old_gi, $new_gi)= @_;
3723    my $index;
3724    return unless($index= $XML::Twig::gi2index{$old_gi});
3725    $XML::Twig::index2gi[$index]= $new_gi;
3726    delete $XML::Twig::gi2index{$old_gi};
3727    $XML::Twig::gi2index{$new_gi}= $index;
3728    return $twig;
3729  }
3730
3731
3732# builds the DTD from the stored (possibly updated) data
3733sub dtd_text
3734  { my $t= shift;
3735    my $dtd= $t->{twig_dtd};
3736    my $doctype= $t->{twig_doctype} or return '';
3737    my $string= "<!DOCTYPE ".$doctype->{name};
3738
3739    $string .= " [\n";
3740
3741    foreach my $gi (@{$dtd->{elt_list}})
3742      { $string.= "<!ELEMENT $gi ".$dtd->{model}->{$gi}.">\n" ;
3743        if( $dtd->{att}->{$gi})
3744          { my $attlist= $dtd->{att}->{$gi};
3745            $string.= "<!ATTLIST $gi\n";
3746            foreach my $att ( sort keys %{$attlist})
3747              {
3748                if( $attlist->{$att}->{fixed})
3749                  { $string.= "   $att $attlist->{$att}->{type} #FIXED $attlist->{$att}->{default}"; }
3750                else
3751                  { $string.= "   $att $attlist->{$att}->{type} $attlist->{$att}->{default}"; }
3752                $string.= "\n";
3753              }
3754            $string.= ">\n";
3755          }
3756      }
3757    $string.= $t->entity_list->text if( $t->entity_list);
3758    $string.= "\n]>\n";
3759    return $string;
3760  }
3761
3762# prints the DTD from the stored (possibly updated) data
3763sub dtd_print
3764  { my $t= shift;
3765    my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar')  ? shift : undef;
3766    if( $fh) { print $fh $t->dtd_text; }
3767    else     { print $t->dtd_text;     }
3768    return $t;
3769  }
3770
3771# build the subs that call directly expat
3772BEGIN
3773  { my @expat_methods= qw( depth in_element within_element context
3774                           current_line current_column current_byte
3775                           recognized_string original_string
3776                           xpcroak xpcarp
3777                           base current_element element_index
3778                           xml_escape
3779                           position_in_context);
3780    foreach my $method (@expat_methods)
3781      {
3782        ## no critic (TestingAndDebugging::ProhibitNoStrict);
3783        no strict 'refs';
3784        *{$method}= sub { my $t= shift;
3785                          _croak( "calling $method after parsing is finished") unless( $t->{twig_parsing});
3786                          return $t->{twig_parser}->$method(@_);
3787                        };
3788      }
3789  }
3790
3791sub path
3792  { my( $t, $gi)= @_;
3793    if( $t->{twig_map_xmlns})
3794      { return "/" . join( "/", map { $t->_replace_prefix( $_)} ($t->{twig_parser}->context, $gi)); }
3795    else
3796      { return "/" . join( "/", ($t->{twig_parser}->context, $gi)); }
3797  }
3798
3799sub finish
3800  { my $t= shift;
3801    return $t->{twig_parser}->finish;
3802  }
3803
3804# just finish the parse by printing the rest of the document
3805sub finish_print
3806  { my( $t, $fh)= @_;
3807    my $old_fh;
3808    unless( defined $fh)
3809      { $t->_set_fh_to_twig_output_fh(); }
3810    elsif( defined $fh)
3811      { $old_fh= select $fh;
3812        $t->{twig_original_selected_fh}= $old_fh if( $old_fh);
3813      }
3814
3815    my $p=$t->{twig_parser};
3816    if( $t->{twig_keep_encoding})
3817      { $p->setHandlers( %twig_handlers_finish_print); }
3818    else
3819      { $p->setHandlers( %twig_handlers_finish_print_original); }
3820    return $t;
3821  }
3822
3823sub set_remove_cdata { return XML::Twig::Elt::set_remove_cdata( @_); }
3824
3825sub output_filter          { return XML::Twig::Elt::output_filter( @_);          }
3826sub set_output_filter      { return XML::Twig::Elt::set_output_filter( @_);      }
3827
3828sub output_text_filter     { return XML::Twig::Elt::output_text_filter( @_);     }
3829sub set_output_text_filter { return XML::Twig::Elt::set_output_text_filter( @_); }
3830
3831sub set_input_filter
3832  { my( $t, $input_filter)= @_;
3833    my $old_filter= $t->{twig_input_filter};
3834      if( !$input_filter || isa( $input_filter, 'CODE') )
3835        { $t->{twig_input_filter}= $input_filter; }
3836      elsif( $input_filter eq 'latin1')
3837        {  $t->{twig_input_filter}= latin1(); }
3838      elsif( $filter{$input_filter})
3839        {  $t->{twig_input_filter}= $filter{$input_filter}; }
3840      else
3841        { _croak( "invalid input filter: $input_filter"); }
3842
3843      return $old_filter;
3844    }
3845
3846sub set_empty_tag_style
3847  { return XML::Twig::Elt::set_empty_tag_style( @_); }
3848
3849sub set_pretty_print
3850  { return XML::Twig::Elt::set_pretty_print( @_); }
3851
3852sub set_quote
3853  { return XML::Twig::Elt::set_quote( @_); }
3854
3855sub set_indent
3856  { return XML::Twig::Elt::set_indent( @_); }
3857
3858sub set_keep_atts_order
3859  { shift; return XML::Twig::Elt::set_keep_atts_order( @_); }
3860
3861sub keep_atts_order
3862  { return XML::Twig::Elt::keep_atts_order( @_); }
3863
3864sub set_do_not_escape_amp_in_atts
3865  { return XML::Twig::Elt::set_do_not_escape_amp_in_atts( @_); }
3866
3867# save and restore package globals (the ones in XML::Twig::Elt)
3868# should probably return the XML::Twig object itself, but instead
3869# returns the state (as a hashref) for backward compatibility
3870sub save_global_state
3871  { my $t= shift;
3872    return $t->{twig_saved_state}= XML::Twig::Elt::global_state();
3873  }
3874
3875sub restore_global_state
3876  { my $t= shift;
3877    XML::Twig::Elt::set_global_state( $t->{twig_saved_state});
3878  }
3879
3880sub global_state
3881  { return XML::Twig::Elt::global_state(); }
3882
3883sub set_global_state
3884  {  return XML::Twig::Elt::set_global_state( $_[1]); }
3885
3886sub dispose
3887  { my $t= shift;
3888    $t->DESTROY;
3889    return;
3890  }
3891
3892sub DESTROY
3893  { my $t= shift;
3894    if( $t->{twig_root} && isa(  $t->{twig_root}, 'XML::Twig::Elt'))
3895      { $t->{twig_root}->delete }
3896
3897    # added to break circular references
3898    undef $t->{twig};
3899    undef $t->{twig_root}->{twig} if( $t->{twig_root});
3900    undef $t->{twig_parser};
3901
3902    undef %$t;# prevents memory leaks (especially when using mod_perl)
3903    undef $t;
3904  }
3905
3906
3907#
3908#  non standard handlers
3909#
3910
3911# kludge: expat 1.95.2 calls both Default AND Doctype handlers
3912# so if the default handler finds '<!DOCTYPE' then it must
3913# unset itself (_twig_print_doctype will reset it)
3914sub _twig_print_check_doctype
3915   { # warn " in _twig_print_check_doctype...\n"; # DEBUG handler
3916
3917    my $p= shift;
3918    my $string= $p->recognized_string();
3919    if( $string eq '<!DOCTYPE')
3920      {
3921        $p->setHandlers( Default => undef);
3922        $p->setHandlers( Entity => undef);
3923        $expat_1_95_2=1;
3924      }
3925    else
3926      { print $string; }
3927
3928    return;
3929  }
3930
3931
3932sub _twig_print
3933   { # warn " in _twig_print...\n"; # DEBUG handler
3934    my $p= shift;
3935    if( $expat_1_95_2 && ($p->recognized_string eq '[') && !$p->{twig}->{expat_1_95_2_seen_bracket})
3936      { # otherwise the opening square bracket of the doctype gets printed twice
3937        $p->{twig}->{expat_1_95_2_seen_bracket}=1;
3938      }
3939    else
3940      { if( $p->{twig}->{twig_right_after_root})
3941          { my $s= $p->recognized_string(); print $s if $s=~ m{\S}; }
3942        else
3943          { print $p->recognized_string(); }
3944      }
3945    return;
3946  }
3947# recognized_string does not seem to work for entities, go figure!
3948# so this handler is used to print them anyway
3949sub _twig_print_entity
3950   { # warn " in _twig_print_entity...\n"; # DEBUG handler
3951    my $p= shift;
3952    XML::Twig::Entity->new( @_)->print;
3953  }
3954
3955# kludge: expat 1.95.2 calls both Default AND Doctype handlers
3956# so if the default handler finds '<!DOCTYPE' then it must
3957# unset itself (_twig_print_doctype will reset it)
3958sub _twig_print_original_check_doctype
3959   { # warn " in _twig_print_original_check_doctype...\n"; # DEBUG handler
3960
3961    my $p= shift;
3962    my $string= $p->original_string();
3963    if( $string eq '<!DOCTYPE')
3964      { $p->setHandlers( Default => undef);
3965        $p->setHandlers( Entity => undef);
3966        $expat_1_95_2=1;
3967      }
3968    else
3969      { print $string; }
3970
3971    return;
3972  }
3973
3974sub _twig_print_original
3975   { # warn " in _twig_print_original...\n"; # DEBUG handler
3976    my $p= shift;
3977    print $p->original_string();
3978    return;
3979  }
3980
3981
3982sub _twig_print_original_doctype
3983   { # warn " in _twig_print_original_doctype...\n"; # DEBUG handler
3984
3985    my(  $p, $name, $sysid, $pubid, $internal)= @_;
3986    if( $name)
3987      { # with recent versions of XML::Parser original_string does not work,
3988        # hence we need to rebuild the doctype declaration
3989        my $doctype='';
3990        $doctype .= qq{<!DOCTYPE $name}    if( $name);
3991        $doctype .=  qq{ PUBLIC  "$pubid"}  if( $pubid);
3992        $doctype .=  qq{ SYSTEM}            if( $sysid && !$pubid);
3993        $doctype .=  qq{ "$sysid"}          if( $sysid);
3994        $doctype .=  ' [' if( $internal && !$expat_1_95_2) ;
3995        $doctype .=  qq{>} unless( $internal || $expat_1_95_2);
3996        $p->{twig}->{twig_doctype}->{has_internal}=$internal;
3997        print $doctype;
3998      }
3999    $p->setHandlers( Default => \&_twig_print_original);
4000    return;
4001  }
4002
4003sub _twig_print_doctype
4004   { # warn " in _twig_print_doctype...\n"; # DEBUG handler
4005    my(  $p, $name, $sysid, $pubid, $internal)= @_;
4006    if( $name)
4007      { # with recent versions of XML::Parser original_string does not work,
4008        # hence we need to rebuild the doctype declaration
4009        my $doctype='';
4010        $doctype .= qq{<!DOCTYPE $name}    if( $name);
4011        $doctype .=  qq{ PUBLIC  "$pubid"}  if( $pubid);
4012        $doctype .=  qq{ SYSTEM}            if( $sysid && !$pubid);
4013        $doctype .=  qq{ "$sysid"}          if( $sysid);
4014        $doctype .=  ' [' if( $internal) ;
4015        $doctype .=  qq{>} unless( $internal || $expat_1_95_2);
4016        $p->{twig}->{twig_doctype}->{has_internal}=$internal;
4017        print $doctype;
4018      }
4019    $p->setHandlers( Default => \&_twig_print);
4020    return;
4021  }
4022
4023
4024sub _twig_print_original_default
4025   { # warn " in _twig_print_original_default...\n"; # DEBUG handler
4026    my $p= shift;
4027    print $p->original_string();
4028    return;
4029  }
4030
4031# account for the case where the element is empty
4032sub _twig_print_end_original
4033   { # warn " in _twig_print_end_original...\n"; # DEBUG handler
4034    my $p= shift;
4035    print $p->original_string();
4036    return;
4037  }
4038
4039sub _twig_start_check_roots
4040   { # warn " in _twig_start_check_roots...\n"; # DEBUG handler
4041    my $p= shift;
4042    my $gi= shift;
4043
4044    my $t= $p->{twig};
4045
4046    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4047
4048    my $ns_decl;
4049    unless( $p->depth == 0)
4050      { if( $t->{twig_map_xmlns}) { $ns_decl= _replace_ns( $t, \$gi, \@_); }
4051      }
4052
4053    my $context= { $ST_TAG => $gi, @_};
4054    $context->{$ST_NS}= $ns_decl if $ns_decl;
4055    push @{$t->{_twig_context_stack}}, $context;
4056    my %att= @_;
4057
4058    if( _handler( $t, $t->{twig_roots}, $gi))
4059      { $p->setHandlers( %twig_handlers); # restore regular handlers
4060        $t->{twig_root_depth}= $p->depth;
4061        pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start
4062        _twig_start( $p, $gi, @_);
4063        return;
4064      }
4065
4066    # $tag will always be true if it needs to be printed (the tag string is never empty)
4067    my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4068                                                                 : $p->recognized_string
4069                                      : '';
4070
4071    if( $p->depth == 0)
4072      {
4073        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4074        no strict 'refs';
4075        print {$fh} $tag if( $tag);
4076        pop @{$t->{_twig_context_stack}}; # will be pushed back in _twig_start
4077        _twig_start( $p, $gi, @_);
4078        $t->root->_set_flushed; # or the root start tag gets output the first time we flush
4079      }
4080    elsif( $t->{twig_starttag_handlers})
4081      { # look for start tag handlers
4082
4083        my @handlers= _handler( $t, $t->{twig_starttag_handlers}, $gi);
4084        my $last_handler_res;
4085        foreach my $handler ( @handlers)
4086          { $last_handler_res= $handler->($t, $gi, %att);
4087            last unless $last_handler_res;
4088          }
4089        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4090        no strict 'refs';
4091        print {$fh} $tag if( $tag && (!@handlers || $last_handler_res));
4092      }
4093    else
4094      {
4095        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4096        no strict 'refs';
4097        print {$fh} $tag if( $tag);
4098      }
4099    return;
4100  }
4101
4102sub _twig_end_check_roots
4103   { # warn " in _twig_end_check_roots...\n"; # DEBUG handler
4104
4105    my( $p, $gi, %att)= @_;
4106    my $t= $p->{twig};
4107    # $tag can be empty (<elt/>), hence the undef and the tests for defined
4108    my $tag= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4109                                                                 : $p->recognized_string
4110                                      : undef;
4111    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4112
4113    if( $t->{twig_endtag_handlers})
4114      { # look for end tag handlers
4115        my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi);
4116        my $last_handler_res=1;
4117        foreach my $handler ( @handlers)
4118          { $last_handler_res= $handler->($t, $gi) || last; }
4119        #if( ! $last_handler_res)
4120        #  { pop @{$t->{_twig_context_stack}}; warn "tested";
4121        #    return;
4122        #  }
4123      }
4124    {
4125      ## no critic (TestingAndDebugging::ProhibitNoStrict);
4126      no strict 'refs';
4127      print {$fh} $tag if( defined $tag);
4128    }
4129    if( $p->depth == 0)
4130      {
4131        _twig_end( $p, $gi);
4132        $t->root->{end_tag_flushed}=1;
4133      }
4134
4135    pop @{$t->{_twig_context_stack}};
4136    return;
4137  }
4138
4139sub _twig_pi_check_roots
4140   { # warn " in _twig_pi_check_roots...\n"; # DEBUG handler
4141    my( $p, $target, $data)= @_;
4142    my $t= $p->{twig};
4143    my $pi= $t->{twig_default_print} ? $t->{twig_keep_encoding} ? $p->original_string
4144                                                                : $p->recognized_string
4145                                    : undef;
4146    my $fh= $t->{twig_output_fh} || select() || \*STDOUT;
4147
4148    if( my $handler=    $t->{twig_handlers}->{pi_handlers}->{$target}
4149                     || $t->{twig_handlers}->{pi_handlers}->{''}
4150      )
4151      { # if handler is called on pi, then it needs to be processed as a regular node
4152        my @flags= qw( twig_process_pi twig_keep_pi);
4153        my @save= @{$t}{@flags}; # save pi related flags
4154        @{$t}{@flags}= (1, 0);   # override them, pi needs to be processed
4155        _twig_pi( @_);           # call handler on the pi
4156        @{$t}{@flags}= @save;;   # restore flag
4157      }
4158    else
4159      {
4160        ## no critic (TestingAndDebugging::ProhibitNoStrict);
4161        no strict 'refs';
4162        print  {$fh} $pi if( defined( $pi));
4163      }
4164    return;
4165  }
4166
4167
4168sub _output_ignored
4169  { my( $t, $p)= @_;
4170    my $action= $t->{twig_ignore_action};
4171
4172    my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string';
4173
4174    if( $action eq 'print' ) { print $p->$get_string; }
4175    else
4176      { my $string_ref;
4177        if( $action eq 'string')
4178          { $string_ref= \$t->{twig_buffered_string}; }
4179        elsif( ref( $action) && ref( $action) eq 'SCALAR')
4180          { $string_ref= $action; }
4181        else
4182          { _croak( "wrong ignore action: $action"); }
4183
4184        $$string_ref .= $p->$get_string;
4185      }
4186  }
4187
4188
4189
4190sub _twig_ignore_start
4191   { # warn " in _twig_ignore_start...\n"; # DEBUG handler
4192
4193    my( $p, $gi)= @_;
4194    my $t= $p->{twig};
4195    $t->{twig_ignore_level}++;
4196    my $action= $t->{twig_ignore_action};
4197
4198    $t->_output_ignored( $p) unless $action eq 'discard';
4199    return;
4200  }
4201
4202sub _twig_ignore_end
4203   { # warn " in _twig_ignore_end...\n"; # DEBUG handler
4204
4205    my( $p, $gi)= @_;
4206    my $t= $p->{twig};
4207
4208    my $action= $t->{twig_ignore_action};
4209    $t->_output_ignored( $p) unless $action eq 'discard';
4210
4211    $t->{twig_ignore_level}--;
4212
4213    if( ! $t->{twig_ignore_level})
4214      {
4215        $t->{twig_current}   = $t->{twig_ignore_elt};
4216        $t->{twig_current}->set_twig_current;
4217
4218        $t->{twig_ignore_elt}->cut;  # there could possibly be a memory leak here (delete would avoid it,
4219                                     # but could also delete elements that should not be deleted)
4220
4221        # restore the saved stack to the current level
4222        splice( @{$t->{_twig_context_stack}}, $p->depth+ 1 );
4223        #warn "stack: ", _dump_stack( $t->{_twig_context_stack}), "\n";
4224
4225        $p->setHandlers( @{$t->{twig_saved_handlers}});
4226        # test for handlers
4227        if( $t->{twig_endtag_handlers})
4228          { # look for end tag handlers
4229            my @handlers= _handler( $t, $t->{twig_endtag_handlers}, $gi);
4230            my $last_handler_res=1;
4231            foreach my $handler ( @handlers)
4232              { $last_handler_res= $handler->($t, $gi) || last; }
4233          }
4234        pop @{$t->{_twig_context_stack}};
4235      };
4236    return;
4237  }
4238
4239#sub _dump_stack { my( $stack)= @_; return join( ":", map { $_->{$ST_TAG} } @$stack); }
4240
4241sub ignore
4242  { my( $t, $elt, $action)= @_;
4243    my $current= $t->{twig_current};
4244
4245    if( ! ($elt && ref( $elt) && isa( $elt, 'XML::Twig::Elt'))) { $elt= $current; }
4246
4247    #warn "ignore:  current = ", $current->tag, ", elt = ", $elt->tag, ")\n";
4248
4249    # we need the ($elt == $current->{last_child}) test because the current element is set to the
4250    # parent _before_ handlers are called (and I can't figure out how to fix this)
4251    unless( ($elt == $current) || ($current->{last_child} && ($elt == $current->{last_child})) || $current->in( $elt))
4252      { _croak( "element to be ignored must be ancestor of current element"); }
4253
4254    $t->{twig_ignore_level}= $current == $elt ? 1 : $t->_level_in_stack( $current) - $t->_level_in_stack($elt) + 1;
4255    #warn "twig_ignore_level:  $t->{twig_ignore_level} (current: ", $current->tag, ", elt: ", $elt->tag, ")\n";
4256    $t->{twig_ignore_elt}  = $elt;     # save it, so we can delete it later
4257
4258    $action ||= 'discard';
4259    if( !($action eq 'print' || $action eq 'string' || ( ref( $action) && ref( $action) eq 'SCALAR')))
4260      { $action= 'discard'; }
4261
4262    $t->{twig_ignore_action}= $action;
4263
4264    my $p= $t->{twig_parser};
4265    my @saved_handlers= $p->setHandlers( %twig_handlers_ignore); # set handlers
4266
4267    my $get_string= $t->{twig_keep_encoding} ? 'original_string' : 'recognized_string';
4268
4269    my $default_handler;
4270
4271    if( $action ne 'discard')
4272      { if( $action eq 'print')
4273          { $p->setHandlers( Default => sub { print $_[0]->$get_string; }); }
4274        else
4275          { my $string_ref;
4276            if( $action eq 'string')
4277              { if( ! exists $t->{twig_buffered_string}) { $t->{twig_buffered_string}=''; }
4278                $string_ref= \$t->{twig_buffered_string};
4279              }
4280            elsif( ref( $action) && ref( $action) eq 'SCALAR')
4281              { $string_ref= $action; }
4282
4283            $p->setHandlers( Default =>  sub { $$string_ref .= $_[0]->$get_string; });
4284          }
4285        $t->_output_ignored( $p, $action);
4286      }
4287
4288
4289    $t->{twig_saved_handlers}= \@saved_handlers;        # save current handlers
4290  }
4291
4292sub _level_in_stack
4293  { my( $t, $elt)= @_;
4294    my $level=1;
4295    foreach my $elt_in_stack ( @{$t->{_twig_context_stack}} )
4296      { if( $elt_in_stack->{$ST_ELT} && ($elt == $elt_in_stack->{$ST_ELT})) { return $level }
4297        $level++;
4298      }
4299  }
4300
4301
4302
4303# select $t->{twig_output_fh} and store the current selected fh
4304sub _set_fh_to_twig_output_fh
4305  { my $t= shift;
4306    my $output_fh= $t->{twig_output_fh};
4307    if( $output_fh && !$t->{twig_output_fh_selected})
4308      { # there is an output fh
4309        $t->{twig_selected_fh}= select(); # store the currently selected fh
4310        $t->{twig_output_fh_selected}=1;
4311        select $output_fh;                # select the output fh for the twig
4312      }
4313  }
4314
4315# select the fh that was stored in $t->{twig_selected_fh}
4316# (before $t->{twig_output_fh} was selected)
4317sub _set_fh_to_selected_fh
4318  { my $t= shift;
4319    return unless( $t->{twig_output_fh});
4320    my $selected_fh= $t->{twig_selected_fh};
4321    $t->{twig_output_fh_selected}=0;
4322    select $selected_fh;
4323    return;
4324  }
4325
4326
4327sub encoding
4328  { return $_[0]->{twig_xmldecl}->{encoding} if( $_[0]->{twig_xmldecl}); }
4329
4330sub set_encoding
4331  { my( $t, $encoding)= @_;
4332    $t->{twig_xmldecl} ||={};
4333    $t->set_xml_version( "1.0") unless( $t->xml_version);
4334    $t->{twig_xmldecl}->{encoding}= $encoding;
4335    return $t;
4336  }
4337
4338sub output_encoding
4339  { return $_[0]->{output_encoding}; }
4340
4341sub set_output_encoding
4342  { my( $t, $encoding)= @_;
4343    my $output_filter= $t->output_filter || '';
4344
4345    if( ($encoding && $encoding !~ m{^utf-?8$}i) || $t->{twig_keep_encoding} || $output_filter)
4346      { $t->set_output_filter( _encoding_filter( $encoding || '')); }
4347
4348    $t->{output_encoding}= $encoding;
4349    return $t;
4350  }
4351
4352sub xml_version
4353  { return $_[0]->{twig_xmldecl}->{version} if( $_[0]->{twig_xmldecl}); }
4354
4355sub set_xml_version
4356  { my( $t, $version)= @_;
4357    $t->{twig_xmldecl} ||={};
4358    $t->{twig_xmldecl}->{version}= $version;
4359    return $t;
4360  }
4361
4362sub standalone
4363  { return $_[0]->{twig_xmldecl}->{standalone} if( $_[0]->{twig_xmldecl}); }
4364
4365sub set_standalone
4366  { my( $t, $standalone)= @_;
4367    $t->{twig_xmldecl} ||={};
4368    $t->set_xml_version( "1.0") unless( $t->xml_version);
4369    $t->{twig_xmldecl}->{standalone}= $standalone;
4370    return $t;
4371  }
4372
4373
4374# SAX methods
4375
4376sub toSAX1
4377  { _croak( "cannot use toSAX1 while parsing (use flush_toSAX1)") if (defined $_[0]->{twig_parser});
4378    shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
4379                          \&XML::Twig::Elt::_end_tag_data_SAX1
4380             );
4381  }
4382
4383sub toSAX2
4384  { _croak( "cannot use toSAX2 while parsing (use flush_toSAX2)") if (defined $_[0]->{twig_parser});
4385    shift(@_)->_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
4386                          \&XML::Twig::Elt::_end_tag_data_SAX2
4387             );
4388  }
4389
4390
4391sub _toSAX
4392  { my( $t, $handler, $start_tag_data, $end_tag_data) = @_;
4393
4394    if( my $start_document =  $handler->can( 'start_document'))
4395      { $start_document->( $handler); }
4396
4397    $t->_prolog_toSAX( $handler);
4398
4399    if( $t->root) { $t->root->_toSAX( $handler, $start_tag_data, $end_tag_data) ; }
4400    if( my $end_document =  $handler->can( 'end_document'))
4401      { $end_document->( $handler); }
4402  }
4403
4404
4405sub flush_toSAX1
4406  { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX1,
4407                               \&XML::Twig::Elt::_end_tag_data_SAX1
4408             );
4409  }
4410
4411sub flush_toSAX2
4412  { shift(@_)->_flush_toSAX(@_, \&XML::Twig::Elt::_start_tag_data_SAX2,
4413                               \&XML::Twig::Elt::_end_tag_data_SAX2
4414             );
4415  }
4416
4417sub _flush_toSAX
4418  { my( $t, $handler, $start_tag_data, $end_tag_data)= @_;
4419
4420    # the "real" last element processed, as _twig_end has closed it
4421    my $last_elt;
4422    if( $t->{twig_current})
4423      { $last_elt= $t->{twig_current}->_last_child; }
4424    else
4425      { $last_elt= $t->{twig_root}; }
4426
4427    my $elt= $t->{twig_root};
4428    unless( $elt->_flushed)
4429      { # init unless already done (ie root has been flushed)
4430        if( my $start_document =  $handler->can( 'start_document'))
4431          { $start_document->( $handler); }
4432        # flush the DTD
4433        $t->_prolog_toSAX( $handler)
4434      }
4435
4436    while( $elt)
4437      { my $next_elt;
4438        if( $last_elt && $last_elt->in( $elt))
4439          {
4440            unless( $elt->_flushed)
4441              { # just output the front tag
4442                if( my $start_element = $handler->can( 'start_element'))
4443                 { if( my $tag_data= $start_tag_data->( $elt))
4444                     { $start_element->( $handler, $tag_data); }
4445                 }
4446                $elt->_set_flushed;
4447              }
4448            $next_elt= $elt->{first_child};
4449          }
4450        else
4451          { # an element before the last one or the last one,
4452            $next_elt= $elt->{next_sibling};
4453            $elt->_toSAX( $handler, $start_tag_data, $end_tag_data);
4454            $elt->delete;
4455            last if( $last_elt && ($elt == $last_elt));
4456          }
4457        $elt= $next_elt;
4458      }
4459    if( !$t->{twig_parsing})
4460      { if( my $end_document =  $handler->can( 'end_document'))
4461          { $end_document->( $handler); }
4462      }
4463  }
4464
4465
4466sub _prolog_toSAX
4467  { my( $t, $handler)= @_;
4468    $t->_xmldecl_toSAX( $handler);
4469    $t->_DTD_toSAX( $handler);
4470  }
4471
4472sub _xmldecl_toSAX
4473  { my( $t, $handler)= @_;
4474    my $decl= $t->{twig_xmldecl};
4475    my $data= { Version    => $decl->{version},
4476                Encoding   => $decl->{encoding},
4477                Standalone => $decl->{standalone},
4478          };
4479    if( my $xml_decl= $handler->can( 'xml_decl'))
4480      { $xml_decl->( $handler, $data); }
4481  }
4482
4483sub _DTD_toSAX
4484  { my( $t, $handler)= @_;
4485    my $doctype= $t->{twig_doctype};
4486    return unless( $doctype);
4487    my $data= { Name     => $doctype->{name},
4488                PublicId => $doctype->{pub},
4489                SystemId => $doctype->{sysid},
4490              };
4491
4492    if( my $start_dtd= $handler->can( 'start_dtd'))
4493      { $start_dtd->( $handler, $data); }
4494
4495    # I should call code to export the internal subset here
4496
4497    if( my $end_dtd= $handler->can( 'end_dtd'))
4498      { $end_dtd->( $handler); }
4499  }
4500
4501# input/output filters
4502
4503sub latin1
4504  { local $SIG{__DIE__};
4505    if( _use(  'Encode'))
4506      { return encode_convert( 'ISO-8859-15'); }
4507    elsif( _use( 'Text::Iconv'))
4508      { return iconv_convert( 'ISO-8859-15'); }
4509    elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
4510      { return unicode_convert( 'ISO-8859-15'); }
4511    else
4512      { return \&regexp2latin1; }
4513  }
4514
4515sub _encoding_filter
4516  {
4517      { local $SIG{__DIE__};
4518        my $encoding= $_[1] || $_[0];
4519        if( _use( 'Encode'))
4520          { my $sub= encode_convert( $encoding);
4521            return $sub;
4522          }
4523        elsif( _use( 'Text::Iconv'))
4524          { return iconv_convert( $encoding); }
4525        elsif( _use( 'Unicode::Map8') && _use( 'Unicode::String'))
4526          { return unicode_convert( $encoding); }
4527        }
4528    _croak( "Encode, Text::Iconv or Unicode::Map8 and Unicode::String need to be installed in order to use encoding options");
4529  }
4530
4531# shamelessly lifted from XML::TyePYX (works only with XML::Parse 2.27)
4532sub regexp2latin1
4533  { my $text=shift;
4534    $text=~s{([\xc0-\xc3])(.)}{ my $hi = ord($1);
4535                                my $lo = ord($2);
4536                                chr((($hi & 0x03) <<6) | ($lo & 0x3F))
4537                              }ge;
4538    return $text;
4539  }
4540
4541
4542sub html_encode
4543  { _use( 'HTML::Entities') or croak "cannot use html_encode: missing HTML::Entities";
4544    return HTML::Entities::encode_entities($_[0] );
4545  }
4546
4547sub safe_encode
4548  {   my $str= shift;
4549      if( $perl_version < 5.008)
4550        { # the no utf8 makes the regexp work in 5.6
4551          no utf8; # = perl 5.6
4552          $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
4553                   {_XmlUtf8Decode($1)}egs;
4554        }
4555      else
4556        { $str= encode( ascii => $str, $FB_HTMLCREF); }
4557      return $str;
4558  }
4559
4560sub safe_encode_hex
4561  {   my $str= shift;
4562      if( $perl_version < 5.008)
4563        { # the no utf8 makes the regexp work in 5.6
4564          no utf8; # = perl 5.6
4565          $str =~ s{([\xC0-\xDF].|[\xE0-\xEF]..|[\xF0-\xFF]...)}
4566                   {_XmlUtf8Decode($1, 1)}egs;
4567        }
4568      else
4569        { $str= encode( ascii => $str, $FB_XMLCREF); }
4570      return $str;
4571  }
4572
4573# this one shamelessly lifted from XML::DOM
4574# does NOT work on 5.8.0
4575sub _XmlUtf8Decode
4576  { my ($str, $hex) = @_;
4577    my $len = length ($str);
4578    my $n;
4579
4580    if ($len == 2)
4581      { my @n = unpack "C2", $str;
4582        $n = (($n[0] & 0x3f) << 6) + ($n[1] & 0x3f);
4583      }
4584    elsif ($len == 3)
4585      { my @n = unpack "C3", $str;
4586        $n = (($n[0] & 0x1f) << 12) + (($n[1] & 0x3f) << 6) + ($n[2] & 0x3f);
4587      }
4588    elsif ($len == 4)
4589      { my @n = unpack "C4", $str;
4590        $n = (($n[0] & 0x0f) << 18) + (($n[1] & 0x3f) << 12)
4591           + (($n[2] & 0x3f) << 6) + ($n[3] & 0x3f);
4592      }
4593    elsif ($len == 1)    # just to be complete...
4594      { $n = ord ($str); }
4595    else
4596      { croak "bad value [$str] for _XmlUtf8Decode"; }
4597
4598    my $char= $hex ? sprintf ("&#x%x;", $n) : "&#$n;";
4599    return $char;
4600  }
4601
4602
4603sub unicode_convert
4604  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4605    _use( 'Unicode::Map8') or croak "Unicode::Map8 not available, needed for encoding filter: $!";
4606    _use( 'Unicode::String') or croak "Unicode::String not available, needed for encoding filter: $!";
4607    import Unicode::String qw(utf8);
4608    my $sub= eval qq{ { $NO_WARNINGS;
4609                        my \$cnv;
4610                        BEGIN {  \$cnv= Unicode::Map8->new(\$enc)
4611                                     or croak "Can't create converter to \$enc";
4612                              }
4613                        sub { return  \$cnv->to8 (utf8(\$_[0])->ucs2); }
4614                      }
4615                    };
4616    unless( $sub) { croak $@; }
4617    return $sub;
4618  }
4619
4620sub iconv_convert
4621  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4622    _use( 'Text::Iconv') or croak "Text::Iconv not available, needed for encoding filter: $!";
4623    my $sub= eval qq{ { $NO_WARNINGS;
4624                        my \$cnv;
4625                        BEGIN { \$cnv = Text::Iconv->new( 'utf8', \$enc)
4626                                     or croak "Can't create iconv converter to \$enc";
4627                              }
4628                        sub { return  \$cnv->convert( \$_[0]); }
4629                      }
4630                    };
4631    unless( $sub)
4632      { if( $@=~ m{^Unsupported conversion: Invalid argument})
4633          { croak "Unsupported encoding: $enc"; }
4634        else
4635          { croak $@; }
4636      }
4637
4638    return $sub;
4639  }
4640
4641sub encode_convert
4642  { my $enc= $_[1] ? $_[1] : $_[0]; # so the method can be called on the twig or directly
4643    my $sub=  eval qq{sub { $NO_WARNINGS; return encode( "$enc", \$_[0]); } };
4644    croak "can't create Encode-based filter: $@" unless( $sub);
4645    return $sub;
4646  }
4647
4648
4649# XML::XPath compatibility
4650sub getRootNode        { return $_[0]; }
4651sub getParentNode      { return undef; }
4652sub getChildNodes      { my @children= ($_[0]->root); return wantarray ? @children : \@children; }
4653
4654sub _weakrefs     { return $weakrefs;       }
4655sub _set_weakrefs { $weakrefs=shift() || 0; XML::Twig::Elt::set_destroy()if ! $weakrefs; } # for testing purposes
4656
4657sub _dump
4658  { my $t= shift;
4659    my $dump='';
4660
4661    $dump="document\n"; # should dump twig level data here
4662    if( $t->root) { $dump .= $t->root->_dump( @_); }
4663
4664    return $dump;
4665
4666  }
4667
4668
46691;
4670
4671######################################################################
4672package XML::Twig::Entity_list;
4673######################################################################
4674
4675*isa= *UNIVERSAL::isa;
4676
4677sub new
4678  { my $class = shift;
4679    my $self={ entities => {}, updated => 0};
4680
4681    bless $self, $class;
4682    return $self;
4683
4684  }
4685
4686sub add_new_ent
4687  { my $ent_list= shift;
4688    my $ent= XML::Twig::Entity->new( @_);
4689    $ent_list->add( $ent);
4690    return $ent_list;
4691  }
4692
4693sub _add_list
4694  { my( $ent_list, $to_add)= @_;
4695    my $ents_to_add= $to_add->{entities};
4696    return $ent_list unless( $ents_to_add && %$ents_to_add);
4697    @{$ent_list->{entities}}{keys %$ents_to_add}= values %$ents_to_add;
4698    $ent_list->{updated}=1;
4699    return $ent_list;
4700  }
4701
4702sub add
4703  { my( $ent_list, $ent)= @_;
4704    $ent_list->{entities}->{$ent->{name}}= $ent;
4705    $ent_list->{updated}=1;
4706    return $ent_list;
4707  }
4708
4709sub ent
4710  { my( $ent_list, $ent_name)= @_;
4711    return $ent_list->{entities}->{$ent_name};
4712  }
4713
4714# can be called with an entity or with an entity name
4715sub delete
4716  { my $ent_list= shift;
4717    if( isa( ref $_[0], 'XML::Twig::Entity'))
4718      { # the second arg is an entity
4719        my $ent= shift;
4720        delete $ent_list->{entities}->{$ent->{name}};
4721      }
4722    else
4723      { # the second arg was not entity, must be a string then
4724        my $name= shift;
4725        delete $ent_list->{entities}->{$name};
4726      }
4727    $ent_list->{updated}=1;
4728    return $ent_list;
4729  }
4730
4731sub print
4732  { my ($ent_list, $fh)= @_;
4733    my $old_select= defined $fh ? select $fh : undef;
4734
4735    foreach my $ent_name ( sort keys %{$ent_list->{entities}})
4736      { my $ent= $ent_list->{entities}->{$ent_name};
4737        # we have to test what the entity is or un-defined entities can creep in
4738        if( isa( $ent, 'XML::Twig::Entity')) { $ent->print(); }
4739      }
4740    select $old_select if( defined $old_select);
4741    return $ent_list;
4742  }
4743
4744sub text
4745  { my ($ent_list)= @_;
4746    return join "\n", map { $ent_list->{entities}->{$_}->text} sort keys %{$ent_list->{entities}};
4747  }
4748
4749# return the list of entity names
4750sub entity_names
4751  { my $ent_list= shift;
4752    return (sort keys %{$ent_list->{entities}}) ;
4753  }
4754
4755
4756sub list
4757  { my ($ent_list)= @_;
4758    return map { $ent_list->{entities}->{$_} } sort keys %{$ent_list->{entities}};
4759  }
4760
47611;
4762
4763######################################################################
4764package XML::Twig::Entity;
4765######################################################################
4766
4767#*isa= *UNIVERSAL::isa;
4768
4769sub new
4770  { my( $class, $name, $val, $sysid, $pubid, $ndata, $param)= @_;
4771    $class= ref( $class) || $class;
4772
4773    my $self={};
4774
4775    $self->{name}  = $name;
4776    $self->{val}   = $val   if( defined $val  );
4777    $self->{sysid} = $sysid if( defined $sysid);
4778    $self->{pubid} = $pubid if( defined $pubid);
4779    $self->{ndata} = $ndata if( defined $ndata);
4780    $self->{param} = $param if( defined $param);
4781
4782    bless $self, $class;
4783    return $self;
4784  }
4785
4786
4787sub name  { return $_[0]->{name}; }
4788sub val   { return $_[0]->{val}; }
4789sub sysid { return defined( $_[0]->{sysid}) ? $_[0]->{sysid} : ''; }
4790sub pubid { return defined( $_[0]->{pubid}) ? $_[0]->{pubid} : ''; }
4791sub ndata { return defined( $_[0]->{ndata}) ? $_[0]->{ndata} : ''; }
4792sub param { return defined( $_[0]->{param}) ? $_[0]->{param} : ''; }
4793
4794
4795sub print
4796  { my ($ent, $fh)= @_;
4797    my $text= $ent->text;
4798    if( $fh) { print $fh $text . "\n"; }
4799    else     { print $text . "\n"; }
4800  }
4801
4802sub sprint
4803  { my ($ent)= @_;
4804    return $ent->text;
4805  }
4806
4807sub text
4808  { my ($ent)= @_;
4809    #warn "text called: '", $ent->_dump, "'\n";
4810    return '' if( !$ent->{name});
4811    my @tokens;
4812    push @tokens, '<!ENTITY';
4813
4814    push @tokens, '%' if( $ent->{param});
4815    push @tokens, $ent->{name};
4816
4817    if( defined $ent->{val} && !defined( $ent->{sysid}) && !defined($ent->{pubid}) )
4818      { push @tokens, _quoted_val( $ent->{val});
4819      }
4820    elsif( defined $ent->{sysid})
4821      { push @tokens, 'PUBLIC', _quoted_val( $ent->{pubid}) if( $ent->{pubid});
4822        push @tokens, 'SYSTEM' unless( $ent->{pubid});
4823        push @tokens, _quoted_val( $ent->{sysid});
4824        push @tokens, 'NDATA', $ent->{ndata} if( $ent->{ndata});
4825      }
4826    return join( ' ', @tokens) . '>';
4827  }
4828
4829sub _quoted_val
4830  { my $q= $_[0]=~ m{"} ? q{'} : q{"};
4831    return qq{$q$_[0]$q};
4832  }
4833
4834sub _dump
4835  { my( $ent)= @_; return join( " - ", map { "$_ => '$ent->{$_}'" } grep { defined $ent->{$_} } sort keys %$ent); }
4836
48371;
4838
4839######################################################################
4840package XML::Twig::Elt;
4841######################################################################
4842
4843use Carp;
4844*isa= *UNIVERSAL::isa;
4845
4846my $CDATA_START    = "<![CDATA[";
4847my $CDATA_END      = "]]>";
4848my $PI_START       = "<?";
4849my $PI_END         = "?>";
4850my $COMMENT_START  = "<!--";
4851my $COMMENT_END    = "-->";
4852
4853my $XMLNS_URI      = 'http://www.w3.org/2000/xmlns/';
4854
4855
4856BEGIN
4857  { # set some aliases for methods
4858    *tag           = *gi;
4859    *name          = *gi;
4860    *set_tag       = *set_gi;
4861    *set_name      = *set_gi;
4862    *find_nodes    = *get_xpath; # as in XML::DOM
4863    *findnodes     = *get_xpath; # as in XML::LibXML
4864    *field         = *first_child_text;
4865    *trimmed_field = *first_child_trimmed_text;
4866    *is_field      = *contains_only_text;
4867    *is            = *passes;
4868    *matches       = *passes;
4869    *has_child     = *first_child;
4870    *has_children  = *first_child;
4871    *all_children_pass = *all_children_are;
4872    *all_children_match= *all_children_are;
4873    *getElementsByTagName= *descendants;
4874    *find_by_tag_name= *descendants_or_self;
4875    *unwrap          = *erase;
4876    *inner_xml       = *xml_string;
4877    *outer_xml       = *sprint;
4878    *add_class       = *add_to_class;
4879
4880    *first_child_is  = *first_child_matches;
4881    *last_child_is   = *last_child_matches;
4882    *next_sibling_is = *next_sibling_matches;
4883    *prev_sibling_is = *prev_sibling_matches;
4884    *next_elt_is     = *next_elt_matches;
4885    *prev_elt_is     = *prev_elt_matches;
4886    *parent_is       = *parent_matches;
4887    *child_is        = *child_matches;
4888    *inherited_att   = *inherit_att;
4889
4890    *sort_children_by_value= *sort_children_on_value;
4891
4892    *has_atts= *att_nb;
4893
4894    # imports from XML::Twig
4895    *_is_fh= *XML::Twig::_is_fh;
4896
4897    # XML::XPath compatibility
4898    *string_value       = *text;
4899    *toString           = *sprint;
4900    *getName            = *gi;
4901    *getRootNode        = *twig;
4902    *getNextSibling     = *_next_sibling;
4903    *getPreviousSibling = *_prev_sibling;
4904    *isElementNode      = *is_elt;
4905    *isTextNode         = *is_text;
4906    *isPI               = *is_pi;
4907    *isPINode           = *is_pi;
4908    *isProcessingInstructionNode= *is_pi;
4909    *isComment          = *is_comment;
4910    *isCommentNode      = *is_comment;
4911    *getTarget          = *target;
4912    *getFirstChild      = *_first_child;
4913    *getLastChild      = *_last_child;
4914
4915    # try using weak references
4916    # test whether we can use weak references
4917    { local $SIG{__DIE__};
4918      if( eval 'require Scalar::Util' && defined( &Scalar::Util::weaken) )
4919        { import Scalar::Util qw(weaken); }
4920      elsif( eval 'require WeakRef')
4921        { import WeakRef; }
4922    }
4923}
4924
4925
4926# can be called as XML::Twig::Elt->new( [[$gi, $atts, [@content]])
4927# - gi is an optional gi given to the element
4928# - $atts is a hashref to attributes for the element
4929# - @content is an optional list of text and elements that will
4930#   be inserted under the element
4931sub new
4932  { my $class= shift;
4933    $class= ref $class || $class;
4934    my $elt  = {};
4935    bless ($elt, $class);
4936
4937    return $elt unless @_;
4938
4939    if( @_ == 1 && $_[0]=~ m{^\s*<}) { return $class->parse( @_); }
4940
4941    # if a gi is passed then use it
4942    my $gi= shift;
4943    $elt->{gi}=$XML::Twig::gi2index{$gi} or $elt->set_gi( $gi);
4944
4945
4946    my $atts= ref $_[0] eq 'HASH' ? shift : undef;
4947
4948    if( $atts && defined $atts->{$CDATA})
4949      { delete $atts->{$CDATA};
4950
4951        my $cdata= $class->new( $CDATA => @_);
4952        return $class->new( $gi, $atts, $cdata);
4953      }
4954
4955    if( $gi eq $PCDATA)
4956      { if( grep { ref $_ } @_) { croak "element $PCDATA can only be created from text"; }
4957        $elt->_set_pcdata( join( '', @_));
4958      }
4959    elsif( $gi eq $ENT)
4960      { $elt->{ent}=  shift; }
4961    elsif( $gi eq $CDATA)
4962      { if( grep { ref $_ } @_) { croak "element $CDATA can only be created from text"; }
4963        $elt->_set_cdata( join( '', @_));
4964      }
4965    elsif( $gi eq $COMMENT)
4966      { if( grep { ref $_ } @_) { croak "element $COMMENT can only be created from text"; }
4967        $elt->_set_comment( join( '', @_));
4968      }
4969    elsif( $gi eq $PI)
4970      { if( grep { ref $_ } @_) { croak "element $PI can only be created from text"; }
4971        $elt->_set_pi( shift, join( '', @_));
4972      }
4973    else
4974      { # the rest of the arguments are the content of the element
4975        if( @_)
4976          { $elt->set_content( @_); }
4977        else
4978          { $elt->{empty}=  1;    }
4979      }
4980
4981    if( $atts)
4982      { # the attribute hash can be used to pass the asis status
4983        if( defined $atts->{$ASIS})  { $elt->set_asis(  $atts->{$ASIS} ); delete $atts->{$ASIS};  }
4984        if( defined $atts->{$EMPTY}) { $elt->{empty}=  $atts->{$EMPTY}; delete $atts->{$EMPTY}; }
4985        if( keys %$atts) { $elt->set_atts( $atts); }
4986        $elt->_set_id( $atts->{$ID}) if( $atts->{$ID});
4987      }
4988
4989    return $elt;
4990  }
4991
4992# optimized version of $elt->new( PCDATA, $text);
4993sub _new_pcdata
4994  { my $class= $_[0];
4995    $class= ref $class || $class;
4996    my $elt  = {};
4997    bless $elt, $class;
4998    $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA);
4999    $elt->_set_pcdata( $_[1]);
5000    return $elt;
5001  }
5002
5003# this function creates an XM:::Twig::Elt from a string
5004# it is quite clumsy at the moment, as it just creates a
5005# new twig then returns its root
5006# there might also be memory leaks there
5007# additional arguments are passed to new XML::Twig
5008sub parse
5009  { my $class= shift;
5010    if( ref( $class)) { $class= ref( $class); }
5011    my $string= shift;
5012    my %args= @_;
5013    my $t= XML::Twig->new(%args);
5014    $t->parse( $string);
5015    my $elt= $t->root;
5016    # clean-up the node
5017    delete $elt->{twig};         # get rid of the twig data
5018    delete $elt->{twig_current}; # better get rid of this too
5019    if( $t->{twig_id_list}) { $elt->{twig_id_list}= $t->{twig_id_list}; }
5020    $elt->cut;
5021    undef $t->{twig_root};
5022    return $elt;
5023  }
5024
5025sub set_inner_xml
5026  { my( $elt, $xml, @args)= @_;
5027    my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args);
5028    $elt->cut_children;
5029    $new_elt->paste_first_child( $elt);
5030    $new_elt->erase;
5031    return $elt;
5032  }
5033
5034sub set_outer_xml
5035  { my( $elt, $xml, @args)= @_;
5036    my $new_elt= $elt->parse( "<dummy>$xml</dummy>", @args);
5037    $elt->cut_children;
5038    $new_elt->replace( $elt);
5039    $new_elt->erase;
5040    return $new_elt;
5041  }
5042
5043
5044sub set_inner_html
5045  { my( $elt, $html)= @_;
5046    my $t= XML::Twig->new->parse_html( "<html>$html</html>");
5047    my $new_elt= $t->root;
5048    if( $elt->tag eq 'head')
5049      { $new_elt->first_child( 'head')->unwrap;
5050        $new_elt->first_child( 'body')->cut;
5051      }
5052    elsif( $elt->tag ne 'html')
5053      { $new_elt->first_child( 'head')->cut;
5054        $new_elt->first_child( 'body')->unwrap;
5055      }
5056    $new_elt->cut;
5057    $elt->cut_children;
5058    $new_elt->paste_first_child( $elt);
5059    $new_elt->erase;
5060    return $elt;
5061  }
5062
5063sub set_gi
5064  { my ($elt, $gi)= @_;
5065    unless( defined $XML::Twig::gi2index{$gi})
5066      { # new gi, create entries in %gi2index and @index2gi
5067        push  @XML::Twig::index2gi, $gi;
5068        $XML::Twig::gi2index{$gi}= $#XML::Twig::index2gi;
5069      }
5070    $elt->{gi}= $XML::Twig::gi2index{$gi};
5071    return $elt;
5072  }
5073
5074sub gi  { return $XML::Twig::index2gi[$_[0]->{gi}]; }
5075
5076sub local_name
5077  { my $elt= shift;
5078    return _local_name( $XML::Twig::index2gi[$elt->{'gi'}]);
5079  }
5080
5081sub ns_prefix
5082  { my $elt= shift;
5083    return _ns_prefix( $XML::Twig::index2gi[$elt->{'gi'}]);
5084  }
5085
5086# namespace prefix for any qname (can be used for elements or attributes)
5087sub _ns_prefix
5088  { my $qname= shift;
5089    if( $qname=~ m{^([^:]*):})
5090      { return $1; }
5091    else
5092      { return( ''); } # should it be '' ?
5093  }
5094
5095# local name for any qname (can be used for elements or attributes)
5096sub _local_name
5097  { my $qname= shift;
5098    (my $local= $qname)=~ s{^[^:]*:}{};
5099    return $local;
5100  }
5101
5102#sub get_namespace
5103sub namespace ## no critic (Subroutines::ProhibitNestedSubs);
5104  { my $elt= shift;
5105    my $prefix= defined $_[0] ? shift() : $elt->ns_prefix;
5106    my $ns_att= $prefix ? "xmlns:$prefix" : "xmlns";
5107    my $expanded= $DEFAULT_NS{$prefix} || $elt->_inherit_att_through_cut( $ns_att) || '';
5108    return $expanded;
5109  }
5110
5111sub declare_missing_ns ## no critic (Subroutines::ProhibitNestedSubs);
5112  { my $root= shift;
5113    my %missing_prefix;
5114    my $map= $root->_current_ns_prefix_map;
5115
5116    foreach my $prefix (keys %$map)
5117      { my $prefix_att= $prefix eq '#default' ? 'xmlns' : "xmlns:$prefix";
5118        if( ! $root->{'att'}->{$prefix_att})
5119          { $root->set_att( $prefix_att => $map->{$prefix}); }
5120      }
5121    return $root;
5122  }
5123
5124sub _current_ns_prefix_map
5125  { my( $elt)= shift;
5126    my $map;
5127    while( $elt)
5128      { foreach my $att ($elt->att_names)
5129          { my $prefix= $att eq 'xmlns'        ? '#default'
5130                      : $att=~ m{^xmlns:(.*)$} ? $1
5131                      : next
5132                      ;
5133            if( ! exists $map->{$prefix}) { $map->{$prefix}= $elt->{'att'}->{$att}; }
5134          }
5135        $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent});
5136      }
5137    return $map;
5138  }
5139
5140sub set_ns_decl
5141  { my( $elt, $uri, $prefix)= @_;
5142    my $ns_att=  $prefix ? "xmlns:$prefix" : 'xmlns';
5143    $elt->set_att( $ns_att => $uri);
5144    return $elt;
5145  }
5146
5147sub set_ns_as_default
5148  { my( $root, $uri)= @_;
5149    my @ns_decl_to_remove;
5150    foreach my $elt ($root->descendants_or_self)
5151      { if( $elt->_ns_prefix && $elt->namespace eq $uri)
5152          { $elt->set_tag( $elt->local_name); }
5153        # store any namespace declaration for that uri
5154        foreach my $ns_decl (grep { $_=~ m{xmlns(:|$)} && $elt->{'att'}->{$_} eq $uri } $elt->att_names)
5155          { push @ns_decl_to_remove, [$elt, $ns_decl]; }
5156      }
5157    $root->set_ns_decl( $uri);
5158    # now remove the ns declarations (if done earlier then descendants of an element with the ns declaration
5159    # are not considered being in the namespace
5160    foreach my $ns_decl_to_remove ( @ns_decl_to_remove)
5161      { my( $elt, $ns_decl)= @$ns_decl_to_remove;
5162        $elt->del_att( $ns_decl);
5163      }
5164
5165    return $root;
5166  }
5167
5168
5169
5170# return #ELT for an element and #PCDATA... for others
5171sub get_type
5172  { my $gi_nb= $_[0]->{gi}; # the number, not the string
5173    return $ELT if( $gi_nb >= $XML::Twig::SPECIAL_GI);
5174    return $_[0]->gi;
5175  }
5176
5177# return the gi if it's a "real" element, 0 otherwise
5178sub is_elt
5179  { if(  $_[0]->{gi} >=  $XML::Twig::SPECIAL_GI)
5180     { return $_[0]->gi; }
5181    else
5182      { return 0; }
5183  }
5184
5185
5186sub is_pcdata
5187  { my $elt= shift;
5188    return (exists $elt->{'pcdata'});
5189  }
5190
5191sub is_cdata
5192  { my $elt= shift;
5193    return (exists $elt->{'cdata'});
5194  }
5195
5196sub is_pi
5197  { my $elt= shift;
5198    return (exists $elt->{'target'});
5199  }
5200
5201sub is_comment
5202  { my $elt= shift;
5203    return (exists $elt->{'comment'});
5204  }
5205
5206sub is_ent
5207  { my $elt= shift;
5208    return (exists $elt->{ent} || $elt->{ent_name});
5209  }
5210
5211
5212sub is_text
5213  { my $elt= shift;
5214    return (exists( $elt->{'pcdata'}) || (exists $elt->{'cdata'}));
5215  }
5216
5217sub is_empty
5218  { return $_[0]->{empty} || 0; }
5219
5220sub set_empty
5221  { $_[0]->{empty}= defined( $_[1]) ? $_[1] : 1; return $_[0]; }
5222
5223sub set_not_empty
5224  { delete $_[0]->{empty} if( $_[0]->{'empty'}); return $_[0]; }
5225
5226
5227sub set_asis
5228  { my $elt=shift;
5229
5230    foreach my $descendant ($elt, $elt->_descendants )
5231      { $descendant->{asis}= 1;
5232        if( (exists $descendant->{'cdata'}))
5233          { $descendant->{gi}=$XML::Twig::gi2index{$PCDATA} or $descendant->set_gi( $PCDATA);
5234            $descendant->_set_pcdata( $descendant->{cdata});
5235          }
5236
5237      }
5238    return $elt;
5239  }
5240
5241sub set_not_asis
5242  { my $elt=shift;
5243    foreach my $descendant ($elt, $elt->descendants)
5244      { delete $descendant->{asis} if $descendant->{asis};}
5245    return $elt;
5246  }
5247
5248sub is_asis
5249  { return $_[0]->{asis}; }
5250
5251sub closed
5252  { my $elt= shift;
5253    my $t= $elt->twig || return;
5254    my $curr_elt= $t->{twig_current};
5255    return 1 unless( $curr_elt);
5256    return $curr_elt->in( $elt);
5257  }
5258
5259sub set_pcdata
5260  { my( $elt, $pcdata)= @_;
5261
5262    if( $elt->{extra_data_in_pcdata})
5263      { _try_moving_extra_data( $elt, $pcdata);
5264      }
5265    $elt->{pcdata}= $pcdata;
5266    return $elt;
5267  }
5268
5269sub _extra_data_in_pcdata      { return $_[0]->{extra_data_in_pcdata}; }
5270sub _set_extra_data_in_pcdata  { $_[0]->{extra_data_in_pcdata}= $_[1]; return $_[0]; }
5271sub _del_extra_data_in_pcdata  { delete $_[0]->{extra_data_in_pcdata}; return $_[0]; }
5272sub _unshift_extra_data_in_pcdata
5273    { my $e= shift;
5274      $e->{extra_data_in_pcdata}||=[];
5275      unshift @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() };
5276    }
5277sub _push_extra_data_in_pcdata
5278  { my $e= shift;
5279    $e->{extra_data_in_pcdata}||=[];
5280    push @{$e->{extra_data_in_pcdata}}, { text => shift(), offset => shift() };
5281  }
5282
5283sub _extra_data_before_end_tag     { return $_[0]->{extra_data_before_end_tag} || ''; }
5284sub _set_extra_data_before_end_tag { $_[0]->{extra_data_before_end_tag}= $_[1]; return $_[0]}
5285sub _del_extra_data_before_end_tag { delete $_[0]->{extra_data_before_end_tag}; return $_[0]}
5286sub _prefix_extra_data_before_end_tag
5287  { my( $elt, $data)= @_;
5288    if($elt->{extra_data_before_end_tag})
5289      { $elt->{extra_data_before_end_tag}= $data . $elt->{extra_data_before_end_tag}; }
5290    else
5291      { $elt->{extra_data_before_end_tag}= $data; }
5292    return $elt;
5293  }
5294
5295# internal, in cases where we know there is no extra_data (inlined anyway!)
5296sub _set_pcdata { $_[0]->{pcdata}= $_[1]; }
5297
5298# try to figure out if we can keep the extra_data around
5299sub _try_moving_extra_data
5300  { my( $elt, $modified)=@_;
5301    my $initial= $elt->{pcdata};
5302    my $cpis= $elt->{extra_data_in_pcdata};
5303
5304    if( (my $offset= index( $modified, $initial)) != -1)
5305      { # text has been added
5306        foreach (@$cpis) { $_->{offset}+= $offset; }
5307      }
5308    elsif( ($offset= index( $initial, $modified)) != -1)
5309      { # text has been cut
5310        my $len= length( $modified);
5311        foreach my $cpi (@$cpis) { $cpi->{offset} -= $offset; }
5312        $elt->_set_extra_data_in_pcdata( [ grep { $_->{offset} >= 0 && $_->{offset} < $len } @$cpis ]);
5313      }
5314    else
5315      {    _match_extra_data_words( $elt, $initial, $modified)
5316        || _match_extra_data_chars( $elt, $initial, $modified)
5317        || $elt->_del_extra_data_in_pcdata;
5318      }
5319  }
5320
5321sub _match_extra_data_words
5322  { my( $elt, $initial, $modified)= @_;
5323    my @initial= split /\b/, $initial;
5324    my @modified= split /\b/, $modified;
5325
5326    return _match_extra_data( $elt, length( $initial), \@initial, \@modified);
5327  }
5328
5329sub _match_extra_data_chars
5330  { my( $elt, $initial, $modified)= @_;
5331    my @initial= split //, $initial;
5332    my @modified= split //, $modified;
5333
5334    return _match_extra_data( $elt, length( $initial), \@initial, \@modified);
5335  }
5336
5337sub _match_extra_data
5338  { my( $elt, $length, $initial, $modified)= @_;
5339
5340    my $cpis= $elt->{extra_data_in_pcdata};
5341
5342    if( @$initial <= @$modified)
5343      {
5344        my( $ok, $positions, $offsets)= _pos_offset( $initial, $modified);
5345        if( $ok)
5346          { my $offset=0;
5347            my $pos= shift @$positions;
5348            foreach my $cpi (@$cpis)
5349              { while( $cpi->{offset} >= $pos)
5350                  { $offset= shift @$offsets;
5351                    $pos= shift @$positions || $length +1;
5352                  }
5353                $cpi->{offset} += $offset;
5354              }
5355            return 1;
5356          }
5357      }
5358    else
5359      { my( $ok, $positions, $offsets)= _pos_offset( $modified, $initial);
5360        if( $ok)
5361          { #print STDERR "pos:    ", join( ':', @$positions), "\n",
5362            #             "offset: ", join( ':', @$offsets), "\n";
5363            my $offset=0;
5364            my $pos= shift @$positions;
5365            my $prev_pos= 0;
5366
5367            foreach my $cpi (@$cpis)
5368              { while( $cpi->{offset} >= $pos)
5369                  { $offset= shift @$offsets;
5370                    $prev_pos= $pos;
5371                    $pos= shift @$positions || $length +1;
5372                  }
5373                $cpi->{offset} -= $offset;
5374                if( $cpi->{offset} < $prev_pos) { delete $cpi->{text}; }
5375              }
5376            $elt->_set_extra_data_in_pcdata( [ grep { exists $_->{text} } @$cpis ]);
5377            return 1;
5378          }
5379      }
5380    return 0;
5381  }
5382
5383
5384sub _pos_offset
5385  { my( $short, $long)= @_;
5386    my( @pos, @offset);
5387    my( $s_length, $l_length)=(0,0);
5388    while (@$short)
5389      { my $s_word= shift @$short;
5390        my $l_word= shift @$long;
5391        if( $s_word ne $l_word)
5392          { while( @$long && $s_word ne $l_word)
5393              { $l_length += length( $l_word);
5394                $l_word= shift @$long;
5395              }
5396            if( !@$long && $s_word ne $l_word) { return 0; }
5397            push @pos, $s_length;
5398            push @offset, $l_length - $s_length;
5399          }
5400        my $length= length( $s_word);
5401        $s_length += $length;
5402        $l_length += $length;
5403      }
5404    return( 1, \@pos, \@offset);
5405  }
5406
5407sub append_pcdata
5408  { $_[0]->{'pcdata'}.= $_[1];
5409    return $_[0];
5410  }
5411
5412sub pcdata        { return $_[0]->{pcdata}; }
5413
5414
5415sub append_extra_data
5416  {  $_[0]->{extra_data}.= $_[1];
5417     return $_[0];
5418  }
5419
5420sub set_extra_data
5421  { $_[0]->{extra_data}= $_[1];
5422    return $_[0];
5423  }
5424sub extra_data { return $_[0]->{extra_data} || ''; }
5425
5426sub set_target
5427  { my( $elt, $target)= @_;
5428    $elt->{target}= $target;
5429    return $elt;
5430  }
5431sub target { return $_[0]->{target}; }
5432
5433sub set_data
5434  { $_[0]->{'data'}= $_[1];
5435    return $_[0];
5436  }
5437sub data { return $_[0]->{data}; }
5438
5439sub set_pi
5440  { my $elt= shift;
5441    unless( $elt->{gi} == $XML::Twig::gi2index{$PI})
5442      { $elt->cut_children;
5443        $elt->{gi}=$XML::Twig::gi2index{$PI} or $elt->set_gi( $PI);
5444      }
5445    return $elt->_set_pi( @_);
5446  }
5447
5448sub _set_pi
5449  { $_[0]->set_target( $_[1]);
5450    $_[0]->{data}=  $_[2];
5451    return $_[0];
5452  }
5453
5454sub pi_string { my $string= $PI_START . $_[0]->{target};
5455                my $data= $_[0]->{data};
5456                if( defined( $data) && $data ne '') { $string .= " $data"; }
5457                $string .= $PI_END ;
5458                return $string;
5459              }
5460
5461sub set_comment
5462  { my $elt= shift;
5463    unless( $elt->{gi} == $XML::Twig::gi2index{$COMMENT})
5464      { $elt->cut_children;
5465        $elt->{gi}=$XML::Twig::gi2index{$COMMENT} or $elt->set_gi( $COMMENT);
5466      }
5467    return $elt->_set_comment( @_);
5468  }
5469
5470sub _set_comment   { $_[0]->{comment}= $_[1]; return $_[0]; }
5471sub comment        { return $_[0]->{comment}; }
5472sub comment_string { return $COMMENT_START . _comment_escaped_string( $_[0]->{comment}) . $COMMENT_END; }
5473# comments cannot start or end with
5474sub _comment_escaped_string
5475  { my( $c)= @_;
5476    $c=~ s{^-}{ -};
5477    $c=~ s{-$}{- };
5478    $c=~ s{--}{- -}g;
5479    return $c;
5480  }
5481
5482sub set_ent  { $_[0]->{ent}= $_[1]; return $_[0]; }
5483sub ent      { return $_[0]->{ent}; }
5484sub ent_name { return substr( $_[0]->{ent}, 1, -1);}
5485
5486sub set_cdata
5487  { my $elt= shift;
5488    unless( $elt->{gi} == $XML::Twig::gi2index{$CDATA})
5489      { $elt->cut_children;
5490        $elt->insert_new_elt( first_child => $CDATA, @_);
5491        return $elt;
5492      }
5493    return $elt->_set_cdata( @_);
5494  }
5495
5496sub _set_cdata
5497  { $_[0]->{cdata}= $_[1];
5498    return $_[0];
5499  }
5500
5501sub append_cdata
5502  { $_[0]->{cdata}.= $_[1];
5503    return $_[0];
5504  }
5505sub cdata { return $_[0]->{cdata}; }
5506
5507
5508sub contains_only_text
5509  { my $elt= shift;
5510    return 0 unless $elt->is_elt;
5511    foreach my $child ($elt->_children)
5512      { return 0 if $child->is_elt; }
5513    return $elt;
5514  }
5515
5516sub contains_only
5517  { my( $elt, $exp)= @_;
5518    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
5519    foreach my $child (@children)
5520      { return 0 unless $child->is( $exp); }
5521    return @children || 1;
5522  }
5523
5524sub contains_a_single
5525  { my( $elt, $exp)= @_;
5526    my $child= $elt->{first_child} or return 0;
5527    return 0 unless $child->passes( $exp);
5528    return 0 if( $child->{next_sibling});
5529    return $child;
5530  }
5531
5532
5533sub root
5534  { my $elt= shift;
5535    while( $elt->{parent}) { $elt= $elt->{parent}; }
5536    return $elt;
5537  }
5538
5539sub _root_through_cut
5540  { my $elt= shift;
5541    while( $elt->{parent} || ($elt->{former} && $elt->{former}->{parent})) { $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}); }
5542    return $elt;
5543  }
5544
5545sub twig
5546  { my $elt= shift;
5547    my $root= $elt->root;
5548    return $root->{twig};
5549  }
5550
5551sub _twig_through_cut
5552  { my $elt= shift;
5553    my $root= $elt->_root_through_cut;
5554    return $root->{twig};
5555  }
5556
5557
5558# used for navigation
5559# returns undef or the element, depending on whether $elt passes $cond
5560# $cond can be
5561# - empty: the element passes the condition
5562# - ELT ('#ELT'): the element passes the condition if it is a "real" element
5563# - TEXT ('#TEXT'): the element passes if it is a CDATA or PCDATA element
5564# - a string with an XPath condition (only a subset of XPath is actually
5565#   supported).
5566# - a regexp: the element passes if its gi matches the regexp
5567# - a code ref: the element passes if the code, applied on the element,
5568#   returns true
5569
5570my %cond_cache; # expression => coderef
5571
5572sub reset_cond_cache { %cond_cache=(); }
5573
5574{
5575   sub _install_cond
5576    { my $cond= shift;
5577      my $test;
5578      my $init='';
5579
5580      my $original_cond= $cond;
5581
5582      my $not= ($cond=~ s{^\s*!}{}) ? '!' : '';
5583
5584      if( ref $cond eq 'CODE') { return $cond; }
5585
5586      if( ref $cond eq 'Regexp')
5587        { $test = qq{(\$_[0]->gi=~ /$cond/)}; }
5588      else
5589        { my @tests;
5590          while( $cond)
5591            {
5592              # the condition is a string
5593              if( $cond=~ s{$ELT$SEP}{})
5594                { push @tests, qq{\$_[0]->is_elt}; }
5595              elsif( $cond=~ s{$TEXT$SEP}{})
5596                { push @tests, qq{\$_[0]->is_text}; }
5597              elsif( $cond=~ s{^\s*($REG_TAG_PART)$SEP}{})
5598                { push @tests, _gi_test( $1); }
5599              elsif( $cond=~ s{^\s*($REG_REGEXP)$SEP}{})
5600                { # /regexp/
5601                  push @tests, qq{ \$_[0]->gi=~ $1 };
5602                }
5603              elsif( $cond=~ s{^\s*($REG_TAG_PART)?\s*  # $1
5604                               \[\s*(-?)\s*(\d+)\s*\]  #   [$2]
5605                               $SEP}{}xo
5606                   )
5607                { my( $gi, $neg, $index)= ($1, $2, $3);
5608                  my $siblings= $neg ? q{$_[0]->_next_siblings} : q{$_[0]->_prev_siblings};
5609                  if( $gi && ($gi ne '*'))
5610                    #{ $test= qq{((\$_[0]->gi eq "$gi") && (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index))}; }
5611                    { push @tests, _and( _gi_test( $gi), qq{ (scalar( grep { \$_->gi eq "$gi" } $siblings) + 1 == $index)}); }
5612                  else
5613                    { push @tests, qq{(scalar( $siblings) + 1 == $index)}; }
5614                }
5615              elsif( $cond=~ s{^\s*($REG_TAG_PART?)\s*($REG_PREDICATE)$SEP}{})
5616                { my( $gi, $predicate)= ( $1, $2);
5617                  push @tests, _and( _gi_test( $gi), _parse_predicate_in_step( $predicate));
5618                }
5619              elsif( $cond=~ s{^\s*($REG_NAKED_PREDICATE)$SEP}{})
5620                { push @tests,   _parse_predicate_in_step( $1); }
5621              else
5622                { croak "wrong navigation condition '$original_cond' ($@)"; }
5623            }
5624           $test= @tests > 1 ? '(' . join( '||', map { "($_)" } @tests) . ')' : $tests[0];
5625        }
5626
5627      #warn "init: '$init' - test: '$test'\n";
5628
5629      my $sub= qq{sub { $NO_WARNINGS; $init; return $not($test) ? \$_[0] : undef; } };
5630      my $s= eval $sub;
5631      #warn "cond: $cond\n$sub\n";
5632      if( $@)
5633        { croak "wrong navigation condition '$original_cond' ($@);" }
5634      return $s;
5635    }
5636
5637  sub _gi_test
5638    { my( $full_gi)= @_;
5639
5640      # optimize if the gi exists, including the case where the gi includes a dot
5641      my $index= $XML::Twig::gi2index{$full_gi};
5642      if( $index) { return qq{\$_[0]->{gi} == $index}; }
5643
5644      my( $gi, $class, $id)= $full_gi=~ m{^(.*?)(?:[.]([^.]*)|[#](.*))?$};
5645
5646      my $gi_test='';
5647      if( $gi && $gi ne '*' )
5648        { # 2 options, depending on whether the gi exists in gi2index
5649          # start optimization
5650          my $index= $XML::Twig::gi2index{$gi};
5651          if( $index)
5652            { # the gi exists, use its index as a faster shortcut
5653              $gi_test = qq{\$_[0]->{gi} == $index};
5654            }
5655          else
5656          # end optimization
5657            { # it does not exist (but might be created later), compare the strings
5658              $gi_test = qq{ \$_[0]->gi eq "$gi"};
5659            }
5660        }
5661      else
5662        { $gi_test= 1; }
5663
5664      my $class_test='';
5665      #warn "class: '$class'";
5666      if( $class)
5667        { $class_test = qq{ defined( \$_[0]->{att}->{class}) && \$_[0]->{att}->{class}=~ m{\\b$class\\b} }; }
5668
5669      my $id_test='';
5670      #warn "id: '$id'";
5671      if( $id)
5672        { $id_test = qq{ defined( \$_[0]->{att}->{$ID}) && \$_[0]->{att}->{$ID} eq '$id' }; }
5673
5674
5675      #warn "gi_test: '$gi_test' - class_test: '$class_test' returning ",  _and( $gi_test, $class_test);
5676      return _and( $gi_test, $class_test, $id_test);
5677  }
5678
5679
5680  # input: the original predicate
5681  sub _parse_predicate_in_step
5682    { my $cond= shift;
5683      my %PERL_ALPHA_TEST= ( '=' => ' eq ', '!=' => ' ne ', '>' => ' gt ', '>=' => ' ge ', '<' => ' lt ', '<=' => ' le ');
5684
5685      $cond=~ s{^\s*\[\s*}{};
5686      $cond=~ s{\s*\]\s*$}{};
5687      $cond=~ s{(   ($REG_STRING|$REG_REGEXP)                # strings or regexps
5688                   |\@($REG_TAG_NAME)(?=\s*(?:[><=!]|!~|=~)) # @att (followed by a comparison operator)
5689                   |\@($REG_TAG_NAME)                        # @att (not followed by a comparison operator)
5690                   |=~|!~                                    # matching operators
5691                   |([><]=?|=|!=)(?=\s*[\d+-])               # test before a number
5692                   |([><]=?|=|!=)                            # test, other cases
5693                   |($REG_FUNCTION)                          # no arg functions
5694                   # this bit is a mess, but it is the only solution with this half-baked parser
5695                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*$REG_MATCH\s*$REG_REGEXP) # string( child) =~ /regexp/
5696                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*!?=\s*$REG_VALUE)         # string( child) = "value" (or !=)
5697                   |((?:string|text)\(\s*$REG_TAG_NAME\s*\)\s*[<>]=?\s*$REG_VALUE)      # string( child) > "value"
5698                   |(and|or)
5699                )}
5700               { my( $token, $string, $att, $bare_att, $num_test, $alpha_test, $func, $string_regexp, $string_eq, $string_test, $and_or)
5701                 = ( $1,     $2,      $3,   $4,        $5,        $6,          $7,    $8,             $9,         $10,          $11);
5702
5703                 if( defined $string)   { $token }
5704                 elsif( $att)           { "( \$_[0]->{att} && exists( \$_[0]->{att}->{'$att'}) && \$_[0]->{att}->{'$att'})"; }
5705                 elsif( $bare_att)      { "(\$_[0]->{att} && defined( \$_[0]->{att}->{'$bare_att'}))"; }
5706                 elsif( $num_test && ($num_test eq '=') ) { "==" } # others tests are unchanged
5707                 elsif( $alpha_test)    { $PERL_ALPHA_TEST{$alpha_test} }
5708                 elsif( $func && $func=~ m{^(?:string|text)})
5709                                        { "\$_[0]->text"; }
5710                 elsif( $string_regexp && $string_regexp =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)})
5711                                        { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; }
5712                 elsif( $string_eq     && $string_eq     =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*(!?=)\s*($REG_VALUE)})
5713                                        {"(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $PERL_ALPHA_TEST{$2} $3) } 1, \$_[0]->_children)"; }
5714                 elsif( $string_test   && $string_test   =~ m{(?:string|text)\(\s*($REG_TAG_NAME)\s*\)\s*([<>]=?)\s*($REG_VALUE)})
5715                                        { "(XML::Twig::_first_n { (\$_->gi eq '$1') && (\$_->text $2 $3) } 1, \$_[0]->_children)"; }
5716                 elsif( $and_or)        { $and_or eq 'and' ? '&&' : '||' ; }
5717                 else                   { $token; }
5718               }gexs;
5719      return "($cond)";
5720    }
5721
5722
5723  sub _op
5724    { my $op= shift;
5725      if(    $op eq '=')  { $op= 'eq'; }
5726      elsif( $op eq '!=') { $op= 'ne'; }
5727      return $op;
5728    }
5729
5730  sub passes
5731    { my( $elt, $cond)= @_;
5732      return $elt unless $cond;
5733      my $sub= ($cond_cache{$cond} ||= _install_cond( $cond));
5734      return $sub->( $elt);
5735    }
5736}
5737
5738sub set_parent
5739  { $_[0]->{parent}= $_[1];
5740    if( $XML::Twig::weakrefs) { weaken( $_[0]->{parent}); }
5741  }
5742
5743sub parent
5744  { my $elt= shift;
5745    my $cond= shift || return $elt->{parent};
5746    do { $elt= $elt->{parent} || return; } until ( $elt->passes( $cond));
5747    return $elt;
5748  }
5749
5750sub set_first_child
5751  { $_[0]->{'first_child'}= $_[1];
5752  }
5753
5754sub first_child
5755  { my $elt= shift;
5756    my $cond= shift || return $elt->{first_child};
5757    my $child= $elt->{first_child};
5758    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
5759    while( $child && !$test_cond->( $child))
5760       { $child= $child->{next_sibling}; }
5761    return $child;
5762  }
5763
5764sub _first_child   { return $_[0]->{first_child};  }
5765sub _last_child    { return $_[0]->{last_child};   }
5766sub _next_sibling  { return $_[0]->{next_sibling}; }
5767sub _prev_sibling  { return $_[0]->{prev_sibling}; }
5768sub _parent        { return $_[0]->{parent};       }
5769sub _next_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{next_sibling}) { push @siblings, $elt; } return @siblings; }
5770sub _prev_siblings { my $elt= shift; my @siblings; while( $elt= $elt->{prev_sibling}) { push @siblings, $elt; } return @siblings; }
5771
5772# sets a field
5773# arguments $record, $cond, @content
5774sub set_field
5775  { my $record = shift;
5776    my $cond = shift;
5777    my $child= $record->first_child( $cond);
5778    if( $child)
5779      { $child->set_content( @_); }
5780    else
5781      { if( $cond=~ m{^\s*($REG_TAG_NAME)})
5782          { my $gi= $1;
5783            $child= $record->insert_new_elt( last_child => $gi, @_);
5784          }
5785        else
5786          { croak "can't create a field name from $cond"; }
5787      }
5788    return $child;
5789  }
5790
5791sub set_last_child
5792  { $_[0]->{'last_child'}= $_[1];
5793    if( $XML::Twig::weakrefs) { weaken( $_[0]->{'last_child'}); }
5794  }
5795
5796sub last_child
5797  { my $elt= shift;
5798    my $cond= shift || return $elt->{last_child};
5799    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
5800    my $child= $elt->{last_child};
5801    while( $child && !$test_cond->( $child) )
5802      { $child= $child->{prev_sibling}; }
5803    return $child
5804  }
5805
5806
5807sub set_prev_sibling
5808  { $_[0]->{'prev_sibling'}= $_[1];
5809    if( $XML::Twig::weakrefs) { weaken( $_[0]->{'prev_sibling'}); }
5810  }
5811
5812sub prev_sibling
5813  { my $elt= shift;
5814    my $cond= shift || return $elt->{prev_sibling};
5815    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
5816    my $sibling= $elt->{prev_sibling};
5817    while( $sibling && !$test_cond->( $sibling) )
5818          { $sibling= $sibling->{prev_sibling}; }
5819    return $sibling;
5820  }
5821
5822sub set_next_sibling { $_[0]->{'next_sibling'}= $_[1]; }
5823
5824sub next_sibling
5825  { my $elt= shift;
5826    my $cond= shift || return $elt->{next_sibling};
5827    my $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond));
5828    my $sibling= $elt->{next_sibling};
5829    while( $sibling && !$test_cond->( $sibling) )
5830          { $sibling= $sibling->{next_sibling}; }
5831    return $sibling;
5832  }
5833
5834# methods dealing with the class attribute, convenient if you work with xhtml
5835sub class   {   $_[0]->{att}->{class}; }
5836# lvalue version of class. separate from class to avoid problem like RT#
5837sub lclass
5838          :lvalue    # > perl 5.5
5839  { $_[0]->{att}->{class}; }
5840
5841sub set_class { my( $elt, $class)= @_; $elt->set_att( class => $class); }
5842
5843# adds a class to an element
5844sub add_to_class
5845  { my( $elt, $new_class)= @_;
5846    return $elt unless $new_class;
5847    my $class= $elt->class;
5848    my %class= $class ? map { $_ => 1 } split /\s+/, $class : ();
5849    $class{$new_class}= 1;
5850    $elt->set_class( join( ' ', sort keys %class));
5851  }
5852
5853sub remove_class
5854  { my( $elt, $class_to_remove)= @_;
5855    return $elt unless $class_to_remove;
5856    my $class= $elt->class;
5857    my %class= $class ? map { $_ => 1 } split /\s+/, $class : ();
5858    delete $class{$class_to_remove};
5859    $elt->set_class( join( ' ', sort keys %class));
5860  }
5861
5862sub att_to_class      { my( $elt, $att)= @_; $elt->set_class( $elt->{'att'}->{$att}); }
5863sub add_att_to_class  { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att}); }
5864sub move_att_to_class { my( $elt, $att)= @_; $elt->add_to_class( $elt->{'att'}->{$att});
5865                        $elt->del_att( $att);
5866                      }
5867sub tag_to_class      { my( $elt)= @_; $elt->set_class( $elt->tag);    }
5868sub add_tag_to_class  { my( $elt)= @_; $elt->add_to_class( $elt->tag); }
5869sub set_tag_class     { my( $elt, $new_tag)= @_; $elt->add_tag_to_class; $elt->set_tag( $new_tag); }
5870
5871sub tag_to_span
5872  { my( $elt)= @_;
5873    $elt->set_class( $elt->tag) unless( $elt->tag eq 'span' && $elt->class); # set class to span unless it would mean replacing it with span
5874    $elt->set_tag( 'span');
5875  }
5876
5877sub tag_to_div
5878  { my( $elt)= @_;
5879    $elt->set_class( $elt->tag) unless( $elt->tag eq 'div' && $elt->class); # set class to div unless it would mean replacing it with div
5880    $elt->set_tag( 'div');
5881  }
5882
5883sub in_class
5884  { my( $elt, $class)= @_;
5885    my $elt_class= $elt->class;
5886    return unless( defined $elt_class);
5887    return $elt->class=~ m{(?:^|\s)\Q$class\E(?:\s|$)} ? $elt : 0;
5888  }
5889
5890
5891# get or set all attributes
5892# argument can be a hash or a hashref
5893sub set_atts
5894  { my $elt= shift;
5895    my %atts;
5896    tie %atts, 'Tie::IxHash' if( keep_atts_order());
5897    %atts= ( (ref( $_[0] || '') eq 'HASH') || isa( $_[0] || '', 'HASH')) ? %{$_[0]} : @_;
5898    $elt->{att}= \%atts;
5899    if( exists $atts{$ID}) { $elt->_set_id( $atts{$ID}); }
5900    return $elt;
5901  }
5902
5903sub atts      { return $_[0]->{att};                }
5904sub att_names { return (sort keys %{$_[0]->{att}}); }
5905sub del_atts  { $_[0]->{att}={}; return $_[0];      }
5906
5907# get or set a single attribute (set works for several atts)
5908sub set_att
5909  { my $elt= shift;
5910
5911    if( $_[0] && ref( $_[0]) && !$_[1])
5912      { croak "improper call to set_att, usage is \$elt->set_att( att1 => 'val1', att2 => 'val2',...)"; }
5913
5914    unless( $elt->{att})
5915      { $elt->{att}={};
5916        tie %{$elt->{att}}, 'Tie::IxHash' if( keep_atts_order());
5917      }
5918
5919    while(@_)
5920      { my( $att, $val)= (shift, shift);
5921        $elt->{att}->{$att}= $val;
5922        if( $att eq $ID) { $elt->_set_id( $val); }
5923      }
5924    return $elt;
5925  }
5926
5927sub att {  $_[0]->{att}->{$_[1]}; }
5928# lvalue version of att. separate from class to avoid problem like RT#
5929sub latt
5930          :lvalue    # > perl 5.5
5931  { $_[0]->{att}->{$_[1]}; }
5932
5933sub del_att
5934  { my $elt= shift;
5935    while( @_) { delete $elt->{'att'}->{shift()}; }
5936    return $elt;
5937  }
5938
5939sub att_exists { return exists  $_[0]->{att}->{$_[1]}; }
5940
5941# delete an attribute from all descendants of an element
5942sub strip_att
5943  { my( $elt, $att)= @_;
5944    $_->del_att( $att) foreach ($elt->descendants_or_self( qq{*[\@$att]}));
5945    return $elt;
5946  }
5947
5948sub change_att_name
5949  { my( $elt, $old_name, $new_name)= @_;
5950    my $value= $elt->{'att'}->{$old_name};
5951    return $elt unless( defined $value);
5952    $elt->del_att( $old_name)
5953        ->set_att( $new_name => $value);
5954    return $elt;
5955  }
5956
5957sub lc_attnames
5958  { my $elt= shift;
5959    foreach my $att ($elt->att_names)
5960      { if( $att ne lc $att) { $elt->change_att_name( $att, lc $att); } }
5961    return $elt;
5962  }
5963
5964sub set_twig_current { $_[0]->{twig_current}=1; }
5965sub del_twig_current { delete $_[0]->{twig_current}; }
5966
5967
5968# get or set the id attribute
5969sub set_id
5970  { my( $elt, $id)= @_;
5971    $elt->del_id() if( exists $elt->{att}->{$ID});
5972    $elt->set_att($ID, $id);
5973    $elt->_set_id( $id);
5974    return $elt;
5975  }
5976
5977# only set id, does not update the attribute value
5978sub _set_id
5979  { my( $elt, $id)= @_;
5980    my $t= $elt->twig || $elt;
5981    $t->{twig_id_list}->{$id}= $elt;
5982    if( $XML::Twig::weakrefs) { weaken(  $t->{twig_id_list}->{$id}); }
5983    return $elt;
5984  }
5985
5986sub id { return $_[0]->{att}->{$ID}; }
5987
5988# methods used to add ids to elements that don't have one
5989BEGIN
5990{ my $id_nb   = "0001";
5991  my $id_seed = "twig_id_";
5992
5993  sub set_id_seed ## no critic (Subroutines::ProhibitNestedSubs);
5994    { $id_seed= $_[1]; $id_nb=1; }
5995
5996  sub add_id ## no critic (Subroutines::ProhibitNestedSubs);
5997    { my $elt= shift;
5998      if( defined $elt->{'att'}->{$ID})
5999        { return $elt->{'att'}->{$ID}; }
6000      else
6001        { my $id= $_[0] && ref( $_[0]) && isa( $_[0], 'CODE') ? $_[0]->( $elt) : $id_seed . $id_nb++;
6002          $elt->set_id( $id);
6003          return $id;
6004        }
6005    }
6006}
6007
6008
6009
6010# delete the id attribute and remove the element from the id list
6011sub del_id
6012  { my $elt= shift;
6013    if( ! exists $elt->{att}->{$ID}) { return $elt };
6014    my $id= $elt->{att}->{$ID};
6015
6016    delete $elt->{att}->{$ID};
6017
6018    my $t= shift || $elt->twig;
6019    unless( $t) { return $elt; }
6020    if( exists $t->{twig_id_list}->{$id}) { delete $t->{twig_id_list}->{$id}; }
6021
6022    return $elt;
6023  }
6024
6025# return the list of children
6026sub children
6027  { my $elt= shift;
6028    my @children;
6029    my $child= $elt->first_child( @_);
6030    while( $child)
6031      { push @children, $child;
6032        $child= $child->next_sibling( @_);
6033      }
6034    return @children;
6035  }
6036
6037sub _children
6038  { my $elt= shift;
6039    my @children=();
6040    my $child= $elt->{first_child};
6041    while( $child)
6042      { push @children, $child;
6043        $child= $child->{next_sibling};
6044      }
6045    return @children;
6046  }
6047
6048sub children_copy
6049  { my $elt= shift;
6050    my @children;
6051    my $child= $elt->first_child( @_);
6052    while( $child)
6053      { push @children, $child->copy;
6054        $child= $child->next_sibling( @_);
6055      }
6056    return @children;
6057  }
6058
6059
6060sub children_count
6061  { my $elt= shift;
6062    my $cond= shift;
6063    my $count=0;
6064    my $child= $elt->{first_child};
6065    while( $child)
6066      { $count++ if( $child->passes( $cond));
6067        $child= $child->{next_sibling};
6068      }
6069    return $count;
6070  }
6071
6072sub children_text
6073  { my $elt= shift;
6074    return wantarray() ? map { $_->text} $elt->children( @_)
6075                       : join( '', map { $_->text} $elt->children( @_) )
6076                       ;
6077  }
6078
6079sub children_trimmed_text
6080  { my $elt= shift;
6081    return wantarray() ? map { $_->trimmed_text} $elt->children( @_)
6082                       : join( '', map { $_->trimmed_text} $elt->children( @_) )
6083                       ;
6084  }
6085
6086sub all_children_are
6087  { my( $parent, $cond)= @_;
6088    foreach my $child ($parent->_children)
6089      { return 0 unless( $child->passes( $cond)); }
6090    return $parent;
6091  }
6092
6093
6094sub ancestors
6095  { my( $elt, $cond)= @_;
6096    my @ancestors;
6097    while( $elt->{parent})
6098      { $elt= $elt->{parent};
6099        push @ancestors, $elt if( $elt->passes( $cond));
6100      }
6101    return @ancestors;
6102  }
6103
6104sub ancestors_or_self
6105  { my( $elt, $cond)= @_;
6106    my @ancestors;
6107    while( $elt)
6108      { push @ancestors, $elt if( $elt->passes( $cond));
6109        $elt= $elt->{parent};
6110      }
6111    return @ancestors;
6112  }
6113
6114
6115sub _ancestors
6116  { my( $elt, $include_self)= @_;
6117    my @ancestors= $include_self ? ($elt) : ();
6118    while( $elt= $elt->{parent}) { push @ancestors, $elt; }
6119    return @ancestors;
6120  }
6121
6122
6123sub inherit_att
6124  { my $elt= shift;
6125    my $att= shift;
6126    my %tags= map { ($_, 1) } @_;
6127
6128    do
6129      { if(   (defined $elt->{'att'}->{$att})
6130           && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]})
6131          )
6132          { return $elt->{'att'}->{$att}; }
6133      } while( $elt= $elt->{parent});
6134    return undef;
6135  }
6136
6137sub _inherit_att_through_cut
6138  { my $elt= shift;
6139    my $att= shift;
6140    my %tags= map { ($_, 1) } @_;
6141
6142    do
6143      { if(   (defined $elt->{'att'}->{$att})
6144           && ( !%tags || $tags{$XML::Twig::index2gi[$elt->{'gi'}]})
6145          )
6146          { return $elt->{'att'}->{$att}; }
6147      } while( $elt= $elt->{parent} || ($elt->{former} && $elt->{former}->{parent}));
6148    return undef;
6149  }
6150
6151
6152sub current_ns_prefixes
6153  { my $elt= shift;
6154    my %prefix;
6155    $prefix{''}=1 if( $elt->namespace( ''));
6156    while( $elt)
6157      { my @ns= grep { !m{^xml} } map { m{^([^:]+):} } ($XML::Twig::index2gi[$elt->{'gi'}], $elt->att_names);
6158        $prefix{$_}=1 foreach (@ns);
6159        $elt= $elt->{parent};
6160      }
6161
6162    return (sort keys %prefix);
6163  }
6164
6165# kinda counter-intuitive actually:
6166# the next element is found by looking for the next open tag after from the
6167# current one, which is the first child, if it exists, or the next sibling
6168# or the first next sibling of an ancestor
6169# optional arguments are:
6170#   - $subtree_root: a reference to an element, when the next element is not
6171#                    within $subtree_root anymore then next_elt returns undef
6172#   - $cond: a condition, next_elt returns the next element matching the condition
6173
6174sub next_elt
6175  { my $elt= shift;
6176    my $subtree_root= 0;
6177    $subtree_root= shift if( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'));
6178    my $cond= shift;
6179    my $next_elt;
6180
6181    my $ind;                                                              # optimization
6182    my $test_cond;
6183    if( $cond)                                                            # optimization
6184      { unless( defined( $ind= $XML::Twig::gi2index{$cond}) )             # optimization
6185          { $test_cond= ($cond_cache{$cond} ||= _install_cond( $cond)); } # optimization
6186      }                                                                   # optimization
6187
6188    do
6189      { if( $next_elt= $elt->{first_child})
6190          { # simplest case: the elt has a child
6191          }
6192         elsif( $next_elt= $elt->{next_sibling})
6193          { # no child but a next sibling (just check we stay within the subtree)
6194
6195            # case where elt is subtree_root, is empty and has a sibling
6196            return undef if( $subtree_root && ($elt == $subtree_root));
6197
6198          }
6199        else
6200          { # case where the element has no child and no next sibling:
6201            # get the first next sibling of an ancestor, checking subtree_root
6202
6203            # case where elt is subtree_root, is empty and has no sibling
6204            return undef if( $subtree_root && ($elt == $subtree_root));
6205
6206            $next_elt= $elt->{parent};
6207
6208            until( $next_elt->{next_sibling})
6209              { return undef if( $subtree_root && ($subtree_root == $next_elt));
6210                $next_elt= $next_elt->{parent} || return undef;
6211              }
6212            return undef if( $subtree_root && ($subtree_root == $next_elt));
6213            $next_elt= $next_elt->{next_sibling};
6214          }
6215      $elt= $next_elt;                   # just in case we need to loop
6216    } until(    ! defined $elt
6217             || ! defined $cond
6218         || (defined $ind       && ($elt->{gi} eq $ind))   # optimization
6219         || (defined $test_cond && ($test_cond->( $elt)))
6220               );
6221
6222      return $elt;
6223      }
6224
6225# return the next_elt within the element
6226# just call next_elt with the element as first and second argument
6227sub first_descendant { return $_[0]->next_elt( @_); }
6228
6229# get the last descendant, # then return the element found or call prev_elt with the condition
6230sub last_descendant
6231  { my( $elt, $cond)= @_;
6232    my $last_descendant= $elt->_last_descendant;
6233    if( !$cond || $last_descendant->matches( $cond))
6234      { return $last_descendant; }
6235    else
6236      { return $last_descendant->prev_elt( $elt, $cond); }
6237  }
6238
6239# no argument allowed here, just go down the last_child recursively
6240sub _last_descendant
6241  { my $elt= shift;
6242    while( my $child= $elt->{last_child}) { $elt= $child; }
6243    return $elt;
6244  }
6245
6246# counter-intuitive too:
6247# the previous element is found by looking
6248# for the first open tag backwards from the current one
6249# it's the last descendant of the previous sibling
6250# if it exists, otherwise it's simply the parent
6251sub prev_elt
6252  { my $elt= shift;
6253    my $subtree_root= 0;
6254    if( defined $_[0] and (ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt')))
6255      { $subtree_root= shift ;
6256        return undef if( $elt == $subtree_root);
6257      }
6258    my $cond= shift;
6259    # get prev elt
6260    my $prev_elt;
6261    do
6262      { return undef if( $elt == $subtree_root);
6263        if( $prev_elt= $elt->{prev_sibling})
6264          { while( $prev_elt->{last_child})
6265              { $prev_elt= $prev_elt->{last_child}; }
6266          }
6267        else
6268          { $prev_elt= $elt->{parent} || return undef; }
6269        $elt= $prev_elt;     # in case we need to loop
6270      } until( $elt->passes( $cond));
6271
6272    return $elt;
6273  }
6274
6275sub _following_elt
6276  { my( $elt)= @_;
6277    while( $elt && !$elt->{next_sibling})
6278      { $elt= $elt->{parent}; }
6279    return $elt ? $elt->{next_sibling} : undef;
6280  }
6281
6282sub following_elt
6283  { my( $elt, $cond)= @_;
6284    $elt= $elt->_following_elt || return undef;
6285    return $elt if( !$cond || $elt->matches( $cond));
6286    return $elt->next_elt( $cond);
6287  }
6288
6289sub following_elts
6290  { my( $elt, $cond)= @_;
6291    if( !$cond) { undef $cond; }
6292    my $following= $elt->following_elt( $cond);
6293    if( $following)
6294      { my @followings= $following;
6295        while( $following= $following->next_elt( $cond))
6296          { push @followings, $following; }
6297        return( @followings);
6298      }
6299    else
6300      { return (); }
6301  }
6302
6303sub _preceding_elt
6304  { my( $elt)= @_;
6305    while( $elt && !$elt->{prev_sibling})
6306      { $elt= $elt->{parent}; }
6307    return $elt ? $elt->{prev_sibling}->_last_descendant : undef;
6308  }
6309
6310sub preceding_elt
6311  { my( $elt, $cond)= @_;
6312    $elt= $elt->_preceding_elt || return undef;
6313    return $elt if( !$cond || $elt->matches( $cond));
6314    return $elt->prev_elt( $cond);
6315  }
6316
6317sub preceding_elts
6318  { my( $elt, $cond)= @_;
6319    if( !$cond) { undef $cond; }
6320    my $preceding= $elt->preceding_elt( $cond);
6321    if( $preceding)
6322      { my @precedings= $preceding;
6323        while( $preceding= $preceding->prev_elt( $cond))
6324          { push @precedings, $preceding; }
6325        return( @precedings);
6326      }
6327    else
6328      { return (); }
6329  }
6330
6331# used in get_xpath
6332sub _self
6333  { my( $elt, $cond)= @_;
6334    return $cond ? $elt->matches( $cond) : $elt;
6335  }
6336
6337sub next_n_elt
6338  { my $elt= shift;
6339    my $offset= shift || return undef;
6340    foreach (1..$offset)
6341      { $elt= $elt->next_elt( @_) || return undef; }
6342    return $elt;
6343  }
6344
6345# checks whether $elt is included in $ancestor, returns 1 in that case
6346sub in
6347  { my ($elt, $ancestor)= @_;
6348    if( ref( $ancestor) && isa( $ancestor, 'XML::Twig::Elt'))
6349      { # element
6350        while( $elt= $elt->{parent}) { return $elt if( $elt ==  $ancestor); }
6351      }
6352    else
6353      { # condition
6354        while( $elt= $elt->{parent}) { return $elt if( $elt->matches( $ancestor)); }
6355      }
6356    return 0;
6357  }
6358
6359sub first_child_text
6360  { my $elt= shift;
6361    my $dest=$elt->first_child(@_) or return '';
6362    return $dest->text;
6363  }
6364
6365sub fields
6366  { my $elt= shift;
6367    return map { $elt->field( $_) } @_;
6368  }
6369
6370sub first_child_trimmed_text
6371  { my $elt= shift;
6372    my $dest=$elt->first_child(@_) or return '';
6373    return $dest->trimmed_text;
6374  }
6375
6376sub first_child_matches
6377  { my $elt= shift;
6378    my $dest= $elt->{first_child} or return undef;
6379    return $dest->passes( @_);
6380  }
6381
6382sub last_child_text
6383  { my $elt= shift;
6384    my $dest=$elt->last_child(@_) or return '';
6385    return $dest->text;
6386  }
6387
6388sub last_child_trimmed_text
6389  { my $elt= shift;
6390    my $dest=$elt->last_child(@_) or return '';
6391    return $dest->trimmed_text;
6392  }
6393
6394sub last_child_matches
6395  { my $elt= shift;
6396    my $dest= $elt->{last_child} or return undef;
6397    return $dest->passes( @_);
6398  }
6399
6400sub child_text
6401  { my $elt= shift;
6402    my $dest=$elt->child(@_) or return '';
6403    return $dest->text;
6404  }
6405
6406sub child_trimmed_text
6407  { my $elt= shift;
6408    my $dest=$elt->child(@_) or return '';
6409    return $dest->trimmed_text;
6410  }
6411
6412sub child_matches
6413  { my $elt= shift;
6414    my $nb= shift;
6415    my $dest= $elt->child( $nb) or return undef;
6416    return $dest->passes( @_);
6417  }
6418
6419sub prev_sibling_text
6420  { my $elt= shift;
6421    my $dest=$elt->_prev_sibling(@_) or return '';
6422    return $dest->text;
6423  }
6424
6425sub prev_sibling_trimmed_text
6426  { my $elt= shift;
6427    my $dest=$elt->_prev_sibling(@_) or return '';
6428    return $dest->trimmed_text;
6429  }
6430
6431sub prev_sibling_matches
6432  { my $elt= shift;
6433    my $dest= $elt->{prev_sibling} or return undef;
6434    return $dest->passes( @_);
6435  }
6436
6437sub next_sibling_text
6438  { my $elt= shift;
6439    my $dest=$elt->next_sibling(@_) or return '';
6440    return $dest->text;
6441  }
6442
6443sub next_sibling_trimmed_text
6444  { my $elt= shift;
6445    my $dest=$elt->next_sibling(@_) or return '';
6446    return $dest->trimmed_text;
6447  }
6448
6449sub next_sibling_matches
6450  { my $elt= shift;
6451    my $dest= $elt->{next_sibling} or return undef;
6452    return $dest->passes( @_);
6453  }
6454
6455sub prev_elt_text
6456  { my $elt= shift;
6457    my $dest=$elt->prev_elt(@_) or return '';
6458    return $dest->text;
6459  }
6460
6461sub prev_elt_trimmed_text
6462  { my $elt= shift;
6463    my $dest=$elt->prev_elt(@_) or return '';
6464    return $dest->trimmed_text;
6465  }
6466
6467sub prev_elt_matches
6468  { my $elt= shift;
6469    my $dest= $elt->prev_elt or return undef;
6470    return $dest->passes( @_);
6471  }
6472
6473sub next_elt_text
6474  { my $elt= shift;
6475    my $dest=$elt->next_elt(@_) or return '';
6476    return $dest->text;
6477  }
6478
6479sub next_elt_trimmed_text
6480  { my $elt= shift;
6481    my $dest=$elt->next_elt(@_) or return '';
6482    return $dest->trimmed_text;
6483  }
6484
6485sub next_elt_matches
6486  { my $elt= shift;
6487    my $dest= $elt->next_elt or return undef;
6488    return $dest->passes( @_);
6489  }
6490
6491sub parent_text
6492  { my $elt= shift;
6493    my $dest=$elt->parent(@_) or return '';
6494    return $dest->text;
6495  }
6496
6497sub parent_trimmed_text
6498  { my $elt= shift;
6499    my $dest=$elt->parent(@_) or return '';
6500    return $dest->trimmed_text;
6501  }
6502
6503sub parent_matches
6504  { my $elt= shift;
6505    my $dest= $elt->{parent} or return undef;
6506    return $dest->passes( @_);
6507  }
6508
6509sub is_first_child
6510  { my $elt= shift;
6511    my $parent= $elt->{parent} or return 0;
6512    my $first_child= $parent->first_child( @_) or return 0;
6513    return ($first_child == $elt) ? $elt : 0;
6514  }
6515
6516sub is_last_child
6517  { my $elt= shift;
6518    my $parent= $elt->{parent} or return 0;
6519    my $last_child= $parent->last_child( @_) or return 0;
6520    return ($last_child == $elt) ? $elt : 0;
6521  }
6522
6523# returns the depth level of the element
6524# if 2 parameter are used then counts the 2cd element name in the
6525# ancestors list
6526sub level
6527  { my( $elt, $cond)= @_;
6528    my $level=0;
6529    my $name=shift || '';
6530    while( $elt= $elt->{parent}) { $level++ if( !$cond || $elt->matches( $cond)); }
6531    return $level;
6532  }
6533
6534# checks whether $elt has an ancestor that satisfies $cond, returns the ancestor
6535sub in_context
6536  { my ($elt, $cond, $level)= @_;
6537    $level= -1 unless( $level) ;  # $level-- will never hit 0
6538
6539    while( $level)
6540      { $elt= $elt->{parent} or return 0;
6541        if( $elt->matches( $cond)) { return $elt; }
6542        $level--;
6543      }
6544    return 0;
6545  }
6546
6547sub _descendants
6548  { my( $subtree_root, $include_self)= @_;
6549    my @descendants= $include_self ? ($subtree_root) : ();
6550
6551    my $elt= $subtree_root;
6552    my $next_elt;
6553
6554    MAIN: while( 1)
6555      { if( $next_elt= $elt->{first_child})
6556          { # simplest case: the elt has a child
6557          }
6558        elsif( $next_elt= $elt->{next_sibling})
6559          { # no child but a next sibling (just check we stay within the subtree)
6560
6561            # case where elt is subtree_root, is empty and has a sibling
6562            last MAIN if( $elt == $subtree_root);
6563          }
6564        else
6565          { # case where the element has no child and no next sibling:
6566            # get the first next sibling of an ancestor, checking subtree_root
6567
6568            # case where elt is subtree_root, is empty and has no sibling
6569            last MAIN if( $elt == $subtree_root);
6570
6571            # backtrack until we find a parent with a next sibling
6572            $next_elt= $elt->{parent} || last;
6573            until( $next_elt->{next_sibling})
6574              { last MAIN if( $subtree_root == $next_elt);
6575                $next_elt= $next_elt->{parent} || last MAIN;
6576              }
6577            last MAIN if( $subtree_root == $next_elt);
6578            $next_elt= $next_elt->{next_sibling};
6579          }
6580        $elt= $next_elt || last MAIN;
6581        push @descendants, $elt;
6582      }
6583    return @descendants;
6584  }
6585
6586
6587sub descendants
6588  { my( $subtree_root, $cond)= @_;
6589    my @descendants=();
6590    my $elt= $subtree_root;
6591
6592    # this branch is pure optimization for speed: if $cond is a gi replace it
6593    # by the index of the gi and loop here
6594    # start optimization
6595    my $ind;
6596    if( !$cond || ( defined ( $ind= $XML::Twig::gi2index{$cond})) )
6597      {
6598        my $next_elt;
6599
6600        while( 1)
6601          { if( $next_elt= $elt->{first_child})
6602                { # simplest case: the elt has a child
6603                }
6604             elsif( $next_elt= $elt->{next_sibling})
6605              { # no child but a next sibling (just check we stay within the subtree)
6606
6607                # case where elt is subtree_root, is empty and has a sibling
6608                last if( $subtree_root && ($elt == $subtree_root));
6609              }
6610            else
6611              { # case where the element has no child and no next sibling:
6612                # get the first next sibling of an ancestor, checking subtree_root
6613
6614                # case where elt is subtree_root, is empty and has no sibling
6615                last if( $subtree_root && ($elt == $subtree_root));
6616
6617                # backtrack until we find a parent with a next sibling
6618                $next_elt= $elt->{parent} || last undef;
6619                until( $next_elt->{next_sibling})
6620                  { last if( $subtree_root && ($subtree_root == $next_elt));
6621                    $next_elt= $next_elt->{parent} || last;
6622                  }
6623                last if( $subtree_root && ($subtree_root == $next_elt));
6624                $next_elt= $next_elt->{next_sibling};
6625              }
6626            $elt= $next_elt || last;
6627            push @descendants, $elt if( !$cond || ($elt->{gi} eq $ind));
6628          }
6629      }
6630    else
6631    # end optimization
6632      { # branch for a complex condition: use the regular (slow but simple) way
6633        while( $elt= $elt->next_elt( $subtree_root, $cond))
6634          { push @descendants, $elt; }
6635      }
6636    return @descendants;
6637  }
6638
6639
6640sub descendants_or_self
6641  { my( $elt, $cond)= @_;
6642    my @descendants= $elt->passes( $cond) ? ($elt) : ();
6643    push @descendants, $elt->descendants( $cond);
6644    return @descendants;
6645  }
6646
6647sub sibling
6648  { my $elt= shift;
6649    my $nb= shift;
6650    if( $nb > 0)
6651      { foreach( 1..$nb)
6652          { $elt= $elt->next_sibling( @_) or return undef; }
6653      }
6654    elsif( $nb < 0)
6655      { foreach( 1..(-$nb))
6656          { $elt= $elt->prev_sibling( @_) or return undef; }
6657      }
6658    else # $nb == 0
6659      { return $elt->passes( $_[0]); }
6660    return $elt;
6661  }
6662
6663sub sibling_text
6664  { my $elt= sibling( @_);
6665    return $elt ? $elt->text : undef;
6666  }
6667
6668
6669sub child
6670  { my $elt= shift;
6671    my $nb= shift;
6672    if( $nb >= 0)
6673      { $elt= $elt->first_child( @_) or return undef;
6674        foreach( 1..$nb)
6675          { $elt= $elt->next_sibling( @_) or return undef; }
6676      }
6677    else
6678      { $elt= $elt->last_child( @_) or return undef;
6679        foreach( 2..(-$nb))
6680          { $elt= $elt->prev_sibling( @_) or return undef; }
6681      }
6682    return $elt;
6683  }
6684
6685sub prev_siblings
6686  { my $elt= shift;
6687    my @siblings=();
6688    while( $elt= $elt->prev_sibling( @_))
6689      { unshift @siblings, $elt; }
6690    return @siblings;
6691  }
6692
6693sub siblings
6694  { my $elt= shift;
6695    return grep { $_ ne $elt } $elt->{parent}->children( @_);
6696  }
6697
6698sub pos
6699  { my $elt= shift;
6700    return 0 if ($_[0] && !$elt->matches( @_));
6701    my $pos=1;
6702    $pos++ while( $elt= $elt->prev_sibling( @_));
6703    return $pos;
6704  }
6705
6706
6707sub next_siblings
6708  { my $elt= shift;
6709    my @siblings=();
6710    while( $elt= $elt->next_sibling( @_))
6711      { push @siblings, $elt; }
6712    return @siblings;
6713  }
6714
6715
6716# used by get_xpath: parses the xpath expression and generates a sub that performs the
6717# search
6718{ my %axis2method;
6719  BEGIN { %axis2method= ( child               => 'children',
6720                          descendant          => 'descendants',
6721                         'descendant-or-self' => 'descendants_or_self',
6722                          parent              => 'parent_is',
6723                          ancestor            => 'ancestors',
6724                         'ancestor-or-self'   => 'ancestors_or_self',
6725                         'following-sibling'  => 'next_siblings',
6726                         'preceding-sibling'  => 'prev_siblings',
6727                          following           => 'following_elts',
6728                          preceding           => 'preceding_elts',
6729                          self                => '_self',
6730                        );
6731        }
6732
6733  sub _install_xpath
6734    { my( $xpath_exp, $type)= @_;
6735      my $original_exp= $xpath_exp;
6736      my $sub= 'my $elt= shift; my @results;';
6737
6738      # grab the root if expression starts with a /
6739      if( $xpath_exp=~ s{^/}{})
6740        { $sub .= '@results= ($elt->twig) || croak "cannot use an XPath query starting with a / on a node not attached to a whole twig";'; }
6741      elsif( $xpath_exp=~ s{^\./}{})
6742        { $sub .= '@results= ($elt);'; }
6743      else
6744        { $sub .= '@results= ($elt);'; }
6745
6746
6747     #warn "xpath_exp= '$xpath_exp'\n";
6748
6749      while( $xpath_exp &&
6750             $xpath_exp=~s{^\s*(/?)
6751                            # the xxx=~/regexp/ is a pain as it includes /
6752                            (\s*(?:(?:($REG_AXIS)::)?(\*|$REG_TAG_PART|\.\.|\.)\s*)?($REG_PREDICATE_ALT*)
6753                            )
6754                            (/|$)}{}xo)
6755
6756        { my( $wildcard, $sub_exp, $axis, $gi, $predicates)= ($1, $2, $3, $4, $5);
6757           if( $axis && ! $gi)
6758                { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp"); }
6759
6760          # grab a parent
6761          if( $sub_exp eq '..')
6762            { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp") if( $wildcard);
6763              $sub .= '@results= map { $_->{parent}} @results;';
6764            }
6765          # test the element itself
6766          elsif( $sub_exp=~ m{^\.(.*)$}s)
6767            { $sub .= "\@results= grep { \$_->matches( q{$1}) } \@results;" }
6768          # grab children
6769          else
6770            {
6771              if( !$axis)
6772                { $axis= $wildcard ? 'descendant' : 'child'; }
6773              if( !$gi or $gi eq '*') { $gi=''; }
6774              my $function;
6775
6776              # "special" predicates, that return just one element
6777              if( $predicates && ($predicates =~ m{^\s*\[\s*((-\s*)?\d+)\s*\]\s*$}))
6778                { # [<nb>]
6779                  my $offset= $1;
6780                  $offset-- if( $offset > 0);
6781                  $function=  $axis eq 'descendant' ? "next_n_elt( $offset, '$gi')"
6782                           :  $axis eq 'child'      ? "child( $offset, '$gi')"
6783                           :                          _croak_and_doublecheck_xpath( $original_exp, "error [$1] not supported along axis '$axis'")
6784                           ;
6785                  $sub .= "\@results= grep { \$_ } map { \$_->$function } \@results;"
6786                }
6787              elsif( $predicates && ($predicates =~ m{^\s*\[\s*last\s*\(\s*\)\s*\]\s*$}) )
6788                { # last()
6789                  _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp, usage of // and last() not supported") if( $wildcard);
6790                   $sub .= "\@results= map { \$_->last_child( '$gi') } \@results;";
6791                }
6792              else
6793                { # follow the axis
6794                  #warn "axis: '$axis' - method: '$axis2method{$axis}' - gi: '$gi'\n";
6795
6796                  my $follow_axis= " \$_->$axis2method{$axis}( '$gi')";
6797                  my $step= $follow_axis;
6798
6799                  # now filter using the predicate
6800                  while( $predicates=~ s{^\s*($REG_PREDICATE_ALT)\s*}{}o)
6801                    { my $pred= $1;
6802                      $pred=~ s{^\s*\[\s*}{};
6803                      $pred=~ s{\s*\]\s*$}{};
6804                      my $test="";
6805                      my $pos;
6806                      if( $pred=~ m{^(-?\s*\d+)$})
6807                        { my $pos= $1;
6808                          if( $step=~ m{^\s*grep(.*) (\$_->\w+\(\s*'[^']*'\s*\))})
6809                            { $step= "XML::Twig::_first_n $1 $pos, $2"; }
6810                          else
6811                            { if( $pos > 0) { $pos--; }
6812                              $step= "($step)[$pos]";
6813                            }
6814                          #warn "number predicate '$pos' - generated step '$step'\n";
6815                        }
6816                      else
6817                        { my $syntax_error=0;
6818                          do
6819                            { if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_STRING)\s*}{}o)  # string()="string" pred
6820                                { $test .= "\$_->text eq $1"; }
6821                              elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_STRING)\s*}{}o)  # string()!="string" pred
6822                                { $test .= "\$_->text ne $1"; }
6823                              if( $pred =~ s{^string\(\s*\)\s*=\s*($REG_NUMBER)\s*}{}o)  # string()=<number> pred
6824                                { $test .= "\$_->text eq $1"; }
6825                              elsif( $pred =~ s{^string\(\s*\)\s*!=\s*($REG_NUMBER)\s*}{}o)  # string()!=<number> pred
6826                                { $test .= "\$_->text ne $1"; }
6827                              elsif( $pred =~ s{^string\(\s*\)\s*(>|<|>=|<=)\s*($REG_NUMBER)\s*}{}o)  # string()!=<number> pred
6828                                { $test .= "\$_->text $1 $2"; }
6829
6830                             elsif( $pred =~ s{^string\(\s*\)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o)  # string()=~/regex/ pred
6831                                { my( $match, $regexp)= ($1, $2);
6832                                  $test .= "\$_->text $match $regexp";
6833                                }
6834                              elsif( $pred =~ s{^string\(\s*\)\s*}{}o)  # string() pred
6835                                { $test .= "\$_->text"; }
6836                             elsif( $pred=~ s{^@($REG_TAG_NAME)\s*($REG_OP)\s*($REG_STRING|$REG_NUMBER)}{}o)  # @att="val" pred
6837                                { my( $att, $oper, $val)= ($1, _op( $2), $3);
6838                                  $test .= qq{((defined \$_->{'att'}->{"$att"})  && (\$_->{'att'}->{"$att"} $oper $val))};
6839                                }
6840                             elsif( $pred =~ s{^@($REG_TAG_NAME)\s*($REG_MATCH)\s*($REG_REGEXP)\s*}{}o)  # @att=~/regex/ pred XXX
6841                                { my( $att, $match, $regexp)= ($1, $2, $3);
6842                                  $test .= qq{((defined \$_->{'att'}->{"$att"})  && (\$_->{'att'}->{"$att"} $match $regexp))};;
6843                                }
6844                             elsif( $pred=~ s{^@($REG_TAG_NAME)\s*}{}o)                      # @att pred
6845                                { $test .= qq{(defined \$_->{'att'}->{"$1"})}; }
6846                             elsif( $pred=~ s{^\s*(?:not|!)\s*@($REG_TAG_NAME)\s*}{}o)       # not @att pred
6847                                { $test .= qq{((\$_->is_elt) && (not defined \$_->{'att'}->{"$1"}))}; }
6848                              elsif( $pred=~ s{^\s*([()])}{})                            # ( or ) (just add to the test)
6849                                { $test .= qq{$1};           }
6850                              elsif( $pred=~ s{^\s*(and|or)\s*}{})
6851                                { $test .= lc " $1 "; }
6852                              else
6853                                { $syntax_error=1; }
6854
6855                             } while( !$syntax_error && $pred);
6856                           _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp at $pred") if( $pred);
6857                           $step= " grep { $test } $step ";
6858                        }
6859                    }
6860                  #warn "step: '$step'";
6861                  $sub .= "\@results= grep { \$_ } map { $step } \@results;";
6862                }
6863            }
6864        }
6865
6866      if( $xpath_exp)
6867        { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp around $xpath_exp"); }
6868
6869      $sub .= q{return XML::Twig::_unique_elts( @results); };
6870      #warn "generated: '$sub'\n";
6871      my $s= eval "sub { $NO_WARNINGS; $sub }";
6872      if( $@)
6873        { _croak_and_doublecheck_xpath( $original_exp, "error in xpath expression $original_exp ($@);") }
6874      return( $s);
6875    }
6876}
6877
6878sub _croak_and_doublecheck_xpath
6879  { my $xpath_expression= shift;
6880    my $mess= join( "\n", @_);
6881    if( $XML::Twig::XPath::VERSION || 0)
6882      { my $check_twig= XML::Twig::XPath->new;
6883        if( eval { $check_twig->{twig_xp}->_parse( $xpath_expression) })
6884          { $mess .= "\nthe expression is a valid XPath statement, and you are using XML::Twig::XPath, but"
6885                   . "\nyou are using either 'find_nodes' or 'get_xpath' where the method you likely wanted"
6886                   . "\nto use is 'findnodes', which is the only one that uses the full XPath engine\n";
6887          }
6888      }
6889    croak $mess;
6890  }
6891
6892
6893
6894{ # extremely elaborate caching mechanism
6895  my %xpath; # xpath_expression => subroutine_code;
6896  sub get_xpath
6897    { my( $elt, $xpath_exp, $offset)= @_;
6898      my $sub= ($xpath{$xpath_exp} ||= _install_xpath( $xpath_exp));
6899      return $sub->( $elt) unless( defined $offset);
6900      my @res= $sub->( $elt);
6901      return $res[$offset];
6902    }
6903}
6904
6905
6906sub findvalues
6907  { my $elt= shift;
6908    return map { $_->text } $elt->get_xpath( @_);
6909  }
6910
6911sub findvalue
6912  { my $elt= shift;
6913    return join '', map { $_->text } $elt->get_xpath( @_);
6914  }
6915
6916
6917# XML::XPath compatibility
6918sub getElementById     { return $_[0]->twig->elt_id( $_[1]); }
6919sub getChildNodes      { my @children= do { my $elt= $_[0]; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; }; return wantarray ? @children : \@children; }
6920
6921sub _flushed     { return $_[0]->{flushed}; }
6922sub _set_flushed { $_[0]->{flushed}=1;      }
6923sub _del_flushed { delete $_[0]->{flushed}; }
6924
6925sub cut
6926  { my $elt= shift;
6927    my( $parent, $prev_sibling, $next_sibling);
6928    $parent=  $elt->{parent};
6929    my $a= $elt->{'att'}->{'a'} || 'na';
6930    if( ! $parent && $elt->is_elt)
6931      { # are we cutting the root?
6932        my $t= $elt->{twig};
6933        if( $t && ! $t->{twig_parsing})
6934          { delete $t->{twig_root};
6935            delete $elt->{twig};
6936            return $elt;
6937          }  # cutt`ing the root
6938        else
6939          { return;  }  # cutting an orphan, returning $elt would break backward compatibility
6940      }
6941
6942    # save the old links, that'll make it easier for some loops
6943    foreach my $link ( qw(parent prev_sibling next_sibling) )
6944      { $elt->{former}->{$link}= $elt->{$link};
6945         if( $XML::Twig::weakrefs) { weaken( $elt->{former}->{$link}); }
6946      }
6947
6948    # if we cut the current element then its parent becomes the current elt
6949    if( $elt->{twig_current})
6950      { my $twig_current= $elt->{parent};
6951        $elt->twig->{twig_current}= $twig_current;
6952        $twig_current->{'twig_current'}=1;
6953        delete $elt->{'twig_current'};
6954      }
6955
6956    if( $parent->{first_child} && $parent->{first_child} == $elt)
6957      { $parent->{first_child}=  $elt->{next_sibling};
6958        # cutting can make the parent empty
6959        if( ! $parent->{first_child}) { $parent->{empty}=  1; }
6960      }
6961
6962    if( $parent->{last_child} && $parent->{last_child} == $elt)
6963      {  $parent->{empty}=0; $parent->{last_child}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
6964      }
6965
6966    if( $prev_sibling= $elt->{prev_sibling})
6967      { $prev_sibling->{next_sibling}=  $elt->{next_sibling}; }
6968    if( $next_sibling= $elt->{next_sibling})
6969      { $next_sibling->{prev_sibling}=$elt->{prev_sibling}; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
6970
6971
6972    $elt->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
6973    $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
6974    $elt->{next_sibling}=  undef;
6975
6976    # merge 2 (now) consecutive text nodes if they are of the same type
6977    # (type can be PCDATA or CDATA)
6978    if( $prev_sibling && $next_sibling && $prev_sibling->is_text && ( $XML::Twig::index2gi[$prev_sibling->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}]))
6979      { $prev_sibling->merge_text( $next_sibling); }
6980
6981    return $elt;
6982  }
6983
6984
6985sub former_next_sibling { return $_[0]->{former}->{next_sibling}; }
6986sub former_prev_sibling { return $_[0]->{former}->{prev_sibling}; }
6987sub former_parent       { return $_[0]->{former}->{parent};       }
6988
6989sub cut_children
6990  { my( $elt, $exp)= @_;
6991    my @children= $elt->children( $exp);
6992    foreach (@children) { $_->cut; }
6993    if( ! $elt->has_children) { $elt->{empty}=  1; }
6994    return @children;
6995  }
6996
6997sub cut_descendants
6998  { my( $elt, $exp)= @_;
6999    my @descendants= $elt->descendants( $exp);
7000    foreach ($elt->descendants( $exp)) { $_->cut; }
7001    if( ! $elt->has_children) { $elt->{empty}=  1; }
7002    return @descendants;
7003  }
7004
7005
7006
7007sub erase
7008  { my $elt= shift;
7009    #you cannot erase the current element
7010    if( $elt->{twig_current})
7011      { croak "trying to erase an element before it has been completely parsed"; }
7012    unless( $elt->{parent})
7013      { # trying to erase the root (of a twig or of a cut/new element)
7014        my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7015        unless( @children == 1)
7016          { croak "can only erase an element with no parent if it has a single child"; }
7017        $elt->_move_extra_data_after_erase;
7018        my $child= shift @children;
7019        $child->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ;
7020        my $twig= $elt->twig;
7021        $twig->set_root( $child);
7022      }
7023    else
7024      { # normal case
7025        $elt->_move_extra_data_after_erase;
7026        my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7027        if( @children)
7028          { # elt has children, move them up
7029
7030            my $first_child= $elt->{first_child};
7031            my $prev_sibling=$elt->{prev_sibling};
7032            if( $prev_sibling)
7033              { # connect first child to previous sibling
7034                $first_child->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $first_child->{prev_sibling});} ;
7035                $prev_sibling->{next_sibling}=  $first_child;
7036              }
7037            else
7038              { # elt was the first child
7039                $elt->{parent}->set_first_child( $first_child);
7040              }
7041
7042            my $last_child= $elt->{last_child};
7043            my $next_sibling= $elt->{next_sibling};
7044            if( $next_sibling)
7045              { # connect last child to next sibling
7046                $last_child->{next_sibling}=  $next_sibling;
7047                $next_sibling->{prev_sibling}=$last_child; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
7048              }
7049            else
7050              { # elt was the last child
7051                $elt->{parent}->set_last_child( $last_child);
7052              }
7053            # update parent for all siblings
7054            foreach my $child (@children)
7055              { $child->{parent}=$elt->{parent}; if( $XML::Twig::weakrefs) { weaken( $child->{parent});} ; }
7056
7057            # merge consecutive text elements if need be
7058            if( $prev_sibling && $prev_sibling->is_text && ($XML::Twig::index2gi[$first_child->{'gi'}] eq $XML::Twig::index2gi[$prev_sibling->{'gi'}]) )
7059              { $prev_sibling->merge_text( $first_child); }
7060            if( $next_sibling && $next_sibling->is_text && ($XML::Twig::index2gi[$last_child->{'gi'}] eq $XML::Twig::index2gi[$next_sibling->{'gi'}]) )
7061              { $last_child->merge_text( $next_sibling); }
7062
7063            # if parsing and have now a PCDATA text, mark so we can normalize later on if need be
7064            if( $elt->{parent}->{twig_current} && $elt->{last_child}->is_text) {  $elt->{parent}->{twig_to_be_normalized}=1; }
7065
7066            # elt is not referenced any more, so it will be DESTROYed
7067            # so we'd better break the links to its children ## FIX
7068            undef $elt->{first_child};
7069            undef $elt->{last_child};
7070            undef $elt->{parent};
7071            undef $elt->{next_sibling};
7072            undef $elt->{prev_sibling};
7073
7074          }
7075          { # elt had no child, delete it
7076             $elt->delete;
7077          }
7078
7079      }
7080    return $elt;
7081
7082  }
7083
7084sub _move_extra_data_after_erase
7085  { my( $elt)= @_;
7086    # extra_data
7087    if( my $extra_data= $elt->{extra_data})
7088      { my $target= $elt->{first_child} || $elt->{next_sibling};
7089        if( $target)
7090          {
7091            if( $target->is( $ELT))
7092              { $target->set_extra_data( $extra_data . ($target->extra_data || '')); }
7093            elsif( $target->is( $TEXT))
7094              { $target->_unshift_extra_data_in_pcdata( $extra_data, 0); }  # TO CHECK
7095          }
7096        else
7097          { my $parent= $elt->{parent}; # always exists or the erase cannot be performed
7098            $parent->_prefix_extra_data_before_end_tag( $extra_data);
7099          }
7100      }
7101
7102     # extra_data_before_end_tag
7103    if( my $extra_data= $elt->{extra_data_before_end_tag})
7104      { if( my $target= $elt->{next_sibling})
7105          { if( $target->is( $ELT))
7106              { $target->set_extra_data( $extra_data . ($target->extra_data || '')); }
7107            elsif( $target->is( $TEXT))
7108              {
7109                $target->_unshift_extra_data_in_pcdata( $extra_data, 0);
7110             }
7111          }
7112        elsif( my $parent= $elt->{parent})
7113          { $parent->_prefix_extra_data_before_end_tag( $extra_data); }
7114       }
7115
7116    return $elt;
7117
7118  }
7119BEGIN
7120  { my %method= ( before      => \&paste_before,
7121                  after       => \&paste_after,
7122                  first_child => \&paste_first_child,
7123                  last_child  => \&paste_last_child,
7124                  within      => \&paste_within,
7125        );
7126
7127    # paste elt somewhere around ref
7128    # pos can be first_child (default), last_child, before, after or within
7129    sub paste ## no critic (Subroutines::ProhibitNestedSubs);
7130      { my $elt= shift;
7131        if( $elt->{parent})
7132          { croak "cannot paste an element that belongs to a tree"; }
7133        my $pos;
7134        my $ref;
7135        if( ref $_[0])
7136          { $pos= 'first_child';
7137            croak "wrong argument order in paste, should be $_[1] first" if($_[1]);
7138          }
7139        else
7140          { $pos= shift; }
7141
7142        if( my $method= $method{$pos})
7143          {
7144            unless( ref( $_[0]) && isa( $_[0], 'XML::Twig::Elt'))
7145              { if( ! defined( $_[0]))
7146                  { croak "missing target in paste"; }
7147                elsif( ! ref( $_[0]))
7148                  { croak "wrong target type in paste (not a reference), should be XML::Twig::Elt or a subclass"; }
7149                else
7150                  { my $ref= ref $_[0];
7151                    croak "wrong target type in paste: '$ref', should be XML::Twig::Elt or a subclass";
7152                  }
7153              }
7154            $ref= $_[0];
7155            # check here so error message lists the caller file/line
7156            if( !$ref->{parent} && ($pos=~ m{^(before|after)$}) && !(exists $elt->{'target'}) && !(exists $elt->{'comment'}))
7157              { croak "cannot paste $1 root"; }
7158            $elt->$method( @_);
7159          }
7160        else
7161          { croak "tried to paste in wrong position '$pos', allowed positions " .
7162              " are 'first_child', 'last_child', 'before', 'after' and "    .
7163              "'within'";
7164          }
7165        if( (my $ids= $elt->{twig_id_list}) && (my $t= $ref->twig) )
7166          { $t->{twig_id_list}||={};
7167            foreach my $id (keys %$ids)
7168              { $t->{twig_id_list}->{$id}= $ids->{$id};
7169                if( $XML::Twig::weakrefs) { weaken( $t->{twig_id_list}->{$id}); }
7170              }
7171          }
7172        return $elt;
7173      }
7174
7175
7176    sub paste_before
7177      { my( $elt, $ref)= @_;
7178        my( $parent, $prev_sibling, $next_sibling );
7179
7180        # trying to paste before an orphan (root or detached wlt)
7181        unless( $ref->{parent})
7182          { if( my $t= $ref->twig)
7183              { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this
7184                  { $t->_add_cpi_outside_of_root( leading_cpi => $elt); return; }
7185                else
7186                  { croak "cannot paste before root"; }
7187              }
7188            else
7189              { croak "cannot paste before an orphan element"; }
7190          }
7191        $parent= $ref->{parent};
7192        $prev_sibling= $ref->{prev_sibling};
7193        $next_sibling= $ref;
7194
7195        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7196        if( $parent->{first_child} == $ref) { $parent->{first_child}=  $elt; }
7197
7198        if( $prev_sibling) { $prev_sibling->{next_sibling}=  $elt; }
7199        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7200
7201        $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
7202        $elt->{next_sibling}=  $ref;
7203        return $elt;
7204      }
7205
7206     sub paste_after
7207      { my( $elt, $ref)= @_;
7208        my( $parent, $prev_sibling, $next_sibling );
7209
7210        # trying to paste after an orphan (root or detached wlt)
7211        unless( $ref->{parent})
7212            { if( my $t= $ref->twig)
7213                { if( (exists $elt->{'comment'}) || (exists $elt->{'target'})) # we can still do this
7214                    { $t->_add_cpi_outside_of_root( trailing_cpi => $elt); return; }
7215                  else
7216                    { croak "cannot paste after root"; }
7217                }
7218              else
7219                { croak "cannot paste after an orphan element"; }
7220            }
7221        $parent= $ref->{parent};
7222        $prev_sibling= $ref;
7223        $next_sibling= $ref->{next_sibling};
7224
7225        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7226        if( $parent->{last_child}== $ref) {  $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; }
7227
7228        $prev_sibling->{next_sibling}=  $elt;
7229        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7230
7231        if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
7232        $elt->{next_sibling}=  $next_sibling;
7233        return $elt;
7234
7235      }
7236
7237    sub paste_first_child
7238      { my( $elt, $ref)= @_;
7239        my( $parent, $prev_sibling, $next_sibling );
7240        $parent= $ref;
7241        $next_sibling= $ref->{first_child};
7242
7243        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7244        $parent->{first_child}=  $elt;
7245        unless( $parent->{last_child}) {  $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ; }
7246
7247        $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7248
7249        if( $next_sibling) { $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ; }
7250        $elt->{next_sibling}=  $next_sibling;
7251        return $elt;
7252      }
7253
7254    sub paste_last_child
7255      { my( $elt, $ref)= @_;
7256        my( $parent, $prev_sibling, $next_sibling );
7257        $parent= $ref;
7258        $prev_sibling= $ref->{last_child};
7259
7260        $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
7261         $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;
7262        unless( $parent->{first_child}) { $parent->{first_child}=  $elt; }
7263
7264        $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
7265        if( $prev_sibling) { $prev_sibling->{next_sibling}=  $elt; }
7266
7267        $elt->{next_sibling}=  undef;
7268        return $elt;
7269      }
7270
7271    sub paste_within
7272      { my( $elt, $ref, $offset)= @_;
7273        my $text= $ref->is_text ? $ref : $ref->next_elt( $TEXT, $ref);
7274        my $new= $text->split_at( $offset);
7275        $elt->paste_before( $new);
7276        return $elt;
7277      }
7278  }
7279
7280# load an element into a structure similar to XML::Simple's
7281sub simplify
7282  { my $elt= shift;
7283
7284    # normalize option names
7285    my %options= @_;
7286    %options= map { my ($key, $val)= ($_, $options{$_});
7287                       $key=~ s{(\w)([A-Z])}{$1_\L$2}g;
7288                       $key => $val
7289                     } keys %options;
7290
7291    # check options
7292    my @allowed_options= qw( keyattr forcearray noattr content_key
7293                             var var_regexp variables var_attr
7294                             group_tags forcecontent
7295                             normalise_space normalize_space
7296                   );
7297    my %allowed_options= map { $_ => 1 } @allowed_options;
7298    foreach my $option (keys %options)
7299      { carp "invalid option $option\n" unless( $allowed_options{$option}); }
7300
7301    $options{normalise_space} ||= $options{normalize_space} || 0;
7302
7303    $options{content_key} ||= 'content';
7304    if( $options{content_key}=~ m{^-})
7305      { # need to remove the - and to activate extra folding
7306        $options{content_key}=~ s{^-}{};
7307        $options{extra_folding}= 1;
7308      }
7309    else
7310      { $options{extra_folding}= 0; }
7311
7312    $options{forcearray} ||=0;
7313    if( isa( $options{forcearray}, 'ARRAY'))
7314      { my %forcearray_tags= map { $_ => 1 } @{$options{forcearray}};
7315        $options{forcearray_tags}= \%forcearray_tags;
7316        $options{forcearray}= 0;
7317      }
7318
7319    $options{keyattr}     ||= ['name', 'key', 'id'];
7320    if( ref $options{keyattr} eq 'ARRAY')
7321      { foreach my $keyattr (@{$options{keyattr}})
7322          { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
7323            $prefix ||= '';
7324            $options{key_for_all}->{$att}= 1;
7325            $options{remove_key_for_all}->{$att}=1 unless( $prefix eq '+');
7326            $options{prefix_key_for_all}->{$att}=1 if( $prefix eq '-');
7327          }
7328      }
7329    elsif( ref $options{keyattr} eq 'HASH')
7330      { while( my( $elt, $keyattr)= each %{$options{keyattr}})
7331         { my( $prefix, $att)= ($keyattr=~ m{^([+-])?(.*)});
7332           $prefix ||='';
7333           $options{key_for_elt}->{$elt}= $att;
7334           $options{remove_key_for_elt}->{"$elt#$att"}=1 unless( $prefix);
7335           $options{prefix_key_for_elt}->{"$elt#$att"}=1 if( $prefix eq '-');
7336         }
7337      }
7338
7339
7340    $options{var}||= $options{var_attr}; # for compat with XML::Simple
7341    if( $options{var}) { $options{var_values}= {}; }
7342    else               { $options{var}='';         }
7343
7344    if( $options{variables})
7345      { $options{var}||= 1;
7346        $options{var_values}= $options{variables};
7347      }
7348
7349    if( $options{var_regexp} and !$options{var})
7350      { warn "var option not used, var_regexp option ignored\n"; }
7351    $options{var_regexp} ||= '\$\{?(\w+)\}?';
7352
7353    $elt->_simplify( \%options);
7354
7355 }
7356
7357sub _simplify
7358  { my( $elt, $options)= @_;
7359
7360    my $data;
7361
7362    my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
7363    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7364    my %atts= $options->{noattr} || !$elt->{att} ? () : %{$elt->{att}};
7365    my $nb_atts= keys %atts;
7366    my $nb_children= $elt->children_count + $nb_atts;
7367
7368    my %nb_children;
7369    foreach (@children)   { $nb_children{$_->tag}++; }
7370    foreach (keys %atts)  { $nb_children{$_}++;      }
7371
7372    my $arrays; # tag => array where elements are stored
7373
7374
7375    # store children
7376    foreach my $child (@children)
7377      { if( $child->is_text)
7378          { # generate with a content key
7379            my $text= $elt->_text_with_vars( $options);
7380            if( $options->{normalise_space} >= 2) { $text= _normalize_space( $text); }
7381            if(    $options->{force_content}
7382                || $nb_atts
7383                || (scalar @children > 1)
7384              )
7385              { $data->{$options->{content_key}}= $text; }
7386            else
7387              { $data= $text; }
7388          }
7389        else
7390          { # element with sub-elements
7391            my $child_gi= $XML::Twig::index2gi[$child->{'gi'}];
7392
7393            my $child_data= $child->_simplify( $options);
7394
7395            # first see if we need to simplify further the child data
7396            # simplify because of grouped tags
7397            if( my $grouped_tag= $options->{group_tags}->{$child_gi})
7398              { # check that the child data is a hash with a single field
7399                unless(    (ref( $child_data) eq 'HASH')
7400                        && (keys %$child_data == 1)
7401                        && defined ( my $grouped_child_data= $child_data->{$grouped_tag})
7402                      )
7403                  { croak "error in grouped tag $child_gi"; }
7404                else
7405                  { $child_data=  $grouped_child_data; }
7406              }
7407            # simplify because of extra folding
7408            if( $options->{extra_folding})
7409              { if(    (ref( $child_data) eq 'HASH')
7410                    && (keys %$child_data == 1)
7411                    && defined( my $content= $child_data->{$options->{content_key}})
7412                  )
7413                  { $child_data= $content; }
7414              }
7415
7416            if( my $keyatt= $child->_key_attr( $options))
7417              { # simplify element with key
7418                my $key= $child->{'att'}->{$keyatt};
7419                if( $options->{normalise_space} >= 1) { $key= _normalize_space( $key); }
7420                $data->{$child_gi}->{$key}= $child_data;
7421              }
7422            elsif(      $options->{forcearray}
7423                   ||   $options->{forcearray_tags}->{$child_gi}
7424                   || ( $nb_children{$child_gi} > 1)
7425                 )
7426              { # simplify element to store in an array
7427                $data->{$child_gi} ||= [];
7428                push @{$data->{$child_gi}}, $child_data;
7429              }
7430            else
7431              { # simplify element to store as a hash field
7432                $data->{$child_gi}= $child_data;
7433              }
7434          }
7435    }
7436
7437    # store atts
7438    # TODO: deal with att that already have an element by that name
7439    foreach my $att (keys %atts)
7440      { # do not store if the att is a key that needs to be removed
7441        if(    $options->{remove_key_for_all}->{$att}
7442            || $options->{remove_key_for_elt}->{"$gi#$att"}
7443          )
7444          { next; }
7445
7446        my $att_text= $options->{var} ?  _replace_vars_in_text( $atts{$att}, $options) : $atts{$att} ;
7447        if( $options->{normalise_space} >= 2) { $att_text= _normalize_space( $att_text); }
7448
7449        if(    $options->{prefix_key_for_all}->{$att}
7450            || $options->{prefix_key_for_elt}->{"$gi#$att"}
7451          )
7452          { # prefix the att
7453            $data->{"-$att"}= $att_text;
7454          }
7455        else
7456          { # normal case
7457            $data->{$att}= $att_text;
7458          }
7459      }
7460
7461    return $data;
7462  }
7463
7464sub _key_attr
7465  { my( $elt, $options)=@_;
7466    return if( $options->{noattr});
7467    if( $options->{key_for_all})
7468      { foreach my $att ($elt->att_names)
7469          { if( $options->{key_for_all}->{$att})
7470              { return $att; }
7471          }
7472      }
7473    elsif( $options->{key_for_elt})
7474      { if( my $key_for_elt= $options->{key_for_elt}->{$XML::Twig::index2gi[$elt->{'gi'}]} )
7475          { return $key_for_elt if( defined( $elt->{'att'}->{$key_for_elt})); }
7476      }
7477    return;
7478  }
7479
7480sub _text_with_vars
7481  { my( $elt, $options)= @_;
7482    my $text;
7483    if( $options->{var})
7484      { $text= _replace_vars_in_text( $elt->text, $options);
7485        $elt->_store_var( $options);
7486      }
7487     else
7488      { $text= $elt->text; }
7489    return $text;
7490  }
7491
7492
7493sub _normalize_space
7494  { my $text= shift;
7495    $text=~ s{\s+}{ }sg;
7496    $text=~ s{^\s}{};
7497    $text=~ s{\s$}{};
7498    return $text;
7499  }
7500
7501
7502sub att_nb
7503  { return 0 unless( my $atts= $_[0]->{att});
7504    return scalar keys %$atts;
7505  }
7506
7507sub has_no_atts
7508  { return 1 unless( my $atts= $_[0]->{att});
7509    return scalar keys %$atts ? 0 : 1;
7510  }
7511
7512sub _replace_vars_in_text
7513  { my( $text, $options)= @_;
7514
7515    $text=~ s{($options->{var_regexp})}
7516             { if( defined( my $value= $options->{var_values}->{$2}))
7517                 { $value }
7518               else
7519                 { warn "unknown variable $2\n";
7520                   $1
7521                 }
7522             }gex;
7523    return $text;
7524  }
7525
7526sub _store_var
7527  { my( $elt, $options)= @_;
7528    if( defined (my $var_name= $elt->{'att'}->{$options->{var}}))
7529       { $options->{var_values}->{$var_name}= $elt->text;
7530       }
7531  }
7532
7533
7534# split a text element at a given offset
7535sub split_at
7536  { my( $elt, $offset)= @_;
7537    my $text_elt= $elt->is_text ? $elt : $elt->first_child( $TEXT) || return '';
7538    my $string= $text_elt->text;
7539    my $left_string= substr( $string, 0, $offset);
7540    my $right_string= substr( $string, $offset);
7541    $text_elt->{pcdata}= (delete $text_elt->{empty} || 1) &&  $left_string;
7542    my $new_elt= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}], $right_string);
7543    $new_elt->paste( after => $elt);
7544    return $new_elt;
7545  }
7546
7547
7548# split an element or its text descendants into several, in place
7549# all elements (new and untouched) are returned
7550sub split
7551  { my $elt= shift;
7552    my @text_chunks;
7553    my @result;
7554    if( $elt->is_text) { @text_chunks= ($elt); }
7555    else               { @text_chunks= $elt->descendants( $TEXT); }
7556    foreach my $text_chunk (@text_chunks)
7557      { push @result, $text_chunk->_split( 1, @_); }
7558    return @result;
7559  }
7560
7561# split an element or its text descendants into several, in place
7562# created elements (those which match the regexp) are returned
7563sub mark
7564  { my $elt= shift;
7565    my @text_chunks;
7566    my @result;
7567    if( $elt->is_text) { @text_chunks= ($elt); }
7568    else               { @text_chunks= $elt->descendants( $TEXT); }
7569    foreach my $text_chunk (@text_chunks)
7570      { push @result, $text_chunk->_split( 0, @_); }
7571    return @result;
7572  }
7573
7574# split a single text element
7575# return_all defines what is returned: if it is true
7576# only returns the elements created by matches in the split regexp
7577# otherwise all elements (new and untouched) are returned
7578
7579
7580{
7581
7582  sub _split
7583    { my $elt= shift;
7584      my $return_all= shift;
7585      my $regexp= shift;
7586      my @tags;
7587
7588      while( @_)
7589        { my $tag= shift();
7590          if( ref $_[0])
7591            { push @tags, { tag => $tag, atts => shift }; }
7592          else
7593            { push @tags, { tag => $tag }; }
7594        }
7595
7596      unless( @tags) { @tags= { tag => $elt->{parent}->gi }; }
7597
7598      my @result;                                 # the returned list of elements
7599      my $text= $elt->text;
7600      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
7601
7602      # 2 uses: if split matches then the first substring reuses $elt
7603      #         once a split has occurred then the last match needs to be put in
7604      #         a new element
7605      my $previous_match= 0;
7606
7607      while( my( $pre_match, @matches)= $text=~ /^(.*?)$regexp(.*)$/gcs)
7608        { $text= pop @matches;
7609          if( $previous_match)
7610            { # match, not the first one, create a new text ($gi) element
7611              _utf8_ify( $pre_match) if( $] < 5.010);
7612              $elt= $elt->insert_new_elt( after => $gi, $pre_match);
7613              push @result, $elt if( $return_all);
7614            }
7615          else
7616            { # first match in $elt, re-use $elt for the first sub-string
7617              _utf8_ify( $pre_match) if( $] < 5.010);
7618              $elt->set_text( $pre_match);
7619              $previous_match++;                # store the fact that there was a match
7620              push @result, $elt if( $return_all);
7621            }
7622
7623          # now deal with matches captured in the regexp
7624          if( @matches)
7625            { # match, with capture
7626              my $i=0;
7627              foreach my $match (@matches)
7628                { # create new element, text is the match
7629                  _utf8_ify( $match) if( $] < 5.010);
7630                  my $tag  = _repl_match( $tags[$i]->{tag}, @matches) || '#PCDATA';
7631                  my $atts = \%{$tags[$i]->{atts}} || {};
7632                  my %atts= map { _repl_match( $_, @matches) => _repl_match( $atts->{$_}, @matches) } keys %$atts;
7633                  $elt= $elt->insert_new_elt( after => $tag, \%atts, $match);
7634                  push @result, $elt;
7635                  $i= ($i + 1) % @tags;
7636                }
7637            }
7638          else
7639            { # match, no captures
7640              my $tag  = $tags[0]->{tag};
7641              my $atts = \%{$tags[0]->{atts}} || {};
7642              $elt=  $elt->insert_new_elt( after => $tag, $atts);
7643              push @result, $elt;
7644            }
7645        }
7646      if( $previous_match && $text)
7647        { # there was at least 1 match, and there is text left after the match
7648          $elt= $elt->insert_new_elt( after => $gi, $text);
7649        }
7650
7651      push @result, $elt if( $return_all);
7652
7653      return @result; # return all elements
7654   }
7655
7656sub _repl_match
7657  { my( $val, @matches)= @_;
7658    $val=~ s{\$(\d+)}{$matches[$1-1]}g;
7659    return $val;
7660  }
7661
7662  # evil hack needed as sometimes
7663  my $encode_is_loaded=0;   # so we only load Encode once
7664  sub _utf8_ify
7665    {
7666      if( $perl_version >= 5.008 and $perl_version < 5.010 and !_keep_encoding())
7667        { unless( $encode_is_loaded) { require Encode; import Encode; $encode_is_loaded++; }
7668          Encode::_utf8_on( $_[0]); # the flag should be set but is not
7669        }
7670    }
7671
7672
7673}
7674
7675{ my %replace_sub; # cache for complex expressions (expression => sub)
7676
7677  sub subs_text
7678    { my( $elt, $regexp, $replace)= @_;
7679
7680      my $replacement_string;
7681      my $is_string= _is_string( $replace);
7682
7683      my @parents;
7684
7685      foreach my $text_elt ($elt->descendants_or_self( $TEXT))
7686        {
7687          if( $is_string)
7688            { my $text= $text_elt->text;
7689              $text=~ s{$regexp}{ _replace_var( $replace, $1, $2, $3, $4, $5, $6, $7, $8, $9)}egx;
7690              $text_elt->set_text( $text);
7691           }
7692          else
7693            {
7694              no utf8; # = perl 5.6
7695              my $replace_sub= ( $replace_sub{$replace} ||= _install_replace_sub( $replace));
7696              my $text= $text_elt->text;
7697              my $pos=0;  # used to skip text that was previously matched
7698              my $found_hit;
7699              while( my( $pre_match_string, $match_string, @var)= ($text=~ m{(.*?)($regexp)}sg))
7700                { $found_hit=1;
7701                  my $match_start  = length( $pre_match_string);
7702                  my $match        = $match_start ? $text_elt->split_at( $match_start + $pos) : $text_elt;
7703                  my $match_length = length( $match_string);
7704                  my $post_match   = $match->split_at( $match_length);
7705                  $replace_sub->( $match, @var);
7706
7707                  # go to next
7708                  $text_elt= $post_match;
7709                  $text= $post_match->text;
7710
7711                  if( $found_hit) { push @parents, $text_elt->{parent} unless $parents[-1] && $parents[-1]== $text_elt->{parent}; }
7712
7713                }
7714            }
7715        }
7716
7717      foreach my $parent (@parents) { $parent->normalize; }
7718
7719      return $elt;
7720    }
7721
7722
7723  sub _is_string
7724    { return ($_[0]=~ m{&e[ln]t}) ? 0: 1 }
7725
7726  sub _replace_var
7727    { my( $string, @var)= @_;
7728      unshift @var, undef;
7729      $string=~ s{\$(\d)}{$var[$1]}g;
7730      return $string;
7731    }
7732
7733  sub _install_replace_sub
7734    { my $replace_exp= shift;
7735      my @item= split m{(&e[ln]t\s*\([^)]*\))}, $replace_exp;
7736      my $sub= q{ my( $match, @var)= @_; my $new; my $last_inserted=$match;};
7737      my( $gi, $exp);
7738      foreach my $item (@item)
7739        { next if ! length $item;
7740          if(    $item=~ m{^&elt\s*\(([^)]*)\)})
7741            { $exp= $1; }
7742          elsif( $item=~ m{^&ent\s*\(\s*([^\s)]*)\s*\)})
7743            { $exp= " '#ENT' => $1"; }
7744          else
7745            { $exp= qq{ '#PCDATA' => "$item"}; }
7746          $exp=~ s{\$(\d)}{my $i= $1-1; "\$var[$i]"}eg; # replace references to matches
7747          $sub.= qq{ \$new= \$match->new( $exp); };
7748          $sub .= q{ $new->paste( after => $last_inserted); $last_inserted=$new;};
7749        }
7750      $sub .= q{ $match->delete; };
7751      #$sub=~ s/;/;\n/g; warn "subs: $sub";
7752      my $coderef= eval "sub { $NO_WARNINGS; $sub }";
7753      if( $@) { croak( "invalid replacement expression $replace_exp: ",$@); }
7754      return $coderef;
7755    }
7756
7757  }
7758
7759
7760sub merge_text
7761  { my( $e1, $e2)= @_;
7762    croak "invalid merge: can only merge 2 elements"
7763        unless( isa( $e2, 'XML::Twig::Elt'));
7764    croak "invalid merge: can only merge 2 text elements"
7765        unless( $e1->is_text && $e2->is_text && ($e1->gi eq $e2->gi));
7766
7767    my $t1_length= length( $e1->text);
7768
7769    $e1->set_text( $e1->text . $e2->text);
7770
7771    if( my $extra_data_in_pcdata= $e2->_extra_data_in_pcdata)
7772      { foreach my $data (@$extra_data_in_pcdata) { $e1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } }
7773
7774    $e2->delete;
7775
7776    return $e1;
7777  }
7778
7779sub merge
7780  { my( $e1, $e2)= @_;
7781    my @e2_children= $e2->_children;
7782    if(     $e1->_last_child && $e1->_last_child->is_pcdata
7783        &&  @e2_children && $e2_children[0]->is_pcdata
7784      )
7785      { my $t1_length= length( $e1->_last_child->{pcdata});
7786        my $child1= $e1->_last_child;
7787        my $child2= shift @e2_children;
7788        $child1->{pcdata} .= $child2->{pcdata};
7789
7790        my $extra_data= $e1->_extra_data_before_end_tag . $e2->extra_data;
7791
7792        if( $extra_data)
7793          { $e1->_del_extra_data_before_end_tag;
7794            $child1->_push_extra_data_in_pcdata( $extra_data, $t1_length);
7795          }
7796
7797        if( my $extra_data_in_pcdata= $child2->_extra_data_in_pcdata)
7798          { foreach my $data (@$extra_data_in_pcdata) { $child1->_push_extra_data_in_pcdata( $data->{text}, $data->{offset} + $t1_length); } }
7799
7800        if( my $extra_data_before_end_tag= $e2->_extra_data_before_end_tag)
7801          { $e1->_set_extra_data_before_end_tag( $extra_data_before_end_tag); }
7802      }
7803
7804    foreach my $e (@e2_children) { $e->move( last_child => $e1); }
7805
7806    $e2->delete;
7807    return $e1;
7808  }
7809
7810
7811# recursively copy an element and returns the copy (can be huge and long)
7812sub copy
7813  { my $elt= shift;
7814    my $copy= $elt->new( $XML::Twig::index2gi[$elt->{'gi'}]);
7815
7816    if( $elt->extra_data) { $copy->set_extra_data( $elt->extra_data); }
7817    if( $elt->{extra_data_before_end_tag}) { $copy->_set_extra_data_before_end_tag( $elt->{extra_data_before_end_tag}); }
7818
7819    if( $elt->is_asis)   { $copy->set_asis; }
7820
7821    if( (exists $elt->{'pcdata'}))
7822      { $copy->{pcdata}= (delete $copy->{empty} || 1) &&  $elt->{pcdata};
7823        if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); }
7824      }
7825    elsif( (exists $elt->{'cdata'}))
7826      { $copy->_set_cdata( $elt->{cdata});
7827        if( $elt->{extra_data_in_pcdata}) { $copy->_set_extra_data_in_pcdata( $elt->{extra_data_in_pcdata}); }
7828      }
7829    elsif( (exists $elt->{'target'}))
7830      { $copy->_set_pi( $elt->{target}, $elt->{data}); }
7831    elsif( (exists $elt->{'comment'}))
7832      { $copy->_set_comment( $elt->{comment}); }
7833    elsif( (exists $elt->{'ent'}))
7834      { $copy->{ent}=  $elt->{ent}; }
7835    else
7836      { my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
7837        if( my $atts= $elt->{att})
7838          { my %atts;
7839            tie %atts, 'Tie::IxHash' if (keep_atts_order());
7840            %atts= %{$atts}; # we want to do a real copy of the attributes
7841            $copy->set_atts( \%atts);
7842          }
7843        foreach my $child (@children)
7844          { my $child_copy= $child->copy;
7845            $child_copy->paste( 'last_child', $copy);
7846          }
7847      }
7848    # save links to the original location, which can be convenient and is used for namespace resolution
7849    foreach my $link ( qw(parent prev_sibling next_sibling) )
7850      { $copy->{former}->{$link}= $elt->{$link};
7851        if( $XML::Twig::weakrefs) { weaken( $copy->{former}->{$link}); }
7852      }
7853
7854    $copy->{empty}=  $elt->{'empty'};
7855
7856    return $copy;
7857  }
7858
7859
7860sub delete
7861  { my $elt= shift;
7862    $elt->cut;
7863    $elt->DESTROY unless $XML::Twig::weakrefs;
7864    return undef;
7865  }
7866
7867sub __destroy
7868  { my $elt= shift;
7869    return if( $XML::Twig::weakrefs);
7870    my $t= shift || $elt->twig; # optional argument, passed in recursive calls
7871
7872    foreach( @{[$elt->_children]}) { $_->DESTROY( $t); }
7873
7874    # the id reference needs to be destroyed
7875    # lots of tests to avoid warnings during the cleanup phase
7876    $elt->del_id( $t) if( $ID && $t && defined( $elt->{att}) && exists( $elt->{att}->{$ID}));
7877    if( $elt->{former}) { foreach (keys %{$elt->{former}}) { delete $elt->{former}->{$_}; } delete $elt->{former}; }
7878    foreach (qw( keys %$elt)) { delete $elt->{$_}; }
7879    undef $elt;
7880  }
7881
7882BEGIN
7883{ sub set_destroy { if( $XML::Twig::weakrefs) { undef *DESTROY } else { *DESTROY= *__destroy; } }
7884  set_destroy();
7885}
7886
7887# ignores the element
7888sub ignore
7889  { my $elt= shift;
7890    my $t= $elt->twig;
7891    $t->ignore( $elt, @_);
7892  }
7893
7894BEGIN {
7895  my $pretty                    = 0;
7896  my $quote                     = '"';
7897  my $INDENT                    = '  ';
7898  my $empty_tag_style           = 0;
7899  my $remove_cdata              = 0;
7900  my $keep_encoding             = 0;
7901  my $expand_external_entities  = 0;
7902  my $keep_atts_order           = 0;
7903  my $do_not_escape_amp_in_atts = 0;
7904  my $WRAP                      = '80';
7905  my $REPLACED_ENTS             = qq{&<};
7906
7907  my ($NSGMLS, $NICE, $INDENTED, $INDENTEDCT, $INDENTEDC, $WRAPPED, $RECORD1, $RECORD2, $INDENTEDA)= (1..9);
7908  my %KEEP_TEXT_TAG_ON_ONE_LINE= map { $_ => 1 } ( $INDENTED, $INDENTEDCT, $INDENTEDC, $INDENTEDA, $WRAPPED);
7909  my %WRAPPED =  map { $_ => 1 } ( $WRAPPED, $INDENTEDA, $INDENTEDC);
7910
7911  my %pretty_print_style=
7912    ( none       => 0,          # no added \n
7913      nsgmls     => $NSGMLS,    # nsgmls-style, \n in tags
7914      # below this line styles are UNSAFE (the generated XML can be well-formed but invalid)
7915      nice       => $NICE,      # \n after open/close tags except when the
7916                                # element starts with text
7917      indented   => $INDENTED,  # nice plus idented
7918      indented_close_tag   => $INDENTEDCT,  # nice plus idented
7919      indented_c => $INDENTEDC, # slightly more compact than indented (closing
7920                                # tags are on the same line)
7921      wrapped    => $WRAPPED,   # text is wrapped at column
7922      record_c   => $RECORD1,   # for record-like data (compact)
7923      record     => $RECORD2,   # for record-like data  (not so compact)
7924      indented_a => $INDENTEDA, # nice, indented, and with attributes on separate
7925                                # lines as the nsgmls style, as well as wrapped
7926                                # lines - to make the xml friendly to line-oriented tools
7927      cvs        => $INDENTEDA, # alias for indented_a
7928    );
7929
7930  my ($HTML, $EXPAND)= (1..2);
7931  my %empty_tag_style=
7932    ( normal => 0,        # <tag/>
7933      html   => $HTML,    # <tag />
7934      xhtml  => $HTML,    # <tag />
7935      expand => $EXPAND,  # <tag></tag>
7936    );
7937
7938  my %quote_style=
7939    ( double  => '"',
7940      single  => "'",
7941      # smart  => "smart",
7942    );
7943
7944  my $xml_space_preserve; # set when an element includes xml:space="preserve"
7945
7946  my $output_filter;      # filters the entire output (including < and >)
7947  my $output_text_filter; # filters only the text part (tag names, attributes, pcdata)
7948
7949  my $replaced_ents= $REPLACED_ENTS;
7950
7951
7952  # returns those pesky "global" variables so you can switch between twigs
7953  sub global_state ## no critic (Subroutines::ProhibitNestedSubs);
7954    { return
7955       { pretty                    => $pretty,
7956         quote                     => $quote,
7957         indent                    => $INDENT,
7958         empty_tag_style           => $empty_tag_style,
7959         remove_cdata              => $remove_cdata,
7960         keep_encoding             => $keep_encoding,
7961         expand_external_entities  => $expand_external_entities,
7962         output_filter             => $output_filter,
7963         output_text_filter        => $output_text_filter,
7964         keep_atts_order           => $keep_atts_order,
7965         do_not_escape_amp_in_atts => $do_not_escape_amp_in_atts,
7966         wrap                      => $WRAP,
7967         replaced_ents             => $replaced_ents,
7968        };
7969    }
7970
7971  # restores the global variables
7972  sub set_global_state
7973    { my $state= shift;
7974      $pretty                    = $state->{pretty};
7975      $quote                     = $state->{quote};
7976      $INDENT                    = $state->{indent};
7977      $empty_tag_style           = $state->{empty_tag_style};
7978      $remove_cdata              = $state->{remove_cdata};
7979      $keep_encoding             = $state->{keep_encoding};
7980      $expand_external_entities  = $state->{expand_external_entities};
7981      $output_filter             = $state->{output_filter};
7982      $output_text_filter        = $state->{output_text_filter};
7983      $keep_atts_order           = $state->{keep_atts_order};
7984      $do_not_escape_amp_in_atts = $state->{do_not_escape_amp_in_atts};
7985      $WRAP                      = $state->{wrap};
7986      $replaced_ents             = $state->{replaced_ents},
7987    }
7988
7989  # sets global state to defaults
7990  sub init_global_state
7991    { set_global_state(
7992       { pretty                    => 0,
7993         quote                     => '"',
7994         indent                    => $INDENT,
7995         empty_tag_style           => 0,
7996         remove_cdata              => 0,
7997         keep_encoding             => 0,
7998         expand_external_entities  => 0,
7999         output_filter             => undef,
8000         output_text_filter        => undef,
8001         keep_atts_order           => undef,
8002         do_not_escape_amp_in_atts => 0,
8003         wrap                      => $WRAP,
8004         replaced_ents             => $REPLACED_ENTS,
8005        });
8006    }
8007
8008
8009  # set the pretty_print style (in $pretty) and returns the old one
8010  # can be called from outside the package with 2 arguments (elt, style)
8011  # or from inside with only one argument (style)
8012  # the style can be either a string (one of the keys of %pretty_print_style
8013  # or a number (presumably an old value saved)
8014  sub set_pretty_print
8015    { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
8016      my $old_pretty= $pretty;
8017      if( $style=~ /^\d+$/)
8018        { croak "invalid pretty print style $style" unless( $style < keys %pretty_print_style);
8019          $pretty= $style;
8020        }
8021      else
8022        { croak "invalid pretty print style '$style'" unless( exists $pretty_print_style{$style});
8023          $pretty= $pretty_print_style{$style};
8024        }
8025      if( $WRAPPED{$pretty} )
8026        { XML::Twig::_use( 'Text::Wrap') or croak( "Text::Wrap not available, cannot use style $style"); }
8027      return $old_pretty;
8028    }
8029
8030  sub _pretty_print { return $pretty; }
8031
8032  # set the empty tag style (in $empty_tag_style) and returns the old one
8033  # can be called from outside the package with 2 arguments (elt, style)
8034  # or from inside with only one argument (style)
8035  # the style can be either a string (one of the keys of %empty_tag_style
8036  # or a number (presumably an old value saved)
8037  sub set_empty_tag_style
8038    { my $style= lc( defined $_[1] ? $_[1] : $_[0]); # so we cover both cases
8039      my $old_style= $empty_tag_style;
8040      if( $style=~ /^\d+$/)
8041        { croak "invalid empty tag style $style"
8042        unless( $style < keys %empty_tag_style);
8043        $empty_tag_style= $style;
8044        }
8045      else
8046        { croak "invalid empty tag style '$style'"
8047            unless( exists $empty_tag_style{$style});
8048          $empty_tag_style= $empty_tag_style{$style};
8049        }
8050      return $old_style;
8051    }
8052
8053  sub _pretty_print_styles
8054    { return (sort { $pretty_print_style{$a} <=> $pretty_print_style{$b} || $a cmp $b } keys %pretty_print_style); }
8055
8056  sub set_quote
8057    { my $style= $_[1] || $_[0];
8058      my $old_quote= $quote;
8059      croak "invalid quote '$style'" unless( exists $quote_style{$style});
8060      $quote= $quote_style{$style};
8061      return $old_quote;
8062    }
8063
8064  sub set_remove_cdata
8065    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8066      my $old_value= $remove_cdata;
8067      $remove_cdata= $new_value;
8068      return $old_value;
8069    }
8070
8071
8072  sub set_indent
8073    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8074      my $old_value= $INDENT;
8075      $INDENT= $new_value;
8076      return $old_value;
8077    }
8078
8079  sub set_wrap
8080    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8081      my $old_value= $WRAP;
8082      $WRAP= $new_value;
8083      return $old_value;
8084    }
8085
8086
8087  sub set_keep_encoding
8088    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8089      my $old_value= $keep_encoding;
8090      $keep_encoding= $new_value;
8091      return $old_value;
8092   }
8093
8094  sub set_replaced_ents
8095    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8096      my $old_value= $replaced_ents;
8097      $replaced_ents= $new_value;
8098      return $old_value;
8099   }
8100
8101  sub do_not_escape_gt
8102    { my $old_value= $replaced_ents;
8103      $replaced_ents= q{&<}; # & needs to be first
8104      return $old_value;
8105    }
8106
8107  sub escape_gt
8108    { my $old_value= $replaced_ents;
8109      $replaced_ents= qq{&<>}; # & needs to be first
8110      return $old_value;
8111    }
8112
8113  sub _keep_encoding { return $keep_encoding; } # so I can use elsewhere in the module
8114
8115  sub set_do_not_escape_amp_in_atts
8116    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8117      my $old_value= $do_not_escape_amp_in_atts;
8118      $do_not_escape_amp_in_atts= $new_value;
8119      return $old_value;
8120   }
8121
8122  sub output_filter      { return $output_filter; }
8123  sub output_text_filter { return $output_text_filter; }
8124
8125  sub set_output_filter
8126    { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
8127      # if called in object mode with no argument, the filter is undefined
8128      if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
8129      my $old_value= $output_filter;
8130      if( !$new_value || isa( $new_value, 'CODE') )
8131        { $output_filter= $new_value; }
8132      elsif( $new_value eq 'latin1')
8133        { $output_filter= XML::Twig::latin1();
8134        }
8135      elsif( $XML::Twig::filter{$new_value})
8136        {  $output_filter= $XML::Twig::filter{$new_value}; }
8137      else
8138        { croak "invalid output filter '$new_value'"; }
8139
8140      return $old_value;
8141    }
8142
8143  sub set_output_text_filter
8144    { my $new_value= defined $_[1] ? $_[1] : $_[0]; # can be called in object/non-object mode
8145      # if called in object mode with no argument, the filter is undefined
8146      if( isa( $new_value, 'XML::Twig::Elt') || isa( $new_value, 'XML::Twig')) { undef $new_value; }
8147      my $old_value= $output_text_filter;
8148      if( !$new_value || isa( $new_value, 'CODE') )
8149        { $output_text_filter= $new_value; }
8150      elsif( $new_value eq 'latin1')
8151        { $output_text_filter= XML::Twig::latin1();
8152        }
8153      elsif( $XML::Twig::filter{$new_value})
8154        {  $output_text_filter= $XML::Twig::filter{$new_value}; }
8155      else
8156        { croak "invalid output text filter '$new_value'"; }
8157
8158      return $old_value;
8159    }
8160
8161  sub set_expand_external_entities
8162    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8163      my $old_value= $expand_external_entities;
8164      $expand_external_entities= $new_value;
8165      return $old_value;
8166    }
8167
8168  sub set_keep_atts_order
8169    { my $new_value= defined $_[1] ? $_[1] : $_[0];
8170      my $old_value= $keep_atts_order;
8171      $keep_atts_order= $new_value;
8172      return $old_value;
8173
8174   }
8175
8176  sub keep_atts_order { return $keep_atts_order; } # so I can use elsewhere in the module
8177
8178  my %html_empty_elt;
8179  BEGIN { %html_empty_elt= map { $_ => 1} qw( base meta link hr br param img area input col); }
8180
8181  sub start_tag
8182    { my( $elt, $option)= @_;
8183
8184
8185      return if( $elt->{gi} < $XML::Twig::SPECIAL_GI);
8186
8187      my $extra_data= $elt->{extra_data} || '';
8188
8189      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
8190      my $att= $elt->{att}; # should be $elt->{att}, optimized into a pure hash look-up
8191
8192      my $ns_map= $att ? $att->{'#original_gi'} : '';
8193      if( $ns_map) { $gi= _restore_original_prefix( $ns_map, $gi); }
8194      $gi=~ s{^#default:}{}; # remove default prefix
8195
8196      if( $output_text_filter) { $gi= $output_text_filter->( $gi); }
8197
8198      # get the attribute and their values
8199      my $att_sep = $pretty==$NSGMLS    ? "\n"
8200                  : $pretty==$INDENTEDA ? "\n" . $INDENT x ($elt->level+1) . '  '
8201                  :                       ' '
8202                  ;
8203
8204      my $replace_in_att_value= $replaced_ents . "$quote\t\r\n";
8205      if( $option->{escape_gt} && $replaced_ents !~ m{>}) { $replace_in_att_value.= '>'; }
8206
8207      my $tag;
8208      my @att_names= grep { !( $_=~ m{^#(?!default:)} ) } $keep_atts_order ?  keys %{$att} : sort keys %{$att};
8209      if( @att_names)
8210        { my $atts= join $att_sep, map  { my $output_att_name= $ns_map ? _restore_original_prefix( $ns_map, $_) : $_;
8211                                          if( $output_text_filter)
8212                                            { $output_att_name=  $output_text_filter->( $output_att_name); }
8213                                          $output_att_name . '=' . $quote . _att_xml_string( $att->{$_}, $replace_in_att_value) . $quote
8214
8215                                        }
8216                                        @att_names
8217                                   ;
8218           if( $pretty==$INDENTEDA && @att_names == 1) { $att_sep= ' '; }
8219           $tag= "<$gi$att_sep$atts";
8220        }
8221      else
8222        { $tag= "<$gi"; }
8223
8224      $tag .= "\n" if($pretty==$NSGMLS);
8225
8226
8227      # force empty if suitable HTML tag, otherwise use the value from the input tree
8228      if( ($empty_tag_style eq $HTML) && !$elt->{first_child} && !$elt->{extra_data_before_end_tag} && $html_empty_elt{$gi})
8229        { $elt->{empty}= 1; }
8230      my $empty= defined $elt->{empty} ? $elt->{empty}
8231               : $elt->{first_child}    ? 0
8232               :                         1;
8233
8234      $tag .= (!$elt->{empty} || $elt->{extra_data_before_end_tag})  ? '>'            # element has content
8235            : (($empty_tag_style eq $HTML) && $html_empty_elt{$gi}) ? ' />'          # html empty element
8236                                                                                     # cvs-friendly format
8237            : ( $pretty == $INDENTEDA && @att_names > 1)            ? "\n" .  $INDENT x $elt->level . "/>"
8238            : ( $pretty == $INDENTEDA && @att_names == 1)           ? " />"
8239            : $empty_tag_style                                      ? "></" . $XML::Twig::index2gi[$elt->{'gi'}] . ">" # $empty_tag_style is $HTML or $EXPAND
8240            :                                                         '/>'
8241            ;
8242
8243      if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; }
8244
8245#warn "TRACE: ", $tag,": ", Encode::is_utf8( $tag) ? "has flag" : "FLAG NOT SET";
8246
8247      unless( $pretty) { return defined( $extra_data) ? $extra_data . $tag : $tag;  }
8248
8249      my $prefix='';
8250      my $return='';   # '' or \n is to be printed before the tag
8251      my $indent=0;    # number of indents before the tag
8252
8253      if( $pretty==$RECORD1)
8254        { my $level= $elt->level;
8255          $return= "\n" if( $level < 2);
8256          $indent= 1 if( $level == 1);
8257        }
8258
8259     elsif( $pretty==$RECORD2)
8260        { $return= "\n";
8261          $indent= $elt->level;
8262        }
8263
8264      elsif( $pretty==$NICE)
8265        { my $parent= $elt->{parent};
8266          unless( !$parent || $parent->{contains_text})
8267            { $return= "\n"; }
8268          $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
8269                                     || $elt->contains_text);
8270        }
8271
8272      elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty})
8273        { my $parent= $elt->{parent};
8274          unless( !$parent || $parent->{contains_text})
8275            { $return= "\n";
8276              $indent= $elt->level;
8277            }
8278          $elt->{contains_text}= 1 if( ($parent && $parent->{contains_text})
8279                                     || $elt->contains_text);
8280        }
8281
8282      if( $return || $indent)
8283        { # check for elements in which spaces should be kept
8284          my $t= $elt->twig;
8285          return $extra_data . $tag if( $xml_space_preserve);
8286          if( $t && $t->{twig_keep_spaces_in})
8287            { foreach my $ancestor ($elt->ancestors)
8288                { return $extra_data . $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) }
8289            }
8290
8291          $prefix= $INDENT x $indent;
8292          if( $extra_data)
8293            { $extra_data=~ s{\s+$}{};
8294              $extra_data=~ s{^\s+}{};
8295              $extra_data= $prefix .  $extra_data . $return;
8296            }
8297        }
8298
8299
8300      return $return . $extra_data . $prefix . $tag;
8301    }
8302
8303  sub end_tag
8304    { my $elt= shift;
8305      return  '' if(    ($elt->{gi}<$XML::Twig::SPECIAL_GI)
8306                     || ($elt->{'empty'} && !$elt->{extra_data_before_end_tag})
8307                   );
8308      my $tag= "<";
8309      my $gi= $XML::Twig::index2gi[$elt->{'gi'}];
8310
8311      if( my $map= $elt->{'att'}->{'#original_gi'}) { $gi= _restore_original_prefix( $map, $gi); }
8312      $gi=~ s{^#default:}{}; # remove default prefix
8313
8314      if( $output_text_filter) { $gi= $output_text_filter->( $XML::Twig::index2gi[$elt->{'gi'}]); }
8315      $tag .=  "/$gi>";
8316
8317      $tag = ($elt->{extra_data_before_end_tag} || '') . $tag;
8318
8319      if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') )) { $tag= ''; }
8320
8321      return $tag unless $pretty;
8322
8323      my $prefix='';
8324      my $return=0;    # 1 if a \n is to be printed before the tag
8325      my $indent=0;    # number of indents before the tag
8326
8327      if( $pretty==$RECORD1)
8328        { $return= 1 if( $elt->level == 0);
8329        }
8330
8331     elsif( $pretty==$RECORD2)
8332        { unless( $elt->contains_text)
8333            { $return= 1 ;
8334              $indent= $elt->level;
8335            }
8336        }
8337
8338      elsif( $pretty==$NICE)
8339        { my $parent= $elt->{parent};
8340          if( (    ($parent && !$parent->{contains_text}) || !$parent )
8341            && ( !$elt->{contains_text}
8342             && ($elt->{has_flushed_child} || $elt->{first_child})
8343           )
8344         )
8345            { $return= 1; }
8346        }
8347
8348      elsif( $KEEP_TEXT_TAG_ON_ONE_LINE{$pretty})
8349        { my $parent= $elt->{parent};
8350          if( (    ($parent && !$parent->{contains_text}) || !$parent )
8351            && ( !$elt->{contains_text}
8352             && ($elt->{has_flushed_child} || $elt->{first_child})
8353           )
8354         )
8355            { $return= 1;
8356              $indent= $elt->level;
8357            }
8358        }
8359
8360      if( $return || $indent)
8361        { # check for elements in which spaces should be kept
8362          my $t= $elt->twig;
8363          return $tag if( $xml_space_preserve);
8364          if( $t && $t->{twig_keep_spaces_in})
8365            { foreach my $ancestor ($elt, $elt->ancestors)
8366                { return $tag if( $t->{twig_keep_spaces_in}->{$XML::Twig::index2gi[$ancestor->{'gi'}]}) }
8367            }
8368
8369          if( $return) { $prefix= ($pretty== $INDENTEDCT) ? "\n$INDENT" : "\n"; }
8370          $prefix.= $INDENT x $indent;
8371    }
8372
8373      # add a \n at the end of the document (after the root element)
8374      $tag .= "\n" unless( $elt->{parent});
8375
8376      return $prefix . $tag;
8377    }
8378
8379  sub _restore_original_prefix
8380    { my( $map, $name)= @_;
8381      my $prefix= _ns_prefix( $name);
8382      if( my $original_prefix= $map->{$prefix})
8383        { if( $original_prefix eq '#default')
8384            { $name=~ s{^$prefix:}{}; }
8385          else
8386            { $name=~ s{^$prefix(?=:)}{$original_prefix}; }
8387        }
8388      return $name;
8389    }
8390
8391  # buffer used to hold the text to print/sprint, to avoid passing it back and forth between methods
8392  my @sprint;
8393
8394  # $elt is an element to print
8395  # $fh is an optional filehandle to print to
8396  # $pretty is an optional value, if true a \n is printed after the < of the
8397  # opening tag
8398  sub print
8399    { my $elt= shift;
8400
8401      my $fh= isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
8402      my $old_select= defined $fh ? select $fh : undef;
8403      print $elt->sprint( @_);
8404      select $old_select if( defined $old_select);
8405    }
8406
8407
8408# those next 2 methods need to be refactored, they are copies of the same methods in XML::Twig
8409sub print_to_file
8410  { my( $elt, $filename)= (shift, shift);
8411    my $out_fh;
8412#    open( $out_fh, ">$filename") or _croak( "cannot create file $filename: $!");     # < perl 5.8
8413    my $mode= $keep_encoding ? '>' : '>:utf8';                                       # >= perl 5.8
8414    open( $out_fh, $mode, $filename) or _croak( "cannot create file $filename: $!"); # >= perl 5.8
8415    $elt->print( $out_fh, @_);
8416    close $out_fh;
8417    return $elt;
8418  }
8419
8420# probably only works on *nix (at least the chmod bit)
8421# first print to a temporary file, then rename that file to the desired file name, then change permissions
8422# to the original file permissions (or to the current umask)
8423sub safe_print_to_file
8424  { my( $elt, $filename)= (shift, shift);
8425    my $perm= -f $filename ? (stat $filename)[2] & 07777 : ~umask() ;
8426    XML::Twig::_use( 'File::Temp') || croak "need File::Temp to use safe_print_to_file\n";
8427    XML::Twig::_use( 'File::Basename') || croak "need File::Basename to use safe_print_to_file\n";
8428    my $tmpdir= File::Basename::dirname( $filename);
8429    my( $fh, $tmpfilename) = File::Temp::tempfile( DIR => $tmpdir);
8430    $elt->print_to_file( $tmpfilename, @_);
8431    rename( $tmpfilename, $filename) or unlink $tmpfilename && _croak( "cannot move temporary file to $filename: $!");
8432    chmod $perm, $filename;
8433    return $elt;
8434  }
8435
8436
8437  # same as print but does not output the start tag if the element
8438  # is marked as flushed
8439  sub flush
8440    { my $elt= shift;
8441      my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt;
8442      $elt->twig->flush_up_to( $up_to, @_);
8443    }
8444  sub purge
8445    { my $elt= shift;
8446      my $up_to= $_[0] && isa( $_[0], 'XML::Twig::Elt') ? shift : $elt;
8447      $elt->twig->purge_up_to( $up_to, @_);
8448    }
8449
8450  sub _flush
8451    { my $elt= shift;
8452
8453      my $pretty;
8454      my $fh=  isa( $_[0], 'GLOB') || isa( $_[0], 'IO::Scalar') ? shift : undef;
8455      my $old_select= defined $fh ? select $fh : undef;
8456      my $old_pretty= defined ($pretty= shift) ? set_pretty_print( $pretty) : undef;
8457
8458      $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
8459
8460      $elt->__flush();
8461
8462      $xml_space_preserve= 0;
8463
8464      select $old_select if( defined $old_select);
8465      set_pretty_print( $old_pretty) if( defined $old_pretty);
8466    }
8467
8468  sub __flush
8469    { my $elt= shift;
8470
8471      if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
8472        { my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve';
8473          $xml_space_preserve++ if $preserve;
8474          unless( $elt->_flushed)
8475            { print $elt->start_tag();
8476            }
8477
8478          # flush the children
8479          my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
8480          foreach my $child (@children)
8481            { $child->_flush( $pretty); }
8482          unless( $elt->{end_tag_flushed}) { print $elt->end_tag; }
8483          $xml_space_preserve-- if $preserve;
8484          # used for pretty printing
8485          if( my $parent= $elt->{parent}) { $parent->{has_flushed_child}= 1; }
8486        }
8487      else # text or special element
8488        { my $text;
8489          if( (exists $elt->{'pcdata'}))     { $text= $elt->pcdata_xml_string;
8490                                     if( my $parent= $elt->{parent})
8491                                       { $parent->{contains_text}= 1; }
8492                                   }
8493          elsif( (exists $elt->{'cdata'}))   { $text= $elt->cdata_string;
8494                                     if( my $parent= $elt->{parent})
8495                                       { $parent->{contains_text}= 1; }
8496                                   }
8497          elsif( (exists $elt->{'target'}))      { $text= $elt->pi_string;          }
8498          elsif( (exists $elt->{'comment'})) { $text= $elt->comment_string;     }
8499          elsif( (exists $elt->{'ent'}))     { $text= $elt->ent_string;         }
8500
8501          print $output_filter ? $output_filter->( $text) : $text;
8502        }
8503    }
8504
8505
8506  sub xml_text
8507    { my( $elt, @options)= @_;
8508
8509      if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->xml_text_only; }
8510
8511      my $string='';
8512
8513      if( ($elt->{gi} >= $XML::Twig::SPECIAL_GI) )
8514        { # sprint the children
8515          my $child= $elt->{first_child} || '';
8516          while( $child)
8517            { $string.= $child->xml_text;
8518            } continue { $child= $child->{next_sibling}; }
8519        }
8520      elsif( (exists $elt->{'pcdata'}))  { $string .= $output_filter ?  $output_filter->($elt->pcdata_xml_string)
8521                                                           : $elt->pcdata_xml_string;
8522                               }
8523      elsif( (exists $elt->{'cdata'}))   { $string .= $output_filter ?  $output_filter->($elt->cdata_string)
8524                                                           : $elt->cdata_string;
8525                               }
8526      elsif( (exists $elt->{'ent'}))     { $string .= $elt->ent_string; }
8527
8528      return $string;
8529    }
8530
8531  sub xml_text_only
8532    { return join '', map { $_->xml_text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; }
8533
8534  # same as print but except... it does not print but rather returns the string
8535  # if the second parameter is set then only the content is returned, not the
8536  # start and end tags of the element (but the tags of the included elements are
8537  # returned)
8538
8539  sub sprint
8540    { my $elt= shift;
8541      my( $old_pretty, $old_empty_tag_style);
8542
8543      if( $_[0] && isa( $_[0], 'HASH'))
8544        { my %args= XML::Twig::_normalize_args( %{shift()});
8545          if( defined $args{PrettyPrint}) { $old_pretty          = set_pretty_print( $args{PrettyPrint});  }
8546           if( defined $args{EmptyTags})  { $old_empty_tag_style = set_empty_tag_style( $args{EmptyTags}); }
8547        }
8548
8549      $xml_space_preserve= 1 if( ($elt->inherit_att( 'xml:space') || '') eq 'preserve');
8550
8551      @sprint=();
8552      $elt->_sprint( @_);
8553      my $sprint= join( '', @sprint);
8554      if( $output_filter) { $sprint= $output_filter->( $sprint); }
8555
8556      if( ( ($pretty== $WRAPPED) || ($pretty==$INDENTEDC)) && !$xml_space_preserve)
8557        { $sprint= _wrap_text( $sprint); }
8558      $xml_space_preserve= 0;
8559
8560
8561      if( defined $old_pretty)          { set_pretty_print( $old_pretty);             }
8562      if( defined $old_empty_tag_style) { set_empty_tag_style( $old_empty_tag_style); }
8563
8564      return $sprint;
8565    }
8566
8567  sub _wrap_text
8568    { my( $string)= @_;
8569      my $wrapped;
8570      foreach my $line (split /\n/, $string)
8571        { my( $initial_indent)= $line=~ m{^(\s*)};
8572          my $wrapped_line= Text::Wrap::wrap(  '',  $initial_indent . $INDENT, $line) . "\n";
8573
8574          # fix glitch with Text::wrap when the first line is long and does not include spaces
8575          # the first line ends up being too short by 2 chars, but we'll have to live with it!
8576          $wrapped_line=~ s{^ +\n  }{}s; # this prefix needs to be removed
8577
8578          $wrapped .= $wrapped_line;
8579        }
8580
8581      return $wrapped;
8582    }
8583
8584
8585  sub _sprint
8586    { my $elt= shift;
8587      my $no_tag= shift || 0;
8588      # in case there's some comments or PI's piggybacking
8589
8590      if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
8591        {
8592          my $preserve= ($elt->{'att'}->{'xml:space'} || '') eq 'preserve';
8593          $xml_space_preserve++ if $preserve;
8594
8595          push @sprint, $elt->start_tag unless( $no_tag);
8596
8597          # sprint the children
8598          my $child= $elt->{first_child};
8599          while( $child)
8600            { $child->_sprint;
8601              $child= $child->{next_sibling};
8602            }
8603          push @sprint, $elt->end_tag unless( $no_tag);
8604          $xml_space_preserve-- if $preserve;
8605        }
8606      else
8607        { push @sprint, $elt->{extra_data} if( $elt->{extra_data}) ;
8608          if(    (exists $elt->{'pcdata'}))  { push @sprint, $elt->pcdata_xml_string; }
8609          elsif( (exists $elt->{'cdata'}))   { push @sprint, $elt->cdata_string;      }
8610          elsif( (exists $elt->{'target'}))      { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; }
8611                                     push @sprint, $elt->pi_string;
8612                                   }
8613          elsif( (exists $elt->{'comment'})) { if( ($pretty >= $INDENTED) && !$elt->{parent}->{contains_text}) { push @sprint, "\n" . $INDENT x $elt->level; }
8614                                     push @sprint, $elt->comment_string;
8615                                   }
8616          elsif( (exists $elt->{'ent'}))     { push @sprint, $elt->ent_string;        }
8617        }
8618
8619      return;
8620    }
8621
8622  # just a shortcut to $elt->sprint( 1)
8623  sub xml_string
8624    { my $elt= shift;
8625      isa( $_[0], 'HASH') ?  $elt->sprint( shift(), 1) : $elt->sprint( 1);
8626    }
8627
8628  sub pcdata_xml_string
8629    { my $elt= shift;
8630      if( defined( my $string= $elt->{pcdata}) )
8631        {
8632          if( ! $elt->{extra_data_in_pcdata})
8633            {
8634              $string=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g unless( !$replaced_ents || $keep_encoding || $elt->{asis});
8635              $string=~ s{\Q]]>}{]]&gt;}g;
8636            }
8637          else
8638            { _gen_mark( $string); # used by _(un)?protect_extra_data
8639              foreach my $data (reverse @{$elt->{extra_data_in_pcdata}})
8640                { my $substr= substr( $string, $data->{offset});
8641                  if( $keep_encoding || $elt->{asis})
8642                    { substr( $string, $data->{offset}, 0, $data->{text}); }
8643                  else
8644                    { substr( $string, $data->{offset}, 0, _protect_extra_data( $data->{text})); }
8645                }
8646              unless( $keep_encoding || $elt->{asis})
8647                {
8648                  $string=~ s{([$replaced_ents])}{$XML::Twig::base_ent{$1}}g ;
8649                  $string=~ s{\Q]]>}{]]&gt;}g;
8650                  _unprotect_extra_data( $string);
8651                }
8652            }
8653          return $output_text_filter ? $output_text_filter->( $string) : $string;
8654        }
8655      else
8656        { return ''; }
8657    }
8658
8659  { my $mark;
8660    my( %char2ent, %ent2char);
8661    BEGIN
8662      { %char2ent= ( '<' => 'lt', '&' => 'amp', '>' => 'gt');
8663        %ent2char= map { $char2ent{$_} => $_ } keys %char2ent;
8664      }
8665
8666    # generate a unique mark (a string) not found in the string,
8667    # used to mark < and & in the extra data
8668    sub _gen_mark
8669      { $mark="AAAA";
8670        $mark++ while( index( $_[0], $mark) > -1);
8671        return $mark;
8672      }
8673
8674    sub _protect_extra_data
8675      { my( $extra_data)= @_;
8676        $extra_data=~ s{([<&>])}{:$mark:$char2ent{$1}:}g;
8677        return $extra_data;
8678      }
8679
8680    sub _unprotect_extra_data
8681      { $_[0]=~ s{:$mark:(\w+):}{$ent2char{$1}}g; }
8682
8683  }
8684
8685  sub cdata_string
8686    { my $cdata= $_[0]->{cdata};
8687      unless( defined $cdata) { return ''; }
8688      if( $remove_cdata)
8689        { $cdata=~ s/([$replaced_ents])/$XML::Twig::base_ent{$1}/g; }
8690      else
8691        { $cdata= $CDATA_START . $cdata . $CDATA_END; }
8692      return $cdata;
8693   }
8694
8695  sub att_xml_string
8696    { my $elt= shift;
8697      my $att= shift;
8698
8699      my $replace= $replaced_ents . "$quote\n\r\t";
8700      if($_[0] && $_[0]->{escape_gt} && ($replace!~ m{>}) ) { $replace .='>'; }
8701
8702      if( defined (my $string= $elt->{att}->{$att}))
8703        { return _att_xml_string( $string, $replace); }
8704      else
8705        { return ''; }
8706    }
8707
8708  # escaped xml string for an attribute value
8709  sub _att_xml_string
8710    { my( $string, $escape)= @_;
8711      if( !defined( $string)) { return ''; }
8712      if( $keep_encoding)
8713        { $string=~ s{$quote}{$XML::Twig::base_ent{$quote}}g;
8714        }
8715      else
8716        {
8717          if( $do_not_escape_amp_in_atts)
8718            { $escape=~ s{^.}{}; # seems like the most backward compatible way to remove & from the list
8719              $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g;
8720              $string=~ s{&(?!(\w+|#\d+|[xX][0-9a-fA-F]+);)}{&amp;}g; # dodgy: escape & that do not start an entity
8721            }
8722          else
8723            { $string=~ s{([$escape])}{$XML::Twig::base_ent{$1}}g;
8724              $string=~ s{\Q]]>}{]]&gt;}g;
8725            }
8726        }
8727
8728      return $output_text_filter ? $output_text_filter->( $string) : $string;
8729    }
8730
8731  sub ent_string
8732    { my $ent= shift;
8733      my $ent_text= $ent->{ent};
8734      my( $t, $el, $ent_string);
8735      if(    $expand_external_entities
8736          && ($t= $ent->twig)
8737          && ($el= $t->entity_list)
8738          && ($ent_string= $el->{entities}->{$ent->ent_name}->{val})
8739        )
8740        { return $ent_string; }
8741       else
8742         { return $ent_text;  }
8743    }
8744
8745  # returns just the text, no tags, for an element
8746  sub text
8747    { my( $elt, @options)= @_;
8748
8749      if( @options && grep { lc( $_) eq 'no_recurse' } @options) { return $elt->text_only; }
8750
8751      my $string;
8752
8753      if( (exists $elt->{'pcdata'}))     { return  $elt->{pcdata};   }
8754      elsif( (exists $elt->{'cdata'}))   { return  $elt->{cdata};    }
8755      elsif( (exists $elt->{'target'}))      { return  $elt->pi_string;}
8756      elsif( (exists $elt->{'comment'})) { return  $elt->{comment};  }
8757      elsif( (exists $elt->{'ent'}))     { return  $elt->{ent} ;     }
8758
8759      my $child= $elt->{first_child} ||'';
8760      while( $child)
8761        {
8762          my $child_text= $child->text;
8763          $string.= defined( $child_text) ? $child_text : '';
8764        } continue { $child= $child->{next_sibling}; }
8765
8766      unless( defined $string) { $string=''; }
8767
8768      return $output_text_filter ? $output_text_filter->( $string) : $string;
8769    }
8770
8771  sub text_only
8772    { return join '', map { $_->text if( $_->is_text || (exists $_->{'ent'})) } $_[0]->_children; }
8773
8774  sub trimmed_text
8775    { my $elt= shift;
8776      my $text= $elt->text( @_);
8777      $text=~ s{\s+}{ }sg;
8778      $text=~ s{^\s*}{};
8779      $text=~ s{\s*$}{};
8780      return $text;
8781    }
8782
8783  sub trim
8784    { my( $elt)= @_;
8785      my $pcdata= $elt->first_descendant( $TEXT);
8786      (my $pcdata_text= $pcdata->text)=~ s{^\s+}{}s;
8787      $pcdata->set_text( $pcdata_text);
8788      $pcdata= $elt->last_descendant( $TEXT);
8789      ($pcdata_text= $pcdata->text)=~ s{\s+$}{};
8790      $pcdata->set_text( $pcdata_text);
8791      foreach my $pcdata ($elt->descendants( $TEXT))
8792        { ($pcdata_text= $pcdata->text)=~ s{\s+}{ }g;
8793          $pcdata->set_text( $pcdata_text);
8794        }
8795      return $elt;
8796    }
8797
8798
8799  # remove cdata sections (turns them into regular pcdata) in an element
8800  sub remove_cdata
8801    { my $elt= shift;
8802      foreach my $cdata ($elt->descendants_or_self( $CDATA))
8803        { if( $keep_encoding)
8804            { my $data= $cdata->{cdata};
8805              $data=~ s{([&<"'])}{$XML::Twig::base_ent{$1}}g;
8806              $cdata->{pcdata}= (delete $cdata->{empty} || 1) &&  $data;
8807            }
8808          else
8809            { $cdata->{pcdata}= (delete $cdata->{empty} || 1) &&  $cdata->{cdata}; }
8810          $cdata->{gi}=$XML::Twig::gi2index{$PCDATA} or $cdata->set_gi( $PCDATA);
8811          undef $cdata->{cdata};
8812        }
8813    }
8814
8815sub _is_private      { return _is_private_name( $_[0]->gi); }
8816sub _is_private_name { return $_[0]=~ m{^#(?!default:)};                }
8817
8818
8819} # end of block containing package globals ($pretty_print, $quotes, keep_encoding...)
8820
8821# merges consecutive #PCDATAs in am element
8822sub normalize
8823  { my( $elt)= @_;
8824    my @descendants= $elt->descendants( $PCDATA);
8825    while( my $desc= shift @descendants)
8826      { if( ! length $desc->{pcdata}) { $desc->delete; next; }
8827        while( @descendants && $desc->{next_sibling} && $desc->{next_sibling}== $descendants[0])
8828          { my $to_merge= shift @descendants;
8829            $desc->merge_text( $to_merge);
8830          }
8831      }
8832    return $elt;
8833  }
8834
8835# SAX export methods
8836sub toSAX1
8837  { _toSAX(@_, \&_start_tag_data_SAX1, \&_end_tag_data_SAX1); }
8838
8839sub toSAX2
8840  { _toSAX(@_, \&_start_tag_data_SAX2, \&_end_tag_data_SAX2); }
8841
8842sub _toSAX
8843  { my( $elt, $handler, $start_tag_data, $end_tag_data)= @_;
8844    if( $elt->{gi} >= $XML::Twig::SPECIAL_GI)
8845      { my $data= $start_tag_data->( $elt);
8846        _start_prefix_mapping( $elt, $handler, $data);
8847        if( $data && (my $start_element = $handler->can( 'start_element')))
8848          { unless( $elt->_flushed) { $start_element->( $handler, $data); } }
8849
8850        foreach my $child ($elt->_children)
8851          { $child->_toSAX( $handler, $start_tag_data, $end_tag_data); }
8852
8853        if( (my $data= $end_tag_data->( $elt)) && (my $end_element = $handler->can( 'end_element')) )
8854          { $end_element->( $handler, $data); }
8855        _end_prefix_mapping( $elt, $handler);
8856      }
8857    else # text or special element
8858      { if( (exists $elt->{'pcdata'}) && (my $characters= $handler->can( 'characters')))
8859          { $characters->( $handler, { Data => $elt->{pcdata} });  }
8860        elsif( (exists $elt->{'cdata'}))
8861          { if( my $start_cdata= $handler->can( 'start_cdata'))
8862              { $start_cdata->( $handler); }
8863            if( my $characters= $handler->can( 'characters'))
8864              { $characters->( $handler, {Data => $elt->{cdata} });  }
8865            if( my $end_cdata= $handler->can( 'end_cdata'))
8866              { $end_cdata->( $handler); }
8867          }
8868        elsif( ((exists $elt->{'target'}))  && (my $pi= $handler->can( 'processing_instruction')))
8869          { $pi->( $handler, { Target =>$elt->{target}, Data => $elt->{data} });  }
8870        elsif( ((exists $elt->{'comment'}))  && (my $comment= $handler->can( 'comment')))
8871          { $comment->( $handler, { Data => $elt->{comment} });  }
8872        elsif( ((exists $elt->{'ent'})))
8873          {
8874            if( my $se=   $handler->can( 'skipped_entity'))
8875              { $se->( $handler, { Name => $elt->ent_name });  }
8876            elsif( my $characters= $handler->can( 'characters'))
8877              { if( defined $elt->ent_string)
8878                  { $characters->( $handler, {Data => $elt->ent_string});  }
8879                else
8880                  { $characters->( $handler, {Data => $elt->ent_name});  }
8881              }
8882          }
8883
8884      }
8885  }
8886
8887sub _start_tag_data_SAX1
8888  { my( $elt)= @_;
8889    my $name= $XML::Twig::index2gi[$elt->{'gi'}];
8890    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
8891    my $attributes={};
8892    my $atts= $elt->{att};
8893    while( my( $att, $value)= each %$atts)
8894      { $attributes->{$att}= $value unless( ( $att=~ m{^#(?!default:)} )); }
8895    my $data= { Name => $name, Attributes => $attributes};
8896    return $data;
8897  }
8898
8899sub _end_tag_data_SAX1
8900  { my( $elt)= @_;
8901    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
8902    return  { Name => $XML::Twig::index2gi[$elt->{'gi'}] };
8903  }
8904
8905sub _start_tag_data_SAX2
8906  { my( $elt)= @_;
8907    my $data={};
8908
8909    my $name= $XML::Twig::index2gi[$elt->{'gi'}];
8910    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
8911    $data->{Name}         = $name;
8912    $data->{Prefix}       = $elt->ns_prefix;
8913    $data->{LocalName}    = $elt->local_name;
8914    $data->{NamespaceURI} = $elt->namespace;
8915
8916    # save a copy of the data so we can re-use it for the end tag
8917    my %sax2_data= %$data;
8918    $elt->{twig_elt_SAX2_data}= \%sax2_data;
8919
8920    # add the attributes
8921    $data->{Attributes}= $elt->_atts_to_SAX2;
8922
8923    return $data;
8924  }
8925
8926sub _atts_to_SAX2
8927  { my $elt= shift;
8928    my $SAX2_atts= {};
8929    foreach my $att (keys %{$elt->{att}})
8930      {
8931        next if( ( $att=~ m{^#(?!default:)} ));
8932        my $SAX2_att={};
8933        $SAX2_att->{Name}         = $att;
8934        $SAX2_att->{Prefix}       = _ns_prefix( $att);
8935        $SAX2_att->{LocalName}    = _local_name( $att);
8936        $SAX2_att->{NamespaceURI} = $elt->namespace( $SAX2_att->{Prefix});
8937        $SAX2_att->{Value}        = $elt->{'att'}->{$att};
8938        my $SAX2_att_name= "{$SAX2_att->{NamespaceURI}}$SAX2_att->{LocalName}";
8939
8940        $SAX2_atts->{$SAX2_att_name}= $SAX2_att;
8941      }
8942    return $SAX2_atts;
8943  }
8944
8945sub _start_prefix_mapping
8946  { my( $elt, $handler, $data)= @_;
8947    if( my $start_prefix_mapping= $handler->can( 'start_prefix_mapping')
8948        and my @new_prefix_mappings= grep { /^\{[^}]*\}xmlns/ || /^\{$XMLNS_URI\}/ } keys %{$data->{Attributes}}
8949      )
8950      { foreach my $prefix (@new_prefix_mappings)
8951          { my $prefix_string= $data->{Attributes}->{$prefix}->{LocalName};
8952            if( $prefix_string eq 'xmlns') { $prefix_string=''; }
8953            my $prefix_data=
8954              {  Prefix       => $prefix_string,
8955                 NamespaceURI => $data->{Attributes}->{$prefix}->{Value}
8956              };
8957            $start_prefix_mapping->( $handler, $prefix_data);
8958            $elt->{twig_end_prefix_mapping} ||= [];
8959            push @{$elt->{twig_end_prefix_mapping}}, $prefix_string;
8960          }
8961      }
8962  }
8963
8964sub _end_prefix_mapping
8965  { my( $elt, $handler)= @_;
8966    if( my $end_prefix_mapping= $handler->can( 'end_prefix_mapping'))
8967      { foreach my $prefix (@{$elt->{twig_end_prefix_mapping}})
8968          { $end_prefix_mapping->( $handler, { Prefix => $prefix} ); }
8969      }
8970  }
8971
8972sub _end_tag_data_SAX2
8973  { my( $elt)= @_;
8974    return if( ( (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 1) eq '#') && (substr( $XML::Twig::index2gi[$elt->{'gi'}], 0, 9) ne '#default:') ));
8975    return $elt->{twig_elt_SAX2_data};
8976  }
8977
8978sub contains_text
8979  { my $elt= shift;
8980    my $child= $elt->{first_child};
8981    while ($child)
8982      { return 1 if( $child->is_text || (exists $child->{'ent'}));
8983        $child= $child->{next_sibling};
8984      }
8985    return 0;
8986  }
8987
8988# creates a single pcdata element containing the text as child of the element
8989# options:
8990#   - force_pcdata: when set to a true value forces the text to be in a #PCDATA
8991#                   even if the original element was a #CDATA
8992sub set_text
8993  { my( $elt, $string, %option)= @_;
8994
8995    if( $XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA)
8996      { return $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $string; }
8997    elsif( $XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA)
8998      { if( $option{force_pcdata})
8999          { $elt->{gi}=$XML::Twig::gi2index{$PCDATA} or $elt->set_gi( $PCDATA);
9000            $elt->_set_cdata('');
9001            return $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $string;
9002          }
9003        else
9004          { return $elt->_set_cdata( $string); }
9005      }
9006    elsif( $elt->contains_a_single( $PCDATA) )
9007      { # optimized so we have a slight chance of not loosing embedded comments and pi's
9008        $elt->{first_child}->set_pcdata( $string);
9009        return $elt;
9010      }
9011
9012    foreach my $child (@{[$elt->_children]})
9013      { $child->delete; }
9014
9015    my $pcdata= $elt->_new_pcdata( $string);
9016    $pcdata->paste( $elt);
9017
9018    $elt->{empty}=0;
9019
9020    return $elt;
9021  }
9022
9023# set the content of an element from a list of strings and elements
9024sub set_content
9025  { my $elt= shift;
9026
9027    return $elt unless defined $_[0];
9028
9029    # attributes can be given as a hash (passed by ref)
9030    if( ref $_[0] eq 'HASH')
9031      { my $atts= shift;
9032        $elt->del_atts; # usually useless but better safe than sorry
9033        $elt->set_atts( $atts);
9034        return $elt unless defined $_[0];
9035      }
9036
9037    # check next argument for #EMPTY
9038    if( !(ref $_[0]) && ($_[0] eq $EMPTY) )
9039      { $elt->{empty}= 1; return $elt; }
9040
9041    # case where we really want to do a set_text, the element is '#PCDATA'
9042    # or contains a single PCDATA and we only want to add text in it
9043    if( ($XML::Twig::index2gi[$elt->{'gi'}] eq $PCDATA || $elt->contains_a_single( $PCDATA))
9044        && (@_ == 1) && !( ref $_[0]))
9045      { $elt->set_text( $_[0]);
9046        return $elt;
9047      }
9048    elsif( ($XML::Twig::index2gi[$elt->{'gi'}] eq $CDATA) && (@_ == 1) && !( ref $_[0]))
9049      { $elt->_set_cdata( $_[0]);
9050        return $elt;
9051      }
9052
9053    # delete the children
9054    foreach my $child (@{[$elt->_children]})
9055      { $child->delete; }
9056
9057    if( @_) { $elt->{empty}=0; }
9058
9059    foreach my $child (@_)
9060      { if( ref( $child) && isa( $child, 'XML::Twig::Elt'))
9061          { # argument is an element
9062            $child->paste( 'last_child', $elt);
9063          }
9064        else
9065          { # argument is a string
9066            if( (my $pcdata= $elt->{last_child}) && $elt->{last_child}->is_pcdata)
9067              { # previous child is also pcdata: just concatenate
9068                $pcdata->{pcdata}= (delete $pcdata->{empty} || 1) &&  $pcdata->{pcdata} . $child
9069              }
9070            else
9071              { # previous child is not a string: create a new pcdata element
9072                $pcdata= $elt->_new_pcdata( $child);
9073                $pcdata->paste( 'last_child', $elt);
9074              }
9075          }
9076      }
9077
9078
9079    return $elt;
9080  }
9081
9082# inserts an element (whose gi is given) as child of the element
9083# all children of the element are now children of the new element
9084# returns the new element
9085sub insert
9086  { my ($elt, @args)= @_;
9087    # first cut the children
9088    my @children= do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; };
9089    foreach my $child (@children)
9090      { $child->cut; }
9091    # insert elements
9092    while( my $gi= shift @args)
9093      { my $new_elt= $elt->new( $gi);
9094        # add attributes if needed
9095        if( defined( $args[0]) && ( isa( $args[0], 'HASH')) )
9096          { $new_elt->set_atts( shift @args); }
9097        # paste the element
9098        $new_elt->paste( $elt);
9099        $elt->{empty}=0;
9100        $elt= $new_elt;
9101      }
9102    # paste back the children
9103    foreach my $child (@children)
9104      { $child->paste( 'last_child', $elt); }
9105    return $elt;
9106  }
9107
9108# insert a new element
9109# $elt->insert_new_element( $opt_position, $gi, $opt_atts_hash, @opt_content);
9110# the element is created with the same syntax as new
9111# position is the same as in paste, first_child by default
9112sub insert_new_elt
9113  { my $elt= shift;
9114    my $position= $_[0];
9115    if(     ($position eq 'before') || ($position eq 'after')
9116         || ($position eq 'first_child') || ($position eq 'last_child'))
9117      { shift; }
9118    else
9119      { $position= 'first_child'; }
9120
9121    my $new_elt= $elt->new( @_);
9122    $new_elt->paste( $position, $elt);
9123
9124    #if( defined $new_elt->{'att'}->{$ID}) { $new_elt->set_id( $new_elt->{'att'}->{$ID}); }
9125
9126    return $new_elt;
9127  }
9128
9129# wraps an element in elements which gi's are given as arguments
9130# $elt->wrap_in( 'td', 'tr', 'table') wraps the element as a single
9131# cell in a table for example
9132# returns the new element
9133sub wrap_in
9134  { my $elt= shift;
9135    while( my $gi = shift @_)
9136      { my $new_elt = $elt->new( $gi);
9137        if( $elt->{twig_current})
9138          { my $t= $elt->twig;
9139            $t->{twig_current}= $new_elt;
9140            delete $elt->{'twig_current'};
9141            $new_elt->{'twig_current'}=1;
9142          }
9143
9144        if( my $parent= $elt->{parent})
9145          { $new_elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $new_elt->{parent});} ;
9146            if( $parent->{first_child} == $elt) { $parent->{first_child}=  $new_elt; }
9147             if( $parent->{last_child} == $elt) {  $parent->{empty}=0; $parent->{last_child}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});} ;  }
9148          }
9149        else
9150          { # wrapping the root
9151            my $twig= $elt->twig;
9152            if( $twig && $twig->root && ($twig->root eq $elt) )
9153              { $twig->set_root( $new_elt);
9154              }
9155          }
9156
9157        if( my $prev_sibling= $elt->{prev_sibling})
9158          { $new_elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $new_elt->{prev_sibling});} ;
9159            $prev_sibling->{next_sibling}=  $new_elt;
9160          }
9161
9162        if( my $next_sibling= $elt->{next_sibling})
9163          { $new_elt->{next_sibling}=  $next_sibling;
9164            $next_sibling->{prev_sibling}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
9165          }
9166        $new_elt->{first_child}=  $elt;
9167         $new_elt->{empty}=0; $new_elt->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $new_elt->{last_child});} ;
9168
9169        $elt->{parent}=$new_elt; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
9170        $elt->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
9171        $elt->{next_sibling}=  undef;
9172
9173        # add the attributes if the next argument is a hash ref
9174        if( defined( $_[0]) && (isa( $_[0], 'HASH')) )
9175          { $new_elt->set_atts( shift @_); }
9176
9177        $elt= $new_elt;
9178      }
9179
9180    return $elt;
9181  }
9182
9183sub replace
9184  { my( $elt, $ref)= @_;
9185
9186    if( $elt->{parent}) { $elt->cut; }
9187
9188    if( my $parent= $ref->{parent})
9189      { $elt->{parent}=$parent; if( $XML::Twig::weakrefs) { weaken( $elt->{parent});} ;
9190        if( $parent->{first_child} == $ref) { $parent->{first_child}=  $elt; }
9191        if( $parent->{last_child} == $ref)  {  $parent->{empty}=0; $parent->{last_child}=$elt; if( $XML::Twig::weakrefs) { weaken( $parent->{last_child});}  ; }
9192      }
9193    elsif( $ref->twig && $ref == $ref->twig->root)
9194      { $ref->twig->set_root( $elt); }
9195
9196    if( my $prev_sibling= $ref->{prev_sibling})
9197      { $elt->{prev_sibling}=$prev_sibling; if( $XML::Twig::weakrefs) { weaken( $elt->{prev_sibling});} ;
9198        $prev_sibling->{next_sibling}=  $elt;
9199      }
9200    if( my $next_sibling= $ref->{next_sibling})
9201      { $elt->{next_sibling}=  $next_sibling;
9202        $next_sibling->{prev_sibling}=$elt; if( $XML::Twig::weakrefs) { weaken( $next_sibling->{prev_sibling});} ;
9203      }
9204
9205    $ref->{parent}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{parent});} ;
9206    $ref->{prev_sibling}=undef; if( $XML::Twig::weakrefs) { weaken( $ref->{prev_sibling});} ;
9207    $ref->{next_sibling}=  undef;
9208    return $ref;
9209  }
9210
9211sub replace_with
9212  { my $ref= shift;
9213    my $elt= shift;
9214    $elt->replace( $ref);
9215    foreach my $new_elt (reverse @_)
9216      { $new_elt->paste( after => $elt); }
9217    return $elt;
9218  }
9219
9220
9221# move an element, same syntax as paste, except the element is first cut
9222sub move
9223  { my $elt= shift;
9224    $elt->cut;
9225    $elt->paste( @_);
9226    return $elt;
9227  }
9228
9229
9230# adds a prefix to an element, creating a pcdata child if needed
9231sub prefix
9232  { my ($elt, $prefix, $option)= @_;
9233    my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
9234    if( (exists $elt->{'pcdata'})
9235        && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
9236      )
9237      { $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $prefix . $elt->{pcdata}; }
9238    elsif( $elt->{first_child} && $elt->{first_child}->is_pcdata
9239        && (   ($asis && $elt->{first_child}->{asis})
9240            || (!$asis && ! $elt->{first_child}->{asis}))
9241         )
9242      {
9243        $elt->{first_child}->set_pcdata( $prefix . $elt->{first_child}->pcdata);
9244      }
9245    else
9246      { my $new_elt= $elt->_new_pcdata( $prefix);
9247        my $pos= (exists $elt->{'pcdata'}) ? 'before' : 'first_child';
9248        $new_elt->paste( $pos => $elt);
9249        if( $asis) { $new_elt->set_asis; }
9250      }
9251    return $elt;
9252  }
9253
9254# adds a suffix to an element, creating a pcdata child if needed
9255sub suffix
9256  { my ($elt, $suffix, $option)= @_;
9257    my $asis= ($option && ($option eq 'asis')) ? 1 : 0;
9258    if( (exists $elt->{'pcdata'})
9259        && (($asis && $elt->{asis}) || (!$asis && ! $elt->{asis}))
9260      )
9261      { $elt->{pcdata}= (delete $elt->{empty} || 1) &&  $elt->{pcdata} . $suffix; }
9262    elsif( $elt->{last_child} && $elt->{last_child}->is_pcdata
9263        && (   ($asis && $elt->{last_child}->{asis})
9264            || (!$asis && ! $elt->{last_child}->{asis}))
9265         )
9266      { $elt->{last_child}->set_pcdata( $elt->{last_child}->pcdata . $suffix); }
9267    else
9268      { my $new_elt= $elt->_new_pcdata( $suffix);
9269        my $pos= (exists $elt->{'pcdata'}) ? 'after' : 'last_child';
9270        $new_elt->paste( $pos => $elt);
9271        if( $asis) { $new_elt->set_asis; }
9272      }
9273    return $elt;
9274  }
9275
9276# create a path to an element ('/root/.../gi)
9277sub path
9278  { my $elt= shift;
9279    my @context= ( $elt, $elt->ancestors);
9280    return "/" . join( "/", reverse map {$_->gi} @context);
9281  }
9282
9283sub xpath
9284  { my $elt= shift;
9285    my $xpath;
9286    foreach my $ancestor (reverse $elt->ancestors_or_self)
9287      { my $gi= $XML::Twig::index2gi[$ancestor->{'gi'}];
9288        $xpath.= "/$gi";
9289        my $index= $ancestor->prev_siblings( $gi) + 1;
9290        unless( ($index == 1) && !$ancestor->next_sibling( $gi))
9291          { $xpath.= "[$index]"; }
9292      }
9293    return $xpath;
9294  }
9295
9296# methods used mainly by wrap_children
9297
9298# return a string with the
9299# for an element <foo><elt att="val">...</elt><elt2/><elt>...</elt></foo>
9300# returns '<elt att="val"><elt2><elt>'
9301sub _stringify_struct
9302  { my( $elt, %opt)= @_;
9303    my $string='';
9304    my $pretty_print= set_pretty_print( 'none');
9305    foreach my $child ($elt->_children)
9306      { $child->add_id; $string .= $child->start_tag( { escape_gt => 1 }) ||''; }
9307    set_pretty_print( $pretty_print);
9308    return $string;
9309  }
9310
9311# wrap a series of elements in a new one
9312sub _wrap_range
9313  { my $elt= shift;
9314    my $gi= shift;
9315    my $atts= isa( $_[0], 'HASH') ? shift : undef;
9316    my $range= shift; # the string with the tags to wrap
9317
9318    my $t= $elt->twig;
9319
9320    # get the tags to wrap
9321    my @to_wrap;
9322    while( $range=~ m{<\w+\s+[^>]*id=("[^"]*"|'[^']*')[^>]*>}g)
9323      { push @to_wrap, $t->elt_id( substr( $1, 1, -1)); }
9324
9325    return '' unless @to_wrap;
9326
9327    my $to_wrap= shift @to_wrap;
9328    my %atts= %$atts;
9329    my $new_elt= $to_wrap->wrap_in( $gi, \%atts);
9330    $_->move( last_child => $new_elt) foreach (@to_wrap);
9331
9332    return '';
9333  }
9334
9335# wrap children matching a regexp in a new element
9336sub wrap_children
9337  { my( $elt, $regexp, $gi, $atts)= @_;
9338
9339    $atts ||={};
9340
9341    my $elt_as_string= $elt->_stringify_struct; # stringify the elt structure
9342    $regexp=~ s{(<[^>]*>)}{_match_expr( $1)}eg; # in the regexp, replace gi's by the proper regexp
9343    $elt_as_string=~ s{($regexp)}{$elt->_wrap_range( $gi, $atts, $1)}eg; # then do the actual replace
9344
9345    return $elt;
9346  }
9347
9348sub _match_expr
9349  { my $tag= shift;
9350    my( $gi, %atts)= XML::Twig::_parse_start_tag( $tag);
9351    return _match_tag( $gi, %atts);
9352  }
9353
9354
9355sub _match_tag
9356  { my( $elt, %atts)= @_;
9357    my $string= "<$elt\\b";
9358    foreach my $key (sort keys %atts)
9359      { my $val= qq{\Q$atts{$key}\E};
9360        $string.= qq{[^>]*$key=(?:"$val"|'$val')};
9361      }
9362    $string.=  qq{[^>]*>};
9363    return "(?:$string)";
9364  }
9365
9366sub field_to_att
9367  { my( $elt, $cond, $att)= @_;
9368    $att ||= $cond;
9369    my $child= $elt->first_child( $cond) or return undef;
9370    $elt->set_att( $att => $child->text);
9371    $child->cut;
9372    return $elt;
9373  }
9374
9375sub att_to_field
9376  { my( $elt, $att, $tag)= @_;
9377    $tag ||= $att;
9378    my $child= $elt->insert_new_elt( first_child => $tag, $elt->{'att'}->{$att});
9379    $elt->del_att( $att);
9380    return $elt;
9381  }
9382
9383# sort children methods
9384
9385sub sort_children_on_field
9386  { my $elt   = shift;
9387    my $field = shift;
9388    my $get_key= sub { return $_[0]->field( $field) };
9389    return $elt->sort_children( $get_key, @_);
9390  }
9391
9392sub sort_children_on_att
9393  { my $elt = shift;
9394    my $att = shift;
9395    my $get_key= sub { return $_[0]->{'att'}->{$att} };
9396    return $elt->sort_children( $get_key, @_);
9397  }
9398
9399sub sort_children_on_value
9400  { my $elt   = shift;
9401    #my $get_key= eval qq{ sub { $NO_WARNINGS; return \$_[0]->text } };
9402    my $get_key= \&text;
9403    return $elt->sort_children( $get_key, @_);
9404  }
9405
9406sub sort_children
9407  { my( $elt, $get_key, %opt)=@_;
9408    $opt{order} ||= 'normal';
9409    $opt{type}  ||= 'alpha';
9410    my( $par_a, $par_b)= ($opt{order} eq 'reverse') ? qw( b a) : qw ( a b) ;
9411    my $op= ($opt{type} eq 'numeric') ? '<=>' : 'cmp' ;
9412    my @children= $elt->cut_children;
9413    if( $opt{type} eq 'numeric')
9414      {  @children= map  { $_->[1] }
9415                    sort { $a->[0] <=> $b->[0] }
9416                    map  { [ $get_key->( $_), $_] } @children;
9417      }
9418    elsif( $opt{type} eq 'alpha')
9419      {  @children= map  { $_->[1] }
9420                    sort { $a->[0] cmp $b->[0] }
9421                    map  { [ $get_key->( $_), $_] } @children;
9422      }
9423    else
9424      { croak "wrong sort type '$opt{type}', should be either 'alpha' or 'numeric'"; }
9425
9426    @children= reverse @children if( $opt{order} eq 'reverse');
9427    $elt->set_content( @children);
9428  }
9429
9430
9431# comparison methods
9432
9433sub before
9434  { my( $a, $b)=@_;
9435    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
9436  }
9437
9438sub after
9439  { my( $a, $b)=@_;
9440    if( $a->cmp( $b) == 1) { return 1; } else { return 0; }
9441  }
9442
9443sub lt
9444  { my( $a, $b)=@_;
9445    return 1 if( $a->cmp( $b) == -1);
9446    return 0;
9447  }
9448
9449sub le
9450  { my( $a, $b)=@_;
9451    return 1 unless( $a->cmp( $b) == 1);
9452    return 0;
9453  }
9454
9455sub gt
9456  { my( $a, $b)=@_;
9457    return 1 if( $a->cmp( $b) == 1);
9458    return 0;
9459  }
9460
9461sub ge
9462  { my( $a, $b)=@_;
9463    return 1 unless( $a->cmp( $b) == -1);
9464    return 0;
9465  }
9466
9467
9468sub cmp
9469  { my( $a, $b)=@_;
9470
9471    # easy cases
9472    return  0 if( $a == $b);
9473    return  1 if( $a->in($b)); # a in b => a starts after b
9474    return -1 if( $b->in($a)); # b in a => a starts before b
9475
9476    # ancestors does not include the element itself
9477    my @a_pile= ($a, $a->ancestors);
9478    my @b_pile= ($b, $b->ancestors);
9479
9480    # the 2 elements are not in the same twig
9481    return undef unless( $a_pile[-1] == $b_pile[-1]);
9482
9483    # find the first non common ancestors (they are siblings)
9484    my $a_anc= pop @a_pile;
9485    my $b_anc= pop @b_pile;
9486
9487    while( $a_anc == $b_anc)
9488      { $a_anc= pop @a_pile;
9489        $b_anc= pop @b_pile;
9490      }
9491
9492    # from there move left and right and figure out the order
9493    my( $a_prev, $a_next, $b_prev, $b_next)= ($a_anc, $a_anc, $b_anc, $b_anc);
9494    while()
9495      { $a_prev= $a_prev->{prev_sibling} || return( -1);
9496        return 1 if( $a_prev == $b_next);
9497        $a_next= $a_next->{next_sibling} || return( 1);
9498        return -1 if( $a_next == $b_prev);
9499        $b_prev= $b_prev->{prev_sibling} || return( 1);
9500        return -1 if( $b_prev == $a_next);
9501        $b_next= $b_next->{next_sibling} || return( -1);
9502        return 1 if( $b_next == $a_prev);
9503      }
9504  }
9505
9506sub _dump
9507  { my( $elt, $option)= @_;
9508
9509    my $atts       = defined $option->{atts}       ? $option->{atts}       :  1;
9510    my $extra      = defined $option->{extra}      ? $option->{extra}      :  0;
9511    my $short_text = defined $option->{short_text} ? $option->{short_text} : 40;
9512
9513    my $sp= '| ';
9514    my $indent= $sp x $elt->level;
9515    my $indent_sp= '  ' x $elt->level;
9516
9517    my $dump='';
9518    if( $elt->is_elt)
9519      {
9520        $dump .= $indent  . '|-' . $XML::Twig::index2gi[$elt->{'gi'}];
9521
9522        if( $atts && (my @atts= $elt->att_names) )
9523          { $dump .= ' ' . join( ' ', map { qq{$_="} . $elt->{'att'}->{$_} . qq{"} } @atts); }
9524
9525        $dump .= "\n";
9526        if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); }
9527        $dump .= join( "", map { $_->_dump( $option) } do { my $elt= $elt; my @children=(); my $child= $elt->{first_child}; while( $child) { push @children, $child; $child= $child->{next_sibling}; } @children; });
9528      }
9529    else
9530      {
9531        if( (exists $elt->{'pcdata'}))
9532          { $dump .= "$indent|-PCDATA:  '"  . _short_text( $elt->{pcdata}, $short_text) . "'\n" }
9533        elsif( (exists $elt->{'ent'}))
9534          { $dump .= "$indent|-ENTITY:  '" . _short_text( $elt->{ent}, $short_text) . "'\n" }
9535        elsif( (exists $elt->{'cdata'}))
9536          { $dump .= "$indent|-CDATA:   '" . _short_text( $elt->{cdata}, $short_text) . "'\n" }
9537        elsif( (exists $elt->{'comment'}))
9538          { $dump .= "$indent|-COMMENT: '" . _short_text( $elt->comment_string, $short_text) . "'\n" }
9539        elsif( (exists $elt->{'target'}))
9540          { $dump .= "$indent|-PI:      '"      . $elt->{target} . "' - '" . _short_text( $elt->{data}, $short_text) . "'\n" }
9541        if( $extra) { $dump .= $elt->_dump_extra_data( $indent, $indent_sp, $short_text); }
9542      }
9543    return $dump;
9544  }
9545
9546sub _dump_extra_data
9547  { my( $elt, $indent, $indent_sp, $short_text)= @_;
9548    my $dump='';
9549    if( $elt->extra_data)
9550      { my $extra_data = $indent . "|-- (cpi before) '" . _short_text( $elt->extra_data, $short_text) . "'";
9551        $extra_data=~ s{\n}{$indent_sp}g;
9552        $dump .= $extra_data . "\n";
9553      }
9554    if( $elt->{extra_data_in_pcdata})
9555      { foreach my $data ( @{$elt->{extra_data_in_pcdata}})
9556          { my $extra_data = $indent . "|-- (cpi offset $data->{offset}) '" . _short_text( $data->{text}, $short_text) . "'";
9557            $extra_data=~ s{\n}{$indent_sp}g;
9558            $dump .= $extra_data . "\n";
9559          }
9560      }
9561    if( $elt->{extra_data_before_end_tag})
9562      { my $extra_data = $indent . "|-- (cpi end) '" . _short_text( $elt->{extra_data_before_end_tag}, $short_text) . "'";
9563        $extra_data=~ s{\n}{$indent_sp}g;
9564        $dump .= $extra_data . "\n";
9565      }
9566    return $dump;
9567  }
9568
9569
9570sub _short_text
9571  { my( $string, $length)= @_;
9572    if( !$length || (length( $string) < $length) ) { return $string; }
9573    my $l1= (length( $string) -5) /2;
9574    my $l2= length( $string) - ($l1 + 5);
9575    return substr( $string, 0, $l1) . ' ... ' . substr( $string, -$l2);
9576  }
9577
9578
9579sub _and { return _join_defined( ' && ',  @_); }
9580sub _join_defined { return join( shift(), grep { $_ } @_); }
9581
95821;
9583__END__
9584
9585=head1 NAME
9586
9587XML::Twig - A perl module for processing huge XML documents in tree mode.
9588
9589=head1 SYNOPSIS
9590
9591Note that this documentation is intended as a reference to the module.
9592
9593Complete docs, including a tutorial, examples, an easier to use HTML version,
9594a quick reference card and a FAQ are available at L<http://www.xmltwig.org/xmltwig>
9595
9596Small documents (loaded in memory as a tree):
9597
9598  my $twig=XML::Twig->new();    # create the twig
9599  $twig->parsefile( 'doc.xml'); # build it
9600  my_process( $twig);           # use twig methods to process it
9601  $twig->print;                 # output the twig
9602
9603Huge documents (processed in combined stream/tree mode):
9604
9605  # at most one div will be loaded in memory
9606  my $twig=XML::Twig->new(
9607    twig_handlers =>
9608      { title   => sub { $_->set_tag( 'h2') }, # change title tags to h2
9609        para    => sub { $_->set_tag( 'p')  }, # change para to p
9610        hidden  => sub { $_->delete;       },  # remove hidden elements
9611        list    => \&my_list_process,          # process list elements
9612        div     => sub { $_[0]->flush;     },  # output and free memory
9613      },
9614    pretty_print => 'indented',                # output will be nicely formatted
9615    empty_tags   => 'html',                    # outputs <empty_tag />
9616                         );
9617  $twig->parsefile( 'my_big.xml');
9618
9619  sub my_list_process
9620    { my( $twig, $list)= @_;
9621      # ...
9622    }
9623
9624See L<XML::Twig 101|/XML::Twig 101> for other ways to use the module, as a
9625filter for example.
9626
9627=encoding utf8
9628
9629=head1 DESCRIPTION
9630
9631This module provides a way to process XML documents. It is build on top
9632of C<XML::Parser>.
9633
9634The module offers a tree interface to the document, while allowing you
9635to output the parts of it that have been completely processed.
9636
9637It allows minimal resource (CPU and memory) usage by building the tree
9638only for the parts of the documents that need actual processing, through the
9639use of the C<L<twig_roots> > and
9640C<L<twig_print_outside_roots> > options. The
9641C<L<finish> > and C<L<finish_print> > methods also help
9642to increase performances.
9643
9644XML::Twig tries to make simple things easy so it tries its best to takes care
9645of a lot of the (usually) annoying (but sometimes necessary) features that
9646come with XML and XML::Parser.
9647
9648=head1 TOOLS
9649
9650XML::Twig comes with a few command-line utilities:
9651
9652=head2 xml_pp - xml pretty-printer
9653
9654XML pretty printer using XML::Twig
9655
9656=head2 xml_grep - grep XML files looking for specific elements
9657
9658C<xml_grep> does a grep on XML files. Instead of using regular expressions
9659it uses XPath expressions (in fact the subset of XPath supported by
9660XML::Twig).
9661
9662=head2 xml_split - cut a big XML file into smaller chunks
9663
9664C<xml_split> takes a (presumably big) XML file and split it in several smaller
9665files, based on various criteria (level in the tree, size or an XPath
9666expression)
9667
9668=head2 xml_merge - merge back XML files split with xml_split
9669
9670C<xml_merge> takes several xml files that have been split using C<xml_split>
9671and recreates a single file.
9672
9673=head2 xml_spellcheck - spellcheck XML files
9674
9675C<xml_spellcheck> lets you spell check the content of an XML file. It extracts
9676the text (the content of elements and optionally of attributes), call a spell
9677checker on it and then recreates the XML document.
9678
9679
9680=head1 XML::Twig 101
9681
9682XML::Twig can be used either on "small" XML documents (that fit in memory)
9683or on huge ones, by processing parts of the document and outputting or
9684discarding them once they are processed.
9685
9686
9687=head2 Loading an XML document and processing it
9688
9689  my $t= XML::Twig->new();
9690  $t->parse( '<d><title>title</title><para>p 1</para><para>p 2</para></d>');
9691  my $root= $t->root;
9692  $root->set_tag( 'html');              # change doc to html
9693  $title= $root->first_child( 'title'); # get the title
9694  $title->set_tag( 'h1');               # turn it into h1
9695  my @para= $root->children( 'para');   # get the para children
9696  foreach my $para (@para)
9697    { $para->set_tag( 'p'); }           # turn them into p
9698  $t->print;                            # output the document
9699
9700Other useful methods include:
9701
9702L<att>: C<< $elt->{'att'}->{'foo'} >> return the C<foo> attribute for an
9703element,
9704
9705L<set_att> : C<< $elt->set_att( foo => "bar") >> sets the C<foo>
9706attribute to the C<bar> value,
9707
9708L<next_sibling>: C<< $elt->{next_sibling} >> return the next sibling
9709in the document (in the example C<< $title->{next_sibling} >> is the first
9710C<para>, you can also (and actually should) use
9711C<< $elt->next_sibling( 'para') >> to get it
9712
9713The document can also be transformed through the use of the L<cut>,
9714L<copy>, L<paste> and L<move> methods:
9715C<< $title->cut; $title->paste( after => $p); >> for example
9716
9717And much, much more, see L<XML::Twig::Elt|/XML::Twig::Elt>.
9718
9719=head2 Processing an XML document chunk by chunk
9720
9721One of the strengths of XML::Twig is that it let you work with files that do
9722not fit in memory (BTW storing an XML document in memory as a tree is quite
9723memory-expensive, the expansion factor being often around 10).
9724
9725To do this you can define handlers, that will be called once a specific
9726element has been completely parsed. In these handlers you can access the
9727element and process it as you see fit, using the navigation and the
9728cut-n-paste methods, plus lots of convenient ones like C<L<prefix> >.
9729Once the element is completely processed you can then C<L<flush> > it,
9730which will output it and free the memory. You can also C<L<purge> > it
9731if you don't need to output it (if you are just extracting some data from
9732the document for example). The handler will be called again once the next
9733relevant element has been parsed.
9734
9735  my $t= XML::Twig->new( twig_handlers =>
9736                          { section => \&section,
9737                            para   => sub { $_->set_tag( 'p'); }
9738                          },
9739                       );
9740  $t->parsefile( 'doc.xml');
9741
9742  # the handler is called once a section is completely parsed, ie when
9743  # the end tag for section is found, it receives the twig itself and
9744  # the element (including all its sub-elements) as arguments
9745  sub section
9746    { my( $t, $section)= @_;      # arguments for all twig_handlers
9747      $section->set_tag( 'div');  # change the tag name.4, my favourite method...
9748      # let's use the attribute nb as a prefix to the title
9749      my $title= $section->first_child( 'title'); # find the title
9750      my $nb= $title->{'att'}->{'nb'}; # get the attribute
9751      $title->prefix( "$nb - ");  # easy isn't it?
9752      $section->flush;            # outputs the section and frees memory
9753    }
9754
9755
9756There is of course more to it: you can trigger handlers on more elaborate
9757conditions than just the name of the element, C<section/title> for example.
9758
9759  my $t= XML::Twig->new( twig_handlers =>
9760                           { 'section/title' => sub { $_->print } }
9761                       )
9762                  ->parsefile( 'doc.xml');
9763
9764Here C<< sub { $_->print } >> simply prints the current element (C<$_> is aliased
9765to the element in the handler).
9766
9767You can also trigger a handler on a test on an attribute:
9768
9769  my $t= XML::Twig->new( twig_handlers =>
9770                      { 'section[@level="1"]' => sub { $_->print } }
9771                       );
9772                  ->parsefile( 'doc.xml');
9773
9774You can also use C<L<start_tag_handlers> > to process an
9775element as soon as the start tag is found. Besides C<L<prefix> > you
9776can also use C<L<suffix> >,
9777
9778=head2 Processing just parts of an XML document
9779
9780The twig_roots mode builds only the required sub-trees from the document
9781Anything outside of the twig roots will just be ignored:
9782
9783  my $t= XML::Twig->new(
9784       # the twig will include just the root and selected titles
9785           twig_roots   => { 'section/title' => \&print_n_purge,
9786                             'annex/title'   => \&print_n_purge
9787           }
9788                      );
9789  $t->parsefile( 'doc.xml');
9790
9791  sub print_n_purge
9792    { my( $t, $elt)= @_;
9793      print $elt->text;    # print the text (including sub-element texts)
9794      $t->purge;           # frees the memory
9795    }
9796
9797You can use that mode when you want to process parts of a documents but are
9798not interested in the rest and you don't want to pay the price, either in
9799time or memory, to build the tree for the it.
9800
9801
9802=head2 Building an XML filter
9803
9804You can combine the C<twig_roots> and the C<twig_print_outside_roots> options to
9805build filters, which let you modify selected elements and will output the rest
9806of the document as is.
9807
9808This would convert prices in $ to prices in Euro in a document:
9809
9810  my $t= XML::Twig->new(
9811           twig_roots   => { 'price' => \&convert, },   # process prices
9812           twig_print_outside_roots => 1,               # print the rest
9813                      );
9814  $t->parsefile( 'doc.xml');
9815
9816  sub convert
9817    { my( $t, $price)= @_;
9818      my $currency=  $price->{'att'}->{'currency'};          # get the currency
9819      if( $currency eq 'USD')
9820        { $usd_price= $price->text;                     # get the price
9821          # %rate is just a conversion table
9822          my $euro_price= $usd_price * $rate{usd2euro};
9823          $price->set_text( $euro_price);               # set the new price
9824          $price->set_att( currency => 'EUR');          # don't forget this!
9825        }
9826      $price->print;                                    # output the price
9827    }
9828
9829=head2 XML::Twig and various versions of Perl, XML::Parser and expat:
9830
9831XML::Twig is a lot more sensitive to variations in versions of perl,
9832XML::Parser and expat than to the OS, so this should cover some
9833reasonable configurations.
9834
9835The "recommended configuration" is perl 5.8.3+ (for good Unicode
9836support), XML::Parser 2.31+ and expat 1.95.5+
9837
9838See L<http://testers.cpan.org/search?request=dist&dist=XML-Twig> for the
9839CPAN testers reports on XML::Twig, which list all tested configurations.
9840
9841An Atom feed of the CPAN Testers results is available at
9842L<http://xmltwig.org/rss/twig_testers.rss>
9843
9844Finally:
9845
9846=over 4
9847
9848=item XML::Twig does B<NOT> work with expat 1.95.4
9849
9850=item  XML::Twig only works with XML::Parser 2.27 in perl 5.6.*
9851
9852Note that I can't compile XML::Parser 2.27 anymore, so I can't guarantee
9853that it still works
9854
9855=item XML::Parser 2.28 does not really work
9856
9857=back
9858
9859When in doubt, upgrade expat, XML::Parser and Scalar::Util
9860
9861Finally, for some optional features, XML::Twig depends on some additional
9862modules. The complete list, which depends somewhat on the version of Perl
9863that you are running, is given by running C<t/zz_dump_config.t>
9864
9865=head1 Simplifying XML processing
9866
9867=over 4
9868
9869=item Whitespaces
9870
9871Whitespaces that look non-significant are discarded, this behaviour can be
9872controlled using the C<L<keep_spaces> >,
9873C<L<keep_spaces_in> > and
9874C<L<discard_spaces_in> > options.
9875
9876=item Encoding
9877
9878You can specify that you want the output in the same encoding as the input
9879(provided you have valid XML, which means you have to specify the encoding
9880either in the document or when you create the Twig object) using the
9881C<L<keep_encoding> > option
9882
9883You can also use C<L<output_encoding>> to convert the internal UTF-8 format
9884to the required encoding.
9885
9886=item Comments and Processing Instructions (PI)
9887
9888Comments and PI's can be hidden from the processing, but still appear in the
9889output (they are carried by the "real" element closer to them)
9890
9891=item Pretty Printing
9892
9893XML::Twig can output the document pretty printed so it is easier to read for
9894us humans.
9895
9896=item Surviving an untimely death
9897
9898XML parsers are supposed to react violently when fed improper XML.
9899XML::Parser just dies.
9900
9901XML::Twig provides the C<L<safe_parse> > and the
9902C<L<safe_parsefile> > methods which wrap the parse in an eval
9903and return either the parsed twig or 0 in case of failure.
9904
9905=item Private attributes
9906
9907Attributes with a name starting with # (illegal in XML) will not be
9908output, so you can safely use them to store temporary values during
9909processing. Note that you can store anything in a private attribute,
9910not just text, it's just a regular Perl variable, so a reference to
9911an object or a huge data structure is perfectly fine.
9912
9913=back
9914
9915=head1 CLASSES
9916
9917XML::Twig uses a very limited number of classes. The ones you are most likely to use
9918are C<L<XML::Twig>> of course, which represents a complete XML document, including the
9919document itself (the root of the document itself is C<L<root>>), its handlers, its
9920input or output filters... The other main class is C<L<XML::Twig::Elt>>, which models
9921an XML element. Element here has a very wide definition: it can be a regular element, or
9922but also text, with an element C<L<tag>> of C<#PCDATA> (or C<#CDATA>), an entity (tag is
9923C<#ENT>), a Processing Instruction (C<#PI>), a comment (C<#COMMENT>).
9924
9925Those are the 2 commonly used classes.
9926
9927You might want to look the C<L<elt_class>> option if you want to subclass C<XML::Twig::Elt>.
9928
9929Attributes are just attached to their parent element, they are not objects per se. (Please
9930use the provided methods C<L<att>> and C<L<set_att>> to access them, if you access them
9931as a hash, then your code becomes implementation dependent and might break in the future).
9932
9933Other classes that are seldom used are C<L<XML::Twig::Entity_list>> and C<L<XML::Twig::Entity>>.
9934
9935If you use C<L<XML::Twig::XPath>> instead of C<XML::Twig>, elements are then created as
9936C<L<XML::Twig::XPath::Elt>>
9937
9938
9939=head1 METHODS
9940
9941=head2 XML::Twig
9942
9943A twig is a subclass of XML::Parser, so all XML::Parser methods can be
9944called on a twig object, including parse and parsefile.
9945C<setHandlers> on the other hand cannot be used, see C<L<BUGS> >
9946
9947
9948=over 4
9949
9950=item new
9951
9952This is a class method, the constructor for XML::Twig. Options are passed
9953as keyword value pairs. Recognized options are the same as XML::Parser,
9954plus some (in fact a lot!) XML::Twig specifics.
9955
9956New Options:
9957
9958=over 4
9959
9960=item twig_handlers
9961
9962This argument consists of a hash C<{ expression => \&handler}> where
9963expression is a an I<XPath-like expression> (+ some others).
9964
9965XPath expressions are limited to using the child and descendant axis
9966(indeed you can't specify an axis), and predicates cannot be nested.
9967You can use the C<string>, or C<< string(<tag>) >> function (except
9968in C<twig_roots> triggers).
9969
9970Additionally you can use regexps (/ delimited) to match attribute
9971and string values.
9972
9973Examples:
9974
9975  foo
9976  foo/bar
9977  foo//bar
9978  /foo/bar
9979  /foo//bar
9980  /foo/bar[@att1 = "val1" and @att2 = "val2"]/baz[@a >= 1]
9981  foo[string()=~ /^duh!+/]
9982  /foo[string(bar)=~ /\d+/]/baz[@att != 3]
9983
9984#CDATA can be used to call a handler for a CDATA section.
9985#COMMENT can be used to call a handler for comments
9986
9987Some additional (non-XPath) expressions are also provided for convenience:
9988
9989=over 4
9990
9991=item processing instructions
9992
9993C<'?'> or C<'#PI'> triggers the handler for any processing instruction,
9994and C<< '?<target>' >> or C<< '#PI <target>' >> triggers a handler for processing
9995instruction with the given target( ex: C<'#PI xml-stylesheet'>).
9996
9997=item level(<level>)
9998
9999Triggers the handler on any element at that level in the tree (root is level 1)
10000
10001=item _all_
10002
10003Triggers the handler for B<all> elements in the tree
10004
10005=item _default_
10006
10007Triggers the handler for each element that does NOT have any other handler.
10008
10009=back
10010
10011Expressions are evaluated against the input document.
10012Which means that even if you have changed the tag of an element (changing the
10013tag of a parent element from a handler for example) the change will not impact
10014the expression evaluation. There is an exception to this: "private" attributes
10015(which name start with a '#', and can only be created during the parsing, as
10016they are not valid XML) are checked against the current twig.
10017
10018Handlers are triggered in fixed order, sorted by their type (xpath expressions
10019first, then regexps, then level), then by whether they specify a full path
10020(starting at the root element) or
10021not, then by number of steps in the expression , then number of
10022predicates, then number of tests in predicates. Handlers where the last
10023step does not specify a step (C<foo/bar/*>) are triggered after other XPath
10024handlers. Finally C<_all_> handlers are triggered last.
10025
10026B<Important>: once a handler has been triggered if it returns 0 then no other
10027handler is called, except a C<_all_> handler which will be called anyway.
10028
10029If a handler returns a true value and other handlers apply, then the next
10030applicable handler will be called. Repeat, rinse, lather..; The exception
10031to that rule is when the C<L<do_not_chain_handlers>>
10032option is set, in which case only the first handler will be called.
10033
10034Note that it might be a good idea to explicitly return a short true value
10035(like 1) from handlers: this ensures that other applicable handlers are
10036called even if the last statement for the handler happens to evaluate to
10037false. This might also speedup the code by avoiding the result of the last
10038statement of the code to be copied and passed to the code managing handlers.
10039It can really pay to have 1 instead of a long string returned.
10040
10041When the closing tag for an element is parsed the corresponding handler is
10042called, with 2 arguments: the twig and the C<L<Element> >. The twig includes
10043the document tree that has been built so far, the element is the complete
10044sub-tree for the element. The fact that the handler is called only when the
10045closing tag for the element is found means that handlers for inner elements
10046are called before handlers for outer elements.
10047
10048C<$_> is also set to the element, so it is easy to write inline handlers like
10049
10050  para => sub { $_->set_tag( 'p'); }
10051
10052Text is stored in elements whose tag name is #PCDATA (due to mixed content,
10053text and sub-element in an element there is no way to store the text as just
10054an attribute of the enclosing element).
10055
10056B<Warning>: if you have used purge or flush on the twig the element might not
10057be complete, some of its children might have been entirely flushed or purged,
10058and the start tag might even have been printed (by C<flush>) already, so changing
10059its tag might not give the expected result.
10060
10061
10062=item twig_roots
10063
10064This argument let's you build the tree only for those elements you are
10065interested in.
10066
10067  Example: my $t= XML::Twig->new( twig_roots => { title => 1, subtitle => 1});
10068           $t->parsefile( file);
10069           my $t= XML::Twig->new( twig_roots => { 'section/title' => 1});
10070           $t->parsefile( file);
10071
10072
10073return a twig containing a document including only C<title> and C<subtitle>
10074elements, as children of the root element.
10075
10076You can use I<generic_attribute_condition>, I<attribute_condition>,
10077I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_> and
10078I<_all_> to trigger the building of the twig.
10079I<string_condition> and I<regexp_condition> cannot be used as the content
10080of the element, and the string, have not yet been parsed when the condition
10081is checked.
10082
10083B<WARNING>: path are checked for the document. Even if the C<twig_roots> option
10084is used they will be checked against the full document tree, not the virtual
10085tree created by XML::Twig
10086
10087
10088B<WARNING>: twig_roots elements should NOT be nested, that would hopelessly
10089confuse XML::Twig ;--(
10090
10091Note: you can set handlers (twig_handlers) using twig_roots
10092  Example: my $t= XML::Twig->new( twig_roots =>
10093                                   { title    => sub { $_[1]->print;},
10094                                     subtitle => \&process_subtitle
10095                                   }
10096                               );
10097           $t->parsefile( file);
10098
10099
10100=item twig_print_outside_roots
10101
10102To be used in conjunction with the C<twig_roots> argument. When set to a true
10103value this will print the document outside of the C<twig_roots> elements.
10104
10105 Example: my $t= XML::Twig->new( twig_roots => { title => \&number_title },
10106                                twig_print_outside_roots => 1,
10107                               );
10108           $t->parsefile( file);
10109           { my $nb;
10110           sub number_title
10111             { my( $twig, $title);
10112               $nb++;
10113               $title->prefix( "$nb ");
10114               $title->print;
10115             }
10116           }
10117
10118
10119This example prints the document outside of the title element, calls
10120C<number_title> for each C<title> element, prints it, and then resumes printing
10121the document. The twig is built only for the C<title> elements.
10122
10123If the value is a reference to a file handle then the document outside the
10124C<twig_roots> elements will be output to this file handle:
10125
10126  open( my $out, '>', 'out_file.xml') or die "cannot open out file.xml out_file:$!";
10127  my $t= XML::Twig->new( twig_roots => { title => \&number_title },
10128                         # default output to $out
10129                         twig_print_outside_roots => $out,
10130                       );
10131
10132         { my $nb;
10133           sub number_title
10134             { my( $twig, $title);
10135               $nb++;
10136               $title->prefix( "$nb ");
10137               $title->print( $out);    # you have to print to \*OUT here
10138             }
10139           }
10140
10141
10142=item start_tag_handlers
10143
10144A hash C<{ expression => \&handler}>. Sets element handlers that are called when
10145the element is open (at the end of the XML::Parser C<Start> handler). The handlers
10146are called with 2 params: the twig and the element. The element is empty at
10147that point, its attributes are created though.
10148
10149You can use I<generic_attribute_condition>, I<attribute_condition>,
10150I<full_path>, I<partial_path>, I<tag>, I<tag_regexp>, I<_default_>  and I<_all_>
10151to trigger the handler.
10152
10153I<string_condition> and I<regexp_condition> cannot be used as the content of
10154the element, and the string, have not yet been parsed when the condition is
10155checked.
10156
10157The main uses for those handlers are to change the tag name (you might have to
10158do it as soon as you find the open tag if you plan to C<flush> the twig at some
10159point in the element, and to create temporary attributes that will be used
10160when processing sub-element with C<twig_hanlders>.
10161
10162You should also use it to change tags if you use C<flush>. If you change the tag
10163in a regular C<twig_handler> then the start tag might already have been flushed.
10164
10165B<Note>: C<start_tag> handlers can be called outside of C<twig_roots> if this
10166argument is used, in this case handlers are called with the following arguments:
10167C<$t> (the twig), C<$tag> (the tag of the element) and C<%att> (a hash of the
10168attributes of the element).
10169
10170If the C<twig_print_outside_roots> argument is also used, if the last handler
10171called returns  a C<true> value, then the start tag will be output as it
10172appeared in the original document, if the handler returns a C<false> value
10173then the start tag will B<not> be printed (so you can print a modified string
10174yourself for example).
10175
10176Note that you can use the L<ignore> method in C<start_tag_handlers>
10177(and only there).
10178
10179=item end_tag_handlers
10180
10181A hash C<{ expression => \&handler}>. Sets element handlers that are called when
10182the element is closed (at the end of the XML::Parser C<End> handler). The handlers
10183are called with 2 params: the twig and the tag of the element.
10184
10185I<twig_handlers> are called when an element is completely parsed, so why have
10186this redundant option? There is only one use for C<end_tag_handlers>: when using
10187the C<twig_roots> option, to trigger a handler for an element B<outside> the roots.
10188It is for example very useful to number titles in a document using nested
10189sections:
10190
10191  my @no= (0);
10192  my $no;
10193  my $t= XML::Twig->new(
10194          start_tag_handlers =>
10195           { section => sub { $no[$#no]++; $no= join '.', @no; push @no, 0; } },
10196          twig_roots         =>
10197           { title   => sub { $_[1]->prefix( $no); $_[1]->print; } },
10198          end_tag_handlers   => { section => sub { pop @no;  } },
10199          twig_print_outside_roots => 1
10200                      );
10201   $t->parsefile( $file);
10202
10203Using the C<end_tag_handlers> argument without C<twig_roots> will result in an
10204error.
10205
10206=item do_not_chain_handlers
10207
10208If this option is set to a true value, then only one handler will be called for
10209each element, even if several satisfy the condition
10210
10211Note that the C<_all_> handler will still be called regardless
10212
10213=item ignore_elts
10214
10215This option lets you ignore elements when building the twig. This is useful
10216in cases where you cannot use C<twig_roots> to ignore elements, for example if
10217the element to ignore is a sibling of elements you are interested in.
10218
10219Example:
10220
10221  my $twig= XML::Twig->new( ignore_elts => { elt => 'discard' });
10222  $twig->parsefile( 'doc.xml');
10223
10224This will build the complete twig for the document, except that all C<elt>
10225elements (and their children) will be left out.
10226
10227The keys in the hash are triggers, limited to the same subset as
10228C<L<start_tag_handlers>>. The values can be C<discard>, to discard
10229the element, C<print>, to output the element as-is, C<string> to
10230store the text of the ignored element(s), including markup, in a field of
10231the twig: C<< $t->{twig_buffered_string} >> or a reference to a scalar, in
10232which case the text of the ignored element(s), including markup, will be
10233stored in the scalar. Any other value will be treated as C<discard>.
10234
10235
10236=item char_handler
10237
10238A reference to a subroutine that will be called every time C<PCDATA> is found.
10239
10240The subroutine receives the string as argument, and returns the modified string:
10241
10242  # we want all strings in upper case
10243  sub my_char_handler
10244    { my( $text)= @_;
10245      $text= uc( $text);
10246      return $text;
10247    }
10248
10249=item elt_class
10250
10251The name of a class used to store elements. this class should inherit from
10252C<XML::Twig::Elt> (and by default it is C<XML::Twig::Elt>). This option is used
10253to subclass the element class and extend it with new methods.
10254
10255This option is needed because during the parsing of the XML, elements are created
10256by C<XML::Twig>, without any control from the user code.
10257
10258=item keep_atts_order
10259
10260Setting this option to a true value causes the attribute hash to be tied to
10261a C<Tie::IxHash> object.
10262This means that C<Tie::IxHash> needs to be installed for this option to be
10263available. It also means that the hash keeps its order, so you will get
10264the attributes in order. This allows outputting the attributes in the same
10265order as they were in the original document.
10266
10267=item keep_encoding
10268
10269This is a (slightly?) evil option: if the XML document is not UTF-8 encoded and
10270you want to keep it that way, then setting keep_encoding will use theC<Expat>
10271original_string method for character, thus keeping the original encoding, as
10272well as the original entities in the strings.
10273
10274See the C<t/test6.t> test file to see what results you can expect from the
10275various encoding options.
10276
10277B<WARNING>: if the original encoding is multi-byte then attribute parsing will
10278be EXTREMELY unsafe under any Perl before 5.6, as it uses regular expressions
10279which do not deal properly with multi-byte characters. You can specify an
10280alternate function to parse the start tags with the C<parse_start_tag> option
10281(see below)
10282
10283B<WARNING>: this option is NOT used when parsing with the non-blocking parser
10284(C<parse_start>, C<parse_more>, parse_done methods) which you probably should
10285not use with XML::Twig anyway as they are totally untested!
10286
10287=item output_encoding
10288
10289This option generates an output_filter using C<Encode>,  C<Text::Iconv> or
10290C<Unicode::Map8> and C<Unicode::Strings>, and sets the encoding in the XML
10291declaration. This is the easiest way to deal with encodings, if you need
10292more sophisticated features, look at C<output_filter> below
10293
10294
10295=item output_filter
10296
10297This option is used to convert the character encoding of the output document.
10298It is passed either a string corresponding to a predefined filter or
10299a subroutine reference. The filter will be called every time a document or
10300element is processed by the "print" functions (C<print>, C<sprint>, C<flush>).
10301
10302Pre-defined filters:
10303
10304=over 4
10305
10306=item latin1
10307
10308uses either C<Encode>, C<Text::Iconv> or C<Unicode::Map8> and C<Unicode::String>
10309or a regexp (which works only with XML::Parser 2.27), in this order, to convert
10310all characters to ISO-8859-15 (usually latin1 is synonym to ISO-8859-1, but
10311in practice it seems that ISO-8859-15, which includes the euro sign, is more
10312useful and probably what most people want).
10313
10314=item html
10315
10316does the same conversion as C<latin1>, plus encodes entities using
10317C<HTML::Entities> (oddly enough you will need to have HTML::Entities installed
10318for it to be available). This should only be used if the tags and attribute
10319names themselves are in US-ASCII, or they will be converted and the output will
10320not be valid XML any more
10321
10322=item safe
10323
10324converts the output to ASCII (US) only  plus I<character entities> (C<&#nnn;>)
10325this should be used only if the tags and attribute names themselves are in
10326US-ASCII, or they will be converted and the output will not be valid XML any
10327more
10328
10329=item safe_hex
10330
10331same as C<safe> except that the character entities are in hex (C<&#xnnn;>)
10332
10333=item encode_convert ($encoding)
10334
10335Return a subref that can be used to convert utf8 strings to C<$encoding>).
10336Uses C<Encode>.
10337
10338   my $conv = XML::Twig::encode_convert( 'latin1');
10339   my $t = XML::Twig->new(output_filter => $conv);
10340
10341=item iconv_convert ($encoding)
10342
10343this function is used to create a filter subroutine that will be used to
10344convert the characters to the target encoding using C<Text::Iconv> (which needs
10345to be installed, look at the documentation for the module and for the
10346C<iconv> library to find out which encodings are available on your system)
10347
10348   my $conv = XML::Twig::iconv_convert( 'latin1');
10349   my $t = XML::Twig->new(output_filter => $conv);
10350
10351=item unicode_convert ($encoding)
10352
10353this function is used to create a filter subroutine that will be used to
10354convert the characters to the target encoding using  C<Unicode::Strings>
10355and C<Unicode::Map8> (which need to be installed, look at the documentation
10356for the modules to find out which encodings are available on your system)
10357
10358   my $conv = XML::Twig::unicode_convert( 'latin1');
10359   my $t = XML::Twig->new(output_filter => $conv);
10360
10361=back
10362
10363The C<text> and C<att> methods do not use the filter, so their
10364result are always in unicode.
10365
10366Those predeclared filters are based on subroutines that can be used
10367by themselves (as C<XML::Twig::foo>).
10368
10369=over 4
10370
10371=item html_encode ($string)
10372
10373Use C<HTML::Entities> to encode a utf8 string
10374
10375=item safe_encode ($string)
10376
10377Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
10378in the string in C<< &#<nnnn>; >> format
10379
10380=item safe_encode_hex ($string)
10381
10382Use either a regexp (perl < 5.8) or C<Encode> to encode non-ascii characters
10383in the string in C<< &#x<nnnn>; >> format
10384
10385=item regexp2latin1 ($string)
10386
10387Use a regexp to encode a utf8 string into latin 1 (ISO-8859-1). Does not
10388work with Perl 5.8.0!
10389
10390=back
10391
10392=item output_text_filter
10393
10394same as output_filter, except it doesn't apply to the brackets and quotes
10395around attribute values. This is useful for all filters that could change
10396the tagging, basically anything that does not just change the encoding of
10397the output. C<html>, C<safe> and C<safe_hex> are better used with this option.
10398
10399=item input_filter
10400
10401This option is similar to C<output_filter> except the filter is applied to
10402the characters before they are stored in the twig, at parsing time.
10403
10404=item remove_cdata
10405
10406Setting this option to a true value will force the twig to output CDATA
10407sections as regular (escaped) PCDATA
10408
10409=item parse_start_tag
10410
10411If you use the C<keep_encoding> option then this option can be used to replace
10412the default parsing function. You should provide a coderef (a reference to a
10413subroutine) as the argument, this subroutine takes the original tag (given
10414by XML::Parser::Expat C<original_string()> method) and returns a tag and the
10415attributes in a hash (or in a list attribute_name/attribute value).
10416
10417=item expand_external_ents
10418
10419When this option is used external entities (that are defined) are expanded
10420when the document is output using "print" functions such as C<L<print> >,
10421C<L<sprint> >, C<L<flush> > and C<L<xml_string> >.
10422Note that in the twig the entity will be stored as an element with a
10423tag 'C<#ENT>', the entity will not be expanded there, so you might want to
10424process the entities before outputting it.
10425
10426If an external entity is not available, then the parse will fail.
10427
10428A special case is when the value of this option is -1. In that case a missing
10429entity will not cause the parser to die, but its C<name>, C<sysid> and C<pubid>
10430will be stored in the twig as C<< $twig->{twig_missing_system_entities} >>
10431(a reference to an array of hashes { name => <name>, sysid => <sysid>,
10432pubid => <pubid> }). Yes, this is a bit of a hack, but it's useful in some
10433cases.
10434
10435=item load_DTD
10436
10437If this argument is set to a true value, C<parse> or C<parsefile> on the twig
10438will load  the DTD information. This information can then be accessed through
10439the twig, in a C<DTD_handler> for example. This will load even an external DTD.
10440
10441Default and fixed values for attributes will also be filled, based on the DTD.
10442
10443Note that to do this the module will generate a temporary file in the current
10444directory. If this is a problem let me know and I will add an option to
10445specify an alternate directory.
10446
10447See L<DTD Handling> for more information
10448
10449=item DTD_handler
10450
10451Set a handler that will be called once the doctype (and the DTD) have been
10452loaded, with 2 arguments, the twig and the DTD.
10453
10454=item no_prolog
10455
10456Does not output a prolog (XML declaration and DTD)
10457
10458=item id
10459
10460This optional argument gives the name of an attribute that can be used as
10461an ID in the document. Elements whose ID is known can be accessed through
10462the elt_id method. id defaults to 'id'.
10463See C<L<BUGS> >
10464
10465=item discard_spaces
10466
10467If this optional argument is set to a true value then spaces are discarded
10468when they look non-significant: strings containing only spaces and at least
10469one line feed are discarded. This argument is set to true by default.
10470
10471The exact algorithm to drop spaces is: strings including only spaces (perl \s)
10472and at least one \n right before an open or close tag are dropped.
10473
10474=item discard_all_spaces
10475
10476If this argument is set to a true value, spaces are discarded more
10477aggressively than with C<discard_spaces>: strings not including a \n are also
10478dropped. This option is appropriate for data-oriented XML.
10479
10480
10481=item keep_spaces
10482
10483If this optional argument is set to a true value then all spaces in the
10484document are kept, and stored as C<PCDATA>.
10485
10486B<Warning>: adding this option can result in changes in the twig generated:
10487space that was previously discarded might end up in a new text element. see
10488the difference by calling the following code with 0 and 1 as arguments:
10489
10490  perl -MXML::Twig -e'print XML::Twig->new( keep_spaces => shift)->parse( "<d> \n<e/></d>")->_dump'
10491
10492
10493C<keep_spaces> and C<discard_spaces> cannot be both set.
10494
10495=item discard_spaces_in
10496
10497This argument sets C<keep_spaces> to true but will cause the twig builder to
10498discard spaces in the elements listed.
10499
10500The syntax for using this argument is:
10501
10502  XML::Twig->new( discard_spaces_in => [ 'elt1', 'elt2']);
10503
10504=item keep_spaces_in
10505
10506This argument sets C<discard_spaces> to true but will cause the twig builder to
10507keep spaces in the elements listed.
10508
10509The syntax for using this argument is:
10510
10511  XML::Twig->new( keep_spaces_in => [ 'elt1', 'elt2']);
10512
10513B<Warning>: adding this option can result in changes in the twig generated:
10514space that was previously discarded might end up in a new text element.
10515
10516=item pretty_print
10517
10518Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
10519'C<nice>', 'C<indented>', 'C<indented_c>', 'C<indented_a>',
10520'C<indented_close_tag>', 'C<cvs>', 'C<wrapped>', 'C<record>' and 'C<record_c>'
10521
10522pretty_print formats:
10523
10524=over 4
10525
10526=item none
10527
10528The document is output as one ling string, with no line breaks except those
10529found within text elements
10530
10531=item nsgmls
10532
10533Line breaks are inserted in safe places: that is within tags, between a tag
10534and an attribute, between attributes and before the > at the end of a tag.
10535
10536This is quite ugly but better than C<none>, and it is very safe, the document
10537will still be valid (conforming to its DTD).
10538
10539This is how the SGML parser C<sgmls> splits documents, hence the name.
10540
10541=item nice
10542
10543This option inserts line breaks before any tag that does not contain text (so
10544element with textual content are not broken as the \n is the significant).
10545
10546B<WARNING>: this option leaves the document well-formed but might make it
10547invalid (not conformant to its DTD). If you have elements declared as
10548
10549  <!ELEMENT foo (#PCDATA|bar)>
10550
10551then a C<foo> element including a C<bar> one will be printed as
10552
10553  <foo>
10554  <bar>bar is just pcdata</bar>
10555  </foo>
10556
10557This is invalid, as the parser will take the line break after the C<foo> tag
10558as a sign that the element contains PCDATA, it will then die when it finds the
10559C<bar> tag. This may or may not be important for you, but be aware of it!
10560
10561=item indented
10562
10563Same as C<nice> (and with the same warning) but indents elements according to
10564their level
10565
10566=item indented_c
10567
10568Same as C<indented> but a little more compact: the closing tags are on the
10569same line as the preceding text
10570
10571=item indented_close_tag
10572
10573Same as C<indented> except that the closing tag is also indented, to line up
10574with the tags within the element
10575
10576=item idented_a
10577
10578This formats XML files in a line-oriented version control friendly way.
10579The format is described in L<http://tinyurl.com/2kwscq> (that's an Oracle
10580document with an insanely long URL).
10581
10582Note that to be totaly conformant to the "spec", the order of attributes
10583should not be changed, so if they are not already in alphabetical order
10584you will need to use the C<L<keep_atts_order>> option.
10585
10586=item cvs
10587
10588Same as C<L<idented_a>>.
10589
10590=item wrapped
10591
10592Same as C<indented_c> but lines are wrapped using L<Text::Wrap::wrap>. The
10593default length for lines is the default for C<$Text::Wrap::columns>, and can
10594be changed by changing that variable.
10595
10596=item record
10597
10598This is a record-oriented pretty print, that display data in records, one field
10599per line (which looks a LOT like C<indented>)
10600
10601=item record_c
10602
10603Stands for record compact, one record per line
10604
10605=back
10606
10607
10608=item empty_tags
10609
10610Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>').
10611
10612C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
10613'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
10614'C<< <tag></tag> >>'
10615
10616=item quote
10617
10618Set the quote character for attributes ('C<single>' or 'C<double>').
10619
10620=item escape_gt
10621
10622By default XML::Twig does not escape the character > in its output, as it is not
10623mandated by the XML spec. With this option on, > will be replaced by C<&gt;>
10624
10625=item comments
10626
10627Set the way comments are processed: 'C<drop>' (default), 'C<keep>' or
10628'C<process>'
10629
10630Comments processing options:
10631
10632=over 4
10633
10634=item drop
10635
10636drops the comments, they are not read, nor printed to the output
10637
10638=item keep
10639
10640comments are loaded and will appear on the output, they are not
10641accessible within the twig and will not interfere with processing
10642though
10643
10644B<Note>: comments in the middle of a text element such as
10645
10646  <p>text <!-- comment --> more text --></p>
10647
10648are kept at their original position in the text. Using ˝"print"
10649methods like C<print> or C<sprint> will return the comments in the
10650text. Using C<text> or C<field> on the other hand will not.
10651
10652Any use of C<set_pcdata> on the C<#PCDATA> element (directly or
10653through other methods like C<set_content>) will delete the comment(s).
10654
10655=item process
10656
10657comments are loaded in the twig and will be treated as regular elements
10658(their C<tag> is C<#COMMENT>) this can interfere with processing if you
10659expect C<< $elt->{first_child} >> to be an element but find a comment there.
10660Validation will not protect you from this as comments can happen anywhere.
10661You can use C<< $elt->first_child( 'tag') >> (which is a good habit anyway)
10662to get where you want.
10663
10664Consider using C<process> if you are outputting SAX events from XML::Twig.
10665
10666=back
10667
10668=item pi
10669
10670Set the way processing instructions are processed: 'C<drop>', 'C<keep>'
10671(default) or 'C<process>'
10672
10673Note that you can also set PI handlers in the C<twig_handlers> option:
10674
10675  '?'       => \&handler
10676  '?target' => \&handler 2
10677
10678The handlers will be called with 2 parameters, the twig and the PI element if
10679C<pi> is set to C<process>, and with 3, the twig, the target and the data if
10680C<pi> is set to C<keep>. Of course they will not be called if C<pi> is set to
10681C<drop>.
10682
10683If C<pi> is set to C<keep> the handler should return a string that will be used
10684as-is as the PI text (it should look like "C< <?target data?> >" or '' if you
10685want to remove the PI),
10686
10687Only one handler will be called, C<?target> or C<?> if no specific handler for
10688that target is available.
10689
10690=item map_xmlns
10691
10692This option is passed a hashref that maps uri's to prefixes. The prefixes in
10693the document will be replaced by the ones in the map. The mapped prefixes can
10694(actually have to) be used to trigger handlers, navigate or query the document.
10695
10696Here is an example:
10697
10698  my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
10699                         twig_handlers =>
10700                           { 'svg:circle' => sub { $_->set_att( r => 20) } },
10701                         pretty_print => 'indented',
10702                       )
10703                  ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
10704                              <gr:circle cx="10" cy="90" r="10"/>
10705                           </doc>'
10706                         )
10707                  ->print;
10708
10709This will output:
10710
10711  <doc xmlns:svg="http://www.w3.org/2000/svg">
10712     <svg:circle cx="10" cy="90" r="20"/>
10713  </doc>
10714
10715=item keep_original_prefix
10716
10717When used with C<L<map_xmlns>> this option will make C<XML::Twig> use the original
10718namespace prefixes when outputting a document. The mapped prefix will still be used
10719for triggering handlers and in navigation and query methods.
10720
10721  my $t= XML::Twig->new( map_xmlns => {'http://www.w3.org/2000/svg' => "svg"},
10722                         twig_handlers =>
10723                           { 'svg:circle' => sub { $_->set_att( r => 20) } },
10724                         keep_original_prefix => 1,
10725                         pretty_print => 'indented',
10726                       )
10727                  ->parse( '<doc xmlns:gr="http://www.w3.org/2000/svg">
10728                              <gr:circle cx="10" cy="90" r="10"/>
10729                           </doc>'
10730                         )
10731                  ->print;
10732
10733This will output:
10734
10735  <doc xmlns:gr="http://www.w3.org/2000/svg">
10736     <gr:circle cx="10" cy="90" r="20"/>
10737  </doc>
10738
10739=item original_uri ($prefix)
10740
10741called within a handler, this will return the uri bound to the namespace prefix
10742in the original document.
10743
10744=item index ($arrayref or $hashref)
10745
10746This option creates lists of specific elements during the parsing of the XML.
10747It takes a reference to either a list of triggering expressions or to a hash
10748name => expression, and for each one generates the list of elements that
10749match the expression. The list can be accessed through the C<L<index>> method.
10750
10751example:
10752
10753  # using an array ref
10754  my $t= XML::Twig->new( index => [ 'div', 'table' ])
10755                  ->parsefile( "foo.xml");
10756  my $divs= $t->index( 'div');
10757  my $first_div= $divs->[0];
10758  my $last_table= $t->index( table => -1);
10759
10760  # using a hashref to name the indexes
10761  my $t= XML::Twig->new( index => { email => 'a[@href=~/^ \s*mailto:/]'})
10762                  ->parsefile( "foo.xml");
10763  my $last_emails= $t->index( email => -1);
10764
10765Note that the index is not maintained after the parsing. If elements are
10766deleted, renamed or otherwise hurt during processing, the index is NOT updated.
10767(changing the id element OTOH will update the index)
10768
10769=item att_accessors <list of attribute names>
10770
10771creates methods that give direct access to attribute:
10772
10773  my $t= XML::Twig->new( att_accessors => [ 'href', 'src'])
10774                  ->parsefile( $file);
10775  my $first_href= $t->first_elt( 'img')->src; # same as ->att( 'src')
10776  $t->first_elt( 'img')->src( 'new_logo.png') # changes the attribute value
10777
10778=item elt_accessors
10779
10780creates methods that give direct access to the first child element (in scalar context)
10781or the list of elements (in list context):
10782
10783the list of accessors to create can be given 1 2 different ways: in an array,
10784or in a hash alias => expression
10785  my $t=  XML::Twig->new( elt_accessors => [ 'head'])
10786                  ->parsefile( $file);
10787  my $title_text= $t->root->head->field( 'title');
10788  # same as $title_text= $t->root->first_child( 'head')->field( 'title');
10789
10790  my $t=  XML::Twig->new( elt_accessors => { warnings => 'p[@class="warning"]', d2 => 'div[2]'}, )
10791                  ->parsefile( $file);
10792  my $body= $t->first_elt( 'body');
10793  my @warnings= $body->warnings; # same as $body->children( 'p[@class="warning"]');
10794  my $s2= $body->d2;             # same as $body->first_child( 'div[2]')
10795
10796=item field_accessors
10797
10798creates methods that give direct access to the first child element text:
10799
10800  my $t=  XML::Twig->new( field_accessors => [ 'h1'])
10801                  ->parsefile( $file);
10802  my $div_title_text= $t->first_elt( 'div')->title;
10803  # same as $title_text= $t->first_elt( 'div')->field( 'title');
10804
10805=item use_tidy
10806
10807set this option to use HTML::Tidy instead of HTML::TreeBuilder to convert
10808HTML to XML. HTML, especially real (real "crap") HTML found in the wild,
10809so depending on the data, one module or the other does a better job at
10810the conversion. Also, HTML::Tidy can be a bit difficult to install, so
10811XML::Twig offers both option. TIMTOWTDI
10812
10813=item output_html_doctype
10814
10815when using HTML::TreeBuilder to convert HTML, this option causes the DOCTYPE
10816declaration to be output, which may be important for some legacy browsers.
10817Without that option the DOCTYPE definition is NOT output. Also if the definition
10818is completely wrong (ie not easily parsable), it is not output either.
10819
10820=back
10821
10822B<Note>: I _HATE_ the Java-like name of arguments used by most XML modules.
10823So in pure TIMTOWTDI fashion all arguments can be written either as
10824C<UglyJavaLikeName> or as C<readable_perl_name>: C<twig_print_outside_roots>
10825or C<TwigPrintOutsideRoots> (or even C<twigPrintOutsideRoots> {shudder}).
10826XML::Twig normalizes them before processing them.
10827
10828=item parse ( $source)
10829
10830The C<$source> parameter should either be a string containing the whole XML
10831document, or it should be an open C<IO::Handle> (aka a filehandle).
10832
10833A die call is thrown if a parse error occurs. Otherwise it will return
10834the twig built by the parse. Use C<safe_parse> if you want the parsing
10835to return even when an error occurs.
10836
10837If this method is called as a class method
10838(C<< XML::Twig->parse( $some_xml_or_html) >>) then an XML::Twig object is
10839created, using the parameters except the last one (eg
10840C<< XML::Twig->parse( pretty_print => 'indented', $some_xml_or_html) >>)
10841and C<L<xparse>> is called on it.
10842
10843Note that when parsing a filehandle, the handle should NOT be open with an
10844encoding (ie open with C<open( my $in, '<', $filename)>. The file will be
10845parsed by C<expat>, so specifying the encoding actually causes problems
10846for the parser (as in: it can crash it, see
10847https://rt.cpan.org/Ticket/Display.html?id=78877). For parsing a file it
10848is actually recommended to use C<parsefile> on the file name, instead of
10849<parse> on the open file.
10850
10851=item parsestring
10852
10853This is just an alias for C<parse> for backwards compatibility.
10854
10855=item parsefile (FILE [, OPT => OPT_VALUE [...]])
10856
10857Open C<FILE> for reading, then call C<parse> with the open handle. The file
10858is closed no matter how C<parse> returns.
10859
10860A C<die> call is thrown if a parse error occurs. Otherwise it will return
10861the twig built by the parse. Use C<safe_parsefile> if you want the parsing
10862to return even when an error occurs.
10863
10864=item parsefile_inplace ( $file, $optional_extension)
10865
10866Parse and update a file "in place". It does this by creating a temp file,
10867selecting it as the default for print() statements (and methods), then parsing
10868the input file. If the parsing is successful, then the temp file is
10869moved to replace the input file.
10870
10871If an extension is given then the original file is backed-up (the rules for
10872the extension are the same as the rule for the -i option in perl).
10873
10874=item parsefile_html_inplace ( $file, $optional_extension)
10875
10876Same as parsefile_inplace, except that it parses HTML instead of XML
10877
10878=item parseurl ($url $optional_user_agent)
10879
10880Gets the data from C<$url> and parse it. The data is piped to the parser in
10881chunks the size of the XML::Parser::Expat buffer, so memory consumption and
10882hopefully speed are optimal.
10883
10884For most (read "small") XML it is probably as efficient (and easier to debug)
10885to just C<get> the XML file and then parse it as a string.
10886
10887  use XML::Twig;
10888  use LWP::Simple;
10889  my $twig= XML::Twig->new();
10890  $twig->parse( LWP::Simple::get( $URL ));
10891
10892or
10893
10894  use XML::Twig;
10895  my $twig= XML::Twig->nparse( $URL);
10896
10897
10898If the C<$optional_user_agent> argument is used then it is used, otherwise a
10899new one is created.
10900
10901=item safe_parse ( SOURCE [, OPT => OPT_VALUE [...]])
10902
10903This method is similar to C<parse> except that it wraps the parsing in an
10904C<eval> block. It returns the twig on success and 0 on failure (the twig object
10905also contains the parsed twig). C<$@> contains the error message on failure.
10906
10907Note that the parsing still stops as soon as an error is detected, there is
10908no way to keep going after an error.
10909
10910=item safe_parsefile (FILE [, OPT => OPT_VALUE [...]])
10911
10912This method is similar to C<parsefile> except that it wraps the parsing in an
10913C<eval> block. It returns the twig on success and 0 on failure (the twig object
10914also contains the parsed twig) . C<$@> contains the error message on failure
10915
10916Note that the parsing still stops as soon as an error is detected, there is
10917no way to keep going after an error.
10918
10919=item safe_parseurl ($url $optional_user_agent)
10920
10921Same as C<parseurl> except that it wraps the parsing in an C<eval> block. It
10922returns the twig on success and 0 on failure (the twig object also contains
10923the parsed twig) . C<$@> contains the error message on failure
10924
10925=item parse_html ($string_or_fh)
10926
10927parse an HTML string or file handle (by converting it to XML using
10928HTML::TreeBuilder, which needs to be available).
10929
10930This works nicely, but some information gets lost in the process:
10931newlines are removed, and (at least on the version I use), comments
10932get an extra CDATA section inside ( <!-- foo --> becomes
10933<!-- <![CDATA[ foo ]]> -->
10934
10935=item parsefile_html ($file)
10936
10937parse an HTML file (by converting it to XML using HTML::TreeBuilder, which
10938needs to be available, or HTML::Tidy if the C<use_tidy> option was used).
10939The file is loaded completely in memory and converted to XML before being parsed.
10940
10941this method is to be used with caution though, as it doesn't know about the
10942file encoding, it is usually better to use C<L<parse_html>>, which gives you
10943a chance to open the file with the proper encoding layer.
10944
10945=item parseurl_html ($url $optional_user_agent)
10946
10947parse an URL as html the same way C<L<parse_html>> does
10948
10949=item safe_parseurl_html ($url $optional_user_agent)
10950
10951Same as C<L<parseurl_html>>> except that it wraps the parsing in an C<eval>
10952block.  It returns the twig on success and 0 on failure (the twig object also
10953contains the parsed twig) . C<$@> contains the error message on failure
10954
10955=item safe_parsefile_html ($file $optional_user_agent)
10956
10957Same as C<L<parsefile_html>>> except that it wraps the parsing in an C<eval>
10958block.  It returns the twig on success and 0 on failure (the twig object also
10959contains the parsed twig) . C<$@> contains the error message on failure
10960
10961=item safe_parse_html ($string_or_fh)
10962
10963Same as C<L<parse_html>> except that it wraps the parsing in an C<eval> block.
10964It returns the twig on success and 0 on failure (the twig object also contains
10965the parsed twig) . C<$@> contains the error message on failure
10966
10967=item xparse ($thing_to_parse)
10968
10969parse the C<$thing_to_parse>, whether it is a filehandle, a string, an HTML
10970file, an HTML URL, an URL or a file.
10971
10972Note that this is mostly a convenience method for one-off scripts. For example
10973files that end in '.htm' or '.html' are parsed first as XML, and if this fails
10974as HTML. This is certainly not the most efficient way to do this in general.
10975
10976=item nparse ($optional_twig_options, $thing_to_parse)
10977
10978create a twig with the C<$optional_options>, and parse the C<$thing_to_parse>,
10979whether it is a filehandle, a string, an HTML file, an HTML URL, an URL or a
10980file.
10981
10982Examples:
10983
10984   XML::Twig->nparse( "file.xml");
10985   XML::Twig->nparse( error_context => 1, "file://file.xml");
10986
10987=item nparse_pp ($optional_twig_options, $thing_to_parse)
10988
10989same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>.
10990
10991=item nparse_e ($optional_twig_options, $thing_to_parse)
10992
10993same as C<L<nparse>> but also sets the C<error_context> option to 1.
10994
10995=item nparse_ppe ($optional_twig_options, $thing_to_parse)
10996
10997same as C<L<nparse>> but also sets the C<pretty_print> option to C<indented>
10998and the C<error_context> option to 1.
10999
11000=item parser
11001
11002This method returns the C<expat> object (actually the XML::Parser::Expat object)
11003used during parsing. It is useful for example to call XML::Parser::Expat methods
11004on it. To get the line of a tag for example use C<< $t->parser->current_line >>.
11005
11006=item setTwigHandlers ($handlers)
11007
11008Set the twig_handlers. C<$handlers> is a reference to a hash similar to the
11009one in the C<twig_handlers> option of new. All previous handlers are unset.
11010The method returns the reference to the previous handlers.
11011
11012=item setTwigHandler ($exp $handler)
11013
11014Set a single twig_handler for elements matching C<$exp>. C<$handler> is a
11015reference to a subroutine. If the handler was previously set then the reference
11016to the previous handler is returned.
11017
11018=item setStartTagHandlers ($handlers)
11019
11020Set the start_tag handlers. C<$handlers> is a reference to a hash similar to the
11021one in the C<start_tag_handlers> option of new. All previous handlers are unset.
11022The method returns the reference to the previous handlers.
11023
11024=item setStartTagHandler ($exp $handler)
11025
11026Set a single start_tag handlers for elements matching C<$exp>. C<$handler> is a
11027reference to a subroutine. If the handler was previously set then the reference
11028to the previous handler is returned.
11029
11030=item setEndTagHandlers ($handlers)
11031
11032Set the end_tag handlers. C<$handlers> is a reference to a hash similar to the
11033one in the C<end_tag_handlers> option of new. All previous handlers are unset.
11034The method returns the reference to the previous handlers.
11035
11036=item setEndTagHandler ($exp $handler)
11037
11038Set a single end_tag handlers for elements matching C<$exp>. C<$handler> is a
11039reference to a subroutine. If the handler was previously set then the
11040reference to the previous handler is returned.
11041
11042=item setTwigRoots ($handlers)
11043
11044Same as using the C<L<twig_roots>> option when creating the twig
11045
11046=item setCharHandler ($exp $handler)
11047
11048Set a C<char_handler>
11049
11050=item setIgnoreEltsHandler ($exp)
11051
11052Set a C<ignore_elt> handler (elements that match C<$exp> will be ignored
11053
11054=item setIgnoreEltsHandlers ($exp)
11055
11056Set all C<ignore_elt> handlers (previous handlers are replaced)
11057
11058=item dtd
11059
11060Return the dtd (an L<XML::Twig::DTD> object) of a twig
11061
11062=item xmldecl
11063
11064Return the XML declaration for the document, or a default one if it doesn't
11065have one
11066
11067=item doctype
11068
11069Return the doctype for the document
11070
11071=item doctype_name
11072
11073returns the doctype of the document from the doctype declaration
11074
11075=item system_id
11076
11077returns the system value of the DTD of the document from the doctype declaration
11078
11079=item public_id
11080
11081returns the public doctype of the document from the doctype declaration
11082
11083=item internal_subset
11084
11085returns the internal subset of the DTD
11086
11087=item dtd_text
11088
11089Return the DTD text
11090
11091=item dtd_print
11092
11093Print the DTD
11094
11095=item model ($tag)
11096
11097Return the model (in the DTD) for the element C<$tag>
11098
11099=item root
11100
11101Return the root element of a twig
11102
11103=item set_root ($elt)
11104
11105Set the root of a twig
11106
11107=item first_elt ($optional_condition)
11108
11109Return the first element matching C<$optional_condition> of a twig, if
11110no condition is given then the root is returned
11111
11112=item last_elt ($optional_condition)
11113
11114Return the last element matching C<$optional_condition> of a twig, if
11115no condition is given then the last element of the twig is returned
11116
11117=item elt_id        ($id)
11118
11119Return the element whose C<id> attribute is $id
11120
11121=item getEltById
11122
11123Same as C<L<elt_id>>
11124
11125=item index ($index_name, $optional_index)
11126
11127If the C<$optional_index> argument is present, return the corresponding element
11128in the index (created using the C<index> option for C<XML::Twig->new>)
11129
11130If the argument is not present, return an arrayref to the index
11131
11132=item normalize
11133
11134merge together all consecutive pcdata elements in the document (if for example
11135you have turned some elements into pcdata using C<L<erase>>, this will give you
11136a "clean" document in which there all text elements are as long as possible).
11137
11138=item encoding
11139
11140This method returns the encoding of the XML document, as defined by the
11141C<encoding> attribute in the XML declaration (ie it is C<undef> if the attribute
11142is not defined)
11143
11144=item set_encoding
11145
11146This method sets the value of the C<encoding> attribute in the XML declaration.
11147Note that if the document did not have a declaration it is generated (with
11148an XML version of 1.0)
11149
11150=item xml_version
11151
11152This method returns the XML version, as defined by the C<version> attribute in
11153the XML declaration (ie it is C<undef> if the attribute is not defined)
11154
11155=item set_xml_version
11156
11157This method sets the value of the C<version> attribute in the XML declaration.
11158If the declaration did not exist it is created.
11159
11160=item standalone
11161
11162This method returns the value of the C<standalone> declaration for the document
11163
11164=item set_standalone
11165
11166This method sets the value of the C<standalone> attribute in the XML
11167declaration.  Note that if the document did not have a declaration it is
11168generated (with an XML version of 1.0)
11169
11170=item set_output_encoding
11171
11172Set the C<encoding> "attribute" in the XML declaration
11173
11174=item set_doctype ($name, $system, $public, $internal)
11175
11176Set the doctype of the element. If an argument is C<undef> (or not present)
11177then its former value is retained, if a false ('' or 0) value is passed then
11178the former value is deleted;
11179
11180=item entity_list
11181
11182Return the entity list of a twig
11183
11184=item entity_names
11185
11186Return the list of all defined entities
11187
11188=item entity ($entity_name)
11189
11190Return the entity
11191
11192=item change_gi      ($old_gi, $new_gi)
11193
11194Performs a (very fast) global change. All elements C<$old_gi> are now
11195C<$new_gi>. This is a bit dangerous though and should be avoided if
11196< possible, as the new tag might be ignored in subsequent processing.
11197
11198See C<L<BUGS> >
11199
11200=item flush            ($optional_filehandle, %options)
11201
11202Flushes a twig up to (and including) the current element, then deletes
11203all unnecessary elements from the tree that's kept in memory.
11204C<flush> keeps track of which elements need to be open/closed, so if you
11205flush from handlers you don't have to worry about anything. Just keep
11206flushing the twig every time you're done with a sub-tree and it will
11207come out well-formed. After the whole parsing don't forget toC<flush>
11208one more time to print the end of the document.
11209The doctype and entity declarations are also printed.
11210
11211flush take an optional filehandle as an argument.
11212
11213If you use C<flush> at any point during parsing, the document will be flushed
11214one last time at the end of the parsing, to the proper filehandle.
11215
11216options: use the C<update_DTD> option if you have updated the (internal) DTD
11217and/or the entity list and you want the updated DTD to be output
11218
11219The C<pretty_print> option sets the pretty printing of the document.
11220
11221   Example: $t->flush( Update_DTD => 1);
11222            $t->flush( $filehandle, pretty_print => 'indented');
11223            $t->flush( \*FILE);
11224
11225
11226=item flush_up_to ($elt, $optional_filehandle, %options)
11227
11228Flushes up to the C<$elt> element. This allows you to keep part of the
11229tree in memory when you C<flush>.
11230
11231options: see flush.
11232
11233=item purge
11234
11235Does the same as a C<flush> except it does not print the twig. It just deletes
11236all elements that have been completely parsed so far.
11237
11238=item purge_up_to ($elt)
11239
11240Purges up to the C<$elt> element. This allows you to keep part of the tree in
11241memory when you C<purge>.
11242
11243=item print            ($optional_filehandle, %options)
11244
11245Prints the whole document associated with the twig. To be used only AFTER the
11246parse.
11247
11248options: see C<flush>.
11249
11250=item print_to_file    ($filename, %options)
11251
11252Prints the whole document associated with the twig to file C<$filename>.
11253To be used only AFTER the parse.
11254
11255options: see C<flush>.
11256
11257=item safe_print_to_file    ($filename, %options)
11258
11259Prints the whole document associated with the twig to file C<$filename>.
11260This variant, which probably only works on *nix prints to a temp file,
11261then move the temp file to overwrite the original file.
11262
11263This is a bit safer when 2 processes an potentiallywrite the same file:
11264only the last one will succeed, but the file won't be corruted. I often
11265use this for cron jobs, so testing the code doesn't interfere with the
11266cron job running at the same time.
11267
11268options: see C<flush>.
11269
11270=item sprint
11271
11272Return the text of the whole document associated with the twig. To be used only
11273AFTER the parse.
11274
11275options: see C<flush>.
11276
11277=item trim
11278
11279Trim the document: gets rid of initial and trailing spaces, and replaces multiple spaces
11280by a single one.
11281
11282=item toSAX1 ($handler)
11283
11284Send SAX events for the twig to the SAX1 handler C<$handler>
11285
11286=item toSAX2 ($handler)
11287
11288Send SAX events for the twig to the SAX2 handler C<$handler>
11289
11290=item flush_toSAX1 ($handler)
11291
11292Same as flush, except that SAX events are sent to the SAX1 handler
11293C<$handler> instead of the twig being printed
11294
11295=item flush_toSAX2 ($handler)
11296
11297Same as flush, except that SAX events are sent to the SAX2 handler
11298C<$handler> instead of the twig being printed
11299
11300=item ignore
11301
11302This method should be called during parsing, usually in C<start_tag_handlers>.
11303It causes the element to be skipped during the parsing: the twig is not built
11304for this element, it will not be accessible during parsing or after it. The
11305element will not take up any memory and parsing will be faster.
11306
11307Note that this method can also be called on an element. If the element is a
11308parent of the current element then this element will be ignored (the twig will
11309not be built any more for it and what has already been built will be deleted).
11310
11311=item set_pretty_print  ($style)
11312
11313Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
11314'C<nice>', 'C<indented>', C<indented_c>, 'C<wrapped>', 'C<record>' and
11315'C<record_c>'
11316
11317B<WARNING:> the pretty print style is a B<GLOBAL> variable, so once set it's
11318applied to B<ALL> C<print>'s (and C<sprint>'s). Same goes if you use XML::Twig
11319with C<mod_perl> . This should not be a problem as the XML that's generated
11320is valid anyway, and XML processors (as well as HTML processors, including
11321browsers) should not care. Let me know if this is a big problem, but at the
11322moment the performance/cleanliness trade-off clearly favors the global
11323approach.
11324
11325=item set_empty_tag_style  ($style)
11326
11327Set the empty tag display style ('C<normal>', 'C<html>' or 'C<expand>'). As
11328with C<L<set_pretty_print>> this sets a global flag.
11329
11330C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
11331'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
11332'C<< <tag></tag> >>'
11333
11334=item set_remove_cdata  ($flag)
11335
11336set (or unset) the flag that forces the twig to output CDATA sections as
11337regular (escaped) PCDATA
11338
11339=item print_prolog     ($optional_filehandle, %options)
11340
11341Prints the prolog (XML declaration + DTD + entity declarations) of a document.
11342
11343options: see C<L<flush>>.
11344
11345=item prolog     ($optional_filehandle, %options)
11346
11347Return the prolog (XML declaration + DTD + entity declarations) of a document.
11348
11349options: see C<L<flush>>.
11350
11351=item finish
11352
11353Call Expat C<finish> method.
11354Unsets all handlers (including internal ones that set context), but expat
11355continues parsing to the end of the document or until it finds an error.
11356It should finish up a lot faster than with the handlers set.
11357
11358=item finish_print
11359
11360Stops twig processing, flush the twig and proceed to finish printing the
11361document as fast as possible. Use this method when modifying a document and
11362the modification is done.
11363
11364=item finish_now
11365
11366Stops twig processing, does not finish parsing the document (which could
11367actually be not well-formed after the point where C<finish_now> is called).
11368Execution resumes after the C<Lparse>> or C<L<parsefile>> call. The content
11369of the twig is what has been parsed so far (all open elements at the time
11370C<finish_now> is called are considered closed).
11371
11372=item set_expand_external_entities
11373
11374Same as using the C<L<expand_external_ents>> option when creating the twig
11375
11376=item set_input_filter
11377
11378Same as using the C<L<input_filter>> option when creating the twig
11379
11380=item set_keep_atts_order
11381
11382Same as using the C<L<keep_atts_order>> option when creating the twig
11383
11384=item set_keep_encoding
11385
11386Same as using the C<L<keep_encoding>> option when creating the twig
11387
11388=item escape_gt
11389
11390usually XML::Twig does not escape > in its output. Using this option
11391makes it replace > by &gt;
11392
11393=item do_not_escape_gt
11394
11395reverts XML::Twig behavior to its default of not escaping > in its output.
11396
11397=item set_output_filter
11398
11399Same as using the C<L<output_filter>> option when creating the twig
11400
11401=item set_output_text_filter
11402
11403Same as using the C<L<output_text_filter>> option when creating the twig
11404
11405=item add_stylesheet ($type, @options)
11406
11407Adds an external stylesheet to an XML document.
11408
11409Supported types and options:
11410
11411=over 4
11412
11413=item xsl
11414
11415option: the url of the stylesheet
11416
11417Example:
11418
11419  $t->add_stylesheet( xsl => "xsl_style.xsl");
11420
11421will generate the following PI at the beginning of the document:
11422
11423  <?xml-stylesheet type="text/xsl" href="xsl_style.xsl"?>
11424
11425=item css
11426
11427option: the url of the stylesheet
11428
11429=item active_twig
11430
11431a class method that returns the last processed twig, so you don't necessarily
11432need the object to call methods on it.
11433
11434=back
11435
11436=item Methods inherited from XML::Parser::Expat
11437
11438A twig inherits all the relevant methods from XML::Parser::Expat. These
11439methods can only be used during the parsing phase (they will generate
11440a fatal error otherwise).
11441
11442Inherited methods are:
11443
11444=over 4
11445
11446=item depth
11447
11448Returns the size of the context list.
11449
11450=item in_element
11451
11452Returns true if NAME is equal to the name of the innermost cur‐
11453rently opened element. If namespace processing is being used and
11454you want to check against a name that may be in a namespace, then
11455use the generate_ns_name method to create the NAME argument.
11456
11457=item within_element
11458
11459Returns the number of times the given name appears in the context
11460list.  If namespace processing is being used and you want to check
11461against a name that may be in a namespace, then use the gener‐
11462ate_ns_name method to create the NAME argument.
11463
11464=item context
11465
11466Returns a list of element names that represent open elements, with
11467the last one being the innermost. Inside start and end tag han‐
11468dlers, this will be the tag of the parent element.
11469
11470=item current_line
11471
11472Returns the line number of the current position of the parse.
11473
11474=item current_column
11475
11476Returns the column number of the current position of the parse.
11477
11478=item current_byte
11479
11480Returns the current position of the parse.
11481
11482=item position_in_context
11483
11484Returns a string that shows the current parse position. LINES
11485should be an integer >= 0 that represents the number of lines on
11486either side of the current parse line to place into the returned
11487string.
11488
11489=item base ([NEWBASE])
11490
11491Returns the current value of the base for resolving relative URIs.
11492If NEWBASE is supplied, changes the base to that value.
11493
11494=item current_element
11495
11496Returns the name of the innermost currently opened element. Inside
11497start or end handlers, returns the parent of the element associated
11498with those tags.
11499
11500=item element_index
11501
11502Returns an integer that is the depth-first visit order of the cur‐
11503rent element. This will be zero outside of the root element. For
11504example, this will return 1 when called from the start handler for
11505the root element start tag.
11506
11507=item recognized_string
11508
11509Returns the string from the document that was recognized in order
11510to call the current handler. For instance, when called from a start
11511handler, it will give us the start-tag string. The string is
11512encoded in UTF-8.  This method doesn't return a meaningful string
11513inside declaration handlers.
11514
11515=item original_string
11516
11517Returns the verbatim string from the document that was recognized
11518in order to call the current handler. The string is in the original
11519document encoding. This method doesn't return a meaningful string
11520inside declaration handlers.
11521
11522=item xpcroak
11523
11524Concatenate onto the given message the current line number within
11525the XML document plus the message implied by ErrorContext. Then
11526croak with the formed message.
11527
11528=item xpcarp
11529
11530Concatenate onto the given message the current line number within
11531the XML document plus the message implied by ErrorContext. Then
11532carp with the formed message.
11533
11534=item xml_escape(TEXT [, CHAR [, CHAR ...]])
11535
11536Returns TEXT with markup characters turned into character entities.
11537Any additional characters provided as arguments are also turned
11538into character references where found in TEXT.
11539
11540(this method is broken on some versions of expat/XML::Parser)
11541
11542=back
11543
11544=item path ( $optional_tag)
11545
11546Return the element context in a form similar to XPath's short
11547form: 'C</root/tag1/../tag>'
11548
11549=item get_xpath  ( $optional_array_ref, $xpath, $optional_offset)
11550
11551Performs a C<get_xpath> on the document root (see <Elt|"Elt">)
11552
11553If the C<$optional_array_ref> argument is used the array must contain
11554elements. The C<$xpath> expression is applied to each element in turn
11555and the result is union of all results. This way a first query can be
11556refined in further steps.
11557
11558
11559=item find_nodes ( $optional_array_ref, $xpath, $optional_offset)
11560
11561same as C<get_xpath>
11562
11563=item findnodes ( $optional_array_ref, $xpath, $optional_offset)
11564
11565same as C<get_xpath> (similar to the XML::LibXML method)
11566
11567=item findvalue ( $optional_array_ref, $xpath, $optional_offset)
11568
11569Return the C<join> of all texts of the results of applying C<L<get_xpath>>
11570to the node (similar to the XML::LibXML method)
11571
11572=item findvalues ( $optional_array_ref, $xpath, $optional_offset)
11573
11574Return an array of all texts of the results of applying C<L<get_xpath>>
11575to the node
11576
11577=item subs_text ($regexp, $replace)
11578
11579subs_text does text substitution on the whole document, similar to perl's
11580C< s///> operator.
11581
11582=item dispose
11583
11584Useful only if you don't have C<Scalar::Util> or C<WeakRef> installed.
11585
11586Reclaims properly the memory used by an XML::Twig object. As the object has
11587circular references it never goes out of scope, so if you want to parse lots
11588of XML documents then the memory leak becomes a problem. Use
11589C<< $twig->dispose >> to clear this problem.
11590
11591=item att_accessors (list_of_attribute_names)
11592
11593A convenience method that creates l-valued accessors for attributes.
11594So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11595that can be called on elements:
11596
11597  $elt->foo;         # equivalent to $elt->{'att'}->{'foo'};
11598  $elt->foo( 'bar'); # equivalent to $elt->set_att( foo => 'bar');
11599
11600The methods are l-valued only under those perl's that support this
11601feature (5.6 and above)
11602
11603=item create_accessors (list_of_attribute_names)
11604
11605Same as att_accessors
11606
11607=item elt_accessors (list_of_attribute_names)
11608
11609A convenience method that creates accessors for elements.
11610So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11611that can be called on elements:
11612
11613  $elt->foo;         # equivalent to $elt->first_child( 'foo');
11614
11615=item field_accessors (list_of_attribute_names)
11616
11617A convenience method that creates accessors for element values (C<field>).
11618So C<< $twig->create_accessors( 'foo') >> will create a C<foo> method
11619that can be called on elements:
11620
11621  $elt->foo;         # equivalent to $elt->field( 'foo');
11622
11623=item set_do_not_escape_amp_in_atts
11624
11625An evil method, that I only document because Test::Pod::Coverage complaints otherwise,
11626but really, you don't want to know about it.
11627
11628=back
11629
11630=head2 XML::Twig::Elt
11631
11632=over 4
11633
11634=item new          ($optional_tag, $optional_atts, @optional_content)
11635
11636The C<tag> is optional (but then you can't have a content ), the C<$optional_atts>
11637argument is a reference to a hash of attributes, the content can be just a
11638string or a list of strings and element. A content of 'C<#EMPTY>' creates an empty
11639element;
11640
11641 Examples: my $elt= XML::Twig::Elt->new();
11642           my $elt= XML::Twig::Elt->new( para => { align => 'center' });
11643           my $elt= XML::Twig::Elt->new( para => { align => 'center' }, 'foo');
11644           my $elt= XML::Twig::Elt->new( br   => '#EMPTY');
11645           my $elt= XML::Twig::Elt->new( 'para');
11646           my $elt= XML::Twig::Elt->new( para => 'this is a para');
11647           my $elt= XML::Twig::Elt->new( para => $elt3, 'another para');
11648
11649The strings are not parsed, the element is not attached to any twig.
11650
11651B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
11652this point the element does not belong to a twig yet, so the ID attribute
11653is not known so it won't be stored in the ID list.
11654
11655Note that C<#COMMENT>, C<#PCDATA> or C<#CDATA> are valid tag names, that will
11656create text elements.
11657
11658To create an element C<foo> containing a CDATA section:
11659
11660           my $foo= XML::Twig::Elt->new( '#CDATA' => "content of the CDATA section")
11661                                  ->wrap_in( 'foo');
11662
11663An attribute of '#CDATA', will create the content of the element as CDATA:
11664
11665  my $elt= XML::Twig::Elt->new( 'p' => { '#CDATA' => 1}, 'foo < bar');
11666
11667creates an element
11668
11669  <p><![CDATA[foo < bar]]></>
11670
11671=item parse         ($string, %args)
11672
11673Creates an element from an XML string. The string is actually
11674parsed as a new twig, then the root of that twig is returned.
11675The arguments in C<%args> are passed to the twig.
11676As always if the parse fails the parser will die, so use an
11677eval if you want to trap syntax errors.
11678
11679As obviously the element does not exist beforehand this method has to be
11680called on the class:
11681
11682  my $elt= parse XML::Twig::Elt( "<a> string to parse, with <sub/>
11683                                  <elements>, actually tons of </elements>
11684                  h</a>");
11685
11686=item set_inner_xml ($string)
11687
11688Sets the content of the element to be the tree created from the string
11689
11690=item set_inner_html ($string)
11691
11692Sets the content of the element, after parsing the string with an HTML
11693parser (HTML::Parser)
11694
11695=item set_outer_xml ($string)
11696
11697Replaces the element with the tree created from the string
11698
11699=item print         ($optional_filehandle, $optional_pretty_print_style)
11700
11701Prints an entire element, including the tags, optionally to a
11702C<$optional_filehandle>, optionally with a C<$pretty_print_style>.
11703
11704The print outputs XML data so base entities are escaped.
11705
11706=item print_to_file    ($filename, %options)
11707
11708Prints the element to file C<$filename>.
11709
11710options: see C<flush>.
11711=item sprint       ($elt, $optional_no_enclosing_tag)
11712
11713Return the xml string for an entire element, including the tags.
11714If the optional second argument is true then only the string inside the
11715element is returned (the start and end tag for $elt are not).
11716The text is XML-escaped: base entities (& and < in text, & < and " in
11717attribute values) are turned into entities.
11718
11719=item gi
11720
11721Return the gi of the element (the gi is the C<generic identifier> the tag
11722name in SGML parlance).
11723
11724C<tag> and C<name> are synonyms of C<gi>.
11725
11726=item tag
11727
11728Same as C<L<gi>>
11729
11730=item name
11731
11732Same as C<L<tag>>
11733
11734=item set_gi         ($tag)
11735
11736Set the gi (tag) of an element
11737
11738=item set_tag        ($tag)
11739
11740Set the tag (=C<L<tag>>) of an element
11741
11742=item set_name       ($name)
11743
11744Set the name (=C<L<tag>>) of an element
11745
11746=item root
11747
11748Return the root of the twig in which the element is contained.
11749
11750=item twig
11751
11752Return the twig containing the element.
11753
11754=item parent        ($optional_condition)
11755
11756Return the parent of the element, or the first ancestor matching the
11757C<$optional_condition>
11758
11759=item first_child   ($optional_condition)
11760
11761Return the first child of the element, or the first child matching the
11762C<$optional_condition>
11763
11764=item has_child ($optional_condition)
11765
11766Return the first child of the element, or the first child matching the
11767C<$optional_condition> (same as L<first_child>)
11768
11769=item has_children ($optional_condition)
11770
11771Return the first child of the element, or the first child matching the
11772C<$optional_condition> (same as L<first_child>)
11773
11774
11775=item first_child_text   ($optional_condition)
11776
11777Return the text of the first child of the element, or the first child
11778 matching the C<$optional_condition>
11779If there is no first_child then returns ''. This avoids getting the
11780child, checking for its existence then getting the text for trivial cases.
11781
11782Similar methods are available for the other navigation methods:
11783
11784=over 4
11785
11786=item last_child_text
11787
11788=item prev_sibling_text
11789
11790=item next_sibling_text
11791
11792=item prev_elt_text
11793
11794=item next_elt_text
11795
11796=item child_text
11797
11798=item parent_text
11799
11800=back
11801
11802All this methods also exist in "trimmed" variant:
11803
11804=over 4
11805
11806=item first_child_trimmed_text
11807
11808=item last_child_trimmed_text
11809
11810=item prev_sibling_trimmed_text
11811
11812=item next_sibling_trimmed_text
11813
11814=item prev_elt_trimmed_text
11815
11816=item next_elt_trimmed_text
11817
11818=item child_trimmed_text
11819
11820=item parent_trimmed_text
11821
11822=back
11823
11824=item field         ($condition)
11825
11826Same method as C<first_child_text> with a different name
11827
11828=item fields         ($condition_list)
11829
11830Return the list of field (text of first child matching the conditions),
11831missing fields are returned as the empty string.
11832
11833Same method as C<first_child_text> with a different name
11834
11835=item trimmed_field         ($optional_condition)
11836
11837Same method as C<first_child_trimmed_text> with a different name
11838
11839=item set_field ($condition, $optional_atts, @list_of_elt_and_strings)
11840
11841Set the content of the first child of the element that matches
11842C<$condition>, the rest of the arguments is the same as for C<L<set_content>>
11843
11844If no child matches C<$condition> _and_ if C<$condition> is a valid
11845XML element name, then a new element by that name is created and
11846inserted as the last child.
11847
11848=item first_child_matches   ($optional_condition)
11849
11850Return the element if the first child of the element (if it exists) passes
11851the C<$optional_condition> C<undef> otherwise
11852
11853  if( $elt->first_child_matches( 'title')) ...
11854
11855is equivalent to
11856
11857  if( $elt->{first_child} && $elt->{first_child}->passes( 'title'))
11858
11859C<first_child_is> is an other name for this method
11860
11861Similar methods are available for the other navigation methods:
11862
11863=over 4
11864
11865=item last_child_matches
11866
11867=item prev_sibling_matches
11868
11869=item next_sibling_matches
11870
11871=item prev_elt_matches
11872
11873=item next_elt_matches
11874
11875=item child_matches
11876
11877=item parent_matches
11878
11879=back
11880
11881=item is_first_child ($optional_condition)
11882
11883returns true (the element) if the element is the first child of its parent
11884(optionally that satisfies the C<$optional_condition>)
11885
11886=item is_last_child ($optional_condition)
11887
11888returns true (the element) if the element is the last child of its parent
11889(optionally that satisfies the C<$optional_condition>)
11890
11891=item prev_sibling  ($optional_condition)
11892
11893Return the previous sibling of the element, or the previous sibling matching
11894C<$optional_condition>
11895
11896=item next_sibling  ($optional_condition)
11897
11898Return the next sibling of the element, or the first one matching
11899C<$optional_condition>.
11900
11901=item next_elt     ($optional_elt, $optional_condition)
11902
11903Return the next elt (optionally matching C<$optional_condition>) of the element. This
11904is defined as the next element which opens after the current element opens.
11905Which usually means the first child of the element.
11906Counter-intuitive as it might look this allows you to loop through the
11907whole document by starting from the root.
11908
11909The C<$optional_elt> is the root of a subtree. When the C<next_elt> is out of the
11910subtree then the method returns undef. You can then walk a sub-tree with:
11911
11912  my $elt= $subtree_root;
11913  while( $elt= $elt->next_elt( $subtree_root))
11914    { # insert processing code here
11915    }
11916
11917=item prev_elt     ($optional_condition)
11918
11919Return the previous elt (optionally matching C<$optional_condition>) of the
11920element. This is the first element which opens before the current one.
11921It is usually either the last descendant of the previous sibling or
11922simply the parent
11923
11924=item next_n_elt   ($offset, $optional_condition)
11925
11926Return the C<$offset>-th element that matches the C<$optional_condition>
11927
11928=item following_elt
11929
11930Return the following element (as per the XPath following axis)
11931
11932=item preceding_elt
11933
11934Return the preceding element (as per the XPath preceding axis)
11935
11936=item following_elts
11937
11938Return the list of following elements (as per the XPath following axis)
11939
11940=item preceding_elts
11941
11942Return the list of preceding elements (as per the XPath preceding axis)
11943
11944=item children     ($optional_condition)
11945
11946Return the list of children (optionally which matches C<$optional_condition>) of
11947the element. The list is in document order.
11948
11949=item children_count ($optional_condition)
11950
11951Return the number of children of the element (optionally which matches
11952C<$optional_condition>)
11953
11954=item children_text ($optional_condition)
11955
11956In array context, returns an array containing the text of children of the
11957element (optionally which matches C<$optional_condition>)
11958
11959In scalar context, returns the concatenation of the text of children of
11960the element
11961
11962=item children_trimmed_text ($optional_condition)
11963
11964In array context, returns an array containing the trimmed text of children
11965of the element (optionally which matches C<$optional_condition>)
11966
11967In scalar context, returns the concatenation of the trimmed text of children of
11968the element
11969
11970
11971=item children_copy ($optional_condition)
11972
11973Return a list of elements that are copies of the children of the element,
11974optionally which matches C<$optional_condition>
11975
11976=item descendants     ($optional_condition)
11977
11978Return the list of all descendants (optionally which matches
11979C<$optional_condition>) of the element. This is the equivalent of the
11980C<getElementsByTagName> of the DOM (by the way, if you are really a DOM
11981addict, you can use C<getElementsByTagName> instead)
11982
11983=item getElementsByTagName ($optional_condition)
11984
11985Same as C<L<descendants>>
11986
11987=item find_by_tag_name ($optional_condition)
11988
11989Same as C<L<descendants>>
11990
11991=item descendants_or_self ($optional_condition)
11992
11993Same as C<L<descendants>> except that the element itself is included in the list
11994if it matches the C<$optional_condition>
11995
11996=item first_descendant  ($optional_condition)
11997
11998Return the first descendant of the element that matches the condition
11999
12000=item last_descendant  ($optional_condition)
12001
12002Return the last descendant of the element that matches the condition
12003
12004=item ancestors    ($optional_condition)
12005
12006Return the list of ancestors (optionally matching C<$optional_condition>) of the
12007element.  The list is ordered from the innermost ancestor to the outermost one
12008
12009NOTE: the element itself is not part of the list, in order to include it
12010you will have to use ancestors_or_self
12011
12012=item ancestors_or_self     ($optional_condition)
12013
12014Return the list of ancestors (optionally matching C<$optional_condition>) of the
12015element, including the element (if it matches the condition>).
12016The list is ordered from the innermost ancestor to the outermost one
12017
12018=item passes ($condition)
12019
12020Return the element if it passes the C<$condition>
12021
12022=item att          ($att)
12023
12024Return the value of attribute C<$att> or C<undef>
12025
12026=item latt          ($att)
12027
12028Return the value of attribute C<$att> or C<undef>
12029
12030this method is an lvalue, so you can do C<< $elt->latt( 'foo')= 'bar' >> or C<< $elt->latt( 'foo')++; >>
12031
12032=item set_att      ($att, $att_value)
12033
12034Set the attribute of the element to the given value
12035
12036You can actually set several attributes this way:
12037
12038  $elt->set_att( att1 => "val1", att2 => "val2");
12039
12040=item del_att      ($att)
12041
12042Delete the attribute for the element
12043
12044You can actually delete several attributes at once:
12045
12046  $elt->del_att( 'att1', 'att2', 'att3');
12047
12048=item att_exists ($att)
12049
12050Returns true if the attribute C<$att> exists for the element, false
12051otherwise
12052
12053=item cut
12054
12055Cut the element from the tree. The element still exists, it can be copied
12056or pasted somewhere else, it is just not attached to the tree anymore.
12057
12058Note that the "old" links to the parent, previous and next siblings can
12059still be accessed using the former_* methods
12060
12061=item former_next_sibling
12062
12063Returns the former next sibling of a cut node (or undef if the node has not been cut)
12064
12065This makes it easier to write loops where you cut elements:
12066
12067    my $child= $parent->first_child( 'achild');
12068    while( $child->{'att'}->{'cut'})
12069      { $child->cut; $child= ($child->{former} && $child->{former}->{next_sibling}); }
12070
12071=item former_prev_sibling
12072
12073Returns the former previous sibling of a cut node (or undef if the node has not been cut)
12074
12075=item former_parent
12076
12077Returns the former parent of a cut node (or undef if the node has not been cut)
12078
12079=item cut_children ($optional_condition)
12080
12081Cut all the children of the element (or all of those which satisfy the
12082C<$optional_condition>).
12083
12084Return the list of children
12085
12086=item cut_descendants ($optional_condition)
12087
12088Cut all the descendants of the element (or all of those which satisfy the
12089C<$optional_condition>).
12090
12091Return the list of descendants
12092
12093=item copy        ($elt)
12094
12095Return a copy of the element. The copy is a "deep" copy: all sub-elements of
12096the element are duplicated.
12097
12098=item paste       ($optional_position, $ref)
12099
12100Paste a (previously C<cut> or newly generated) element. Die if the element
12101already belongs to a tree.
12102
12103Note that the calling element is pasted:
12104
12105  $child->paste( first_child => $existing_parent);
12106  $new_sibling->paste( after => $this_sibling_is_already_in_the_tree);
12107
12108or
12109
12110  my $new_elt= XML::Twig::Elt->new( tag => $content);
12111  $new_elt->paste( $position => $existing_elt);
12112
12113Example:
12114
12115  my $t= XML::Twig->new->parse( 'doc.xml')
12116  my $toc= $t->root->new( 'toc');
12117  $toc->paste( $t->root); # $toc is pasted as first child of the root
12118  foreach my $title ($t->findnodes( '/doc/section/title'))
12119    { my $title_toc= $title->copy;
12120      # paste $title_toc as the last child of toc
12121      $title_toc->paste( last_child => $toc)
12122    }
12123
12124Position options:
12125
12126=over 4
12127
12128=item first_child (default)
12129
12130The element is pasted as the first child of C<$ref>
12131
12132=item last_child
12133
12134The element is pasted as the last child of C<$ref>
12135
12136=item before
12137
12138The element is pasted before C<$ref>, as its previous sibling.
12139
12140=item after
12141
12142The element is pasted after C<$ref>, as its next sibling.
12143
12144=item within
12145
12146In this case an extra argument, C<$offset>, should be supplied. The element
12147will be pasted in the reference element (or in its first text child) at the
12148given offset. To achieve this the reference element will be split at the
12149offset.
12150
12151=back
12152
12153Note that you can call directly the underlying method:
12154
12155=over 4
12156
12157=item paste_before
12158
12159=item paste_after
12160
12161=item paste_first_child
12162
12163=item paste_last_child
12164
12165=item paste_within
12166
12167=back
12168
12169=item move       ($optional_position, $ref)
12170
12171Move an element in the tree.
12172This is just a C<cut> then a C<paste>.  The syntax is the same as C<paste>.
12173
12174=item replace       ($ref)
12175
12176Replaces an element in the tree. Sometimes it is just not possible toC<cut>
12177an element then C<paste> another in its place, so C<replace> comes in handy.
12178The calling element replaces C<$ref>.
12179
12180=item replace_with   (@elts)
12181
12182Replaces the calling element with one or more elements
12183
12184=item delete
12185
12186Cut the element and frees the memory.
12187
12188=item prefix       ($text, $optional_option)
12189
12190Add a prefix to an element. If the element is a C<PCDATA> element the text
12191is added to the pcdata, if the elements first child is a C<PCDATA> then the
12192text is added to it's pcdata, otherwise a new C<PCDATA> element is created
12193and pasted as the first child of the element.
12194
12195If the option is C<asis> then the prefix is added asis: it is created in
12196a separate C<PCDATA> element with an C<asis> property. You can then write:
12197
12198  $elt1->prefix( '<b>', 'asis');
12199
12200to create a C<< <b> >> in the output of C<print>.
12201
12202=item suffix       ($text, $optional_option)
12203
12204Add a suffix to an element. If the element is a C<PCDATA> element the text
12205is added to the pcdata, if the elements last child is a C<PCDATA> then the
12206text is added to it's pcdata, otherwise a new PCDATA element is created
12207and pasted as the last child of the element.
12208
12209If the option is C<asis> then the suffix is added asis: it is created in
12210a separate C<PCDATA> element with an C<asis> property. You can then write:
12211
12212  $elt2->suffix( '</b>', 'asis');
12213
12214=item trim
12215
12216Trim the element in-place: spaces at the beginning and at the end of the element
12217are discarded and multiple spaces within the element (or its descendants) are
12218replaced by a single space.
12219
12220Note that in some cases you can still end up with multiple spaces, if they are
12221split between several elements:
12222
12223  <doc>  text <b>  hah! </b>  yep</doc>
12224
12225gets trimmed to
12226
12227  <doc>text <b> hah! </b> yep</doc>
12228
12229This is somewhere in between a bug and a feature.
12230
12231=item normalize
12232
12233merge together all consecutive pcdata elements in the element (if for example
12234you have turned some elements into pcdata using C<L<erase>>, this will give you
12235a "clean" element in which there all text fragments are as long as possible).
12236
12237
12238=item simplify (%options)
12239
12240Return a data structure suspiciously similar to XML::Simple's. Options are
12241identical to XMLin options, see XML::Simple doc for more details (or use
12242DATA::dumper or YAML to dump the data structure)
12243
12244B<Note>: there is no magic here, if you write
12245C<< $twig->parsefile( $file )->simplify(); >> then it will load the entire
12246document in memory. I am afraid you will have to put some work into it to
12247get just the bits you want and discard the rest. Look at the synopsis or
12248the XML::Twig 101 section at the top of the docs for more information.
12249
12250=over 4
12251
12252=item content_key
12253
12254=item forcearray
12255
12256=item keyattr
12257
12258=item noattr
12259
12260=item normalize_space
12261
12262aka normalise_space
12263
12264=item variables (%var_hash)
12265
12266%var_hash is a hash { name => value }
12267
12268This option allows variables in the XML to be expanded when the file is read. (there is no facility for putting the variable names back if you regenerate XML using XMLout).
12269
12270A 'variable' is any text of the form ${name} (or $name) which occurs in an attribute value or in the text content of an element. If 'name' matches a key in the supplied hashref, ${name} will be replaced with the corresponding value from the hashref. If no matching key is found, the variable will not be replaced.
12271
12272=item var_att ($attribute_name)
12273
12274This option gives the name of an attribute that will be used to create
12275variables in the XML:
12276
12277  <dirs>
12278    <dir name="prefix">/usr/local</dir>
12279    <dir name="exec_prefix">$prefix/bin</dir>
12280  </dirs>
12281
12282use C<< var => 'name' >> to get $prefix replaced by /usr/local in the
12283generated data structure
12284
12285By default variables are captured by the following regexp: /$(\w+)/
12286
12287=item var_regexp (regexp)
12288
12289This option changes the regexp used to capture variables. The variable
12290name should be in $1
12291
12292=item group_tags { grouping tag => grouped tag, grouping tag 2 => grouped tag 2...}
12293
12294Option used to simplify the structure: elements listed will not be used.
12295Their children will be, they will be considered children of the element
12296parent.
12297
12298If the element is:
12299
12300  <config host="laptop.xmltwig.org">
12301    <server>localhost</server>
12302    <dirs>
12303      <dir name="base">/home/mrodrigu/standards</dir>
12304      <dir name="tools">$base/tools</dir>
12305    </dirs>
12306    <templates>
12307      <template name="std_def">std_def.templ</template>
12308      <template name="dummy">dummy</template>
12309    </templates>
12310  </config>
12311
12312Then calling simplify with C<< group_tags => { dirs => 'dir',
12313templates => 'template'} >>
12314makes the data structure be exactly as if the start and end tags for C<dirs> and
12315C<templates> were not there.
12316
12317A YAML dump of the structure
12318
12319  base: '/home/mrodrigu/standards'
12320  host: laptop.xmltwig.org
12321  server: localhost
12322  template:
12323    - std_def.templ
12324    - dummy.templ
12325  tools: '$base/tools'
12326
12327
12328=back
12329
12330=item split_at        ($offset)
12331
12332Split a text (C<PCDATA> or C<CDATA>) element in 2 at C<$offset>, the original
12333element now holds the first part of the string and a new element holds the
12334right part. The new element is returned
12335
12336If the element is not a text element then the first text child of the element
12337is split
12338
12339=item split        ( $optional_regexp, $tag1, $atts1, $tag2, $atts2...)
12340
12341Split the text descendants of an element in place, the text is split using
12342the C<$regexp>, if the regexp includes () then the matched separators will be
12343wrapped in elements.  C<$1> is wrapped in $tag1, with attributes C<$atts1> if
12344C<$atts1> is given (as a hashref), C<$2> is wrapped in $tag2...
12345
12346if $elt is C<< <p>tati tata <b>tutu tati titi</b> tata tati tata</p> >>
12347
12348  $elt->split( qr/(ta)ti/, 'foo', {type => 'toto'} )
12349
12350will change $elt to
12351
12352  <p><foo type="toto">ta</foo> tata <b>tutu <foo type="toto">ta</foo>
12353      titi</b> tata <foo type="toto">ta</foo> tata</p>
12354
12355The regexp can be passed either as a string or as C<qr//> (perl 5.005 and
12356later), it defaults to \s+ just as the C<split> built-in (but this would be
12357quite a useless behaviour without the C<$optional_tag> parameter)
12358
12359C<$optional_tag> defaults to PCDATA or CDATA, depending on the initial element
12360type
12361
12362The list of descendants is returned (including un-touched original elements
12363and newly created ones)
12364
12365=item mark        ( $regexp, $optional_tag, $optional_attribute_ref)
12366
12367This method behaves exactly as L<split>, except only the newly created
12368elements are returned
12369
12370=item wrap_children ( $regexp_string, $tag, $optional_attribute_hashref)
12371
12372Wrap the children of the element that match the regexp in an element C<$tag>.
12373If $optional_attribute_hashref is passed then the new element will
12374have these attributes.
12375
12376The $regexp_string includes tags, within pointy brackets, as in
12377C<< <title><para>+ >> and the usual Perl modifiers (+*?...).
12378Tags can be further qualified with attributes:
12379C<< <para type="warning" classif="cosmic_secret">+ >>. The values
12380for attributes should be xml-escaped: C<< <candy type="M&amp;Ms">* >>
12381(C<E<lt>>, C<&> B<C<E<gt>>> and C<"> should be escaped).
12382
12383Note that elements might get extra C<id> attributes in the process. See L<add_id>.
12384Use L<strip_att> to remove unwanted id's.
12385
12386Here is an example:
12387
12388If the element C<$elt> has the following content:
12389
12390  <elt>
12391   <p>para 1</p>
12392   <l_l1_1>list 1 item 1 para 1</l_l1_1>
12393     <l_l1>list 1 item 1 para 2</l_l1>
12394   <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
12395   <l_l1_n>list 1 item 3 para 1</l_l1_n>
12396     <l_l1>list 1 item 3 para 2</l_l1>
12397     <l_l1>list 1 item 3 para 3</l_l1>
12398   <l_l1_1>list 2 item 1 para 1</l_l1_1>
12399     <l_l1>list 2 item 1 para 2</l_l1>
12400   <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
12401   <l_l1_n>list 2 item 3 para 1</l_l1_n>
12402     <l_l1>list 2 item 3 para 2</l_l1>
12403     <l_l1>list 2 item 3 para 3</l_l1>
12404  </elt>
12405
12406Then the code
12407
12408  $elt->wrap_children( q{<l_l1_1><l_l1>*} , li => { type => "ul1" });
12409  $elt->wrap_children( q{<l_l1_n><l_l1>*} , li => { type => "ul" });
12410
12411  $elt->wrap_children( q{<li type="ul1"><li type="ul">+}, "ul");
12412  $elt->strip_att( 'id');
12413  $elt->strip_att( 'type');
12414  $elt->print;
12415
12416will output:
12417
12418  <elt>
12419     <p>para 1</p>
12420     <ul>
12421       <li>
12422         <l_l1_1>list 1 item 1 para 1</l_l1_1>
12423         <l_l1>list 1 item 1 para 2</l_l1>
12424       </li>
12425       <li>
12426         <l_l1_n>list 1 item 2 para 1 (only para)</l_l1_n>
12427       </li>
12428       <li>
12429         <l_l1_n>list 1 item 3 para 1</l_l1_n>
12430         <l_l1>list 1 item 3 para 2</l_l1>
12431         <l_l1>list 1 item 3 para 3</l_l1>
12432       </li>
12433     </ul>
12434     <ul>
12435       <li>
12436         <l_l1_1>list 2 item 1 para 1</l_l1_1>
12437         <l_l1>list 2 item 1 para 2</l_l1>
12438       </li>
12439       <li>
12440         <l_l1_n>list 2 item 2 para 1 (only para)</l_l1_n>
12441       </li>
12442       <li>
12443         <l_l1_n>list 2 item 3 para 1</l_l1_n>
12444         <l_l1>list 2 item 3 para 2</l_l1>
12445         <l_l1>list 2 item 3 para 3</l_l1>
12446       </li>
12447     </ul>
12448  </elt>
12449
12450=item subs_text ($regexp, $replace)
12451
12452subs_text does text substitution, similar to perl's C< s///> operator.
12453
12454C<$regexp> must be a perl regexp, created with the C<qr> operator.
12455
12456C<$replace> can include C<$1, $2>... from the C<$regexp>. It can also be
12457used to create element and entities, by using
12458C<< &elt( tag => { att => val }, text) >> (similar syntax as C<L<new>>) and
12459C<< &ent( name) >>.
12460
12461Here is a rather complex example:
12462
12463  $elt->subs_text( qr{(?<!do not )link to (http://([^\s,]*))},
12464                   'see &elt( a =>{ href => $1 }, $2)'
12465                 );
12466
12467This will replace text like I<link to http://www.xmltwig.org> by
12468I<< see <a href="www.xmltwig.org">www.xmltwig.org</a> >>, but not
12469I<do not link to...>
12470
12471Generating entities (here replacing spaces with &nbsp;):
12472
12473  $elt->subs_text( qr{ }, '&ent( "&nbsp;")');
12474
12475or, using a variable:
12476
12477  my $ent="&nbsp;";
12478  $elt->subs_text( qr{ }, "&ent( '$ent')");
12479
12480Note that the substitution is always global, as in using the C<g> modifier
12481in a perl substitution, and that it is performed on all text descendants
12482of the element.
12483
12484B<Bug>: in the C<$regexp>, you can only use C<\1>, C<\2>... if the replacement
12485expression does not include elements or attributes. eg
12486
12487  $t->subs_text( qr/((t[aiou])\2)/, '$2');             # ok, replaces toto, tata, titi, tutu by to, ta, ti, tu
12488  $t->subs_text( qr/((t[aiou])\2)/, '&elt(p => $1)' ); # NOK, does not find toto...
12489
12490=item add_id ($optional_coderef)
12491
12492Add an id to the element.
12493
12494The id is an attribute, C<id> by default, see the C<id> option for XML::Twig
12495C<new> to change it. Use an id starting with C<#> to get an id that's not
12496output by L<print>, L<flush> or L<sprint>, yet that allows you to use the
12497L<elt_id> method to get the element easily.
12498
12499If the element already has an id, no new id is generated.
12500
12501By default the method create an id of the form C<< twig_id_<nnnn> >>,
12502where C<< <nnnn> >> is a number, incremented each time the method is called
12503successfully.
12504
12505=item set_id_seed ($prefix)
12506
12507by default the id generated by C<L<add_id>> is C<< twig_id_<nnnn> >>,
12508C<set_id_seed> changes the prefix to C<$prefix> and resets the number
12509to 1
12510
12511=item strip_att ($att)
12512
12513Remove the attribute C<$att> from all descendants of the element (including
12514the element)
12515
12516Return the element
12517
12518=item change_att_name ($old_name, $new_name)
12519
12520Change the name of the attribute from C<$old_name> to C<$new_name>. If there is no
12521attribute C<$old_name> nothing happens.
12522
12523=item lc_attnames
12524
12525Lower cases the name all the attributes of the element.
12526
12527=item sort_children_on_value( %options)
12528
12529Sort the children of the element in place according to their text.
12530All children are sorted.
12531
12532Return the element, with its children sorted.
12533
12534
12535C<%options> are
12536
12537  type  : numeric |  alpha     (default: alpha)
12538  order : normal  |  reverse   (default: normal)
12539
12540Return the element, with its children sorted
12541
12542
12543=item sort_children_on_att ($att, %options)
12544
12545Sort the children of the  element in place according to attribute C<$att>.
12546C<%options> are the same as for C<sort_children_on_value>
12547
12548Return the element.
12549
12550
12551=item sort_children_on_field ($tag, %options)
12552
12553Sort the children of the element in place, according to the field C<$tag> (the
12554text of the first child of the child with this tag). C<%options> are the same
12555as for C<sort_children_on_value>.
12556
12557Return the element, with its children sorted
12558
12559
12560=item sort_children( $get_key, %options)
12561
12562Sort the children of the element in place. The C<$get_key> argument is
12563a reference to a function that returns the sort key when passed an element.
12564
12565For example:
12566
12567  $elt->sort_children( sub { $_[0]->{'att'}->{"nb"} + $_[0]->text },
12568                       type => 'numeric', order => 'reverse'
12569                     );
12570
12571=item field_to_att ($cond, $att)
12572
12573Turn the text of the first sub-element matched by C<$cond> into the value of
12574attribute C<$att> of the element. If C<$att> is omitted then C<$cond> is used
12575as the name of the attribute, which makes sense only if C<$cond> is a valid
12576element (and attribute) name.
12577
12578The sub-element is then cut.
12579
12580=item att_to_field ($att, $tag)
12581
12582Take the value of attribute C<$att> and create a sub-element C<$tag> as first
12583child of the element. If C<$tag> is omitted then C<$att> is used as the name of
12584the sub-element.
12585
12586
12587=item get_xpath  ($xpath, $optional_offset)
12588
12589Return a list of elements satisfying the C<$xpath>. C<$xpath> is an XPATH-like
12590expression.
12591
12592A subset of the XPATH abbreviated syntax is covered:
12593
12594  tag
12595  tag[1] (or any other positive number)
12596  tag[last()]
12597  tag[@att] (the attribute exists for the element)
12598  tag[@att="val"]
12599  tag[@att=~ /regexp/]
12600  tag[att1="val1" and att2="val2"]
12601  tag[att1="val1" or att2="val2"]
12602  tag[string()="toto"] (returns tag elements which text (as per the text method)
12603                       is toto)
12604  tag[string()=~/regexp/] (returns tag elements which text (as per the text
12605                          method) matches regexp)
12606  expressions can start with / (search starts at the document root)
12607  expressions can start with . (search starts at the current element)
12608  // can be used to get all descendants instead of just direct children
12609  * matches any tag
12610
12611So the following examples from the
12612F<XPath recommendationL<http://www.w3.org/TR/xpath.html#path-abbrev>> work:
12613
12614  para selects the para element children of the context node
12615  * selects all element children of the context node
12616  para[1] selects the first para child of the context node
12617  para[last()] selects the last para child of the context node
12618  */para selects all para grandchildren of the context node
12619  /doc/chapter[5]/section[2] selects the second section of the fifth chapter
12620     of the doc
12621  chapter//para selects the para element descendants of the chapter element
12622     children of the context node
12623  //para selects all the para descendants of the document root and thus selects
12624     all para elements in the same document as the context node
12625  //olist/item selects all the item elements in the same document as the
12626     context node that have an olist parent
12627  .//para selects the para element descendants of the context node
12628  .. selects the parent of the context node
12629  para[@type="warning"] selects all para children of the context node that have
12630     a type attribute with value warning
12631  employee[@secretary and @assistant] selects all the employee children of the
12632     context node that have both a secretary attribute and an assistant
12633     attribute
12634
12635
12636The elements will be returned in the document order.
12637
12638If C<$optional_offset> is used then only one element will be returned, the one
12639with the appropriate offset in the list, starting at 0
12640
12641Quoting and interpolating variables can be a pain when the Perl syntax and the
12642XPATH syntax collide, so use alternate quoting mechanisms like q or qq
12643(I like q{} and qq{} myself).
12644
12645Here are some more examples to get you started:
12646
12647  my $p1= "p1";
12648  my $p2= "p2";
12649  my @res= $t->get_xpath( qq{p[string( "$p1") or string( "$p2")]});
12650
12651  my $a= "a1";
12652  my @res= $t->get_xpath( qq{//*[@att="$a"]});
12653
12654  my $val= "a1";
12655  my $exp= qq{//p[ \@att='$val']}; # you need to use \@ or you will get a warning
12656  my @res= $t->get_xpath( $exp);
12657
12658Note that the only supported regexps delimiters are / and that you must
12659backslash all / in regexps AND in regular strings.
12660
12661XML::Twig does not provide natively full XPATH support, but you can use
12662C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
12663XPath engine, with full coverage of the spec.
12664
12665C<L<XML::Twig::XPath>> to get C<findnodes> to use C<XML::XPath> as the
12666XPath engine, with full coverage of the spec.
12667
12668=item find_nodes
12669
12670same asC<get_xpath>
12671
12672=item findnodes
12673
12674same as C<get_xpath>
12675
12676
12677=item text @optional_options
12678
12679Return a string consisting of all the C<PCDATA> and C<CDATA> in an element,
12680without any tags. The text is not XML-escaped: base entities such as C<&>
12681and C<< < >> are not escaped.
12682
12683The 'C<no_recurse>' option will only return the text of the element, not
12684of any included sub-elements (same as C<L<text_only>>).
12685
12686=item text_only
12687
12688Same as C<L<text>> except that the text returned doesn't include
12689the text of sub-elements.
12690
12691=item trimmed_text
12692
12693Same as C<text> except that the text is trimmed: leading and trailing spaces
12694are discarded, consecutive spaces are collapsed
12695
12696=item set_text        ($string)
12697
12698Set the text for the element: if the element is a C<PCDATA>, just set its
12699text, otherwise cut all the children of the element and create a single
12700C<PCDATA> child for it, which holds the text.
12701
12702=item merge ($elt2)
12703
12704Move the content of C<$elt2> within the element
12705
12706=item insert         ($tag1, [$optional_atts1], $tag2, [$optional_atts2],...)
12707
12708For each tag in the list inserts an element C<$tag> as the only child of the
12709element.  The element gets the optional attributes inC<< $optional_atts<n>. >>
12710All children of the element are set as children of the new element.
12711The upper level element is returned.
12712
12713  $p->insert( table => { border=> 1}, 'tr', 'td')
12714
12715put C<$p> in a table with a visible border, a single C<tr> and a single C<td>
12716and return the C<table> element:
12717
12718  <p><table border="1"><tr><td>original content of p</td></tr></table></p>
12719
12720=item wrap_in        (@tag)
12721
12722Wrap elements in C<@tag> as the successive ancestors of the element, returns the
12723new element.
12724C<< $elt->wrap_in( 'td', 'tr', 'table') >> wraps the element as a single cell in a
12725table for example.
12726
12727Optionally each tag can be followed by a hashref of attributes, that will be
12728set on the wrapping element:
12729
12730  $elt->wrap_in( p => { class => "advisory" }, div => { class => "intro", id => "div_intro" });
12731
12732=item insert_new_elt ($opt_position, $tag, $opt_atts_hashref, @opt_content)
12733
12734Combines a C<L<new> > and a C<L<paste> >: creates a new element using
12735C<$tag>, C<$opt_atts_hashref >and C<@opt_content> which are arguments similar
12736to those for C<new>, then paste it, using C<$opt_position> or C<'first_child'>,
12737relative to C<$elt>.
12738
12739Return the newly created element
12740
12741=item erase
12742
12743Erase the element: the element is deleted and all of its children are
12744pasted in its place.
12745
12746=item set_content    ( $optional_atts, @list_of_elt_and_strings)
12747                     ( $optional_atts, '#EMPTY')
12748
12749Set the content for the element, from a list of strings and
12750elements.  Cuts all the element children, then pastes the list
12751elements as the children.  This method will create a C<PCDATA> element
12752for any strings in the list.
12753
12754The C<$optional_atts> argument is the ref of a hash of attributes. If this
12755argument is used then the previous attributes are deleted, otherwise they
12756are left untouched.
12757
12758B<WARNING>: if you rely on ID's then you will have to set the id yourself. At
12759this point the element does not belong to a twig yet, so the ID attribute
12760is not known so it won't be stored in the ID list.
12761
12762A content of 'C<#EMPTY>' creates an empty element;
12763
12764=item namespace ($optional_prefix)
12765
12766Return the URI of the namespace that C<$optional_prefix> or the element name
12767belongs to. If the name doesn't belong to any namespace, C<undef> is returned.
12768
12769=item local_name
12770
12771Return the local name (without the prefix) for the element
12772
12773=item ns_prefix
12774
12775Return the namespace prefix for the element
12776
12777=item current_ns_prefixes
12778
12779Return a list of namespace prefixes valid for the element. The order of the
12780prefixes in the list has no meaning. If the default namespace is currently
12781bound, '' appears in the list.
12782
12783
12784=item inherit_att  ($att, @optional_tag_list)
12785
12786Return the value of an attribute inherited from parent tags. The value
12787returned is found by looking for the attribute in the element then in turn
12788in each of its ancestors. If the C<@optional_tag_list> is supplied only those
12789ancestors whose tag is in the list will be checked.
12790
12791=item all_children_are ($optional_condition)
12792
12793return 1 if all children of the element pass the C<$optional_condition>,
127940 otherwise
12795
12796=item level       ($optional_condition)
12797
12798Return the depth of the element in the twig (root is 0).
12799If C<$optional_condition> is given then only ancestors that match the condition are
12800counted.
12801
12802B<WARNING>: in a tree created using the C<twig_roots> option this will not return
12803the level in the document tree, level 0 will be the document root, level 1
12804will be the C<twig_roots> elements. During the parsing (in a C<twig_handler>)
12805you can use the C<depth> method on the twig object to get the real parsing depth.
12806
12807=item in           ($potential_parent)
12808
12809Return true if the element is in the potential_parent (C<$potential_parent> is
12810an element)
12811
12812=item in_context   ($cond, $optional_level)
12813
12814Return true if the element is included in an element which passes C<$cond>
12815optionally within C<$optional_level> levels. The returned value is the
12816including element.
12817
12818=item pcdata
12819
12820Return the text of a C<PCDATA> element or C<undef> if the element is not
12821C<PCDATA>.
12822
12823=item pcdata_xml_string
12824
12825Return the text of a C<PCDATA> element or undef if the element is not C<PCDATA>.
12826The text is "XML-escaped" ('&' and '<' are replaced by '&amp;' and '&lt;')
12827
12828=item set_pcdata     ($text)
12829
12830Set the text of a C<PCDATA> element. This method does not check that the element is
12831indeed a C<PCDATA> so usually you should use C<L<set_text>> instead.
12832
12833=item append_pcdata  ($text)
12834
12835Add the text at the end of a C<PCDATA> element.
12836
12837=item is_cdata
12838
12839Return 1 if the element is a C<CDATA> element, returns 0 otherwise.
12840
12841=item is_text
12842
12843Return 1 if the element is a C<CDATA> or C<PCDATA> element, returns 0 otherwise.
12844
12845=item cdata
12846
12847Return the text of a C<CDATA> element or C<undef> if the element is not
12848C<CDATA>.
12849
12850=item cdata_string
12851
12852Return the XML string of a C<CDATA> element, including the opening and
12853closing markers.
12854
12855=item set_cdata     ($text)
12856
12857Set the text of a C<CDATA> element.
12858
12859=item append_cdata  ($text)
12860
12861Add the text at the end of a C<CDATA> element.
12862
12863=item remove_cdata
12864
12865Turns all C<CDATA> sections in the element into regular C<PCDATA> elements. This is useful
12866when converting XML to HTML, as browsers do not support CDATA sections.
12867
12868=item extra_data
12869
12870Return the extra_data (comments and PI's) attached to an element
12871
12872=item set_extra_data     ($extra_data)
12873
12874Set the extra_data (comments and PI's) attached to an element
12875
12876=item append_extra_data  ($extra_data)
12877
12878Append extra_data to the existing extra_data before the element (if no
12879previous extra_data exists then it is created)
12880
12881=item set_asis
12882
12883Set a property of the element that causes it to be output without being XML
12884escaped by the print functions: if it contains C<< a < b >> it will be output
12885as such and not as C<< a &lt; b >>. This can be useful to create text elements
12886that will be output as markup. Note that all C<PCDATA> descendants of the
12887element are also marked as having the property (they are the ones that are
12888actually impacted by the change).
12889
12890If the element is a C<CDATA> element it will also be output asis, without the
12891C<CDATA> markers. The same goes for any C<CDATA> descendant of the element
12892
12893=item set_not_asis
12894
12895Unsets the C<asis> property for the element and its text descendants.
12896
12897=item is_asis
12898
12899Return the C<asis> property status of the element ( 1 or C<undef>)
12900
12901=item closed
12902
12903Return true if the element has been closed. Might be useful if you are
12904somewhere in the tree, during the parse, and have no idea whether a parent
12905element is completely loaded or not.
12906
12907=item get_type
12908
12909Return the type of the element: 'C<#ELT>' for "real" elements, or 'C<#PCDATA>',
12910'C<#CDATA>', 'C<#COMMENT>', 'C<#ENT>', 'C<#PI>'
12911
12912=item is_elt
12913
12914Return the tag if the element is a "real" element, or 0 if it is C<PCDATA>,
12915C<CDATA>...
12916
12917=item contains_only_text
12918
12919Return 1 if the element does not contain any other "real" element
12920
12921=item contains_only ($exp)
12922
12923Return the list of children if all children of the element match
12924the expression C<$exp>
12925
12926  if( $para->contains_only( 'tt')) { ... }
12927
12928=item contains_a_single ($exp)
12929
12930If the element contains a single child that matches the expression C<$exp>
12931returns that element. Otherwise returns 0.
12932
12933=item is_field
12934
12935same as C<contains_only_text>
12936
12937=item is_pcdata
12938
12939Return 1 if the element is a C<PCDATA> element, returns 0 otherwise.
12940
12941=item is_ent
12942
12943Return 1 if the element is an entity (an unexpanded entity) element,
12944return 0 otherwise.
12945
12946=item is_empty
12947
12948Return 1 if the element is empty, 0 otherwise
12949
12950=item set_empty
12951
12952Flags the element as empty. No further check is made, so if the element
12953is actually not empty the output will be messed. The only effect of this
12954method is that the output will be C<< <tag att="value""/> >>.
12955
12956=item set_not_empty
12957
12958Flags the element as not empty. if it is actually empty then the element will
12959be output as C<< <tag att="value""></tag> >>
12960
12961=item is_pi
12962
12963Return 1 if the element is a processing instruction (C<#PI>) element,
12964return 0 otherwise.
12965
12966=item target
12967
12968Return the target of a processing instruction
12969
12970=item set_target ($target)
12971
12972Set the target of a processing instruction
12973
12974=item data
12975
12976Return the data part of a processing instruction
12977
12978=item set_data ($data)
12979
12980Set the data of a processing instruction
12981
12982=item set_pi ($target, $data)
12983
12984Set the target and data of a processing instruction
12985
12986=item pi_string
12987
12988Return the string form of a processing instruction
12989(C<< <?target data?> >>)
12990
12991=item is_comment
12992
12993Return 1 if the element is a comment (C<#COMMENT>) element,
12994return 0 otherwise.
12995
12996=item set_comment ($comment_text)
12997
12998Set the text for a comment
12999
13000=item comment
13001
13002Return the content of a comment (just the text, not the C<< <!-- >>
13003and C<< --> >>)
13004
13005=item comment_string
13006
13007Return the XML string for a comment (C<< <!-- comment --> >>)
13008
13009Note that an XML comment cannot start or end with a '-', or include '--'
13010(http://www.w3.org/TR/2008/REC-xml-20081126/#sec-comments),
13011if that is the case (because you have created the comment yourself presumably,
13012as it could not be in the input XML), then a space will be inserted before
13013an initial '-', after a trailing one or between two '-' in the comment
13014(which could presumably mangle javascript "hidden" in an XHTML comment);
13015
13016=item set_ent ($entity)
13017
13018Set an (non-expanded) entity (C<#ENT>). C<$entity>) is the entity
13019text (C<&ent;>)
13020
13021=item ent
13022
13023Return the entity for an entity (C<#ENT>) element (C<&ent;>)
13024
13025=item ent_name
13026
13027Return the entity name for an entity (C<#ENT>) element (C<ent>)
13028
13029=item ent_string
13030
13031Return the entity, either expanded if the expanded version is available,
13032or non-expanded (C<&ent;>) otherwise
13033
13034=item child ($offset, $optional_condition)
13035
13036Return the C<$offset>-th child of the element, optionally the C<$offset>-th
13037child that matches C<$optional_condition>. The children are treated as a list, so
13038C<< $elt->child( 0) >> is the first child, while C<< $elt->child( -1) >> is
13039the last child.
13040
13041=item child_text ($offset, $optional_condition)
13042
13043Return the text of a child or C<undef> if the sibling does not exist. Arguments
13044are the same as child.
13045
13046=item last_child    ($optional_condition)
13047
13048Return the last child of the element, or the last child matching
13049C<$optional_condition> (ie the last of the element children matching
13050the condition).
13051
13052=item last_child_text   ($optional_condition)
13053
13054Same as C<first_child_text> but for the last child.
13055
13056=item sibling  ($offset, $optional_condition)
13057
13058Return the next or previous C<$offset>-th sibling of the element, or the
13059C<$offset>-th one matching C<$optional_condition>. If C<$offset> is negative then a
13060previous sibling is returned, if $offset is positive then  a next sibling is
13061returned. C<$offset=0> returns the element if there is no condition or
13062if the element matches the condition>, C<undef> otherwise.
13063
13064=item sibling_text ($offset, $optional_condition)
13065
13066Return the text of a sibling or C<undef> if the sibling does not exist.
13067Arguments are the same as C<sibling>.
13068
13069=item prev_siblings ($optional_condition)
13070
13071Return the list of previous siblings (optionally matching C<$optional_condition>)
13072for the element. The elements are ordered in document order.
13073
13074=item next_siblings ($optional_condition)
13075
13076Return the list of siblings (optionally matching C<$optional_condition>)
13077following the element. The elements are ordered in document order.
13078
13079=item siblings ($optional_condition)
13080
13081Return the list of siblings (optionally matching C<$optional_condition>)
13082of the element (excluding the element itself). The elements are ordered
13083in document order.
13084
13085=item pos ($optional_condition)
13086
13087Return the position of the element in the children list. The first child has a
13088position of 1 (as in XPath).
13089
13090If the C<$optional_condition> is given then only siblings that match the condition
13091are counted. If the element itself does not match the  condition then
130920 is returned.
13093
13094=item atts
13095
13096Return a hash ref containing the element attributes
13097
13098=item set_atts      ({ att1=>$att1_val, att2=> $att2_val... })
13099
13100Set the element attributes with the hash ref supplied as the argument. The previous
13101attributes are lost (ie the attributes set by C<set_atts> replace all of the
13102attributes of the element).
13103
13104You can also pass a list instead of a hashref: C<< $elt->set_atts( att1 => 'val1',...) >>
13105
13106=item del_atts
13107
13108Deletes all the element attributes.
13109
13110=item att_nb
13111
13112Return the number of attributes for the element
13113
13114=item has_atts
13115
13116Return true if the element has attributes (in fact return the number of
13117attributes, thus being an alias to C<L<att_nb>>
13118
13119=item has_no_atts
13120
13121Return true if the element has no attributes, false (0) otherwise
13122
13123=item att_names
13124
13125return a list of the attribute names for the element
13126
13127=item att_xml_string ($att, $options)
13128
13129Return the attribute value, where '&', '<' and quote (" or the value of the quote option
13130at twig creation) are XML-escaped.
13131
13132The options are passed as a hashref, setting C<escape_gt> to a true value will also escape
13133'>' ($elt( 'myatt', { escape_gt => 1 });
13134
13135=item set_id       ($id)
13136
13137Set the C<id> attribute of the element to the value.
13138See C<L<elt_id> > to change the id attribute name
13139
13140=item id
13141
13142Gets the id attribute value
13143
13144=item del_id       ($id)
13145
13146Deletes the C<id> attribute of the element and remove it from the id list
13147for the document
13148
13149=item class
13150
13151Return the C<class> attribute for the element (methods on the C<class>
13152attribute are quite convenient when dealing with XHTML, or plain XML that
13153will eventually be displayed using CSS)
13154
13155=item lclass
13156
13157same as class, except that
13158this method is an lvalue, so you can do C<< $elt->lclass= "foo" >>
13159
13160=item set_class ($class)
13161
13162Set the C<class> attribute for the element to C<$class>
13163
13164=item add_class ($class)
13165
13166Add C<$class> to the element C<class> attribute: the new class is added
13167only if it is not already present.
13168
13169Note that classes are then sorted alphabetically, so the C<class> attribute
13170can be changed even if the class is already there
13171
13172=item remove_class ($class)
13173
13174Remove C<$class> from the element C<class> attribute.
13175
13176Note that classes are then sorted alphabetically, so the C<class> attribute can be
13177changed even if the class is already there
13178
13179
13180=item add_to_class ($class)
13181
13182alias for add_class
13183
13184=item att_to_class ($att)
13185
13186Set the C<class> attribute to the value of attribute C<$att>
13187
13188=item add_att_to_class ($att)
13189
13190Add the value of attribute C<$att> to the C<class> attribute of the element
13191
13192=item move_att_to_class ($att)
13193
13194Add the value of attribute C<$att> to the C<class> attribute of the element
13195and delete the attribute
13196
13197=item tag_to_class
13198
13199Set the C<class> attribute of the element to the element tag
13200
13201=item add_tag_to_class
13202
13203Add the element tag to its C<class> attribute
13204
13205=item set_tag_class ($new_tag)
13206
13207Add the element tag to its C<class> attribute and sets the tag to C<$new_tag>
13208
13209=item in_class ($class)
13210
13211Return true (C<1>) if the element is in the class C<$class> (if C<$class> is
13212one of the tokens in the element C<class> attribute)
13213
13214=item tag_to_span
13215
13216Change the element tag tp C<span> and set its class to the old tag
13217
13218=item tag_to_div
13219
13220Change the element tag tp C<div> and set its class to the old tag
13221
13222=item DESTROY
13223
13224Frees the element from memory.
13225
13226=item start_tag
13227
13228Return the string for the start tag for the element, including
13229the C<< /> >> at the end of an empty element tag
13230
13231=item end_tag
13232
13233Return the string for the end tag of an element.  For an empty
13234element, this returns the empty string ('').
13235
13236=item xml_string @optional_options
13237
13238Equivalent to C<< $elt->sprint( 1) >>, returns the string for the entire
13239element, excluding the element's tags (but nested element tags are present)
13240
13241The 'C<no_recurse>' option will only return the text of the element, not
13242of any included sub-elements (same as C<L<xml_text_only>>).
13243
13244=item inner_xml
13245
13246Another synonym for xml_string
13247
13248=item outer_xml
13249
13250An other synonym for sprint
13251
13252=item xml_text
13253
13254Return the text of the element, encoded (and processed by the current
13255C<L<output_filter>> or C<L<output_encoding>> options, without any tag.
13256
13257=item xml_text_only
13258
13259Same as C<L<xml_text>> except that the text returned doesn't include
13260the text of sub-elements.
13261
13262=item set_pretty_print ($style)
13263
13264Set the pretty print method, amongst 'C<none>' (default), 'C<nsgmls>',
13265'C<nice>', 'C<indented>', 'C<record>' and 'C<record_c>'
13266
13267pretty_print styles:
13268
13269=over 4
13270
13271=item none
13272
13273the default, no C<\n> is used
13274
13275=item nsgmls
13276
13277nsgmls style, with C<\n> added within tags
13278
13279=item nice
13280
13281adds C<\n> wherever possible (NOT SAFE, can lead to invalid XML)
13282
13283=item indented
13284
13285same as C<nice> plus indents elements (NOT SAFE, can lead to invalid XML)
13286
13287=item record
13288
13289table-oriented pretty print, one field per line
13290
13291=item record_c
13292
13293table-oriented pretty print, more compact than C<record>, one record per line
13294
13295=back
13296
13297=item set_empty_tag_style ($style)
13298
13299Set the method to output empty tags, amongst 'C<normal>' (default), 'C<html>',
13300and 'C<expand>',
13301
13302C<normal> outputs an empty tag 'C<< <tag/> >>', C<html> adds a space
13303'C<< <tag /> >>' for elements that can be empty in XHTML and C<expand> outputs
13304'C<< <tag></tag> >>'
13305
13306=item set_remove_cdata  ($flag)
13307
13308set (or unset) the flag that forces the twig to output CDATA sections as
13309regular (escaped) PCDATA
13310
13311
13312=item set_indent ($string)
13313
13314Set the indentation for the indented pretty print style (default is 2 spaces)
13315
13316=item set_quote ($quote)
13317
13318Set the quotes used for attributes. can be 'C<double>' (default) or 'C<single>'
13319
13320=item cmp       ($elt)
13321
13322  Compare the order of the 2 elements in a twig.
13323
13324  C<$a> is the <A>..</A> element, C<$b> is the <B>...</B> element
13325
13326  document                        $a->cmp( $b)
13327  <A> ... </A> ... <B>  ... </B>     -1
13328  <A> ... <B>  ... </B> ... </A>     -1
13329  <B> ... </B> ... <A>  ... </A>      1
13330  <B> ... <A>  ... </A> ... </B>      1
13331   $a == $b                           0
13332   $a and $b not in the same tree   undef
13333
13334=item before       ($elt)
13335
13336Return 1 if C<$elt> starts before the element, 0 otherwise. If the 2 elements
13337are not in the same twig then return C<undef>.
13338
13339    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
13340
13341=item after       ($elt)
13342
13343Return 1 if $elt starts after the element, 0 otherwise. If the 2 elements
13344are not in the same twig then return C<undef>.
13345
13346    if( $a->cmp( $b) == -1) { return 1; } else { return 0; }
13347
13348=item other comparison methods
13349
13350=over 4
13351
13352=item lt
13353
13354=item le
13355
13356=item gt
13357
13358=item ge
13359
13360=back
13361
13362=item path
13363
13364Return the element context in a form similar to XPath's short
13365form: 'C</root/tag1/../tag>'
13366
13367=item xpath
13368
13369Return a unique XPath expression that can be used to find the element
13370again.
13371
13372It looks like C</doc/sect[3]/title>: unique elements do not have an index,
13373the others do.
13374
13375=item flush
13376
13377flushes the twig up to the current element (strictly equivalent to
13378C<< $elt->root->flush >>)
13379
13380=item private methods
13381
13382Low-level methods on the twig:
13383
13384=over 4
13385
13386=item set_parent        ($parent)
13387
13388=item set_first_child   ($first_child)
13389
13390=item set_last_child    ($last_child)
13391
13392=item set_prev_sibling  ($prev_sibling)
13393
13394=item set_next_sibling  ($next_sibling)
13395
13396=item set_twig_current
13397
13398=item del_twig_current
13399
13400=item twig_current
13401
13402=item contains_text
13403
13404=back
13405
13406Those methods should not be used, unless of course you find some creative
13407and interesting, not to mention useful, ways to do it.
13408
13409=back
13410
13411=head2 cond
13412
13413Most of the navigation functions accept a condition as an optional argument
13414The first element (or all elements for C<L<children> > or
13415C<L<ancestors> >) that passes the condition is returned.
13416
13417The condition is a single step of an XPath expression using the XPath subset
13418defined by C<L<get_xpath>>. Additional conditions are:
13419
13420The condition can be
13421
13422=over 4
13423
13424=item #ELT
13425
13426return a "real" element (not a PCDATA, CDATA, comment or pi element)
13427
13428=item #TEXT
13429
13430return a PCDATA or CDATA element
13431
13432=item regular expression
13433
13434return an element whose tag matches the regexp. The regexp has to be created
13435with C<qr//> (hence this is available only on perl 5.005 and above)
13436
13437=item code reference
13438
13439applies the code, passing the current element as argument, if the code returns
13440true then the element is returned, if it returns false then the code is applied
13441to the next candidate.
13442
13443=back
13444
13445=head2 XML::Twig::XPath
13446
13447XML::Twig implements a subset of XPath through the C<L<get_xpath>> method.
13448
13449If you want to use the whole XPath power, then you can use C<XML::Twig::XPath>
13450instead. In this case C<XML::Twig> uses C<XML::XPath> to execute XPath queries.
13451You will of course need C<XML::XPath> installed to be able to use C<XML::Twig::XPath>.
13452
13453See L<XML::XPath> for more information.
13454
13455The methods you can use are:
13456
13457=over 4
13458
13459=item findnodes              ($path)
13460
13461return a list of nodes found by C<$path>.
13462
13463=item findnodes_as_string    ($path)
13464
13465return the nodes found reproduced as XML. The result is not guaranteed
13466to be valid XML though.
13467
13468=item findvalue              ($path)
13469
13470return the concatenation of the text content of the result nodes
13471
13472=back
13473
13474In order for C<XML::XPath> to be used as the XPath engine the following methods
13475are included in C<XML::Twig>:
13476
13477in XML::Twig
13478
13479=over 4
13480
13481=item getRootNode
13482
13483=item getParentNode
13484
13485=item getChildNodes
13486
13487=back
13488
13489in XML::Twig::Elt
13490
13491=over 4
13492
13493=item string_value
13494
13495=item toString
13496
13497=item getName
13498
13499=item getRootNode
13500
13501=item getNextSibling
13502
13503=item getPreviousSibling
13504
13505=item isElementNode
13506
13507=item isTextNode
13508
13509=item isPI
13510
13511=item isPINode
13512
13513=item isProcessingInstructionNode
13514
13515=item isComment
13516
13517=item isCommentNode
13518
13519=item getTarget
13520
13521=item getChildNodes
13522
13523=item getElementById
13524
13525=back
13526
13527=head2 XML::Twig::XPath::Elt
13528
13529The methods you can use are the same as on C<XML::Twig::XPath> elements:
13530
13531=over 4
13532
13533=item findnodes              ($path)
13534
13535return a list of nodes found by C<$path>.
13536
13537=item findnodes_as_string    ($path)
13538
13539return the nodes found reproduced as XML. The result is not guaranteed
13540to be valid XML though.
13541
13542=item findvalue              ($path)
13543
13544return the concatenation of the text content of the result nodes
13545
13546=back
13547
13548
13549=head2 XML::Twig::Entity_list
13550
13551=over 4
13552
13553=item new
13554
13555Create an entity list.
13556
13557=item add         ($ent)
13558
13559Add an entity to an entity list.
13560
13561=item add_new_ent ($name, $val, $sysid, $pubid, $ndata, $param)
13562
13563Create a new entity and add it to the entity list
13564
13565=item delete     ($ent or $tag).
13566
13567Delete an entity (defined by its name or by the Entity object)
13568from the list.
13569
13570=item print      ($optional_filehandle)
13571
13572Print the entity list.
13573
13574=item list
13575
13576Return the list as an array
13577
13578=back
13579
13580
13581=head2 XML::Twig::Entity
13582
13583=over 4
13584
13585=item new        ($name, $val, $sysid, $pubid, $ndata, $param)
13586
13587Same arguments as the Entity handler for XML::Parser.
13588
13589=item print       ($optional_filehandle)
13590
13591Print an entity declaration.
13592
13593=item name
13594
13595Return the name of the entity
13596
13597=item val
13598
13599Return the value of the entity
13600
13601=item sysid
13602
13603Return the system id for the entity (for NDATA entities)
13604
13605=item pubid
13606
13607Return the public id for the entity (for NDATA entities)
13608
13609=item ndata
13610
13611Return true if the entity is an NDATA entity
13612
13613=item param
13614
13615Return true if the entity is a parameter entity
13616
13617
13618=item text
13619
13620Return the entity declaration text.
13621
13622=back
13623
13624
13625=head1 EXAMPLES
13626
13627Additional examples (and a complete tutorial) can be found  on the
13628F<XML::Twig PageL<http://www.xmltwig.org/xmltwig/>>
13629
13630To figure out what flush does call the following script with an
13631XML file and an element name as arguments
13632
13633  use XML::Twig;
13634
13635  my ($file, $elt)= @ARGV;
13636  my $t= XML::Twig->new( twig_handlers =>
13637      { $elt => sub {$_[0]->flush; print "\n[flushed here]\n";} });
13638  $t->parsefile( $file, ErrorContext => 2);
13639  $t->flush;
13640  print "\n";
13641
13642
13643=head1 NOTES
13644
13645=head2 Subclassing XML::Twig
13646
13647Useful methods:
13648
13649=over 4
13650
13651=item elt_class
13652
13653In order to subclass C<XML::Twig> you will probably need to subclass also
13654C<L<XML::Twig::Elt>>. Use the C<elt_class> option when you create the
13655C<XML::Twig> object to get the elements created in a different class
13656(which should be a subclass of C<XML::Twig::Elt>.
13657
13658=item add_options
13659
13660If you inherit C<XML::Twig> new method but want to add more options to it
13661you can use this method to prevent XML::Twig to issue warnings for those
13662additional options.
13663
13664=back
13665
13666=head2 DTD Handling
13667
13668There are 3 possibilities here.  They are:
13669
13670=over 4
13671
13672=item No DTD
13673
13674No doctype, no DTD information, no entity information, the world is simple...
13675
13676=item Internal DTD
13677
13678The XML document includes an internal DTD, and maybe entity declarations.
13679
13680If you use the load_DTD option when creating the twig the DTD information and
13681the entity declarations can be accessed.
13682
13683The DTD and the entity declarations will be C<flush>'ed (or C<print>'ed) either
13684as is (if they have not been modified) or as reconstructed (poorly, comments
13685are lost, order is not kept, due to it's content this DTD should not be viewed
13686by anyone) if they have been modified. You can also modify them directly by
13687changing the C<< $twig->{twig_doctype}->{internal} >> field (straight from
13688XML::Parser, see the C<Doctype> handler doc)
13689
13690=item External DTD
13691
13692The XML document includes a reference to an external DTD, and maybe entity
13693declarations.
13694
13695If you use the C<load_DTD> when creating the twig the DTD information and the
13696entity declarations can be accessed. The entity declarations will be
13697C<flush>'ed (or C<print>'ed) either as is (if they have not been modified) or
13698as reconstructed (badly, comments are lost, order is not kept).
13699
13700You can change the doctype through the C<< $twig->set_doctype >> method and
13701print the dtd through the C<< $twig->dtd_text >> or C<< $twig->dtd_print >>
13702 methods.
13703
13704If you need to modify the entity list this is probably the easiest way to do it.
13705
13706=back
13707
13708
13709=head2 Flush
13710
13711Remember that element handlers are called when the element is CLOSED, so
13712if you have handlers for nested elements the inner handlers will be called
13713first. It makes it for example trickier than it would seem to number nested
13714sections (or clauses, or divs), as the titles in the inner sections are handled
13715before the outer sections.
13716
13717
13718=head1 BUGS
13719
13720=over 4
13721
13722=item segfault during parsing
13723
13724This happens when parsing huge documents, or lots of small ones, with a version
13725of Perl before 5.16.
13726
13727This is due to a bug in the way weak references are handled in Perl itself.
13728
13729The fix is either to upgrade to Perl 5.16 or later (C<perlbrew> is a great
13730tool to manage several installations of perl on the same machine).
13731
13732An other, NOT RECOMMENDED, way of fixing the problem, is to switch off weak
13733references by writing C<XML::Twig::_set_weakrefs( 0);> at the top of the code.
13734This is totally unsupported, and may lead to other problems though,
13735
13736=item entity handling
13737
13738Due to XML::Parser behaviour, non-base entities in attribute values disappear if
13739they are not declared in the document:
13740C<att="val&ent;"> will be turned into C<< att => val >>, unless you use the
13741C<keep_encoding> argument to C<< XML::Twig->new >>
13742
13743=item DTD handling
13744
13745The DTD handling methods are quite bugged. No one uses them and
13746it seems very difficult to get them to work in all cases, including with
13747several slightly incompatible versions of XML::Parser and of libexpat.
13748
13749Basically you can read the DTD, output it back properly, and update entities,
13750but not much more.
13751
13752So use XML::Twig with standalone documents, or with documents referring to an
13753external DTD, but don't expect it to properly parse and even output back the
13754DTD.
13755
13756=item memory leak
13757
13758If you use a REALLY old Perl (5.005!) and
13759a lot of twigs you might find that you leak quite a lot of memory
13760(about 2Ks per twig). You can use the C<L<dispose> > method to free
13761that memory after you are done.
13762
13763If you create elements the same thing might happen, use the C<L<delete>>
13764method to get rid of them.
13765
13766Alternatively installing the C<Scalar::Util> (or C<WeakRef>) module on a version
13767of Perl that supports it (>5.6.0) will get rid of the memory leaks automagically.
13768
13769=item ID list
13770
13771The ID list is NOT updated when elements are cut or deleted.
13772
13773=item change_gi
13774
13775This method will not function properly if you do:
13776
13777     $twig->change_gi( $old1, $new);
13778     $twig->change_gi( $old2, $new);
13779     $twig->change_gi( $new, $even_newer);
13780
13781=item sanity check on XML::Parser method calls
13782
13783XML::Twig should really prevent calls to some XML::Parser methods, especially
13784the C<setHandlers> method.
13785
13786=item pretty printing
13787
13788Pretty printing (at least using the 'C<indented>' style) is hard to get right!
13789Only elements that belong to the document will be properly indented. Printing
13790elements that do not belong to the twig makes it impossible for XML::Twig to
13791figure out their depth, and thus their indentation level.
13792
13793Also there is an unavoidable bug when using C<flush> and pretty printing for
13794elements with mixed content that start with an embedded element:
13795
13796  <elt><b>b</b>toto<b>bold</b></elt>
13797
13798  will be output as
13799
13800  <elt>
13801    <b>b</b>toto<b>bold</b></elt>
13802
13803if you flush the twig when you find the C<< <b> >> element
13804
13805
13806=back
13807
13808=head1 Globals
13809
13810These are the things that can mess up calling code, especially if threaded.
13811They might also cause problem under mod_perl.
13812
13813=over 4
13814
13815=item Exported constants
13816
13817Whether you want them or not you get them! These are subroutines to use
13818as constant when creating or testing elements
13819
13820  PCDATA  return '#PCDATA'
13821  CDATA   return '#CDATA'
13822  PI      return '#PI', I had the choice between PROC and PI :--(
13823
13824=item Module scoped values: constants
13825
13826these should cause no trouble:
13827
13828  %base_ent= ( '>' => '&gt;',
13829               '<' => '&lt;',
13830               '&' => '&amp;',
13831               "'" => '&apos;',
13832               '"' => '&quot;',
13833             );
13834  CDATA_START   = "<![CDATA[";
13835  CDATA_END     = "]]>";
13836  PI_START      = "<?";
13837  PI_END        = "?>";
13838  COMMENT_START = "<!--";
13839  COMMENT_END   = "-->";
13840
13841pretty print styles
13842
13843  ( $NSGMLS, $NICE, $INDENTED, $INDENTED_C, $WRAPPED, $RECORD1, $RECORD2)= (1..7);
13844
13845empty tag output style
13846
13847  ( $HTML, $EXPAND)= (1..2);
13848
13849=item Module scoped values: might be changed
13850
13851Most of these deal with pretty printing, so the worst that can
13852happen is probably that XML output does not look right, but is
13853still valid and processed identically by XML processors.
13854
13855C<$empty_tag_style> can mess up HTML bowsers though and changing C<$ID>
13856would most likely create problems.
13857
13858  $pretty=0;           # pretty print style
13859  $quote='"';          # quote for attributes
13860  $INDENT= '  ';       # indent for indented pretty print
13861  $empty_tag_style= 0; # how to display empty tags
13862  $ID                  # attribute used as an id ('id' by default)
13863
13864=item Module scoped values: definitely changed
13865
13866These 2 variables are used to replace tags by an index, thus
13867saving some space when creating a twig. If they really cause
13868you too much trouble, let me know, it is probably possible to
13869create either a switch or at least a version of XML::Twig that
13870does not perform this optimization.
13871
13872  %gi2index;     # tag => index
13873  @index2gi;     # list of tags
13874
13875=back
13876
13877If you need to manipulate all those values, you can use the following methods on the
13878XML::Twig object:
13879
13880=over 4
13881
13882=item global_state
13883
13884Return a hashref with all the global variables used by XML::Twig
13885
13886The hash has the following fields:  C<pretty>, C<quote>, C<indent>,
13887C<empty_tag_style>, C<keep_encoding>, C<expand_external_entities>,
13888C<output_filter>, C<output_text_filter>, C<keep_atts_order>
13889
13890=item set_global_state ($state)
13891
13892Set the global state, C<$state> is a hashref
13893
13894=item save_global_state
13895
13896Save the current global state
13897
13898=item restore_global_state
13899
13900Restore the previously saved (using C<Lsave_global_state>> state
13901
13902=back
13903
13904=head1 TODO
13905
13906=over 4
13907
13908=item SAX handlers
13909
13910Allowing XML::Twig to work on top of any SAX parser
13911
13912=item multiple twigs are not well supported
13913
13914A number of twig features are just global at the moment. These include
13915the ID list and the "tag pool" (if you use C<change_gi> then you change the tag
13916for ALL twigs).
13917
13918A future version will try to support this while trying not to be to
13919hard on performance (at least when a single twig is used!).
13920
13921=back
13922
13923=head1 AUTHOR
13924
13925Michel Rodriguez <mirod@cpan.org>
13926
13927=head1 LICENSE
13928
13929This library is free software; you can redistribute it and/or modify
13930it under the same terms as Perl itself.
13931
13932Bug reports should be sent using:
13933F<RT L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=XML-Twig>>
13934
13935Comments can be sent to mirod@cpan.org
13936
13937The XML::Twig page is at L<http://www.xmltwig.org/xmltwig/>
13938It includes the development version of the module, a slightly better version
13939of the documentation, examples, a tutorial and a:
13940F<Processing XML efficiently with Perl and XML::Twig:
13941L<http://www.xmltwig.org/xmltwig/tutorial/index.html>>
13942
13943=head1 SEE ALSO
13944
13945Complete docs, including a tutorial, examples, an easier to use HTML version of
13946the docs, a quick reference card and a FAQ are available at
13947L<http://www.xmltwig.org/xmltwig/>
13948
13949git repository at L<http://github.com/mirod/xmltwig>
13950
13951L<XML::Parser>, L<XML::Parser::Expat>, L<XML::XPath>, L<Encode>,
13952L<Text::Iconv>, L<Scalar::Utils>
13953
13954
13955=head2 Alternative Modules
13956
13957XML::Twig is not the only XML::Processing module available on CPAN (far from
13958it!).
13959
13960The main alternative I would recommend is L<XML::LibXML>.
13961
13962Here is a quick comparison of the 2 modules:
13963
13964XML::LibXML, actually C<libxml2> on which it is based, sticks to the standards,
13965and implements a good number of them in a rather strict way: XML, XPath, DOM,
13966RelaxNG, I must be forgetting a couple (XInclude?). It is fast and rather
13967frugal memory-wise.
13968
13969XML::Twig is older: when I started writing it XML::Parser/expat was the only
13970game in town. It implements XML and that's about it (plus a subset of XPath,
13971and you can use XML::Twig::XPath if you have XML::XPathEngine installed for full
13972support). It is slower and requires more memory for a full tree than
13973XML::LibXML. On the plus side (yes, there is a plus side!) it lets you process
13974a big document in chunks, and thus let you tackle documents that couldn't be
13975loaded in memory by XML::LibXML, and it offers a lot (and I mean a LOT!) of
13976higher-level methods, for everything, from adding structure to "low-level" XML,
13977to shortcuts for XHTML conversions and more. It also DWIMs quite a bit, getting
13978comments and non-significant whitespaces out of the way but preserving them in
13979the output for example. As it does not stick to the DOM, is also usually leads
13980to shorter code than in XML::LibXML.
13981
13982Beyond the pure features of the 2 modules, XML::LibXML seems to be preferred by
13983"XML-purists", while XML::Twig seems to be more used by Perl Hackers who have
13984to deal with XML. As you have noted, XML::Twig also comes with quite a lot of
13985docs, but I am sure if you ask for help about XML::LibXML here or on Perlmonks
13986you will get answers.
13987
13988Note that it is actually quite hard for me to compare the 2 modules: on one hand
13989I know XML::Twig inside-out and I can get it to do pretty much anything I need
13990to (or I improve it ;--), while I have a very basic knowledge of XML::LibXML.
13991So feature-wise, I'd rather use XML::Twig ;--). On the other hand, I am
13992painfully aware of some of the deficiencies, potential bugs and plain ugly code
13993that lurk in XML::Twig, even though you are unlikely to be affected by them
13994(unless for example you need to change the DTD of a document programmatically),
13995while I haven't looked much into XML::LibXML so it still looks shinny and clean
13996to me.
13997
13998That said, if you need to process a document that is too big to fit memory
13999and XML::Twig is too slow for you, my reluctant advice would be to use "bare"
14000XML::Parser.  It won't be as easy to use as XML::Twig: basically with XML::Twig
14001you trade some speed (depending on what you do from a factor 3 to... none)
14002for ease-of-use, but it will be easier IMHO than using SAX (albeit not
14003standard), and at this point a LOT faster (see the last test in
14004L<http://www.xmltwig.org/article/simple_benchmark/>).
14005
14006=cut
14007
14008
14009