1# This is a fork of HTML::Element.  Eventually the code may be merged.
2
3package HTML::DOM::_TreeBuilder;
4
5use warnings;
6use strict;
7use integer;    # vroom vroom!
8use Carp ();
9use vars qw(@ISA $VERSION $DEBUG);
10
11#---------------------------------------------------------------------------
12# Make a 'DEBUG' constant...
13
14BEGIN {
15
16    # We used to have things like
17    #  print $indent, "lalala" if $Debug;
18    # But there were an awful lot of having to evaluate $Debug's value.
19    # If we make that depend on a constant, like so:
20    #   sub DEBUG () { 1 } # or whatever value.
21    #   ...
22    #   print $indent, "lalala" if DEBUG;
23    # Which at compile-time (thru the miracle of constant folding) turns into:
24    #   print $indent, "lalala";
25    # or, if DEBUG is a constant with a true value, then that print statement
26    # is simply optimized away, and doesn't appear in the target code at all.
27    # If you don't believe me, run:
28    #    perl -MO=Deparse,-uHTML::DOM::_TreeBuilder -e 'BEGIN { \
29    #      $HTML::DOM::_TreeBuilder::DEBUG = 4}  use HTML::DOM::_TreeBuilder'
30    # and see for yourself (substituting whatever value you want for $DEBUG
31    # there).
32## no critic
33    if ( defined &DEBUG ) {
34
35        # Already been defined!  Do nothing.
36    }
37    elsif ( $] < 5.00404 ) {
38
39        # Grudgingly accomodate ancient (pre-constant) versions.
40        eval 'sub DEBUG { $Debug } ';
41    }
42    elsif ( !$DEBUG ) {
43        eval 'sub DEBUG () {0}';    # Make it a constant.
44    }
45    elsif ( $DEBUG =~ m<^\d+$>s ) {
46        eval 'sub DEBUG () { ' . $DEBUG . ' }';    # Make THAT a constant.
47    }
48    else {                                         # WTF?
49        warn "Non-numeric value \"$DEBUG\" in \$HTML::DOM::_Element::DEBUG";
50        eval 'sub DEBUG () { $DEBUG }';            # I guess.
51    }
52## use critic
53}
54
55#---------------------------------------------------------------------------
56
57use HTML::Entities ();
58use HTML::Tagset 3.02 ();
59
60use HTML::DOM::_Element ();
61use HTML::Parser  ();
62@ISA = qw(HTML::DOM::_Element HTML::Parser);
63$VERSION = 4.2001;
64
65# This looks schizoid, I know.
66# It's not that we ARE an element AND a parser.
67# We ARE an element, but one that knows how to handle signals
68#  (method calls) from Parser in order to elaborate its subtree.
69
70# Legacy aliases:
71*HTML::DOM::_TreeBuilder::isKnown             = \%HTML::Tagset::isKnown;
72*HTML::DOM::_TreeBuilder::canTighten          = \%HTML::Tagset::canTighten;
73*HTML::DOM::_TreeBuilder::isHeadElement       = \%HTML::Tagset::isHeadElement;
74*HTML::DOM::_TreeBuilder::isBodyElement       = \%HTML::Tagset::isBodyElement;
75*HTML::DOM::_TreeBuilder::isPhraseMarkup      = \%HTML::Tagset::isPhraseMarkup;
76*HTML::DOM::_TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement;
77*HTML::DOM::_TreeBuilder::isList              = \%HTML::Tagset::isList;
78*HTML::DOM::_TreeBuilder::isTableElement      = \%HTML::Tagset::isTableElement;
79*HTML::DOM::_TreeBuilder::isFormElement       = \%HTML::Tagset::isFormElement;
80*HTML::DOM::_TreeBuilder::p_closure_barriers  = \@HTML::Tagset::p_closure_barriers;
81
82#==========================================================================
83# Two little shortcut constructors:
84
85sub new_from_file {    # or from a FH
86    my $class = shift;
87    Carp::croak("new_from_file takes only one argument")
88        unless @_ == 1;
89    Carp::croak("new_from_file is a class method only")
90        if ref $class;
91    my $new = $class->new();
92    $new->parse_file( $_[0] );
93    return $new;
94}
95
96sub new_from_content {    # from any number of scalars
97    my $class = shift;
98    Carp::croak("new_from_content is a class method only")
99        if ref $class;
100    my $new = $class->new();
101    foreach my $whunk (@_) {
102        if ( ref($whunk) eq 'SCALAR' ) {
103            $new->parse($$whunk);
104        }
105        else {
106            $new->parse($whunk);
107        }
108        last if $new->{'_stunted'};    # might as well check that.
109    }
110    $new->eof();
111    return $new;
112}
113
114# TODO: document more fully?
115sub parse_content {                    # from any number of scalars
116    my $tree = shift;
117    my $retval;
118    foreach my $whunk (@_) {
119        if ( ref($whunk) eq 'SCALAR' ) {
120            $retval = $tree->parse($$whunk);
121        }
122        else {
123            $retval = $tree->parse($whunk);
124        }
125        last if $tree->{'_stunted'};    # might as well check that.
126    }
127    $tree->eof();
128    return $retval;
129}
130
131#---------------------------------------------------------------------------
132
133sub new {                               # constructor!
134    my $class = shift;
135    $class = ref($class) || $class;
136
137    # Initialize HTML::DOM::_Element part
138    my $self = $class->element_class->new('html');
139
140    {
141
142        # A hack for certain strange versions of Parser:
143        my $other_self = HTML::Parser->new();
144        %$self = ( %$self, %$other_self );    # copy fields
145           # Yes, multiple inheritance is messy.  Kids, don't try this at home.
146        bless $other_self, "HTML::DOM::_TreeBuilder::_hideyhole";
147
148        # whack it out of the HTML::Parser class, to avoid the destructor
149    }
150
151    # The root of the tree is special, as it has these funny attributes,
152    # and gets reblessed into this class.
153
154    # Initialize parser settings
155    $self->{'_implicit_tags'}       = 1;
156    $self->{'_implicit_body_p_tag'} = 0;
157
158    # If true, trying to insert text, or any of %isPhraseMarkup right
159    #  under 'body' will implicate a 'p'.  If false, will just go there.
160
161    $self->{'_tighten'} = 1;
162
163    # whether ignorable WS in this tree should be deleted
164
165    $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag
166
167    $self->{'_ignore_unknown'}      = 1;
168    $self->{'_ignore_text'}         = 0;
169    $self->{'_warn'}                = 0;
170    $self->{'_no_space_compacting'} = 0;
171    $self->{'_store_comments'}      = 0;
172    $self->{'_store_declarations'}  = 1;
173    $self->{'_store_pis'}           = 0;
174    $self->{'_p_strict'}            = 0;
175    $self->{'_no_expand_entities'}  = 0;
176
177    # Parse attributes passed in as arguments
178    if (@_) {
179        my %attr = @_;
180        for ( keys %attr ) {
181            $self->{"_$_"} = $attr{$_};
182        }
183    }
184
185    $HTML::DOM::_Element::encoded_content = $self->{'_no_expand_entities'};
186
187    # rebless to our class
188    bless $self, $class;
189
190    $self->{'_element_count'} = 1;
191
192    # undocumented, informal, and maybe not exactly correct
193
194    $self->{'_head'} = $self->insert_element( 'head', 1 );
195    $self->{'_pos'}  = undef;                                # pull it back up
196    $self->{'_body'} = $self->insert_element( 'body', 1 );
197    $self->{'_pos'} = undef;    # pull it back up again
198
199    return $self;
200}
201
202#==========================================================================
203
204sub _elem                       # universal accessor...
205{
206    my ( $self, $elem, $val ) = @_;
207    my $old = $self->{$elem};
208    $self->{$elem} = $val if defined $val;
209    return $old;
210}
211
212# accessors....
213sub implicit_tags       { shift->_elem( '_implicit_tags',       @_ ); }
214sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); }
215sub p_strict            { shift->_elem( '_p_strict',            @_ ); }
216sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); }
217sub ignore_unknown      { shift->_elem( '_ignore_unknown',      @_ ); }
218sub ignore_text         { shift->_elem( '_ignore_text',         @_ ); }
219sub ignore_ignorable_whitespace { shift->_elem( '_tighten',            @_ ); }
220sub store_comments              { shift->_elem( '_store_comments',     @_ ); }
221sub store_declarations          { shift->_elem( '_store_declarations', @_ ); }
222sub store_pis                   { shift->_elem( '_store_pis',          @_ ); }
223sub warn                        { shift->_elem( '_warn',               @_ ); }
224
225sub no_expand_entities {
226    shift->_elem( '_no_expand_entities', @_ );
227    $HTML::DOM::_Element::encoded_content = @_;
228}
229
230#==========================================================================
231
232sub warning {
233    my $self = shift;
234    CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'};
235
236    # should maybe say HTML::DOM::_TreeBuilder instead
237}
238
239#==========================================================================
240
241{
242
243    # To avoid having to rebuild these lists constantly...
244    my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)];
245    my $indent;
246
247    sub start {
248        return if $_[0]{'_stunted'};
249
250        # Accept a signal from HTML::Parser for start-tags.
251        my ( $self, $tag, $attr ) = @_;
252
253        # Parser passes more, actually:
254        #   $self->start($tag, $attr, $attrseq, $origtext)
255        # But we can merrily ignore $attrseq and $origtext.
256
257        if ( $tag eq 'x-html' ) {
258            print "Ignoring open-x-html tag.\n" if DEBUG;
259
260            # inserted by some lame code-generators.
261            return;    # bypass tweaking.
262        }
263
264        $tag =~ s{/$}{}s;    # So <b/> turns into <b>.  Silently forgive.
265
266        unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
267            DEBUG and print "Start-tag name $tag is no good.  Skipping.\n";
268            return;
269
270            # This avoids having Element's new() throw an exception.
271        }
272
273        my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'};
274        my $already_inserted;
275
276        #my($indent);
277        if (DEBUG) {
278
279       # optimization -- don't figure out indenting unless we're in debug mode
280            my @lineage = $pos->lineage;
281            $indent = '  ' x ( 1 + @lineage );
282            print $indent, "Proposing a new \U$tag\E under ",
283                join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) )
284                || 'Root',
285                ".\n";
286
287            #} else {
288            #  $indent = ' ';
289        }
290
291        #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2;
292        # $attr = {%$attr};
293
294        foreach my $k ( keys %$attr ) {
295
296            # Make sure some stooge doesn't have "<span _content='pie'>".
297            # That happens every few million Web pages.
298            $attr->{ ' ' . $k } = delete $attr->{$k}
299                if length $k and substr( $k, 0, 1 ) eq '_';
300
301            # Looks bad, but is fine for round-tripping.
302        }
303
304        my $e = $self->element_class->new( $tag, %$attr );
305
306        # Make a new element object.
307        # (Only rarely do we end up just throwing it away later in this call.)
308
309      # Some prep -- custom messiness for those damned tables, and strict P's.
310        if ( $self->{'_implicit_tags'} ) {    # wallawallawalla!
311
312            unless ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) {
313                if ( $ptag eq 'table' ) {
314                    print $indent,
315                        " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n"
316                        if DEBUG > 1;
317                    $self->insert_element( 'tr', 1 );
318                    $pos = $self->insert_element( 'td', 1 )
319                        ;                     # yes, needs updating
320                }
321                elsif ( $ptag eq 'tr' ) {
322                    print $indent,
323                        " * Phrasal \U$tag\E right under TR makes an implicit TD\n"
324                        if DEBUG > 1;
325                    $pos = $self->insert_element( 'td', 1 )
326                        ;                     # yes, needs updating
327                }
328                $ptag = $pos->{'_tag'};       # yes, needs updating
329            }
330
331            # end of table-implication block.
332
333            # Now maybe do a little dance to enforce P-strictness.
334            # This seems like it should be integrated with the big
335            # "ALL HOPE..." block, further below, but that doesn't
336            # seem feasable.
337            if (    $self->{'_p_strict'}
338                and $HTML::DOM::_TreeBuilder::isKnown{$tag}
339                and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} )
340            {
341                my $here     = $pos;
342                my $here_tag = $ptag;
343                while (1) {
344                    if ( $here_tag eq 'p' ) {
345                        print $indent, " * Inserting $tag closes strict P.\n"
346                            if DEBUG > 1;
347                        $self->end( \q{p} );
348
349                    # NB: same as \'q', but less confusing to emacs cperl-mode
350                        last;
351                    }
352
353                    #print("Lasting from $here_tag\n"),
354                    last
355                        if $HTML::DOM::_TreeBuilder::isKnown{$here_tag}
356                            and
357                            not $HTML::Tagset::is_Possible_Strict_P_Content{
358                                $here_tag};
359
360               # Don't keep looking up the tree if we see something that can't
361               #  be strict-P content.
362
363                    $here_tag
364                        = ( $here = $here->{'_parent'} || last )->{'_tag'};
365                }    # end while
366                $ptag = ( $pos = $self->{'_pos'} || $self )
367                    ->{'_tag'};    # better update!
368            }
369
370            # end of strict-p block.
371        }
372
373       # And now, get busy...
374       #----------------------------------------------------------------------
375        if ( !$self->{'_implicit_tags'} ) {    # bimskalabim
376                                               # do nothing
377            print $indent, " * _implicit_tags is off.  doing nothing\n"
378                if DEBUG > 1;
379
380       #----------------------------------------------------------------------
381        }
382        elsif ( $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$tag} ) {
383            if ( $pos->is_inside('body') ) {    # all is well
384                print $indent,
385                    " * ambilocal element \U$tag\E is fine under BODY.\n"
386                    if DEBUG > 1;
387            }
388            elsif ( $pos->is_inside('head') ) {
389                print $indent,
390                    " * ambilocal element \U$tag\E is fine under HEAD.\n"
391                    if DEBUG > 1;
392            }
393            else {
394
395                # In neither head nor body!  mmmmm... put under head?
396
397                if ( $ptag eq 'html' ) {    # expected case
398                     # TODO?? : would there ever be a case where _head would be
399                     #  absent from a tree that would ever be accessed at this
400                     #  point?
401                    die "Where'd my head go?" unless ref $self->{'_head'};
402                    if ( $self->{'_head'}{'_implicit'} ) {
403                        print $indent,
404                            " * ambilocal element \U$tag\E makes an implicit HEAD.\n"
405                            if DEBUG > 1;
406
407                        # or rather, points us at it.
408                        $self->{'_pos'}
409                            = $self->{'_head'};    # to insert under...
410                    }
411                    else {
412                        $self->warning(
413                            "Ambilocal element <$tag> not under HEAD or BODY!?"
414                        );
415
416                        # Put it under HEAD by default, I guess
417                        $self->{'_pos'}
418                            = $self->{'_head'};    # to insert under...
419                    }
420
421                }
422                else {
423
424             # Neither under head nor body, nor right under html... pass thru?
425                    $self->warning(
426                        "Ambilocal element <$tag> neither under head nor body, nor right under html!?"
427                    );
428                }
429            }
430
431       #----------------------------------------------------------------------
432        }
433        elsif ( $HTML::DOM::_TreeBuilder::isBodyElement{$tag} ) {
434
435            # Ensure that we are within <body>
436            if ( $ptag eq 'body' ) {
437
438                # We're good.
439            }
440            elsif (
441                $HTML::DOM::_TreeBuilder::isBodyElement{$ptag}    # glarg
442                and not $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$ptag}
443                )
444            {
445
446              # Special case: Save ourselves a call to is_inside further down.
447              # If our $ptag is an isBodyElement element (but not an
448              # isHeadOrBodyElement element), then we must be under body!
449                print $indent, " * Inferring that $ptag is under BODY.\n",
450                    if DEBUG > 3;
451
452                # I think this and the test for 'body' trap everything
453                # bodyworthy, except the case where the parent element is
454                # under an unknown element that's a descendant of body.
455            }
456            elsif ( $pos->is_inside('head') ) {
457                print $indent,
458                    " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n"
459                    if DEBUG > 1;
460                $ptag = (
461                    $pos = $self->{'_pos'}
462                        = $self->{'_body'}    # yes, needs updating
463                        || die "Where'd my body go?"
464                )->{'_tag'};                  # yes, needs updating
465            }
466            elsif ( !$pos->is_inside('body') ) {
467                print $indent,
468                    " * body-element \U$tag\E makes implicit BODY.\n"
469                    if DEBUG > 1;
470                $ptag = (
471                    $pos = $self->{'_pos'}
472                        = $self->{'_body'}    # yes, needs updating
473                        || die "Where'd my body go?"
474                )->{'_tag'};                  # yes, needs updating
475            }
476
477            # else we ARE under body, so okay.
478
479            # Handle implicit endings and insert based on <tag> and position
480            # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ...
481            if (   $tag eq 'p'
482                or $tag eq 'h1'
483                or $tag eq 'h2'
484                or $tag eq 'h3'
485                or $tag eq 'h4'
486                or $tag eq 'h5'
487                or $tag eq 'h6'
488                or $tag eq 'form'
489
490                # Hm, should <form> really be here?!
491                )
492            {
493
494                # Can't have <p>, <h#> or <form> inside these
495                $self->end(
496                    $_Closed_by_structurals,
497                    @HTML::DOM::_TreeBuilder::p_closure_barriers
498
499                        # used to be just li!
500                );
501
502            }
503            elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) {
504
505                # Can't have lists inside <h#> -- in the unlikely
506                #  event anyone tries to put them there!
507                if (   $ptag eq 'h1'
508                    or $ptag eq 'h2'
509                    or $ptag eq 'h3'
510                    or $ptag eq 'h4'
511                    or $ptag eq 'h5'
512                    or $ptag eq 'h6' )
513                {
514                    $self->end( \$ptag );
515                }
516
517                # TODO: Maybe keep closing up the tree until
518                #  the ptag isn't any of the above?
519                # But anyone that says <h1><h2><ul>...
520                #  deserves what they get anyway.
521
522            }
523            elsif ( $tag eq 'li' ) {    # list item
524                    # Get under a list tag, one way or another
525                unless (
526                    exists $HTML::DOM::_TreeBuilder::isList{$ptag}
527                    or $self->end( \q{*}, keys %HTML::DOM::_TreeBuilder::isList ) #'
528                    )
529                {
530                    print $indent,
531                        " * inserting implicit UL for lack of containing ",
532                        join( '|', keys %HTML::DOM::_TreeBuilder::isList ), ".\n"
533                        if DEBUG > 1;
534                    $self->insert_element( 'ul', 1 );
535                }
536
537            }
538            elsif ( $tag eq 'dt' or $tag eq 'dd' ) {
539
540                # Get under a DL, one way or another
541                unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) {    #'
542                    print $indent,
543                        " * inserting implicit DL for lack of containing DL.\n"
544                        if DEBUG > 1;
545                    $self->insert_element( 'dl', 1 );
546                }
547
548            }
549            elsif ( $HTML::DOM::_TreeBuilder::isFormElement{$tag} ) {
550                if ($self->{
551                        '_ignore_formies_outside_form'}  # TODO: document this
552                    and not $pos->is_inside('form')
553                    )
554                {
555                    print $indent,
556                        " * ignoring \U$tag\E because not in a FORM.\n"
557                        if DEBUG > 1;
558                    return;                              # bypass tweaking.
559                }
560                if ( $tag eq 'option' ) {
561
562                    # return unless $ptag eq 'select';
563                    $self->end( \q{option} );
564                    $ptag = ( $self->{'_pos'} || $self )->{'_tag'};
565                    unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) {
566                        print $indent,
567                            " * \U$tag\E makes an implicit SELECT.\n"
568                            if DEBUG > 1;
569                        $pos = $self->insert_element( 'select', 1 );
570
571                    # but not a very useful select -- has no 'name' attribute!
572                    # is $pos's value used after this?
573                    }
574                }
575            }
576            elsif ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) {
577                if ( !$pos->is_inside('table') ) {
578                    print $indent, " * \U$tag\E makes an implicit TABLE\n"
579                        if DEBUG > 1;
580                    $self->insert_element( 'table', 1 );
581                }
582
583                if ( $tag eq 'td' or $tag eq 'th' ) {
584
585                    # Get under a tr one way or another
586                    unless (
587                        $ptag eq 'tr'    # either under a tr
588                        or $self->end( \q{*}, 'tr',
589                            'table' )    #or we can get under one
590                        )
591                    {
592                        print $indent,
593                            " * \U$tag\E under \U$ptag\E makes an implicit TR\n"
594                            if DEBUG > 1;
595                        $self->insert_element( 'tr', 1 );
596
597                        # presumably pos's value isn't used after this.
598                    }
599                }
600                else {
601                    $self->end( \$tag, 'table' );    #'
602                }
603
604                # Hmm, I guess this is right.  To work it out:
605                #   tr closes any open tr (limited at a table)
606                #   thead closes any open thead (limited at a table)
607                #   tbody closes any open tbody (limited at a table)
608                #   tfoot closes any open tfoot (limited at a table)
609                #   colgroup closes any open colgroup (limited at a table)
610                #   col can try, but will always fail, at the enclosing table,
611                #     as col is empty, and therefore never open!
612                # But!
613                #   td closes any open td OR th (limited at a table)
614                #   th closes any open th OR td (limited at a table)
615                #   ...implementable as "close to a tr, or make a tr"
616
617            }
618            elsif ( $HTML::DOM::_TreeBuilder::isPhraseMarkup{$tag} ) {
619                if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) {
620                    print
621                        " * Phrasal \U$tag\E right under BODY makes an implicit P\n"
622                        if DEBUG > 1;
623                    $pos = $self->insert_element( 'p', 1 );
624
625                    # is $pos's value used after this?
626                }
627            }
628
629            # End of implicit endings logic
630
631       # End of "elsif ($HTML::DOM::_TreeBuilder::isBodyElement{$tag}"
632       #----------------------------------------------------------------------
633
634        }
635        elsif ( $HTML::DOM::_TreeBuilder::isHeadElement{$tag} ) {
636            if ( $pos->is_inside('body') ) {
637                print $indent, " * head element \U$tag\E found inside BODY!\n"
638                    if DEBUG;
639                $self->warning("Header element <$tag> in body");    # [sic]
640            }
641            elsif ( !$pos->is_inside('head') ) {
642                print $indent,
643                    " * head element \U$tag\E makes an implicit HEAD.\n"
644                    if DEBUG > 1;
645            }
646            else {
647                print $indent,
648                    " * head element \U$tag\E goes inside existing HEAD.\n"
649                    if DEBUG > 1;
650            }
651            $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?";
652
653       #----------------------------------------------------------------------
654        }
655        elsif ( $tag eq 'html' ) {
656            if ( delete $self->{'_implicit'} ) {    # first time here
657                print $indent, " * good! found the real HTML element!\n"
658                    if DEBUG > 1;
659            }
660            else {
661                print $indent, " * Found a second HTML element\n"
662                    if DEBUG;
663                $self->warning("Found a nested <html> element");
664            }
665
666            # in either case, migrate attributes to the real element
667            for ( keys %$attr ) {
668                $self->attr( $_, $attr->{$_} );
669            }
670            $self->{'_pos'} = undef;
671            return $self;    # bypass tweaking.
672
673       #----------------------------------------------------------------------
674        }
675        elsif ( $tag eq 'head' ) {
676            my $head = $self->{'_head'} || die "Where'd my head go?";
677            if ( delete $head->{'_implicit'} ) {    # first time here
678                print $indent, " * good! found the real HEAD element!\n"
679                    if DEBUG > 1;
680            }
681            else {                                  # been here before
682                print $indent, " * Found a second HEAD element\n"
683                    if DEBUG;
684                $self->warning("Found a second <head> element");
685            }
686
687            # in either case, migrate attributes to the real element
688            for ( keys %$attr ) {
689                $head->attr( $_, $attr->{$_} );
690            }
691            return $self->{'_pos'} = $head;         # bypass tweaking.
692
693       #----------------------------------------------------------------------
694        }
695        elsif ( $tag eq 'body' ) {
696            my $body = $self->{'_body'} || die "Where'd my body go?";
697            if ( delete $body->{'_implicit'} ) {    # first time here
698                print $indent, " * good! found the real BODY element!\n"
699                    if DEBUG > 1;
700            }
701            else {                                  # been here before
702                print $indent, " * Found a second BODY element\n"
703                    if DEBUG;
704                $self->warning("Found a second <body> element");
705            }
706
707            # in either case, migrate attributes to the real element
708            for ( keys %$attr ) {
709                $body->attr( $_, $attr->{$_} );
710            }
711            $self->{'_pos'} = $body unless $pos->is_inside('body');
712            return $body;                           # bypass tweaking.
713
714       #----------------------------------------------------------------------
715        }
716        elsif ( $tag eq 'frameset' ) {
717            if (!( $self->{'_frameset_seen'}++ )    # first frameset seen
718                and !$self->{'_noframes_seen'}
719
720                # otherwise it'll be under the noframes already
721                and !$self->is_inside('body')
722                )
723            {
724
725           # The following is a bit of a hack.  We don't use the normal
726           #  insert_element because 1) we don't want it as _pos, but instead
727           #  right under $self, and 2), more importantly, that we don't want
728           #  this inserted at the /end/ of $self's content_list, but instead
729           #  in the middle of it, specifiaclly right before the body element.
730           #
731                my $c    = $self->{'_content'} || die "Contentless root?";
732                my $body = $self->{'_body'}    || die "Where'd my BODY go?";
733                for ( my $i = 0; $i < @$c; ++$i ) {
734                    if ( $c->[$i] eq $body ) {
735                        splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e );
736                        $e->{'_parent'} = $self;
737                        $already_inserted = 1;
738                        print $indent,
739                            " * inserting 'frameset' right before BODY.\n"
740                            if DEBUG > 1;
741                        last;
742                    }
743                }
744                die "BODY not found in children of root?"
745                    unless $already_inserted;
746            }
747
748        }
749        elsif ( $tag eq 'frame' ) {
750
751            # Okay, fine, pass thru.
752            # Should probably enforce that these should be under a frameset.
753            # But hey.  Ditto for enforcing that 'noframes' should be under
754            # a 'frameset', as the DTDs say.
755
756        }
757        elsif ( $tag eq 'noframes' ) {
758
759           # This basically assumes there'll be exactly one 'noframes' element
760           #  per document.  At least, only the first one gets to have the
761           #  body under it.  And if there are no noframes elements, then
762           #  the body pretty much stays where it is.  Is that ever a problem?
763            if ( $self->{'_noframes_seen'}++ ) {
764                print $indent, " * ANOTHER noframes element?\n" if DEBUG;
765            }
766            else {
767                if ( $pos->is_inside('body') ) {
768                    print $indent, " * 'noframes' inside 'body'.  Odd!\n"
769                        if DEBUG;
770
771               # In that odd case, we /can't/ make body a child of 'noframes',
772               # because it's an ancestor of the 'noframes'!
773                }
774                else {
775                    $e->push_content( $self->{'_body'}
776                            || die "Where'd my body go?" );
777                    print $indent, " * Moving body to be under noframes.\n"
778                        if DEBUG;
779                }
780            }
781
782       #----------------------------------------------------------------------
783        }
784        else {
785
786            # unknown tag
787            if ( $self->{'_ignore_unknown'} ) {
788                print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG;
789                $self->warning("Skipping unknown tag $tag");
790                return;
791            }
792            else {
793                print $indent, " * Accepting unknown tag \U$tag\E\n"
794                    if DEBUG;
795            }
796        }
797
798       #----------------------------------------------------------------------
799       # End of mumbo-jumbo
800
801        print $indent, "(Attaching ", $e->{'_tag'}, " under ",
802            ( $self->{'_pos'} || $self )->{'_tag'}, ")\n"
803
804            # because if _pos isn't defined, it goes under self
805            if DEBUG;
806
807        # The following if-clause is to delete /some/ ignorable whitespace
808        #  nodes, as we're making the tree.
809        # This'd be a node we'd catch later anyway, but we might as well
810        #  nip it in the bud now.
811        # This doesn't catch /all/ deletable WS-nodes, so we do have to call
812        #  the tightener later to catch the rest.
813
814        if ( $self->{'_tighten'} and !$self->{'_ignore_text'} )
815        {    # if tightenable
816            my ( $sibs, $par );
817            if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} )
818                and @$sibs            # parent already has content
819                and !
820                ref( $sibs->[-1] )    # and the last one there is a text node
821                and $sibs->[-1] !~ m<[^\n\r\f\t ]>s  # and it's all whitespace
822
823                and (    # one of these has to be eligible...
824                    $HTML::DOM::_TreeBuilder::canTighten{$tag}
825                    or (( @$sibs == 1 )
826                        ?    # WS is leftmost -- so parent matters
827                        $HTML::DOM::_TreeBuilder::canTighten{ $par->{'_tag'} }
828                        :    # WS is after another node -- it matters
829                        (   ref $sibs->[-2]
830                                and
831                                $HTML::DOM::_TreeBuilder::canTighten{ $sibs->[-2]
832                                    {'_tag'} }
833                        )
834                    )
835                )
836
837                and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' )
838
839                # we're clear
840                )
841            {
842                pop @$sibs;
843                print $indent, "Popping a preceding all-WS node\n" if DEBUG;
844            }
845        }
846
847        $self->insert_element($e) unless $already_inserted;
848
849        if (DEBUG) {
850            if ( $self->{'_pos'} ) {
851                print $indent, "(Current lineage of pos:  \U$tag\E under ",
852                    join(
853                    '/',
854                    reverse(
855
856                        # $self->{'_pos'}{'_tag'},  # don't list myself!
857                        $self->{'_pos'}->lineage_tag_names
858                    )
859                    ),
860                    ".)\n";
861            }
862            else {
863                print $indent, "(Pos points nowhere!?)\n";
864            }
865        }
866
867        unless ( ( $self->{'_pos'} || '' ) eq $e ) {
868
869            # if it's an empty element -- i.e., if it didn't change the _pos
870            &{         $self->{"_tweak_$tag"}
871                    || $self->{'_tweak_*'}
872                    || return $e }( map $_, $e, $tag, $self )
873                ;    # make a list so the user can't clobber
874        }
875
876        return $e;
877    }
878}
879
880#==========================================================================
881
882{
883    my $indent;
884
885    sub end {
886        return if $_[0]{'_stunted'};
887
888       # Either: Acccept an end-tag signal from HTML::Parser
889       # Or: Method for closing currently open elements in some fairly complex
890       #  way, as used by other methods in this class.
891        my ( $self, $tag, @stop ) = @_;
892        if ( $tag eq 'x-html' ) {
893            print "Ignoring close-x-html tag.\n" if DEBUG;
894
895            # inserted by some lame code-generators.
896            return;
897        }
898
899        unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) {
900            DEBUG and print "End-tag name $tag is no good.  Skipping.\n";
901            return;
902
903            # This avoids having Element's new() throw an exception.
904        }
905
906       # This method accepts two calling formats:
907       #  1) from Parser:  $self->end('tag_name', 'origtext')
908       #        in which case we shouldn't mistake origtext as a blocker tag
909       #  2) from myself:  $self->end(\q{tagname1}, 'blk1', ... )
910       #     from myself:  $self->end(['tagname1', 'tagname2'], 'blk1',  ... )
911
912        # End the specified tag, but don't move above any of the blocker tags.
913        # The tag can also be a reference to an array.  Terminate the first
914        # tag found.
915
916        my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'};
917
918        # $p and $ptag are sort-of stratch
919
920        if ( ref($tag) ) {
921
922            # First param is a ref of one sort or another --
923            #  THE CALL IS COMING FROM INSIDE THE HOUSE!
924            $tag = $$tag if ref($tag) eq 'SCALAR';
925
926            # otherwise it's an arrayref.
927        }
928        else {
929
930            # the call came from Parser -- just ignore origtext
931            # except in a table ignore unmatched table tags RT #59980
932            @stop = $tag =~ /^t[hdr]\z/ ? 'table' : ();
933        }
934
935        #my($indent);
936        if (DEBUG) {
937
938           # optimization -- don't figure out depth unless we're in debug mode
939            my @lineage_tags = $p->lineage_tag_names;
940            $indent = '  ' x ( 1 + @lineage_tags );
941
942            # now announce ourselves
943            print $indent, "Ending ",
944                ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E",
945                scalar(@stop)
946                ? ( " no higher than [", join( ' ', @stop ), "]" )
947                : (), ".\n";
948
949            print $indent, " (Current lineage: ", join( '/', @lineage_tags ),
950                ".)\n"
951                if DEBUG > 1;
952
953            if ( DEBUG > 3 ) {
954
955                #my(
956                # $package, $filename, $line, $subroutine,
957                # $hasargs, $wantarray, $evaltext, $is_require) = caller;
958                print $indent,
959                    " (Called from ", ( caller(1) )[3], ' line ',
960                    ( caller(1) )[2],
961                    ")\n";
962            }
963
964            #} else {
965            #  $indent = ' ';
966        }
967
968        # End of if DEBUG
969
970        # Now actually do it
971        my @to_close;
972        if ( $tag eq '*' ) {
973
974        # Special -- close everything up to (but not including) the first
975        #  limiting tag, or return if none found.  Somewhat of a special case.
976        PARENT:
977            while ( defined $p ) {
978                $ptag = $p->{'_tag'};
979                print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
980                for (@stop) {
981                    if ( $ptag eq $_ ) {
982                        print $indent,
983                            " (Hit a $_; closing everything up to here.)\n"
984                            if DEBUG > 2;
985                        last PARENT;
986                    }
987                }
988                push @to_close, $p;
989                $p = $p->{'_parent'};    # no match so far? keep moving up
990                print $indent,
991                    " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n"
992                    if DEBUG > 1;
993            }
994            unless ( defined $p ) { # We never found what we were looking for.
995                print $indent, " (We never found a limit.)\n" if DEBUG > 1;
996                return;
997            }
998
999            #print
1000            #   $indent,
1001            #   " (To close: ", join('/', map $_->tag, @to_close), ".)\n"
1002            #  if DEBUG > 4;
1003
1004            # Otherwise update pos and fall thru.
1005            $self->{'_pos'} = $p;
1006        }
1007        elsif ( ref $tag ) {
1008
1009           # Close the first of any of the matching tags, giving up if you hit
1010           #  any of the stop-tags.
1011        PARENT:
1012            while ( defined $p ) {
1013                $ptag = $p->{'_tag'};
1014                print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1015                for (@$tag) {
1016                    if ( $ptag eq $_ ) {
1017                        print $indent, " (Closing $_.)\n" if DEBUG > 2;
1018                        last PARENT;
1019                    }
1020                }
1021                for (@stop) {
1022                    if ( $ptag eq $_ ) {
1023                        print $indent,
1024                            " (Hit a limiting $_ -- bailing out.)\n"
1025                            if DEBUG > 1;
1026                        return;    # so it was all for naught
1027                    }
1028                }
1029                push @to_close, $p;
1030                $p = $p->{'_parent'};
1031            }
1032            return unless defined $p;    # We went off the top of the tree.
1033               # Otherwise specified element was found; set pos to its parent.
1034            push @to_close, $p;
1035            $self->{'_pos'} = $p->{'_parent'};
1036        }
1037        else {
1038
1039            # Close the first of the specified tag, giving up if you hit
1040            #  any of the stop-tags.
1041            while ( defined $p ) {
1042                $ptag = $p->{'_tag'};
1043                print $indent, " (Looking at $ptag.)\n" if DEBUG > 2;
1044                if ( $ptag eq $tag ) {
1045                    print $indent, " (Closing $tag.)\n" if DEBUG > 2;
1046                    last;
1047                }
1048                for (@stop) {
1049                    if ( $ptag eq $_ ) {
1050                        print $indent,
1051                            " (Hit a limiting $_ -- bailing out.)\n"
1052                            if DEBUG > 1;
1053                        return;    # so it was all for naught
1054                    }
1055                }
1056                push @to_close, $p;
1057                $p = $p->{'_parent'};
1058            }
1059            return unless defined $p;    # We went off the top of the tree.
1060               # Otherwise specified element was found; set pos to its parent.
1061            push @to_close, $p;
1062            $self->{'_pos'} = $p->{'_parent'};
1063        }
1064
1065        $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' );
1066        print $indent, "(Pos now points to ",
1067            $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n"
1068            if DEBUG > 1;
1069
1070        ### EXPENSIVE, because has to check that it's not under a pre
1071        ### or a CDATA-parent.  That's one more method call per end()!
1072        ### Might as well just do this at the end of the tree-parse, I guess,
1073        ### at which point we'd be parsing top-down, and just not traversing
1074        ### under pre's or CDATA-parents.
1075        ##
1076        ## Take this opportunity to nix any terminal whitespace nodes.
1077        ## TODO: consider whether this (plus the logic in start(), above)
1078        ## would ever leave any WS nodes in the tree.
1079        ## If not, then there's no reason to have eof() call
1080        ## delete_ignorable_whitespace on the tree, is there?
1081        ##
1082    #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and
1083    #  ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent)
1084    #) {  # if tightenable
1085    #  my($children, $e_tag);
1086    #  foreach my $e (reverse @to_close) { # going top-down
1087    #    last if 'pre' eq ($e_tag = $e->{'_tag'}) or
1088    #     $HTML::Tagset::isCDATA_Parent{$e_tag};
1089    #
1090    #    if(
1091    #      $children = $e->{'_content'}
1092    #      and @$children      # has children
1093    #      and !ref($children->[-1])
1094    #      and $children->[-1] =~ m<^\s+$>s # last node is all-WS
1095    #      and
1096    #        (
1097    #         # has a tightable parent:
1098    #         $HTML::DOM::_TreeBuilder::canTighten{ $e_tag }
1099    #         or
1100    #          ( # has a tightenable left sibling:
1101    #            @$children > 1 and
1102    #            ref($children->[-2])
1103    #            and $HTML::DOM::_TreeBuilder::canTighten{ $children->[-2]{'_tag'} }
1104    #          )
1105    #        )
1106    #    ) {
1107    #      pop @$children;
1108    #      #print $indent, "Popping a terminal WS node from ", $e->{'_tag'},
1109    #      #  " (", $e->address, ") while exiting.\n" if DEBUG;
1110    #    }
1111    #  }
1112    #}
1113
1114        foreach my $e (@to_close) {
1115
1116            # Call the applicable callback, if any
1117            $ptag = $e->{'_tag'};
1118            &{         $self->{"_tweak_$ptag"}
1119                    || $self->{'_tweak_*'}
1120                    || next }( map $_, $e, $ptag, $self );
1121            print $indent, "Back from tweaking.\n" if DEBUG;
1122            last
1123                if $self->{ '_stunted'
1124                    };    # in case one of the handlers called stunt
1125        }
1126        return @to_close;
1127    }
1128}
1129
1130#==========================================================================
1131{
1132    my ( $indent, $nugget );
1133
1134    sub text {
1135        return if $_[0]{'_stunted'};
1136
1137        # Accept a "here's a text token" signal from HTML::Parser.
1138        my ( $self, $text, $is_cdata ) = @_;
1139
1140        # the >3.0 versions of Parser may pass a cdata node.
1141        # Thanks to Gisle Aas for pointing this out.
1142
1143        return unless length $text;    # I guess that's always right
1144
1145        my $ignore_text         = $self->{'_ignore_text'};
1146        my $no_space_compacting = $self->{'_no_space_compacting'};
1147        my $no_expand_entities  = $self->{'_no_expand_entities'};
1148        my $pos                 = $self->{'_pos'} || $self;
1149
1150        HTML::Entities::decode($text)
1151            unless $ignore_text
1152                || $is_cdata
1153                || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }
1154                || $no_expand_entities;
1155
1156        #my($indent, $nugget);
1157        if (DEBUG) {
1158
1159           # optimization -- don't figure out depth unless we're in debug mode
1160            my @lineage_tags = $pos->lineage_tag_names;
1161            $indent = '  ' x ( 1 + @lineage_tags );
1162
1163            $nugget
1164                = ( length($text) <= 25 )
1165                ? $text
1166                : ( substr( $text, 0, 25 ) . '...' );
1167            $nugget =~ s<([\x00-\x1F])>
1168                 <'\\x'.(unpack("H2",$1))>eg;
1169            print $indent, "Proposing a new text node ($nugget) under ",
1170                join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) )
1171                || 'Root',
1172                ".\n";
1173
1174            #} else {
1175            #  $indent = ' ';
1176        }
1177
1178        my $ptag;
1179        if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} }
1180
1181            #or $pos->is_inside('pre')
1182            or $pos->is_inside( 'pre', 'textarea' )
1183            )
1184        {
1185            return if $ignore_text;
1186            $pos->push_content($text);
1187        }
1188        else {
1189
1190            # return unless $text =~ /\S/;  # This is sometimes wrong
1191
1192            if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) {
1193
1194                # don't change anything
1195            }
1196            elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) {
1197                if ( $self->{'_implicit_body_p_tag'} ) {
1198                    print $indent,
1199                        " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n"
1200                        if DEBUG > 1;
1201                    $self->end( \$ptag );
1202                    $pos = $self->{'_body'}
1203                        ? ( $self->{'_pos'}
1204                            = $self->{'_body'} )    # expected case
1205                        : $self->insert_element( 'body', 1 );
1206                    $pos = $self->insert_element( 'p', 1 );
1207                }
1208                else {
1209                    print $indent,
1210                        " * Text node under \U$ptag\E closes, implicates BODY.\n"
1211                        if DEBUG > 1;
1212                    $self->end( \$ptag );
1213                    $pos = $self->{'_body'}
1214                        ? ( $self->{'_pos'}
1215                            = $self->{'_body'} )    # expected case
1216                        : $self->insert_element( 'body', 1 );
1217                }
1218            }
1219            elsif ( $ptag eq 'html' ) {
1220                if ( $self->{'_implicit_body_p_tag'} ) {
1221                    print $indent,
1222                        " * Text node under HTML implicates BODY and P.\n"
1223                        if DEBUG > 1;
1224                    $pos = $self->{'_body'}
1225                        ? ( $self->{'_pos'}
1226                            = $self->{'_body'} )    # expected case
1227                        : $self->insert_element( 'body', 1 );
1228                    $pos = $self->insert_element( 'p', 1 );
1229                }
1230                else {
1231                    print $indent,
1232                        " * Text node under HTML implicates BODY.\n"
1233                        if DEBUG > 1;
1234                    $pos = $self->{'_body'}
1235                        ? ( $self->{'_pos'}
1236                            = $self->{'_body'} )    # expected case
1237                        : $self->insert_element( 'body', 1 );
1238
1239                    #print "POS is $pos, ", $pos->{'_tag'}, "\n";
1240                }
1241            }
1242            elsif ( $ptag eq 'body' ) {
1243                if ( $self->{'_implicit_body_p_tag'} ) {
1244                    print $indent, " * Text node under BODY implicates P.\n"
1245                        if DEBUG > 1;
1246                    $pos = $self->insert_element( 'p', 1 );
1247                }
1248            }
1249            elsif ( $ptag eq 'table' ) {
1250                print $indent,
1251                    " * Text node under TABLE implicates TR and TD.\n"
1252                    if DEBUG > 1;
1253                $self->insert_element( 'tr', 1 );
1254                $pos = $self->insert_element( 'td', 1 );
1255
1256                # double whammy!
1257            }
1258            elsif ( $ptag eq 'tr' ) {
1259                print $indent, " * Text node under TR implicates TD.\n"
1260                    if DEBUG > 1;
1261                $pos = $self->insert_element( 'td', 1 );
1262            }
1263
1264            # elsif (
1265            #       # $ptag eq 'li'   ||
1266            #       # $ptag eq 'dd'   ||
1267            #         $ptag eq 'form') {
1268            #    $pos = $self->insert_element('p', 1);
1269            #}
1270
1271            # Whatever we've done above should have had the side
1272            # effect of updating $self->{'_pos'}
1273
1274            #print "POS is now $pos, ", $pos->{'_tag'}, "\n";
1275
1276            return if $ignore_text;
1277            $text =~ s/[\n\r\f\t ]+/ /g    # canonical space
1278                unless $no_space_compacting;
1279
1280            print $indent, " (Attaching text node ($nugget) under ",
1281
1282           # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'},
1283                $pos->{'_tag'}, ").\n"
1284                if DEBUG > 1;
1285
1286            $pos->push_content($text);
1287        }
1288
1289        &{ $self->{'_tweak_~text'} || return }( $text, $pos,
1290            $pos->{'_tag'} . '' );
1291
1292        # Note that this is very exceptional -- it doesn't fall back to
1293        #  _tweak_*, and it gives its tweak different arguments.
1294        return;
1295    }
1296}
1297
1298#==========================================================================
1299
1300# TODO: test whether comment(), declaration(), and process(), do the right
1301#  thing as far as tightening and whatnot.
1302# Also, currently, doctypes and comments that appear before head or body
1303#  show up in the tree in the wrong place.  Something should be done about
1304#  this.  Tricky.  Maybe this whole business of pre-making the body and
1305#  whatnot is wrong.
1306
1307sub comment {
1308    return if $_[0]{'_stunted'};
1309
1310    # Accept a "here's a comment" signal from HTML::Parser.
1311
1312    my ( $self, $text ) = @_;
1313    my $pos = $self->{'_pos'} || $self;
1314    return
1315        unless $self->{'_store_comments'}
1316            || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} };
1317
1318    if (DEBUG) {
1319        my @lineage_tags = $pos->lineage_tag_names;
1320        my $indent = '  ' x ( 1 + @lineage_tags );
1321
1322        my $nugget
1323            = ( length($text) <= 25 )
1324            ? $text
1325            : ( substr( $text, 0, 25 ) . '...' );
1326        $nugget =~ s<([\x00-\x1F])>
1327                 <'\\x'.(unpack("H2",$1))>eg;
1328        print $indent, "Proposing a Comment ($nugget) under ",
1329            join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1330            ".\n";
1331    }
1332
1333    ( my $e = $self->element_class->new('~comment') )->{'text'} = $text;
1334    $pos->push_content($e);
1335    ++( $self->{'_element_count'} );
1336
1337    &{         $self->{'_tweak_~comment'}
1338            || $self->{'_tweak_*'}
1339            || return $e }( map $_, $e, '~comment', $self );
1340
1341    return $e;
1342}
1343
1344sub declaration {
1345    return if $_[0]{'_stunted'};
1346
1347    # Accept a "here's a markup declaration" signal from HTML::Parser.
1348
1349    my ( $self, $text ) = @_;
1350    my $pos = $self->{'_pos'} || $self;
1351
1352    if (DEBUG) {
1353        my @lineage_tags = $pos->lineage_tag_names;
1354        my $indent = '  ' x ( 1 + @lineage_tags );
1355
1356        my $nugget
1357            = ( length($text) <= 25 )
1358            ? $text
1359            : ( substr( $text, 0, 25 ) . '...' );
1360        $nugget =~ s<([\x00-\x1F])>
1361                 <'\\x'.(unpack("H2",$1))>eg;
1362        print $indent, "Proposing a Declaration ($nugget) under ",
1363            join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1364            ".\n";
1365    }
1366    ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text;
1367
1368    $self->{_decl} = $e;
1369    return $e;
1370}
1371
1372#==========================================================================
1373
1374sub process {
1375    return if $_[0]{'_stunted'};
1376
1377    # Accept a "here's a PI" signal from HTML::Parser.
1378
1379    return unless $_[0]->{'_store_pis'};
1380    my ( $self, $text ) = @_;
1381    my $pos = $self->{'_pos'} || $self;
1382
1383    if (DEBUG) {
1384        my @lineage_tags = $pos->lineage_tag_names;
1385        my $indent = '  ' x ( 1 + @lineage_tags );
1386
1387        my $nugget
1388            = ( length($text) <= 25 )
1389            ? $text
1390            : ( substr( $text, 0, 25 ) . '...' );
1391        $nugget =~ s<([\x00-\x1F])>
1392                 <'\\x'.(unpack("H2",$1))>eg;
1393        print $indent, "Proposing a PI ($nugget) under ",
1394            join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root',
1395            ".\n";
1396    }
1397    ( my $e = $self->element_class->new('~pi') )->{'text'} = $text;
1398    $pos->push_content($e);
1399    ++( $self->{'_element_count'} );
1400
1401    &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_,
1402        $e, '~pi', $self );
1403
1404    return $e;
1405}
1406
1407#==========================================================================
1408
1409#When you call $tree->parse_file($filename), and the
1410#tree's ignore_ignorable_whitespace attribute is on (as it is
1411#by default), HTML::DOM::_TreeBuilder's logic will manage to avoid
1412#creating some, but not all, nodes that represent ignorable
1413#whitespace.  However, at the end of its parse, it traverses the
1414#tree and deletes any that it missed.  (It does this with an
1415#around-method around HTML::Parser's eof method.)
1416#
1417#However, with $tree->parse($content), the cleanup-traversal step
1418#doesn't happen automatically -- so when you're done parsing all
1419#content for a document (regardless of whether $content is the only
1420#bit, or whether it's just another chunk of content you're parsing into
1421#the tree), call $tree->eof() to signal that you're at the end of the
1422#text you're inputting to the tree.  Besides properly cleaning any bits
1423#of ignorable whitespace from the tree, this will also ensure that
1424#HTML::Parser's internal buffer is flushed.
1425
1426sub eof {
1427
1428    # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user.
1429
1430    return if $_[0]->{'_done'};    # we've already been here
1431
1432    return $_[0]->SUPER::eof() if $_[0]->{'_stunted'};
1433
1434    my $x = $_[0];
1435    print "EOF received.\n" if DEBUG;
1436    my (@rv);
1437    if (wantarray) {
1438
1439        # I don't think this makes any difference for this particular
1440        #  method, but let's be scrupulous, for once.
1441        @rv = $x->SUPER::eof();
1442    }
1443    else {
1444        $rv[0] = $x->SUPER::eof();
1445    }
1446
1447    $x->end('html') unless $x eq ( $x->{'_pos'} || $x );
1448
1449    # That SHOULD close everything, and will run the appropriate tweaks.
1450    # We /could/ be running under some insane mode such that there's more
1451    #  than one HTML element, but really, that's just insane to do anyhow.
1452
1453    unless ( $x->{'_implicit_tags'} ) {
1454
1455        # delete those silly implicit head and body in case we put
1456        # them there in implicit tags mode
1457        foreach my $node ( $x->{'_head'}, $x->{'_body'} ) {
1458            $node->replace_with_content
1459                if defined $node
1460                    and ref $node
1461                    and $node->{'_implicit'}
1462                    and $node->{'_parent'};
1463
1464            # I think they should be empty anyhow, since the only
1465            # logic that'd insert under them can apply only, I think,
1466            # in the case where _implicit_tags is on
1467        }
1468
1469        # this may still leave an implicit 'html' at the top, but there's
1470        # nothing we can do about that, is there?
1471    }
1472
1473    $x->delete_ignorable_whitespace()
1474
1475        # this's why we trap this -- an after-method
1476        if $x->{'_tighten'} and !$x->{'_ignore_text'};
1477    $x->{'_done'} = 1;
1478
1479    return @rv if wantarray;
1480    return $rv[0];
1481}
1482
1483#==========================================================================
1484
1485# TODO: document
1486
1487sub stunt {
1488    my $self = $_[0];
1489    print "Stunting the tree.\n" if DEBUG;
1490    $self->{'_done'} = 1;
1491
1492    if ( $HTML::Parser::VERSION < 3 ) {
1493
1494        #This is a MEAN MEAN HACK.  And it works most of the time!
1495        $self->{'_buf'} = '';
1496        my $fh = *HTML::Parser::F{IO};
1497
1498        # the local'd FH used by parse_file loop
1499        if ( defined $fh ) {
1500            print "Closing Parser's filehandle $fh\n" if DEBUG;
1501            close($fh);
1502        }
1503
1504      # But if they called $tree->parse_file($filehandle)
1505      #  or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO}
1506      #  to close.  Ahwell.  Not a problem for most users these days.
1507
1508    }
1509    else {
1510        $self->SUPER::eof();
1511
1512        # Under 3+ versions, calling eof from inside a parse will abort the
1513        #  parse / parse_file
1514    }
1515
1516    # In the off chance that the above didn't work, we'll throw
1517    #  this flag to make any future events be no-ops.
1518    $self->stunted(1);
1519    return;
1520}
1521
1522# TODO: document
1523sub stunted { shift->_elem( '_stunted', @_ ); }
1524sub done    { shift->_elem( '_done',    @_ ); }
1525
1526#==========================================================================
1527
1528sub delete {
1529
1530    # Override Element's delete method.
1531    # This does most, if not all, of what Element's delete does anyway.
1532    # Deletes content, including content in some special attributes.
1533    # But doesn't empty out the hash.
1534
1535    $_[0]->{'_element_count'} = 1;    # never hurts to be scrupulously correct
1536
1537    delete @{ $_[0] }{ '_body', '_head', '_pos' };
1538    for (
1539        @{ delete( $_[0]->{'_content'} ) || [] },    # all/any content
1540
1541     #       delete @{$_[0]}{'_body', '_head', '_pos'}
1542     # ...and these, in case these elements don't appear in the
1543     #   content, which is possible.  If they did appear (as they
1544     #   usually do), then calling $_->delete on them again is harmless.
1545     #  I don't think that's such a hot idea now.  Thru creative reattachment,
1546     #  those could actually now point to elements in OTHER trees (which we do
1547     #  NOT want to delete!).
1548## Reasoned out:
1549  #  If these point to elements not in the content list of any element in this
1550  #   tree, but not in the content list of any element in any OTHER tree, then
1551  #   just deleting these will make their refcounts hit zero.
1552  #  If these point to elements in the content lists of elements in THIS tree,
1553  #   then we'll get to deleting them when we delete from the top.
1554  #  If these point to elements in the content lists of elements in SOME OTHER
1555  #   tree, then they're not to be deleted.
1556        )
1557    {
1558        $_->delete
1559            if defined $_ and ref $_    #  Make sure it's an object.
1560                and $_ ne $_[0];    #  And avoid hitting myself, just in case!
1561    }
1562
1563    $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'};
1564
1565    # An 'html' element having a parent is quite unlikely.
1566
1567    return;
1568}
1569
1570sub tighten_up {                    # legacy
1571    shift->delete_ignorable_whitespace(@_);
1572}
1573
1574sub elementify {
1575
1576    # Rebless this object down into the normal element class.
1577    my $self     = $_[0];
1578    my $to_class = $self->element_class;
1579    delete @{$self}{
1580        grep {
1581            ;
1582            length $_ and substr( $_, 0, 1 ) eq '_'
1583
1584                # The private attributes that we'll retain:
1585                and $_ ne '_tag'
1586                and $_ ne '_parent'
1587                and $_ ne '_content'
1588                and $_ ne '_implicit'
1589                and $_ ne '_pos'
1590                and $_ ne '_element_class'
1591            } keys %$self
1592        };
1593    bless $self, $to_class;    # Returns the same object we were fed
1594}
1595
1596sub element_class {
1597    return 'HTML::DOM::_Element' if not ref $_[0];
1598    return $_[0]->{_element_class} || 'HTML::DOM::_Element';
1599}
1600
1601#--------------------------------------------------------------------------
1602
1603sub guts {
1604    my @out;
1605    my @stack       = ( $_[0] );
1606    my $destructive = $_[1];
1607    my $this;
1608    while (@stack) {
1609        $this = shift @stack;
1610        if ( !ref $this ) {
1611            push @out, $this;    # yes, it can include text nodes
1612        }
1613        elsif ( !$this->{'_implicit'} ) {
1614            push @out, $this;
1615            delete $this->{'_parent'} if $destructive;
1616        }
1617        else {
1618
1619            # it's an implicit node.  Delete it and recurse
1620            delete $this->{'_parent'} if $destructive;
1621            unshift @stack,
1622                @{
1623                (   $destructive
1624                    ? delete( $this->{'_content'} )
1625                    : $this->{'_content'}
1626                    )
1627                    || []
1628                };
1629        }
1630    }
1631
1632    # Doesn't call a real $root->delete on the (when implicit) root,
1633    #  but I don't think it needs to.
1634
1635    return @out if wantarray;    # one simple normal case.
1636    return unless @out;
1637    return $out[0] if @out == 1 and ref( $out[0] );
1638    my $x = HTML::DOM::_Element->new( 'div', '_implicit' => 1 );
1639    $x->push_content(@out);
1640    return $x;
1641}
1642
1643sub disembowel { $_[0]->guts(1) }
1644
1645#--------------------------------------------------------------------------
16461;
1647
1648__END__
1649
1650