1# This is a fork of HTML::Element. Eventually the code may be merged. 2 3package HTML::DOM::_TreeBuilder; 4 5use warnings; 6use strict; 7use integer; # vroom vroom! 8use Carp (); 9use vars qw(@ISA $VERSION $DEBUG); 10 11#--------------------------------------------------------------------------- 12# Make a 'DEBUG' constant... 13 14BEGIN { 15 16 # We used to have things like 17 # print $indent, "lalala" if $Debug; 18 # But there were an awful lot of having to evaluate $Debug's value. 19 # If we make that depend on a constant, like so: 20 # sub DEBUG () { 1 } # or whatever value. 21 # ... 22 # print $indent, "lalala" if DEBUG; 23 # Which at compile-time (thru the miracle of constant folding) turns into: 24 # print $indent, "lalala"; 25 # or, if DEBUG is a constant with a true value, then that print statement 26 # is simply optimized away, and doesn't appear in the target code at all. 27 # If you don't believe me, run: 28 # perl -MO=Deparse,-uHTML::DOM::_TreeBuilder -e 'BEGIN { \ 29 # $HTML::DOM::_TreeBuilder::DEBUG = 4} use HTML::DOM::_TreeBuilder' 30 # and see for yourself (substituting whatever value you want for $DEBUG 31 # there). 32## no critic 33 if ( defined &DEBUG ) { 34 35 # Already been defined! Do nothing. 36 } 37 elsif ( $] < 5.00404 ) { 38 39 # Grudgingly accomodate ancient (pre-constant) versions. 40 eval 'sub DEBUG { $Debug } '; 41 } 42 elsif ( !$DEBUG ) { 43 eval 'sub DEBUG () {0}'; # Make it a constant. 44 } 45 elsif ( $DEBUG =~ m<^\d+$>s ) { 46 eval 'sub DEBUG () { ' . $DEBUG . ' }'; # Make THAT a constant. 47 } 48 else { # WTF? 49 warn "Non-numeric value \"$DEBUG\" in \$HTML::DOM::_Element::DEBUG"; 50 eval 'sub DEBUG () { $DEBUG }'; # I guess. 51 } 52## use critic 53} 54 55#--------------------------------------------------------------------------- 56 57use HTML::Entities (); 58use HTML::Tagset 3.02 (); 59 60use HTML::DOM::_Element (); 61use HTML::Parser (); 62@ISA = qw(HTML::DOM::_Element HTML::Parser); 63$VERSION = 4.2001; 64 65# This looks schizoid, I know. 66# It's not that we ARE an element AND a parser. 67# We ARE an element, but one that knows how to handle signals 68# (method calls) from Parser in order to elaborate its subtree. 69 70# Legacy aliases: 71*HTML::DOM::_TreeBuilder::isKnown = \%HTML::Tagset::isKnown; 72*HTML::DOM::_TreeBuilder::canTighten = \%HTML::Tagset::canTighten; 73*HTML::DOM::_TreeBuilder::isHeadElement = \%HTML::Tagset::isHeadElement; 74*HTML::DOM::_TreeBuilder::isBodyElement = \%HTML::Tagset::isBodyElement; 75*HTML::DOM::_TreeBuilder::isPhraseMarkup = \%HTML::Tagset::isPhraseMarkup; 76*HTML::DOM::_TreeBuilder::isHeadOrBodyElement = \%HTML::Tagset::isHeadOrBodyElement; 77*HTML::DOM::_TreeBuilder::isList = \%HTML::Tagset::isList; 78*HTML::DOM::_TreeBuilder::isTableElement = \%HTML::Tagset::isTableElement; 79*HTML::DOM::_TreeBuilder::isFormElement = \%HTML::Tagset::isFormElement; 80*HTML::DOM::_TreeBuilder::p_closure_barriers = \@HTML::Tagset::p_closure_barriers; 81 82#========================================================================== 83# Two little shortcut constructors: 84 85sub new_from_file { # or from a FH 86 my $class = shift; 87 Carp::croak("new_from_file takes only one argument") 88 unless @_ == 1; 89 Carp::croak("new_from_file is a class method only") 90 if ref $class; 91 my $new = $class->new(); 92 $new->parse_file( $_[0] ); 93 return $new; 94} 95 96sub new_from_content { # from any number of scalars 97 my $class = shift; 98 Carp::croak("new_from_content is a class method only") 99 if ref $class; 100 my $new = $class->new(); 101 foreach my $whunk (@_) { 102 if ( ref($whunk) eq 'SCALAR' ) { 103 $new->parse($$whunk); 104 } 105 else { 106 $new->parse($whunk); 107 } 108 last if $new->{'_stunted'}; # might as well check that. 109 } 110 $new->eof(); 111 return $new; 112} 113 114# TODO: document more fully? 115sub parse_content { # from any number of scalars 116 my $tree = shift; 117 my $retval; 118 foreach my $whunk (@_) { 119 if ( ref($whunk) eq 'SCALAR' ) { 120 $retval = $tree->parse($$whunk); 121 } 122 else { 123 $retval = $tree->parse($whunk); 124 } 125 last if $tree->{'_stunted'}; # might as well check that. 126 } 127 $tree->eof(); 128 return $retval; 129} 130 131#--------------------------------------------------------------------------- 132 133sub new { # constructor! 134 my $class = shift; 135 $class = ref($class) || $class; 136 137 # Initialize HTML::DOM::_Element part 138 my $self = $class->element_class->new('html'); 139 140 { 141 142 # A hack for certain strange versions of Parser: 143 my $other_self = HTML::Parser->new(); 144 %$self = ( %$self, %$other_self ); # copy fields 145 # Yes, multiple inheritance is messy. Kids, don't try this at home. 146 bless $other_self, "HTML::DOM::_TreeBuilder::_hideyhole"; 147 148 # whack it out of the HTML::Parser class, to avoid the destructor 149 } 150 151 # The root of the tree is special, as it has these funny attributes, 152 # and gets reblessed into this class. 153 154 # Initialize parser settings 155 $self->{'_implicit_tags'} = 1; 156 $self->{'_implicit_body_p_tag'} = 0; 157 158 # If true, trying to insert text, or any of %isPhraseMarkup right 159 # under 'body' will implicate a 'p'. If false, will just go there. 160 161 $self->{'_tighten'} = 1; 162 163 # whether ignorable WS in this tree should be deleted 164 165 $self->{'_implicit'} = 1; # to delete, once we find a real open-"html" tag 166 167 $self->{'_ignore_unknown'} = 1; 168 $self->{'_ignore_text'} = 0; 169 $self->{'_warn'} = 0; 170 $self->{'_no_space_compacting'} = 0; 171 $self->{'_store_comments'} = 0; 172 $self->{'_store_declarations'} = 1; 173 $self->{'_store_pis'} = 0; 174 $self->{'_p_strict'} = 0; 175 $self->{'_no_expand_entities'} = 0; 176 177 # Parse attributes passed in as arguments 178 if (@_) { 179 my %attr = @_; 180 for ( keys %attr ) { 181 $self->{"_$_"} = $attr{$_}; 182 } 183 } 184 185 $HTML::DOM::_Element::encoded_content = $self->{'_no_expand_entities'}; 186 187 # rebless to our class 188 bless $self, $class; 189 190 $self->{'_element_count'} = 1; 191 192 # undocumented, informal, and maybe not exactly correct 193 194 $self->{'_head'} = $self->insert_element( 'head', 1 ); 195 $self->{'_pos'} = undef; # pull it back up 196 $self->{'_body'} = $self->insert_element( 'body', 1 ); 197 $self->{'_pos'} = undef; # pull it back up again 198 199 return $self; 200} 201 202#========================================================================== 203 204sub _elem # universal accessor... 205{ 206 my ( $self, $elem, $val ) = @_; 207 my $old = $self->{$elem}; 208 $self->{$elem} = $val if defined $val; 209 return $old; 210} 211 212# accessors.... 213sub implicit_tags { shift->_elem( '_implicit_tags', @_ ); } 214sub implicit_body_p_tag { shift->_elem( '_implicit_body_p_tag', @_ ); } 215sub p_strict { shift->_elem( '_p_strict', @_ ); } 216sub no_space_compacting { shift->_elem( '_no_space_compacting', @_ ); } 217sub ignore_unknown { shift->_elem( '_ignore_unknown', @_ ); } 218sub ignore_text { shift->_elem( '_ignore_text', @_ ); } 219sub ignore_ignorable_whitespace { shift->_elem( '_tighten', @_ ); } 220sub store_comments { shift->_elem( '_store_comments', @_ ); } 221sub store_declarations { shift->_elem( '_store_declarations', @_ ); } 222sub store_pis { shift->_elem( '_store_pis', @_ ); } 223sub warn { shift->_elem( '_warn', @_ ); } 224 225sub no_expand_entities { 226 shift->_elem( '_no_expand_entities', @_ ); 227 $HTML::DOM::_Element::encoded_content = @_; 228} 229 230#========================================================================== 231 232sub warning { 233 my $self = shift; 234 CORE::warn("HTML::Parse: $_[0]\n") if $self->{'_warn'}; 235 236 # should maybe say HTML::DOM::_TreeBuilder instead 237} 238 239#========================================================================== 240 241{ 242 243 # To avoid having to rebuild these lists constantly... 244 my $_Closed_by_structurals = [qw(p h1 h2 h3 h4 h5 h6 pre textarea)]; 245 my $indent; 246 247 sub start { 248 return if $_[0]{'_stunted'}; 249 250 # Accept a signal from HTML::Parser for start-tags. 251 my ( $self, $tag, $attr ) = @_; 252 253 # Parser passes more, actually: 254 # $self->start($tag, $attr, $attrseq, $origtext) 255 # But we can merrily ignore $attrseq and $origtext. 256 257 if ( $tag eq 'x-html' ) { 258 print "Ignoring open-x-html tag.\n" if DEBUG; 259 260 # inserted by some lame code-generators. 261 return; # bypass tweaking. 262 } 263 264 $tag =~ s{/$}{}s; # So <b/> turns into <b>. Silently forgive. 265 266 unless ( $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { 267 DEBUG and print "Start-tag name $tag is no good. Skipping.\n"; 268 return; 269 270 # This avoids having Element's new() throw an exception. 271 } 272 273 my $ptag = ( my $pos = $self->{'_pos'} || $self )->{'_tag'}; 274 my $already_inserted; 275 276 #my($indent); 277 if (DEBUG) { 278 279 # optimization -- don't figure out indenting unless we're in debug mode 280 my @lineage = $pos->lineage; 281 $indent = ' ' x ( 1 + @lineage ); 282 print $indent, "Proposing a new \U$tag\E under ", 283 join( '/', map $_->{'_tag'}, reverse( $pos, @lineage ) ) 284 || 'Root', 285 ".\n"; 286 287 #} else { 288 # $indent = ' '; 289 } 290 291 #print $indent, "POS: $pos ($ptag)\n" if DEBUG > 2; 292 # $attr = {%$attr}; 293 294 foreach my $k ( keys %$attr ) { 295 296 # Make sure some stooge doesn't have "<span _content='pie'>". 297 # That happens every few million Web pages. 298 $attr->{ ' ' . $k } = delete $attr->{$k} 299 if length $k and substr( $k, 0, 1 ) eq '_'; 300 301 # Looks bad, but is fine for round-tripping. 302 } 303 304 my $e = $self->element_class->new( $tag, %$attr ); 305 306 # Make a new element object. 307 # (Only rarely do we end up just throwing it away later in this call.) 308 309 # Some prep -- custom messiness for those damned tables, and strict P's. 310 if ( $self->{'_implicit_tags'} ) { # wallawallawalla! 311 312 unless ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) { 313 if ( $ptag eq 'table' ) { 314 print $indent, 315 " * Phrasal \U$tag\E right under TABLE makes implicit TR and TD\n" 316 if DEBUG > 1; 317 $self->insert_element( 'tr', 1 ); 318 $pos = $self->insert_element( 'td', 1 ) 319 ; # yes, needs updating 320 } 321 elsif ( $ptag eq 'tr' ) { 322 print $indent, 323 " * Phrasal \U$tag\E right under TR makes an implicit TD\n" 324 if DEBUG > 1; 325 $pos = $self->insert_element( 'td', 1 ) 326 ; # yes, needs updating 327 } 328 $ptag = $pos->{'_tag'}; # yes, needs updating 329 } 330 331 # end of table-implication block. 332 333 # Now maybe do a little dance to enforce P-strictness. 334 # This seems like it should be integrated with the big 335 # "ALL HOPE..." block, further below, but that doesn't 336 # seem feasable. 337 if ( $self->{'_p_strict'} 338 and $HTML::DOM::_TreeBuilder::isKnown{$tag} 339 and not $HTML::Tagset::is_Possible_Strict_P_Content{$tag} ) 340 { 341 my $here = $pos; 342 my $here_tag = $ptag; 343 while (1) { 344 if ( $here_tag eq 'p' ) { 345 print $indent, " * Inserting $tag closes strict P.\n" 346 if DEBUG > 1; 347 $self->end( \q{p} ); 348 349 # NB: same as \'q', but less confusing to emacs cperl-mode 350 last; 351 } 352 353 #print("Lasting from $here_tag\n"), 354 last 355 if $HTML::DOM::_TreeBuilder::isKnown{$here_tag} 356 and 357 not $HTML::Tagset::is_Possible_Strict_P_Content{ 358 $here_tag}; 359 360 # Don't keep looking up the tree if we see something that can't 361 # be strict-P content. 362 363 $here_tag 364 = ( $here = $here->{'_parent'} || last )->{'_tag'}; 365 } # end while 366 $ptag = ( $pos = $self->{'_pos'} || $self ) 367 ->{'_tag'}; # better update! 368 } 369 370 # end of strict-p block. 371 } 372 373 # And now, get busy... 374 #---------------------------------------------------------------------- 375 if ( !$self->{'_implicit_tags'} ) { # bimskalabim 376 # do nothing 377 print $indent, " * _implicit_tags is off. doing nothing\n" 378 if DEBUG > 1; 379 380 #---------------------------------------------------------------------- 381 } 382 elsif ( $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$tag} ) { 383 if ( $pos->is_inside('body') ) { # all is well 384 print $indent, 385 " * ambilocal element \U$tag\E is fine under BODY.\n" 386 if DEBUG > 1; 387 } 388 elsif ( $pos->is_inside('head') ) { 389 print $indent, 390 " * ambilocal element \U$tag\E is fine under HEAD.\n" 391 if DEBUG > 1; 392 } 393 else { 394 395 # In neither head nor body! mmmmm... put under head? 396 397 if ( $ptag eq 'html' ) { # expected case 398 # TODO?? : would there ever be a case where _head would be 399 # absent from a tree that would ever be accessed at this 400 # point? 401 die "Where'd my head go?" unless ref $self->{'_head'}; 402 if ( $self->{'_head'}{'_implicit'} ) { 403 print $indent, 404 " * ambilocal element \U$tag\E makes an implicit HEAD.\n" 405 if DEBUG > 1; 406 407 # or rather, points us at it. 408 $self->{'_pos'} 409 = $self->{'_head'}; # to insert under... 410 } 411 else { 412 $self->warning( 413 "Ambilocal element <$tag> not under HEAD or BODY!?" 414 ); 415 416 # Put it under HEAD by default, I guess 417 $self->{'_pos'} 418 = $self->{'_head'}; # to insert under... 419 } 420 421 } 422 else { 423 424 # Neither under head nor body, nor right under html... pass thru? 425 $self->warning( 426 "Ambilocal element <$tag> neither under head nor body, nor right under html!?" 427 ); 428 } 429 } 430 431 #---------------------------------------------------------------------- 432 } 433 elsif ( $HTML::DOM::_TreeBuilder::isBodyElement{$tag} ) { 434 435 # Ensure that we are within <body> 436 if ( $ptag eq 'body' ) { 437 438 # We're good. 439 } 440 elsif ( 441 $HTML::DOM::_TreeBuilder::isBodyElement{$ptag} # glarg 442 and not $HTML::DOM::_TreeBuilder::isHeadOrBodyElement{$ptag} 443 ) 444 { 445 446 # Special case: Save ourselves a call to is_inside further down. 447 # If our $ptag is an isBodyElement element (but not an 448 # isHeadOrBodyElement element), then we must be under body! 449 print $indent, " * Inferring that $ptag is under BODY.\n", 450 if DEBUG > 3; 451 452 # I think this and the test for 'body' trap everything 453 # bodyworthy, except the case where the parent element is 454 # under an unknown element that's a descendant of body. 455 } 456 elsif ( $pos->is_inside('head') ) { 457 print $indent, 458 " * body-element \U$tag\E minimizes HEAD, makes implicit BODY.\n" 459 if DEBUG > 1; 460 $ptag = ( 461 $pos = $self->{'_pos'} 462 = $self->{'_body'} # yes, needs updating 463 || die "Where'd my body go?" 464 )->{'_tag'}; # yes, needs updating 465 } 466 elsif ( !$pos->is_inside('body') ) { 467 print $indent, 468 " * body-element \U$tag\E makes implicit BODY.\n" 469 if DEBUG > 1; 470 $ptag = ( 471 $pos = $self->{'_pos'} 472 = $self->{'_body'} # yes, needs updating 473 || die "Where'd my body go?" 474 )->{'_tag'}; # yes, needs updating 475 } 476 477 # else we ARE under body, so okay. 478 479 # Handle implicit endings and insert based on <tag> and position 480 # ... ALL HOPE ABANDON ALL YE WHO ENTER HERE ... 481 if ( $tag eq 'p' 482 or $tag eq 'h1' 483 or $tag eq 'h2' 484 or $tag eq 'h3' 485 or $tag eq 'h4' 486 or $tag eq 'h5' 487 or $tag eq 'h6' 488 or $tag eq 'form' 489 490 # Hm, should <form> really be here?! 491 ) 492 { 493 494 # Can't have <p>, <h#> or <form> inside these 495 $self->end( 496 $_Closed_by_structurals, 497 @HTML::DOM::_TreeBuilder::p_closure_barriers 498 499 # used to be just li! 500 ); 501 502 } 503 elsif ( $tag eq 'ol' or $tag eq 'ul' or $tag eq 'dl' ) { 504 505 # Can't have lists inside <h#> -- in the unlikely 506 # event anyone tries to put them there! 507 if ( $ptag eq 'h1' 508 or $ptag eq 'h2' 509 or $ptag eq 'h3' 510 or $ptag eq 'h4' 511 or $ptag eq 'h5' 512 or $ptag eq 'h6' ) 513 { 514 $self->end( \$ptag ); 515 } 516 517 # TODO: Maybe keep closing up the tree until 518 # the ptag isn't any of the above? 519 # But anyone that says <h1><h2><ul>... 520 # deserves what they get anyway. 521 522 } 523 elsif ( $tag eq 'li' ) { # list item 524 # Get under a list tag, one way or another 525 unless ( 526 exists $HTML::DOM::_TreeBuilder::isList{$ptag} 527 or $self->end( \q{*}, keys %HTML::DOM::_TreeBuilder::isList ) #' 528 ) 529 { 530 print $indent, 531 " * inserting implicit UL for lack of containing ", 532 join( '|', keys %HTML::DOM::_TreeBuilder::isList ), ".\n" 533 if DEBUG > 1; 534 $self->insert_element( 'ul', 1 ); 535 } 536 537 } 538 elsif ( $tag eq 'dt' or $tag eq 'dd' ) { 539 540 # Get under a DL, one way or another 541 unless ( $ptag eq 'dl' or $self->end( \q{*}, 'dl' ) ) { #' 542 print $indent, 543 " * inserting implicit DL for lack of containing DL.\n" 544 if DEBUG > 1; 545 $self->insert_element( 'dl', 1 ); 546 } 547 548 } 549 elsif ( $HTML::DOM::_TreeBuilder::isFormElement{$tag} ) { 550 if ($self->{ 551 '_ignore_formies_outside_form'} # TODO: document this 552 and not $pos->is_inside('form') 553 ) 554 { 555 print $indent, 556 " * ignoring \U$tag\E because not in a FORM.\n" 557 if DEBUG > 1; 558 return; # bypass tweaking. 559 } 560 if ( $tag eq 'option' ) { 561 562 # return unless $ptag eq 'select'; 563 $self->end( \q{option} ); 564 $ptag = ( $self->{'_pos'} || $self )->{'_tag'}; 565 unless ( $ptag eq 'select' or $ptag eq 'optgroup' ) { 566 print $indent, 567 " * \U$tag\E makes an implicit SELECT.\n" 568 if DEBUG > 1; 569 $pos = $self->insert_element( 'select', 1 ); 570 571 # but not a very useful select -- has no 'name' attribute! 572 # is $pos's value used after this? 573 } 574 } 575 } 576 elsif ( $HTML::DOM::_TreeBuilder::isTableElement{$tag} ) { 577 if ( !$pos->is_inside('table') ) { 578 print $indent, " * \U$tag\E makes an implicit TABLE\n" 579 if DEBUG > 1; 580 $self->insert_element( 'table', 1 ); 581 } 582 583 if ( $tag eq 'td' or $tag eq 'th' ) { 584 585 # Get under a tr one way or another 586 unless ( 587 $ptag eq 'tr' # either under a tr 588 or $self->end( \q{*}, 'tr', 589 'table' ) #or we can get under one 590 ) 591 { 592 print $indent, 593 " * \U$tag\E under \U$ptag\E makes an implicit TR\n" 594 if DEBUG > 1; 595 $self->insert_element( 'tr', 1 ); 596 597 # presumably pos's value isn't used after this. 598 } 599 } 600 else { 601 $self->end( \$tag, 'table' ); #' 602 } 603 604 # Hmm, I guess this is right. To work it out: 605 # tr closes any open tr (limited at a table) 606 # thead closes any open thead (limited at a table) 607 # tbody closes any open tbody (limited at a table) 608 # tfoot closes any open tfoot (limited at a table) 609 # colgroup closes any open colgroup (limited at a table) 610 # col can try, but will always fail, at the enclosing table, 611 # as col is empty, and therefore never open! 612 # But! 613 # td closes any open td OR th (limited at a table) 614 # th closes any open th OR td (limited at a table) 615 # ...implementable as "close to a tr, or make a tr" 616 617 } 618 elsif ( $HTML::DOM::_TreeBuilder::isPhraseMarkup{$tag} ) { 619 if ( $ptag eq 'body' and $self->{'_implicit_body_p_tag'} ) { 620 print 621 " * Phrasal \U$tag\E right under BODY makes an implicit P\n" 622 if DEBUG > 1; 623 $pos = $self->insert_element( 'p', 1 ); 624 625 # is $pos's value used after this? 626 } 627 } 628 629 # End of implicit endings logic 630 631 # End of "elsif ($HTML::DOM::_TreeBuilder::isBodyElement{$tag}" 632 #---------------------------------------------------------------------- 633 634 } 635 elsif ( $HTML::DOM::_TreeBuilder::isHeadElement{$tag} ) { 636 if ( $pos->is_inside('body') ) { 637 print $indent, " * head element \U$tag\E found inside BODY!\n" 638 if DEBUG; 639 $self->warning("Header element <$tag> in body"); # [sic] 640 } 641 elsif ( !$pos->is_inside('head') ) { 642 print $indent, 643 " * head element \U$tag\E makes an implicit HEAD.\n" 644 if DEBUG > 1; 645 } 646 else { 647 print $indent, 648 " * head element \U$tag\E goes inside existing HEAD.\n" 649 if DEBUG > 1; 650 } 651 $self->{'_pos'} = $self->{'_head'} || die "Where'd my head go?"; 652 653 #---------------------------------------------------------------------- 654 } 655 elsif ( $tag eq 'html' ) { 656 if ( delete $self->{'_implicit'} ) { # first time here 657 print $indent, " * good! found the real HTML element!\n" 658 if DEBUG > 1; 659 } 660 else { 661 print $indent, " * Found a second HTML element\n" 662 if DEBUG; 663 $self->warning("Found a nested <html> element"); 664 } 665 666 # in either case, migrate attributes to the real element 667 for ( keys %$attr ) { 668 $self->attr( $_, $attr->{$_} ); 669 } 670 $self->{'_pos'} = undef; 671 return $self; # bypass tweaking. 672 673 #---------------------------------------------------------------------- 674 } 675 elsif ( $tag eq 'head' ) { 676 my $head = $self->{'_head'} || die "Where'd my head go?"; 677 if ( delete $head->{'_implicit'} ) { # first time here 678 print $indent, " * good! found the real HEAD element!\n" 679 if DEBUG > 1; 680 } 681 else { # been here before 682 print $indent, " * Found a second HEAD element\n" 683 if DEBUG; 684 $self->warning("Found a second <head> element"); 685 } 686 687 # in either case, migrate attributes to the real element 688 for ( keys %$attr ) { 689 $head->attr( $_, $attr->{$_} ); 690 } 691 return $self->{'_pos'} = $head; # bypass tweaking. 692 693 #---------------------------------------------------------------------- 694 } 695 elsif ( $tag eq 'body' ) { 696 my $body = $self->{'_body'} || die "Where'd my body go?"; 697 if ( delete $body->{'_implicit'} ) { # first time here 698 print $indent, " * good! found the real BODY element!\n" 699 if DEBUG > 1; 700 } 701 else { # been here before 702 print $indent, " * Found a second BODY element\n" 703 if DEBUG; 704 $self->warning("Found a second <body> element"); 705 } 706 707 # in either case, migrate attributes to the real element 708 for ( keys %$attr ) { 709 $body->attr( $_, $attr->{$_} ); 710 } 711 $self->{'_pos'} = $body unless $pos->is_inside('body'); 712 return $body; # bypass tweaking. 713 714 #---------------------------------------------------------------------- 715 } 716 elsif ( $tag eq 'frameset' ) { 717 if (!( $self->{'_frameset_seen'}++ ) # first frameset seen 718 and !$self->{'_noframes_seen'} 719 720 # otherwise it'll be under the noframes already 721 and !$self->is_inside('body') 722 ) 723 { 724 725 # The following is a bit of a hack. We don't use the normal 726 # insert_element because 1) we don't want it as _pos, but instead 727 # right under $self, and 2), more importantly, that we don't want 728 # this inserted at the /end/ of $self's content_list, but instead 729 # in the middle of it, specifiaclly right before the body element. 730 # 731 my $c = $self->{'_content'} || die "Contentless root?"; 732 my $body = $self->{'_body'} || die "Where'd my BODY go?"; 733 for ( my $i = 0; $i < @$c; ++$i ) { 734 if ( $c->[$i] eq $body ) { 735 splice( @$c, $i, 0, $self->{'_pos'} = $pos = $e ); 736 $e->{'_parent'} = $self; 737 $already_inserted = 1; 738 print $indent, 739 " * inserting 'frameset' right before BODY.\n" 740 if DEBUG > 1; 741 last; 742 } 743 } 744 die "BODY not found in children of root?" 745 unless $already_inserted; 746 } 747 748 } 749 elsif ( $tag eq 'frame' ) { 750 751 # Okay, fine, pass thru. 752 # Should probably enforce that these should be under a frameset. 753 # But hey. Ditto for enforcing that 'noframes' should be under 754 # a 'frameset', as the DTDs say. 755 756 } 757 elsif ( $tag eq 'noframes' ) { 758 759 # This basically assumes there'll be exactly one 'noframes' element 760 # per document. At least, only the first one gets to have the 761 # body under it. And if there are no noframes elements, then 762 # the body pretty much stays where it is. Is that ever a problem? 763 if ( $self->{'_noframes_seen'}++ ) { 764 print $indent, " * ANOTHER noframes element?\n" if DEBUG; 765 } 766 else { 767 if ( $pos->is_inside('body') ) { 768 print $indent, " * 'noframes' inside 'body'. Odd!\n" 769 if DEBUG; 770 771 # In that odd case, we /can't/ make body a child of 'noframes', 772 # because it's an ancestor of the 'noframes'! 773 } 774 else { 775 $e->push_content( $self->{'_body'} 776 || die "Where'd my body go?" ); 777 print $indent, " * Moving body to be under noframes.\n" 778 if DEBUG; 779 } 780 } 781 782 #---------------------------------------------------------------------- 783 } 784 else { 785 786 # unknown tag 787 if ( $self->{'_ignore_unknown'} ) { 788 print $indent, " * Ignoring unknown tag \U$tag\E\n" if DEBUG; 789 $self->warning("Skipping unknown tag $tag"); 790 return; 791 } 792 else { 793 print $indent, " * Accepting unknown tag \U$tag\E\n" 794 if DEBUG; 795 } 796 } 797 798 #---------------------------------------------------------------------- 799 # End of mumbo-jumbo 800 801 print $indent, "(Attaching ", $e->{'_tag'}, " under ", 802 ( $self->{'_pos'} || $self )->{'_tag'}, ")\n" 803 804 # because if _pos isn't defined, it goes under self 805 if DEBUG; 806 807 # The following if-clause is to delete /some/ ignorable whitespace 808 # nodes, as we're making the tree. 809 # This'd be a node we'd catch later anyway, but we might as well 810 # nip it in the bud now. 811 # This doesn't catch /all/ deletable WS-nodes, so we do have to call 812 # the tightener later to catch the rest. 813 814 if ( $self->{'_tighten'} and !$self->{'_ignore_text'} ) 815 { # if tightenable 816 my ( $sibs, $par ); 817 if (( $sibs = ( $par = $self->{'_pos'} || $self )->{'_content'} ) 818 and @$sibs # parent already has content 819 and ! 820 ref( $sibs->[-1] ) # and the last one there is a text node 821 and $sibs->[-1] !~ m<[^\n\r\f\t ]>s # and it's all whitespace 822 823 and ( # one of these has to be eligible... 824 $HTML::DOM::_TreeBuilder::canTighten{$tag} 825 or (( @$sibs == 1 ) 826 ? # WS is leftmost -- so parent matters 827 $HTML::DOM::_TreeBuilder::canTighten{ $par->{'_tag'} } 828 : # WS is after another node -- it matters 829 ( ref $sibs->[-2] 830 and 831 $HTML::DOM::_TreeBuilder::canTighten{ $sibs->[-2] 832 {'_tag'} } 833 ) 834 ) 835 ) 836 837 and !$par->is_inside( 'pre', 'xmp', 'textarea', 'plaintext' ) 838 839 # we're clear 840 ) 841 { 842 pop @$sibs; 843 print $indent, "Popping a preceding all-WS node\n" if DEBUG; 844 } 845 } 846 847 $self->insert_element($e) unless $already_inserted; 848 849 if (DEBUG) { 850 if ( $self->{'_pos'} ) { 851 print $indent, "(Current lineage of pos: \U$tag\E under ", 852 join( 853 '/', 854 reverse( 855 856 # $self->{'_pos'}{'_tag'}, # don't list myself! 857 $self->{'_pos'}->lineage_tag_names 858 ) 859 ), 860 ".)\n"; 861 } 862 else { 863 print $indent, "(Pos points nowhere!?)\n"; 864 } 865 } 866 867 unless ( ( $self->{'_pos'} || '' ) eq $e ) { 868 869 # if it's an empty element -- i.e., if it didn't change the _pos 870 &{ $self->{"_tweak_$tag"} 871 || $self->{'_tweak_*'} 872 || return $e }( map $_, $e, $tag, $self ) 873 ; # make a list so the user can't clobber 874 } 875 876 return $e; 877 } 878} 879 880#========================================================================== 881 882{ 883 my $indent; 884 885 sub end { 886 return if $_[0]{'_stunted'}; 887 888 # Either: Acccept an end-tag signal from HTML::Parser 889 # Or: Method for closing currently open elements in some fairly complex 890 # way, as used by other methods in this class. 891 my ( $self, $tag, @stop ) = @_; 892 if ( $tag eq 'x-html' ) { 893 print "Ignoring close-x-html tag.\n" if DEBUG; 894 895 # inserted by some lame code-generators. 896 return; 897 } 898 899 unless ( ref($tag) or $tag =~ m/^[-_a-zA-Z0-9:%]+$/s ) { 900 DEBUG and print "End-tag name $tag is no good. Skipping.\n"; 901 return; 902 903 # This avoids having Element's new() throw an exception. 904 } 905 906 # This method accepts two calling formats: 907 # 1) from Parser: $self->end('tag_name', 'origtext') 908 # in which case we shouldn't mistake origtext as a blocker tag 909 # 2) from myself: $self->end(\q{tagname1}, 'blk1', ... ) 910 # from myself: $self->end(['tagname1', 'tagname2'], 'blk1', ... ) 911 912 # End the specified tag, but don't move above any of the blocker tags. 913 # The tag can also be a reference to an array. Terminate the first 914 # tag found. 915 916 my $ptag = ( my $p = $self->{'_pos'} || $self )->{'_tag'}; 917 918 # $p and $ptag are sort-of stratch 919 920 if ( ref($tag) ) { 921 922 # First param is a ref of one sort or another -- 923 # THE CALL IS COMING FROM INSIDE THE HOUSE! 924 $tag = $$tag if ref($tag) eq 'SCALAR'; 925 926 # otherwise it's an arrayref. 927 } 928 else { 929 930 # the call came from Parser -- just ignore origtext 931 # except in a table ignore unmatched table tags RT #59980 932 @stop = $tag =~ /^t[hdr]\z/ ? 'table' : (); 933 } 934 935 #my($indent); 936 if (DEBUG) { 937 938 # optimization -- don't figure out depth unless we're in debug mode 939 my @lineage_tags = $p->lineage_tag_names; 940 $indent = ' ' x ( 1 + @lineage_tags ); 941 942 # now announce ourselves 943 print $indent, "Ending ", 944 ref($tag) ? ( '[', join( ' ', @$tag ), ']' ) : "\U$tag\E", 945 scalar(@stop) 946 ? ( " no higher than [", join( ' ', @stop ), "]" ) 947 : (), ".\n"; 948 949 print $indent, " (Current lineage: ", join( '/', @lineage_tags ), 950 ".)\n" 951 if DEBUG > 1; 952 953 if ( DEBUG > 3 ) { 954 955 #my( 956 # $package, $filename, $line, $subroutine, 957 # $hasargs, $wantarray, $evaltext, $is_require) = caller; 958 print $indent, 959 " (Called from ", ( caller(1) )[3], ' line ', 960 ( caller(1) )[2], 961 ")\n"; 962 } 963 964 #} else { 965 # $indent = ' '; 966 } 967 968 # End of if DEBUG 969 970 # Now actually do it 971 my @to_close; 972 if ( $tag eq '*' ) { 973 974 # Special -- close everything up to (but not including) the first 975 # limiting tag, or return if none found. Somewhat of a special case. 976 PARENT: 977 while ( defined $p ) { 978 $ptag = $p->{'_tag'}; 979 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; 980 for (@stop) { 981 if ( $ptag eq $_ ) { 982 print $indent, 983 " (Hit a $_; closing everything up to here.)\n" 984 if DEBUG > 2; 985 last PARENT; 986 } 987 } 988 push @to_close, $p; 989 $p = $p->{'_parent'}; # no match so far? keep moving up 990 print $indent, 991 " (Moving on up to ", $p ? $p->{'_tag'} : 'nil', ")\n" 992 if DEBUG > 1; 993 } 994 unless ( defined $p ) { # We never found what we were looking for. 995 print $indent, " (We never found a limit.)\n" if DEBUG > 1; 996 return; 997 } 998 999 #print 1000 # $indent, 1001 # " (To close: ", join('/', map $_->tag, @to_close), ".)\n" 1002 # if DEBUG > 4; 1003 1004 # Otherwise update pos and fall thru. 1005 $self->{'_pos'} = $p; 1006 } 1007 elsif ( ref $tag ) { 1008 1009 # Close the first of any of the matching tags, giving up if you hit 1010 # any of the stop-tags. 1011 PARENT: 1012 while ( defined $p ) { 1013 $ptag = $p->{'_tag'}; 1014 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; 1015 for (@$tag) { 1016 if ( $ptag eq $_ ) { 1017 print $indent, " (Closing $_.)\n" if DEBUG > 2; 1018 last PARENT; 1019 } 1020 } 1021 for (@stop) { 1022 if ( $ptag eq $_ ) { 1023 print $indent, 1024 " (Hit a limiting $_ -- bailing out.)\n" 1025 if DEBUG > 1; 1026 return; # so it was all for naught 1027 } 1028 } 1029 push @to_close, $p; 1030 $p = $p->{'_parent'}; 1031 } 1032 return unless defined $p; # We went off the top of the tree. 1033 # Otherwise specified element was found; set pos to its parent. 1034 push @to_close, $p; 1035 $self->{'_pos'} = $p->{'_parent'}; 1036 } 1037 else { 1038 1039 # Close the first of the specified tag, giving up if you hit 1040 # any of the stop-tags. 1041 while ( defined $p ) { 1042 $ptag = $p->{'_tag'}; 1043 print $indent, " (Looking at $ptag.)\n" if DEBUG > 2; 1044 if ( $ptag eq $tag ) { 1045 print $indent, " (Closing $tag.)\n" if DEBUG > 2; 1046 last; 1047 } 1048 for (@stop) { 1049 if ( $ptag eq $_ ) { 1050 print $indent, 1051 " (Hit a limiting $_ -- bailing out.)\n" 1052 if DEBUG > 1; 1053 return; # so it was all for naught 1054 } 1055 } 1056 push @to_close, $p; 1057 $p = $p->{'_parent'}; 1058 } 1059 return unless defined $p; # We went off the top of the tree. 1060 # Otherwise specified element was found; set pos to its parent. 1061 push @to_close, $p; 1062 $self->{'_pos'} = $p->{'_parent'}; 1063 } 1064 1065 $self->{'_pos'} = undef if $self eq ( $self->{'_pos'} || '' ); 1066 print $indent, "(Pos now points to ", 1067 $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : '???', ".)\n" 1068 if DEBUG > 1; 1069 1070 ### EXPENSIVE, because has to check that it's not under a pre 1071 ### or a CDATA-parent. That's one more method call per end()! 1072 ### Might as well just do this at the end of the tree-parse, I guess, 1073 ### at which point we'd be parsing top-down, and just not traversing 1074 ### under pre's or CDATA-parents. 1075 ## 1076 ## Take this opportunity to nix any terminal whitespace nodes. 1077 ## TODO: consider whether this (plus the logic in start(), above) 1078 ## would ever leave any WS nodes in the tree. 1079 ## If not, then there's no reason to have eof() call 1080 ## delete_ignorable_whitespace on the tree, is there? 1081 ## 1082 #if(@to_close and $self->{'_tighten'} and !$self->{'_ignore_text'} and 1083 # ! $to_close[-1]->is_inside('pre', keys %HTML::Tagset::isCDATA_Parent) 1084 #) { # if tightenable 1085 # my($children, $e_tag); 1086 # foreach my $e (reverse @to_close) { # going top-down 1087 # last if 'pre' eq ($e_tag = $e->{'_tag'}) or 1088 # $HTML::Tagset::isCDATA_Parent{$e_tag}; 1089 # 1090 # if( 1091 # $children = $e->{'_content'} 1092 # and @$children # has children 1093 # and !ref($children->[-1]) 1094 # and $children->[-1] =~ m<^\s+$>s # last node is all-WS 1095 # and 1096 # ( 1097 # # has a tightable parent: 1098 # $HTML::DOM::_TreeBuilder::canTighten{ $e_tag } 1099 # or 1100 # ( # has a tightenable left sibling: 1101 # @$children > 1 and 1102 # ref($children->[-2]) 1103 # and $HTML::DOM::_TreeBuilder::canTighten{ $children->[-2]{'_tag'} } 1104 # ) 1105 # ) 1106 # ) { 1107 # pop @$children; 1108 # #print $indent, "Popping a terminal WS node from ", $e->{'_tag'}, 1109 # # " (", $e->address, ") while exiting.\n" if DEBUG; 1110 # } 1111 # } 1112 #} 1113 1114 foreach my $e (@to_close) { 1115 1116 # Call the applicable callback, if any 1117 $ptag = $e->{'_tag'}; 1118 &{ $self->{"_tweak_$ptag"} 1119 || $self->{'_tweak_*'} 1120 || next }( map $_, $e, $ptag, $self ); 1121 print $indent, "Back from tweaking.\n" if DEBUG; 1122 last 1123 if $self->{ '_stunted' 1124 }; # in case one of the handlers called stunt 1125 } 1126 return @to_close; 1127 } 1128} 1129 1130#========================================================================== 1131{ 1132 my ( $indent, $nugget ); 1133 1134 sub text { 1135 return if $_[0]{'_stunted'}; 1136 1137 # Accept a "here's a text token" signal from HTML::Parser. 1138 my ( $self, $text, $is_cdata ) = @_; 1139 1140 # the >3.0 versions of Parser may pass a cdata node. 1141 # Thanks to Gisle Aas for pointing this out. 1142 1143 return unless length $text; # I guess that's always right 1144 1145 my $ignore_text = $self->{'_ignore_text'}; 1146 my $no_space_compacting = $self->{'_no_space_compacting'}; 1147 my $no_expand_entities = $self->{'_no_expand_entities'}; 1148 my $pos = $self->{'_pos'} || $self; 1149 1150 HTML::Entities::decode($text) 1151 unless $ignore_text 1152 || $is_cdata 1153 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} } 1154 || $no_expand_entities; 1155 1156 #my($indent, $nugget); 1157 if (DEBUG) { 1158 1159 # optimization -- don't figure out depth unless we're in debug mode 1160 my @lineage_tags = $pos->lineage_tag_names; 1161 $indent = ' ' x ( 1 + @lineage_tags ); 1162 1163 $nugget 1164 = ( length($text) <= 25 ) 1165 ? $text 1166 : ( substr( $text, 0, 25 ) . '...' ); 1167 $nugget =~ s<([\x00-\x1F])> 1168 <'\\x'.(unpack("H2",$1))>eg; 1169 print $indent, "Proposing a new text node ($nugget) under ", 1170 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) 1171 || 'Root', 1172 ".\n"; 1173 1174 #} else { 1175 # $indent = ' '; 1176 } 1177 1178 my $ptag; 1179 if ($HTML::Tagset::isCDATA_Parent{ $ptag = $pos->{'_tag'} } 1180 1181 #or $pos->is_inside('pre') 1182 or $pos->is_inside( 'pre', 'textarea' ) 1183 ) 1184 { 1185 return if $ignore_text; 1186 $pos->push_content($text); 1187 } 1188 else { 1189 1190 # return unless $text =~ /\S/; # This is sometimes wrong 1191 1192 if ( !$self->{'_implicit_tags'} || $text !~ /[^\n\r\f\t ]/ ) { 1193 1194 # don't change anything 1195 } 1196 elsif ( $ptag eq 'head' or $ptag eq 'noframes' ) { 1197 if ( $self->{'_implicit_body_p_tag'} ) { 1198 print $indent, 1199 " * Text node under \U$ptag\E closes \U$ptag\E, implicates BODY and P.\n" 1200 if DEBUG > 1; 1201 $self->end( \$ptag ); 1202 $pos = $self->{'_body'} 1203 ? ( $self->{'_pos'} 1204 = $self->{'_body'} ) # expected case 1205 : $self->insert_element( 'body', 1 ); 1206 $pos = $self->insert_element( 'p', 1 ); 1207 } 1208 else { 1209 print $indent, 1210 " * Text node under \U$ptag\E closes, implicates BODY.\n" 1211 if DEBUG > 1; 1212 $self->end( \$ptag ); 1213 $pos = $self->{'_body'} 1214 ? ( $self->{'_pos'} 1215 = $self->{'_body'} ) # expected case 1216 : $self->insert_element( 'body', 1 ); 1217 } 1218 } 1219 elsif ( $ptag eq 'html' ) { 1220 if ( $self->{'_implicit_body_p_tag'} ) { 1221 print $indent, 1222 " * Text node under HTML implicates BODY and P.\n" 1223 if DEBUG > 1; 1224 $pos = $self->{'_body'} 1225 ? ( $self->{'_pos'} 1226 = $self->{'_body'} ) # expected case 1227 : $self->insert_element( 'body', 1 ); 1228 $pos = $self->insert_element( 'p', 1 ); 1229 } 1230 else { 1231 print $indent, 1232 " * Text node under HTML implicates BODY.\n" 1233 if DEBUG > 1; 1234 $pos = $self->{'_body'} 1235 ? ( $self->{'_pos'} 1236 = $self->{'_body'} ) # expected case 1237 : $self->insert_element( 'body', 1 ); 1238 1239 #print "POS is $pos, ", $pos->{'_tag'}, "\n"; 1240 } 1241 } 1242 elsif ( $ptag eq 'body' ) { 1243 if ( $self->{'_implicit_body_p_tag'} ) { 1244 print $indent, " * Text node under BODY implicates P.\n" 1245 if DEBUG > 1; 1246 $pos = $self->insert_element( 'p', 1 ); 1247 } 1248 } 1249 elsif ( $ptag eq 'table' ) { 1250 print $indent, 1251 " * Text node under TABLE implicates TR and TD.\n" 1252 if DEBUG > 1; 1253 $self->insert_element( 'tr', 1 ); 1254 $pos = $self->insert_element( 'td', 1 ); 1255 1256 # double whammy! 1257 } 1258 elsif ( $ptag eq 'tr' ) { 1259 print $indent, " * Text node under TR implicates TD.\n" 1260 if DEBUG > 1; 1261 $pos = $self->insert_element( 'td', 1 ); 1262 } 1263 1264 # elsif ( 1265 # # $ptag eq 'li' || 1266 # # $ptag eq 'dd' || 1267 # $ptag eq 'form') { 1268 # $pos = $self->insert_element('p', 1); 1269 #} 1270 1271 # Whatever we've done above should have had the side 1272 # effect of updating $self->{'_pos'} 1273 1274 #print "POS is now $pos, ", $pos->{'_tag'}, "\n"; 1275 1276 return if $ignore_text; 1277 $text =~ s/[\n\r\f\t ]+/ /g # canonical space 1278 unless $no_space_compacting; 1279 1280 print $indent, " (Attaching text node ($nugget) under ", 1281 1282 # was: $self->{'_pos'} ? $self->{'_pos'}{'_tag'} : $self->{'_tag'}, 1283 $pos->{'_tag'}, ").\n" 1284 if DEBUG > 1; 1285 1286 $pos->push_content($text); 1287 } 1288 1289 &{ $self->{'_tweak_~text'} || return }( $text, $pos, 1290 $pos->{'_tag'} . '' ); 1291 1292 # Note that this is very exceptional -- it doesn't fall back to 1293 # _tweak_*, and it gives its tweak different arguments. 1294 return; 1295 } 1296} 1297 1298#========================================================================== 1299 1300# TODO: test whether comment(), declaration(), and process(), do the right 1301# thing as far as tightening and whatnot. 1302# Also, currently, doctypes and comments that appear before head or body 1303# show up in the tree in the wrong place. Something should be done about 1304# this. Tricky. Maybe this whole business of pre-making the body and 1305# whatnot is wrong. 1306 1307sub comment { 1308 return if $_[0]{'_stunted'}; 1309 1310 # Accept a "here's a comment" signal from HTML::Parser. 1311 1312 my ( $self, $text ) = @_; 1313 my $pos = $self->{'_pos'} || $self; 1314 return 1315 unless $self->{'_store_comments'} 1316 || $HTML::Tagset::isCDATA_Parent{ $pos->{'_tag'} }; 1317 1318 if (DEBUG) { 1319 my @lineage_tags = $pos->lineage_tag_names; 1320 my $indent = ' ' x ( 1 + @lineage_tags ); 1321 1322 my $nugget 1323 = ( length($text) <= 25 ) 1324 ? $text 1325 : ( substr( $text, 0, 25 ) . '...' ); 1326 $nugget =~ s<([\x00-\x1F])> 1327 <'\\x'.(unpack("H2",$1))>eg; 1328 print $indent, "Proposing a Comment ($nugget) under ", 1329 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', 1330 ".\n"; 1331 } 1332 1333 ( my $e = $self->element_class->new('~comment') )->{'text'} = $text; 1334 $pos->push_content($e); 1335 ++( $self->{'_element_count'} ); 1336 1337 &{ $self->{'_tweak_~comment'} 1338 || $self->{'_tweak_*'} 1339 || return $e }( map $_, $e, '~comment', $self ); 1340 1341 return $e; 1342} 1343 1344sub declaration { 1345 return if $_[0]{'_stunted'}; 1346 1347 # Accept a "here's a markup declaration" signal from HTML::Parser. 1348 1349 my ( $self, $text ) = @_; 1350 my $pos = $self->{'_pos'} || $self; 1351 1352 if (DEBUG) { 1353 my @lineage_tags = $pos->lineage_tag_names; 1354 my $indent = ' ' x ( 1 + @lineage_tags ); 1355 1356 my $nugget 1357 = ( length($text) <= 25 ) 1358 ? $text 1359 : ( substr( $text, 0, 25 ) . '...' ); 1360 $nugget =~ s<([\x00-\x1F])> 1361 <'\\x'.(unpack("H2",$1))>eg; 1362 print $indent, "Proposing a Declaration ($nugget) under ", 1363 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', 1364 ".\n"; 1365 } 1366 ( my $e = $self->element_class->new('~declaration') )->{'text'} = $text; 1367 1368 $self->{_decl} = $e; 1369 return $e; 1370} 1371 1372#========================================================================== 1373 1374sub process { 1375 return if $_[0]{'_stunted'}; 1376 1377 # Accept a "here's a PI" signal from HTML::Parser. 1378 1379 return unless $_[0]->{'_store_pis'}; 1380 my ( $self, $text ) = @_; 1381 my $pos = $self->{'_pos'} || $self; 1382 1383 if (DEBUG) { 1384 my @lineage_tags = $pos->lineage_tag_names; 1385 my $indent = ' ' x ( 1 + @lineage_tags ); 1386 1387 my $nugget 1388 = ( length($text) <= 25 ) 1389 ? $text 1390 : ( substr( $text, 0, 25 ) . '...' ); 1391 $nugget =~ s<([\x00-\x1F])> 1392 <'\\x'.(unpack("H2",$1))>eg; 1393 print $indent, "Proposing a PI ($nugget) under ", 1394 join( '/', reverse( $pos->{'_tag'}, @lineage_tags ) ) || 'Root', 1395 ".\n"; 1396 } 1397 ( my $e = $self->element_class->new('~pi') )->{'text'} = $text; 1398 $pos->push_content($e); 1399 ++( $self->{'_element_count'} ); 1400 1401 &{ $self->{'_tweak_~pi'} || $self->{'_tweak_*'} || return $e }( map $_, 1402 $e, '~pi', $self ); 1403 1404 return $e; 1405} 1406 1407#========================================================================== 1408 1409#When you call $tree->parse_file($filename), and the 1410#tree's ignore_ignorable_whitespace attribute is on (as it is 1411#by default), HTML::DOM::_TreeBuilder's logic will manage to avoid 1412#creating some, but not all, nodes that represent ignorable 1413#whitespace. However, at the end of its parse, it traverses the 1414#tree and deletes any that it missed. (It does this with an 1415#around-method around HTML::Parser's eof method.) 1416# 1417#However, with $tree->parse($content), the cleanup-traversal step 1418#doesn't happen automatically -- so when you're done parsing all 1419#content for a document (regardless of whether $content is the only 1420#bit, or whether it's just another chunk of content you're parsing into 1421#the tree), call $tree->eof() to signal that you're at the end of the 1422#text you're inputting to the tree. Besides properly cleaning any bits 1423#of ignorable whitespace from the tree, this will also ensure that 1424#HTML::Parser's internal buffer is flushed. 1425 1426sub eof { 1427 1428 # Accept an "end-of-file" signal from HTML::Parser, or thrown by the user. 1429 1430 return if $_[0]->{'_done'}; # we've already been here 1431 1432 return $_[0]->SUPER::eof() if $_[0]->{'_stunted'}; 1433 1434 my $x = $_[0]; 1435 print "EOF received.\n" if DEBUG; 1436 my (@rv); 1437 if (wantarray) { 1438 1439 # I don't think this makes any difference for this particular 1440 # method, but let's be scrupulous, for once. 1441 @rv = $x->SUPER::eof(); 1442 } 1443 else { 1444 $rv[0] = $x->SUPER::eof(); 1445 } 1446 1447 $x->end('html') unless $x eq ( $x->{'_pos'} || $x ); 1448 1449 # That SHOULD close everything, and will run the appropriate tweaks. 1450 # We /could/ be running under some insane mode such that there's more 1451 # than one HTML element, but really, that's just insane to do anyhow. 1452 1453 unless ( $x->{'_implicit_tags'} ) { 1454 1455 # delete those silly implicit head and body in case we put 1456 # them there in implicit tags mode 1457 foreach my $node ( $x->{'_head'}, $x->{'_body'} ) { 1458 $node->replace_with_content 1459 if defined $node 1460 and ref $node 1461 and $node->{'_implicit'} 1462 and $node->{'_parent'}; 1463 1464 # I think they should be empty anyhow, since the only 1465 # logic that'd insert under them can apply only, I think, 1466 # in the case where _implicit_tags is on 1467 } 1468 1469 # this may still leave an implicit 'html' at the top, but there's 1470 # nothing we can do about that, is there? 1471 } 1472 1473 $x->delete_ignorable_whitespace() 1474 1475 # this's why we trap this -- an after-method 1476 if $x->{'_tighten'} and !$x->{'_ignore_text'}; 1477 $x->{'_done'} = 1; 1478 1479 return @rv if wantarray; 1480 return $rv[0]; 1481} 1482 1483#========================================================================== 1484 1485# TODO: document 1486 1487sub stunt { 1488 my $self = $_[0]; 1489 print "Stunting the tree.\n" if DEBUG; 1490 $self->{'_done'} = 1; 1491 1492 if ( $HTML::Parser::VERSION < 3 ) { 1493 1494 #This is a MEAN MEAN HACK. And it works most of the time! 1495 $self->{'_buf'} = ''; 1496 my $fh = *HTML::Parser::F{IO}; 1497 1498 # the local'd FH used by parse_file loop 1499 if ( defined $fh ) { 1500 print "Closing Parser's filehandle $fh\n" if DEBUG; 1501 close($fh); 1502 } 1503 1504 # But if they called $tree->parse_file($filehandle) 1505 # or $tree->parse_file(*IO), then there will be no *HTML::Parser::F{IO} 1506 # to close. Ahwell. Not a problem for most users these days. 1507 1508 } 1509 else { 1510 $self->SUPER::eof(); 1511 1512 # Under 3+ versions, calling eof from inside a parse will abort the 1513 # parse / parse_file 1514 } 1515 1516 # In the off chance that the above didn't work, we'll throw 1517 # this flag to make any future events be no-ops. 1518 $self->stunted(1); 1519 return; 1520} 1521 1522# TODO: document 1523sub stunted { shift->_elem( '_stunted', @_ ); } 1524sub done { shift->_elem( '_done', @_ ); } 1525 1526#========================================================================== 1527 1528sub delete { 1529 1530 # Override Element's delete method. 1531 # This does most, if not all, of what Element's delete does anyway. 1532 # Deletes content, including content in some special attributes. 1533 # But doesn't empty out the hash. 1534 1535 $_[0]->{'_element_count'} = 1; # never hurts to be scrupulously correct 1536 1537 delete @{ $_[0] }{ '_body', '_head', '_pos' }; 1538 for ( 1539 @{ delete( $_[0]->{'_content'} ) || [] }, # all/any content 1540 1541 # delete @{$_[0]}{'_body', '_head', '_pos'} 1542 # ...and these, in case these elements don't appear in the 1543 # content, which is possible. If they did appear (as they 1544 # usually do), then calling $_->delete on them again is harmless. 1545 # I don't think that's such a hot idea now. Thru creative reattachment, 1546 # those could actually now point to elements in OTHER trees (which we do 1547 # NOT want to delete!). 1548## Reasoned out: 1549 # If these point to elements not in the content list of any element in this 1550 # tree, but not in the content list of any element in any OTHER tree, then 1551 # just deleting these will make their refcounts hit zero. 1552 # If these point to elements in the content lists of elements in THIS tree, 1553 # then we'll get to deleting them when we delete from the top. 1554 # If these point to elements in the content lists of elements in SOME OTHER 1555 # tree, then they're not to be deleted. 1556 ) 1557 { 1558 $_->delete 1559 if defined $_ and ref $_ # Make sure it's an object. 1560 and $_ ne $_[0]; # And avoid hitting myself, just in case! 1561 } 1562 1563 $_[0]->detach if $_[0]->{'_parent'} and $_[0]->{'_parent'}{'_content'}; 1564 1565 # An 'html' element having a parent is quite unlikely. 1566 1567 return; 1568} 1569 1570sub tighten_up { # legacy 1571 shift->delete_ignorable_whitespace(@_); 1572} 1573 1574sub elementify { 1575 1576 # Rebless this object down into the normal element class. 1577 my $self = $_[0]; 1578 my $to_class = $self->element_class; 1579 delete @{$self}{ 1580 grep { 1581 ; 1582 length $_ and substr( $_, 0, 1 ) eq '_' 1583 1584 # The private attributes that we'll retain: 1585 and $_ ne '_tag' 1586 and $_ ne '_parent' 1587 and $_ ne '_content' 1588 and $_ ne '_implicit' 1589 and $_ ne '_pos' 1590 and $_ ne '_element_class' 1591 } keys %$self 1592 }; 1593 bless $self, $to_class; # Returns the same object we were fed 1594} 1595 1596sub element_class { 1597 return 'HTML::DOM::_Element' if not ref $_[0]; 1598 return $_[0]->{_element_class} || 'HTML::DOM::_Element'; 1599} 1600 1601#-------------------------------------------------------------------------- 1602 1603sub guts { 1604 my @out; 1605 my @stack = ( $_[0] ); 1606 my $destructive = $_[1]; 1607 my $this; 1608 while (@stack) { 1609 $this = shift @stack; 1610 if ( !ref $this ) { 1611 push @out, $this; # yes, it can include text nodes 1612 } 1613 elsif ( !$this->{'_implicit'} ) { 1614 push @out, $this; 1615 delete $this->{'_parent'} if $destructive; 1616 } 1617 else { 1618 1619 # it's an implicit node. Delete it and recurse 1620 delete $this->{'_parent'} if $destructive; 1621 unshift @stack, 1622 @{ 1623 ( $destructive 1624 ? delete( $this->{'_content'} ) 1625 : $this->{'_content'} 1626 ) 1627 || [] 1628 }; 1629 } 1630 } 1631 1632 # Doesn't call a real $root->delete on the (when implicit) root, 1633 # but I don't think it needs to. 1634 1635 return @out if wantarray; # one simple normal case. 1636 return unless @out; 1637 return $out[0] if @out == 1 and ref( $out[0] ); 1638 my $x = HTML::DOM::_Element->new( 'div', '_implicit' => 1 ); 1639 $x->push_content(@out); 1640 return $x; 1641} 1642 1643sub disembowel { $_[0]->guts(1) } 1644 1645#-------------------------------------------------------------------------- 16461; 1647 1648__END__ 1649 1650