1package HTML::LinkExtractor; 2 3use strict; 4 5use HTML::TokeParser 2; # use HTML::TokeParser::Simple 2; 6use URI 1; 7use Carp qw( croak ); 8 9use vars qw( $VERSION ); 10$VERSION = '0.13'; 11 12## The html tags which might have URLs 13# the master list of tagolas and required attributes (to constitute a link) 14use vars qw( %TAGS ); 15%TAGS = ( 16 a => [qw( href )], 17 applet => [qw( archive code codebase src )], 18 area => [qw( href )], 19 base => [qw( href )], 20 bgsound => [qw( src )], 21 blockquote => [qw( cite )], 22 body => [qw( background )], 23 del => [qw( cite )], 24 div => [qw( src )], # IE likes it, but don't know where it's documented 25 embed => [qw( pluginspage pluginurl src )], 26 form => [qw( action )], 27 frame => [qw( src longdesc )], 28 iframe => [qw( src )], 29 ilayer => [qw( background src )], 30 img => [qw( dynsrc longdesc lowsrc src usemap )], 31 input => [qw( dynsrc lowsrc src )], 32 ins => [qw( cite )], 33 isindex => [qw( action )], # real oddball 34 layer => [qw( src )], 35 link => [qw( src href )], 36 object => [qw( archive classid code codebase data usemap )], 37 q => [qw( cite )], 38 script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG! 39 sound => [qw( src )], 40 table => [qw( background )], 41 td => [qw( background )], 42 th => [qw( background )], 43 tr => [qw( background )], 44 ## the exotic cases 45 meta => undef, 46 '!doctype' => [qw( url )], # is really a process instruction 47); 48 49## tags which contain <.*?> STUFF TO GET </\w+> 50use vars qw( @TAGS_IN_NEED ); 51@TAGS_IN_NEED = qw( 52 a 53 blockquote 54 del 55 ins 56 q 57); 58 59use vars qw( @VALID_URL_ATTRIBUTES ); 60@VALID_URL_ATTRIBUTES = qw( 61 action 62 archive 63 background 64 cite 65 classid 66 code 67 codebase 68 data 69 dynsrc 70 href 71 longdesc 72 lowsrc 73 pluginspage 74 pluginurl 75 src 76 usemap 77); 78 79 80sub new { 81 my($class, $cb, $base, $strip) = @_; 82 my $self = bless {}, $class; 83 84 85 $self->{_cb} = $cb if defined $cb; 86 $self->{_base} = URI->new($base) if defined $base; 87 $self->strip( $strip || 0 ); 88 89 return $self; 90} 91 92sub strip { 93 my( $self, $on ) = @_; 94 return $self->{_strip} unless defined $on; 95 return $self->{_strip} = $on ? 1 : 0; 96} 97 98## $p= HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); # ## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filecontents); 99 100sub parse { 101 my( $this, $hmmm ) = @_; 102 my $tp = new HTML::TokeParser( $hmmm ); # my $tp = new HTML::TokeParser::Simple( $hmmm ); 103 104 unless($tp) { 105 croak qq[ Couldn't create a HTML::TokeParser object: $!]; # croak qq[ Couldn't create a HTML::TokeParser::Simple object: $!]; 106 } 107 108 $this->{_tp} = $tp; 109 110 $this->_parsola(); 111 return(); 112} 113 114sub _parsola { 115 my $self = shift; 116 117## a stack of links for keeping track of TEXT 118## which is all of "<a href>text</a>" 119 my @TEXT = (); 120 $self->{_LINKS} = []; 121 122 123# ["S", $tag, $attr, $attrseq, $text] 124# ["E", $tag, $text] 125# ["T", $text, $is_data] 126# ["C", $text] 127# ["D", $text] 128# ["PI", $token0, $text] 129 130 while (my $T = $self->{_tp}->get_token() ) { 131 my $NL; #NewLink 132 my $Tag = $T->[1]; # my $Tag = $T->return_tag; 133 my $got_TAGS_IN_NEED=0; 134## Start tag? 135 if($T->[0] eq 'S' ) { # if($T->is_start_tag) { 136 next unless exists $TAGS{$Tag}; 137 138## Do we have a tag for which we want to capture text? 139 $got_TAGS_IN_NEED = 0; 140 $got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED; 141 142## then check to see if we got things besides META :) 143 if(defined $TAGS{ $Tag }) { 144 145 for my $Btag(@{$TAGS{$Tag}}) { 146## and we check if they do have one with a value 147 if(exists $T->[2]->{ $Btag }) { # if(exists $T->return_attr()->{ $Btag }) { 148 149 $NL = $T->[2]; # $NL = $T->return_attr(); 150## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>) 151 if($got_TAGS_IN_NEED) { 152 push @TEXT, $NL; 153 $NL->{_TEXT} = ""; 154 } 155 } 156 } 157 }elsif($Tag eq 'meta') { 158 $NL = $T->[2]; # $NL = $T->return_attr(); 159 160 if(defined $$NL{content} and length $$NL{content} and ( 161 defined $$NL{'http-equiv'} && $$NL{'http-equiv'} =~ /refresh/i 162 or 163 defined $$NL{'name'} && $$NL{'name'} =~ /refresh/i 164 ) ) { 165 166 my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2; 167 my $base = $self->{_base}; 168 $$NL{url} = URI->new_abs( $url, $base ) if $base; 169 $$NL{url} = $url unless exists $$NL{url}; 170 $$NL{timeout} = $timeout if $timeout; 171 } 172 } 173 174 ## In case we got nested tags 175 if(@TEXT) { 176 $TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is; 177 } 178 179## Text? 180 }elsif($T->[0] eq 'T' ) { # }elsif($T->is_text) { 181 $TEXT[-1]->{_TEXT} .= $T->[-2] if @TEXT; # $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT; 182## Declaration? 183 }elsif($T->[0] eq 'D' ) { # }elsif($T->is_declaration) { 184## We look at declarations, to get anly custom .dtd's (tis linky) 185 my $text = $T->[-1] ; # my $text = $T->as_is; 186 if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) { 187 $NL = { raw => $text, url => $1, tag => '!doctype' }; 188 } 189## End tag? 190 }elsif($T->[0] eq 'E' ){ # }elsif($T->is_end_tag){ 191## these be ignored (maybe not in between <a...></a> tags 192## unless we're stacking (bug #5723) 193 if(@TEXT and exists $TAGS{$Tag}) { 194 $TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is; 195 my $pop = pop @TEXT; 196 $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT; 197 $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip; 198 $self->{_cb}->($self, $pop) if exists $self->{_cb}; 199 } 200 } 201 202 if(defined $NL) { 203 $$NL{tag} = $Tag; 204 205 my $base = $self->{_base}; 206 207 for my $at( @VALID_URL_ATTRIBUTES ) { 208 if( exists $$NL{$at} ) { 209 $$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base; 210 } 211 } 212 213 if(exists $self->{_cb}) { 214 $self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470 215 } else { 216 push @{$self->{_LINKS}}, $NL; 217 } 218 } 219 }## endof while (my $token = $p->get_token) 220 221 undef $self->{_tp}; 222 return(); 223} 224 225sub links { 226 my $self = shift; 227 ## just like HTML::LinkExtor's 228 return $self->{_LINKS}; 229} 230 231 232sub _stripHTML { 233 my $HtmlRef = shift; 234 my $tp = new HTML::TokeParser( $HtmlRef ); # my $tp = new HTML::TokeParser::Simple( $HtmlRef ); 235 my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED) 236 # otherwise it ain't come from LinkExtractor 237 if($t->[0] eq 'S' ) { # if($t->is_start_tag) { 238 return $tp->get_trimmed_text( '/'.$t->[1] ); # return $tp->get_trimmed_text( '/'.$t->return_tag ); 239 } else { 240 require Data::Dumper; 241 local $Data::Dumper::Indent=1; 242 die " IMPOSSIBLE!!!! ", 243 Data::Dumper::Dumper( 244 '$HtmlRef',$HtmlRef, 245 '$t', $t, 246 ); 247 } 248} 249 2501; 251 252package main; 253 254unless(caller()) { 255 require Data::Dumper; 256 if(@ARGV) { 257 for my $file( @ARGV ) { 258 if( -e $file ) { 259 my $LX = new HTML::LinkExtractor( ); 260 $LX->parse( $file ); 261 print Data::Dumper::Dumper($LX->links); 262 undef $LX; 263 } else { 264 warn "The file `$file' doesn't exist\n"; 265 } 266 } 267 268 } else { 269 270 my $INPUT = q{ 271COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS. 272 2731 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> 2742 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html"> 2753 <base href="http://perl.org"> 2764 <a href="http://www.perlmonks.org">Perlmonks.org</a> 277<p> 278 2795 <a href="#BUTTER" href="#SCOTCH"> 280 hello there 2816 <img src="#AND" src="#PEANUTS"> 2827 <a href="#butter"> now </a> 283</a> 284 2858 <q CITE="http://www.shakespeare.com/">To be or not to be.</q> 2869 <blockquote CITE="http://www.stonehenge.com/merlyn/"> 287 Just Another Perl Hacker, 288</blockquote> 289 }; 290 291 my $LX = new HTML::LinkExtractor(); 292 $LX->parse(\$INPUT); 293 294 print scalar(@{$LX->links()})." we GOT\n"; 295 print Data::Dumper::Dumper( $LX->links() ); 296 } 297 298} 299 300__END__ 301 302 303=head1 NAME 304 305HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> from an HTML document 306 307=head1 DESCRIPTION 308 309HTML::LinkExtractor is used for extracting links from HTML. 310It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>, 311except that besides getting the URL, you also get the link-text. 312 313Example ( B<please run the examples> ): 314 315 use HTML::LinkExtractor; 316 use Data::Dumper; 317 318 my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>}; 319 my $LX = new HTML::LinkExtractor(); 320 321 $LX->parse(\$input); 322 323 print Dumper($LX->links); 324 __END__ 325 # the above example will yield 326 $VAR1 = [ 327 { 328 '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>', 329 'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'), 330 'tag' => 'a' 331 } 332 ]; 333 334C<HTML::LinkExtractor> will also correctly extract nested 335I<L<link-type|/"WHAT'S A LINK-type tag">> tags. 336 337=head1 SYNOPSIS 338 339 ## the demo 340 perl LinkExtractor.pm 341 perl LinkExtractor.pm file.html othefile.html 342 343 ## or if the module is installed, but you don't know where 344 345 perl -MHTML::LinkExtractor -e" system $^X, $INC{q{HTML/LinkExtractor.pm}} " 346 perl -MHTML::LinkExtractor -e' system $^X, $INC{q{HTML/LinkExtractor.pm}} ' 347 348 ## or 349 350 use HTML::LinkExtractor; 351 use LWP qw( get ); # use LWP::Simple qw( get ); 352 353 my $base = 'http://search.cpan.org'; 354 my $html = get($base.'/recent'); 355 my $LX = new HTML::LinkExtractor(); 356 357 $LX->parse(\$html); 358 359 print qq{<base href="$base">\n}; 360 361 for my $Link( @{ $LX->links } ) { 362 ## new modules are linked by /author/NAME/Dist 363 if( $$Link{href}=~ m{^\/author\/\w+} ) { 364 print $$Link{_TEXT}."\n"; 365 } 366 } 367 368 undef $LX; 369 __END__ 370 371 ## or 372 373 use HTML::LinkExtractor; 374 use Data::Dumper; 375 376 my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>}; 377 my $LX = new HTML::LinkExtractor( 378 sub { 379 print Data::Dumper::Dumper(@_); 380 }, 381 'http://perlFox.org/', 382 ); 383 384 $LX->parse(\$input); 385 $LX->strip(1); 386 $LX->parse(\$input); 387 __END__ 388 389 #### Calculate to total size of a web-page 390 #### adds up the sizes of all the images and stylesheets and stuff 391 392 use strict; 393 use LWP; # use LWP::Simple; 394 use HTML::LinkExtractor; 395 # 396 my $url = shift || 'http://www.google.com'; 397 my $html = get($url); 398 my $Total = length $html; 399 # 400 print "initial size $Total\n"; 401 # 402 my $LX = new HTML::LinkExtractor( 403 sub { 404 my( $X, $tag ) = @_; 405 # 406 unless( grep {$_ eq $tag->{tag} } @HTML::LinkExtractor::TAGS_IN_NEED ) { 407 # 408 print "$$tag{tag}\n"; 409 # 410 for my $urlAttr ( @{$HTML::LinkExtractor::TAGS{$$tag{tag}}} ) { 411 if( exists $$tag{$urlAttr} ) { 412 my $size = (head( $$tag{$urlAttr} ))[1]; 413 $Total += $size if $size; 414 print "adding $size\n" if $size; 415 } 416 } 417 } 418 }, 419 $url, 420 0 421 ); 422 # 423 $LX->parse(\$html); 424 # 425 print "The total size of \n$url\n is $Total bytes\n"; 426 __END__ 427 428 429=head1 METHODS 430 431=head2 C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])> 432 433Accepts 3 arguments, all of which are optional. 434If for example you want to pass a C<$baseUrl>, but don't 435want to have a callback invoked, just put C<undef> in place of a subref. 436 437This is the only class method. 438 439=over 4 440 441=item 1 442 443a callback ( a sub reference, as in C<sub{}>, or C<\&sub>) 444which is to be called each time a new LINK is encountered 445( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means 446 after the closing tag is encountered ) 447 448The callback receives an object reference(C<$LX>) and a link hashref. 449 450 451=item 2 452 453and a base URL ( URI->new, so its up to you to make sure it's valid 454which is used to convert all relative URI's to absolute ones. 455 456 $ALinkP{href} = URI->new_abs( $ALink{href}, $base ); 457 458=item 3 459 460A "boolean" (just stick with 1). 461See the example in L<"DESCRIPTION">. 462Normally, you'd get back _TEXT that looks like 463 464 '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>', 465 466If you turn this option on, you'll get the following instead 467 468 '_TEXT' => ' I am a LINK!!! ', 469 470The private utility function C<_stripHTML> does this 471by using L<HTML::TokeParser|HTML::TokeParser>s 472method get_trimmed_text. 473 474You can turn this feature on an off by using 475C<$LX-E<gt>strip(undef E<verbar>E<verbar> 0 E<verbar>E<verbar> 1)> 476 477=back 478 479=head2 C<$LX-E<gt>parse( $filename E<verbar>E<verbar> *FILEHANDLE E<verbar>E<verbar> \$FileContent )> 480 481Each time you call C<parse>, you should pass it a 482C<$filename> a C<*FILEHANDLE> or a C<\$FileContent> 483 484Each time you call C<parse> a new C<HTML::TokeParser> object 485is created and stored in C<$this-E<gt>{_tp}>. 486 487You shouldn't need to mess with the TokeParser object. 488 489=head2 C<$LX-E<gt>links()> 490 491Only after you call C<parse> will this method return anything. 492This method returns a reference to an ArrayOfHashes, 493which basically looks like (Data::Dumper output) 494 495 $VAR1 = [ { tag => 'img', src => 'image.png' }, ]; 496 497Please note that if yo provide a callback this array will be empty. 498 499 500=head2 C<$LX-E<gt>strip( [ 0 || 1 ])> 501 502If you pass in C<undef> (or nothing), returns the state of the option. 503Passing in a true or false value sets the option. 504 505If you wanna know what the option does see 506L<C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])>|/"METHODS"> 507 508=head1 WHAT'S A LINK-type tag 509 510Take a look at C<%HTML::LinkExtractor::TAGS> to see 511what I consider to be link-type-tag. 512 513Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see 514all the possible tag attributes which can contain URI's (the links!!) 515 516Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see 517the tags for which the C<'_TEXT'> attribute is provided, 518like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>> 519 520 521=head2 How can that be?!?! 522 523I took at look at L<C<%HTML::Tagset::linkElements>|HTML::Tagset> 524and the following URL's 525 526 http://www.blooberry.com/indexdot/html/tagindex/all.htm 527 528 http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm 529 http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm 530 http://www.blooberry.com/indexdot/html/tagpages/a/area.htm 531 532 http://www.blooberry.com/indexdot/html/tagpages/b/base.htm 533 http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm 534 535 http://www.blooberry.com/indexdot/html/tagpages/d/del.htm 536 http://www.blooberry.com/indexdot/html/tagpages/d/div.htm 537 538 http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm 539 http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm 540 541 http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm 542 http://www.blooberry.com/indexdot/html/tagpages/i/image.htm 543 http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm 544 http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm 545 http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm 546 547 http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm 548 http://www.blooberry.com/indexdot/html/tagpages/l/link.htm 549 550 http://www.blooberry.com/indexdot/html/tagpages/o/object.htm 551 552 http://www.blooberry.com/indexdot/html/tagpages/q/q.htm 553 554 http://www.blooberry.com/indexdot/html/tagpages/s/script.htm 555 http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm 556 557 And the special cases 558 559 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"> 560 http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm 561 '!doctype' is really a process instruction, but is still listed 562 in %TAGS with 'url' as the attribute 563 564 and 565 566 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html"> 567 http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm 568 If there is a valid url, 'url' is set as the attribute. 569 The meta tag has no 'attributes' listed in %TAGS. 570 571 572=head1 SEE ALSO 573 574L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>. 575 576=head1 AUTHOR 577 578D.H (PodMaster) 579 580 581Please use http://rt.cpan.org/ to report bugs. 582 583Just go to 584http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber 585to see a bug list and/or repot new ones. 586 587=head1 LICENSE 588 589Copyright (c) 2003, 2004 by D.H. (PodMaster). 590All rights reserved. 591 592This module is free software; 593you can redistribute it and/or modify it under 594the same terms as Perl itself. 595The LICENSE file contains the full text of the license. 596 597=cut 598 599