1package HTML::LinkExtractor;
2
3use strict;
4
5use HTML::TokeParser 2; # use HTML::TokeParser::Simple 2;
6use URI 1;
7use Carp qw( croak );
8
9use vars qw( $VERSION );
10$VERSION = '0.13';
11
12## The html tags which might have URLs
13# the master list of tagolas and required attributes (to constitute a link)
14use vars qw( %TAGS );
15%TAGS = (
16              a => [qw( href )],
17         applet => [qw( archive code codebase src )],
18           area => [qw( href )],
19           base => [qw( href )],
20        bgsound => [qw( src )],
21     blockquote => [qw( cite )],
22           body => [qw( background )],
23            del => [qw( cite )],
24            div => [qw( src )], # IE likes it, but don't know where it's documented
25          embed => [qw( pluginspage pluginurl src )],
26           form => [qw( action )],
27          frame => [qw( src longdesc  )],
28         iframe => [qw( src )],
29         ilayer => [qw( background src )],
30            img => [qw( dynsrc longdesc lowsrc src usemap )],
31          input => [qw( dynsrc lowsrc src )],
32            ins => [qw( cite )],
33        isindex => [qw( action )], # real oddball
34          layer => [qw( src )],
35           link => [qw( src href )],
36         object => [qw( archive classid code codebase data usemap )],
37              q => [qw( cite )],
38         script => [qw( src  )], # HTML::Tagset has 'for' ~ it's WRONG!
39          sound => [qw( src )],
40          table => [qw( background )],
41             td => [qw( background )],
42             th => [qw( background )],
43             tr => [qw( background )],
44  ## the exotic cases
45           meta => undef,
46     '!doctype' => [qw( url )], # is really a process instruction
47);
48
49## tags which contain <.*?> STUFF TO GET </\w+>
50use vars qw( @TAGS_IN_NEED );
51@TAGS_IN_NEED = qw(
52    a
53    blockquote
54    del
55    ins
56    q
57);
58
59use vars qw( @VALID_URL_ATTRIBUTES );
60@VALID_URL_ATTRIBUTES = qw(
61        action
62        archive
63        background
64        cite
65        classid
66        code
67        codebase
68        data
69        dynsrc
70        href
71        longdesc
72        lowsrc
73        pluginspage
74        pluginurl
75        src
76        usemap
77);
78
79
80sub new {
81    my($class, $cb, $base, $strip) = @_;
82    my $self = bless {}, $class;
83
84
85    $self->{_cb} = $cb if defined $cb;
86    $self->{_base} = URI->new($base) if defined $base;
87    $self->strip( $strip || 0 );
88
89    return $self;
90}
91
92sub strip {
93    my( $self, $on ) = @_;
94    return $self->{_strip} unless defined $on;
95    return $self->{_strip} = $on ? 1 : 0;
96}
97
98## $p= HTML::TokeParser->new($filename || FILEHANDLE ||\$filecontents); # ## $p= HTML::TokeParser::Simple->new($filename || FILEHANDLE ||\$filecontents);
99
100sub parse {
101    my( $this, $hmmm ) = @_;
102    my $tp = new HTML::TokeParser( $hmmm ); #     my $tp = new HTML::TokeParser::Simple( $hmmm );
103
104    unless($tp) {
105        croak qq[ Couldn't create a HTML::TokeParser object: $!]; #         croak qq[ Couldn't create a HTML::TokeParser::Simple object: $!];
106    }
107
108    $this->{_tp} = $tp;
109
110    $this->_parsola();
111    return();
112}
113
114sub _parsola {
115    my $self = shift;
116
117## a stack of links for keeping track of TEXT
118## which is all of "<a href>text</a>"
119    my @TEXT = ();
120    $self->{_LINKS} = [];
121
122
123#  ["S",  $tag, $attr, $attrseq, $text]
124#  ["E",  $tag, $text]
125#  ["T",  $text, $is_data]
126#  ["C",  $text]
127#  ["D",  $text]
128#  ["PI", $token0, $text]
129
130    while (my $T = $self->{_tp}->get_token() ) {
131        my $NL; #NewLink
132        my $Tag = $T->[1]; #         my $Tag = $T->return_tag;
133        my $got_TAGS_IN_NEED=0;
134## Start tag?
135        if($T->[0] eq 'S' ) { #         if($T->is_start_tag) {
136            next unless exists $TAGS{$Tag};
137
138## Do we have a tag for which we want to capture text?
139            $got_TAGS_IN_NEED = 0;
140            $got_TAGS_IN_NEED = grep { /^\Q$Tag\E$/i } @TAGS_IN_NEED;
141
142## then check to see if we got things besides META :)
143            if(defined $TAGS{ $Tag }) {
144
145                for my $Btag(@{$TAGS{$Tag}}) {
146## and we check if they do have one with a value
147                    if(exists $T->[2]->{ $Btag }) { #                     if(exists $T->return_attr()->{ $Btag }) {
148
149                        $NL = $T->[2]; #                         $NL = $T->return_attr();
150## TAGS_IN_NEED are tags in deed (start capturing the <a>STUFF</a>)
151                        if($got_TAGS_IN_NEED) {
152                            push @TEXT, $NL;
153                            $NL->{_TEXT} = "";
154                        }
155                    }
156                }
157            }elsif($Tag eq 'meta') {
158                $NL = $T->[2]; #                 $NL = $T->return_attr();
159
160                if(defined $$NL{content} and length $$NL{content} and (
161                    defined $$NL{'http-equiv'} &&  $$NL{'http-equiv'} =~ /refresh/i
162                    or
163                    defined $$NL{'name'} &&  $$NL{'name'} =~ /refresh/i
164                    ) ) {
165
166                    my( $timeout, $url ) = split m{;\s*?URL=}i, $$NL{content},2;
167                    my $base = $self->{_base};
168                    $$NL{url} = URI->new_abs( $url, $base ) if $base;
169                    $$NL{url} = $url unless exists $$NL{url};
170                    $$NL{timeout} = $timeout if $timeout;
171                }
172            }
173
174            ## In case we got nested tags
175            if(@TEXT) {
176                $TEXT[-1]->{_TEXT} .= $T->[-1] ; #                 $TEXT[-1]->{_TEXT} .= $T->as_is;
177            }
178
179## Text?
180        }elsif($T->[0] eq 'T' ) { #         }elsif($T->is_text) {
181            $TEXT[-1]->{_TEXT} .= $T->[-2]  if @TEXT; #             $TEXT[-1]->{_TEXT} .= $T->as_is if @TEXT;
182## Declaration?
183        }elsif($T->[0] eq 'D' ) { #         }elsif($T->is_declaration) {
184## We look at declarations, to get anly custom .dtd's (tis linky)
185            my $text = $T->[-1] ; #             my $text = $T->as_is;
186            if( $text =~ m{ SYSTEM \s \" ( [^\"]* ) \" > $ }ix ) {
187                $NL = { raw => $text, url => $1, tag => '!doctype' };
188            }
189## End tag?
190        }elsif($T->[0] eq 'E' ){ #         }elsif($T->is_end_tag){
191## these be ignored (maybe not in between <a...></a> tags
192## unless we're stacking (bug #5723)
193            if(@TEXT and exists $TAGS{$Tag}) {
194                $TEXT[-1]->{_TEXT} .= $T->[-1] ; #                 $TEXT[-1]->{_TEXT} .= $T->as_is;
195                my $pop = pop @TEXT;
196                $TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
197                $pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
198                $self->{_cb}->($self, $pop) if exists $self->{_cb};
199            }
200        }
201
202        if(defined $NL) {
203            $$NL{tag} = $Tag;
204
205            my $base = $self->{_base};
206
207            for my $at( @VALID_URL_ATTRIBUTES ) {
208                if( exists $$NL{$at} ) {
209                    $$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
210                }
211            }
212
213            if(exists $self->{_cb}) {
214                $self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
215            } else {
216                push @{$self->{_LINKS}}, $NL;
217            }
218        }
219    }## endof while (my $token = $p->get_token)
220
221    undef $self->{_tp};
222    return();
223}
224
225sub links {
226    my $self = shift;
227    ## just like HTML::LinkExtor's
228    return $self->{_LINKS};
229}
230
231
232sub _stripHTML {
233    my $HtmlRef = shift;
234    my $tp = new HTML::TokeParser( $HtmlRef ); #     my $tp = new HTML::TokeParser::Simple( $HtmlRef );
235    my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED)
236                              # otherwise it ain't come from LinkExtractor
237    if($t->[0] eq 'S' ) { #     if($t->is_start_tag) {
238        return $tp->get_trimmed_text( '/'.$t->[1] ); #         return $tp->get_trimmed_text( '/'.$t->return_tag );
239    } else {
240        require Data::Dumper;
241        local $Data::Dumper::Indent=1;
242        die " IMPOSSIBLE!!!! ",
243            Data::Dumper::Dumper(
244                '$HtmlRef',$HtmlRef,
245                '$t', $t,
246            );
247    }
248}
249
2501;
251
252package main;
253
254unless(caller()) {
255    require Data::Dumper;
256    if(@ARGV) {
257        for my $file( @ARGV ) {
258            if( -e $file ) {
259                my $LX = new HTML::LinkExtractor( );
260                $LX->parse( $file );
261                print Data::Dumper::Dumper($LX->links);
262                undef $LX;
263            } else {
264                warn "The file `$file' doesn't exist\n";
265            }
266        }
267
268    } else {
269
270        my $INPUT = q{
271COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
272
2731 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
2742 <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html">
2753 <base href="http://perl.org">
2764 <a href="http://www.perlmonks.org">Perlmonks.org</a>
277<p>
278
2795 <a href="#BUTTER"  href="#SCOTCH">
280    hello there
2816 <img src="#AND" src="#PEANUTS">
2827    <a href="#butter"> now </a>
283</a>
284
2858 <q CITE="http://www.shakespeare.com/">To be or not to be.</q>
2869 <blockquote CITE="http://www.stonehenge.com/merlyn/">
287    Just Another Perl Hacker,
288</blockquote>
289    };
290
291        my $LX = new HTML::LinkExtractor();
292        $LX->parse(\$INPUT);
293
294        print scalar(@{$LX->links()})." we GOT\n";
295        print Data::Dumper::Dumper( $LX->links() );
296    }
297
298}
299
300__END__
301
302
303=head1 NAME
304
305HTML::LinkExtractor - Extract I<L<links|/"WHAT'S A LINK-type tag">> from an HTML document
306
307=head1 DESCRIPTION
308
309HTML::LinkExtractor is used for extracting links from HTML.
310It is very similar to L<HTML::LinkExtor|HTML::LinkExtor>,
311except that besides getting the URL, you also get the link-text.
312
313Example ( B<please run the examples> ):
314
315    use HTML::LinkExtractor;
316    use Data::Dumper;
317
318    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
319    my $LX = new HTML::LinkExtractor();
320
321    $LX->parse(\$input);
322
323    print Dumper($LX->links);
324    __END__
325    # the above example will yield
326    $VAR1 = [
327              {
328                '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>',
329                'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'),
330                'tag' => 'a'
331              }
332            ];
333
334C<HTML::LinkExtractor> will also correctly extract nested
335I<L<link-type|/"WHAT'S A LINK-type tag">> tags.
336
337=head1 SYNOPSIS
338
339    ## the demo
340    perl LinkExtractor.pm
341    perl LinkExtractor.pm file.html othefile.html
342
343    ## or if the module is installed, but you don't know where
344
345    perl -MHTML::LinkExtractor -e" system $^X, $INC{q{HTML/LinkExtractor.pm}} "
346    perl -MHTML::LinkExtractor -e' system $^X, $INC{q{HTML/LinkExtractor.pm}} '
347
348    ## or
349
350    use HTML::LinkExtractor;
351    use LWP qw( get ); #     use LWP::Simple qw( get );
352
353    my $base = 'http://search.cpan.org';
354    my $html = get($base.'/recent');
355    my $LX = new HTML::LinkExtractor();
356
357    $LX->parse(\$html);
358
359    print qq{<base href="$base">\n};
360
361    for my $Link( @{ $LX->links } ) {
362    ## new modules are linked  by /author/NAME/Dist
363        if( $$Link{href}=~ m{^\/author\/\w+} ) {
364            print $$Link{_TEXT}."\n";
365        }
366    }
367
368    undef $LX;
369    __END__
370
371    ## or
372
373    use HTML::LinkExtractor;
374    use Data::Dumper;
375
376    my $input = q{If <a href="http://perl.com/"> I am a LINK!!! </a>};
377    my $LX = new HTML::LinkExtractor(
378        sub {
379            print Data::Dumper::Dumper(@_);
380        },
381        'http://perlFox.org/',
382    );
383
384    $LX->parse(\$input);
385    $LX->strip(1);
386    $LX->parse(\$input);
387    __END__
388
389    #### Calculate to total size of a web-page
390    #### adds up the sizes of all the images and stylesheets and stuff
391
392    use strict;
393    use LWP; #     use LWP::Simple;
394    use HTML::LinkExtractor;
395                                                        #
396    my $url  = shift || 'http://www.google.com';
397    my $html = get($url);
398    my $Total = length $html;
399                                                        #
400    print "initial size $Total\n";
401                                                        #
402    my $LX = new HTML::LinkExtractor(
403        sub {
404            my( $X, $tag ) = @_;
405                                                        #
406            unless( grep {$_ eq $tag->{tag} } @HTML::LinkExtractor::TAGS_IN_NEED ) {
407                                                        #
408    print "$$tag{tag}\n";
409                                                        #
410                for my $urlAttr ( @{$HTML::LinkExtractor::TAGS{$$tag{tag}}} ) {
411                    if( exists $$tag{$urlAttr} ) {
412                        my $size = (head( $$tag{$urlAttr} ))[1];
413                        $Total += $size if $size;
414    print "adding $size\n" if $size;
415                    }
416                }
417            }
418        },
419        $url,
420        0
421    );
422                                                        #
423    $LX->parse(\$html);
424                                                        #
425    print "The total size of \n$url\n is $Total bytes\n";
426    __END__
427
428
429=head1 METHODS
430
431=head2 C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])>
432
433Accepts 3 arguments, all of which are optional.
434If for example you want to pass a C<$baseUrl>, but don't
435want to have a callback invoked, just put C<undef> in place of a subref.
436
437This is the only class method.
438
439=over 4
440
441=item 1
442
443a callback ( a sub reference, as in C<sub{}>, or C<\&sub>)
444which is to be called each time a new LINK is encountered
445( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
446 after the closing tag is encountered )
447
448The callback receives an object reference(C<$LX>) and a link hashref.
449
450
451=item 2
452
453and a base URL ( URI->new, so its up to you to make sure it's valid
454which is used to convert all relative URI's to absolute ones.
455
456    $ALinkP{href} = URI->new_abs( $ALink{href}, $base );
457
458=item 3
459
460A "boolean" (just stick with 1).
461See the example in L<"DESCRIPTION">.
462Normally, you'd get back _TEXT that looks like
463
464    '_TEXT' => '<a href="http://perl.com/"> I am a LINK!!! </a>',
465
466If you turn this option on, you'll get the following instead
467
468    '_TEXT' => ' I am a LINK!!! ',
469
470The private utility function C<_stripHTML> does this
471by using L<HTML::TokeParser|HTML::TokeParser>s
472method get_trimmed_text.
473
474You can turn this feature on an off by using
475C<$LX-E<gt>strip(undef E<verbar>E<verbar> 0 E<verbar>E<verbar> 1)>
476
477=back
478
479=head2 C<$LX-E<gt>parse( $filename E<verbar>E<verbar> *FILEHANDLE E<verbar>E<verbar> \$FileContent )>
480
481Each time you call C<parse>, you should pass it a
482C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>
483
484Each time you call C<parse> a new C<HTML::TokeParser> object
485is created and stored in C<$this-E<gt>{_tp}>.
486
487You shouldn't need to mess with the TokeParser object.
488
489=head2 C<$LX-E<gt>links()>
490
491Only after you call C<parse> will this method return anything.
492This method returns a reference to an ArrayOfHashes,
493which basically looks like (Data::Dumper output)
494
495    $VAR1 = [ { tag => 'img', src => 'image.png' }, ];
496
497Please note that if yo provide a callback this array will be empty.
498
499
500=head2 C<$LX-E<gt>strip( [ 0 || 1 ])>
501
502If you pass in C<undef> (or nothing), returns the state of the option.
503Passing in a true or false value sets the option.
504
505If you wanna know what the option does see
506L<C<$LX-E<gt>new([\&callback, [$baseUrl, [1]]])>|/"METHODS">
507
508=head1 WHAT'S A LINK-type tag
509
510Take a look at C<%HTML::LinkExtractor::TAGS> to see
511what I consider to be link-type-tag.
512
513Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
514all the possible tag attributes which can contain URI's (the links!!)
515
516Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
517the tags for which the C<'_TEXT'> attribute is provided,
518like C<E<lt>a href="#"E<gt> TEST E<lt>/aE<gt>>
519
520
521=head2 How can that be?!?!
522
523I took at look at L<C<%HTML::Tagset::linkElements>|HTML::Tagset>
524and the following URL's
525
526    http://www.blooberry.com/indexdot/html/tagindex/all.htm
527
528    http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
529    http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
530    http://www.blooberry.com/indexdot/html/tagpages/a/area.htm
531
532    http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
533    http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm
534
535    http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
536    http://www.blooberry.com/indexdot/html/tagpages/d/div.htm
537
538    http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
539    http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm
540
541    http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
542    http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
543    http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
544    http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
545    http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm
546
547    http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
548    http://www.blooberry.com/indexdot/html/tagpages/l/link.htm
549
550    http://www.blooberry.com/indexdot/html/tagpages/o/object.htm
551
552    http://www.blooberry.com/indexdot/html/tagpages/q/q.htm
553
554    http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
555    http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm
556
557    And the special cases
558
559    <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd">
560    http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
561    '!doctype'  is really a process instruction, but is still listed
562    in %TAGS with 'url' as the attribute
563
564    and
565
566    <meta HTTP-EQUIV="Refresh" CONTENT="5; URL=http://www.foo.com/foo.html">
567    http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm
568    If there is a valid url, 'url' is set as the attribute.
569    The meta tag has no 'attributes' listed in %TAGS.
570
571
572=head1 SEE ALSO
573
574L<HTML::LinkExtor>, L<HTML::TokeParser>, L<HTML::Tagset>.
575
576=head1 AUTHOR
577
578D.H (PodMaster)
579
580
581Please use http://rt.cpan.org/ to report bugs.
582
583Just go to
584http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber
585to see a bug list and/or repot new ones.
586
587=head1 LICENSE
588
589Copyright (c) 2003, 2004 by D.H. (PodMaster).
590All rights reserved.
591
592This module is free software;
593you can redistribute it and/or modify it under
594the same terms as Perl itself.
595The LICENSE file contains the full text of the license.
596
597=cut
598
599