1package HTML::LinkExtor; 2 3require HTML::Parser; 4our @ISA = qw(HTML::Parser); 5our $VERSION = '3.76'; 6 7=head1 NAME 8 9HTML::LinkExtor - Extract links from an HTML document 10 11=head1 SYNOPSIS 12 13 require HTML::LinkExtor; 14 $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); 15 sub cb { 16 my($tag, %links) = @_; 17 print "$tag @{[%links]}\n"; 18 } 19 $p->parse_file("index.html"); 20 21=head1 DESCRIPTION 22 23I<HTML::LinkExtor> is an HTML parser that extracts links from an 24HTML document. The I<HTML::LinkExtor> is a subclass of 25I<HTML::Parser>. This means that the document should be given to the 26parser by calling the $p->parse() or $p->parse_file() methods. 27 28=cut 29 30use strict; 31use HTML::Tagset (); 32 33# legacy (some applications grabs this hash directly) 34our %LINK_ELEMENT; 35*LINK_ELEMENT = \%HTML::Tagset::linkElements; 36 37=over 4 38 39=item $p = HTML::LinkExtor->new 40 41=item $p = HTML::LinkExtor->new( $callback ) 42 43=item $p = HTML::LinkExtor->new( $callback, $base ) 44 45The constructor takes two optional arguments. The first is a reference 46to a callback routine. It will be called as links are found. If a 47callback is not provided, then links are just accumulated internally 48and can be retrieved by calling the $p->links() method. 49 50The $base argument is an optional base URL used to absolutize all URLs found. 51You need to have the I<URI> module installed if you provide $base. 52 53The callback is called with the lowercase tag name as first argument, 54and then all link attributes as separate key/value pairs. All 55non-link attributes are removed. 56 57=cut 58 59sub new 60{ 61 my($class, $cb, $base) = @_; 62 my $self = $class->SUPER::new( 63 start_h => ["_start_tag", "self,tagname,attr"], 64 report_tags => [keys %HTML::Tagset::linkElements], 65 ); 66 $self->{extractlink_cb} = $cb; 67 if ($base) { 68 require URI; 69 $self->{extractlink_base} = URI->new($base); 70 } 71 $self; 72} 73 74sub _start_tag 75{ 76 my($self, $tag, $attr) = @_; 77 78 my $base = $self->{extractlink_base}; 79 my $links = $HTML::Tagset::linkElements{$tag}; 80 $links = [$links] unless ref $links; 81 82 my @links; 83 my $a; 84 for $a (@$links) { 85 next unless exists $attr->{$a}; 86 (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 87 push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); 88 } 89 return unless @links; 90 $self->_found_link($tag, @links); 91} 92 93sub _found_link 94{ 95 my $self = shift; 96 my $cb = $self->{extractlink_cb}; 97 if ($cb) { 98 &$cb(@_); 99 } else { 100 push(@{$self->{'links'}}, [@_]); 101 } 102} 103 104=item $p->links 105 106Returns a list of all links found in the document. The returned 107values will be anonymous arrays with the following elements: 108 109 [$tag, $attr => $url1, $attr2 => $url2,...] 110 111The $p->links method will also truncate the internal link list. This 112means that if the method is called twice without any parsing 113between them the second call will return an empty list. 114 115Also note that $p->links will always be empty if a callback routine 116was provided when the I<HTML::LinkExtor> was created. 117 118=cut 119 120sub links 121{ 122 my $self = shift; 123 exists($self->{'links'}) ? @{delete $self->{'links'}} : (); 124} 125 126# We override the parse_file() method so that we can clear the links 127# before we start a new file. 128sub parse_file 129{ 130 my $self = shift; 131 delete $self->{'links'}; 132 $self->SUPER::parse_file(@_); 133} 134 135=back 136 137=head1 EXAMPLE 138 139This is an example showing how you can extract links from a document 140received using LWP: 141 142 use LWP::UserAgent; 143 use HTML::LinkExtor; 144 use URI::URL; 145 146 $url = "http://www.perl.org/"; # for instance 147 $ua = LWP::UserAgent->new; 148 149 # Set up a callback that collect image links 150 my @imgs = (); 151 sub callback { 152 my($tag, %attr) = @_; 153 return if $tag ne 'img'; # we only look closer at <img ...> 154 push(@imgs, values %attr); 155 } 156 157 # Make the parser. Unfortunately, we don't know the base yet 158 # (it might be different from $url) 159 $p = HTML::LinkExtor->new(\&callback); 160 161 # Request document and parse it as it arrives 162 $res = $ua->request(HTTP::Request->new(GET => $url), 163 sub {$p->parse($_[0])}); 164 165 # Expand all image URLs to absolute ones 166 my $base = $res->base; 167 @imgs = map { $_ = url($_, $base)->abs; } @imgs; 168 169 # Print them out 170 print join("\n", @imgs), "\n"; 171 172=head1 SEE ALSO 173 174L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> 175 176=head1 COPYRIGHT 177 178Copyright 1996-2001 Gisle Aas. 179 180This library is free software; you can redistribute it and/or 181modify it under the same terms as Perl itself. 182 183=cut 184 1851; 186