1package HTML::LinkExtor;
2
3require HTML::Parser;
4our @ISA = qw(HTML::Parser);
5our $VERSION = '3.76';
6
7=head1 NAME
8
9HTML::LinkExtor - Extract links from an HTML document
10
11=head1 SYNOPSIS
12
13 require HTML::LinkExtor;
14 $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
15 sub cb {
16     my($tag, %links) = @_;
17     print "$tag @{[%links]}\n";
18 }
19 $p->parse_file("index.html");
20
21=head1 DESCRIPTION
22
23I<HTML::LinkExtor> is an HTML parser that extracts links from an
24HTML document.  The I<HTML::LinkExtor> is a subclass of
25I<HTML::Parser>. This means that the document should be given to the
26parser by calling the $p->parse() or $p->parse_file() methods.
27
28=cut
29
30use strict;
31use HTML::Tagset ();
32
33# legacy (some applications grabs this hash directly)
34our %LINK_ELEMENT;
35*LINK_ELEMENT = \%HTML::Tagset::linkElements;
36
37=over 4
38
39=item $p = HTML::LinkExtor->new
40
41=item $p = HTML::LinkExtor->new( $callback )
42
43=item $p = HTML::LinkExtor->new( $callback, $base )
44
45The constructor takes two optional arguments. The first is a reference
46to a callback routine. It will be called as links are found. If a
47callback is not provided, then links are just accumulated internally
48and can be retrieved by calling the $p->links() method.
49
50The $base argument is an optional base URL used to absolutize all URLs found.
51You need to have the I<URI> module installed if you provide $base.
52
53The callback is called with the lowercase tag name as first argument,
54and then all link attributes as separate key/value pairs.  All
55non-link attributes are removed.
56
57=cut
58
59sub new
60{
61    my($class, $cb, $base) = @_;
62    my $self = $class->SUPER::new(
63                    start_h => ["_start_tag", "self,tagname,attr"],
64		    report_tags => [keys %HTML::Tagset::linkElements],
65	       );
66    $self->{extractlink_cb} = $cb;
67    if ($base) {
68	require URI;
69	$self->{extractlink_base} = URI->new($base);
70    }
71    $self;
72}
73
74sub _start_tag
75{
76    my($self, $tag, $attr) = @_;
77
78    my $base = $self->{extractlink_base};
79    my $links = $HTML::Tagset::linkElements{$tag};
80    $links = [$links] unless ref $links;
81
82    my @links;
83    my $a;
84    for $a (@$links) {
85	next unless exists $attr->{$a};
86	(my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
87	push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
88    }
89    return unless @links;
90    $self->_found_link($tag, @links);
91}
92
93sub _found_link
94{
95    my $self = shift;
96    my $cb = $self->{extractlink_cb};
97    if ($cb) {
98	&$cb(@_);
99    } else {
100	push(@{$self->{'links'}}, [@_]);
101    }
102}
103
104=item $p->links
105
106Returns a list of all links found in the document.  The returned
107values will be anonymous arrays with the following elements:
108
109  [$tag, $attr => $url1, $attr2 => $url2,...]
110
111The $p->links method will also truncate the internal link list.  This
112means that if the method is called twice without any parsing
113between them the second call will return an empty list.
114
115Also note that $p->links will always be empty if a callback routine
116was provided when the I<HTML::LinkExtor> was created.
117
118=cut
119
120sub links
121{
122    my $self = shift;
123    exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
124}
125
126# We override the parse_file() method so that we can clear the links
127# before we start a new file.
128sub parse_file
129{
130    my $self = shift;
131    delete $self->{'links'};
132    $self->SUPER::parse_file(@_);
133}
134
135=back
136
137=head1 EXAMPLE
138
139This is an example showing how you can extract links from a document
140received using LWP:
141
142  use LWP::UserAgent;
143  use HTML::LinkExtor;
144  use URI::URL;
145
146  $url = "http://www.perl.org/";  # for instance
147  $ua = LWP::UserAgent->new;
148
149  # Set up a callback that collect image links
150  my @imgs = ();
151  sub callback {
152     my($tag, %attr) = @_;
153     return if $tag ne 'img';  # we only look closer at <img ...>
154     push(@imgs, values %attr);
155  }
156
157  # Make the parser.  Unfortunately, we don't know the base yet
158  # (it might be different from $url)
159  $p = HTML::LinkExtor->new(\&callback);
160
161  # Request document and parse it as it arrives
162  $res = $ua->request(HTTP::Request->new(GET => $url),
163                      sub {$p->parse($_[0])});
164
165  # Expand all image URLs to absolute ones
166  my $base = $res->base;
167  @imgs = map { $_ = url($_, $base)->abs; } @imgs;
168
169  # Print them out
170  print join("\n", @imgs), "\n";
171
172=head1 SEE ALSO
173
174L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
175
176=head1 COPYRIGHT
177
178Copyright 1996-2001 Gisle Aas.
179
180This library is free software; you can redistribute it and/or
181modify it under the same terms as Perl itself.
182
183=cut
184
1851;
186