1package HTML::PullParser;
2
3use strict;
4
5require HTML::Parser;
6our @ISA = qw(HTML::Parser);
7our $VERSION = '3.76';
8
9use Carp ();
10
# Constructor.  Takes a flat list of key/value options:
#
#   * an argspec for one or more of the events start, end, text,
#     declaration, comment, process and default;
#   * exactly one input source, given as either 'file' or 'doc';
#   * anything else, which is passed on to the parent class constructor.
#
# Croaks if no event argspec is supplied, or if the input source is missing
# or given twice.  Returns nothing when 'file' is a file name that
# IO::File cannot open for reading; otherwise returns the new object.
sub new {
    my ($class, %cnf) = @_;

    # Pull the per-event argspecs out of the option list.
    my %spec_for;
    for my $event (qw(start end text declaration comment process default)) {
        my $spec = delete $cnf{$event};
        $spec_for{$event} = $spec if defined $spec;
    }
    Carp::croak("Info not collected for any events") if !%spec_for;

    # Exactly one of 'file' and 'doc' must be present.
    my ($file, $doc) = delete @cnf{qw(file doc)};
    my $have_file = defined $file;
    my $have_doc  = defined $doc;
    if ($have_file && $have_doc) {
        Carp::croak("Can't parse from both 'doc' and 'file' at the same time");
    }
    if (!$have_file && !$have_doc) {
        Carp::croak("No 'doc' or 'file' given to parse from");
    }

    # Build the underlying parser object; api_version is always forced to 3.
    $cnf{api_version} = 3;
    my $self = $class->SUPER::new(%cnf);

    # Register one shared array as the handler for every requested event.
    # get_token() shifts its return values off the front of this array.
    my $accum = [];
    $self->{pullparser_accum} = $accum;
    for my $event (keys %spec_for) {
        $self->SUPER::handler($event => $accum, $spec_for{$event});
    }

    if ($have_doc) {
        # Keep a reference to the text plus the offset parsed so far.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
        return $self;
    }

    # Anything that is neither a reference nor a glob is taken to be a
    # file name and is opened read-only.
    my $is_handle = ref($file) || ref(\$file) eq "GLOB";
    unless ($is_handle) {
        require IO::File;
        my $fh = IO::File->new($file, "r");
        return unless $fh;
        $file = $fh;
    }

    $self->{pullparser_file} = $file;
    return $self;
}
55
56
# Always croaks, whatever arguments are given.  The constructor registers
# its handlers through SUPER::handler instead of this method.
sub handler {
    my $message = "Can't set handlers for HTML::PullParser";
    Carp::croak($message);
}
61
62
# Return the next token, or undef once the input is exhausted.
#
# Tokens are taken from the front of the pullparser_accum array.  While
# that array is empty and the pullparser_eof flag is not set, another chunk
# of input is handed to parse(): either read() from the stored file handle
# or cut out of the stored document string.  When the input runs out,
# eof() is called, pullparser_eof is set and the input source is removed
# from the object.
#
# Croaks if more input is needed but the object holds neither a file
# handle nor a document to read from.
sub get_token
{
    my $self = shift;

    # Maximum amount of input handed to parse() per iteration.
    my $chunk_size = 512;

    while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
        if (my $fh = $self->{pullparser_file}) {
            # must try to parse more from the file
            my $buf;
            if (read($fh, $buf, $chunk_size)) {
                $self->parse($buf);
            }
            else {
                # read() is false both at end of file and on a read
                # error; either way the input is treated as finished.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_file};
            }
        }
        elsif (my $sref = $self->{pullparser_str_ref}) {
            # must try to parse more from the scalar
            my $pos   = $self->{pullparser_str_pos};
            my $chunk = substr($$sref, $pos, $chunk_size);
            $self->parse($chunk);
            $pos += length($chunk);
            if ($pos < length($$sref)) {
                $self->{pullparser_str_pos} = $pos;
            }
            else {
                # the whole document has been fed to the parser
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_str_ref};
                delete $self->{pullparser_str_pos};
            }
        }
        else {
            # Neither input source is present although EOF has not been
            # flagged.  Report it with a real message rather than a bare
            # die, which would say only "Died" or re-raise a stale $@.
            Carp::croak("HTML::PullParser: no input source to read from");
        }
    }

    return shift @{$self->{pullparser_accum}};
}
100
101
# Put the given tokens back at the front of the pullparser_accum array, in
# the order given, so that get_token() returns them again before anything
# else.  Returns the object itself.
sub unget_token {
    my ($self, @tokens) = @_;
    unshift @{ $self->{pullparser_accum} }, @tokens;
    return $self;
}
108
1091;
110
111
112__END__
113
114=head1 NAME
115
116HTML::PullParser - Alternative HTML::Parser interface
117
118=head1 SYNOPSIS
119
120 use HTML::PullParser;
121
122 $p = HTML::PullParser->new(file => "index.html",
123                            start => 'event, tagname, @attr',
124                            end   => 'event, tagname',
125                            ignore_elements => [qw(script style)],
126                           ) || die "Can't open: $!";
127 while (my $token = $p->get_token) {
128     #...do something with $token
129 }
130
131=head1 DESCRIPTION
132
133The HTML::PullParser is an alternative interface to the HTML::Parser class.
134It basically turns the HTML::Parser inside out.  You associate a file
135(or any IO::Handle object or string) with the parser at construction time and
136then repeatedly call $parser->get_token to obtain the tags and text
137found in the parsed document.
138
139The following methods are provided:
140
141=over 4
142
143=item $p = HTML::PullParser->new( file => $file, %options )
144
145=item $p = HTML::PullParser->new( doc => \$doc, %options )
146
147A C<HTML::PullParser> can be made to parse from either a file or a
148literal document based on whether the C<file> or C<doc> option is
149passed to the parser's constructor.
150
151The C<file> passed in can either be a file name or a file handle
152object.  If a file name is passed, and it can't be opened for reading,
153then the constructor will return an undefined value and $!  will tell
154you why it failed.  Otherwise the argument is taken to be some object
155that the C<HTML::PullParser> can read() from when it needs more data.
156The stream will be read() until EOF, but not closed.
157
158A C<doc> can be passed plain or as a reference
159to a scalar.  If a reference is passed then the value of this scalar
160should not be changed before all tokens have been extracted.
161
162Next the information to be returned for the different token types must
163be set up.  This is done by simply associating an argspec (as defined
164in L<HTML::Parser>) with the events you have an interest in.  For
instance, if you want C<start> tokens to be reported as the string
C<'S'> followed by the tagname and the attributes you might pass a
C<start> option like this:
168
169   $p = HTML::PullParser->new(
170          doc   => $document_to_parse,
171          start => '"S", tagname, @attr',
172          end   => '"E", tagname',
173        );
174
Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
C<unbroken_text>, can be passed in.  Note that you should not use the
177I<event>_h options to set up parser handlers.  That would confuse the
178inner logic of C<HTML::PullParser>.
179
180=item $token = $p->get_token
181
182This method will return the next I<token> found in the HTML document,
183or C<undef> at the end of the document.  The token is returned as an
array reference.  The content of this array matches the argspec set up
185during C<HTML::PullParser> construction.
186
187=item $p->unget_token( @tokens )
188
189If you find out you have read too many tokens you can push them back,
190so that they are returned again the next time $p->get_token is called.
191
192=back
193
194=head1 EXAMPLES
195
The 'eg/hform' script shows how we might parse the form section of
HTML documents using HTML::PullParser.
198
199=head1 SEE ALSO
200
201L<HTML::Parser>, L<HTML::TokeParser>
202
203=head1 COPYRIGHT
204
205Copyright 1998-2001 Gisle Aas.
206
207This library is free software; you can redistribute it and/or
208modify it under the same terms as Perl itself.
209
210=cut
211