package HTML::PullParser;

use strict;
use warnings;

require HTML::Parser;
our @ISA = qw(HTML::Parser);
our $VERSION = '3.76';

use Carp ();

# Construct a pull parser.
#
# Options are passed as a key/value list:
#   file | doc  - exactly one must be given.  'file' is a file name or
#                 a file handle; 'doc' is the document itself, either
#                 as a plain string or as a reference to a string.
#   start, end, text, declaration, comment, process, default
#               - the argspec for each event that should be reported;
#                 at least one of these is required.
#   Everything else is handed on unchanged to the base class
#   constructor.
#
# Returns the new parser object.  If a file name was given and the
# file cannot be opened, returns nothing (undef in scalar context);
# $! then tells why.  Croaks when no event argspec is given, or when
# not exactly one of 'file' and 'doc' is supplied.
sub new
{
    my($class, %cnf) = @_;

    # Pull the per-event argspecs out of the options; whatever is left
    # in %cnf afterwards goes to the base class constructor.
    my %argspec;
    for my $event (qw(start end text declaration comment process default)) {
        my $spec = delete $cnf{$event};
        $argspec{$event} = $spec if defined $spec;
    }
    Carp::croak("Info not collected for any events")
        unless %argspec;

    my $file = delete $cnf{file};
    my $doc  = delete $cnf{doc};
    Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
        if defined($file) && defined($doc);
    Carp::croak("No 'doc' or 'file' given to parse from")
        unless defined($file) || defined($doc);

    # Create object
    $cnf{api_version} = 3;
    my $self = $class->SUPER::new(%cnf);

    # Register the same accumulator array as the handler for every
    # requested event; get_token() later shifts tokens off this array.
    # SUPER::handler is called directly because this class overrides
    # handler() to croak.
    my $accum = $self->{pullparser_accum} = [];
    for my $event (keys %argspec) {
        $self->SUPER::handler($event => $accum, $argspec{$event});
    }

    if (defined $doc) {
        # Always store a scalar reference so that get_token() only has
        # to deal with one representation of the document.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
    }
    else {
        # Anything that is neither a reference nor a glob is taken to
        # be a file name and opened here.
        if (!ref($file) && ref(\$file) ne "GLOB") {
            require IO::File;
            $file = IO::File->new($file, "r")
                or return;    # open failed; $! holds the reason
        }

        $self->{pullparser_file} = $file;
    }
    return $self;
}


# Handlers are set up internally by new(); any attempt to change them
# through the public interface croaks.
sub handler
{
    Carp::croak("Can't set handlers for HTML::PullParser");
}


# Return the next token from the accumulator, or undef when the input
# is exhausted and no tokens are left.
#
# While the accumulator is empty and end of input has not been flagged,
# more input is fed to the parser in pieces of up to 512 (bytes read
# from the file, or characters taken from the string).
sub get_token
{
    my $self = shift;
    while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
        if (my $fh = $self->{pullparser_file}) {
            # must try to parse more from the file
            my $buf;
            if (read($fh, $buf, 512)) {
                $self->parse($buf);
            }
            else {
                # read() returned 0 (EOF) or undef (error); either way
                # the input is treated as finished.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_file};
            }
        }
        elsif (my $sref = $self->{pullparser_str_ref}) {
            # must try to parse more from the scalar
            my $pos   = $self->{pullparser_str_pos};
            my $chunk = substr($$sref, $pos, 512);
            $self->parse($chunk);
            $pos += length($chunk);
            if ($pos < length($$sref)) {
                $self->{pullparser_str_pos} = $pos;
            }
            else {
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_str_ref};
                delete $self->{pullparser_str_pos};
            }
        }
        else {
            # Should not happen for an object built by new(): there is
            # no input source, yet end of input was never flagged.
            Carp::confess("HTML::PullParser: no input source to read from");
        }
    }
    return shift @{$self->{pullparser_accum}};
}


# Push the given tokens back onto the front of the accumulator so that
# get_token() returns them again, in the order given.  Returns the
# parser object.
sub unget_token
{
    my $self = shift;
    unshift @{$self->{pullparser_accum}}, @_;
    return $self;
}

1;


__END__

=head1 NAME

HTML::PullParser - Alternative HTML::Parser interface

=head1 SYNOPSIS

 use HTML::PullParser;

 $p = HTML::PullParser->new(file => "index.html",
                            start => 'event, tagname, @attr',
                            end   => 'event, tagname',
                            ignore_elements => [qw(script style)],
                           ) || die "Can't open: $!";
 while (my $token = $p->get_token) {
     #...do something with $token
 }

=head1 DESCRIPTION

The HTML::PullParser is an alternative interface to the HTML::Parser class.
It basically turns the HTML::Parser inside out.
You associate a file
(or any IO::Handle object or string) with the parser at construction time and
then repeatedly call $parser->get_token to obtain the tags and text
found in the parsed document.

The following methods are provided:

=over 4

=item $p = HTML::PullParser->new( file => $file, %options )

=item $p = HTML::PullParser->new( doc => \$doc, %options )

A C<HTML::PullParser> can be made to parse from either a file or a
literal document based on whether the C<file> or C<doc> option is
passed to the parser's constructor.

The C<file> passed in can either be a file name or a file handle
object.  If a file name is passed, and it can't be opened for reading,
then the constructor will return an undefined value and $! will tell
you why it failed.  Otherwise the argument is taken to be some object
that the C<HTML::PullParser> can read() from when it needs more data.
The stream will be read() until EOF, but not closed.

A C<doc> can be passed plain or as a reference
to a scalar.  If a reference is passed then the value of this scalar
should not be changed before all tokens have been extracted.

Next the information to be returned for the different token types must
be set up.  This is done by simply associating an argspec (as defined
in L<HTML::Parser>) with the events you have an interest in.  For
instance, if you want C<start> tokens to be reported as the string
C<'S'> followed by the tagname and the attributes you might pass a
C<start>-option like this:

   $p = HTML::PullParser->new(
          doc   => $document_to_parse,
          start => '"S", tagname, @attr',
          end   => '"E", tagname',
        );

Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
C<unbroken_text>, can be passed in.  Note that you should not use the
I<event>_h options to set up parser handlers.  That would confuse the
inner logic of C<HTML::PullParser>.

=item $token = $p->get_token

This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document.  The token is returned as an
array reference.  The content of this array matches the argspec set up
during C<HTML::PullParser> construction.

=item $p->unget_token( @tokens )

If you find out you have read too many tokens you can push them back,
so that they are returned again the next time $p->get_token is called.

=back

=head1 EXAMPLES

The 'eg/hform' script shows how we might parse the form section of
HTML documents using HTML::PullParser.

=head1 SEE ALSO

L<HTML::Parser>, L<HTML::TokeParser>

=head1 COPYRIGHT

Copyright 1998-2001 Gisle Aas.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut