1package DocSet::Source::HTML; 2 3use strict; 4use warnings; 5 6use DocSet::Util; 7 8use vars qw(@ISA); 9require DocSet::Doc; 10@ISA = qw(DocSet::Doc); 11 12use constant ENCODE_CHARS => '<>&" '; 13 14sub retrieve_meta_data { 15 my ($self) = @_; 16 17 $self->parse; 18 19 use Pod::POM::View::HTML; 20 my $mode = 'Pod::POM::View::HTML'; 21 #print Pod::POM::View::HTML->print($pom); 22 23 my $title = $self->{parsed_tree}->{title}; 24 25 $self->{meta} = 26 { 27 title => $title, 28 stitle => $title, # stitle is the same as title in docs 29 abstract => $self->{parsed_tree}->{abstract} || '', 30 link => $self->{rel_dst_path}, 31 }; 32 33 # there is no autogenerated TOC for HTML files 34} 35 36my %linkElements = ( # from HTML::Element.pm 37 body => 'background', 38 base => 'href', 39 a => 'href', 40 img => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention 41 form => 'action', 42 input => 'src', 43 'link' => 'href', # need quoting since link is a perl builtin 44 frame => 'src', 45 applet => 'codebase', 46 area => 'href', 47); 48my %tag_attr; 49for my $tag (keys %linkElements) { 50 my $tagval = $linkElements{$tag}; 51 for my $attr (ref $tagval ? @$tagval : $tagval) { 52 $tag_attr{"$tag $attr"}++; 53 } 54} 55 56sub parse { 57 my ($self) = @_; 58 59 # already parsed 60 return if exists $self->{parsed_tree} && $self->{parsed_tree}; 61 62 require HTML::Parser; 63 require HTML::Entities; 64 65 my $new_content; 66 67 # this parsing is for fixing up unsafe chars in URLs 68 { 69 # accum_h(self, $text) 70 sub accum_h { 71 my $self = shift; 72 #print "[ @_ ]"; 73 $self->{content} .= join '', @_; 74 } 75 76 # encode unsafe chars in the URL attributes 77 sub start_h { 78 my ($self, $tagname, $attr, $text) = @_; 79 80 # store away the HTML as is 81 unless ($linkElements{$tagname}) { 82 accum_h($self, $text); 83 return; 84 } 85 86 # escape those that include link elements 87 accum_h($self, qq{<$tagname}); 88 for (keys %$attr) { 89 accum_h($self, qq{ $_="}); 90 my $val = $attr->{$_}; 91 if ($tag_attr{"$tagname $_"}) { 92 $val = HTML::Entities::encode($val, ENCODE_CHARS); 93 } 94 accum_h($self, qq{$val"}); 95 } 96 accum_h($self, qq{>}); 97 } 98 99 sub end_h { 100 my ($self, $tagname) = @_; 101 accum_h($self, "</$tagname>"); 102 } 103 104 sub text_h { 105 my ($self, $text) = @_; 106 accum_h($self, $text); 107 } 108 109 my $p = HTML::Parser->new( 110 api_version => 3, 111 start_h => [\&start_h, "self, tagname, attr, text"], 112 end_h => [\&end_h, "self, tagname"], 113 text_h => [\&text_h, "self, text"], 114 ); 115 # Parse document text chunk by chunk 116 $p->parse(${ $self->{content} }); 117 $p->eof; 118 $new_content = $p->{content}; 119 $self->{content} = \$new_content; 120 #print $new_content, "\n\n\n"; 121 } 122 123 { 124 # this parsing extracts the following elements and makes them 125 # available to templates as: 126 # meta.title 127 # head.meta.* (+ renames: description -> abstract) 128 # head.base 129 # head.link 130 # body 131 132 # init 133 my $start_h = sub { 134 my ($self, $tagname, $attr, $text) = @_; 135 my $meta = $self->{parsed_tree}{head}{meta}; 136 137 # special treatment 138 if ($tagname eq 'meta' && exists $attr->{name} && 139 lc $attr->{name} eq 'description') { 140 $self->{parsed_tree}{abstract} = $attr->{content}; 141 } 142 elsif ($tagname eq 'meta' && exists $attr->{content}) { 143 # note: doesn't take into account the 'scheme' attr, 144 # but that one isn't used much 145 if (exists $attr->{name}) { 146 $meta->{name}{ $attr->{name} } = $attr->{content}; 147 } 148 elsif (exists $attr->{'http-equiv'}) { 149 $meta->{'http-equiv'}{ $attr->{'http-equiv'} } 150 = $attr->{content}; 151 } 152 else { 153 # unsupported head element? 154 } 155 } 156 elsif ($tagname eq 'base') { 157 # there is usually only one <base> 158 $self->{parsed_tree}{head}{base} = $attr->{href} 159 if exists $attr->{href}; 160 } 161 elsif ($tagname eq 'link') { 162 # link elements won't overlap, because each is 163 # additive -> easier to store text 164 $self->{parsed_tree}{head}{link} .= $text if length $text; 165 } 166 # note: if adding other elements that also appear outside <head>, 167 # you will need to check that you are inside <head> by setting 168 # a flag when entering it and unsetting it when exiting 169 }; 170 171 my $end_h = sub { 172 my ($self, $tagname, $skipped_text) = @_; 173 # use $p itself as a tmp storage (ok according to the docs) 174 # <title> and <body> get special treatment 175 if ($tagname eq 'title' or $tagname eq 'body') { 176 $self->{parsed_tree}->{$tagname} = $skipped_text; 177 } 178 }; 179 180 my $p = HTML::Parser->new( 181 api_version => 3, 182 report_tags => [qw(title meta body base link)], 183 start_h => [$start_h, "self, tagname, attr, text"], 184 end_h => [$end_h, "self, tagname, skipped_text"], 185 ); 186 # init 187 $p->{parsed_tree}{head}{meta} = {}; 188 # Parse document text chunk by chunk 189 $p->parse(${ $self->{content} }); 190 $p->eof; 191 192 # store the tree away 193 $self->{parsed_tree} = $p->{parsed_tree}; 194 } 195 196} 197 198 1991; 200__END__ 201 202=head1 NAME 203 204C<DocSet::Source::HTML> - A class for parsing input document in the HTML format 205 206=head1 SYNOPSIS 207 208See C<DocSet::Source> 209 210=head1 DESCRIPTION 211 212=head1 METHODS 213 214=over 215 216=item * parse 217 218Converts the source HTML document into a parsed tree. 219 220=item * retrieve_meta_data 221 222Retrieve and set the meta data that describes the input document into 223the I<meta> object attribute. The I<title> and I<link> meta attributes 224are getting set. the rest of the E<lt>headE<gt> is made available for 225the templates too. 226 227=back 228 229=head1 AUTHORS 230 231Stas Bekman E<lt>stas (at) stason.orgE<gt> 232 233 234=cut 235