1package DocSet::Source::HTML;
2
3use strict;
4use warnings;
5
6use DocSet::Util;
7
8use vars qw(@ISA);
9require DocSet::Doc;
10@ISA = qw(DocSet::Doc);
11
12use constant ENCODE_CHARS => '<>&" ';
13
14sub retrieve_meta_data {
15    my ($self) = @_;
16
17    $self->parse;
18
19    use Pod::POM::View::HTML;
20    my $mode = 'Pod::POM::View::HTML';
21    #print Pod::POM::View::HTML->print($pom);
22
23    my $title = $self->{parsed_tree}->{title};
24
25    $self->{meta} =
26        {
27         title    => $title,
28         stitle   => $title, # stitle is the same as title in docs
29         abstract => $self->{parsed_tree}->{abstract} || '',
30         link     => $self->{rel_dst_path},
31        };
32
33    # there is no autogenerated TOC for HTML files
34}
35
36my %linkElements =  (          # from HTML::Element.pm
37    body   => 'background',
38    base   => 'href',
39    a      => 'href',
40    img    => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
41    form   => 'action',
42    input  => 'src',
43    'link'  => 'href',         # need quoting since link is a perl builtin
44    frame  => 'src',
45    applet => 'codebase',
46    area   => 'href',
47);
48my %tag_attr;
49for my $tag (keys %linkElements) {
50    my $tagval = $linkElements{$tag};
51    for my $attr (ref $tagval ? @$tagval : $tagval) {
52        $tag_attr{"$tag $attr"}++;
53    }
54}
55
56sub parse {
57    my ($self) = @_;
58
59    # already parsed
60    return if exists $self->{parsed_tree} && $self->{parsed_tree};
61
62    require HTML::Parser;
63    require HTML::Entities;
64
65    my $new_content;
66
67    # this parsing is for fixing up unsafe chars in URLs
68    {
69        # accum_h(self, $text)
70        sub accum_h {
71            my $self = shift;
72            #print "[ @_ ]";
73            $self->{content} .= join '', @_;
74        }
75
76        # encode unsafe chars in the URL attributes
77        sub start_h {
78            my ($self, $tagname, $attr, $text) = @_;
79
80            # store away the HTML as is
81            unless ($linkElements{$tagname}) {
82                accum_h($self, $text);
83                return;
84            }
85
86            # escape those that include link elements
87            accum_h($self, qq{<$tagname});
88            for (keys %$attr) {
89                accum_h($self, qq{ $_="});
90                my $val = $attr->{$_};
91                if ($tag_attr{"$tagname $_"}) {
92                    $val = HTML::Entities::encode($val, ENCODE_CHARS);
93                }
94                accum_h($self, qq{$val"});
95            }
96            accum_h($self, qq{>});
97        }
98
99        sub end_h {
100            my ($self, $tagname) = @_;
101            accum_h($self, "</$tagname>");
102        }
103
104        sub text_h {
105            my ($self, $text) = @_;
106            accum_h($self, $text);
107        }
108
109        my $p = HTML::Parser->new(
110            api_version => 3,
111            start_h     => [\&start_h, "self, tagname, attr, text"],
112            end_h       => [\&end_h,  "self, tagname"],
113            text_h      => [\&text_h, "self, text"],
114        );
115        # Parse document text chunk by chunk
116        $p->parse(${ $self->{content} });
117        $p->eof;
118        $new_content = $p->{content};
119        $self->{content} = \$new_content;
120        #print $new_content, "\n\n\n";
121    }
122
123    {
124        # this parsing extracts the following elements and makes them
125        # available to templates as:
126        # meta.title
127        # head.meta.* (+ renames: description -> abstract)
128        # head.base
129        # head.link
130        # body
131
132        # init
133        my $start_h = sub {
134            my ($self, $tagname, $attr, $text) = @_;
135            my $meta = $self->{parsed_tree}{head}{meta};
136
137            # special treatment
138            if ($tagname eq 'meta' && exists $attr->{name} &&
139                lc $attr->{name} eq 'description') {
140                $self->{parsed_tree}{abstract} = $attr->{content};
141            }
142            elsif ($tagname eq 'meta' && exists $attr->{content}) {
143                # note: doesn't take into account the 'scheme' attr,
144                # but that one isn't used much
145                if (exists $attr->{name}) {
146                    $meta->{name}{ $attr->{name} } = $attr->{content};
147                }
148                elsif (exists $attr->{'http-equiv'}) {
149                    $meta->{'http-equiv'}{ $attr->{'http-equiv'} }
150                        = $attr->{content};
151                }
152                else {
153                    # unsupported head element?
154                }
155            }
156            elsif ($tagname eq 'base') {
157                # there is usually only one <base>
158                $self->{parsed_tree}{head}{base} = $attr->{href}
159                    if exists $attr->{href};
160            }
161            elsif ($tagname eq 'link') {
162                # link elements won't overlap, because each is
163                # additive -> easier to store text
164                $self->{parsed_tree}{head}{link} .= $text if length $text;
165            }
166            # note: if adding other elements that also appear outside <head>,
167            # you will need to check that you are inside <head>  by setting
168            # a flag when entering it and unsetting it when exiting
169        };
170
171        my $end_h = sub {
172            my ($self, $tagname, $skipped_text) = @_;
173            # use $p itself as a tmp storage (ok according to the docs)
174            # <title> and <body> get special treatment
175            if ($tagname eq 'title' or $tagname eq 'body') {
176                $self->{parsed_tree}->{$tagname} = $skipped_text;
177            }
178        };
179
180        my $p = HTML::Parser->new(
181            api_version => 3,
182            report_tags => [qw(title meta body base link)],
183            start_h     => [$start_h, "self, tagname, attr, text"],
184            end_h       => [$end_h, "self, tagname, skipped_text"],
185        );
186        # init
187        $p->{parsed_tree}{head}{meta} = {};
188        # Parse document text chunk by chunk
189        $p->parse(${ $self->{content} });
190        $p->eof;
191
192        # store the tree away
193        $self->{parsed_tree} = $p->{parsed_tree};
194    }
195
196}
197
198
1991;
200__END__
201
202=head1 NAME
203
204C<DocSet::Source::HTML> - A class for parsing input document in the HTML format
205
206=head1 SYNOPSIS
207
208See C<DocSet::Source>
209
210=head1 DESCRIPTION
211
212=head1 METHODS
213
214=over
215
216=item * parse
217
218Converts the source HTML document into a parsed tree.
219
220=item * retrieve_meta_data
221
222Retrieve and set the meta data that describes the input document into
223the I<meta> object attribute. The I<title> and I<link> meta attributes
224are getting set. the rest of the E<lt>headE<gt> is made available for
225the templates too.
226
227=back
228
229=head1 AUTHORS
230
231Stas Bekman E<lt>stas (at) stason.orgE<gt>
232
233
234=cut
235