DocSet/Source/HTML.pm

package DocSet::Source::HTML;

use strict;
use warnings;

use DocSet::Util;

use vars qw(@ISA);
require DocSet::Doc;
@ISA = qw(DocSet::Doc);

use constant ENCODE_CHARS => '<>&" ';

sub retrieve_meta_data {
    my ($self) = @_;

    $self->parse;

    use Pod::POM::View::HTML;
    my $mode = 'Pod::POM::View::HTML';
    #print Pod::POM::View::HTML->print($pom);

    my $title = $self->{parsed_tree}->{title};

    $self->{meta} =
        {
         title    => $title,
         stitle   => $title, # stitle is the same as title in docs
         abstract => $self->{parsed_tree}->{abstract} || '',
         link     => $self->{rel_dst_path},
        };

    # there is no autogenerated TOC for HTML files
}

my %linkElements =  (          # from HTML::Element.pm
    body   => 'background',
    base   => 'href',
    a      => 'href',
    img    => [qw(src lowsrc usemap)], # lowsrc is a Netscape invention
    form   => 'action',
    input  => 'src',
    'link'  => 'href',         # need quoting since link is a perl builtin
    frame  => 'src',
    applet => 'codebase',
    area   => 'href',
);
my %tag_attr;
for my $tag (keys %linkElements) {
    my $tagval = $linkElements{$tag};
    for my $attr (ref $tagval ? @$tagval : $tagval) {
        $tag_attr{"$tag $attr"}++;
    }
}

sub parse {
    my ($self) = @_;

    # already parsed
    return if exists $self->{parsed_tree} && $self->{parsed_tree};

    require HTML::Parser;
    require HTML::Entities;

    my $new_content;

    # this parsing is for fixing up unsafe chars in URLs
    {
        # accum_h(self, $text)
        sub accum_h {
            my $self = shift;
            #print "[ @_ ]";
            $self->{content} .= join '', @_;
        }

        # encode unsafe chars in the URL attributes
        sub start_h {
            my ($self, $tagname, $attr, $text) = @_;

            # store away the HTML as is
            unless ($linkElements{$tagname}) {
                accum_h($self, $text);
                return;
            }

            # escape those that include link elements
            accum_h($self, qq{<$tagname});
            for (keys %$attr) {
                accum_h($self, qq{ $_="});
                my $val = $attr->{$_};
                if ($tag_attr{"$tagname $_"}) {
                    $val = HTML::Entities::encode($val, ENCODE_CHARS);
                }
                accum_h($self, qq{$val"});
            }
            accum_h($self, qq{>});
        }

        sub end_h {
            my ($self, $tagname) = @_;
            accum_h($self, "</$tagname>");
        }

        sub text_h {
            my ($self, $text) = @_;
            accum_h($self, $text);
        }

        my $p = HTML::Parser->new(
            api_version => 3,
            start_h     => [\&start_h, "self, tagname, attr, text"],
            end_h       => [\&end_h,  "self, tagname"],
            text_h      => [\&text_h, "self, text"],
        );
        # Parse document text chunk by chunk
        $p->parse(${ $self->{content} });
        $p->eof;
        $new_content = $p->{content};
        $self->{content} = \$new_content;
        #print $new_content, "\n\n\n";
    }

    {
        # this parsing extracts the following elements and makes them
        # available to templates as:
        # meta.title
        # head.meta.* (+ renames: description -> abstract)
        # head.base
        # head.link
        # body

        # init
        my $start_h = sub {
            my ($self, $tagname, $attr, $text) = @_;
            my $meta = $self->{parsed_tree}{head}{meta};

            # special treatment
            if ($tagname eq 'meta' && exists $attr->{name} &&
                lc $attr->{name} eq 'description') {
                $self->{parsed_tree}{abstract} = $attr->{content};
            }
            elsif ($tagname eq 'meta' && exists $attr->{content}) {
                # note: doesn't take into account the 'scheme' attr,
                # but that one isn't used much
                if (exists $attr->{name}) {
                    $meta->{name}{ $attr->{name} } = $attr->{content};
                }
                elsif (exists $attr->{'http-equiv'}) {
                    $meta->{'http-equiv'}{ $attr->{'http-equiv'} }
                        = $attr->{content};
                }
                else {
                    # unsupported head element?
                }
            }
            elsif ($tagname eq 'base') {
                # there is usually only one <base>
                $self->{parsed_tree}{head}{base} = $attr->{href}
                    if exists $attr->{href};
            }
            elsif ($tagname eq 'link') {
                # link elements won't overlap, because each is
                # additive -> easier to store text
                $self->{parsed_tree}{head}{link} .= $text if length $text;
            }
            # note: if adding other elements that also appear outside <head>,
            # you will need to check that you are inside <head>  by setting
            # a flag when entering it and unsetting it when exiting
        };

        my $end_h = sub {
            my ($self, $tagname, $skipped_text) = @_;
            # use $p itself as a tmp storage (ok according to the docs)
            # <title> and <body> get special treatment
            if ($tagname eq 'title' or $tagname eq 'body') {
                $self->{parsed_tree}->{$tagname} = $skipped_text;
            }
        };

        my $p = HTML::Parser->new(
            api_version => 3,
            report_tags => [qw(title meta body base link)],
            start_h     => [$start_h, "self, tagname, attr, text"],
            end_h       => [$end_h, "self, tagname, skipped_text"],
        );
        # init
        $p->{parsed_tree}{head}{meta} = {};
        # Parse document text chunk by chunk
        $p->parse(${ $self->{content} });
        $p->eof;

        # store the tree away
        $self->{parsed_tree} = $p->{parsed_tree};
    }

}


1;
__END__

=head1 NAME

C<DocSet::Source::HTML> - A class for parsing input document in the HTML format

=head1 SYNOPSIS

See C<DocSet::Source>

=head1 DESCRIPTION

=head1 METHODS

=over

=item * parse

Converts the source HTML document into a parsed tree.

=item * retrieve_meta_data

Retrieve and set the meta data that describes the input document into
the I<meta> object attribute. The I<title> and I<link> meta attributes
are getting set. the rest of the E<lt>headE<gt> is made available for
the templates too.

=back

=head1 AUTHORS

Stas Bekman E<lt>stas (at) stason.orgE<gt>


=cut