1#!/usr/bin/perl
2# Copyright 2012 Jeffrey Kegler
3# This file is part of Marpa::PP.  Marpa::PP is free software: you can
4# redistribute it and/or modify it under the terms of the GNU Lesser
5# General Public License as published by the Free Software Foundation,
6# either version 3 of the License, or (at your option) any later version.
7#
8# Marpa::PP is distributed in the hope that it will be useful,
9# but WITHOUT ANY WARRANTY; without even the implied warranty of
10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11# Lesser General Public License for more details.
12#
13# You should have received a copy of the GNU Lesser
14# General Public License along with Marpa::PP.  If not, see
15# http://www.gnu.org/licenses/.
16
17use 5.010;
18use strict;
19use warnings;
20
21use LWP::UserAgent;
22use URI::URL;
23use HTML::LinkExtor;
24use English qw( -no_match_vars );
25use Fatal qw(open close);
26use CPAN;
27use Getopt::Long;
28
29my $verbose = 0;
30Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]")
31    if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose );
32
33use constant OK => 200;
34
35my $most_recent_distribution = pop @ARGV;
36if ( not $most_recent_distribution ) {
37    my @distributions =
38        grep     {/\A Marpa [-] PP [-] \d /xms}
39        sort map { $_->[2] }
40        CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-PP-*', 2 );
41    $most_recent_distribution = pop @distributions;
42    $most_recent_distribution =~ s/\.tar\.gz$//xms;
43} ## end if ( not $most_recent_distribution )
44
45my $cpan_base      = 'http://search.cpan.org';
46my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/";
47
48if ($verbose) {
49    print "Starting at $marpa_doc_base\n"
50        or Carp::croak("Cannot print: $ERRNO");
51}
52
53$OUTPUT_AUTOFLUSH = 1;
54
55my @doc_urls = ();
56
57{
58    my $p  = HTML::LinkExtor->new();
59    my $ua = LWP::UserAgent->new;
60
61    # Request document and parse it as it arrives
62    my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ),
63        sub { $p->parse( $_[0] ) } );
64
65    my $page_response_status_line = $response->status_line;
66    if ( $response->code != OK ) {
67        Carp::croak( 'PAGE: ', $page_response_status_line, q{ },
68            $marpa_doc_base );
69    }
70
71    my @links =
72        map { $_->[2] }
73        grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms }
74        $p->links();
75
76    @doc_urls = grep {/^pod\//xms} @links;
77}
78
79my %url_seen = ();
80
81my $at_col_0 = 1;
82PAGE: for my $url (@doc_urls) {
83    $url = $marpa_doc_base . $url;
84    print "Examining document $url" or Carp::croak("Cannot print: $ERRNO");
85    $at_col_0 = 0;
86
87    my $p  = HTML::LinkExtor->new();
88    my $ua = LWP::UserAgent->new;
89
90    # Request document and parse it as it arrives
91    my $response = $ua->request( HTTP::Request->new( GET => $url ),
92        sub { $p->parse( $_[0] ) } );
93
94    my $page_response_status_line = $response->status_line;
95    if ( $response->code != OK ) {
96        say 'PAGE: ', $page_response_status_line, q{ }, $url
97            or Carp::croak("Cannot print: $ERRNO");
98        next PAGE;
99    }
100
101    my @links =
102        map { $_->[2] }
103        grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links();
104
105    LINK: for my $link (@links) {
106
107        given ($link) {
108            when (/\A\//xms) {
109                $link = 'http://search.cpan.org' . $link;
110            }
111            when (/\A[#]/xms) {
112                $link = $url . $link;
113            }
114        } ## end given
115
116        if ( $url_seen{$link}++ ) {
117            if ( $verbose >= 2 ) {
118                say {*STDERR} "Already tried $link"
119                    or Carp::croak("Cannot print: $ERRNO");
120                $at_col_0 = 1;
121            }
122            next LINK;
123        } ## end if ( $url_seen{$link}++ )
124
125        if ( $verbose > 1 ) {
126            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
127            say {*STDERR} "Trying $link"
128                or Carp::croak("Cannot print: $ERRNO");
129            $at_col_0 = 1;
130        } ## end if ( $verbose > 1 )
131
132        my $link_response =
133            $ua->request( HTTP::Request->new( GET => $link ) );
134
135        if ( $link_response->code != OK ) {
136            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
137            say 'FAIL: ', $link_response->status_line, q{ }, $link
138                or Carp::croak("Cannot print: $ERRNO");
139            $at_col_0 = 1;
140            next LINK;
141        } ## end if ( $link_response->code != OK )
142
143        if ( not $verbose ) {
144            print {*STDERR} q{.}
145                or Carp::croak("Cannot print: $ERRNO");
146            $at_col_0 = 0;
147        }
148
149        if ($verbose) {
150            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
151            my $uri = $link_response->base();
152            say {*STDERR} "FOUND $link"
153                or Carp::croak("Cannot print: $ERRNO");
154            say {*STDERR} "  uri: $uri"
155                or Carp::croak("Cannot print: $ERRNO");
156            if ( $verbose >= 3 ) {
157                for my $redirect ( $link_response->redirects() ) {
158                    my $redirect_uri = $redirect->base();
159                    say {*STDERR} "  redirect: $redirect_uri"
160                        or Carp::croak("Cannot print: $ERRNO");
161                }
162            } ## end if ( $verbose >= 3 )
163            $at_col_0 = 1;
164        } ## end if ($verbose)
165
166    } ## end for my $link (@links)
167
168    $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
169    $at_col_0 = 1;
170
171    if ($verbose) {
172        say " PAGE: $page_response_status_line: $url"
173            or Carp::croak("Cannot print: $ERRNO");
174        $at_col_0 = 1;
175    }
176
177} ## end for my $url (@doc_urls)
178