1#!/usr/bin/perl
2
3use 5.010;
4use strict;
5use warnings;
6
7use LWP::UserAgent;
8use URI::URL;
9use HTML::LinkExtor;
10use English qw( -no_match_vars );
11use Fatal qw(open close);
12use CPAN;
13use Getopt::Long;
14
15my $verbose = 0;
16Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]")
17    if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose );
18
19use constant OK => 200;
20
21my $most_recent_distribution = pop @ARGV;
22if ( not $most_recent_distribution ) {
23    my @distributions =
24        grep     {/\A Marpa [-] \d /xms}
25        sort map { $_->[2] }
26        CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-*', 2 );
27    $most_recent_distribution = pop @distributions;
28    $most_recent_distribution =~ s/\.tar\.gz$//xms;
29} ## end if ( not $most_recent_distribution )
30
31my $cpan_base      = 'http://search.cpan.org';
32my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/";
33
34if ($verbose) {
35    print "Starting at $marpa_doc_base\n"
36        or Carp::croak("Cannot print: $ERRNO");
37}
38
39$OUTPUT_AUTOFLUSH = 1;
40
41my @doc_urls = ();
42
43{
44    my $p  = HTML::LinkExtor->new();
45    my $ua = LWP::UserAgent->new;
46
47    # Request document and parse it as it arrives
48    my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ),
49        sub { $p->parse( $_[0] ) } );
50
51    my $page_response_status_line = $response->status_line;
52    if ( $response->code != OK ) {
53        Carp::croak( 'PAGE: ', $page_response_status_line, q{ },
54            $marpa_doc_base );
55    }
56
57    my @links =
58        map { $_->[2] }
59        grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms }
60        $p->links();
61    @doc_urls = grep {/^lib\//xms} @links;
62}
63
64my %url_seen = ();
65
66my $at_col_0 = 1;
67PAGE: for my $url (@doc_urls) {
68    $url = $marpa_doc_base . $url;
69    print "Examining document $url" or Carp::croak("Cannot print: $ERRNO");
70    $at_col_0 = 0;
71
72    my $p  = HTML::LinkExtor->new();
73    my $ua = LWP::UserAgent->new;
74
75    # Request document and parse it as it arrives
76    my $response = $ua->request( HTTP::Request->new( GET => $url ),
77        sub { $p->parse( $_[0] ) } );
78
79    my $page_response_status_line = $response->status_line;
80    if ( $response->code != OK ) {
81        say 'PAGE: ', $page_response_status_line, q{ }, $url
82            or Carp::croak("Cannot print: $ERRNO");
83        next PAGE;
84    }
85
86    my @links =
87        map { $_->[2] }
88        grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links();
89
90    LINK: for my $link (@links) {
91
92        given ($link) {
93            when (/\A\//xms) {
94                $link = 'http://search.cpan.org' . $link;
95            }
96            when (/\A[#]/xms) {
97                $link = $url . $link;
98            }
99        } ## end given
100
101        if ( $url_seen{$link}++ ) {
102            if ( $verbose >= 2 ) {
103                say {*STDERR} "Already tried $link"
104                    or Carp::croak("Cannot print: $ERRNO");
105                $at_col_0 = 1;
106            }
107            next LINK;
108        } ## end if ( $url_seen{$link}++ )
109
110        if ( $verbose > 1 ) {
111            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
112            say {*STDERR} "Trying $link"
113                or Carp::croak("Cannot print: $ERRNO");
114            $at_col_0 = 1;
115        } ## end if ( $verbose > 1 )
116
117        my $link_response =
118            $ua->request( HTTP::Request->new( GET => $link ) );
119
120        if ( $link_response->code != OK ) {
121            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
122            say 'FAIL: ', $link_response->status_line, q{ }, $link
123                or Carp::croak("Cannot print: $ERRNO");
124            $at_col_0 = 1;
125            next LINK;
126        } ## end if ( $link_response->code != OK )
127
128        if ( not $verbose ) {
129            print {*STDERR} q{.}
130                or Carp::croak("Cannot print: $ERRNO");
131            $at_col_0 = 0;
132        }
133
134        if ($verbose) {
135            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
136            my $uri = $link_response->base();
137            say {*STDERR} "FOUND $link"
138                or Carp::croak("Cannot print: $ERRNO");
139            say {*STDERR} "  uri: $uri"
140                or Carp::croak("Cannot print: $ERRNO");
141            if ( $verbose >= 3 ) {
142                for my $redirect ( $link_response->redirects() ) {
143                    my $redirect_uri = $redirect->base();
144                    say {*STDERR} "  redirect: $redirect_uri"
145                        or Carp::croak("Cannot print: $ERRNO");
146                }
147            } ## end if ( $verbose >= 3 )
148            $at_col_0 = 1;
149        } ## end if ($verbose)
150
151    } ## end for my $link (@links)
152
153    $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
154    $at_col_0 = 1;
155
156    if ($verbose) {
157        say " PAGE: $page_response_status_line: $url"
158            or Carp::croak("Cannot print: $ERRNO");
159        $at_col_0 = 1;
160    }
161
162} ## end for my $url (@doc_urls)
163