1#!/usr/bin/perl
2# This software is copyright (c) 2011 by Jeffrey Kegler
3# This is free software; you can redistribute it and/or modify it
4# under the same terms as the Perl 5 programming language system
5# itself.
6
7use 5.010;
8use strict;
9use warnings;
10
11use LWP::UserAgent;
12use URI::URL;
13use HTML::LinkExtor;
14use English qw( -no_match_vars );
15use Fatal qw(open close);
16use CPAN;
17use Getopt::Long;
18
19my $verbose = 0;
20Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]")
21    if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose );
22
23use constant OK => 200;
24
25my $most_recent_distribution = pop @ARGV;
26if ( not $most_recent_distribution ) {
27    my @distributions =
28        sort map { $_->[2] }
29        CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-*', 2 );
30    $most_recent_distribution = pop @distributions;
31    $most_recent_distribution =~ s/\.tar\.gz$//xms;
32} ## end if ( not $most_recent_distribution )
33
34my $cpan_base      = 'http://search.cpan.org';
35my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/";
36
37if ($verbose) {
38    print "Starting at $marpa_doc_base\n"
39        or Carp::croak("Cannot print: $ERRNO");
40}
41
42$OUTPUT_AUTOFLUSH = 1;
43
44my @doc_urls = ();
45
46{
47    my $p  = HTML::LinkExtor->new();
48    my $ua = LWP::UserAgent->new;
49
50    # Request document and parse it as it arrives
51    my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ),
52        sub { $p->parse( $_[0] ) } );
53
54    my $page_response_status_line = $response->status_line;
55    if ( $response->code != OK ) {
56        Carp::croak( 'PAGE: ', $page_response_status_line, q{ },
57            $marpa_doc_base );
58    }
59
60    my @links =
61        map { $_->[2] }
62        grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms }
63        $p->links();
64    @doc_urls = grep {/^lib\//xms} @links;
65}
66
67my %url_seen = ();
68
69my $at_col_0 = 1;
70PAGE: for my $url (@doc_urls) {
71    $url = $marpa_doc_base . $url;
72    print "Examining document $url" or Carp::croak("Cannot print: $ERRNO");
73    $at_col_0 = 0;
74
75    my $p  = HTML::LinkExtor->new();
76    my $ua = LWP::UserAgent->new;
77
78    # Request document and parse it as it arrives
79    my $response = $ua->request( HTTP::Request->new( GET => $url ),
80        sub { $p->parse( $_[0] ) } );
81
82    my $page_response_status_line = $response->status_line;
83    if ( $response->code != OK ) {
84        say 'PAGE: ', $page_response_status_line, q{ }, $url
85            or Carp::croak("Cannot print: $ERRNO");
86        next PAGE;
87    }
88
89    my @links =
90        map { $_->[2] }
91        grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links();
92
93    LINK: for my $link (@links) {
94
95        given ($link) {
96            when (/\A\//xms) {
97                $link = 'http://search.cpan.org' . $link;
98            }
99            when (/\A[#]/xms) {
100                $link = $url . $link;
101            }
102        } ## end given
103
104        if ( $url_seen{$link}++ ) {
105            if ( $verbose >= 2 ) {
106                say STDERR "Already tried $link"
107                    or Carp::croak("Cannot print: $ERRNO");
108                $at_col_0 = 1;
109            }
110            next LINK;
111        } ## end if ( $url_seen{$link}++ )
112
113        if ( $verbose > 1 ) {
114            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
115            say STDERR "Trying $link" or Carp::croak("Cannot print: $ERRNO");
116            $at_col_0 = 1;
117        }
118
119        my $link_response =
120            $ua->request( HTTP::Request->new( GET => $link ) );
121
122        if ( $link_response->code != OK ) {
123            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
124            say 'FAIL: ', $link_response->status_line, q{ }, $link
125                or Carp::croak("Cannot print: $ERRNO");
126            $at_col_0 = 1;
127            next LINK;
128        } ## end if ( $link_response->code != OK )
129
130        if ( not $verbose ) {
131            print {*STDERR} q{.}
132                or Carp::croak("Cannot print: $ERRNO");
133            $at_col_0 = 0;
134        }
135
136        if ($verbose) {
137            $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
138            my $uri = $link_response->base();
139            say STDERR "FOUND $link" or Carp::croak("Cannot print: $ERRNO");
140            say STDERR "  uri: $uri" or Carp::croak("Cannot print: $ERRNO");
141            if ( $verbose >= 3 ) {
142                for my $redirect ( $link_response->redirects() ) {
143                    my $redirect_uri = $redirect->base();
144                    say STDERR "  redirect: $redirect_uri"
145                        or Carp::croak("Cannot print: $ERRNO");
146                }
147            } ## end if ( $verbose >= 3 )
148            $at_col_0 = 1;
149        } ## end if ($verbose)
150
151    } ## end for my $link (@links)
152
153    $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO");
154    $at_col_0 = 1;
155
156    if ($verbose) {
157        say " PAGE: $page_response_status_line: $url"
158            or Carp::croak("Cannot print: $ERRNO");
159        $at_col_0 = 1;
160    }
161
162} ## end for my $url (@doc_urls)
163