1#!/usr/bin/perl 2 3use 5.010; 4use strict; 5use warnings; 6 7use LWP::UserAgent; 8use URI::URL; 9use HTML::LinkExtor; 10use English qw( -no_match_vars ); 11use Fatal qw(open close); 12use CPAN; 13use Getopt::Long; 14 15my $verbose = 0; 16Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]") 17 if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose ); 18 19use constant OK => 200; 20 21my $most_recent_distribution = pop @ARGV; 22if ( not $most_recent_distribution ) { 23 my @distributions = 24 grep {/\A Marpa [-] \d /xms} 25 sort map { $_->[2] } 26 CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-*', 2 ); 27 $most_recent_distribution = pop @distributions; 28 $most_recent_distribution =~ s/\.tar\.gz$//xms; 29} ## end if ( not $most_recent_distribution ) 30 31my $cpan_base = 'http://search.cpan.org'; 32my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/"; 33 34if ($verbose) { 35 print "Starting at $marpa_doc_base\n" 36 or Carp::croak("Cannot print: $ERRNO"); 37} 38 39$OUTPUT_AUTOFLUSH = 1; 40 41my @doc_urls = (); 42 43{ 44 my $p = HTML::LinkExtor->new(); 45 my $ua = LWP::UserAgent->new; 46 47 # Request document and parse it as it arrives 48 my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ), 49 sub { $p->parse( $_[0] ) } ); 50 51 my $page_response_status_line = $response->status_line; 52 if ( $response->code != OK ) { 53 Carp::croak( 'PAGE: ', $page_response_status_line, q{ }, 54 $marpa_doc_base ); 55 } 56 57 my @links = 58 map { $_->[2] } 59 grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms } 60 $p->links(); 61 @doc_urls = grep {/^lib\//xms} @links; 62} 63 64my %url_seen = (); 65 66my $at_col_0 = 1; 67PAGE: for my $url (@doc_urls) { 68 $url = $marpa_doc_base . $url; 69 print "Examining document $url" or Carp::croak("Cannot print: $ERRNO"); 70 $at_col_0 = 0; 71 72 my $p = HTML::LinkExtor->new(); 73 my $ua = LWP::UserAgent->new; 74 75 # Request document and parse it as it arrives 76 my $response = $ua->request( HTTP::Request->new( GET => $url ), 77 sub { $p->parse( $_[0] ) } ); 78 79 my $page_response_status_line = $response->status_line; 80 if ( $response->code != OK ) { 81 say 'PAGE: ', $page_response_status_line, q{ }, $url 82 or Carp::croak("Cannot print: $ERRNO"); 83 next PAGE; 84 } 85 86 my @links = 87 map { $_->[2] } 88 grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links(); 89 90 LINK: for my $link (@links) { 91 92 given ($link) { 93 when (/\A\//xms) { 94 $link = 'http://search.cpan.org' . $link; 95 } 96 when (/\A[#]/xms) { 97 $link = $url . $link; 98 } 99 } ## end given 100 101 if ( $url_seen{$link}++ ) { 102 if ( $verbose >= 2 ) { 103 say {*STDERR} "Already tried $link" 104 or Carp::croak("Cannot print: $ERRNO"); 105 $at_col_0 = 1; 106 } 107 next LINK; 108 } ## end if ( $url_seen{$link}++ ) 109 110 if ( $verbose > 1 ) { 111 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 112 say {*STDERR} "Trying $link" 113 or Carp::croak("Cannot print: $ERRNO"); 114 $at_col_0 = 1; 115 } ## end if ( $verbose > 1 ) 116 117 my $link_response = 118 $ua->request( HTTP::Request->new( GET => $link ) ); 119 120 if ( $link_response->code != OK ) { 121 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 122 say 'FAIL: ', $link_response->status_line, q{ }, $link 123 or Carp::croak("Cannot print: $ERRNO"); 124 $at_col_0 = 1; 125 next LINK; 126 } ## end if ( $link_response->code != OK ) 127 128 if ( not $verbose ) { 129 print {*STDERR} q{.} 130 or Carp::croak("Cannot print: $ERRNO"); 131 $at_col_0 = 0; 132 } 133 134 if ($verbose) { 135 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 136 my $uri = $link_response->base(); 137 say {*STDERR} "FOUND $link" 138 or Carp::croak("Cannot print: $ERRNO"); 139 say {*STDERR} " uri: $uri" 140 or Carp::croak("Cannot print: $ERRNO"); 141 if ( $verbose >= 3 ) { 142 for my $redirect ( $link_response->redirects() ) { 143 my $redirect_uri = $redirect->base(); 144 say {*STDERR} " redirect: $redirect_uri" 145 or Carp::croak("Cannot print: $ERRNO"); 146 } 147 } ## end if ( $verbose >= 3 ) 148 $at_col_0 = 1; 149 } ## end if ($verbose) 150 151 } ## end for my $link (@links) 152 153 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 154 $at_col_0 = 1; 155 156 if ($verbose) { 157 say " PAGE: $page_response_status_line: $url" 158 or Carp::croak("Cannot print: $ERRNO"); 159 $at_col_0 = 1; 160 } 161 162} ## end for my $url (@doc_urls) 163