1#!/usr/bin/perl 2# This software is copyright (c) 2011 by Jeffrey Kegler 3# This is free software; you can redistribute it and/or modify it 4# under the same terms as the Perl 5 programming language system 5# itself. 6 7use 5.010; 8use strict; 9use warnings; 10 11use LWP::UserAgent; 12use URI::URL; 13use HTML::LinkExtor; 14use English qw( -no_match_vars ); 15use Fatal qw(open close); 16use CPAN; 17use Getopt::Long; 18 19my $verbose = 0; 20Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]") 21 if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose ); 22 23use constant OK => 200; 24 25my $most_recent_distribution = pop @ARGV; 26if ( not $most_recent_distribution ) { 27 my @distributions = 28 sort map { $_->[2] } 29 CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-*', 2 ); 30 $most_recent_distribution = pop @distributions; 31 $most_recent_distribution =~ s/\.tar\.gz$//xms; 32} ## end if ( not $most_recent_distribution ) 33 34my $cpan_base = 'http://search.cpan.org'; 35my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/"; 36 37if ($verbose) { 38 print "Starting at $marpa_doc_base\n" 39 or Carp::croak("Cannot print: $ERRNO"); 40} 41 42$OUTPUT_AUTOFLUSH = 1; 43 44my @doc_urls = (); 45 46{ 47 my $p = HTML::LinkExtor->new(); 48 my $ua = LWP::UserAgent->new; 49 50 # Request document and parse it as it arrives 51 my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ), 52 sub { $p->parse( $_[0] ) } ); 53 54 my $page_response_status_line = $response->status_line; 55 if ( $response->code != OK ) { 56 Carp::croak( 'PAGE: ', $page_response_status_line, q{ }, 57 $marpa_doc_base ); 58 } 59 60 my @links = 61 map { $_->[2] } 62 grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms } 63 $p->links(); 64 @doc_urls = grep {/^lib\//xms} @links; 65} 66 67my %url_seen = (); 68 69my $at_col_0 = 1; 70PAGE: for my $url (@doc_urls) { 71 $url = $marpa_doc_base . $url; 72 print "Examining document $url" or Carp::croak("Cannot print: $ERRNO"); 73 $at_col_0 = 0; 74 75 my $p = HTML::LinkExtor->new(); 76 my $ua = LWP::UserAgent->new; 77 78 # Request document and parse it as it arrives 79 my $response = $ua->request( HTTP::Request->new( GET => $url ), 80 sub { $p->parse( $_[0] ) } ); 81 82 my $page_response_status_line = $response->status_line; 83 if ( $response->code != OK ) { 84 say 'PAGE: ', $page_response_status_line, q{ }, $url 85 or Carp::croak("Cannot print: $ERRNO"); 86 next PAGE; 87 } 88 89 my @links = 90 map { $_->[2] } 91 grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links(); 92 93 LINK: for my $link (@links) { 94 95 given ($link) { 96 when (/\A\//xms) { 97 $link = 'http://search.cpan.org' . $link; 98 } 99 when (/\A[#]/xms) { 100 $link = $url . $link; 101 } 102 } ## end given 103 104 if ( $url_seen{$link}++ ) { 105 if ( $verbose >= 2 ) { 106 say STDERR "Already tried $link" 107 or Carp::croak("Cannot print: $ERRNO"); 108 $at_col_0 = 1; 109 } 110 next LINK; 111 } ## end if ( $url_seen{$link}++ ) 112 113 if ( $verbose > 1 ) { 114 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 115 say STDERR "Trying $link" or Carp::croak("Cannot print: $ERRNO"); 116 $at_col_0 = 1; 117 } 118 119 my $link_response = 120 $ua->request( HTTP::Request->new( GET => $link ) ); 121 122 if ( $link_response->code != OK ) { 123 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 124 say 'FAIL: ', $link_response->status_line, q{ }, $link 125 or Carp::croak("Cannot print: $ERRNO"); 126 $at_col_0 = 1; 127 next LINK; 128 } ## end if ( $link_response->code != OK ) 129 130 if ( not $verbose ) { 131 print {*STDERR} q{.} 132 or Carp::croak("Cannot print: $ERRNO"); 133 $at_col_0 = 0; 134 } 135 136 if ($verbose) { 137 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 138 my $uri = $link_response->base(); 139 say STDERR "FOUND $link" or Carp::croak("Cannot print: $ERRNO"); 140 say STDERR " uri: $uri" or Carp::croak("Cannot print: $ERRNO"); 141 if ( $verbose >= 3 ) { 142 for my $redirect ( $link_response->redirects() ) { 143 my $redirect_uri = $redirect->base(); 144 say STDERR " redirect: $redirect_uri" 145 or Carp::croak("Cannot print: $ERRNO"); 146 } 147 } ## end if ( $verbose >= 3 ) 148 $at_col_0 = 1; 149 } ## end if ($verbose) 150 151 } ## end for my $link (@links) 152 153 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 154 $at_col_0 = 1; 155 156 if ($verbose) { 157 say " PAGE: $page_response_status_line: $url" 158 or Carp::croak("Cannot print: $ERRNO"); 159 $at_col_0 = 1; 160 } 161 162} ## end for my $url (@doc_urls) 163