1#!/usr/bin/perl 2# Copyright 2012 Jeffrey Kegler 3# This file is part of Marpa::PP. Marpa::PP is free software: you can 4# redistribute it and/or modify it under the terms of the GNU Lesser 5# General Public License as published by the Free Software Foundation, 6# either version 3 of the License, or (at your option) any later version. 7# 8# Marpa::PP is distributed in the hope that it will be useful, 9# but WITHOUT ANY WARRANTY; without even the implied warranty of 10# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11# Lesser General Public License for more details. 12# 13# You should have received a copy of the GNU Lesser 14# General Public License along with Marpa::PP. If not, see 15# http://www.gnu.org/licenses/. 16 17use 5.010; 18use strict; 19use warnings; 20 21use LWP::UserAgent; 22use URI::URL; 23use HTML::LinkExtor; 24use English qw( -no_match_vars ); 25use Fatal qw(open close); 26use CPAN; 27use Getopt::Long; 28 29my $verbose = 0; 30Carp::croak("usage: $PROGRAM_NAME [--verbose=[0|1|2] [distribution]") 31 if not Getopt::Long::GetOptions( 'verbose=i' => \$verbose ); 32 33use constant OK => 200; 34 35my $most_recent_distribution = pop @ARGV; 36if ( not $most_recent_distribution ) { 37 my @distributions = 38 grep {/\A Marpa [-] PP [-] \d /xms} 39 sort map { $_->[2] } 40 CPAN::Shell->expand( 'Author', 'JKEGL' )->ls( 'Marpa-PP-*', 2 ); 41 $most_recent_distribution = pop @distributions; 42 $most_recent_distribution =~ s/\.tar\.gz$//xms; 43} ## end if ( not $most_recent_distribution ) 44 45my $cpan_base = 'http://search.cpan.org'; 46my $marpa_doc_base = $cpan_base . '/~jkegl/' . "$most_recent_distribution/"; 47 48if ($verbose) { 49 print "Starting at $marpa_doc_base\n" 50 or Carp::croak("Cannot print: $ERRNO"); 51} 52 53$OUTPUT_AUTOFLUSH = 1; 54 55my @doc_urls = (); 56 57{ 58 my $p = HTML::LinkExtor->new(); 59 my $ua = LWP::UserAgent->new; 60 61 # Request document and parse it as it arrives 62 my $response = $ua->request( HTTP::Request->new( GET => $marpa_doc_base ), 63 sub { $p->parse( $_[0] ) } ); 64 65 my $page_response_status_line = $response->status_line; 66 if ( $response->code != OK ) { 67 Carp::croak( 'PAGE: ', $page_response_status_line, q{ }, 68 $marpa_doc_base ); 69 } 70 71 my @links = 72 map { $_->[2] } 73 grep { $_->[0] eq 'a' and $_->[1] eq 'href' and $_->[2] !~ /^[#]/xms } 74 $p->links(); 75 76 @doc_urls = grep {/^pod\//xms} @links; 77} 78 79my %url_seen = (); 80 81my $at_col_0 = 1; 82PAGE: for my $url (@doc_urls) { 83 $url = $marpa_doc_base . $url; 84 print "Examining document $url" or Carp::croak("Cannot print: $ERRNO"); 85 $at_col_0 = 0; 86 87 my $p = HTML::LinkExtor->new(); 88 my $ua = LWP::UserAgent->new; 89 90 # Request document and parse it as it arrives 91 my $response = $ua->request( HTTP::Request->new( GET => $url ), 92 sub { $p->parse( $_[0] ) } ); 93 94 my $page_response_status_line = $response->status_line; 95 if ( $response->code != OK ) { 96 say 'PAGE: ', $page_response_status_line, q{ }, $url 97 or Carp::croak("Cannot print: $ERRNO"); 98 next PAGE; 99 } 100 101 my @links = 102 map { $_->[2] } 103 grep { $_->[0] eq 'a' and $_->[1] eq 'href' } $p->links(); 104 105 LINK: for my $link (@links) { 106 107 given ($link) { 108 when (/\A\//xms) { 109 $link = 'http://search.cpan.org' . $link; 110 } 111 when (/\A[#]/xms) { 112 $link = $url . $link; 113 } 114 } ## end given 115 116 if ( $url_seen{$link}++ ) { 117 if ( $verbose >= 2 ) { 118 say {*STDERR} "Already tried $link" 119 or Carp::croak("Cannot print: $ERRNO"); 120 $at_col_0 = 1; 121 } 122 next LINK; 123 } ## end if ( $url_seen{$link}++ ) 124 125 if ( $verbose > 1 ) { 126 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 127 say {*STDERR} "Trying $link" 128 or Carp::croak("Cannot print: $ERRNO"); 129 $at_col_0 = 1; 130 } ## end if ( $verbose > 1 ) 131 132 my $link_response = 133 $ua->request( HTTP::Request->new( GET => $link ) ); 134 135 if ( $link_response->code != OK ) { 136 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 137 say 'FAIL: ', $link_response->status_line, q{ }, $link 138 or Carp::croak("Cannot print: $ERRNO"); 139 $at_col_0 = 1; 140 next LINK; 141 } ## end if ( $link_response->code != OK ) 142 143 if ( not $verbose ) { 144 print {*STDERR} q{.} 145 or Carp::croak("Cannot print: $ERRNO"); 146 $at_col_0 = 0; 147 } 148 149 if ($verbose) { 150 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 151 my $uri = $link_response->base(); 152 say {*STDERR} "FOUND $link" 153 or Carp::croak("Cannot print: $ERRNO"); 154 say {*STDERR} " uri: $uri" 155 or Carp::croak("Cannot print: $ERRNO"); 156 if ( $verbose >= 3 ) { 157 for my $redirect ( $link_response->redirects() ) { 158 my $redirect_uri = $redirect->base(); 159 say {*STDERR} " redirect: $redirect_uri" 160 or Carp::croak("Cannot print: $ERRNO"); 161 } 162 } ## end if ( $verbose >= 3 ) 163 $at_col_0 = 1; 164 } ## end if ($verbose) 165 166 } ## end for my $link (@links) 167 168 $at_col_0 or print "\n" or Carp::croak("Cannot print: $ERRNO"); 169 $at_col_0 = 1; 170 171 if ($verbose) { 172 say " PAGE: $page_response_status_line: $url" 173 or Carp::croak("Cannot print: $ERRNO"); 174 $at_col_0 = 1; 175 } 176 177} ## end for my $url (@doc_urls) 178