1#!perl 2use strict; 3use warnings; 4use autodie; 5use feature qw(say); 6require File::Find::Rule; 7require File::Slurp; 8require File::Spec; 9require IO::Socket::SSL; 10use List::Util qw(sum); 11require LWP::UserAgent; 12require Net::FTP; 13require Parallel::Fork::BossWorkerAsync; 14require Term::ProgressBar::Simple; 15require URI::Find::Simple; 16$| = 1; 17 18my %ignore; 19while ( my $line = <main::DATA> ) { 20 chomp $line; 21 next if $line =~ /^#/; 22 next unless $line; 23 $ignore{$line} = 1; 24} 25 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); 27$ua->timeout(58); 28$ua->env_proxy; 29 30my @filenames = @ARGV; 31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') 32 unless @filenames; 33 34my $total_bytes = sum map {-s} @filenames; 35 36my $extract_progress = Term::ProgressBar::Simple->new( 37 { count => $total_bytes, 38 name => 'Extracting URIs', 39 } 40); 41 42my %uris; 43foreach my $filename (@filenames) { 44 next if $filename =~ /uris\.txt/; 45 next if $filename =~ /check_uris/; 46 next if $filename =~ /\.patch$/; 47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod'; 48 next if $filename =~ /checkURL\.pl$/; 49 my $contents = File::Slurp::read_file($filename); 50 my @uris = URI::Find::Simple::list_uris($contents); 51 foreach my $uri (@uris) { 52 next unless $uri =~ /^(http|ftp)/; 53 next if $ignore{$uri}; 54 55 # no need to hit rt.perl.org 56 next 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$}; 58 59 # no need to hit rt.cpan.org 60 next 61 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; 62 63 # no need to hit google groups (weird redirect LWP does not like) 64 next 65 if $uri =~ m{^http://groups\.google\.com/}; 66 67 push @{ $uris{$uri} }, $filename; 68 } 69 $extract_progress += -s $filename; 70} 71 72my $bw = Parallel::Fork::BossWorkerAsync->new( 73 work_handler => \&work_alarmed, 74 global_timeout => 120, 75 worker_count => 20, 76); 77 78foreach my $uri ( keys %uris ) { 79 my @filenames = @{ $uris{$uri} }; 80 $bw->add_work( { uri => $uri, filenames => \@filenames } ); 81} 82 83undef $extract_progress; 84 85my $fetch_progress = Term::ProgressBar::Simple->new( 86 { count => scalar( keys %uris ), 87 name => 'Fetching URIs', 88 } 89); 90 91my %filenames; 92while ( $bw->pending() ) { 93 my $response = $bw->get_result(); 94 my $uri = $response->{uri}; 95 my @filenames = @{ $response->{filenames} }; 96 my $is_success = $response->{is_success}; 97 my $message = $response->{message}; 98 99 unless ($is_success) { 100 foreach my $filename (@filenames) { 101 push @{ $filenames{$filename} }, 102 { uri => $uri, message => $message }; 103 } 104 } 105 $fetch_progress++; 106} 107$bw->shut_down(); 108 109my $fh = IO::File->new('> uris.txt'); 110foreach my $filename ( sort keys %filenames ) { 111 $fh->say("* $filename"); 112 my @bits = @{ $filenames{$filename} }; 113 foreach my $bit (@bits) { 114 my $uri = $bit->{uri}; 115 my $message = $bit->{message}; 116 $fh->say(" $uri"); 117 $fh->say(" $message"); 118 } 119} 120$fh->close; 121 122say 'Finished, see uris.txt'; 123 124sub work_alarmed { 125 my $conf = shift; 126 eval { 127 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required 128 alarm 60; 129 $conf = work($conf); 130 alarm 0; 131 }; 132 if ($@) { 133 $conf->{is_success} = 0; 134 $conf->{message} = 'Timed out'; 135 136 } 137 return $conf; 138} 139 140sub work { 141 my $conf = shift; 142 my $uri = $conf->{uri}; 143 my @filenames = @{ $conf->{filenames} }; 144 145 if ( $uri =~ /^http/ ) { 146 my $uri_without_fragment = URI->new($uri); 147 my $fragment = $uri_without_fragment->fragment(undef); 148 my $response = $ua->head($uri_without_fragment); 149 150 $conf->{is_success} = $response->is_success; 151 $conf->{message} = $response->status_line; 152 return $conf; 153 } else { 154 155 my $uri_object = URI->new($uri); 156 my $host = $uri_object->host; 157 my $path = $uri_object->path; 158 my ( $volume, $directories, $filename ) 159 = File::Spec->splitpath($path); 160 161 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); 162 unless ($ftp) { 163 $conf->{is_succcess} = 0; 164 $conf->{message} = "Can not connect to $host: $@"; 165 return $conf; 166 } 167 168 my $can_login = $ftp->login( "anonymous", '-anonymous@' ); 169 unless ($can_login) { 170 $conf->{is_success} = 0; 171 $conf->{message} = "Can not login ", $ftp->message; 172 return $conf; 173 } 174 175 my $can_binary = $ftp->binary(); 176 unless ($can_binary) { 177 $conf->{is_success} = 0; 178 $conf->{message} = "Can not binary ", $ftp->message; 179 return $conf; 180 } 181 182 my $can_cwd = $ftp->cwd($directories); 183 unless ($can_cwd) { 184 $conf->{is_success} = 0; 185 $conf->{message} = "Can not cwd to $directories ", $ftp->message; 186 return $conf; 187 } 188 189 if ($filename) { 190 my $can_size = $ftp->size($filename); 191 unless ($can_size) { 192 $conf->{is_success} = 0; 193 $conf->{message} 194 = "Can not size $filename in $directories", 195 $ftp->message; 196 return $conf; 197 } 198 } else { 199 my ($can_dir) = $ftp->dir; 200 unless ($can_dir) { 201 my ($can_ls) = $ftp->ls; 202 unless ($can_ls) { 203 $conf->{is_success} = 0; 204 $conf->{message} 205 = "Can not dir or ls in $directories ", 206 $ftp->message; 207 return $conf; 208 } 209 } 210 } 211 212 $conf->{is_success} = 1; 213 return $conf; 214 } 215} 216 217__DATA__ 218# these are fine but give errors 219ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html 220ftp://ftp.stratus.com/pub/vos/utility/utility.html 221 222# these are missing, sigh 223ftp://ftp.sco.com/SLS/ptf7051e.Z 224http://perlmonks.thepen.com/42898.html 225http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/ 226http://public.activestate.com/cgi-bin/perlbrowse 227http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog 228http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631 229http://my.smithmicro.com/mac/stuffit/ 230http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html 231http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html 232 233# these are URI extraction bugs 234http://www.perl.org/E 235http://en.wikipedia.org/wiki/SREC_(file_format 236http://somewhere.else',-type=/ 237ftp:passive-mode 238ftp: 239http:[- 240http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell 241http://www.xray.mpe.mpg.de/mailing-lists/perl5- 242http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: 243http://perl.come/ 244http://www.perl.come/ 245 246# these are used as an example 247http://example.com/ 248http://something.here/ 249http://users.perl5.git.perl.org/~yourlogin/ 250http://github.com/USERNAME/perl/tree/orange 251http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 252http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 253http://somewhere.else$/ 254http://somewhere.else$/ 255http://somewhere.else/bin/foo&bar',-Type= 256http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 257http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 258http://www.perl.org/test.cgi 259http://cpan2.local/ 260http://search.cpan.org/perldoc? 261http://cpan1.local/ 262http://cpan.dev.local/CPAN 263http:/// 264ftp:// 265ftp://myurl/ 266ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff 267http://www14.software.ibm.com/webapp/download/downloadaz.jsp 268http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz 269http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT 270http://localhost/tmp/index.txt 271http://example.com/foo/bar.html 272http://example.com/Text-Bastardize-1.06.tar.gz 273ftp://example.com/sources/packages.txt 274http://example.com/sources/packages.txt 275http://example.com/sources 276ftp://example.com/sources 277http://some.where.com/dir/file.txt 278http://some.where.com/dir/a.txt 279http://foo.com/X.tgz 280ftp://foo.com/X.tgz 281http://foo/ 282http://www.foo.com:8000/ 283http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args 284http://decoded/mirror/path 285http://a/b/c/d/e/f/g/h/i/j 286http://foo/bar.gz 287ftp://ftp.perl.org 288http://purl.org/rss/1.0/modules/taxonomy/ 289ftp://ftp.sun.ac.za/CPAN/CPAN/ 290ftp://ftp.cpan.org/pub/mirror/index.txt 291ftp://cpan.org/pub/mirror/index.txt 292http://example.com/~eh/ 293http://plagger.org/.../rss 294http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz 295http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz 296http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz 297http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz 298http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz 299http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 300http://module-build.sourceforge.net/META-spec-new.html 301http://module-build.sourceforge.net/META-spec-v1.4.html 302http://www.cs.vu.nl/~tmgil/vi.html 303http://perlcomposer.sourceforge.net/vperl.html 304http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep 305http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html 306http://world.std.com/~aep/ptkdb/ 307http://www.castlelink.co.uk/object_system/ 308http://www.fh-wedel.de/elvis/ 309ftp://ftp.blarg.net/users/amol/zsh/ 310ftp://ftp.funet.fi/pub/languages/perl/CPAN 311http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 312http://users.perl5.git.perl.org/~USERNAME 313http://foo/x//y/script.cgi/a//b 314http://xxx/script.cgi/http://foo 315http://foo/./x//z/script.cgi/a/../b//c 316http://somewhere.else/in/movie/land 317http://somewhere.else/finished.html 318http://somewhere.else/bin/foo&bar$ 319http://somewhere.else/ 320http://proxy:8484/ 321http://proxy/ 322http://myrepo.example.com/ 323http://remote/source 324https://example.com/ 325http://example.com:1024/ 326http:///path?foo=bar 327http://[::]:1024/ 328http://([/ 329http://example.com:9000/index.html 330http://proxy.example.com:8080/ 331http:///index.html 332http://[www.json::pp.org]/ 333http://localhost/ 334http://foo.example.com/ 335http://abc.com/a.js 336http://whatever/man/1/crontab 337http://abc.com/c.js 338http://whatever/Foo%3A%3ABar 339http://abc.com/b.js 340http://remote.server.com/jquery.css 341http://some.other.com/page.html 342https://text.com/1/2 343https://text.com/1/2 344http://link.included.here?o=1&p=2 345http://link.included.here?o=1&p=2 346http://link.included.here?o=1&p=2 347http://link.included.here/ 348http://foo/x//y/script.cgi/a//b 349http://xxx/script.cgi/http://foo 350http://foo/./x//z/script.cgi/a/../b//c 351http://somewhere.else/in/movie/land 352http://somewhere.else/finished.html 353http://webproxy:3128/ 354http://www/ 355 356# these are used to generate or match URLs 357http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist 358http://www.cpantesters.org/show/%s.yaml 359ftp://(.*?)/(.*)/(.* 360ftp://(.*?)/(.*)/(.* 361ftp://(.*?)/(.*)/(.* 362ftp://ftp.foo.bar/ 363http://$host/ 364http://wwwe%3C46/ 365ftp:/ 366http://$addr/mark?commit=$ 367http://search.cpan.org/~ 368http:/ 369ftp:%5Cn$url 370http://www.ietf.org/rfc/rfc$2.txt 371http://search.cpan.org/~ 372ftp:%5Cn$url 373 374# weird redirects that LWP doesn't like 375http://www.theperlreview.com/community_calendar 376http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL 377http://sunsolve.sun.com 378 379# broken webserver that doesn't like HEAD requests 380http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view 381http://www.w3.org/TR/html4/loose.dtd 382 383# these have been reported upstream to CPAN authors 384http://www.gnu.org/manual/tar/html_node/tar_139.html 385http://www.w3.org/pub/WWW/TR/Wd-css-1.html 386http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP 387http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp 388http://search.cpan.org/search?query=Module::Build::Convert 389http://www.refcnt.org/papers/module-build-convert 390http://csrc.nist.gov/cryptval/shs.html 391http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp 392http://www.debian.or.jp/~kubota/unicode-symbols.html.en 393http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html 394http://www.debian.or.jp/~kubota/unicode-symbols.html.en 395http://rfc.net/rfc2781.html 396http://www.icu-project.org/charset/ 397http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html 398http://www.rfc-editor.org/ 399http://www.rfc.net/ 400http://www.oreilly.com/people/authors/lunde/cjk_inf.html 401http://www.oreilly.com/catalog/cjkvinfo/ 402http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz 403http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz 404http://www.egt.ie/standards/iso3166/iso3166-1-en.html 405http://www.bsi-global.com/iso4217currency 406http://www.plover.com/~mjd/perl/Memoize/ 407http://www.plover.com/~mjd/perl/MiniMemoize/ 408http://www.sysadminmag.com/tpj/issues/vol5_5/ 409ftp://ftp.tpc.int/tpc/server/UNIX/ 410http://www.nara.gov/genealogy/ 411http://home.utah-inter.net/kinsearch/Soundex.html 412http://www.nara.gov/genealogy/soundex/soundex.html 413http://rfc.net/rfc3461.html 414ftp://ftp.cs.pdx.edu/pub/elvis/ 415http://www.fh-wedel.de/elvis/ 416http://lists.perl.org/list/perl-mvs.html 417http://www.cpan.org/ports/os2/ 418http://github.com/dagolden/cpan-meta-spec 419http://github.com/dagolden/cpan-meta-spec/issues 420http://www.opensource.org/licenses/lgpl-license.phpt 421http://reality.sgi.com/ariel 422http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 423http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 424http://www.nsrl.nist.gov/testdata/ 425http://public.activestate.com/cgi-bin/perlbrowse/p/31194 426http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 427http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 428http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html 429http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118 430http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut 431http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf 432http://github.com/schwern/extutils-makemaker 433https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite 434http://www.json.org/JSON::PP_checker/ 435ftp://ftp.kiae.su/pub/unix/fido/ 436http://www.gallistel.net/nparker/weather/code/ 437http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html 438ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/ 439http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html 440http://public.activestate.com/cgi-bin/perlbrowse/p/33567 441http://public.activestate.com/cgi-bin/perlbrowse/p/33566 442http://www.dsmit.com/cons/ 443http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide 444 445__END__ 446 447=head1 NAME 448 449checkURL.pl - Check that all the URLs in the Perl source are valid 450 451=head1 DESCRIPTION 452 453This program checks that all the URLs in the Perl source are valid. It 454checks HTTP and FTP links in parallel and contains a list of known 455bad example links in its source. It takes 4 minutes to run on my 456machine. The results are written to 'uris.txt' and list the filename, 457the URL and the error: 458 459 * ext/Locale-Maketext/lib/Locale/Maketext.pod 460 http://sunsite.dk/RFC/rfc/rfc2277.html 461 404 Not Found 462 ... 463 464It should be run every so often and links fixed and upstream authors 465notified. 466 467Note that the web is unstable and some websites are temporarily down. 468