1#!perl 2use strict; 3use warnings; 4use autodie; 5use feature qw(say); 6require File::Find::Rule; 7require File::Slurp; 8require File::Spec; 9require IO::Socket::SSL; 10use List::Util qw(sum); 11require LWP::UserAgent; 12require Net::FTP; 13require Parallel::Fork::BossWorkerAsync; 14require Term::ProgressBar::Simple; 15require URI::Find::Simple; 16$| = 1; 17 18my %ignore; 19while ( my $line = <main::DATA> ) { 20 chomp $line; 21 next if $line =~ /^#/; 22 next unless $line; 23 $ignore{$line} = 1; 24} 25 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); 27$ua->timeout(58); 28$ua->env_proxy; 29 30my @filenames = @ARGV; 31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') 32 unless @filenames; 33 34my $total_bytes = sum map {-s} @filenames; 35 36my $extract_progress = Term::ProgressBar::Simple->new( 37 { count => $total_bytes, 38 name => 'Extracting URIs', 39 } 40); 41 42my %uris; 43foreach my $filename (@filenames) { 44 next if $filename =~ /uris\.txt/; 45 next if $filename =~ /check_uris/; 46 next if $filename =~ /\.patch$/; 47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod'; 48 next if $filename =~ /checkURL\.pl$/; 49 my $contents = File::Slurp::read_file($filename); 50 my @uris = URI::Find::Simple::list_uris($contents); 51 foreach my $uri (@uris) { 52 next unless $uri =~ /^(http|ftp)/; 53 next if $ignore{$uri}; 54 55 # no need to hit rt.perl.org 56 next 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$}; 58 59 # no need to hit github 60 next 61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$}; 62 63 # no need to hit rt.cpan.org 64 next 65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; 66 67 # no need to hit google groups (weird redirect LWP does not like) 68 next 69 if $uri =~ m{^http://groups\.google\.com/}; 70 71 push @{ $uris{$uri} }, $filename; 72 } 73 $extract_progress += -s $filename; 74} 75 76my $bw = Parallel::Fork::BossWorkerAsync->new( 77 work_handler => \&work_alarmed, 78 global_timeout => 120, 79 worker_count => 20, 80); 81 82foreach my $uri ( keys %uris ) { 83 my @filenames = @{ $uris{$uri} }; 84 $bw->add_work( { uri => $uri, filenames => \@filenames } ); 85} 86 87undef $extract_progress; 88 89my $fetch_progress = Term::ProgressBar::Simple->new( 90 { count => scalar( keys %uris ), 91 name => 'Fetching URIs', 92 } 93); 94 95my %filenames; 96while ( $bw->pending() ) { 97 my $response = $bw->get_result(); 98 my $uri = $response->{uri}; 99 my @filenames = @{ $response->{filenames} }; 100 my $is_success = $response->{is_success}; 101 my $message = $response->{message}; 102 103 unless ($is_success) { 104 foreach my $filename (@filenames) { 105 push @{ $filenames{$filename} }, 106 { uri => $uri, message => $message }; 107 } 108 } 109 $fetch_progress++; 110} 111$bw->shut_down(); 112 113my $fh = IO::File->new('> uris.txt'); 114foreach my $filename ( sort keys %filenames ) { 115 $fh->say("* $filename"); 116 my @bits = @{ $filenames{$filename} }; 117 foreach my $bit (@bits) { 118 my $uri = $bit->{uri}; 119 my $message = $bit->{message}; 120 $fh->say(" $uri"); 121 $fh->say(" $message"); 122 } 123} 124$fh->close; 125 126say 'Finished, see uris.txt'; 127 128sub work_alarmed { 129 my $conf = shift; 130 eval { 131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required 132 alarm 60; 133 $conf = work($conf); 134 alarm 0; 135 }; 136 if ($@) { 137 $conf->{is_success} = 0; 138 $conf->{message} = 'Timed out'; 139 140 } 141 return $conf; 142} 143 144sub work { 145 my $conf = shift; 146 my $uri = $conf->{uri}; 147 my @filenames = @{ $conf->{filenames} }; 148 149 if ( $uri =~ /^http/ ) { 150 my $uri_without_fragment = URI->new($uri); 151 my $fragment = $uri_without_fragment->fragment(undef); 152 my $response = $ua->head($uri_without_fragment); 153 154 $conf->{is_success} = $response->is_success; 155 $conf->{message} = $response->status_line; 156 return $conf; 157 } else { 158 159 my $uri_object = URI->new($uri); 160 my $host = $uri_object->host; 161 my $path = $uri_object->path; 162 my ( $volume, $directories, $filename ) 163 = File::Spec->splitpath($path); 164 165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); 166 unless ($ftp) { 167 $conf->{is_succcess} = 0; 168 $conf->{message} = "Can not connect to $host: $@"; 169 return $conf; 170 } 171 172 my $can_login = $ftp->login( "anonymous", '-anonymous@' ); 173 unless ($can_login) { 174 $conf->{is_success} = 0; 175 $conf->{message} = "Can not login ", $ftp->message; 176 return $conf; 177 } 178 179 my $can_binary = $ftp->binary(); 180 unless ($can_binary) { 181 $conf->{is_success} = 0; 182 $conf->{message} = "Can not binary ", $ftp->message; 183 return $conf; 184 } 185 186 my $can_cwd = $ftp->cwd($directories); 187 unless ($can_cwd) { 188 $conf->{is_success} = 0; 189 $conf->{message} = "Can not cwd to $directories ", $ftp->message; 190 return $conf; 191 } 192 193 if ($filename) { 194 my $can_size = $ftp->size($filename); 195 unless ($can_size) { 196 $conf->{is_success} = 0; 197 $conf->{message} 198 = "Can not size $filename in $directories", 199 $ftp->message; 200 return $conf; 201 } 202 } else { 203 my ($can_dir) = $ftp->dir; 204 unless ($can_dir) { 205 my ($can_ls) = $ftp->ls; 206 unless ($can_ls) { 207 $conf->{is_success} = 0; 208 $conf->{message} 209 = "Can not dir or ls in $directories ", 210 $ftp->message; 211 return $conf; 212 } 213 } 214 } 215 216 $conf->{is_success} = 1; 217 return $conf; 218 } 219} 220 221__DATA__ 222# these are fine but give errors 223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html 224ftp://ftp.stratus.com/pub/vos/utility/utility.html 225 226# these are missing, sigh 227ftp://ftp.sco.com/SLS/ptf7051e.Z 228http://perlmonks.thepen.com/42898.html 229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/ 230http://public.activestate.com/cgi-bin/perlbrowse 231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog 232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631 233http://my.smithmicro.com/mac/stuffit/ 234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html 235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html 236http://www.openzaurus.org/ 237http://Casbah.org/ 238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe 239http://www.pvhp.com/~pvhp/ 240http://www.pvhp.com/%7Epvhp/ 241http://www.pvhp.com/%7epvhp/ 242http://www.leo.org 243http://www.madgoat.com 244http://www.mks.com/s390/gnu/ 245http://www.tpj.com/ 246http://safaribooksonline.com/ 247http://use.perl.org/~autrijus/journal/25768 248http://www.s390.ibm.com/products/oe/bpxqp1.html 249http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html 250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html 251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html 252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html 253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html 254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html 255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html 256http://www.w3.org/Security/Faq/ 257 258# these are URI extraction bugs 259http://www.perl.org/E 260http://en.wikipedia.org/wiki/SREC_(file_format 261http://somewhere.else',-type=/ 262ftp:passive-mode 263ftp: 264http:[- 265http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell 266http://www.xray.mpe.mpg.de/mailing-lists/perl5- 267http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: 268http://perl.come/ 269http://www.perl.come/ 270 271# these are used as an example 272http://example.com/ 273http://something.here/ 274http://users.perl5.git.perl.org/~yourlogin/ 275http://github.com/USERNAME/perl/tree/orange 276http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 278http://somewhere.else$/ 279http://somewhere.else$/ 280http://somewhere.else/bin/foo&bar',-Type= 281http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 283http://www.perl.org/test.cgi 284http://cpan2.local/ 285http://search.cpan.org/perldoc? 286http://cpan1.local/ 287http://cpan.dev.local/CPAN 288http:/// 289ftp:// 290ftp://myurl/ 291ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff 292http://www14.software.ibm.com/webapp/download/downloadaz.jsp 293http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz 294http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT 295http://localhost/tmp/index.txt 296http://example.com/foo/bar.html 297http://example.com/Text-Bastardize-1.06.tar.gz 298ftp://example.com/sources/packages.txt 299http://example.com/sources/packages.txt 300http://example.com/sources 301ftp://example.com/sources 302http://some.where.com/dir/file.txt 303http://some.where.com/dir/a.txt 304http://foo.com/X.tgz 305ftp://foo.com/X.tgz 306http://foo/ 307http://www.foo.com:8000/ 308http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args 309http://decoded/mirror/path 310http://a/b/c/d/e/f/g/h/i/j 311http://foo/bar.gz 312ftp://ftp.perl.org 313http://purl.org/rss/1.0/modules/taxonomy/ 314ftp://ftp.sun.ac.za/CPAN/CPAN/ 315ftp://ftp.cpan.org/pub/mirror/index.txt 316ftp://cpan.org/pub/mirror/index.txt 317http://example.com/~eh/ 318http://plagger.org/.../rss 319http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz 320http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz 321http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz 322http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz 323http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz 324http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 325http://module-build.sourceforge.net/META-spec-new.html 326http://module-build.sourceforge.net/META-spec-v1.4.html 327http://www.cs.vu.nl/~tmgil/vi.html 328http://perlcomposer.sourceforge.net/vperl.html 329http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep 330http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html 331http://world.std.com/~aep/ptkdb/ 332http://www.castlelink.co.uk/object_system/ 333http://www.fh-wedel.de/elvis/ 334ftp://ftp.blarg.net/users/amol/zsh/ 335ftp://ftp.funet.fi/pub/languages/perl/CPAN 336http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 337http://users.perl5.git.perl.org/~USERNAME 338http://foo/x//y/script.cgi/a//b 339http://xxx/script.cgi/http://foo 340http://foo/./x//z/script.cgi/a/../b//c 341http://somewhere.else/in/movie/land 342http://somewhere.else/finished.html 343http://somewhere.else/bin/foo&bar$ 344http://somewhere.else/ 345http://proxy:8484/ 346http://proxy/ 347http://myrepo.example.com/ 348http://remote/source 349https://example.com/ 350http://example.com:1024/ 351http:///path?foo=bar 352http://[::]:1024/ 353http://([/ 354http://example.com:9000/index.html 355http://proxy.example.com:8080/ 356http:///index.html 357http://[www.json::pp.org]/ 358http://localhost/ 359http://foo.example.com/ 360http://abc.com/a.js 361http://whatever/man/1/crontab 362http://abc.com/c.js 363http://whatever/Foo%3A%3ABar 364http://abc.com/b.js 365http://remote.server.com/jquery.css 366http://some.other.com/page.html 367https://text.com/1/2 368https://text.com/1/2 369http://link.included.here?o=1&p=2 370http://link.included.here?o=1&p=2 371http://link.included.here?o=1&p=2 372http://link.included.here/ 373http://foo/x//y/script.cgi/a//b 374http://xxx/script.cgi/http://foo 375http://foo/./x//z/script.cgi/a/../b//c 376http://somewhere.else/in/movie/land 377http://somewhere.else/finished.html 378http://webproxy:3128/ 379http://www/ 380 381# these are used to generate or match URLs 382http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist 383http://www.cpantesters.org/show/%s.yaml 384ftp://(.*?)/(.*)/(.* 385ftp://(.*?)/(.*)/(.* 386ftp://(.*?)/(.*)/(.* 387ftp://ftp.foo.bar/ 388http://$host/ 389http://wwwe%3C46/ 390ftp:/ 391http://$addr/mark?commit=$ 392http://search.cpan.org/~ 393http:/ 394ftp:%5Cn$url 395http://www.ietf.org/rfc/rfc$2.txt 396http://search.cpan.org/~ 397ftp:%5Cn$url 398 399# weird redirects that LWP doesn't like 400http://www.theperlreview.com/community_calendar 401http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL 402http://sunsolve.sun.com 403 404# broken webserver that doesn't like HEAD requests 405http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view 406http://www.w3.org/TR/html4/loose.dtd 407 408# these have been reported upstream to CPAN authors 409http://www.gnu.org/manual/tar/html_node/tar_139.html 410http://www.w3.org/pub/WWW/TR/Wd-css-1.html 411http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP 412http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp 413http://search.cpan.org/search?query=Module::Build::Convert 414http://www.refcnt.org/papers/module-build-convert 415http://csrc.nist.gov/cryptval/shs.html 416http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp 417http://www.debian.or.jp/~kubota/unicode-symbols.html.en 418http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html 419http://www.debian.or.jp/~kubota/unicode-symbols.html.en 420http://rfc.net/rfc2781.html 421http://www.icu-project.org/charset/ 422http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html 423http://www.rfc-editor.org/ 424http://www.rfc.net/ 425http://www.oreilly.com/people/authors/lunde/cjk_inf.html 426http://www.oreilly.com/catalog/cjkvinfo/ 427http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz 428http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz 429http://www.egt.ie/standards/iso3166/iso3166-1-en.html 430http://www.bsi-global.com/iso4217currency 431http://www.plover.com/~mjd/perl/Memoize/ 432http://www.plover.com/~mjd/perl/MiniMemoize/ 433http://www.sysadminmag.com/tpj/issues/vol5_5/ 434ftp://ftp.tpc.int/tpc/server/UNIX/ 435http://www.nara.gov/genealogy/ 436http://home.utah-inter.net/kinsearch/Soundex.html 437http://www.nara.gov/genealogy/soundex/soundex.html 438http://rfc.net/rfc3461.html 439ftp://ftp.cs.pdx.edu/pub/elvis/ 440http://www.fh-wedel.de/elvis/ 441http://lists.perl.org/list/perl-mvs.html 442http://www.cpan.org/ports/os2/ 443http://github.com/dagolden/cpan-meta-spec 444http://github.com/dagolden/cpan-meta-spec/issues 445http://www.opensource.org/licenses/lgpl-license.phpt 446http://reality.sgi.com/ariel 447http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 449http://www.nsrl.nist.gov/testdata/ 450http://public.activestate.com/cgi-bin/perlbrowse/p/31194 451http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 452http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 453http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html 454http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118 455http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut 456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf 457http://github.com/schwern/extutils-makemaker 458https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite 459http://www.json.org/JSON::PP_checker/ 460ftp://ftp.kiae.su/pub/unix/fido/ 461http://www.gallistel.net/nparker/weather/code/ 462http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html 463ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/ 464http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html 465http://public.activestate.com/cgi-bin/perlbrowse/p/33567 466http://public.activestate.com/cgi-bin/perlbrowse/p/33566 467http://www.dsmit.com/cons/ 468http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide 469 470__END__ 471 472=head1 NAME 473 474checkURL.pl - Check that all the URLs in the Perl source are valid 475 476=head1 DESCRIPTION 477 478This program checks that all the URLs in the Perl source are valid. It 479checks HTTP and FTP links in parallel and contains a list of known 480bad example links in its source. It takes 4 minutes to run on my 481machine. The results are written to 'uris.txt' and list the filename, 482the URL and the error: 483 484 * ext/Locale-Maketext/lib/Locale/Maketext.pod 485 http://sunsite.dk/RFC/rfc/rfc2277.html 486 404 Not Found 487 ... 488 489It should be run every so often and links fixed and upstream authors 490notified. 491 492Note that the web is unstable and some websites are temporarily down. 493