xref: /openbsd/gnu/usr.bin/perl/Porting/checkURL.pl (revision eac174f2)
1#!perl
2use strict;
3use warnings;
4use autodie;
5use feature qw(say);
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
10use List::Util qw(sum);
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20    chomp $line;
21    next if $line =~ /^#/;
22    next unless $line;
23    $ignore{$line} = 1;
24}
25
26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27$ua->timeout(58);
28$ua->env_proxy;
29
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32    unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37    {   count => $total_bytes,
38        name  => 'Extracting URIs',
39    }
40);
41
42my %uris;
43foreach my $filename (@filenames) {
44    next if $filename =~ /uris\.txt/;
45    next if $filename =~ /check_uris/;
46    next if $filename =~ /\.patch$/;
47    next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48    next if $filename =~ /checkURL\.pl$/;
49    my $contents = File::Slurp::read_file($filename);
50    my @uris     = URI::Find::Simple::list_uris($contents);
51    foreach my $uri (@uris) {
52        next unless $uri =~ /^(http|ftp)/;
53        next if $ignore{$uri};
54
55        # no need to hit rt.perl.org
56        next
57            if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59        # no need to hit github
60        next
61            if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
63        # no need to hit rt.cpan.org
64        next
65            if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67        # no need to hit google groups (weird redirect LWP does not like)
68        next
69            if $uri =~ m{^http://groups\.google\.com/};
70
71        push @{ $uris{$uri} }, $filename;
72    }
73    $extract_progress += -s $filename;
74}
75
76my $bw = Parallel::Fork::BossWorkerAsync->new(
77    work_handler   => \&work_alarmed,
78    global_timeout => 120,
79    worker_count   => 20,
80);
81
82foreach my $uri ( keys %uris ) {
83    my @filenames = @{ $uris{$uri} };
84    $bw->add_work( { uri => $uri, filenames => \@filenames } );
85}
86
87undef $extract_progress;
88
89my $fetch_progress = Term::ProgressBar::Simple->new(
90    {   count => scalar( keys %uris ),
91        name  => 'Fetching URIs',
92    }
93);
94
95my %filenames;
96while ( $bw->pending() ) {
97    my $response   = $bw->get_result();
98    my $uri        = $response->{uri};
99    my @filenames  = @{ $response->{filenames} };
100    my $is_success = $response->{is_success};
101    my $message    = $response->{message};
102
103    unless ($is_success) {
104        foreach my $filename (@filenames) {
105            push @{ $filenames{$filename} },
106                { uri => $uri, message => $message };
107        }
108    }
109    $fetch_progress++;
110}
111$bw->shut_down();
112
113my $fh = IO::File->new('> uris.txt');
114foreach my $filename ( sort keys %filenames ) {
115    $fh->say("* $filename");
116    my @bits = @{ $filenames{$filename} };
117    foreach my $bit (@bits) {
118        my $uri     = $bit->{uri};
119        my $message = $bit->{message};
120        $fh->say("  $uri");
121        $fh->say("    $message");
122    }
123}
124$fh->close;
125
126say 'Finished, see uris.txt';
127
128sub work_alarmed {
129    my $conf = shift;
130    eval {
131        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
132        alarm 60;
133        $conf = work($conf);
134        alarm 0;
135    };
136    if ($@) {
137        $conf->{is_success} = 0;
138        $conf->{message}    = 'Timed out';
139
140    }
141    return $conf;
142}
143
144sub work {
145    my $conf      = shift;
146    my $uri       = $conf->{uri};
147    my @filenames = @{ $conf->{filenames} };
148
149    if ( $uri =~ /^http/ ) {
150        my $uri_without_fragment = URI->new($uri);
151        my $fragment             = $uri_without_fragment->fragment(undef);
152        my $response             = $ua->head($uri_without_fragment);
153
154        $conf->{is_success} = $response->is_success;
155        $conf->{message}    = $response->status_line;
156        return $conf;
157    } else {
158
159        my $uri_object = URI->new($uri);
160        my $host       = $uri_object->host;
161        my $path       = $uri_object->path;
162        my ( $volume, $directories, $filename )
163            = File::Spec->splitpath($path);
164
165        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166        unless ($ftp) {
167            $conf->{is_succcess} = 0;
168            $conf->{message}     = "Can not connect to $host: $@";
169            return $conf;
170        }
171
172        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173        unless ($can_login) {
174            $conf->{is_success} = 0;
175            $conf->{message} = "Can not login ", $ftp->message;
176            return $conf;
177        }
178
179        my $can_binary = $ftp->binary();
180        unless ($can_binary) {
181            $conf->{is_success} = 0;
182            $conf->{message} = "Can not binary ", $ftp->message;
183            return $conf;
184        }
185
186        my $can_cwd = $ftp->cwd($directories);
187        unless ($can_cwd) {
188            $conf->{is_success} = 0;
189            $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190            return $conf;
191        }
192
193        if ($filename) {
194            my $can_size = $ftp->size($filename);
195            unless ($can_size) {
196                $conf->{is_success} = 0;
197                $conf->{message}
198                    = "Can not size $filename in $directories",
199                    $ftp->message;
200                return $conf;
201            }
202        } else {
203            my ($can_dir) = $ftp->dir;
204            unless ($can_dir) {
205                my ($can_ls) = $ftp->ls;
206                unless ($can_ls) {
207                    $conf->{is_success} = 0;
208                    $conf->{message}
209                        = "Can not dir or ls in $directories ",
210                        $ftp->message;
211                    return $conf;
212                }
213            }
214        }
215
216        $conf->{is_success} = 1;
217        return $conf;
218    }
219}
220
221__DATA__
222# these are fine but give errors
223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
226# these are missing, sigh
227ftp://ftp.sco.com/SLS/ptf7051e.Z
228http://perlmonks.thepen.com/42898.html
229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230http://public.activestate.com/cgi-bin/perlbrowse
231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233http://my.smithmicro.com/mac/stuffit/
234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236http://www.openzaurus.org/
237http://Casbah.org/
238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239http://www.pvhp.com/~pvhp/
240http://www.pvhp.com/%7Epvhp/
241http://www.pvhp.com/%7epvhp/
242http://www.leo.org
243http://www.madgoat.com
244http://www.mks.com/s390/gnu/
245http://www.tpj.com/
246http://safaribooksonline.com/
247http://use.perl.org/~autrijus/journal/25768
248http://www.s390.ibm.com/products/oe/bpxqp1.html
249http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
256http://www.w3.org/Security/Faq/
257
258# these are URI extraction bugs
259http://www.perl.org/E
260http://en.wikipedia.org/wiki/SREC_(file_format
261http://somewhere.else',-type=/
262ftp:passive-mode
263ftp:
264http:[-
265http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
266http://www.xray.mpe.mpg.de/mailing-lists/perl5-
267http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
268http://perl.come/
269http://www.perl.come/
270
271# these are used as an example
272http://example.com/
273http://something.here/
274http://users.perl5.git.perl.org/~yourlogin/
275http://github.com/USERNAME/perl/tree/orange
276http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
278http://somewhere.else$/
279http://somewhere.else$/
280http://somewhere.else/bin/foo&bar',-Type=
281http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
283http://www.perl.org/test.cgi
284http://cpan2.local/
285http://search.cpan.org/perldoc?
286http://cpan1.local/
287http://cpan.dev.local/CPAN
288http:///
289ftp://
290ftp://myurl/
291ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
292http://www14.software.ibm.com/webapp/download/downloadaz.jsp
293http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
294http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
295http://localhost/tmp/index.txt
296http://example.com/foo/bar.html
297http://example.com/Text-Bastardize-1.06.tar.gz
298ftp://example.com/sources/packages.txt
299http://example.com/sources/packages.txt
300http://example.com/sources
301ftp://example.com/sources
302http://some.where.com/dir/file.txt
303http://some.where.com/dir/a.txt
304http://foo.com/X.tgz
305ftp://foo.com/X.tgz
306http://foo/
307http://www.foo.com:8000/
308http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
309http://decoded/mirror/path
310http://a/b/c/d/e/f/g/h/i/j
311http://foo/bar.gz
312ftp://ftp.perl.org
313http://purl.org/rss/1.0/modules/taxonomy/
314ftp://ftp.sun.ac.za/CPAN/CPAN/
315ftp://ftp.cpan.org/pub/mirror/index.txt
316ftp://cpan.org/pub/mirror/index.txt
317http://example.com/~eh/
318http://plagger.org/.../rss
319http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
320http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
321http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
322http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
323http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
324http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
325http://module-build.sourceforge.net/META-spec-new.html
326http://module-build.sourceforge.net/META-spec-v1.4.html
327http://www.cs.vu.nl/~tmgil/vi.html
328http://perlcomposer.sourceforge.net/vperl.html
329http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
330http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
331http://world.std.com/~aep/ptkdb/
332http://www.castlelink.co.uk/object_system/
333http://www.fh-wedel.de/elvis/
334ftp://ftp.blarg.net/users/amol/zsh/
335ftp://ftp.funet.fi/pub/languages/perl/CPAN
336http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
337http://users.perl5.git.perl.org/~USERNAME
338http://foo/x//y/script.cgi/a//b
339http://xxx/script.cgi/http://foo
340http://foo/./x//z/script.cgi/a/../b//c
341http://somewhere.else/in/movie/land
342http://somewhere.else/finished.html
343http://somewhere.else/bin/foo&bar$
344http://somewhere.else/
345http://proxy:8484/
346http://proxy/
347http://myrepo.example.com/
348http://remote/source
349https://example.com/
350http://example.com:1024/
351http:///path?foo=bar
352http://[::]:1024/
353http://([/
354http://example.com:9000/index.html
355http://proxy.example.com:8080/
356http:///index.html
357http://[www.json::pp.org]/
358http://localhost/
359http://foo.example.com/
360http://abc.com/a.js
361http://whatever/man/1/crontab
362http://abc.com/c.js
363http://whatever/Foo%3A%3ABar
364http://abc.com/b.js
365http://remote.server.com/jquery.css
366http://some.other.com/page.html
367https://text.com/1/2
368https://text.com/1/2
369http://link.included.here?o=1&p=2
370http://link.included.here?o=1&amp;p=2
371http://link.included.here?o=1&amp;p=2
372http://link.included.here/
373http://foo/x//y/script.cgi/a//b
374http://xxx/script.cgi/http://foo
375http://foo/./x//z/script.cgi/a/../b//c
376http://somewhere.else/in/movie/land
377http://somewhere.else/finished.html
378http://webproxy:3128/
379http://www/
380
381# these are used to generate or match URLs
382http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
383http://www.cpantesters.org/show/%s.yaml
384ftp://(.*?)/(.*)/(.*
385ftp://(.*?)/(.*)/(.*
386ftp://(.*?)/(.*)/(.*
387ftp://ftp.foo.bar/
388http://$host/
389http://wwwe%3C46/
390ftp:/
391http://$addr/mark?commit=$
392http://search.cpan.org/~
393http:/
394ftp:%5Cn$url
395http://www.ietf.org/rfc/rfc$2.txt
396http://search.cpan.org/~
397ftp:%5Cn$url
398
399# weird redirects that LWP doesn't like
400http://www.theperlreview.com/community_calendar
401http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
402http://sunsolve.sun.com
403
404# broken webserver that doesn't like HEAD requests
405http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
406http://www.w3.org/TR/html4/loose.dtd
407
408# these have been reported upstream to CPAN authors
409http://www.gnu.org/manual/tar/html_node/tar_139.html
410http://www.w3.org/pub/WWW/TR/Wd-css-1.html
411http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
412http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
413http://search.cpan.org/search?query=Module::Build::Convert
414http://www.refcnt.org/papers/module-build-convert
415http://csrc.nist.gov/cryptval/shs.html
416http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
417http://www.debian.or.jp/~kubota/unicode-symbols.html.en
418http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
419http://www.debian.or.jp/~kubota/unicode-symbols.html.en
420http://rfc.net/rfc2781.html
421http://www.icu-project.org/charset/
422http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
423http://www.rfc-editor.org/
424http://www.rfc.net/
425http://www.oreilly.com/people/authors/lunde/cjk_inf.html
426http://www.oreilly.com/catalog/cjkvinfo/
427http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
428http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
429http://www.egt.ie/standards/iso3166/iso3166-1-en.html
430http://www.bsi-global.com/iso4217currency
431http://www.plover.com/~mjd/perl/Memoize/
432http://www.plover.com/~mjd/perl/MiniMemoize/
433http://www.sysadminmag.com/tpj/issues/vol5_5/
434ftp://ftp.tpc.int/tpc/server/UNIX/
435http://www.nara.gov/genealogy/
436http://home.utah-inter.net/kinsearch/Soundex.html
437http://www.nara.gov/genealogy/soundex/soundex.html
438http://rfc.net/rfc3461.html
439ftp://ftp.cs.pdx.edu/pub/elvis/
440http://www.fh-wedel.de/elvis/
441http://lists.perl.org/list/perl-mvs.html
442http://www.cpan.org/ports/os2/
443http://github.com/dagolden/cpan-meta-spec
444http://github.com/dagolden/cpan-meta-spec/issues
445http://www.opensource.org/licenses/lgpl-license.phpt
446http://reality.sgi.com/ariel
447http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449http://www.nsrl.nist.gov/testdata/
450http://public.activestate.com/cgi-bin/perlbrowse/p/31194
451http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
452http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
453http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
454http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
455http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
457http://github.com/schwern/extutils-makemaker
458https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
459http://www.json.org/JSON::PP_checker/
460ftp://ftp.kiae.su/pub/unix/fido/
461http://www.gallistel.net/nparker/weather/code/
462http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
463ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
464http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
465http://public.activestate.com/cgi-bin/perlbrowse/p/33567
466http://public.activestate.com/cgi-bin/perlbrowse/p/33566
467http://www.dsmit.com/cons/
468http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
469
470__END__
471
472=head1 NAME
473
474checkURL.pl - Check that all the URLs in the Perl source are valid
475
476=head1 DESCRIPTION
477
478This program checks that all the URLs in the Perl source are valid. It
479checks HTTP and FTP links in parallel and contains a list of known
480bad example links in its source. It takes 4 minutes to run on my
481machine. The results are written to 'uris.txt' and list the filename,
482the URL and the error:
483
484  * ext/Locale-Maketext/lib/Locale/Maketext.pod
485    http://sunsite.dk/RFC/rfc/rfc2277.html
486      404 Not Found
487  ...
488
489It should be run every so often and links fixed and upstream authors
490notified.
491
492Note that the web is unstable and some websites are temporarily down.
493