xref: /openbsd/gnu/usr.bin/perl/Porting/checkURL.pl (revision 8529ddd3)
1#!perl
2use strict;
3use warnings;
4use autodie;
5use feature qw(say);
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
10use List::Util qw(sum);
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20    chomp $line;
21    next if $line =~ /^#/;
22    next unless $line;
23    $ignore{$line} = 1;
24}
25
26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27$ua->timeout(58);
28$ua->env_proxy;
29
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32    unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37    {   count => $total_bytes,
38        name  => 'Extracting URIs',
39    }
40);
41
42my %uris;
43foreach my $filename (@filenames) {
44    next if $filename =~ /uris\.txt/;
45    next if $filename =~ /check_uris/;
46    next if $filename =~ /\.patch$/;
47    next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48    next if $filename =~ /checkURL\.pl$/;
49    my $contents = File::Slurp::read_file($filename);
50    my @uris     = URI::Find::Simple::list_uris($contents);
51    foreach my $uri (@uris) {
52        next unless $uri =~ /^(http|ftp)/;
53        next if $ignore{$uri};
54
55        # no need to hit rt.perl.org
56        next
57            if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59        # no need to hit rt.cpan.org
60        next
61            if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63        # no need to hit google groups (weird redirect LWP does not like)
64        next
65            if $uri =~ m{^http://groups\.google\.com/};
66
67        push @{ $uris{$uri} }, $filename;
68    }
69    $extract_progress += -s $filename;
70}
71
72my $bw = Parallel::Fork::BossWorkerAsync->new(
73    work_handler   => \&work_alarmed,
74    global_timeout => 120,
75    worker_count   => 20,
76);
77
78foreach my $uri ( keys %uris ) {
79    my @filenames = @{ $uris{$uri} };
80    $bw->add_work( { uri => $uri, filenames => \@filenames } );
81}
82
83undef $extract_progress;
84
85my $fetch_progress = Term::ProgressBar::Simple->new(
86    {   count => scalar( keys %uris ),
87        name  => 'Fetching URIs',
88    }
89);
90
91my %filenames;
92while ( $bw->pending() ) {
93    my $response   = $bw->get_result();
94    my $uri        = $response->{uri};
95    my @filenames  = @{ $response->{filenames} };
96    my $is_success = $response->{is_success};
97    my $message    = $response->{message};
98
99    unless ($is_success) {
100        foreach my $filename (@filenames) {
101            push @{ $filenames{$filename} },
102                { uri => $uri, message => $message };
103        }
104    }
105    $fetch_progress++;
106}
107$bw->shut_down();
108
109my $fh = IO::File->new('> uris.txt');
110foreach my $filename ( sort keys %filenames ) {
111    $fh->say("* $filename");
112    my @bits = @{ $filenames{$filename} };
113    foreach my $bit (@bits) {
114        my $uri     = $bit->{uri};
115        my $message = $bit->{message};
116        $fh->say("  $uri");
117        $fh->say("    $message");
118    }
119}
120$fh->close;
121
122say 'Finished, see uris.txt';
123
124sub work_alarmed {
125    my $conf = shift;
126    eval {
127        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
128        alarm 60;
129        $conf = work($conf);
130        alarm 0;
131    };
132    if ($@) {
133        $conf->{is_success} = 0;
134        $conf->{message}    = 'Timed out';
135
136    }
137    return $conf;
138}
139
140sub work {
141    my $conf      = shift;
142    my $uri       = $conf->{uri};
143    my @filenames = @{ $conf->{filenames} };
144
145    if ( $uri =~ /^http/ ) {
146        my $uri_without_fragment = URI->new($uri);
147        my $fragment             = $uri_without_fragment->fragment(undef);
148        my $response             = $ua->head($uri_without_fragment);
149
150        $conf->{is_success} = $response->is_success;
151        $conf->{message}    = $response->status_line;
152        return $conf;
153    } else {
154
155        my $uri_object = URI->new($uri);
156        my $host       = $uri_object->host;
157        my $path       = $uri_object->path;
158        my ( $volume, $directories, $filename )
159            = File::Spec->splitpath($path);
160
161        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162        unless ($ftp) {
163            $conf->{is_succcess} = 0;
164            $conf->{message}     = "Can not connect to $host: $@";
165            return $conf;
166        }
167
168        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169        unless ($can_login) {
170            $conf->{is_success} = 0;
171            $conf->{message} = "Can not login ", $ftp->message;
172            return $conf;
173        }
174
175        my $can_binary = $ftp->binary();
176        unless ($can_binary) {
177            $conf->{is_success} = 0;
178            $conf->{message} = "Can not binary ", $ftp->message;
179            return $conf;
180        }
181
182        my $can_cwd = $ftp->cwd($directories);
183        unless ($can_cwd) {
184            $conf->{is_success} = 0;
185            $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186            return $conf;
187        }
188
189        if ($filename) {
190            my $can_size = $ftp->size($filename);
191            unless ($can_size) {
192                $conf->{is_success} = 0;
193                $conf->{message}
194                    = "Can not size $filename in $directories",
195                    $ftp->message;
196                return $conf;
197            }
198        } else {
199            my ($can_dir) = $ftp->dir;
200            unless ($can_dir) {
201                my ($can_ls) = $ftp->ls;
202                unless ($can_ls) {
203                    $conf->{is_success} = 0;
204                    $conf->{message}
205                        = "Can not dir or ls in $directories ",
206                        $ftp->message;
207                    return $conf;
208                }
209            }
210        }
211
212        $conf->{is_success} = 1;
213        return $conf;
214    }
215}
216
217__DATA__
218# these are fine but give errors
219ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
222# these are missing, sigh
223ftp://ftp.sco.com/SLS/ptf7051e.Z
224http://perlmonks.thepen.com/42898.html
225http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226http://public.activestate.com/cgi-bin/perlbrowse
227http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229http://my.smithmicro.com/mac/stuffit/
230http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
232
233# these are URI extraction bugs
234http://www.perl.org/E
235http://en.wikipedia.org/wiki/SREC_(file_format
236http://somewhere.else',-type=/
237ftp:passive-mode
238ftp:
239http:[-
240http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
241http://www.xray.mpe.mpg.de/mailing-lists/perl5-
242http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
243http://perl.come/
244http://www.perl.come/
245
246# these are used as an example
247http://example.com/
248http://something.here/
249http://users.perl5.git.perl.org/~yourlogin/
250http://github.com/USERNAME/perl/tree/orange
251http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
252http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
253http://somewhere.else$/
254http://somewhere.else$/
255http://somewhere.else/bin/foo&bar',-Type=
256http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
257http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
258http://www.perl.org/test.cgi
259http://cpan2.local/
260http://search.cpan.org/perldoc?
261http://cpan1.local/
262http://cpan.dev.local/CPAN
263http:///
264ftp://
265ftp://myurl/
266ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
267http://www14.software.ibm.com/webapp/download/downloadaz.jsp
268http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
269http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
270http://localhost/tmp/index.txt
271http://example.com/foo/bar.html
272http://example.com/Text-Bastardize-1.06.tar.gz
273ftp://example.com/sources/packages.txt
274http://example.com/sources/packages.txt
275http://example.com/sources
276ftp://example.com/sources
277http://some.where.com/dir/file.txt
278http://some.where.com/dir/a.txt
279http://foo.com/X.tgz
280ftp://foo.com/X.tgz
281http://foo/
282http://www.foo.com:8000/
283http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
284http://decoded/mirror/path
285http://a/b/c/d/e/f/g/h/i/j
286http://foo/bar.gz
287ftp://ftp.perl.org
288http://purl.org/rss/1.0/modules/taxonomy/
289ftp://ftp.sun.ac.za/CPAN/CPAN/
290ftp://ftp.cpan.org/pub/mirror/index.txt
291ftp://cpan.org/pub/mirror/index.txt
292http://example.com/~eh/
293http://plagger.org/.../rss
294http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
295http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
296http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
297http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
298http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
299http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
300http://module-build.sourceforge.net/META-spec-new.html
301http://module-build.sourceforge.net/META-spec-v1.4.html
302http://www.cs.vu.nl/~tmgil/vi.html
303http://perlcomposer.sourceforge.net/vperl.html
304http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
305http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
306http://world.std.com/~aep/ptkdb/
307http://www.castlelink.co.uk/object_system/
308http://www.fh-wedel.de/elvis/
309ftp://ftp.blarg.net/users/amol/zsh/
310ftp://ftp.funet.fi/pub/languages/perl/CPAN
311http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
312http://users.perl5.git.perl.org/~USERNAME
313http://foo/x//y/script.cgi/a//b
314http://xxx/script.cgi/http://foo
315http://foo/./x//z/script.cgi/a/../b//c
316http://somewhere.else/in/movie/land
317http://somewhere.else/finished.html
318http://somewhere.else/bin/foo&bar$
319http://somewhere.else/
320http://proxy:8484/
321http://proxy/
322http://myrepo.example.com/
323http://remote/source
324https://example.com/
325http://example.com:1024/
326http:///path?foo=bar
327http://[::]:1024/
328http://([/
329http://example.com:9000/index.html
330http://proxy.example.com:8080/
331http:///index.html
332http://[www.json::pp.org]/
333http://localhost/
334http://foo.example.com/
335http://abc.com/a.js
336http://whatever/man/1/crontab
337http://abc.com/c.js
338http://whatever/Foo%3A%3ABar
339http://abc.com/b.js
340http://remote.server.com/jquery.css
341http://some.other.com/page.html
342https://text.com/1/2
343https://text.com/1/2
344http://link.included.here?o=1&p=2
345http://link.included.here?o=1&amp;p=2
346http://link.included.here?o=1&amp;p=2
347http://link.included.here/
348http://foo/x//y/script.cgi/a//b
349http://xxx/script.cgi/http://foo
350http://foo/./x//z/script.cgi/a/../b//c
351http://somewhere.else/in/movie/land
352http://somewhere.else/finished.html
353http://webproxy:3128/
354http://www/
355
356# these are used to generate or match URLs
357http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
358http://www.cpantesters.org/show/%s.yaml
359ftp://(.*?)/(.*)/(.*
360ftp://(.*?)/(.*)/(.*
361ftp://(.*?)/(.*)/(.*
362ftp://ftp.foo.bar/
363http://$host/
364http://wwwe%3C46/
365ftp:/
366http://$addr/mark?commit=$
367http://search.cpan.org/~
368http:/
369ftp:%5Cn$url
370http://www.ietf.org/rfc/rfc$2.txt
371http://search.cpan.org/~
372ftp:%5Cn$url
373
374# weird redirects that LWP doesn't like
375http://www.theperlreview.com/community_calendar
376http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
377http://sunsolve.sun.com
378
379# broken webserver that doesn't like HEAD requests
380http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
381http://www.w3.org/TR/html4/loose.dtd
382
383# these have been reported upstream to CPAN authors
384http://www.gnu.org/manual/tar/html_node/tar_139.html
385http://www.w3.org/pub/WWW/TR/Wd-css-1.html
386http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
387http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
388http://search.cpan.org/search?query=Module::Build::Convert
389http://www.refcnt.org/papers/module-build-convert
390http://csrc.nist.gov/cryptval/shs.html
391http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
392http://www.debian.or.jp/~kubota/unicode-symbols.html.en
393http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
394http://www.debian.or.jp/~kubota/unicode-symbols.html.en
395http://rfc.net/rfc2781.html
396http://www.icu-project.org/charset/
397http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
398http://www.rfc-editor.org/
399http://www.rfc.net/
400http://www.oreilly.com/people/authors/lunde/cjk_inf.html
401http://www.oreilly.com/catalog/cjkvinfo/
402http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
403http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
404http://www.egt.ie/standards/iso3166/iso3166-1-en.html
405http://www.bsi-global.com/iso4217currency
406http://www.plover.com/~mjd/perl/Memoize/
407http://www.plover.com/~mjd/perl/MiniMemoize/
408http://www.sysadminmag.com/tpj/issues/vol5_5/
409ftp://ftp.tpc.int/tpc/server/UNIX/
410http://www.nara.gov/genealogy/
411http://home.utah-inter.net/kinsearch/Soundex.html
412http://www.nara.gov/genealogy/soundex/soundex.html
413http://rfc.net/rfc3461.html
414ftp://ftp.cs.pdx.edu/pub/elvis/
415http://www.fh-wedel.de/elvis/
416http://lists.perl.org/list/perl-mvs.html
417http://www.cpan.org/ports/os2/
418http://github.com/dagolden/cpan-meta-spec
419http://github.com/dagolden/cpan-meta-spec/issues
420http://www.opensource.org/licenses/lgpl-license.phpt
421http://reality.sgi.com/ariel
422http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
423http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
424http://www.nsrl.nist.gov/testdata/
425http://public.activestate.com/cgi-bin/perlbrowse/p/31194
426http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
427http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
428http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
429http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
430http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
431http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
432http://github.com/schwern/extutils-makemaker
433https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
434http://www.json.org/JSON::PP_checker/
435ftp://ftp.kiae.su/pub/unix/fido/
436http://www.gallistel.net/nparker/weather/code/
437http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
438ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
439http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
440http://public.activestate.com/cgi-bin/perlbrowse/p/33567
441http://public.activestate.com/cgi-bin/perlbrowse/p/33566
442http://www.dsmit.com/cons/
443http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
444
445__END__
446
447=head1 NAME
448
449checkURL.pl - Check that all the URLs in the Perl source are valid
450
451=head1 DESCRIPTION
452
453This program checks that all the URLs in the Perl source are valid. It
454checks HTTP and FTP links in parallel and contains a list of known
455bad example links in its source. It takes 4 minutes to run on my
456machine. The results are written to 'uris.txt' and list the filename,
457the URL and the error:
458
459  * ext/Locale-Maketext/lib/Locale/Maketext.pod
460    http://sunsite.dk/RFC/rfc/rfc2277.html
461      404 Not Found
462  ...
463
464It should be run every so often and links fixed and upstream authors
465notified.
466
467Note that the web is unstable and some websites are temporarily down.
468