#!/usr/bin/perl -w
#
# checkbot - A perl5 script to check validity of links in www document trees
#
# Hans de Graaff <hans@degraaff.org>, 1994-2005.
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
# With contributions from Bruce Speyer <bruce.speyer@elecomm.com>
#
# This application is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Info-URL: http://degraaff.org/checkbot/
#
# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $
# (Log information can be found at the end of the script)

require 5.004;
use strict;

require LWP;
use File::Basename;

BEGIN {
  eval "use Time::Duration qw(duration)";
  $main::useduration = ($@ ? 0 : 1);
}

# Version information
my $VERSION = '1.80';

=head1 NAME

Checkbot - WWW Link Verifier

=head1 SYNOPSIS

checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>]
         [B<--mailto> email addresses] [B<--noproxy> list of domains]
         [B<--verbose>]
         [B<--url> start URL]
         [B<--match> match string] [B<--exclude> exclude string]
         [B<--proxy> proxy URL] [B<--internal-only>]
         [B<--ignore> ignore string]
         [B<--filter> substitution regular expression]
         [B<--style> style file URL]
         [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
         [B<--interval> seconds] [B<--dontwarn> HTTP response codes]
         [B<--enable-virtual>]
         [B<--language> language code]
         [B<--suppress> suppression file]
         [start URLs]

=head1 DESCRIPTION

Checkbot verifies the links in a specific portion of the World Wide
Web. It creates HTML pages with diagnostics.

Checkbot uses LWP to find URLs on pages and to check them. It supports
the same schemes as LWP does, and finds the same links that
HTML::LinkExtor will find.

Checkbot considers links to be either 'internal' or
'external'. Internal links are links within the web space that needs
to be checked. If an internal link points to a web document this
document is retrieved, and its links are extracted and
processed. External links are only checked to see if they
work. Checkbot checks links as it finds them, so internal and external
links are checked at the same time, even though they are treated
differently.

Options for Checkbot are:
74
75=over 4
76
77=item --cookies
78
79Accept cookies from the server and offer them again at later
80requests. This may be useful for servers that use cookies to handle
81sessions. By default Checkbot does not accept any cookies.
82
83=item --debug
84
85Enable debugging mode. Not really supported anymore, but it will keep
86some files around that otherwise would be deleted.
87
88=item --file <file name>
89
90Use the file I<file name> as the basis for the summary file names. The
91summary page will get the I<file name> given, and the server pages are
92based on the I<file name> without the .html extension. For example,
93setting this option to C<index.html> will create a summary page called
94index.html and server pages called index-server1.html and
95index-server2.html.
96
97The default value for this option is C<checkbot.html>.
98
99=item --help
100
101Shows brief help message on the standard output.
102
103=item --mailto <email address>[,<email address>]
104
105Send mail to the I<email address> when Checkbot is done checking. You
106can give more than one address separated by commas. The notification
107email includes a small summary of the results. As of Checkbot 1.76
108email is only sent if problems have been found during the Checkbot
109run.
110
111=item --noproxy <list of domains>
112
113Do not proxy requests to the given domains. The list of domains must
114be a comma-separated list. For example, so avoid using the proxy for
115the localhost and someserver.xyz, you can use C<--noproxy
116localhost,someserver.xyz>.
117
118=item --verbose
119
120Show verbose output while running. Includes all links checked, results
121from the checks, etc.
122
123
124
125
126
=item --url <start URL>

Set the start URL. Checkbot starts checking at this URL, and then
recursively checks all links found on this page. The start URL takes
precedence over additional URLs specified on the command line.

If no scheme is specified for the URL, the file protocol is assumed.

=item --match <match string>

This option selects which pages Checkbot considers local. If the
I<match string> is contained within the URL, then Checkbot considers
the page local, retrieves it, and will check all the links contained
on it. Otherwise the page is considered external and it is only
checked with a HEAD request.

If no explicit I<match string> is given, the start URLs (See option
C<--url>) will be used as a match string instead. In this case the
last page name, if any, will be trimmed. For example, a start URL like
C<http://some.site/index.html> will result in a default I<match
string> of C<http://some.site/>.

The I<match string> can be a perl regular expression.  For example, to
check the main server page and all HTML pages directly underneath it,
but not the HTML pages in the subdirectories of the server, the
I<match string> would be C<www.someserver.xyz/($|[^/]+\.html)>.

=item --exclude <exclude string>

URLs matching the I<exclude string> are considered to be external,
even if they happen to match the I<match string> (See option
C<--match>). URLs matching the --exclude string are still being
checked and will be reported if problems are found, but they will not
be checked for further links into the site.

The I<exclude string> can be a perl regular expression. For example,
to consider all URLs with a query string external, use C<[=\?]>. This
can be useful when a URL with a query string unlocks the path to a
huge database which would otherwise be checked.

=item --filter <filter string>

This option defines a I<filter string>, which is a perl regular
expression. This filter is run on each URL found, thus rewriting the
URL before it enters the queue to be checked. It can be used to remove
elements from a URL. This option can be useful when symbolic links
point to the same directory, or when a content management system adds
session IDs to URLs.

For example C</old/new/> would replace occurrences of 'old' with 'new'
in each URL.
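
As a fuller sketch, a filter that strips a session ID parameter might
look like this on the command line (the parameter name C<PHPSESSID> is
only an illustration; substitute whatever your server actually uses):

    checkbot --filter '/[?&]PHPSESSID=\w+//' --url http://some.site/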

=item --ignore <ignore string>

URLs matching the I<ignore string> are not checked at all, they are
completely ignored by Checkbot. This can be useful to ignore known
problem links, or to ignore links leading into databases. The I<ignore
string> is matched after the I<filter string> has been applied.

The I<ignore string> can be a perl regular expression.

For example C<www.server.com\/(one|two)> would match all URLs starting
with either www.server.com/one or www.server.com/two.

=item --proxy <proxy URL>

This attribute specifies the URL of a proxy server. Only the HTTP and
FTP requests will be sent to that proxy server.

=item --internal-only

Skip the checking of external links at the end of the Checkbot
run. Only matching links are checked. Note that some redirections may
still cause external links to be checked.

=item --note <note>

The I<note> is included verbatim in the mail message (See option
C<--mailto>). This can be useful to include the URL of the summary HTML page
for easy reference, for instance.

Only meaningful in combination with the C<--mailto> option.

=item --sleep <seconds>

Number of I<seconds> to sleep in between requests. Default is 0
seconds, i.e. do not sleep at all between requests. Setting this
option can be useful to keep the load on the web server down while
running Checkbot. This option can also be set to a fractional number,
i.e. a value of 0.1 will sleep one tenth of a second between requests.

=item --timeout <timeout>

Default timeout for the requests, specified in seconds. The default is
2 minutes.

=item --interval <seconds>

The maximum interval between updates of the results web pages in
seconds. Default is 3 hours (10800 seconds). Checkbot will start the
interval at one minute, and gradually extend it towards the maximum
interval.

=item --style <URL of style file>

When this option is used, Checkbot embeds this URL as a link to a
style file on each page it writes. This makes it easy to customize the
layout of pages generated by Checkbot.

=item --dontwarn <HTTP response codes regular expression>

Do not include warnings on the result pages for those HTTP response
codes which match the regular expression. For instance, --dontwarn
"(301|404)" would not include 301 and 404 response codes.

Checkbot uses the response codes generated by the server, even if a
response code is not defined in RFC 2616 (HTTP/1.1). In addition to
the normal HTTP response codes, Checkbot defines a few response codes
of its own for situations which are not technically a problem, but
which cause problems in many cases anyway. These codes are listed
below, with a usage sketch after the list:

  901 Host name expected but not found
      In this case the URL scheme expects a host name, but none was
      found in the URL. This usually indicates a mistake in the URL.
      An exception is that this check is not applied to news: URLs.

  902 Unqualified host name found
      In this case the host name does not contain the domain part.
      This usually means that the pages work fine when viewed within
      the original domain, but not when viewed from outside it.

  903 Double slash in URL path
      The URL has a double slash in it. This is legal, but some web
      servers cannot handle it very well and may cause Checkbot to
      run away. See also the comments below.

  904 Unknown scheme in URL
      The URL starts with a scheme that Checkbot does not know
      about. This is often caused by mistyping the scheme of the URL,
      but the scheme can also be a legal one. In that case please let
      me know so that it can be added to Checkbot.
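
For example, to silence warnings about unqualified host names (902)
and double slashes (903) in addition to permanent redirects, one could
use:

    checkbot --dontwarn "(301|902|903)" --url http://some.site/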

=item --enable-virtual

This option enables dealing with virtual servers. Checkbot then
assumes that all hostnames for internal servers are unique, even
though their IP addresses may be the same. Normally Checkbot uses the
IP address to distinguish servers. This has the advantage that if a
server has two names (e.g. www and bamboozle) its pages only get
checked once. When you want to check multiple virtual servers this
causes problems, which this feature works around by using the hostname
to distinguish the server.

=item --language

The argument for this option is a two-letter language code. Checkbot
will use language negotiation to request files in that language. The
default is to request English (language code 'en').

=item --suppress

The argument for this option is a file which contains combinations of
error codes and URLs for which to suppress warnings. This can be used
to avoid reporting of known and unfixable URL errors or warnings.

The format of the suppression file is a simple whitespace delimited
format, first listing the error code followed by the URL. Each error
code and URL combination is listed on a new line. Comments can be
added to the file by starting the line with a C<#> character.

  # 301 Moved Permanently
  301   http://www.w3.org/P3P

  # 403 Forbidden
  403   http://www.herring.com/

For further flexibility a regular expression can be used instead of a
normal URL. The regular expression must be enclosed with forward
slashes. For example, to suppress all 403 errors on Wikipedia:

  403   /http:\/\/wikipedia.org\/.*/
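
A suppression file like the one above, saved for example as
suppressions.txt (the name is arbitrary), would then be passed to
Checkbot as:

  checkbot --suppress suppressions.txt --url http://some.site/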

=back

Deprecated options which will disappear in a future release:

=over

=item --allow-simple-hosts (deprecated)

This option turns off warnings about URLs which contain unqualified
host names. This is useful for intranet sites which often use just a
simple host name or even C<localhost> in their links.

Use of this option is deprecated. Please use the --dontwarn mechanism
for error 902 instead.

=back

=head1 HINTS AND TIPS

=over

=item Problems with checking FTP links

Some users may experience consistent problems with checking FTP
links. In these cases it may be useful to instruct Net::FTP to use
passive FTP mode to check files. This can be done by setting the
environment variable FTP_PASSIVE to 1. For example, using the bash
shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation
for more details.

=item Run-away Checkbot

In some cases Checkbot can take virtually forever to finish. There are
two common causes for this problem.

First, there might be a database application as part of the web site
which generates a new page based on links on another page. Since
Checkbot tries to travel through all links this will create an
infinite number of pages. This kind of run-away effect is usually
predictable. It can be avoided by using the --exclude option.

Second, a server configuration problem can cause a loop in generating
URLs for pages that really do not exist. This will result in URLs of
the form http://some.server/images/images/images/logo.png, with ever
more 'images' included. Checkbot cannot check for this because the
server should have indicated that the requested pages do not
exist. There is no easy way to solve this other than fixing the
offending web server or the broken links.

=item Problems with https:// links

The error message

  Can't locate object method "new" via package "LWP::Protocol::https::Socket"

usually means that the current installation of LWP does not support
checking of SSL links (i.e. links starting with https://). This
problem can be solved by installing the Crypt::SSLeay module.
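
Crypt::SSLeay is available from CPAN, and can usually be installed
with, for example:

  perl -MCPAN -e 'install Crypt::SSLeay'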

=back

=head1 EXAMPLES

The simplest use of Checkbot is to check a set of pages on a
server. To check my checkbot pages I would use:

    checkbot http://degraaff.org/checkbot/

Checkbot runs can take some time so Checkbot can send a notification
mail when the run is done:

    checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/

It is possible to check a set of local files without using a web
server. This only works for static files but may be useful in some
cases.

    checkbot file:///var/www/documents/
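
Options can be combined. For example, to check the same site while
treating every URL with a query string as external, so that such URLs
are still checked but not crawled for further links:

    checkbot --exclude '[=\?]' --url http://degraaff.org/checkbot/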

=head1 PREREQUISITES

This script uses the C<LWP> modules.

=head1 COREQUISITES

This script can send mail when C<Mail::Send> is present.

=head1 AUTHOR

Hans de Graaff <hans@degraaff.org>

=pod OSNAMES

any

=cut

# Declare some global variables, avoids ugly use of main:: all around
my %checkbot_errors = ('901' => 'Host name expected but not found',
                       '902' => 'Unqualified host name in URL',
                       '903' => 'Double slash in URL path',
                       '904' => 'Unknown scheme in URL',
                      );

my @starturls = ();

# Two hashes to store the response to a URL, and all the parents of the URL
my %url_error = ();
my %url_parent = ();

# Hash for storing the title of a URL for use in reports. TODO: remove
# this and store title as part of queue.
my %url_title = ();

# Hash for suppressions, which are defined as a combination of code and URL
my %suppression = ();

# Hash to store statistics on link checking
my %stats = ('todo' => 0,
             'link' => 0,
             'problem' => 0 );

# Options hash (to be filled by GetOptions)
my %options = ();

# Keep track of start time so that we can use it in reports
my $start_time = time();

# If on a Mac we should ask for the arguments through some MacPerl stuff
if ($^O eq 'MacOS') {
  $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')";
  push(@ARGV, split(' ', $main::mac_answer));
}

# Prepare
check_options();
init_modules();
init_globals();
init_suppression();

# Start actual application
check_links();

# Finish up
create_page(1);
send_mail() if defined $main::opt_mailto and $stats{problem} > 0;

exit 0;

# output prints stuff on stderr if --verbose, and takes care of proper
# indentation
sub output {
  my ($line, $level) = @_;

  return unless $main::opt_verbose;

  chomp $line;

  my $indent = '';

  if (defined $level) {
    while ($level-- > 0) {
      $indent .= '    ';
    }
  }

  print STDERR $indent, $line, "\n";
}

### Initialization and setup routines

sub check_options {

  # Get command-line arguments
  use Getopt::Long;
  my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));

  # Handle arguments, some are mandatory, some have defaults
  &print_help if ($main::opt_help
                  || (!$main::opt_url && $#ARGV == -1));
  $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout);
  $main::opt_verbose = 0 unless $main::opt_verbose;
  $main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep);
  $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
  $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
  $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;
  # Set the default language and make sure it is a two letter, lowercase code
  $main::opt_language = 'en' unless defined $main::opt_language;
  $main::opt_language = lc(substr($main::opt_language, 0, 2));
  $main::opt_language =~ tr/a-z//cd;
  if ($main::opt_language !~ /[a-z][a-z]/) {
    warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n";
    $main::opt_language = 'en';
  }
  $main::opt_allow_simple_hosts = 0
    unless $main::opt_allow_simple_hosts;
  output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts;

  # The default for opt_match will be set later, because we might want
  # to muck with opt_url first.

  # Display messages about the options
  output "*** Starting Checkbot $VERSION in verbose mode";
  output 'Will skip checking of external links', 1
    if $main::opt_internal_only;
  output "Allowing unqualified host names", 1
    if $main::opt_allow_simple_hosts;
  output "Not using optional Time::Duration module: not found", 1
    unless $main::useduration;
}

sub init_modules {

  use URI;
  # Prepare the user agent to be used:
  use LWP::UserAgent;
  use LWP::MediaTypes;
  #use LWP::Debug qw(- +debug);
  use HTML::LinkExtor;
  $main::ua = new LWP::UserAgent;
  $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version);
  $main::ua->timeout($main::opt_timeout);
  # Add a proxy to the user agent, if defined
  $main::ua->proxy(['http', 'ftp'], $main::opt_proxy)
    if defined($main::opt_proxy);
  $main::ua->no_proxy(split(',', $main::opt_noproxy))
    if defined $main::opt_noproxy;
  # Add a cookie jar to the UA if requested by the user
  $main::ua->cookie_jar( {} )
    if defined $main::opt_cookies;

  require Mail::Send if defined $main::opt_mailto;

  use HTTP::Status;
}

sub init_globals {
  my $url;

  # Directory and files for output
  if ($main::opt_file) {
    $main::file = $main::opt_file;
    $main::file =~ /(.*)\./;
    $main::server_prefix = $1;
  } else {
    $main::file = "checkbot.html";
    $main::server_prefix = "checkbot";
  }
  $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$";

  $main::cur_queue  = $main::tmpdir . "/queue";
  $main::new_queue  = $main::tmpdir . "/queue-new";

  # Make sure we catch signals so that we can clean up temporary files
  $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal;

  # Set up hashes to be used
  %main::checked = ();
  %main::servers = ();
  %main::servers_get_only = ();

  # Initialize the start URLs. --url takes precedence. Otherwise
  # just process URLs in order as they appear on the command line.
  unshift(@ARGV, $main::opt_url) if $main::opt_url;
  foreach (@ARGV) {
    $url = URI->new($_);
    # If no scheme is defined we will assume file is used, so that
    # it becomes easy to check a single file.
    $url->scheme('file') unless defined $url->scheme;
    $url->host('localhost') if $url->scheme eq 'file';
    if (!defined $url->host) {
      warn "No host specified in URL $url, ignoring it.\n";
      next;
    }
    push(@starturls, $url);
  }
  die "There are no valid starting URLs to begin checking with!\n"
    if scalar(@starturls) == 0;

  # Set the automatic matching expression to a concatenation of the starturls
  if (!defined $main::opt_match) {
    my @matchurls;
    foreach my $url (@starturls) {
      # Remove trailing files from the match, e.g. remove index.html
      # stuff so that we match on the host and/or directory instead,
      # but only if there is a path component in the first place.
      my $matchurl = $url->as_string;
      $matchurl =~ s!/[^/]+$!/! unless $url->path eq '';
      push(@matchurls, quotemeta $matchurl);
    }
    $main::opt_match = '^(' . join('|', @matchurls) . ')';
    output "--match defaults to $main::opt_match";
  }

  # Initialize statistics hash with number of start URLs
  $stats{'todo'} = scalar(@starturls);

  # We write out our status every now and then.
  $main::cp_int = 1;
  $main::cp_last = 0;
}

sub init_suppression {
  return if not defined $main::opt_suppress;

  die "Suppression file \"$main::opt_suppress\" is in fact a directory"
    if -d $main::opt_suppress;

  open(SUPPRESSIONS, $main::opt_suppress)
    or die "Unable to open $main::opt_suppress for reading: $!\n";
  while (my $line = <SUPPRESSIONS>) {
    chomp $line;
    next if $line =~ /^#/ or $line =~ /^\s*$/;

    if ($line !~ /^\s*(\d+)\s+(\S+)/) {
      output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n    $line\n";
    } else {
      output "Suppressed: $1 $2\n" if $main::opt_verbose;
      $suppression{$1}{$2} = $2;
    }
  }
  close SUPPRESSIONS;
}

### Main application code

sub check_links {
  my $line;

  mkdir $main::tmpdir, 0755
    or die "$0: unable to create directory $main::tmpdir: $!\n";

  # Explicitly set the record separator. I had the problem that this
  # was not defined under my perl 5.00502. This should fix that, and
  # not cause problems for older versions of perl.
  $/ = "\n";

  open(CURRENT, ">$main::cur_queue")
    || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n";
  open(QUEUE, ">$main::new_queue")
    || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n";

  # Prepare CURRENT queue with starting URLs
  foreach (@starturls) {
    print CURRENT $_->as_string . "|\n";
  }
  close CURRENT;

  open(CURRENT, $main::cur_queue)
    || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";

  do {
    # Read a line from the queue, and process it
    while (defined ($line = <CURRENT>) ) {
      chomp($line);
      &handle_url($line);
      &check_point();
    }

    # Move queues around, and try again, but only if there are still
    # things to do
    output "*** Moving queues around, " . $stats{'todo'} . " links to do.";
    close CURRENT
      or warn "Error while closing CURRENT filehandle: $!\n";
    close QUEUE;

    # TODO: should check whether these succeed
    unlink($main::cur_queue);
    rename($main::new_queue, $main::cur_queue);

    open(CURRENT, "$main::cur_queue")
      || die "$0: Unable to open $main::cur_queue for reading: $!\n";
    open(QUEUE, ">$main::new_queue")
      || die "$0: Unable to open $main::new_queue for writing: $!\n";

  } while (not -z $main::cur_queue);

  close CURRENT;
  close QUEUE;

  unless (defined($main::opt_debug)) {
    clean_up();
  }
}

sub clean_up {
  unlink $main::cur_queue, $main::new_queue;
  rmdir $main::tmpdir;
  output "Removed temporary directory $main::tmpdir and its contents.\n", 1;
}

sub got_signal {
  my ($signalname) = @_;

  clean_up() unless defined $main::opt_debug;

  print STDERR "Caught SIG$signalname.\n";
  exit 1;
}

# Whether URL is 'internal' or 'external'
sub is_internal ($) {
  my ($url) = @_;

  return ( $url =~ /$main::opt_match/o
           and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o));
}

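# For illustration: with --match 'http://some.site/' and --exclude
# '[=\?]' (example values only), is_internal('http://some.site/docs/')
# is true, while is_internal('http://some.site/db?q=x') is false,
# mirroring the --match/--exclude semantics described in the POD above.
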
sub handle_url {
  my ($line) = @_;
  my ($urlstr, $urlparent) = split(/\|/, $line);

  my $reqtype;
  my $response;
  my $type;

  $stats{'todo'}--;

  # Add this URL to the ones we've seen already, return if it is a
  # duplicate.
  return if add_checked($urlstr);

  $stats{'link'}++;

  # Is this an external URL and we only check internal stuff?
  return if defined $main::opt_internal_only
    and not is_internal($urlstr);

  my $url = URI->new($urlstr);

  # Perhaps this is a URL we are not interested in checking...
  if (not defined($url->scheme)
      or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) {
    # Ignore URLs which we know we can ignore, create error for others
    if (defined $url->scheme
        and $url->scheme =~ /^(news|mailto|javascript|mms)$/o) {
      output "Ignore $url", 1;
    } else {
      add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: "
                . (defined $url->scheme ? $url->scheme : '(none)'));
    }
    return;
  }

  # Guess/determine the type of document we might retrieve from this
  # URL. We do this because we only want to use a full GET for HTML
  # documents. No need to retrieve images, etc.
  if ($url->path =~ /\/$/o || $url->path eq "") {
    $type = 'text/html';
  } else {
    $type = guess_media_type($url->path);
  }
  # application/octet-stream is the fallback of LWP's guess stuff, so
  # if we get this then we ask the server what we got just to be sure.
  if ($type eq 'application/octet-stream') {
    $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language);
    $type = $response->content_type;
  }

  # Determine if this is a URL we should GET fully or partially (using HEAD)
  if ($type =~ /html/o
      && $url->scheme =~ /^(https?|file|ftp|gopher)$/o
      and is_internal($url->as_string)
      && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) {
    $reqtype = 'GET';
  } else {
    $reqtype = 'HEAD';
  }

  # Get the document, unless we already did while determining the type
  $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language)
    unless defined($response) and $reqtype eq 'HEAD';

  # Ok, we got something back from checking, let's see what it is
  if ($response->is_success) {
    select(undef, undef, undef, $main::opt_sleep)
      unless $main::opt_debug || $url->scheme eq 'file';

    # Internal HTML documents need to be given to handle_doc for processing
    if ($reqtype eq 'GET' and is_internal($url->as_string)) {
      handle_doc($response, $urlstr);
    }
  } else {

    # Right, so it wasn't the smashing success we hoped for, so bring
    # the bad news and store the pertinent information for later
    add_error($url, $urlparent, $response->code, $response->message);

    if ($response->is_redirect and is_internal($url->as_string)) {
      if ($response->code == 300) { # multiple choices, but no redirection available
        output 'Multiple choices', 2;
      } else {
        my $baseURI = URI->new($url);
        if (defined $response->header('Location')) {
          my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
          output "Redirected to $redir_url", 2;
          add_to_queue($redir_url, $urlparent);
          $stats{'todo'}++;
        } else {
          output 'Location header missing from redirect response', 2;
        }
      }
    }
  }
  # Done with this URL
}

sub performRequest {
  my ($reqtype, $url, $urlparent, $type, $language) = @_;

  my ($response);

  # A better solution here would be to use GET exclusively. Here is how
  # to do that. We would have to set this max_size thing in
  # check_external, I guess...
  # Set $ua->max_size(1) and then try a normal GET request. However,
  # that doesn't always work as evidenced by an FTP server that just
  # hangs in this case... Needs more testing to see if the timeout
  # catches this.

  # Normally, we would only need to do a HEAD, but given the way LWP
  # handles gopher requests, we need to do a GET on those to get at
  # least a 500 and 501 error. We would need to parse the document
  # returned by LWP to find out if we had problems finding the
  # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org>

  # We also need to do GET instead of HEAD if we know the remote
  # server won't accept it.  The standard way for an HTTP server to
  # indicate this is by returning a 405 ("Method Not Allowed") or 501
  # ("Not Implemented").  Other circumstances may also require sending
  # GETs instead of HEADs to a server.  Details are documented below.
  # -- Larry Gilbert <larry@n2h2.com>

  # Normally we try a HEAD request first, then a GET request if
  # needed. There may be circumstances in which we skip doing a HEAD
  # (e.g. when we should be getting the whole document).
  foreach my $try ('HEAD', 'GET') {

    # Skip trying HEAD when we know we need to do a GET or when we
    # know only a GET will work anyway.
    next if $try eq 'HEAD' and
      ($reqtype eq 'GET'
       or $url->scheme eq 'gopher'
       or (defined $url->authority and $main::servers_get_only{$url->authority}));

    # Output what we are going to do with this link
    output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1);

    # Create the request with all appropriate headers
    my %header_hash = ( 'Referer' => $urlparent );
    if (defined($language) && ($language ne '')) {
      $header_hash{'Accept-Language'} = $language;
    }
    my $ref_header = new HTTP::Headers(%header_hash);
    my $request = new HTTP::Request($try, $url, $ref_header);
    $response = $main::ua->simple_request($request);

    # If we are doing a HEAD request we need to make sure nothing
    # fishy happened. We use some heuristics to see if we are ok, or
    # if we should try again with a GET request.
    if ($try eq 'HEAD') {

      # 400, 405, 406 and 501 are standard indications that HEAD
      # shouldn't be used.
      # We used to check for 403 here also, but according to the HTTP spec
      # a 403 indicates that the server understood us fine but really does
      # not want us to see the page, so we SHOULD NOT retry.
      if ($response->code =~ /^(400|405|406|501)$/o) {
        output "Server does not seem to like HEAD requests; retrying", 2;
        $main::servers_get_only{$url->authority}++;
        next;
      };

      # There are many servers out there that have real trouble with
      # HEAD, so if we get a 500 Internal Server error just retry with
      # a GET request to get an authoritative answer. We used to do this
      # only for special cases, but the list got big and some
      # combinations (e.g. Zope server behind Apache proxy) can't
      # easily be detected from the headers.
      if ($response->code =~ /^500$/o) {
        output "Internal server error on HEAD request; retrying with GET", 2;
        $main::servers_get_only{$url->authority}++ if defined $url->authority;
        next;
      }

      # If we know the server we can try some specific heuristics
      if (defined $response->server) {

        # Netscape Enterprise has been seen returning 500 and even 404
        # (yes, 404!!) in response to HEAD requests
        if ($response->server =~ /^Netscape-Enterprise/o
            and $response->code =~ /^404$/o) {
          output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2;
          $main::servers_get_only{$url->authority}++;
          next;
        };
      }

      # If a HEAD request resulted in nothing noteworthy, no need for
      # any further attempts using GET, we are done.
      last;
    }
  }

  return $response;
}

# This routine creates a (temporary) WWW page based on the current
# findings. This allows somebody to monitor the process, but is also
# convenient when this program crashes or waits because of diskspace
# or memory problems.

sub create_page {
    my($final_page) = @_;

    my $path = "";
    my $prevpath = "";
    my $prevcode = 0;
    my $prevmessage = "";

    output "*** Start writing results page";

    open(OUT, ">$main::file.new")
        || die "$0: Unable to open $main::file.new for writing: $!\n";
    print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
    print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
    print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
    print OUT "<head>\n";
    if (!$final_page) {
      printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
        int($main::cp_int * 60 / 2 - 5);
    }

    print OUT "<title>Checkbot report</title>\n";
    print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
    print OUT "</head>\n";
    print OUT "<body>\n";
    print OUT "<h1><em>Checkbot</em>: main report</h1>\n";

    # Show the status of this checkbot session
    print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>";
    if ($final_page) {
      print OUT "Done.<br />\n";
      print OUT 'Run started on ' . localtime($start_time) . ".<br />\n";
      print OUT 'Run duration ', duration(time() - $start_time), ".\n"
        if $main::useduration;
    } else {
      print OUT "Running since " . localtime($start_time) . ".<br />\n";
      print OUT "Last update at " . localtime() . ".<br />\n";
      print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n";
    }
    print OUT "</td></tr></table>\n\n";

    # Summary (very brief overview of key statistics)
    print OUT "<hr /><h2 class='summary'>Report summary</h2>\n";

    print OUT "<table summary=\"Report summary\" class='summary'>\n";
    print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n";
    print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n";
    print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n";
    print OUT "</table>\n";

    # Server information
    printAllServers($final_page);

    # Checkbot session parameters
    print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n";
    print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n";
    print OUT "<tr><th align=\"left\">--url &amp;<br/> &lt;command line urls&gt;</th><td class='text'>Start URL(s)</td><td class='value' id='url'>",
              join(',', @starturls), "</td></tr>\n";
    print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n";
    print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
    print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter;
    print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy;
    print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
    print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress;
    print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx';
    print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual;
    print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only;
    print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies;
    print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0);
    print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n";
    print OUT "</table>\n";

    # Statistics for types of links

    print OUT signature();

    close(OUT);

    rename($main::file, $main::file . ".bak");
    rename($main::file . ".new", $main::file);

    unlink $main::file . ".bak" unless $main::opt_debug;

    output "*** Done writing result page";
}

# Create a list of all the servers, and create the corresponding table
# and subpages. We use the servers overview for this. This can result
# in strange effects when the same server (e.g. IP address) has
# several names, because several entries will appear. However, when
# using the IP address there are also a number of tricky situations,
# e.g. with virtual hosting. Given that likely the servers have
# different names for a reason, I think it is better to have
# duplicate entries in some cases, instead of working off of the IP
# addresses.

sub printAllServers {
  my ($finalPage) = @_;

  my $server;
  print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n";
  print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n";

  foreach $server (sort keys %main::servers) {
    print_server($server, $finalPage);
  }
  print OUT "</table>\n\n";
}

sub get_server_type {
  my($server) = @_;

  my $result;

  if ( ! defined($main::server_type{$server})) {
    if ($server eq 'localhost') {
      $result = 'Direct access through filesystem';
    } else {
      my $request = new HTTP::Request('HEAD', "http://$server/");
      my $response = $main::ua->simple_request($request);
      $result = $response->header('Server');
    }
    $result = "Unknown server type" if ! defined $result or $result eq "";
    output "=== Server $server is a $result";
    $main::server_type{$server} = $result;
  }
  $main::server_type{$server};
}

sub add_checked {
  my($urlstr) = @_;
  my $item;
  my $result = 0;

  if (is_internal($urlstr) and not $main::opt_enable_virtual) {
    # Substitute hostname with IP-address. This keeps us from checking
    # the same pages for each name of the server, wasting time & resources.
    # Only do this if we are not dealing with virtual servers. Also, we
    # only do this for internal servers, because it makes no sense for
    # external links.
    my $url = URI->new($urlstr);
    $url->host(ip_address($url->host)) if $url->can('host');
    $urlstr = $url->as_string;
  }

  if (defined $main::checked{$urlstr}) {
    $result = 1;
    $main::checked{$urlstr}++;
  } else {
    $main::checked{$urlstr} = 1;
  }

  return $result;
}

# Has this URL already been checked?
sub is_checked {
  my ($urlstr) = @_;

  if (is_internal($urlstr) and not $main::opt_enable_virtual) {
    # Substitute hostname with IP-address. This keeps us from checking
    # the same pages for each name of the server, wasting time & resources.
    # Only do this if we are not dealing with virtual servers. Also, we
    # only do this for internal servers, because it makes no sense for
    # external links.
    my $url = URI->new($urlstr);
    $url->host(ip_address($url->host)) if $url->can('host');
    $urlstr = $url->as_string;
  }

  return defined $main::checked{$urlstr};
}

sub add_error ($$$$) {
  my ($url, $urlparent, $code, $status) = @_;

  # Check for the quick eliminations first
  return if $code =~ /$main::opt_dontwarn/o
    or defined $suppression{$code}{$url};

  # Check for matches on the regular expressions in the suppression file
  if (defined $suppression{$code}) {
    foreach my $item ( keys %{$suppression{$code}} ) {
      if ($item =~ /^\/(.*)\/$/) {
        my $regexp = $1;
        if ($url =~ $regexp) {
          output "Suppressing error $code for $url due to regular expression match on $regexp", 2;
          return;
        }
      }
    }
  }

  $status = checkbot_status_message($code) if not defined $status;

  output "$code $status", 2;

  $url_error{$url}{'code'} = $code;
  $url_error{$url}{'status'} = $status;
  push @{$url_parent{$url}}, $urlparent;
  $stats{'problem'}++;
}

# Parse document, and get the links
sub handle_doc {
  my ($response, $urlstr) = @_;

  my $num_links = 0;
  my $new_links = 0;

  # TODO: we are making an assumption here that the $response->base is
  # valid, which might not always be true! This needs to be fixed, but
  # first let's try to find out why this stuff is sometimes not
  # valid... Aha. a simple <base href="news:"> will do the trick. It is
  # not clear what the right fix for this is.

  # We use the URL we used to retrieve this document as the URL to
  # attach the problem reports to, even though this may not be the
  # proper base url.
  my $baseurl = URI->new($urlstr);

  # When we received the document we can add a notch to its server
  $main::servers{$baseurl->authority}++;

  # Retrieve useful information from this document.
  # TODO: using a regexp is NOT how this should be done, but it is
  # easy. The right way would be to write a HTML::Parser or to use
  # XPath on the document DOM provided that the document is easily
  # parsed as XML. Either method is a lot of overhead.
  if ($response->content =~ /title\>(.*?)\<\/title/si) {

    # TODO: using a general hash that stores titles for all pages may
    # consume too much memory. It would be better to only store the
    # titles for requests that had problems. That requires passing them
    # down to the queue. Take the easy way out for now.
    $url_title{$baseurl} = $1;
  }

  # Check if this document has a Robots META tag. If so, check if
  # Checkbot is allowed to FOLLOW the links on this page. Note that we
  # ignore the INDEX directive because Checkbot is not an indexing
  # robot. See http://www.robotstxt.org/wc/meta-user.html
  # TODO: one more reason (see title) to properly parse this document...
  if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) {
    output "Obeying robots meta tag $&, skipping document", 2;
    return;
  }

  # Parse the document just downloaded, using the base url as defined
  # in the response, otherwise we won't get the same behavior as
  # browsers and miss things like a BASE url in pages.
  my $p = HTML::LinkExtor->new(undef, $response->base);

  # If charset information is missing then decoded_content doesn't
  # work. Fall back to content in this case, even though that may lead
  # to charset warnings. See bug 1665075 for reference.
  my $content = $response->decoded_content || $response->content;
  $p->parse($content);
  $p->eof;

  # Deal with the links we found in this document
  my @links = $p->links();
  foreach (@links) {
    my ($tag, %l) = @{$_};
    foreach (keys %l) {
      # Get the canonical URL, so we don't need to worry about base, case, etc.
      my $url = $l{$_}->canonical;

      # Remove fragments, if any
      $url->fragment(undef);

      # Determine in which tag this URL was found.
      # Ignore <base> tags because they need not point to a valid URL
      # in order to work (e.g. when directory indexing is turned off).
      next if $tag eq 'base';

      # Skip some 'links' that are not required to link to an actual
      # live link but which LinkExtor returns as links anyway.
      next if $tag eq 'applet' and $_ eq 'code';
      next if $tag eq 'object' and $_ eq 'classid';

      # Run filter on the URL if defined
      if (defined $main::opt_filter) {
        die "Filter supplied with --filter option contains errors!\n$@\n"
          unless defined eval '$url =~ s' . $main::opt_filter;
      }

      # Should we ignore this URL?
      if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) {
        output "--ignore: $url", 1;
        next;
      }

      # Check whether URL has fully-qualified hostname
      if ($url->can('host') and $url->scheme ne 'news') {
        if (! defined $url->host) {
          add_error($url, $baseurl->as_string, '901',
                    $checkbot_errors{'901'});
        } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) {
          add_error($url, $baseurl->as_string, '902',
                    $checkbot_errors{'902'});
        }
      }

      # Some servers do not process // correctly in requests for relative
      # URLs. We should flag them here. Note that // in a URL path is
      # actually valid per RFC 2396, and that they should not be removed
      # when processing relative URLs as per RFC 1808. See
      # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>.
      # Thanks to Randal Schwartz and Reinier Post for their explanations.
      if ($url =~ /^http:\/\/.*\/\//) {
        add_error($url, $baseurl->as_string, '903',
                  $checkbot_errors{'903'});
      }

      # We add all URLs found to the queue, unless we already checked
      # it earlier
      if (is_checked($url)) {

        # If an error has already been logged for this URL we add the
        # current parent to the list of parents on which this URL
        # appears.
        if (defined $url_error{$url}) {
          push @{$url_parent{$url}}, $baseurl->as_string;
          $stats{'problem'}++;
        }

        $stats{'link'}++;
      } else {
        add_to_queue($url, $baseurl);
        $stats{'todo'}++;
        $new_links++;
      }
      $num_links++;
    }
  }
  output "Got $num_links links ($new_links new) from document", 2;
}
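
# The queue files used by check_links store one entry per line in the
# form "URL|parent-URL" (see add_to_queue below and the corresponding
# split in handle_url). Note that this relies on queued URLs not
# containing a literal '|' character.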

sub add_to_queue {
  my ($url, $parent) = @_;

  print QUEUE $url . '|' . $parent . "\n";
}

sub checkbot_status_message ($) {
  my ($code) = @_;

  return status_message($code) || $checkbot_errors{$code}
    || '(Undefined status)';
}

sub print_server ($$) {
  my($server, $final_page) = @_;

  my $host = $server;
  $host =~ s/(.*):\d+/$1/;

  output "Writing server $server (really " . ip_address($host) . ")", 1;

  my $server_problem = count_problems($server);
  my $filename = "$main::server_prefix-$server.html";
  $filename =~ s/:/-/o;

  print OUT "<tr><td class='server'>";
  print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0;
  print OUT "$server";
  print OUT "</a>" if $server_problem > 0;
  print OUT "</td>";
  print OUT "<td class='servertype'>" . get_server_type($server) . "</td>";
  printf OUT "<td class='unique' align=\"right\">%d</td>",
    $main::servers{$server} + $server_problem;
  if ($server_problem) {
    printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>",
      $server_problem;
  } else {
    printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>",
      $server_problem;
  }

  my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100;
  print OUT "<td class='ratio' align=\"right\">";
  print OUT "<strong>" unless $ratio < 0.5;
  printf OUT "%4d%%", $ratio;
  print OUT "</strong>" unless $ratio < 0.5;
  print OUT "</td>";
  print OUT "</tr>\n";

  # Create this server file
  open(SERVER, ">$filename")
    || die "Unable to open server file $filename for writing: $!";
  print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
  print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
  print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
  print SERVER "<head>\n";
  if (!$final_page) {
    printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
      int($main::cp_int * 60 / 2 - 5);
  }
  print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
  print SERVER "<title>Checkbot: output for server $server</title></head>\n";
  print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n";
  print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>";

  printServerProblems($server, $final_page);

  print SERVER "\n";
  print SERVER signature();

  close SERVER;
}

# Return a string containing Checkbot's signature for HTML pages
sub signature {
  return "<hr />\n<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n".
    "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>".
    "</body></html>";
}

# Loop through all possible problems, select relevant ones for this server
# and display them in a meaningful way.
sub printServerProblems ($$) {
  my ($server, $final_page) = @_;
  $server = quotemeta $server;

  my $separator = "<hr />\n";

  my %thisServerList = ();

  # First we find all the problems for this particular server
  foreach my $url (keys %url_parent) {
    foreach my $parent (@{$url_parent{$url}}) {
      next if $parent !~ $server;
      chomp $parent;
      $thisServerList{$url_error{$url}{'code'}}{$parent}{$url}
        = $url_error{$url}{'status'};
    }
  }

  # Do a run to find all error codes on this page, and include a table
  # of contents to the actual report
  foreach my $code (sort keys %thisServerList) {
    print SERVER ", <a href=\"#rc$code\">$code ";
    print SERVER checkbot_status_message($code);
    print SERVER "</a>";
  }
  print SERVER ".</p>\n";

  # Now run through this list and print the errors
  foreach my $code (sort keys %thisServerList) {
    my $codeOut = '';

    foreach my $parent (sort keys %{ $thisServerList{$code} }) {
      my $urlOut = '';
      foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) {
        my $status = $thisServerList{$code}{$parent}{$url};
        $urlOut .= "<li><a href=\"$url\">$url</a><br/>\n";
        $urlOut .= "$status"
          if defined $status and $status ne checkbot_status_message($code);
        $urlOut .= "</li>\n";
      }
      if ($urlOut ne '') {
        $codeOut .= "<dt><a href=\"$parent\">$parent</a>";
        $codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent};
        $codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n";
      }
    }

    if ($codeOut ne '') {
      print SERVER $separator if $separator;
      $separator = '';
      print SERVER "<h4 id=\"rc$code\">$code ";
      print SERVER checkbot_status_message($code);
      print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n";
    }
  }
}

sub check_point {
  if ( ($main::cp_last + 60 * $main::cp_int < time())
       || ($main::opt_debug && $main::opt_verbose)) {
    &create_page(0);
    $main::cp_last = time();
    # Increase the interval from one snapshot to the next by 25%
    # until we have reached the maximum.
    $main::cp_int *= 1.25 unless $main::opt_debug;
    $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval;
  }
}

sub send_mail {
  my $msg = new Mail::Send;
  my $sub = 'Checkbot results for ';
  $sub .= join(', ', @starturls);
  $sub .= ': ' . $stats{'problem'} . ' errors';

  $msg->to($main::opt_mailto);
  $msg->subject($sub);

  my $fh = $msg->open;

  print $fh "Checkbot results for:\n  " . join("\n  ", @starturls) . "\n\n";
  print $fh "User-supplied note: $main::opt_note\n\n"
    if defined $main::opt_note;

  print $fh $stats{'link'}, " links were checked, and ";
  print $fh $stats{'problem'}, " problems were detected.\n";

  print $fh 'Run started on ' . localtime($start_time) . "\n";
  print $fh 'Run duration ', duration(time() - $start_time), "\n"
    if $main::useduration;

  print $fh "\n-- \nCheckbot $VERSION\n";
  print $fh "<URL:http://degraaff.org/checkbot/>\n";

  $fh->close;
}

sub print_help {
  print <<"__EOT__";
Checkbot $VERSION command line options:

  --cookies          Accept cookies from the server
  --debug            Debugging mode: No pauses, stop after 25 links.
  --file file        Use file as basis for output file names.
  --help             Provide this message.
  --mailto address   Mail brief synopsis to address when done.
  --noproxy domains  Do not proxy requests to given domains.
  --verbose          Verbose mode: display many messages about progress.
  --url url          Start URL
  --match match      Check pages only if URL matches `match'
                     If no match is given, the start URL is used as a match
  --exclude exclude  Exclude pages if the URL matches 'exclude'
  --filter regexp    Run regexp on each URL found
  --ignore ignore    Ignore URLs matching 'ignore'
  --suppress file    Use contents of 'file' to suppress errors in output
  --note note        Include Note (e.g. URL to report) along with Mail message.
  --proxy URL        URL of proxy server for HTTP and FTP requests.
  --internal-only    Only check internal links, skip checking external links.
  --sleep seconds    Sleep this many seconds between requests (default 0)
  --style url        Reference the style sheet at this URL.
  --timeout seconds  Timeout for http requests in seconds (default 120)
  --interval seconds Maximum time interval between updates (default 10800)
  --dontwarn codes   Do not write warnings for these HTTP response codes
  --enable-virtual   Use only virtual names, not IP numbers for servers
  --language code    Specify 2-letter language code for language negotiation

Options --match, --exclude, and --ignore can take a perl regular expression
as their argument.

Use 'perldoc checkbot' for more verbose documentation.
Checkbot WWW page     : http://degraaff.org/checkbot/
Mail bugs and problems: checkbot\@degraaff.org
__EOT__

  exit 0;
}

sub ip_address {
  my($host) = @_;

  return $main::ip_cache{$host} if defined $main::ip_cache{$host};

  my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host);
  if (defined $addrs[0]) {
    my($n1,$n2,$n3,$n4) = unpack('C4',$addrs[0]);
    $main::ip_cache{$host} = "$n1.$n2.$n3.$n4";
  } else {
    # Whee! No IP-address found for this host. Just keep whatever we
    # got for the host. If this really is some kind of error it will
    # be found later on.
    $main::ip_cache{$host} = $host;
  }

  # Return the cached address (or the host name fallback).
  return $main::ip_cache{$host};
}

sub count_problems {
  my ($server) = @_;
  $server = quotemeta $server;
  my $count = 0;

  foreach my $url (sort keys %url_parent) {
    foreach my $parent (@{ $url_parent{$url} }) {
      $count++ if $parent =~ m/$server/;
    }
  }
  return $count;
}