#!/usr/bin/perl -w
#
# checkbot - A perl5 script to check validity of links in www document trees
#
# Hans de Graaff <hans@degraaff.org>, 1994-2005.
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
# With contributions from Bruce Speyer <bruce.speyer@elecomm.com>
#
# This application is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Info-URL: http://degraaff.org/checkbot/
#
# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $
# (Log information can be found at the end of the script)

require 5.004;
use strict;

require LWP;
use File::Basename;

# Time::Duration is optional; when it is available we report run times
# in a human-readable form, otherwise we silently skip that.
BEGIN {
    eval "use Time::Duration qw(duration)";
    $main::useduration = ($@ ? 0 : 1);
}

# Version information
my $VERSION = '1.80';


=head1 NAME

Checkbot - WWW Link Verifier

=head1 SYNOPSIS

checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>]
         [B<--mailto> email addresses] [B<--noproxy> list of domains]
         [B<--verbose>]
         [B<--url> start URL]
         [B<--match> match string] [B<--exclude> exclude string]
         [B<--proxy> proxy URL] [B<--internal-only>]
         [B<--ignore> ignore string]
         [B<--filter> substitution regular expression]
         [B<--style> style file URL]
         [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
         [B<--interval> seconds] [B<--dontwarn> HTTP response codes]
         [B<--enable-virtual>]
         [B<--language> language code]
         [B<--suppress> suppression file]
         [start URLs]

=head1 DESCRIPTION

Checkbot verifies the links in a specific portion of the World Wide
Web. It creates HTML pages with diagnostics.

Checkbot uses LWP to find URLs on pages and to check them. It supports
the same schemes as LWP does, and finds the same links that
HTML::LinkExtor will find.

Checkbot considers links to be either 'internal' or
'external'.
Internal links are links within the web space that needs
to be checked. If an internal link points to a web document this
document is retrieved, and its links are extracted and
processed. External links are only checked to be working. Checkbot
checks links as it finds them, so internal and external links are
checked at the same time, even though they are treated differently.

Options for Checkbot are:

=over 4

=item --cookies

Accept cookies from the server and offer them again at later
requests. This may be useful for servers that use cookies to handle
sessions. By default Checkbot does not accept any cookies.

=item --debug

Enable debugging mode. Not really supported anymore, but it will keep
some files around that otherwise would be deleted.

=item --file <file name>

Use the file I<file name> as the basis for the summary file names. The
summary page will get the I<file name> given, and the server pages are
based on the I<file name> without the .html extension. For example,
setting this option to C<index.html> will create a summary page called
index.html and server pages called index-server1.html and
index-server2.html.

The default value for this option is C<checkbot.html>.

=item --help

Shows brief help message on the standard output.

=item --mailto <email address>[,<email address>]

Send mail to the I<email address> when Checkbot is done checking. You
can give more than one address separated by commas. The notification
email includes a small summary of the results. As of Checkbot 1.76
email is only sent if problems have been found during the Checkbot
run.

=item --noproxy <list of domains>

Do not proxy requests to the given domains. The list of domains must
be a comma-separated list. For example, to avoid using the proxy for
the localhost and someserver.xyz, you can use C<--noproxy
localhost,someserver.xyz>.

=item --verbose

Show verbose output while running. Includes all links checked, results
from the checks, etc.

=item --url <start URL>

Set the start URL. Checkbot starts checking at this URL, and then
recursively checks all links found on this page. The start URL takes
precedence over additional URLs specified on the command line.

If no scheme is specified for the URL, the file protocol is assumed.

=item --match <match string>

This option selects which pages Checkbot considers local. If the
I<match string> is contained within the URL, then Checkbot considers
the page local, retrieves it, and will check all the links contained
on it. Otherwise the page is considered external and it is only
checked with a HEAD request.

If no explicit I<match string> is given, the start URLs (See option
C<--url>) will be used as a match string instead. In this case the
last page name, if any, will be trimmed. For example, a start URL like
C<http://some.site/index.html> will result in a default I<match
string> of C<http://some.site/>.

The I<match string> can be a perl regular expression. For example, to
check the main server page and all HTML pages directly underneath it,
but not the HTML pages in the subdirectories of the server, the
I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>.

=item --exclude <exclude string>

URLs matching the I<exclude string> are considered to be external,
even if they happen to match the I<match string> (See option
C<--match>). URLs matching the --exclude string are still being
checked and will be reported if problems are found, but they will not
be checked for further links into the site.

The I<exclude string> can be a perl regular expression. For example,
to consider all URLs with a query string external, use C<[=\?]>. This
can be useful when a URL with a query string unlocks the path to a
huge database which will be checked.

=item --filter <filter string>

This option defines a I<filter string>, which is a perl regular
expression. This filter is run on each URL found, thus rewriting the
URL before it enters the queue to be checked. It can be used to remove
elements from a URL. This option can be useful when symbolic links
point to the same directory, or when a content management system adds
session IDs to URLs.

For example C</old/new/> would replace occurrences of 'old' with 'new'
in each URL.

=item --ignore <ignore string>

URLs matching the I<ignore string> are not checked at all, they are
completely ignored by Checkbot. This can be useful to ignore known
problem links, or to ignore links leading into databases. The I<ignore
string> is matched after the I<filter string> has been applied.

The I<ignore string> can be a perl regular expression.

For example C<www.server.com\/(one|two)> would match all URLs starting
with either www.server.com/one or www.server.com/two.


=item --proxy <proxy URL>

This attribute specifies the URL of a proxy server. Only the HTTP and
FTP requests will be sent to that proxy server.

=item --internal-only

Skip the checking of external links at the end of the Checkbot
run. Only matching links are checked. Note that some redirections may
still cause external links to be checked.

=item --note <note>

The I<note> is included verbatim in the mail message (See option
C<--mailto>). This can be useful to include the URL of the summary HTML page
for easy reference, for instance.

Only meaningful in combination with the C<--mailto> option.

=item --sleep <seconds>

Number of I<seconds> to sleep in between requests. Default is 0
seconds, i.e. do not sleep at all between requests. Setting this
option can be useful to keep the load on the web server down while
running Checkbot. This option can also be set to a fractional number,
i.e. a value of 0.1 will sleep one tenth of a second between requests.

=item --timeout <timeout>

Default timeout for the requests, specified in seconds. The default is
2 minutes.

=item --interval <seconds>

The maximum interval between updates of the results web pages in
seconds. Default is 3 hours (10800 seconds). Checkbot will start the
interval at one minute, and gradually extend it towards the maximum
interval.

=item --style <URL of style file>

When this option is used, Checkbot embeds this URL as a link to a
style file on each page it writes. This makes it easy to customize the
layout of pages generated by Checkbot.

=item --dontwarn <HTTP response codes regular expression>

Do not include warnings on the result pages for those HTTP response
codes which match the regular expression. For instance, --dontwarn
"(301|404)" would not include 301 and 404 response codes.

Checkbot uses the response codes generated by the server, even if this
response code is not defined in RFC 2616 (HTTP/1.1). In addition to
the normal HTTP response code, Checkbot defines a few response codes
for situations which are not technically a problem, but which causes
problems in many cases anyway. These codes are:

  901 Host name expected but not found
      In this case the URL supports a host name, but none was found
      in the URL. This usually indicates a mistake in the URL. An
      exception is that this check is not applied to news: URLs.

  902 Unqualified host name found
      In this case the host name does not contain the domain part.
      This usually means that the pages work fine when viewed within
      the original domain, but not when viewed from outside it.

  903 Double slash in URL path
      The URL has a double slash in it. This is legal, but some web
      servers cannot handle it very well and may cause Checkbot to
      run away. See also the comments below.

  904 Unknown scheme in URL
      The URL starts with a scheme that Checkbot does not know
      about. This is often caused by mistyping the scheme of the URL,
      but the scheme can also be a legal one. In that case please let
      me know so that it can be added to Checkbot.

=item --enable-virtual

This option enables dealing with virtual servers. Checkbot then
assumes that all hostnames for internal servers are unique, even
though their IP addresses may be the same. Normally Checkbot uses the
IP address to distinguish servers. This has the advantage that if a
server has two names (e.g. www and bamboozle) its pages only get
checked once. When you want to check multiple virtual servers this
causes problems, which this feature works around by using the hostname
to distinguish the server.

=item --language

The argument for this option is a two-letter language code. Checkbot
will use language negotiation to request files in that language. The
default is to request English language (language code 'en').

=item --suppress

The argument for this option is a file which contains combinations of
error codes and URLs for which to suppress warnings. This can be used
to avoid reporting of known and unfixable URL errors or warnings.

The format of the suppression file is a simple whitespace delimited
format, first listing the error code followed by the URL. Each error
code and URL combination is listed on a new line. Comments can be
added to the file by starting the line with a C<#> character.

  # 301 Moved Permanently
  301 http://www.w3.org/P3P

  # 403 Forbidden
  403 http://www.herring.com/

For further flexibility a regular expression can be used instead of a
normal URL. The regular expression must be enclosed with forward
slashes. For example, to suppress all 403 errors on wikipedia:

  403 /http:\/\/wikipedia.org\/.*/

=back

Deprecated options which will disappear in a future release:

=over

=item --allow-simple-hosts (deprecated)

This option turns off warnings about URLs which contain unqualified
host names. This is useful for intranet sites which often use just a
simple host name or even C<localhost> in their links.

Use of this option is deprecated. Please use the --dontwarn mechanism
for error 902 instead.

=back


=head1 HINTS AND TIPS

=over

=item Problems with checking FTP links

Some users may experience consistent problems with checking FTP
links. In these cases it may be useful to instruct Net::FTP to use
passive FTP mode to check files. This can be done by setting the
environment variable FTP_PASSIVE to 1. For example, using the bash
shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation
for more details.

=item Run-away Checkbot

In some cases Checkbot literally takes forever to finish. There are two
common causes for this problem.

First, there might be a database application as part of the web site
which generates a new page based on links on another page. Since
Checkbot tries to travel through all links this will create an
infinite number of pages. This kind of run-away effect is usually
predictable. It can be avoided by using the --exclude option.

Second, a server configuration problem can cause a loop in generating
URLs for pages that really do not exist. This will result in URLs of
the form http://some.server/images/images/images/logo.png, with ever
more 'images' included. Checkbot cannot check for this because the
server should have indicated that the requested pages do not
exist. There is no easy way to solve this other than fixing the
offending web server or the broken links.

=item Problems with https:// links

The error message

  Can't locate object method "new" via package "LWP::Protocol::https::Socket"

usually means that the current installation of LWP does not support
checking of SSL links (i.e. links starting with https://). This
problem can be solved by installing the Crypt::SSLeay module.

=back

=head1 EXAMPLES

The most simple use of Checkbot is to check a set of pages on a
server. To check my checkbot pages I would use:

  checkbot http://degraaff.org/checkbot/

Checkbot runs can take some time so Checkbot can send a notification
mail when the run is done:

  checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/

It is possible to check a set of local files without using a web
server. This only works for static files but may be useful in some
cases.

  checkbot file:///var/www/documents/

=head1 PREREQUISITES

This script uses the C<LWP> modules.

=head1 COREQUISITES

This script can send mail when C<Mail::Send> is present.
=head1 AUTHOR

Hans de Graaff <hans@degraaff.org>

=pod OSNAMES

any

=cut

# Declare some global variables, avoids ugly use of main:: all around
my %checkbot_errors = ('901' => 'Host name expected but not found',
                       '902' => 'Unqualified host name in URL',
                       '903' => 'URL contains double slash in URL',
                       '904' => 'Unknown scheme in URL',
                      );

my @starturls = ();

# Two hashes to store the response to a URL, and all the parents of the URL
my %url_error = ();
my %url_parent = ();

# Hash for storing the title of a URL for use in reports. TODO: remove
# this and store title as part of queue.
my %url_title = ();

# Hash for suppressions, which are defined as a combination of code and URL
my %suppression = ();

# Hash to store statistics on link checking
my %stats = ('todo'    => 0,
             'link'    => 0,
             'problem' => 0 );

# Options hash (to be filled by GetOptions)
my %options = ();

# Keep track of start time so that we can use it in reports
my $start_time = time();

# If on a Mac we should ask for the arguments through some MacPerl stuff
if ($^O eq 'MacOS') {
    $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')";
    push(@ARGV, split(' ', $main::mac_answer));
}

# Prepare
check_options();
init_modules();
init_globals();
init_suppression();

# Start actual application
check_links();

# Finish up
create_page(1);
send_mail() if defined $main::opt_mailto and $stats{problem} > 0;

exit 0;

# output prints stuff on stderr if --verbose, and takes care of proper
# indentation. $level (optional) is the number of two-space indents to
# prepend.
sub output {
    my ($line, $level) = @_;

    return unless $main::opt_verbose;

    chomp $line;

    my $indent = '';

    if (defined $level) {
        while ($level-- > 0) {
            $indent .= '  ';
        }
    }

    print STDERR $indent, $line, "\n";
}

### Initialization and setup routines

# Parse the command line, apply defaults, and validate/normalize option
# values. Dies via print_help when no start URL is given at all.
sub check_options {

    # Get command-line arguments
    use Getopt::Long;
    my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));

    # Handle arguments, some are mandatory, some have defaults
    &print_help if (($main::opt_help && $main::opt_help)
                    || (!$main::opt_url && $#ARGV == -1));
    $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout);
    $main::opt_verbose = 0 unless $main::opt_verbose;
    $main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep);
    $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
    $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
    $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;

    # Set the default language and make sure it is a two letter, lowercase code
    $main::opt_language = 'en' unless defined $main::opt_language;
    $main::opt_language = lc(substr($main::opt_language, 0, 2));
    $main::opt_language =~ tr/a-z//cd;
    if ($main::opt_language !~ /[a-z][a-z]/) {
        warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n";
        $main::opt_language = 'en';
    }
    $main::opt_allow_simple_hosts = 0
        unless $main::opt_allow_simple_hosts;
    output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts;

    # The default for opt_match will be set later, because we might want
    # to muck with opt_url first.

    # Display messages about the options
    output "*** Starting Checkbot $VERSION in verbose mode";
    output 'Will skip checking of external links', 1
        if $main::opt_internal_only;
    output "Allowing unqualified host names", 1
        if $main::opt_allow_simple_hosts;
    output "Not using optional Time::Duration module: not found", 1
        unless $main::useduration;
}

# Load the LWP machinery and configure the shared user agent (timeout,
# proxy settings, optional cookie jar).
sub init_modules {

    use URI;
    # Prepare the user agent to be used:
    use LWP::UserAgent;
    use LWP::MediaTypes;
    #use LWP::Debug qw(- +debug);
    use HTML::LinkExtor;
    $main::ua = LWP::UserAgent->new;
    $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version);
    $main::ua->timeout($main::opt_timeout);
    # Add a proxy to the user agent, if defined
    $main::ua->proxy(['http', 'ftp'], $main::opt_proxy)
        if defined($main::opt_proxy);
    $main::ua->no_proxy(split(',', $main::opt_noproxy))
        if defined $main::opt_noproxy;
    # Add a cookie jar to the UA if requested by the user
    $main::ua->cookie_jar( {} )
        if defined $main::opt_cookies or $main::opt_cookies;

    # Mail::Send is only needed when a report mail was requested
    require Mail::Send if defined $main::opt_mailto;

    use HTTP::Status;
}

# Set up output file names, the temporary queue directory, signal
# handlers, the bookkeeping hashes, the list of start URLs and the
# default --match expression derived from them.
sub init_globals {
    my $url;

    # Directory and files for output
    if ($main::opt_file) {
        $main::file = $main::opt_file;
        $main::file =~ /(.*)\./;
        $main::server_prefix = $1;
    } else {
        $main::file = "checkbot.html";
        $main::server_prefix = "checkbot";
    }
    $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$";

    $main::cur_queue = $main::tmpdir . "/queue";
    $main::new_queue = $main::tmpdir . "/queue-new";

    # Make sure we catch signals so that we can clean up temporary files
    $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal;

    # Set up hashes to be used
    %main::checked = ();
    %main::servers = ();
    %main::servers_get_only = ();

    # Initialize the start URLs. --url takes precedence. Otherwise
    # just process URLs in order as they appear on the command line.
    unshift(@ARGV, $main::opt_url) if $main::opt_url;
    foreach (@ARGV) {
        $url = URI->new($_);
        # If no scheme is defined we will assume file is used, so that
        # it becomes easy to check a single file.
        $url->scheme('file') unless defined $url->scheme;
        $url->host('localhost') if $url->scheme eq 'file';
        if (!defined $url->host) {
            warn "No host specified in URL $url, ignoring it.\n";
            next;
        }
        push(@starturls, $url);
    }
    # BUGFIX: this used to compare against -1, which a count can never
    # be, so the sanity check was dead code. Compare against 0 instead.
    die "There are no valid starting URLs to begin checking with!\n"
        if scalar(@starturls) == 0;

    # Set the automatic matching expression to a concatenation of the starturls
    if (!defined $main::opt_match) {
        my @matchurls;
        foreach my $url (@starturls) {
            # Remove trailing files from the match, e.g. remove index.html
            # stuff so that we match on the host and/or directory instead,
            # but only if there is a path component in the first place.
            my $matchurl = $url->as_string;
            $matchurl =~ s!/[^/]+$!/! unless $url->path eq '';
            push(@matchurls, quotemeta $matchurl);
        }
        $main::opt_match = '^(' . join('|', @matchurls) . ')';
        output "--match defaults to $main::opt_match";
    }

    # Initialize statistics hash with number of start URLs
    $stats{'todo'} = scalar(@starturls);

    # We write out our status every now and then.
    $main::cp_int = 1;
    $main::cp_last = 0;
}

# Read the --suppress file (if given) into %suppression, keyed on error
# code and then URL (or /regexp/). Lines starting with # and blank
# lines are skipped.
sub init_suppression {
    return if not defined $main::opt_suppress;

    die "Suppression file \"$main::opt_suppress\" is in fact a directory"
        if -d $main::opt_suppress;

    open(SUPPRESSIONS, $main::opt_suppress)
        or die "Unable to open $main::opt_suppress for reading: $!\n";
    while (my $line = <SUPPRESSIONS>) {
        chomp $line;
        next if $line =~ /^#/ or $line =~ /^\s*$/;

        if ($line !~ /^\s*(\d+)\s+(\S+)/) {
            output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n $line\n";
        } else {
            output "Suppressed: $1 $2\n" if $main::opt_verbose;
            $suppression{$1}{$2} = $2;
        }
    }
    close SUPPRESSIONS;
}




### Main application code

# Main checking loop: pump URLs through a pair of on-disk queue files
# (current and new) until no new work is generated.
sub check_links {
    my $line;

    # BUGFIX: this used to read "mkdir $dir, 0755 || die ...". Because
    # || binds tighter than the comma, it attached to 0755 (always
    # true), so a failing mkdir was silently ignored. Use low-precedence
    # "or" so the whole mkdir call is tested.
    mkdir $main::tmpdir, 0755
        or die "$0: unable to create directory $main::tmpdir: $!\n";

    # Explicitly set the record separator. I had the problem that this
    # was not defined under my perl 5.00502. This should fix that, and
    # not cause problems for older versions of perl.
    $/ = "\n";

    open(CURRENT, ">$main::cur_queue")
        || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n";
    open(QUEUE, ">$main::new_queue")
        || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n";

    # Prepare CURRENT queue with starting URLs
    foreach (@starturls) {
        print CURRENT $_->as_string . "|\n";
    }
    close CURRENT;

    open(CURRENT, $main::cur_queue)
        || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";

    do {
        # Read a line from the queue, and process it
        while (defined ($line = <CURRENT>) ) {
            chomp($line);
            handle_url($line);
            check_point();
        }

        # Move queues around, and try again, but only if there are still
        # things to do
        output "*** Moving queues around, " . $stats{'todo'} . " links to do.";
        close CURRENT
            or warn "Error while closing CURRENT filehandle: $!\n";
        close QUEUE;

        # TODO: should check whether these succeed
        unlink($main::cur_queue);
        rename($main::new_queue, $main::cur_queue);

        open(CURRENT, "$main::cur_queue")
            || die "$0: Unable to open $main::cur_queue for reading: $!\n";
        open(QUEUE, ">$main::new_queue")
            || die "$0: Unable to open $main::new_queue for writing: $!\n";

    } while (not -z $main::cur_queue);

    close CURRENT;
    close QUEUE;

    unless (defined($main::opt_debug)) {
        clean_up();
    }
}

# Remove the temporary queue files and their directory.
sub clean_up {
    unlink $main::cur_queue, $main::new_queue;
    rmdir $main::tmpdir;
    output "Removed temporary directory $main::tmpdir and its contents.\n", 1;
}

# Signal handler: clean up temporary files (unless debugging) and exit.
sub got_signal {
    my ($signalname) = @_;

    clean_up() unless defined $main::opt_debug;

    print STDERR "Caught SIG$signalname.\n";
    exit 1;
}

# Whether URL is 'internal' or 'external': it must match --match and
# must not match --exclude.
sub is_internal ($) {
    my ($url) = @_;

    return ( $url =~ /$main::opt_match/o
             and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o));
}


# Process one queue line of the form "url|parenturl": decide whether to
# check it, with which method (GET for internal HTML, HEAD otherwise),
# perform the request, and record errors/redirects.
sub handle_url {
    my ($line) = @_;
    my ($urlstr, $urlparent) = split(/\|/, $line);

    my $reqtype;
    my $response;
    my $type;

    $stats{'todo'}--;

    # Add this URL to the ones we've seen already, return if it is a
    # duplicate.
    return if add_checked($urlstr);

    $stats{'link'}++;

    # Is this an external URL and we only check internal stuff?
    return if defined $main::opt_internal_only
        and not is_internal($urlstr);

    my $url = URI->new($urlstr);

    # Perhaps this is a URL we are not interested in checking...
    if (not defined($url->scheme)
        or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) {
        # Ignore URLs which we know we can ignore, create error for others.
        # Guard against an undefined scheme here, which previously
        # triggered "use of uninitialized value" warnings under -w.
        if (defined $url->scheme
            and $url->scheme =~ /^(news|mailto|javascript|mms)$/o) {
            output "Ignore $url", 1;
        } else {
            add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: "
                      . (defined $url->scheme ? $url->scheme : ''));
        }
        return;
    }

    # Guess/determine the type of document we might retrieve from this
    # URL. We do this because we only want to use a full GET for HTML
    # document. No need to retrieve images, etc.
    if ($url->path =~ /\/$/o || $url->path eq "") {
        $type = 'text/html';
    } else {
        $type = guess_media_type($url->path);
    }
    # application/octet-stream is the fallback of LWP's guess stuff, so
    # if we get this then we ask the server what we got just to be sure.
    if ($type eq 'application/octet-stream') {
        $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language);
        $type = $response->content_type;
    }

    # Determine if this is a URL we should GET fully or partially (using HEAD)
    if ($type =~ /html/o
        && $url->scheme =~ /^(https?|file|ftp|gopher)$/o
        and is_internal($url->as_string)
        && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) {
        $reqtype = 'GET';
    } else {
        $reqtype = 'HEAD';
    }

    # Get the document, unless we already did while determining the type
    $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language)
        unless defined($response) and $reqtype eq 'HEAD';

    # Ok, we got something back from checking, let's see what it is
    if ($response->is_success) {
        select(undef, undef, undef, $main::opt_sleep)
            unless $main::opt_debug || $url->scheme eq 'file';

        # Internal HTML documents need to be given to handle_doc for processing
        if ($reqtype eq 'GET' and is_internal($url->as_string)) {
            handle_doc($response, $urlstr);
        }
    } else {

        # Right, so it wasn't the smashing success we hoped for, so bring
        # the bad news and store the pertinent information for later
        add_error($url, $urlparent, $response->code, $response->message);

        if ($response->is_redirect and is_internal($url->as_string)) {
            if ($response->code == 300) { # multiple choices, but no redirection available
                output 'Multiple choices', 2;
            } else {
                my $baseURI = URI->new($url);
                if (defined $response->header('Location')) {
                    my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
                    output "Redirected to $redir_url", 2;
                    add_to_queue($redir_url, $urlparent);
                    $stats{'todo'}++;
                } else {
                    output 'Location header missing from redirect response', 2;
                }
            }
        }
    }
    # Done with this URL
}

# Perform a single link check, trying HEAD first and falling back to
# GET when the server is known (or discovered) to mishandle HEAD.
# Returns the HTTP::Response.
sub performRequest {
    my ($reqtype, $url, $urlparent, $type, $language) = @_;

    my ($response);

    # A better solution here would be to use GET exclusively. Here is how
    # to do that. We would have to set this max_size thing in
    # check_external, I guess...
    # Set $ua->max_size(1) and then try a normal GET request. However,
    # that doesn't always work as evidenced by an FTP server that just
    # hangs in this case... Needs more testing to see if the timeout
    # catches this.

    # Normally, we would only need to do a HEAD, but given the way LWP
    # handles gopher requests, we need to do a GET on those to get at
    # least a 500 and 501 error. We would need to parse the document
    # returned by LWP to find out if we had problems finding the
    # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org>

    # We also need to do GET instead of HEAD if we know the remote
    # server won't accept it. The standard way for an HTTP server to
    # indicate this is by returning a 405 ("Method Not Allowed") or 501
    # ("Not Implemented"). Other circumstances may also require sending
    # GETs instead of HEADs to a server. Details are documented below.
    # -- Larry Gilbert <larry@n2h2.com>

    # Normally we try a HEAD request first, then a GET request if
    # needed. There may be circumstances in which we skip doing a HEAD
    # (e.g. when we should be getting the whole document).
    foreach my $try ('HEAD', 'GET') {

        # Skip trying HEAD when we know we need to do a GET or when we
        # know only a GET will work anyway.
        next if $try eq 'HEAD' and
            ($reqtype eq 'GET'
             or $url->scheme eq 'gopher'
             or (defined $url->authority and $main::servers_get_only{$url->authority}));

        # Output what we are going to do with this link
        output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1);

        # Create the request with all appropriate headers
        my %header_hash = ( 'Referer' => $urlparent );
        if (defined($language) && ($language ne '')) {
            $header_hash{'Accept-Language'} = $language;
        }
        my $ref_header = HTTP::Headers->new(%header_hash);
        my $request = HTTP::Request->new($try, $url, $ref_header);
        $response = $main::ua->simple_request($request);

        # If we are doing a HEAD request we need to make sure nothing
        # fishy happened. We use some heuristics to see if we are ok, or
        # if we should try again with a GET request.
        if ($try eq 'HEAD') {

            # 400, 405, 406 and 501 are standard indications that HEAD
            # shouldn't be used
            # We used to check for 403 here also, but according to the HTTP spec
            # a 403 indicates that the server understood us fine but really does
            # not want us to see the page, so we SHOULD NOT retry.
            if ($response->code =~ /^(400|405|406|501)$/o) {
                output "Server does not seem to like HEAD requests; retrying", 2;
                $main::servers_get_only{$url->authority}++;
                next;
            };

            # There are many servers out there that have real trouble with
            # HEAD, so if we get a 500 Internal Server error just retry with
            # a GET request to get an authoritative answer. We used to do this
            # only for special cases, but the list got big and some
            # combinations (e.g. Zope server behind Apache proxy) can't
            # easily be detected from the headers.
            if ($response->code =~ /^500$/o) {
                output "Internal server error on HEAD request; retrying with GET", 2;
                $main::servers_get_only{$url->authority}++ if defined $url->authority;
                next;
            }

            # If we know the server we can try some specific heuristics
            if (defined $response->server) {

                # Netscape Enterprise has been seen returning 500 and even 404
                # (yes, 404!!) in response to HEAD requests
                if ($response->server =~ /^Netscape-Enterprise/o
                    and $response->code =~ /^404$/o) {
                    output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2;
                    $main::servers_get_only{$url->authority}++;
                    next;
                };
            }

            # If a HEAD request resulted in nothing noteworthy, no need for
            # any further attempts using GET, we are done.
            last;
        }
    }

    return $response;
}


# This routine creates a (temporary) WWW page based on the current
# findings This allows somebody to monitor the process, but is also
# convenient when this program crashes or waits because of diskspace
# or memory problems

sub create_page {
    my($final_page) = @_;

    my $path = "";
    my $prevpath = "";
    my $prevcode = 0;
    my $prevmessage = "";

    output "*** Start writing results page";

    open(OUT, ">$main::file.new")
        || die "$0: Unable to open $main::file.new for writing:\n";
    print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
    print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
    print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
    print OUT "<head>\n";
    if (!$final_page) {
        printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
            int($main::cp_int * 60 / 2 - 5);
945 } 946 947 print OUT "<title>Checkbot report</title>\n"; 948 print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; 949 print OUT "</head>\n"; 950 print OUT "<body>\n"; 951 print OUT "<h1><em>Checkbot</em>: main report</h1>\n"; 952 953 # Show the status of this checkbot session 954 print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>"; 955 if ($final_page) { 956 print OUT "Done.<br />\n"; 957 print OUT 'Run started on ' . localtime($start_time) . ".<br />\n"; 958 print OUT 'Run duration ', duration(time() - $start_time), ".\n" 959 if $main::useduration; 960 } else { 961 print OUT "Running since " . localtime($start_time) . ".<br />\n"; 962 print OUT "Last update at ". localtime() . ".<br />\n"; 963 print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n"; 964 } 965 print OUT "</td></tr></table>\n\n"; 966 967 # Summary (very brief overview of key statistics) 968 print OUT "<hr /><h2 class='summary'>Report summary</h2>\n"; 969 970 print OUT "<table summary=\"Report summary\" class='summary'>\n"; 971 print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n"; 972 print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n"; 973 print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n"; 974 print OUT "</table>\n"; 975 976 # Server information 977 printAllServers($final_page); 978 979 # Checkbot session parameters 980 print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n"; 981 print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n"; 982 print OUT "<tr><th align=\"left\">--url &<br/> <command line urls></th><td class='text'>Start URL(s)</td><td class='value' id='url'>", 983 join(',', @starturls), "</td></tr>\n"; 984 print OUT "<tr><th align=\"left\">--match</th><td 
class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n"; 985 print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude; 986 print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter; 987 print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy; 988 print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore; 989 print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress; 990 print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx'; 991 print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual; 992 print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only; 993 print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies; 994 print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" 
if ($main::opt_sleep != 0); 995 print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n"; 996 print OUT "</table>\n"; 997 998 # Statistics for types of links 999 1000 print OUT signature(); 1001 1002 close(OUT); 1003 1004 rename($main::file, $main::file . ".bak"); 1005 rename($main::file . ".new", $main::file); 1006 1007 unlink $main::file . ".bak" unless $main::opt_debug; 1008 1009 output "*** Done writing result page"; 1010} 1011 1012# Create a list of all the servers, and create the corresponding table 1013# and subpages. We use the servers overview for this. This can result 1014# in strange effects when the same server (e.g. IP address) has 1015# several names, because several entries will appear. However, when 1016# using the IP address there are also a number of tricky situations, 1017# e.g. with virtual hosting. Given that likely the servers have 1018# different names for a reasons, I think it is better to have 1019# duplicate entries in some cases, instead of working off of the IP 1020# addresses. 1021 1022sub printAllServers { 1023 my ($finalPage) = @_; 1024 1025 my $server; 1026 print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n"; 1027 print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n"; 1028 1029 foreach $server (sort keys %main::servers) { 1030 print_server($server, $finalPage); 1031 } 1032 print OUT "</table>\n\n"; 1033} 1034 1035sub get_server_type { 1036 my($server) = @_; 1037 1038 my $result; 1039 1040 if ( ! 
defined($main::server_type{$server})) { 1041 if ($server eq 'localhost') { 1042 $result = 'Direct access through filesystem'; 1043 } else { 1044 my $request = new HTTP::Request('HEAD', "http://$server/"); 1045 my $response = $main::ua->simple_request($request); 1046 $result = $response->header('Server'); 1047 } 1048 $result = "Unknown server type" if ! defined $result or $result eq ""; 1049 output "=== Server $server is a $result"; 1050 $main::server_type{$server} = $result; 1051 } 1052 $main::server_type{$server}; 1053} 1054 1055sub add_checked { 1056 my($urlstr) = @_; 1057 my $item; 1058 my $result = 0; 1059 1060 if (is_internal($urlstr) and not $main::opt_enable_virtual) { 1061 # Substitute hostname with IP-address. This keeps us from checking 1062 # the same pages for each name of the server, wasting time & resources. 1063 # Only do this if we are not dealing with virtual servers. Also, we 1064 # only do this for internal servers, because it makes no sense for 1065 # external links. 1066 my $url = URI->new($urlstr); 1067 $url->host(ip_address($url->host)) if $url->can('host'); 1068 $urlstr = $url->as_string; 1069 } 1070 1071 if (defined $main::checked{$urlstr}) { 1072 $result = 1; 1073 $main::checked{$urlstr}++; 1074 } else { 1075 $main::checked{$urlstr} = 1; 1076 } 1077 1078 return $result; 1079} 1080 1081# Has this URL already been checked? 1082sub is_checked { 1083 my ($urlstr) = @_; 1084 1085 if (is_internal($urlstr) and not $main::opt_enable_virtual) { 1086 # Substitute hostname with IP-address. This keeps us from checking 1087 # the same pages for each name of the server, wasting time & resources. 1088 # Only do this if we are not dealing with virtual servers. Also, we 1089 # only do this for internal servers, because it makes no sense for 1090 # external links. 
1091 my $url = URI->new($urlstr); 1092 $url->host(ip_address($url->host)) if $url->can('host'); 1093 $urlstr = $url->as_string; 1094 } 1095 1096 return defined $main::checked{$urlstr}; 1097} 1098 1099sub add_error ($$$$) { 1100 my ($url, $urlparent, $code, $status) = @_; 1101 1102 # Check for the quick eliminations first 1103 return if $code =~ /$main::opt_dontwarn/o 1104 or defined $suppression{$code}{$url}; 1105 1106 # Check for matches on the regular expressions in the supression file 1107 if (defined $suppression{$code}) { 1108 foreach my $item ( %{$suppression{$code}} ) { 1109 if ($item =~ /^\/(.*)\/$/) { 1110 my $regexp = $1; 1111 if ($url =~ $regexp) { 1112 output "Supressing error $code for $url due to regular expression match on $regexp", 2; 1113 return; 1114 } 1115 } 1116 } 1117 } 1118 1119 $status = checkbot_status_message($code) if not defined $status; 1120 1121 output "$code $status", 2; 1122 1123 $url_error{$url}{'code'} = $code; 1124 $url_error{$url}{'status'} = $status; 1125 push @{$url_parent{$url}}, $urlparent; 1126 $stats{'problem'}++; 1127} 1128 1129# Parse document, and get the links 1130sub handle_doc { 1131 my ($response, $urlstr) = @_; 1132 1133 my $num_links = 0; 1134 my $new_links = 0; 1135 1136 # TODO: we are making an assumption here that the $reponse->base is 1137 # valid, which might not always be true! This needs to be fixed, but 1138 # first let's try to find out why this stuff is sometimes not 1139 # valid... Aha. a simple <base href="news:"> will do the trick. It is 1140 # not clear what the right fix for this is. 1141 1142 # We use the URL we used to retrieve this document as the URL to 1143 # attach the problem reports to, even though this may not be the 1144 # proper base url. 1145 my $baseurl = URI->new($urlstr); 1146 1147 # When we received the document we can add a notch to its server 1148 $main::servers{$baseurl->authority}++; 1149 1150 # Retrieve useful information from this document. 
1151 # TODO: using a regexp is NOT how this should be done, but it is 1152 # easy. The right way would be to write a HTML::Parser or to use 1153 # XPath on the document DOM provided that the document is easily 1154 # parsed as XML. Either method is a lot of overhead. 1155 if ($response->content =~ /title\>(.*?)\<\/title/si) { 1156 1157 # TODO: using a general hash that stores titles for all pages may 1158 # consume too much memory. It would be better to only store the 1159 # titles for requests that had problems. That requires passing them 1160 # down to the queue. Take the easy way out for now. 1161 $url_title{$baseurl} = $1; 1162 } 1163 1164 # Check if this document has a Robots META tag. If so, check if 1165 # Checkbot is allowed to FOLLOW the links on this page. Note that we 1166 # ignore the INDEX directive because Checkbot is not an indexing 1167 # robot. See http://www.robotstxt.org/wc/meta-user.html 1168 # TODO: one more reason (see title) to properly parse this document... 1169 if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) { 1170 output "Obeying robots meta tag $&, skipping document", 2; 1171 return; 1172 } 1173 1174 1175 # Parse the document just downloaded, using the base url as defined 1176 # in the response, otherwise we won't get the same behavior as 1177 # browsers and miss things like a BASE url in pages. 1178 my $p = HTML::LinkExtor->new(undef, $response->base); 1179 1180 # If charset information is missing then decoded_content doesn't 1181 # work. Fall back to content in this case, even though that may lead 1182 # to charset warnings. See bug 1665075 for reference. 1183 my $content = $response->decoded_content || $response->content; 1184 $p->parse($content); 1185 $p->eof; 1186 1187 # Deal with the links we found in this document 1188 my @links = $p->links(); 1189 foreach (@links) { 1190 my ($tag, %l) = @{$_}; 1191 foreach (keys %l) { 1192 # Get the canonical URL, so we don't need to worry about base, case, etc. 
1193 my $url = $l{$_}->canonical; 1194 1195 # Remove fragments, if any 1196 $url->fragment(undef); 1197 1198 # Determine in which tag this URL was found 1199 # Ignore <base> tags because they need not point to a valid URL 1200 # in order to work (e.g. when directory indexing is turned off). 1201 next if $tag eq 'base'; 1202 1203 # Skip some 'links' that are not required to link to an actual 1204 # live link but which LinkExtor returns as links anyway. 1205 next if $tag eq 'applet' and $_ eq 'code'; 1206 next if $tag eq 'object' and $_ eq 'classid'; 1207 1208 # Run filter on the URL if defined 1209 if (defined $main::opt_filter) { 1210 die "Filter supplied with --filter option contains errors!\n$@\n" 1211 unless defined eval '$url =~ s' . $main::opt_filter 1212 } 1213 1214 # Should we ignore this URL? 1215 if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) { 1216 output "--ignore: $url", 1; 1217 next; 1218 } 1219 1220 # Check whether URL has fully-qualified hostname 1221 if ($url->can('host') and $url->scheme ne 'news') { 1222 if (! defined $url->host) { 1223 add_error($url, $baseurl->as_string, '901', 1224 $checkbot_errors{'901'}); 1225 } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) { 1226 add_error($url, $baseurl->as_string, '902', 1227 $checkbot_errors{'902'}); 1228 } 1229 } 1230 1231 # Some servers do not process // correctly in requests for relative 1232 # URLs. We should flag them here. Note that // in a URL path is 1233 # actually valid per RFC 2396, and that they should not be removed 1234 # when processing relative URLs as per RFC 1808. See 1235 # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>. 1236 # Thanks to Randal Schwartz and Reinier Post for their explanations. 
1237 if ($url =~ /^http:\/\/.*\/\//) { 1238 add_error($url, $baseurl->as_string, '903', 1239 $checkbot_errors{'903'}); 1240 } 1241 1242 # We add all URLs found to the queue, unless we already checked 1243 # it earlier 1244 if (is_checked($url)) { 1245 1246 # If an error has already been logged for this URL we add the 1247 # current parent to the list of parents on which this URL 1248 # appears. 1249 if (defined $url_error{$url}) { 1250 push @{$url_parent{$url}}, $baseurl->as_string; 1251 $stats{'problem'}++; 1252 } 1253 1254 $stats{'link'}++; 1255 } else { 1256 add_to_queue($url, $baseurl); 1257 $stats{'todo'}++; 1258 $new_links++; 1259 } 1260 $num_links++; 1261 } 1262 } 1263 output "Got $num_links links ($new_links new) from document", 2; 1264} 1265 1266 1267sub add_to_queue { 1268 my ($url, $parent) = @_; 1269 1270 print QUEUE $url . '|' . $parent . "\n"; 1271} 1272 1273sub checkbot_status_message ($) { 1274 my ($code) = @_; 1275 1276 my $result = status_message($code) || $checkbot_errors{$code} 1277 || '(Undefined status)'; 1278} 1279 1280sub print_server ($$) { 1281 my($server, $final_page) = @_; 1282 1283 my $host = $server; 1284 $host =~ s/(.*):\d+/$1/; 1285 1286 output "Writing server $server (really " . ip_address($host) . ")", 1; 1287 1288 my $server_problem = count_problems($server); 1289 my $filename = "$main::server_prefix-$server.html"; 1290 $filename =~ s/:/-/o; 1291 1292 print OUT "<tr><td class='server'>"; 1293 print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0; 1294 print OUT "$server"; 1295 print OUT "</a>" if $server_problem > 0; 1296 print OUT "</td>"; 1297 print OUT "<td class='servertype'>" . get_server_type($server) . 
"</td>"; 1298 printf OUT "<td class='unique' align=\"right\">%d</td>", 1299 $main::servers{$server} + $server_problem; 1300 if ($server_problem) { 1301 printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>", 1302 $server_problem; 1303 } else { 1304 printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>", 1305 $server_problem; 1306 } 1307 1308 my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100; 1309 print OUT "<td class='ratio' align=\"right\">"; 1310 print OUT "<strong>" unless $ratio < 0.5; 1311 printf OUT "%4d%%", $ratio; 1312 print OUT "</strong>" unless $ratio < 0.5; 1313 print OUT "</td>"; 1314 print OUT "</tr>\n"; 1315 1316 # Create this server file 1317 open(SERVER, ">$filename") 1318 || die "Unable to open server file $filename for writing: $!"; 1319 print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n"; 1320 print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n"; 1321 print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n"; 1322 print SERVER "<head>\n"; 1323 if (!$final_page) { 1324 printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n", 1325 int($main::cp_int * 60 / 2 - 5); 1326 } 1327 print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style; 1328 print SERVER "<title>Checkbot: output for server $server</title></head>\n"; 1329 print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n"; 1330 print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>"; 1331 1332 printServerProblems($server, $final_page); 1333 1334 print SERVER "\n"; 1335 print SERVER signature(); 1336 1337 close SERVER; 1338} 1339 1340# Return a string containing Checkbot's signature for HTML pages 1341sub signature { 1342 return "<hr />\n<p class='signature'>Page created by <a 
href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n". 1343 "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>". 1344 "</body></html>"; 1345} 1346 1347# Loop through all possible problems, select relevant ones for this server 1348# and display them in a meaningful way. 1349sub printServerProblems ($$) { 1350 my ($server, $final_page) = @_; 1351 $server = quotemeta $server; 1352 1353 my $separator = "<hr />\n"; 1354 1355 my %thisServerList = (); 1356 1357 # First we find all the problems for this particular server 1358 foreach my $url (keys %url_parent) { 1359 foreach my $parent (@{$url_parent{$url}}) { 1360 next if $parent !~ $server; 1361 chomp $parent; 1362 $thisServerList{$url_error{$url}{'code'}}{$parent}{$url} 1363 = $url_error{$url}{'status'}; 1364 } 1365 } 1366 1367 # Do a run to find all error codes on this page, and include a table 1368 # of contents to the actual report 1369 foreach my $code (sort keys %thisServerList) { 1370 print SERVER ", <a href=\"#rc$code\">$code "; 1371 print SERVER checkbot_status_message($code); 1372 print SERVER "</a>"; 1373 } 1374 print SERVER ".</p>\n"; 1375 1376 1377 # Now run through this list and print the errors 1378 foreach my $code (sort keys %thisServerList) { 1379 my $codeOut = ''; 1380 1381 foreach my $parent (sort keys %{ $thisServerList{$code} }) { 1382 my $urlOut = ''; 1383 foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) { 1384 my $status = $thisServerList{$code}{$parent}{$url}; 1385 $urlOut .= "<li><a href=\"$url\">$url</a><br/>\n"; 1386 $urlOut .= "$status" 1387 if defined $status and $status ne checkbot_status_message($code); 1388 $urlOut .= "</li>\n"; 1389 } 1390 if ($urlOut ne '') { 1391 $codeOut .= "<dt><a href=\"$parent\">$parent</a>"; 1392 $codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent}; 1393 $codeOut 
.= "<dd><ul>\n$urlOut\n</ul>\n\n"; 1394 } 1395 } 1396 1397 if ($codeOut ne '') { 1398 print SERVER $separator if $separator; 1399 $separator = ''; 1400 print SERVER "<h4 id=\"rc$code\">$code "; 1401 print SERVER checkbot_status_message($code); 1402 print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n"; 1403 } 1404 } 1405} 1406 1407sub check_point { 1408 if ( ($main::cp_last + 60 * $main::cp_int < time()) 1409 || ($main::opt_debug && $main::opt_verbose)) { 1410 &create_page(0); 1411 $main::cp_last = time(); 1412 # Increase the intervall from one snapshot to the next by 25% 1413 # until we have reached the maximum. 1414 $main::cp_int *= 1.25 unless $main::opt_debug; 1415 $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval; 1416 } 1417} 1418 1419sub send_mail { 1420 my $msg = new Mail::Send; 1421 my $sub = 'Checkbot results for '; 1422 $sub .= join(', ', @starturls); 1423 $sub .= ': ' . $stats{'problem'} . ' errors'; 1424 1425 $msg->to($main::opt_mailto); 1426 $msg->subject($sub); 1427 1428 my $fh = $msg->open; 1429 1430 print $fh "Checkbot results for:\n " . join("\n ", @starturls) . "\n\n"; 1431 print $fh "User-supplied note: $main::opt_note\n\n" 1432 if defined $main::opt_note; 1433 1434 print $fh $stats{'link'}, " links were checked, and "; 1435 print $fh $stats{'problem'}, " problems were detected.\n"; 1436 1437 print $fh 'Run started on ' . localtime($start_time) . "\n"; 1438 print $fh 'Run duration ', duration(time() - $start_time), "\n" 1439 if $main::useduration; 1440 1441 1442 print $fh "\n-- \nCheckbot $VERSION\n"; 1443 print $fh "<URL:http://degraaff.org/checkbot/>\n"; 1444 1445 $fh->close; 1446} 1447 1448sub print_help { 1449 print <<"__EOT__"; 1450Checkbot $VERSION command line options: 1451 1452 --cookies Accept cookies from the server 1453 --debug Debugging mode: No pauses, stop after 25 links. 1454 --file file Use file as basis for output file names. 1455 --help Provide this message. 
1456 --mailto address Mail brief synopsis to address when done. 1457 --noproxy domains Do not proxy requests to given domains. 1458 --verbose Verbose mode: display many messages about progress. 1459 --url url Start URL 1460 --match match Check pages only if URL matches `match' 1461 If no match is given, the start URL is used as a match 1462 --exclude exclude Exclude pages if the URL matches 'exclude' 1463 --filter regexp Run regexp on each URL found 1464 --ignore ignore Ignore URLs matching 'ignore' 1465 --suppress file Use contents of 'file' to suppress errors in output 1466 --note note Include Note (e.g. URL to report) along with Mail message. 1467 --proxy URL URL of proxy server for HTTP and FTP requests. 1468 --internal-only Only check internal links, skip checking external links. 1469 --sleep seconds Sleep this many seconds between requests (default 0) 1470 --style url Reference the style sheet at this URL. 1471 --timeout seconds Timeout for http requests in seconds (default 120) 1472 --interval seconds Maximum time interval between updates (default 10800) 1473 --dontwarn codes Do not write warnings for these HTTP response codes 1474 --enable-virtual Use only virtual names, not IP numbers for servers 1475 --language Specify 2-letter language code for language negotiation 1476 1477Options --match, --exclude, and --ignore can take a perl regular expression 1478as their argument\n 1479Use 'perldoc checkbot' for more verbose documentation. 1480Checkbot WWW page : http://degraaff.org/checkbot/ 1481Mail bugs and problems: checkbot\@degraaff.org 1482__EOT__ 1483 1484 exit 0; 1485} 1486 1487sub ip_address { 1488 my($host) = @_; 1489 1490 return $main::ip_cache{$host} if defined $main::ip_cache{$host}; 1491 1492 my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host); 1493 if (defined $addrs[0]) { 1494 my($n1,$n2,$n3,$n4) = unpack ('C4',$addrs[0]); 1495 $main::ip_cache{$host} = "$n1.$n2.$n3.$n4"; 1496 } else { 1497 # Whee! No IP-address found for this host. 
Just keep whatever we 1498 # got for the host. If this really is some kind of error it will 1499 # be found later on. 1500 $main::ip_cache{$host} = $host; 1501 } 1502} 1503 1504sub count_problems { 1505 my ($server) = @_; 1506 $server = quotemeta $server; 1507 my $count = 0; 1508 1509 foreach my $url (sort keys %url_parent) { 1510 foreach my $parent (@{ $url_parent{$url} }) { 1511 $count++ if $parent =~ m/$server/; 1512 } 1513 } 1514 return $count; 1515} 1516 1517