1#!/usr/bin/perl -wT 2# 3# W3C Link Checker 4# by Hugo Haas <hugo@w3.org> 5# (c) 1999-2011 World Wide Web Consortium 6# based on Renaud Bruyeron's checklink.pl 7# 8# This program is licensed under the W3C(r) Software License: 9# http://www.w3.org/Consortium/Legal/copyright-software 10# 11# The documentation is at: 12# http://validator.w3.org/docs/checklink.html 13# 14# See the Mercurial interface at: 15# http://dvcs.w3.org/hg/link-checker/ 16# 17# An online version is available at: 18# http://validator.w3.org/checklink 19# 20# Comments and suggestions should be sent to the www-validator mailing list: 21# www-validator@w3.org (with 'checklink' in the subject) 22# http://lists.w3.org/Archives/Public/www-validator/ (archives) 23 24use strict; 25use 5.008; 26 27# Get rid of potentially unsafe and unneeded environment variables. 28delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)}); 29$ENV{PATH} = undef; 30 31# ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib, 32# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html 33use Config qw(%Config); 34use lib map { /(.*)/ } 35 defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) : 36 defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) : 37 (); 38 39# ----------------------------------------------------------------------------- 40 41package W3C::UserAgent; 42 43use LWP::RobotUA 1.19 qw(); 44use LWP::UserAgent qw(); 45use Net::HTTP::Methods 5.833 qw(); # >= 5.833 for 4kB cookies (#6678) 46 47# if 0, ignore robots exclusion (useful for testing) 48use constant USE_ROBOT_UA => 1; 49 50if (USE_ROBOT_UA) { 51 @W3C::UserAgent::ISA = qw(LWP::RobotUA); 52} 53else { 54 @W3C::UserAgent::ISA = qw(LWP::UserAgent); 55} 56 57sub new 58{ 59 my $proto = shift; 60 my $class = ref($proto) || $proto; 61 my ($name, $from, $rules) = @_; 62 63 # For security/privacy reasons, if $from was not given, do not send it. 64 # Cheat by defining something for the constructor, and resetting it later. 65 my $from_ok = $from; 66 $from ||= 'www-validator@w3.org'; 67 68 my $self; 69 if (USE_ROBOT_UA) { 70 $self = $class->SUPER::new($name, $from, $rules); 71 } 72 else { 73 my %cnf; 74 @cnf{qw(agent from)} = ($name, $from); 75 $self = LWP::UserAgent->new(%cnf); 76 $self = bless $self, $class; 77 } 78 79 $self->from(undef) unless $from_ok; 80 81 $self->env_proxy(); 82 83 $self->allow_private_ips(1); 84 85 $self->protocols_forbidden([qw(mailto javascript)]); 86 87 return $self; 88} 89 90sub allow_private_ips 91{ 92 my $self = shift; 93 if (@_) { 94 $self->{Checklink_allow_private_ips} = shift; 95 if (!$self->{Checklink_allow_private_ips}) { 96 97 # Pull in dependencies 98 require Net::IP; 99 require Socket; 100 require Net::hostent; 101 } 102 } 103 return $self->{Checklink_allow_private_ips}; 104} 105 106sub redirect_progress_callback 107{ 108 my $self = shift; 109 $self->{Checklink_redirect_callback} = shift if @_; 110 return $self->{Checklink_redirect_callback}; 111} 112 113sub simple_request 114{ 115 my $self = shift; 116 117 my $response = $self->ip_disallowed($_[0]->uri()); 118 119 # RFC 2616, section 15.1.3 120 $_[0]->remove_header("Referer") 121 if ($_[0]->referer() && 122 (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure())); 123 124 $response ||= do { 125 local $SIG{__WARN__} = 126 sub { # Suppress some warnings, rt.cpan.org #18902 127 warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); 128 }; 129 130 # @@@ Why not just $self->SUPER::simple_request? 131 $self->W3C::UserAgent::SUPER::simple_request(@_); 132 }; 133 134 if (!defined($self->{FirstResponse})) { 135 $self->{FirstResponse} = $response->code(); 136 $self->{FirstMessage} = $response->message() || '(no message)'; 137 } 138 139 return $response; 140} 141 142sub redirect_ok 143{ 144 my ($self, $request, $response) = @_; 145 146 if (my $callback = $self->redirect_progress_callback()) { 147 148 # @@@ TODO: when an LWP internal robots.txt request gets redirected, 149 # this will a bit confusingly fire for it too. Would need a robust 150 # way to determine whether the request is such a LWP "internal 151 # robots.txt" one. 152 &$callback($request->method(), $request->uri()); 153 } 154 155 return 0 unless $self->SUPER::redirect_ok($request, $response); 156 157 if (my $res = $self->ip_disallowed($request->uri())) { 158 $response->previous($response->clone()); 159 $response->request($request); 160 $response->code($res->code()); 161 $response->message($res->message()); 162 return 0; 163 } 164 165 return 1; 166} 167 168# 169# Checks whether we're allowed to retrieve the document based on its IP 170# address. Takes an URI object and returns a HTTP::Response containing the 171# appropriate status and error message if the IP was disallowed, 0 172# otherwise. URIs without hostname or IP address are always allowed, 173# including schemes where those make no sense (eg. data:, often javascript:). 174# 175sub ip_disallowed 176{ 177 my ($self, $uri) = @_; 178 return 0 if $self->allow_private_ips(); # Short-circuit 179 180 my $hostname = undef; 181 eval { $hostname = $uri->host() }; # Not all URIs implement host()... 182 return 0 unless $hostname; 183 184 my $addr = my $iptype = my $resp = undef; 185 if (my $host = Net::hostent::gethostbyname($hostname)) { 186 $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); 187 if ($addr && (my $ip = Net::IP->new($addr))) { 188 $iptype = $ip->iptype(); 189 } 190 } 191 if ($iptype && $iptype ne 'PUBLIC') { 192 $resp = HTTP::Response->new(403, 193 'Checking non-public IP address disallowed by link checker configuration' 194 ); 195 $resp->header('Client-Warning', 'Internal response'); 196 } 197 return $resp; 198} 199 200# ----------------------------------------------------------------------------- 201 202package W3C::LinkChecker; 203 204use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION 205 $DocType $Head $Accept $ContentTypes %Cfg $CssUrl); 206 207use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes 208use CSS::DOM::Constants qw(:rule); 209use CSS::DOM::Style qw(); 210use CSS::DOM::Util qw(); 211use Encode qw(); 212use HTML::Entities qw(); 213use HTML::Parser 3.40 qw(); # >= 3.40 for utf8_mode() 214use HTTP::Headers::Util qw(); 215use HTTP::Message 5.827 qw(); # >= 5.827 for content_charset() 216use HTTP::Request 5.814 qw(); # >= 5.814 for accept_decodable() 217use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() 218use Time::HiRes qw(); 219use URI 1.53 qw(); # >= 1.53 for secure() 220use URI::Escape qw(); 221use URI::Heuristic qw(); 222 223# @@@ Needs also W3C::UserAgent but can't use() it here. 224 225use constant RC_ROBOTS_TXT => -1; 226use constant RC_DNS_ERROR => -2; 227use constant RC_IP_DISALLOWED => -3; 228use constant RC_PROTOCOL_DISALLOWED => -4; 229 230use constant LINE_UNKNOWN => -1; 231 232use constant MP2 => 233 (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); 234 235# Tag=>attribute mapping of things we treat as links. 236# Note: meta/@http-equiv gets special treatment, see start() for details. 237use constant LINK_ATTRS => { 238 a => ['href'], 239 240 # base/@href intentionally not checked 241 # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi 242 area => ['href'], 243 audio => ['src'], 244 blockquote => ['cite'], 245 body => ['background'], 246 command => ['icon'], 247 248 # button/@formaction not checked (side effects) 249 del => ['cite'], 250 251 # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary 252 embed => ['href', 'pluginspage', 'pluginurl', 'src'], 253 254 # form/@action not checked (side effects) 255 frame => ['longdesc', 'src'], 256 html => ['manifest'], 257 iframe => ['longdesc', 'src'], 258 img => ['longdesc', 'src'], 259 260 # input/@action, input/@formaction not checked (side effects) 261 input => ['src'], 262 ins => ['cite'], 263 link => ['href'], 264 object => ['data'], 265 q => ['cite'], 266 script => ['src'], 267 source => ['src'], 268 track => ['src'], 269 video => ['src', 'poster'], 270}; 271 272# Tag=>[separator, attributes] mapping of things we treat as lists of links. 273use constant LINK_LIST_ATTRS => { 274 a => [qr/\s+/, ['ping']], 275 applet => [qr/[\s,]+/, ['archive']], 276 area => [qr/\s+/, ['ping']], 277 head => [qr/\s+/, ['profile']], 278 object => [qr/\s+/, ['archive']], 279}; 280 281# TBD/TODO: 282# - applet/@code? 283# - bgsound/@src? 284# - object/@classid? 285# - isindex/@action? 286# - layer/@background,@src? 287# - ilayer/@background? 288# - table,tr,td,th/@background? 289# - xmp/@href? 290 291@W3C::LinkChecker::ISA = qw(HTML::Parser); 292 293BEGIN { 294 295 # Version info 296 $PACKAGE = 'W3C Link Checker'; 297 $PROGRAM = 'W3C-checklink'; 298 $VERSION = '4.81'; 299 $REVISION = sprintf('version %s (c) 1999-2011 W3C', $VERSION); 300 $AGENT = sprintf( 301 '%s/%s %s', 302 $PROGRAM, $VERSION, 303 ( W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() : 304 LWP::UserAgent->_agent() 305 ) 306 ); 307 308 # Pull in mod_perl modules if applicable. 309 eval { 310 local $SIG{__DIE__} = undef; 311 require Apache2::RequestUtil; 312 } if MP2(); 313 314 my @content_types = qw( 315 text/html 316 application/xhtml+xml;q=0.9 317 application/vnd.wap.xhtml+xml;q=0.6 318 ); 319 $Accept = join(', ', @content_types, '*/*;q=0.5'); 320 push(@content_types, 'text/css', 'text/html-sandboxed'); 321 my $re = join('|', map { s/;.*//; quotemeta } @content_types); 322 $ContentTypes = qr{\b(?:$re)\b}io; 323 324 # Regexp for matching URL values in CSS. 325 $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/; 326 327 # 328 # Read configuration. If the W3C_CHECKLINK_CFG environment variable has 329 # been set or the default contains a non-empty file, read it. Otherwise, 330 # skip silently. 331 # 332 my $defaultconfig = '/etc/w3c/checklink.conf'; 333 if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { 334 335 require Config::General; 336 Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy 337 338 my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; 339 eval { 340 my %config_opts = ( 341 -ConfigFile => $conffile, 342 -SplitPolicy => 'equalsign', 343 -AllowMultiOptions => 'no', 344 ); 345 %Cfg = Config::General->new(%config_opts)->getall(); 346 }; 347 if ($@) { 348 die <<"EOF"; 349Failed to read configuration from '$conffile': 350$@ 351EOF 352 } 353 } 354 $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s'; 355 $Cfg{CSS_Validator_URI} ||= 356 'http://jigsaw.w3.org/css-validator/validator?uri=%s'; 357 $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html'; 358 359 # Untaint config params that are used as the format argument to (s)printf(), 360 # Perl 5.10 does not want to see that in taint mode. 361 ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/); 362 ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/); 363 364 $DocType = 365 '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'; 366 my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI}); 367 my $js_url = URI->new_abs('linkchecker.js', $Cfg{Doc_URI}); 368 $Head = 369 sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url); 370<meta http-equiv="Content-Script-Type" content="text/javascript" /> 371<meta name="generator" content="%s" /> 372<link rel="stylesheet" type="text/css" href="%s" /> 373<script type="text/javascript" src="%s"></script> 374EOF 375 376 # Trusted environment variables that need laundering in taint mode. 377 for (qw(NNTPSERVER NEWSHOST)) { 378 ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; 379 } 380 381 # Use passive FTP by default, see Net::FTP(3). 382 $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); 383} 384 385# Autoflush 386$| = 1; 387 388# Different options specified by the user 389my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); 390my %Opts = ( 391 Command_Line => $cmdline, 392 Quiet => 0, 393 Summary_Only => 0, 394 Verbose => 0, 395 Progress => 0, 396 HTML => 0, 397 Timeout => 30, 398 Redirects => 1, 399 Dir_Redirects => 1, 400 Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, 401 Cookies => undef, 402 No_Referer => 0, 403 Hide_Same_Realm => 0, 404 Depth => 0, # < 0 means unlimited recursion. 405 Sleep_Time => 1, 406 Connection_Cache_Size => 2, 407 Max_Documents => 150, # For the online version. 408 User => undef, 409 Password => undef, 410 Base_Locations => [], 411 Exclude => undef, 412 Exclude_Docs => undef, 413 Suppress_Redirect => [], 414 Suppress_Redirect_Prefix => [], 415 Suppress_Redirect_Regexp => [], 416 Suppress_Temp_Redirects => 1, 417 Suppress_Broken => [], 418 Suppress_Fragment => [], 419 Masquerade => 0, 420 Masquerade_From => '', 421 Masquerade_To => '', 422 Trusted => $Cfg{Trusted}, 423 Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ? 424 $Cfg{Allow_Private_IPs} : 425 $cmdline, 426); 427undef $cmdline; 428 429# Global variables 430# What URI's did we process? (used for recursive mode) 431my %processed; 432 433# Result of the HTTP query 434my %results; 435 436# List of redirects 437my %redirects; 438 439# Count of the number of documents checked 440my $doc_count = 0; 441 442# Time stamp 443my $timestamp = &get_timestamp(); 444 445# Per-document header; undefined if already printed. See print_doc_header(). 446my $doc_header; 447 448&parse_arguments() if $Opts{Command_Line}; 449 450my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address 451 452$ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}}); 453if ($ua->can('delay')) { 454 $ua->delay($Opts{Sleep_Time} / 60); 455} 456$ua->timeout($Opts{Timeout}); 457 458# Set up cookie stash if requested 459if (defined($Opts{Cookies})) { 460 require HTTP::Cookies; 461 my $cookie_file = $Opts{Cookies}; 462 if ($cookie_file eq 'tmp') { 463 $cookie_file = undef; 464 } 465 elsif ($cookie_file =~ /^(.*)$/) { 466 $cookie_file = $1; # untaint 467 } 468 $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1)); 469} 470eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); }; 471if ($@) { 472 die <<"EOF"; 473Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and 474Net::hostent modules: 475$@ 476EOF 477} 478 479# Add configured forbidden protocols 480if ($Cfg{Forbidden_Protocols}) { 481 my $forbidden = $ua->protocols_forbidden(); 482 push(@$forbidden, split(/[,\s]+/, lc($Cfg{Forbidden_Protocols}))); 483 $ua->protocols_forbidden($forbidden); 484} 485 486if ($Opts{Command_Line}) { 487 488 require Text::Wrap; 489 Text::Wrap->import('wrap'); 490 491 require URI::file; 492 493 &usage(1) unless scalar(@ARGV); 494 495 $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output 496 497 &ask_password() if ($Opts{User} && !$Opts{Password}); 498 499 if (!$Opts{Summary_Only}) { 500 printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; 501 } 502 else { 503 $Opts{Verbose} = 0; 504 $Opts{Progress} = 0; 505 } 506 507 # Populate data for print_form() 508 my %params = ( 509 summary => $Opts{Summary_Only}, 510 hide_redirects => !$Opts{Redirects}, 511 hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all', 512 no_accept_language => !( 513 defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto' 514 ), 515 no_referer => $Opts{No_Referer}, 516 recursive => ($Opts{Depth} != 0), 517 depth => $Opts{Depth}, 518 ); 519 520 my $check_num = 1; 521 my @bases = @{$Opts{Base_Locations}}; 522 for my $uri (@ARGV) { 523 524 # Reset base locations so that previous URI's given on the command line 525 # won't affect the recursion scope for this URI (see check_uri()) 526 @{$Opts{Base_Locations}} = @bases; 527 528 # Transform the parameter into a URI 529 $uri = &urize($uri); 530 $params{uri} = $uri; 531 &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1); 532 $check_num++; 533 } 534 undef $check_num; 535 536 if ($Opts{HTML}) { 537 &html_footer(); 538 } 539 elsif ($doc_count > 0 && !$Opts{Summary_Only}) { 540 printf("\n%s\n", &global_stats()); 541 } 542 543} 544else { 545 546 require CGI; 547 require CGI::Carp; 548 CGI::Carp->import(qw(fatalsToBrowser)); 549 require CGI::Cookie; 550 551 # file: URIs are not allowed in CGI mode 552 my $forbidden = $ua->protocols_forbidden(); 553 push(@$forbidden, 'file'); 554 $ua->protocols_forbidden($forbidden); 555 556 my $query = CGI->new(); 557 558 for my $param ($query->param()) { 559 my @values = map { Encode::decode_utf8($_) } $query->param($param); 560 $query->param($param, @values); 561 } 562 563 # Set a few parameters in CGI mode 564 $Opts{Verbose} = 0; 565 $Opts{Progress} = 0; 566 $Opts{HTML} = 1; 567 $Opts{_Self_URI} = $query->url(-relative => 1); 568 569 # Backwards compatibility 570 my $uri = undef; 571 if ($uri = $query->param('url')) { 572 $query->param('uri', $uri) unless $query->param('uri'); 573 $query->delete('url'); 574 } 575 $uri = $query->param('uri'); 576 577 if (!$uri) { 578 &html_header('', undef); # Set cookie only from results page. 579 my %cookies = CGI::Cookie->fetch(); 580 &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1); 581 &html_footer(); 582 exit; 583 } 584 585 # Backwards compatibility 586 if ($query->param('hide_dir_redirects')) { 587 $query->param('hide_redirects', 'on'); 588 $query->param('hide_type', 'dir'); 589 $query->delete('hide_dir_redirects'); 590 } 591 592 $Opts{Summary_Only} = 1 if $query->param('summary'); 593 594 if ($query->param('hide_redirects')) { 595 $Opts{Dir_Redirects} = 0; 596 if (my $type = $query->param('hide_type')) { 597 $Opts{Redirects} = 0 if ($type ne 'dir'); 598 } 599 else { 600 $Opts{Redirects} = 0; 601 } 602 } 603 604 $Opts{Accept_Language} = undef if $query->param('no_accept_language'); 605 $Opts{No_Referer} = $query->param('no_referer'); 606 607 $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); 608 if (my $depth = $query->param('depth')) { 609 610 # @@@ Ignore invalid depth silently for now. 611 $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); 612 } 613 614 # Save, clear or leave cookie as is. 615 my $cookie = undef; 616 if (my $action = $query->param('cookie')) { 617 if ($action eq 'clear') { 618 619 # Clear the cookie. 620 $cookie = CGI::Cookie->new(-name => $PROGRAM); 621 $cookie->value({clear => 1}); 622 $cookie->expires('-1M'); 623 } 624 elsif ($action eq 'set') { 625 626 # Set the options. 627 $cookie = CGI::Cookie->new(-name => $PROGRAM); 628 my %options = $query->Vars(); 629 delete($options{$_}) 630 for qw(url uri check cookie); # Non-persistent. 631 $cookie->value(\%options); 632 } 633 } 634 if (!$cookie) { 635 my %cookies = CGI::Cookie->fetch(); 636 $cookie = $cookies{$PROGRAM}; 637 } 638 639 # Always refresh cookie expiration time. 640 $cookie->expires('+1M') if ($cookie && !$cookie->expires()); 641 642 # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. 643 # If we're under mod_perl, there is a way around it... 644 eval { 645 local $SIG{__DIE__} = undef; 646 my $auth = 647 Apache2::RequestUtil->request()->headers_in()->{Authorization}; 648 $ENV{HTTP_AUTHORIZATION} = $auth if $auth; 649 } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); 650 651 $uri =~ s/^\s+//g; 652 if ($uri =~ /:/) { 653 $uri = URI->new($uri); 654 } 655 else { 656 if ($uri =~ m|^//|) { 657 $uri = URI->new("http:$uri"); 658 } 659 else { 660 local $ENV{URL_GUESS_PATTERN} = ''; 661 my $guess = URI::Heuristic::uf_uri($uri); 662 if ($guess->scheme() && $ua->is_protocol_supported($guess)) { 663 $uri = $guess; 664 } 665 else { 666 $uri = URI->new("http://$uri"); 667 } 668 } 669 } 670 $uri = $uri->canonical(); 671 $query->param("uri", $uri); 672 673 &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie); 674 undef $query; # Not needed any more. 675 &html_footer(); 676} 677 678############################################################################### 679 680################################ 681# Command line and usage stuff # 682################################ 683 684sub parse_arguments () 685{ 686 require Encode::Locale; 687 Encode::Locale::decode_argv(); 688 689 require Getopt::Long; 690 Getopt::Long->require_version(2.17); 691 Getopt::Long->import('GetOptions'); 692 Getopt::Long::Configure('bundling', 'no_ignore_case'); 693 my $masq = ''; 694 my @locs = (); 695 696 GetOptions( 697 'help|h|?' => sub { usage(0) }, 698 'q|quiet' => sub { 699 $Opts{Quiet} = 1; 700 $Opts{Summary_Only} = 1; 701 }, 702 's|summary' => \$Opts{Summary_Only}, 703 'b|broken' => sub { 704 $Opts{Redirects} = 0; 705 $Opts{Dir_Redirects} = 0; 706 }, 707 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, 708 'v|verbose' => \$Opts{Verbose}, 709 'i|indicator' => \$Opts{Progress}, 710 'H|html' => \$Opts{HTML}, 711 'r|recursive' => sub { 712 $Opts{Depth} = -1 713 if $Opts{Depth} == 0; 714 }, 715 'l|location=s' => \@locs, 716 'X|exclude=s' => \$Opts{Exclude}, 717 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, 718 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, 719 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, 720 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects}, 721 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, 722 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, 723 'u|user=s' => \$Opts{User}, 724 'p|password=s' => \$Opts{Password}, 725 't|timeout=i' => \$Opts{Timeout}, 726 'C|connection-cache=i' => \$Opts{Connection_Cache_Size}, 727 'S|sleep=i' => \$Opts{Sleep_Time}, 728 'L|languages=s' => \$Opts{Accept_Language}, 729 'c|cookies=s' => \$Opts{Cookies}, 730 'R|no-referer' => \$Opts{No_Referer}, 731 'D|depth=i' => sub { 732 $Opts{Depth} = $_[1] 733 unless $_[1] == 0; 734 }, 735 'd|domain=s' => \$Opts{Trusted}, 736 'masquerade=s' => \$masq, 737 'hide-same-realm' => \$Opts{Hide_Same_Realm}, 738 'V|version' => \&version, 739 ) || 740 usage(1); 741 742 if ($masq) { 743 $Opts{Masquerade} = 1; 744 my @masq = split(/\s+/, $masq); 745 if (scalar(@masq) != 2 || 746 !defined($masq[0]) || 747 $masq[0] !~ /\S/ || 748 !defined($masq[1]) || 749 $masq[1] !~ /\S/) 750 { 751 usage(1, 752 "Error: --masquerade takes two whitespace separated URIs."); 753 } 754 else { 755 require URI::file; 756 $Opts{Masquerade_From} = $masq[0]; 757 my $u = URI->new($masq[1]); 758 $Opts{Masquerade_To} = 759 $u->scheme() ? $u : URI::file->new_abs($masq[1]); 760 } 761 } 762 763 if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { 764 $Opts{Accept_Language} = &guess_language(); 765 } 766 767 if (($Opts{Sleep_Time} || 0) < 1) { 768 warn( 769 "*** Warning: minimum allowed sleep time is 1 second, resetting.\n" 770 ); 771 $Opts{Sleep_Time} = 1; 772 } 773 774 push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); 775 776 $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); 777 778 # Precompile/error-check regular expressions. 779 if (defined($Opts{Exclude})) { 780 eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; 781 &usage(1, "Error in exclude regexp: $@") if $@; 782 } 783 for my $i (0 .. $#{$Opts{Exclude_Docs}}) { 784 eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; 785 &usage(1, "Error in exclude-docs regexp: $@") if $@; 786 } 787 if (defined($Opts{Trusted})) { 788 eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; 789 &usage(1, "Error in trusted domains regexp: $@") if $@; 790 } 791 792 # Sanity-check error-suppression arguments 793 for my $i (0 .. $#{$Opts{Suppress_Redirect}}) { 794 ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/; 795 my $sr_arg = ${$Opts{Suppress_Redirect}}[$i]; 796 if ($sr_arg !~ /.->./) { 797 &usage(1, 798 "Bad suppress-redirect argument, should contain \"->\": $sr_arg" 799 ); 800 } 801 } 802 for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) { 803 my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i]; 804 $srp_arg =~ s/ /->/; 805 if ($srp_arg !~ /^(.*)->(.*)$/) { 806 &usage(1, 807 "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg" 808 ); 809 } 810 811 # Turn prefixes into a regexp. 812 ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism; 813 } 814 for my $i (0 .. $#{$Opts{Suppress_Broken}}) { 815 ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/; 816 my $sb_arg = ${$Opts{Suppress_Broken}}[$i]; 817 if ($sb_arg !~ /^(-1|[0-9]+):./) { 818 &usage(1, 819 "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg" 820 ); 821 } 822 } 823 for my $sf_arg (@{$Opts{Suppress_Fragment}}) { 824 if ($sf_arg !~ /.#./) { 825 &usage(1, 826 "Bad suppress-fragment argument, should contain \"#\": $sf_arg" 827 ); 828 } 829 } 830 831 return; 832} 833 834sub version () 835{ 836 print "$PACKAGE $REVISION\n"; 837 exit 0; 838} 839 840sub usage () 841{ 842 my ($exitval, $msg) = @_; 843 $exitval = 0 unless defined($exitval); 844 $msg ||= ''; 845 $msg =~ s/[\r\n]*$/\n\n/ if $msg; 846 847 die($msg) unless $Opts{Command_Line}; 848 849 my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; 850 851 select(STDERR) if $exitval; 852 print "$msg$PACKAGE $REVISION 853 854Usage: checklink <options> <uris> 855Options: 856 -s, --summary Result summary only. 857 -b, --broken Show only the broken links, not the redirects. 858 -e, --directory Hide directory redirects, for example 859 http://www.w3.org/TR -> http://www.w3.org/TR/ 860 -r, --recursive Check the documents linked from the first one. 861 -D, --depth N Check the documents linked from the first one to 862 depth N (implies --recursive). 863 -l, --location URI Scope of the documents checked in recursive mode 864 (implies --recursive). Can be specified multiple 865 times. If not specified, the default eg. for 866 http://www.w3.org/TR/html4/Overview.html 867 would be http://www.w3.org/TR/html4/ 868 -X, --exclude REGEXP Do not check links whose full, canonical URIs 869 match REGEXP; also limits recursion the same way 870 as --exclude-docs with the same regexp would. 871 --exclude-docs REGEXP In recursive mode, do not check links in documents 872 whose full, canonical URIs match REGEXP. This 873 option may be specified multiple times. 874 --suppress-redirect URI->URI Do not report a redirect from the first to the 875 second URI. This option may be specified multiple 876 times. 877 --suppress-redirect-prefix URI->URI Do not report a redirect from a child of 878 the first URI to the same child of the second URI. 879 This option may be specified multiple times. 880 --suppress-temp-redirects Suppress warnings about temporary redirects. 881 --suppress-broken CODE:URI Do not report a broken link with the given CODE. 882 CODE is HTTP response, or -1 for robots exclusion. 883 This option may be specified multiple times. 884 --suppress-fragment URI Do not report the given broken fragment URI. 885 A fragment URI contains \"#\". This option may be 886 specified multiple times. 887 -L, --languages LANGS Accept-Language header to send. The special value 888 'auto' causes autodetection from the environment. 889 -c, --cookies FILE Use cookies, load/save them in FILE. The special 890 value 'tmp' causes non-persistent use of cookies. 891 -R, --no-referer Do not send the Referer HTTP header. 892 -q, --quiet No output if no errors are found (implies -s). 893 -v, --verbose Verbose mode. 894 -i, --indicator Show percentage of lines processed while parsing. 895 -u, --user USERNAME Specify a username for authentication. 896 -p, --password PASSWORD Specify a password. 897 --hide-same-realm Hide 401's that are in the same realm as the 898 document checked. 899 -S, --sleep SECS Sleep SECS seconds between requests to each server 900 (default and minimum: 1 second). 901 -t, --timeout SECS Timeout for requests in seconds (default: 30). 902 -d, --domain DOMAIN Regular expression describing the domain to which 903 authentication information will be sent 904 (default: $trust). 905 --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2. See the 906 manual page for more information. 907 -H, --html HTML output. 908 -?, -h, --help Show this message and exit. 909 -V, --version Output version information and exit. 910 911See \"perldoc LWP\" for information about proxy server support, 912\"perldoc Net::FTP\" for information about various environment variables 913affecting FTP connections and \"perldoc Net::NNTP\" for setting a default 914NNTP server for news: URIs. 915 916The W3C_CHECKLINK_CFG environment variable can be used to set the 917configuration file to use. See details in the full manual page, it can 918be displayed with: perldoc checklink 919 920More documentation at: $Cfg{Doc_URI} 921Please send bug reports and comments to the www-validator mailing list: 922 www-validator\@w3.org (with 'checklink' in the subject) 923 Archives are at: http://lists.w3.org/Archives/Public/www-validator/ 924"; 925 exit $exitval; 926} 927 928sub ask_password () 929{ 930 eval { 931 local $SIG{__DIE__} = undef; 932 require Term::ReadKey; 933 Term::ReadKey->require_version(2.00); 934 Term::ReadKey->import(qw(ReadMode)); 935 }; 936 if ($@) { 937 warn('Warning: Term::ReadKey 2.00 or newer not available, ' . 938 "password input disabled.\n"); 939 return; 940 } 941 printf(STDERR 'Enter the password for user %s: ', $Opts{User}); 942 ReadMode('noecho', *STDIN); 943 chomp($Opts{Password} = <STDIN>); 944 ReadMode('restore', *STDIN); 945 print(STDERR "ok.\n"); 946 return; 947} 948 949############################################################################### 950 951########################################################################### 952# Guess an Accept-Language header based on the $LANG environment variable # 953########################################################################### 954 955sub guess_language () 956{ 957 my $lang = $ENV{LANG} or return; 958 959 $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... 960 961 return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); 962 963 my $res = undef; 964 eval { 965 require Locale::Language; 966 if (my $tmp = Locale::Language::language2code($lang)) { 967 $lang = $tmp; 968 } 969 if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { 970 if (Locale::Language::code2language($l)) { 971 $res = $l; 972 if ($c) { 973 require Locale::Country; 974 $res .= "-$c" if Locale::Country::code2country($c); 975 } 976 } 977 } 978 }; 979 return $res; 980} 981 982############################ 983# Transform foo into a URI # 984############################ 985 986sub urize ($) 987{ 988 my $arg = shift; 989 my $uarg = URI::Escape::uri_unescape($arg); 990 my $uri; 991 if (-d $uarg) { 992 993 # look for an "index" file in dir, return it if found 994 require File::Spec; 995 for my $index (map { File::Spec->catfile($uarg, $_) } 996 qw(index.html index.xhtml index.htm index.xhtm)) 997 { 998 if (-e $index) { 999 $uri = URI::file->new_abs($index); 1000 last; 1001 } 1002 } 1003 1004 # return dir itself if an index file was not found 1005 $uri ||= URI::file->new_abs($uarg); 1006 } 1007 elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) { 1008 $uri = URI::file->new_abs($uarg); 1009 } 1010 else { 1011 my $newuri = URI->new($arg); 1012 if ($newuri->scheme()) { 1013 $uri = $newuri; 1014 } 1015 else { 1016 local $ENV{URL_GUESS_PATTERN} = ''; 1017 $uri = URI::Heuristic::uf_uri($arg); 1018 $uri = URI::file->new_abs($uri) unless $uri->scheme(); 1019 } 1020 } 1021 return $uri->canonical(); 1022} 1023 1024######################################## 1025# Check for broken links in a resource # 1026######################################## 1027 1028sub check_uri (\%\$$$$;\$$) 1029{ 1030 my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_; 1031 $is_start ||= ($check_num == 1); 1032 1033 my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); 1034 1035 # Get and parse the document 1036 my $response = &get_document( 1037 'GET', $uri, $doc_count, \%redirects, $referer, 1038 $cookie, $params, $check_num, $is_start 1039 ); 1040 1041 # Can we check the resource? If not, we exit here... 1042 return if defined($response->{Stop}); 1043 1044 if ($Opts{HTML}) { 1045 &html_header($uri, $cookie) if ($check_num == 1); 1046 &print_form($params, $cookie, $check_num) if $is_start; 1047 } 1048 1049 if ($is_start) { # Starting point of a new check, eg. from the command line 1050 # Use the first URI as the recursion base unless specified otherwise. 1051 push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) 1052 unless @{$Opts{Base_Locations}}; 1053 } 1054 else { 1055 1056 # Before fetching the document, we don't know if we'll be within the 1057 # recursion scope or not (think redirects). 1058 if (!&in_recursion_scope($response->{absolute_uri})) { 1059 hprintf("Not in recursion scope: %s\n") 1060 if ($Opts{Verbose}); 1061 $response->content(""); 1062 return; 1063 } 1064 } 1065 1066 # Define the document header, and perhaps print it. 1067 # (It might still be defined if the previous document had no errors; 1068 # just redefine it in that case.) 1069 1070 if ($check_num != 1) { 1071 if ($Opts{HTML}) { 1072 $doc_header = "\n<hr />\n"; 1073 } 1074 else { 1075 $doc_header = "\n" . ('-' x 40) . "\n"; 1076 } 1077 } 1078 1079 if ($Opts{HTML}) { 1080 $doc_header .= 1081 ("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) . 1082 "\n</h2>\n\n"); 1083 } 1084 else { 1085 $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n"; 1086 } 1087 1088 if (!$Opts{Quiet}) { 1089 print_doc_header(); 1090 } 1091 1092 # We are checking a new document 1093 $doc_count++; 1094 1095 my $result_anchor = 'results' . $doc_count; 1096 1097 if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) { 1098 my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; 1099 my $acclang = $Opts{Accept_Language} || '(not sent)'; 1100 my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; 1101 my $cookies = 'not used'; 1102 if (defined($Opts{Cookies})) { 1103 $cookies = 'used, '; 1104 if ($Opts{Cookies} eq 'tmp') { 1105 $cookies .= 'non-persistent'; 1106 } 1107 else { 1108 $cookies .= "file $Opts{Cookies}"; 1109 } 1110 } 1111 printf( 1112 <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s); 1113 1114Settings used: 1115- Accept: %s 1116- Accept-Language: %s 1117- Referer: %s 1118- Cookies: %s 1119- Sleeping %d second%s between requests to each server 1120EOF 1121 printf("- Excluding links matching %s\n", $Opts{Exclude}) 1122 if defined($Opts{Exclude}); 1123 printf("- Excluding links in documents whose URIs match %s\n", 1124 join(', ', @{$Opts{Exclude_Docs}})) 1125 if @{$Opts{Exclude_Docs}}; 1126 } 1127 1128 if ($Opts{HTML}) { 1129 if (!$Opts{Summary_Only}) { 1130 my $accept = &encode($Accept); 1131 my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); 1132 my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; 1133 my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; 1134 printf( 1135 <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); 1136<div class="settings"> 1137Settings used: 1138 <ul> 1139 <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li> 1140 <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li> 1141 <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li> 1142 <li>Sleeping %d second%s between requests to each server</li> 1143 </ul> 1144</div> 1145EOF 1146 printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n", 1147 $result_anchor); 1148 my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri}, 1149 "^A-Za-z0-9."); 1150 print "<p>For reliable link checking results, check "; 1151 1152 if (!$response->{IsCss}) { 1153 printf("<a href=\"%s\">HTML validity</a> and ", 1154 &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri))); 1155 } 1156 printf( 1157 "<a href=\"%s\">CSS validity</a> first.</p> 1158<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n", 1159 &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), 1160 &encode($Opts{_Self_URI}) 1161 ); 1162 1163 printf(<<'EOF', $result_anchor); 1164<div class="progress" id="progress%s"> 1165<h3>Status: <span></span></h3> 1166<div class="progressbar"><div></div></div> 1167<pre> 1168EOF 1169 } 1170 } 1171 1172 if ($Opts{Summary_Only} && !$Opts{Quiet}) { 1173 print '<p>' if $Opts{HTML}; 1174 print 'This may take some time'; 1175 print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>" 1176 if $Opts{HTML}; 1177 print " if the document has many links to check.\n" unless $Opts{HTML}; 1178 } 1179 1180 # Record that we have processed this resource 1181 $processed{$response->{absolute_uri}} = 1; 1182 1183 # Parse the document 1184 my $p = 1185 &parse_document($uri, $response->base(), $response, 1, ($depth != 0)); 1186 my $base = URI->new($p->{base}); 1187 1188 # Check anchors 1189 ############### 1190 1191 print "Checking anchors...\n" unless $Opts{Summary_Only}; 1192 1193 my %errors; 1194 while (my ($anchor, $lines) = each(%{$p->{Anchors}})) { 1195 if (!length($anchor)) { 1196 1197 # Empty IDREF's are not allowed 1198 $errors{$anchor} = 1; 1199 } 1200 else { 1201 my $times = 0; 1202 $times += $_ for values(%$lines); 1203 1204 # They should appear only once 1205 $errors{$anchor} = 1 if ($times > 1); 1206 } 1207 } 1208 print " done.\n" unless $Opts{Summary_Only}; 1209 1210 # Check links 1211 ############# 1212 1213 &hprintf("Recording all the links found: %d\n", 1214 scalar(keys %{$p->{Links}})) 1215 if ($Opts{Verbose}); 1216 my %links; 1217 my %hostlinks; 1218 1219 # Record all the links found 1220 while (my ($link, $lines) = each(%{$p->{Links}})) { 1221 my $link_uri = URI->new($link); 1222 my $abs_link_uri = URI->new_abs($link_uri, $base); 1223 1224 if ($Opts{Masquerade}) { 1225 if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) { 1226 print_doc_header(); 1227 printf("processing %s in base %s\n", 1228 $abs_link_uri, $Opts{Masquerade_To}); 1229 my $nlink = $abs_link_uri; 1230 $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|; 1231 $abs_link_uri = URI->new($nlink); 1232 } 1233 } 1234 1235 my $canon_uri = URI->new($abs_link_uri->canonical()); 1236 my $fragment = $canon_uri->fragment(undef); 1237 if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) { 1238 if (!exists($links{$canon_uri})) { 1239 my $hostport; 1240 $hostport = $canon_uri->host_port() 1241 if $canon_uri->can('host_port'); 1242 $hostport = '' unless defined $hostport; 1243 push(@{$hostlinks{$hostport}}, $canon_uri); 1244 } 1245 for my $line_num (keys(%$lines)) { 1246 if (!defined($fragment) || !length($fragment)) { 1247 1248 # Document without fragment 1249 $links{$canon_uri}{location}{$line_num} = 1; 1250 } 1251 else { 1252 1253 # Resource with a fragment 1254 $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1; 1255 } 1256 } 1257 } 1258 } 1259 1260 my @order = &distribute_links(\%hostlinks); 1261 undef %hostlinks; 1262 1263 # Build the list of broken URI's 1264 1265 my $nlinks = scalar(@order); 1266 1267 &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) 1268 if ($Opts{Verbose}); 1269 1270 my %broken; 1271 my $link_num = 0; 1272 for my $u (@order) { 1273 my $ulinks = $links{$u}; 1274 1275 if ($Opts{Summary_Only}) { 1276 1277 # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 1278 print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); 1279 } 1280 else { 1281 &hprintf("\nChecking link %s\n", $u); 1282 my $progress = ($link_num / $nlinks) * 100; 1283 printf( 1284 '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>', 1285 $result_anchor, &encode($u), $progress) 1286 if (!$Opts{Command_Line} && 1287 $Opts{HTML} && 1288 !$Opts{Summary_Only}); 1289 } 1290 $link_num++; 1291 1292 # Check that a link is valid 1293 &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)), 1294 \%links, \%redirects); 1295 &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) 1296 if ($Opts{Verbose}); 1297 if ($results{$u}{location}{success}) { 1298 1299 # Even though it was not broken, we might want to display it 1300 # on the results page (e.g. because it required authentication) 1301 $broken{$u}{location} = 1 1302 if ($results{$u}{location}{display} >= 400); 1303 1304 # List the broken fragments 1305 while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) { 1306 1307 my $fragment_ok = $results{$u}{fragments}{$fragment}; 1308 1309 if ($Opts{Verbose}) { 1310 my @line_nums = sort { $a <=> $b } keys(%$lines); 1311 &hprintf( 1312 "\t\t%s %s - Line%s: %s\n", 1313 $fragment, 1314 $fragment_ok ? 'OK' : 'Not found', 1315 (scalar(@line_nums) > 1) ? 's' : '', 1316 join(', ', @line_nums) 1317 ); 1318 } 1319 1320 # A broken fragment? 1321 $broken{$u}{fragments}{$fragment} += 2 unless $fragment_ok; 1322 } 1323 } 1324 elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) 1325 { 1326 1327 # Couldn't find the document 1328 $broken{$u}{location} = 1; 1329 1330 # All the fragments associated are hence broken 1331 for my $fragment (keys %{$ulinks->{fragments}}) { 1332 $broken{$u}{fragments}{$fragment}++; 1333 } 1334 } 1335 } 1336 &hprintf( 1337 "\nProcessed in %s seconds.\n", 1338 &time_diff($start, &get_timestamp()) 1339 ) unless $Opts{Summary_Only}; 1340 printf( 1341 '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>', 1342 $result_anchor, &time_diff($start, &get_timestamp())) 1343 if ($Opts{HTML} && !$Opts{Summary_Only}); 1344 1345 # Display results 1346 if ($Opts{HTML} && !$Opts{Summary_Only}) { 1347 print("</pre>\n</div>\n"); 1348 printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor); 1349 } 1350 print "\n" unless $Opts{Quiet}; 1351 1352 &links_summary(\%links, \%results, \%broken, \%redirects); 1353 &anchors_summary($p->{Anchors}, \%errors); 1354 1355 # Do we want to process other documents? 1356 if ($depth != 0) { 1357 1358 for my $u (map { URI->new($_) } keys %links) { 1359 1360 next unless $results{$u}{location}{success}; # Broken link? 1361 1362 next unless &in_recursion_scope($u); 1363 1364 # Do we understand its content type? 1365 next unless ($results{$u}{location}{type} =~ $ContentTypes); 1366 1367 # Have we already processed this URI? 1368 next if &already_processed($u, $uri); 1369 1370 # Do the job 1371 print "\n" unless $Opts{Quiet}; 1372 if ($Opts{HTML}) { 1373 if (!$Opts{Command_Line}) { 1374 if ($doc_count == $Opts{Max_Documents}) { 1375 print( 1376 "<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n" 1377 ); 1378 } 1379 if ($doc_count >= $Opts{Max_Documents}) { 1380 $doc_count++; 1381 print("<p>Not checking <strong>$u</strong></p>\n"); 1382 $processed{$u} = 1; 1383 next; 1384 } 1385 } 1386 } 1387 1388 # This is an inherently recursive algorithm, so Perl's warning is not 1389 # helpful. You may wish to comment this out when debugging, though. 1390 no warnings 'recursion'; 1391 1392 if ($depth < 0) { 1393 &check_uri($params, $u, 0, -1, $cookie, $uri); 1394 } 1395 else { 1396 &check_uri($params, $u, 0, $depth - 1, $cookie, $uri); 1397 } 1398 } 1399 } 1400 return; 1401} 1402 1403############################################################### 1404# Distribute links based on host:port to avoid RobotUA delays # 1405############################################################### 1406 1407sub distribute_links(\%) 1408{ 1409 my $hostlinks = shift; 1410 1411 # Hosts ordered by weight (number of links), descending 1412 my @order = 1413 sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) } 1414 keys %$hostlinks; 1415 1416 # All link list flattened into one, in host weight order 1417 my @all; 1418 push(@all, @{$hostlinks->{$_}}) for @order; 1419 1420 return @all if (scalar(@order) < 2); 1421 1422 # Indexes and chunk size for "zipping" the end result list 1423 my $num = scalar(@{$hostlinks->{$order[0]}}); 1424 my @indexes = map { $_ * $num } (0 .. $num - 1); 1425 1426 # Distribute them 1427 my @result; 1428 while (my @chunk = splice(@all, 0, $num)) { 1429 @result[@indexes] = @chunk; 1430 @indexes = map { $_ + 1 } @indexes; 1431 } 1432 1433 # Weed out undefs 1434 @result = grep(defined, @result); 1435 1436 return @result; 1437} 1438 1439########################################## 1440# Decode Content-Encodings in a response # 1441########################################## 1442 1443sub decode_content ($) 1444{ 1445 my $response = shift; 1446 my $error = undef; 1447 1448 my $docref = $response->decoded_content(ref => 1); 1449 if (defined($docref)) { 1450 utf8::encode($$docref); 1451 $response->content_ref($docref); 1452 1453 # Remove Content-Encoding so it won't be decoded again later. 1454 $response->remove_header('Content-Encoding'); 1455 } 1456 else { 1457 my $ce = $response->header('Content-Encoding'); 1458 $ce = defined($ce) ? "'$ce'" : 'undefined'; 1459 my $ct = $response->header('Content-Type'); 1460 $ct = defined($ct) ? "'$ct'" : 'undefined'; 1461 my $request_uri = $response->request->url; 1462 1463 my $cs = $response->content_charset(); 1464 $cs = defined($cs) ? "'$cs'" : 'unknown'; 1465 $error = 1466 "Error decoding document at <$request_uri>, Content-Type $ct, " . 1467 "Content-Encoding $ce, content charset $cs: '$@'"; 1468 } 1469 return $error; 1470} 1471 1472####################################### 1473# Get and parse a resource to process # 1474####################################### 1475 1476sub get_document ($\$$;\%\$$$$$) 1477{ 1478 my ($method, $uri, $in_recursion, $redirects, $referer, 1479 $cookie, $params, $check_num, $is_start 1480 ) = @_; 1481 1482 # $method contains the HTTP method the use (GET or HEAD) 1483 # $uri object contains the identifier of the resource 1484 # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least 1485 # the second resource checked) 1486 # $redirects is a pointer to the hash containing the map of the redirects 1487 # $referer is the URI object of the referring document 1488 # $cookie, $params, $check_num, and $is_start are for printing HTTP headers 1489 # and the form if $in_recursion == 0 and not authenticating 1490 1491 # Get the resource 1492 my $response; 1493 if (defined($results{$uri}{response}) && 1494 !($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) 1495 { 1496 $response = $results{$uri}{response}; 1497 } 1498 else { 1499 $response = &get_uri($method, $uri, $referer); 1500 &record_results($uri, $method, $response, $referer); 1501 &record_redirects($redirects, $response); 1502 } 1503 if (!$response->is_success()) { 1504 if (!$in_recursion) { 1505 1506 # Is it too late to request authentication? 1507 if ($response->code() == 401) { 1508 &authentication($response, $cookie, $params, $check_num, 1509 $is_start); 1510 } 1511 else { 1512 if ($Opts{HTML}) { 1513 &html_header($uri, $cookie) if ($check_num == 1); 1514 &print_form($params, $cookie, $check_num) if $is_start; 1515 print "<p>", &status_icon($response->code()); 1516 } 1517 &hprintf("\nError: %d %s\n", 1518 $response->code(), $response->message() || '(no message)'); 1519 print "</p>\n" if $Opts{HTML}; 1520 } 1521 } 1522 $response->{Stop} = 1; 1523 $response->content(""); 1524 return ($response); 1525 } 1526 1527 # What is the URI of the resource that we are processing by the way? 1528 my $base_uri = $response->base(); 1529 my $request_uri = URI->new($response->request->url); 1530 $response->{absolute_uri} = $request_uri->abs($base_uri); 1531 1532 # Can we parse the document? 1533 my $failed_reason; 1534 my $ct = $response->header('Content-Type'); 1535 if (!$ct || $ct !~ $ContentTypes) { 1536 $failed_reason = "Content-Type for <$request_uri> is " . 1537 (defined($ct) ? "'$ct'" : 'undefined'); 1538 } 1539 else { 1540 $failed_reason = decode_content($response); 1541 } 1542 if ($failed_reason) { 1543 1544 # No, there is a problem... 1545 if (!$in_recursion) { 1546 if ($Opts{HTML}) { 1547 &html_header($uri, $cookie) if ($check_num == 1); 1548 &print_form($params, $cookie, $check_num) if $is_start; 1549 print "<p>", &status_icon(406); 1550 1551 } 1552 &hprintf("Can't check links: %s.\n", $failed_reason); 1553 print "</p>\n" if $Opts{HTML}; 1554 } 1555 $response->{Stop} = 1; 1556 $response->content(""); 1557 } 1558 1559 # Ok, return the information 1560 return ($response); 1561} 1562 1563######################################################### 1564# Check whether a URI is within the scope of recursion. # 1565######################################################### 1566 1567sub in_recursion_scope (\$) 1568{ 1569 my ($uri) = @_; 1570 return 0 unless $uri; 1571 1572 my $candidate = $uri->canonical(); 1573 1574 return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude}); 1575 1576 for my $excluded_doc (@{$Opts{Exclude_Docs}}) { 1577 return 0 if ($candidate =~ $excluded_doc); 1578 } 1579 1580 for my $base (@{$Opts{Base_Locations}}) { 1581 my $rel = $candidate->rel($base); 1582 next if ($candidate eq $rel); # Relative path not possible? 1583 next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? 1584 return 1; 1585 } 1586 1587 return 0; # We always have at least one base location, but none matched. 1588} 1589 1590################################# 1591# Check for content type match. # 1592################################# 1593 1594sub is_content_type ($$) 1595{ 1596 my ($candidate, $type) = @_; 1597 return 0 unless ($candidate && $type); 1598 my @v = HTTP::Headers::Util::split_header_words($candidate); 1599 return scalar(@v) ? $type eq lc($v[0]->[0]) : 0; 1600} 1601 1602################################################## 1603# Check whether a URI has already been processed # 1604################################################## 1605 1606sub already_processed (\$\$) 1607{ 1608 my ($uri, $referer) = @_; 1609 1610 # Don't be verbose for that part... 1611 my $summary_value = $Opts{Summary_Only}; 1612 $Opts{Summary_Only} = 1; 1613 1614 # Do a GET: if it fails, we stop, if not, the results are cached 1615 my $response = &get_document('GET', $uri, 1, undef, $referer); 1616 1617 # ... but just for that part 1618 $Opts{Summary_Only} = $summary_value; 1619 1620 # Can we process the resource? 1621 return -1 if defined($response->{Stop}); 1622 1623 # Have we already processed it? 1624 return 1 if defined($processed{$response->{absolute_uri}->as_string()}); 1625 1626 # It's not processed yet and it is processable: return 0 1627 return 0; 1628} 1629 1630############################ 1631# Get the content of a URI # 1632############################ 1633 1634sub get_uri ($\$;\$$\%$$$$) 1635{ 1636 1637 # Here we have a lot of extra parameters in order not to lose information 1638 # if the function is called several times (401's) 1639 my ($method, $uri, $referer, $start, $redirects, 1640 $code, $realm, $message, $auth 1641 ) = @_; 1642 1643 # $method contains the method used 1644 # $uri object contains the target of the request 1645 # $referer is the URI object of the referring document 1646 # $start is a timestamp (not defined the first time the function is called) 1647 # $redirects is a map of redirects 1648 # $code is the first HTTP return code 1649 # $realm is the realm of the request 1650 # $message is the HTTP message received 1651 # $auth equals 1 if we want to send out authentication information 1652 1653 # For timing purposes 1654 $start = &get_timestamp() unless defined($start); 1655 1656 # Prepare the query 1657 1658 # Do we want printouts of progress? 1659 my $verbose_progress = 1660 !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); 1661 1662 &hprintf("%s %s ", $method, $uri) if $verbose_progress; 1663 1664 my $request = HTTP::Request->new($method, $uri); 1665 1666 $request->header('Accept-Language' => $Opts{Accept_Language}) 1667 if $Opts{Accept_Language}; 1668 $request->header('Accept', $Accept); 1669 $request->accept_decodable(); 1670 1671 # Are we providing authentication info? 1672 if ($auth && $request->url()->host() =~ $Opts{Trusted}) { 1673 if (defined($ENV{HTTP_AUTHORIZATION})) { 1674 $request->header(Authorization => $ENV{HTTP_AUTHORIZATION}); 1675 } 1676 elsif (defined($Opts{User}) && defined($Opts{Password})) { 1677 $request->authorization_basic($Opts{User}, $Opts{Password}); 1678 } 1679 } 1680 1681 # Tell the user agent if we want progress reports for redirects or not. 1682 $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) 1683 if $verbose_progress; 1684 1685 # Set referer 1686 $request->referer($referer) if (!$Opts{No_Referer} && $referer); 1687 1688 # Telling caches in the middle we want a fresh copy (Bug 4998) 1689 $request->header(Cache_Control => "max-age=0"); 1690 1691 # Do the query 1692 my $response = $ua->request($request); 1693 1694 # Get the results 1695 # Record the very first response 1696 if (!defined($code)) { 1697 ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); 1698 } 1699 1700 # Authentication requested? 1701 if ($response->code() == 401 && 1702 !defined($auth) && 1703 (defined($ENV{HTTP_AUTHORIZATION}) || 1704 (defined($Opts{User}) && defined($Opts{Password}))) 1705 ) 1706 { 1707 1708 # Set host as trusted domain unless we already have one. 1709 if (!$Opts{Trusted}) { 1710 my $re = sprintf('^%s$', quotemeta($response->base()->host())); 1711 $Opts{Trusted} = qr/$re/io; 1712 } 1713 1714 # Deal with authentication and avoid loops 1715 if (!defined($realm) && 1716 $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) 1717 { 1718 $realm = $1; 1719 } 1720 1721 print "\n" if $verbose_progress; 1722 return &get_uri($method, $response->request()->url(), 1723 $referer, $start, $redirects, $code, $realm, $message, 1); 1724 } 1725 1726 # @@@ subtract robot delay from the "fetched in" time? 1727 &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp())) 1728 if $verbose_progress; 1729 1730 $response->{IsCss} = 1731 is_content_type($response->content_type(), "text/css"); 1732 $response->{Realm} = $realm if defined($realm); 1733 1734 return $response; 1735} 1736 1737######################################### 1738# Record the results of an HTTP request # 1739######################################### 1740 1741sub record_results (\$$$$) 1742{ 1743 my ($uri, $method, $response, $referer) = @_; 1744 $results{$uri}{referer} = $referer; 1745 $results{$uri}{response} = $response; 1746 $results{$uri}{method} = $method; 1747 $results{$uri}{location}{code} = $response->code(); 1748 $results{$uri}{location}{code} = RC_ROBOTS_TXT() 1749 if ($results{$uri}{location}{code} == 403 && 1750 $response->message() =~ /Forbidden by robots\.txt/); 1751 $results{$uri}{location}{code} = RC_IP_DISALLOWED() 1752 if ($results{$uri}{location}{code} == 403 && 1753 $response->message() =~ /non-public IP/); 1754 $results{$uri}{location}{code} = RC_DNS_ERROR() 1755 if ($results{$uri}{location}{code} == 500 && 1756 $response->message() =~ /Bad hostname '[^\']*'/); 1757 $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED() 1758 if ($results{$uri}{location}{code} == 500 && 1759 $response->message() =~ /Access to '[^\']*' URIs has been disabled/); 1760 $results{$uri}{location}{type} = $response->header('Content-type'); 1761 $results{$uri}{location}{display} = $results{$uri}{location}{code}; 1762 1763 # Rewind, check for the original code and message. 1764 for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { 1765 $results{$uri}{location}{orig} = $tmp->code(); 1766 $results{$uri}{location}{orig_message} = $tmp->message() || 1767 '(no message)'; 1768 } 1769 $results{$uri}{location}{success} = $response->is_success(); 1770 1771 # If a suppressed broken link, fill the data structure like a typical success. 1772 # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; 1773 if (!$results{$uri}{location}{success}) { 1774 my $code = $results{$uri}{location}{code}; 1775 my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; 1776 if ($match) { 1777 $results{$uri}{location}{success} = 1; 1778 $results{$uri}{location}{code} = 100; 1779 $results{$uri}{location}{display} = 100; 1780 } 1781 } 1782 1783 # Stores the authentication information 1784 if (defined($response->{Realm})) { 1785 $results{$uri}{location}{realm} = $response->{Realm}; 1786 $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; 1787 } 1788 1789 # What type of broken link is it? (stored in {record} - the {display} 1790 # information is just for visual use only) 1791 if ($results{$uri}{location}{display} == 401 && 1792 $results{$uri}{location}{code} == 404) 1793 { 1794 $results{$uri}{location}{record} = 404; 1795 } 1796 else { 1797 $results{$uri}{location}{record} = $results{$uri}{location}{display}; 1798 } 1799 1800 # Did it fail? 1801 $results{$uri}{location}{message} = $response->message() || '(no message)'; 1802 if (!$results{$uri}{location}{success}) { 1803 &hprintf( 1804 "Error: %d %s\n", 1805 $results{$uri}{location}{code}, 1806 $results{$uri}{location}{message} 1807 ) if ($Opts{Verbose}); 1808 } 1809 return; 1810} 1811 1812#################### 1813# Parse a document # 1814#################### 1815 1816sub parse_document (\$\$$$$) 1817{ 1818 my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_; 1819 1820 print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n") 1821 if $Opts{Verbose}; 1822 1823 my $p; 1824 1825 if (defined($results{$uri}{parsing})) { 1826 1827 # We have already done the job. Woohoo! 1828 $p->{base} = $results{$uri}{parsing}{base}; 1829 $p->{Anchors} = $results{$uri}{parsing}{Anchors}; 1830 $p->{Links} = $results{$uri}{parsing}{Links}; 1831 return $p; 1832 } 1833 1834 $p = W3C::LinkChecker->new(); 1835 $p->{base} = $base_uri; 1836 1837 my $stype = $response->header("Content-Style-Type"); 1838 $p->{style_is_css} = !$stype || is_content_type($stype, "text/css"); 1839 1840 my $start; 1841 if (!$Opts{Summary_Only}) { 1842 $start = &get_timestamp(); 1843 print("Parsing...\n"); 1844 } 1845 1846 # Content-Encoding etc already decoded in get_document(). 1847 my $docref = $response->content_ref(); 1848 1849 # Count lines beforehand if needed (for progress indicator, or CSS while 1850 # we don't get any line context out of the parser). In case of HTML, the 1851 # actual final number of lines processed shown is populated by our 1852 # end_document handler. 1853 $p->{Total} = ($$docref =~ tr/\n//) 1854 if ($response->{IsCss} || $Opts{Progress}); 1855 1856 # We only look for anchors if we are not interested in the links 1857 # obviously, or if we are running a recursive checking because we 1858 # might need this information later 1859 $p->{only_anchors} = !($links || $rec_needs_links); 1860 1861 if ($response->{IsCss}) { 1862 1863 # Parse as CSS 1864 1865 $p->parse_css($$docref, LINE_UNKNOWN()); 1866 } 1867 else { 1868 1869 # Parse as HTML 1870 1871 # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing 1872 # Processing instructions are not parsed by process, but in this case 1873 # it should be. It's expensive, it's horrible, but it's the easiest way 1874 # for right now. 1875 $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ 1876 unless $p->{only_anchors}; 1877 1878 $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); 1879 1880 $p->parse($$docref)->eof(); 1881 } 1882 1883 $response->content(""); 1884 1885 if (!$Opts{Summary_Only}) { 1886 my $stop = &get_timestamp(); 1887 print "\r" if $Opts{Progress}; 1888 &hprintf(" done (%d lines in %s seconds).\n", 1889 $p->{Total}, &time_diff($start, $stop)); 1890 } 1891 1892 # Save the results before exiting 1893 $results{$uri}{parsing}{base} = $p->{base}; 1894 $results{$uri}{parsing}{Anchors} = $p->{Anchors}; 1895 $results{$uri}{parsing}{Links} = $p->{Links}; 1896 1897 return $p; 1898} 1899 1900#################################### 1901# Constructor for W3C::LinkChecker # 1902#################################### 1903 1904sub new 1905{ 1906 my $p = HTML::Parser::new(@_, api_version => 3); 1907 $p->utf8_mode(1); 1908 1909 # Set up handlers 1910 1911 $p->handler(start => 'start', 'self, tagname, attr, line'); 1912 $p->handler(end => 'end', 'self, tagname, line'); 1913 $p->handler(text => 'text', 'self, dtext, line'); 1914 $p->handler( 1915 declaration => sub { 1916 my $self = shift; 1917 $self->declaration(substr($_[0], 2, -1)); 1918 }, 1919 'self, text, line' 1920 ); 1921 $p->handler(end_document => 'end_document', 'self, line'); 1922 if ($Opts{Progress}) { 1923 $p->handler(default => 'parse_progress', 'self, line'); 1924 $p->{last_percentage} = 0; 1925 } 1926 1927 # Check <a [..] name="...">? 1928 $p->{check_name} = 1; 1929 1930 # Check <[..] id="..">? 1931 $p->{check_id} = 1; 1932 1933 # Don't interpret comment loosely 1934 $p->strict_comment(1); 1935 1936 return $p; 1937} 1938 1939################################################# 1940# Record or return the doctype of the document # 1941################################################# 1942 1943sub doctype 1944{ 1945 my ($self, $dc) = @_; 1946 return $self->{doctype} unless $dc; 1947 $_ = $self->{doctype} = $dc; 1948 1949 # What to look for depending on the doctype 1950 1951 # Check for <a name="...">? 1952 $self->{check_name} = 0 1953 if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %; 1954 1955 # Check for <* id="...">? 1956 $self->{check_id} = 0 1957 if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%); 1958 1959 # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...) 1960 $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%); 1961 1962 return; 1963} 1964 1965################################### 1966# Print parse progress indication # 1967################################### 1968 1969sub parse_progress 1970{ 1971 my ($self, $line) = @_; 1972 return unless defined($line) && $line > 0 && $self->{Total} > 0; 1973 1974 my $percentage = int($line / $self->{Total} * 100); 1975 if ($percentage != $self->{last_percentage}) { 1976 printf("\r%4d%%", $percentage); 1977 $self->{last_percentage} = $percentage; 1978 } 1979 1980 return; 1981} 1982 1983############################# 1984# Extraction of the anchors # 1985############################# 1986 1987sub get_anchor 1988{ 1989 my ($self, $tag, $attr) = @_; 1990 1991 my $anchor = $self->{check_id} ? $attr->{id} : undef; 1992 if ($self->{check_name} && ($tag eq 'a')) { 1993 1994 # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory 1995 # Force an error if it's not the case (or if id's and name's values 1996 # are different) 1997 # If id is defined, name if defined must have the same value 1998 $anchor ||= $attr->{name}; 1999 } 2000 2001 return $anchor; 2002} 2003 2004############################# 2005# W3C::LinkChecker handlers # 2006############################# 2007 2008sub add_link 2009{ 2010 my ($self, $uri, $base, $line) = @_; 2011 if (defined($uri)) { 2012 2013 # Remove repeated slashes after the . or .. in relative links, to avoid 2014 # duplicated checking or infinite recursion. 2015 $uri =~ s|^(\.\.?/)/+|$1|o; 2016 $uri = Encode::decode_utf8($uri); 2017 $uri = URI->new_abs($uri, $base) if defined($base); 2018 $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++; 2019 } 2020 return; 2021} 2022 2023sub start 2024{ 2025 my ($self, $tag, $attr, $line) = @_; 2026 $line = LINE_UNKNOWN() unless defined($line); 2027 2028 # Anchors 2029 my $anchor = $self->get_anchor($tag, $attr); 2030 $self->{Anchors}{$anchor}{$line}++ if defined($anchor); 2031 2032 # Links 2033 if (!$self->{only_anchors}) { 2034 2035 my $tag_local_base = undef; 2036 2037 # Special case: base/@href 2038 # @@@TODO: The reason for handling <base href> ourselves is that LWP's 2039 # head parsing magic fails at least for responses that have 2040 # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361 2041 if ($tag eq 'base') { 2042 2043 # Ignore <base> with missing/empty href. 2044 $self->{base} = $attr->{href} 2045 if (defined($attr->{href}) && length($attr->{href})); 2046 } 2047 2048 # Special case: meta[@http-equiv=Refresh]/@content 2049 elsif ($tag eq 'meta') { 2050 if ($attr->{'http-equiv'} && 2051 lc($attr->{'http-equiv'}) eq 'refresh') 2052 { 2053 my $content = $attr->{content}; 2054 if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) { 2055 $self->add_link($1, undef, $line); 2056 } 2057 } 2058 } 2059 2060 # Special case: tags that have "local base" 2061 elsif ($tag eq 'applet' || $tag eq 'object') { 2062 if (my $codebase = $attr->{codebase}) { 2063 2064 # Applet codebases are directories, append trailing slash 2065 # if it's not there so that new_abs does the right thing. 2066 $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|); 2067 2068 # TODO: HTML 4 spec says applet/@codebase may only point to 2069 # subdirs of the directory containing the current document. 2070 # Should we do something about that? 2071 $tag_local_base = URI->new_abs($codebase, $self->{base}); 2072 } 2073 } 2074 2075 # Link attributes: 2076 if (my $link_attrs = LINK_ATTRS()->{$tag}) { 2077 for my $la (@$link_attrs) { 2078 $self->add_link($attr->{$la}, $tag_local_base, $line); 2079 } 2080 } 2081 2082 # List of links attributes: 2083 if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) { 2084 my ($sep, $attrs) = @$link_attrs; 2085 for my $la (@$attrs) { 2086 if (defined(my $value = $attr->{$la})) { 2087 for my $link (split($sep, $value)) { 2088 $self->add_link($link, $tag_local_base, $line); 2089 } 2090 } 2091 } 2092 } 2093 2094 # Inline CSS: 2095 delete $self->{csstext}; 2096 if ($tag eq 'style') { 2097 $self->{csstext} = '' 2098 if ((!$attr->{type} && $self->{style_is_css}) || 2099 is_content_type($attr->{type}, "text/css")); 2100 } 2101 elsif ($self->{style_is_css} && (my $style = $attr->{style})) { 2102 $style = CSS::DOM::Style::parse($style); 2103 $self->parse_style($style, $line); 2104 } 2105 } 2106 2107 $self->parse_progress($line) if $Opts{Progress}; 2108 return; 2109} 2110 2111sub end 2112{ 2113 my ($self, $tagname, $line) = @_; 2114 2115 $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style'); 2116 delete $self->{csstext}; 2117 2118 $self->parse_progress($line) if $Opts{Progress}; 2119 return; 2120} 2121 2122sub parse_css 2123{ 2124 my ($self, $css, $line) = @_; 2125 return unless $css; 2126 2127 my $sheet = CSS::DOM::parse($css); 2128 for my $rule (@{$sheet->cssRules()}) { 2129 if ($rule->type() == IMPORT_RULE()) { 2130 $self->add_link($rule->href(), $self->{base}, $line); 2131 } 2132 elsif ($rule->type == STYLE_RULE()) { 2133 $self->parse_style($rule->style(), $line); 2134 } 2135 } 2136 return; 2137} 2138 2139sub parse_style 2140{ 2141 my ($self, $style, $line) = @_; 2142 return unless $style; 2143 2144 for (my $i = 0, my $len = $style->length(); $i < $len; $i++) { 2145 my $prop = $style->item($i); 2146 my $val = $style->getPropertyValue($prop); 2147 2148 while ($val =~ /$CssUrl/go) { 2149 my $url = CSS::DOM::Util::unescape($2); 2150 $self->add_link($url, $self->{base}, $line); 2151 } 2152 } 2153 2154 return; 2155} 2156 2157sub declaration 2158{ 2159 my ($self, $text, $line) = @_; 2160 2161 # Extract the doctype 2162 my @declaration = split(/\s+/, $text, 4); 2163 if ($#declaration >= 3 && 2164 $declaration[0] eq 'DOCTYPE' && 2165 lc($declaration[1]) eq 'html') 2166 { 2167 2168 # Parse the doctype declaration 2169 if ($text =~ 2170 m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i 2171 ) 2172 { 2173 2174 # Store the doctype 2175 $self->doctype($1) if $1; 2176 2177 # If there is a link to the DTD, record it 2178 $self->add_link($3, undef, $line) 2179 if (!$self->{only_anchors} && $3); 2180 } 2181 } 2182 2183 $self->text($text) unless $self->{only_anchors}; 2184 2185 return; 2186} 2187 2188sub text 2189{ 2190 my ($self, $text, $line) = @_; 2191 $self->{csstext} .= $text if defined($self->{csstext}); 2192 $self->parse_progress($line) if $Opts{Progress}; 2193 return; 2194} 2195 2196sub end_document 2197{ 2198 my ($self, $line) = @_; 2199 $self->{Total} = $line; 2200 delete $self->{csstext}; 2201 return; 2202} 2203 2204################################ 2205# Check the validity of a link # 2206################################ 2207 2208sub check_validity (\$\$$\%\%) 2209{ 2210 my ($referer, $uri, $want_links, $links, $redirects) = @_; 2211 2212 # $referer is the URI object of the document checked 2213 # $uri is the URI object of the target that we are verifying 2214 # $want_links is true if we're interested in links in the target doc 2215 # $links is a hash of the links in the documents checked 2216 # $redirects is a map of the redirects encountered 2217 2218 # Get the document with the appropriate method: GET if there are 2219 # fragments to check or links are wanted, HEAD is enough otherwise. 2220 my $fragments = $links->{$uri}{fragments} || {}; 2221 my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD'; 2222 2223 my $response; 2224 my $being_processed = 0; 2225 if (!defined($results{$uri}) || 2226 ($method eq 'GET' && $results{$uri}{method} eq 'HEAD')) 2227 { 2228 $being_processed = 1; 2229 $response = &get_uri($method, $uri, $referer); 2230 2231 # Get the information back from get_uri() 2232 &record_results($uri, $method, $response, $referer); 2233 2234 # Record the redirects 2235 &record_redirects($redirects, $response); 2236 } 2237 elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) { 2238 my $ref = $results{$uri}{referer}; 2239 &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : "."); 2240 } 2241 2242 # We got the response of the HTTP request. Stop here if it was a HEAD. 2243 return if ($method eq 'HEAD'); 2244 2245 # There are fragments. Parse the document. 2246 my $p; 2247 if ($being_processed) { 2248 2249 # Can we really parse the document? 2250 if (!defined($results{$uri}{location}{type}) || 2251 $results{$uri}{location}{type} !~ $ContentTypes) 2252 { 2253 &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", 2254 $uri, $results{$uri}{location}{type}) 2255 if ($Opts{Verbose}); 2256 $response->content(""); 2257 return; 2258 } 2259 2260 # Do it then 2261 if (my $error = decode_content($response)) { 2262 &hprintf("%s\n.", $error); 2263 } 2264 2265 # @@@TODO: this isn't the best thing to do if a decode error occurred 2266 $p = 2267 &parse_document($uri, $response->base(), $response, 0, 2268 $want_links); 2269 } 2270 else { 2271 2272 # We already had the information 2273 $p->{Anchors} = $results{$uri}{parsing}{Anchors}; 2274 } 2275 2276 # Check that the fragments exist 2277 for my $fragment (keys %$fragments) { 2278 if (defined($p->{Anchors}{$fragment}) || 2279 &escape_match($fragment, $p->{Anchors}) || 2280 grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) 2281 { 2282 $results{$uri}{fragments}{$fragment} = 1; 2283 } 2284 else { 2285 $results{$uri}{fragments}{$fragment} = 0; 2286 } 2287 } 2288 return; 2289} 2290 2291sub escape_match ($\%) 2292{ 2293 my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); 2294 for my $b (keys %$hash) { 2295 return 1 if ($a eq URI::Escape::uri_unescape($b)); 2296 } 2297 return 0; 2298} 2299 2300########################## 2301# Ask for authentication # 2302########################## 2303 2304sub authentication ($;$$$$) 2305{ 2306 my ($response, $cookie, $params, $check_num, $is_start) = @_; 2307 2308 my $realm = ''; 2309 if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { 2310 $realm = $1; 2311 } 2312 2313 if ($Opts{Command_Line}) { 2314 printf STDERR <<'EOF', $response->request()->url(), $realm; 2315 2316Authentication is required for %s. 2317The realm is "%s". 2318Use the -u and -p options to specify a username and password and the -d option 2319to specify trusted domains. 2320EOF 2321 } 2322 else { 2323 2324 printf( 2325 "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", 2326 $response->www_authenticate(), 2327 $cookie ? "Set-Cookie: $cookie\n" : "", 2328 ); 2329 2330 printf( 2331 "%s 2332<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\"> 2333<head> 2334<title>W3C Link Checker: 401 Authorization Required</title> 2335%s</head> 2336<body>", $DocType, $Head 2337 ); 2338 &banner(': 401 Authorization Required'); 2339 &print_form($params, $cookie, $check_num) if $is_start; 2340 printf( 2341 '<p> 2342 %s 2343 You need "%s" access to <a href="%s">%s</a> to perform link checking.<br /> 2344', 2345 &status_icon(401), 2346 &encode($realm), (&encode($response->request()->url())) x 2 2347 ); 2348 2349 my $host = $response->request()->url()->host(); 2350 if ($Opts{Trusted} && $host !~ $Opts{Trusted}) { 2351 printf <<'EOF', &encode($Opts{Trusted}), &encode($host); 2352 This service has been configured to send authentication only to hostnames 2353 matching the regular expression <code>%s</code>, but the hostname 2354 <code>%s</code> does not match it. 2355EOF 2356 } 2357 2358 print "</p>\n"; 2359 } 2360 return; 2361} 2362 2363################## 2364# Get statistics # 2365################## 2366 2367sub get_timestamp () 2368{ 2369 return pack('LL', Time::HiRes::gettimeofday()); 2370} 2371 2372sub time_diff ($$) 2373{ 2374 my @start = unpack('LL', $_[0]); 2375 my @stop = unpack('LL', $_[1]); 2376 for ($start[1], $stop[1]) { 2377 $_ /= 1_000_000; 2378 } 2379 return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1]))); 2380} 2381 2382######################## 2383# Handle the redirects # 2384######################## 2385 2386# Record the redirects in a hash 2387sub record_redirects (\%$) 2388{ 2389 my ($redirects, $response) = @_; 2390 for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { 2391 2392 # Check for redirect match. 2393 my $from = $prev->request()->url(); 2394 my $to = $response->request()->url(); # same on every loop iteration 2395 my $from_to = $from . '->' . $to; 2396 my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; 2397 2398 # print STDERR "Result $match of redirect checking $from_to\n"; 2399 if ($match) { next; } 2400 2401 $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}}; 2402 2403 # print STDERR "Result $match of regexp checking $from_to\n"; 2404 if ($match) { next; } 2405 2406 my $c = $prev->code(); 2407 if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) { 2408 next; 2409 } 2410 2411 $redirects->{$prev->request()->url()} = $response->request()->url(); 2412 } 2413 return; 2414} 2415 2416# Determine if a request is redirected 2417sub is_redirected ($%) 2418{ 2419 my ($uri, %redirects) = @_; 2420 return (defined($redirects{$uri})); 2421} 2422 2423# Get a list of redirects for a URI 2424sub get_redirects ($%) 2425{ 2426 my ($uri, %redirects) = @_; 2427 my @history = ($uri); 2428 my %seen = ($uri => 1); # for tracking redirect loops 2429 my $loop = 0; 2430 while ($redirects{$uri}) { 2431 $uri = $redirects{$uri}; 2432 push(@history, $uri); 2433 if ($seen{$uri}) { 2434 $loop = 1; 2435 last; 2436 } 2437 else { 2438 $seen{$uri}++; 2439 } 2440 } 2441 return ($loop, @history); 2442} 2443 2444#################################################### 2445# Tool for sorting the unique elements of an array # 2446#################################################### 2447 2448sub sort_unique (@) 2449{ 2450 my %saw; 2451 @saw{@_} = (); 2452 return (sort { $a <=> $b } keys %saw); 2453} 2454 2455##################### 2456# Print the results # 2457##################### 2458 2459sub line_number ($) 2460{ 2461 my $line = shift; 2462 return $line if ($line >= 0); 2463 return "(N/A)"; 2464} 2465 2466sub http_rc ($) 2467{ 2468 my $rc = shift; 2469 return $rc if ($rc >= 0); 2470 return "(N/A)"; 2471} 2472 2473# returns true if the given code is informational 2474sub informational ($) 2475{ 2476 my $rc = shift; 2477 return $rc == RC_ROBOTS_TXT() || 2478 $rc == RC_IP_DISALLOWED() || 2479 $rc == RC_PROTOCOL_DISALLOWED(); 2480} 2481 2482sub anchors_summary (\%\%) 2483{ 2484 my ($anchors, $errors) = @_; 2485 2486 # Number of anchors found. 2487 my $n = scalar(keys(%$anchors)); 2488 if (!$Opts{Quiet}) { 2489 if ($Opts{HTML}) { 2490 print("<h3>Anchors</h3>\n<p>"); 2491 } 2492 else { 2493 print("Anchors\n\n"); 2494 } 2495 &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's'); 2496 print("</p>\n") if $Opts{HTML}; 2497 } 2498 2499 # List of the duplicates, if any. 2500 my @errors = keys %{$errors}; 2501 if (!scalar(@errors)) { 2502 print("<p>Valid anchors!</p>\n") 2503 if (!$Opts{Quiet} && $Opts{HTML} && $n); 2504 return; 2505 } 2506 undef $n; 2507 2508 print_doc_header(); 2509 print('<p>') if $Opts{HTML}; 2510 print('List of duplicate and empty anchors'); 2511 print <<'EOF' if $Opts{HTML}; 2512</p> 2513<table class="report" border="1" summary="List of duplicate and empty anchors."> 2514<thead> 2515<tr> 2516<th>Anchor</th> 2517<th>Lines</th> 2518</tr> 2519</thead> 2520<tbody> 2521EOF 2522 print("\n"); 2523 2524 for my $anchor (@errors) { 2525 my $format; 2526 my @unique = &sort_unique( 2527 map { line_number($_) } 2528 keys %{$anchors->{$anchor}} 2529 ); 2530 if ($Opts{HTML}) { 2531 $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n"; 2532 } 2533 else { 2534 my $s = (scalar(@unique) > 1) ? 's' : ''; 2535 $format = "\t%s\tLine$s: %s\n"; 2536 } 2537 printf($format, 2538 &encode(length($anchor) ? $anchor : 'Empty anchor'), 2539 join(', ', @unique)); 2540 } 2541 2542 print("</tbody>\n</table>\n") if $Opts{HTML}; 2543 2544 return; 2545} 2546 2547sub show_link_report (\%\%\%\%\@;$\%) 2548{ 2549 my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; 2550 2551 print("\n<dl class=\"report\">") if $Opts{HTML}; 2552 print("\n") if (!$Opts{Quiet}); 2553 2554 # Process each URL 2555 my ($c, $previous_c); 2556 for my $u (@$urls) { 2557 my @fragments = keys %{$broken->{$u}{fragments}}; 2558 2559 # Did we get a redirect? 2560 my $redirected = &is_redirected($u, %$redirects); 2561 2562 # List of lines 2563 my @total_lines; 2564 push(@total_lines, keys(%{$links->{$u}{location}})); 2565 for my $f (@fragments) { 2566 push(@total_lines, keys(%{$links->{$u}{fragments}{$f}})) 2567 unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); 2568 } 2569 2570 my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); 2571 my $currloc = $results->{$u}{location}; 2572 2573 # Error type 2574 $c = &code_shown($u, $results); 2575 2576 # What to do 2577 my $whattodo; 2578 my $redirect_too; 2579 if ($todo) { 2580 if ($u =~ m/^javascript:/) { 2581 if ($Opts{HTML}) { 2582 $whattodo = 2583 'You must change this link: people using a browser without JavaScript support 2584will <em>not</em> be able to follow this link. See the 2585<a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content 2586Accessibility Guidelines on the use of scripting on the Web</a> and the 2587<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques 2588on how to solve this</a>.'; 2589 } 2590 else { 2591 $whattodo = 2592 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; 2593 } 2594 } 2595 elsif ($c == RC_ROBOTS_TXT()) { 2596 $whattodo = 2597 'The link was not checked due to robots exclusion ' . 2598 'rules. Check the link manually.'; 2599 } 2600 elsif ($redirect_loop) { 2601 $whattodo = 2602 'Retrieving the URI results in a redirect loop, that should be ' 2603 . 'fixed. Examine the redirect sequence to see where the loop ' 2604 . 'occurs.'; 2605 } 2606 else { 2607 $whattodo = $todo->{$c}; 2608 } 2609 } 2610 elsif (defined($redirects{$u})) { 2611 2612 # Redirects 2613 if (($u . '/') eq $redirects{$u}) { 2614 $whattodo = 2615 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.'; 2616 } 2617 elsif ($c == 307 || $c == 302) { 2618 $whattodo = 2619 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.'; 2620 } 2621 elsif ($c == 301) { 2622 $whattodo = 2623 'This is a permanent redirect. The link should be updated.'; 2624 } 2625 } 2626 2627 my @unique = &sort_unique(map { line_number($_) } @total_lines); 2628 my $lines_list = join(', ', @unique); 2629 my $s = (scalar(@unique) > 1) ? 's' : ''; 2630 undef @unique; 2631 2632 my @http_codes = ($currloc->{code}); 2633 unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; 2634 @http_codes = map { http_rc($_) } @http_codes; 2635 2636 if ($Opts{HTML}) { 2637 2638 # Style stuff 2639 my $idref = ''; 2640 if ($codes && (!defined($previous_c) || ($c != $previous_c))) { 2641 $idref = ' id="d' . $doc_count . 'code_' . $c . '"'; 2642 $previous_c = $c; 2643 } 2644 2645 # Main info 2646 for (@redirects_urls) { 2647 $_ = &show_url($_); 2648 } 2649 2650 # HTTP message 2651 my $http_message; 2652 if ($currloc->{message}) { 2653 $http_message = &encode($currloc->{message}); 2654 if ($c == 404 || $c == 500) { 2655 $http_message = 2656 '<span class="broken">' . $http_message . '</span>'; 2657 } 2658 } 2659 my $redirmsg = 2660 $redirect_loop ? ' <em>redirect loop detected</em>' : ''; 2661 printf(" 2662<dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt> 2663<dd class='responsecode'><strong>Status</strong>: %s %s %s</dd> 2664<dd class='message_explanation'><p>%s %s</p></dd>\n", 2665 2666 # Anchor for return codes 2667 $idref, 2668 2669 # Color 2670 &status_icon($c), 2671 $s, 2672 2673 # List of lines 2674 $lines_list, 2675 2676 # List of redirects 2677 $redirected ? 2678 join(' redirected to ', @redirects_urls) . $redirmsg : 2679 &show_url($u), 2680 2681 # Realm 2682 defined($currloc->{realm}) ? 2683 sprintf('Realm: %s<br />', &encode($currloc->{realm})) : 2684 '', 2685 2686 # HTTP original message 2687 # defined($currloc->{orig_message}) 2688 # ? &encode($currloc->{orig_message}). 2689 # ' <span title="redirected to">-></span> ' 2690 # : '', 2691 2692 # Response code chain 2693 join( 2694 ' <span class="redirected_to" title="redirected to">-></span> ', 2695 map { &encode($_) } @http_codes), 2696 2697 # HTTP final message 2698 $http_message, 2699 2700 # What to do 2701 $whattodo, 2702 2703 # Redirect too? 2704 $redirect_too ? 2705 sprintf(' <span %s>%s</span>', 2706 &bgcolor(301), $redirect_too) : 2707 '', 2708 ); 2709 if ($#fragments >= 0) { 2710 printf("<dd>Broken fragments: <ul>\n"); 2711 } 2712 } 2713 else { 2714 my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; 2715 printf( 2716 "\n%s\t%s\n Code: %s %s\n%s\n", 2717 2718 # List of redirects 2719 $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, 2720 2721 # List of lines 2722 $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '', 2723 2724 # Response code chain 2725 join(' -> ', @http_codes), 2726 2727 # HTTP message 2728 $currloc->{message} || '', 2729 2730 # What to do 2731 wrap(' To do: ', ' ', $whattodo) 2732 ); 2733 if ($#fragments >= 0) { 2734 if ($currloc->{code} == 200) { 2735 print("The following fragments need to be fixed:\n"); 2736 } 2737 else { 2738 print("Fragments:\n"); 2739 } 2740 } 2741 } 2742 2743 # Fragments 2744 for my $f (@fragments) { 2745 my @unique_lines = 2746 &sort_unique(keys %{$links->{$u}{fragments}{$f}}); 2747 my $plural = (scalar(@unique_lines) > 1) ? 's' : ''; 2748 my $unique_lines = join(', ', @unique_lines); 2749 if ($Opts{HTML}) { 2750 printf("<li>%s<em>#%s</em> (line%s %s)</li>\n", 2751 &encode($u), &encode($f), $plural, $unique_lines); 2752 } 2753 else { 2754 printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines); 2755 } 2756 } 2757 2758 print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments)); 2759 } 2760 2761 # End of the table 2762 print("</dl>\n") if $Opts{HTML}; 2763 2764 return; 2765} 2766 2767sub code_shown ($$) 2768{ 2769 my ($u, $results) = @_; 2770 2771 if ($results->{$u}{location}{record} == 200) { 2772 return $results->{$u}{location}{orig} || 2773 $results->{$u}{location}{record}; 2774 } 2775 else { 2776 return $results->{$u}{location}{record}; 2777 } 2778} 2779 2780sub links_summary (\%\%\%\%) 2781{ 2782 2783 # Advices to fix the problems 2784 2785 my %todo = ( 2786 200 => 2787 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).', 2788 300 => 2789 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.', 2790 301 => 2791 'This is a permanent redirect. The link should be updated to point to the more recent URI.', 2792 302 => 2793 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 2794 303 => 2795 'This rare status code points to a "See Other" resource. There is generally nothing to be done.', 2796 307 => 2797 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', 2798 400 => 2799 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.', 2800 401 => 2801 "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.", 2802 403 => 2803 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 2804 404 => 2805 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.', 2806 405 => 2807 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.', 2808 406 => 2809 "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.", 2810 407 => 'The link is a proxy, but requires Authentication.', 2811 408 => 'The request timed out.', 2812 410 => 'The resource is gone. You should remove this link.', 2813 415 => 'The media type is not supported.', 2814 500 => 'This is a server side problem. Check the URI.', 2815 501 => 2816 'Could not check this link: method not implemented or scheme not supported.', 2817 503 => 2818 'The server cannot service the request, for some unknown reason.', 2819 2820 # Non-HTTP codes: 2821 RC_ROBOTS_TXT() => sprintf( 2822 'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.', 2823 $Opts{HTML} ? ( 2824 '<a href="http://www.robotstxt.org/robotstxt.html">', '</a>', 2825 "<a href=\"$Cfg{Doc_URI}#bot\">", '</a>' 2826 ) : ('') x 4 2827 ), 2828 RC_DNS_ERROR() => 2829 'The hostname could not be resolved. Check the link for typos.', 2830 RC_IP_DISALLOWED() => 2831 sprintf( 2832 'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.', 2833 $Opts{HTML} ? 2834 ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : 2835 ('') x 2), 2836 RC_PROTOCOL_DISALLOWED() => 2837 'Accessing links with this URI scheme has been disabled in link checker.', 2838 ); 2839 my %priority = ( 2840 410 => 1, 2841 404 => 2, 2842 403 => 5, 2843 200 => 10, 2844 300 => 15, 2845 401 => 20 2846 ); 2847 2848 my ($links, $results, $broken, $redirects) = @_; 2849 2850 # List of the broken links 2851 my @urls = keys %{$broken}; 2852 my @dir_redirect_urls = (); 2853 if ($Opts{Redirects}) { 2854 2855 # Add the redirected URI's to the report 2856 for my $l (keys %$redirects) { 2857 next 2858 unless (defined($results->{$l}) && 2859 defined($links->{$l}) && 2860 !defined($broken->{$l})); 2861 2862 # Check whether we have a "directory redirect" 2863 # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ 2864 my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); 2865 if ($#redirects == 1) { 2866 push(@dir_redirect_urls, $l); 2867 next; 2868 } 2869 push(@urls, $l); 2870 } 2871 } 2872 2873 # Broken links and redirects 2874 if ($#urls < 0) { 2875 if (!$Opts{Quiet}) { 2876 print_doc_header(); 2877 if ($Opts{HTML}) { 2878 print "<h3>Links</h3>\n<p>Valid links!</p>\n"; 2879 } 2880 else { 2881 print "\nValid links.\n"; 2882 } 2883 } 2884 } 2885 else { 2886 print_doc_header(); 2887 print('<h3>') if $Opts{HTML}; 2888 print("\nList of broken links and other issues"); 2889 2890 #print(' and redirects') if $Opts{Redirects}; 2891 2892 # Sort the URI's by HTTP Code 2893 my %code_summary; 2894 my @idx; 2895 for my $u (@urls) { 2896 if (defined($results->{$u}{location}{record})) { 2897 my $c = &code_shown($u, $results); 2898 $code_summary{$c}++; 2899 push(@idx, $c); 2900 } 2901 } 2902 my @sorted = @urls[ 2903 sort { 2904 defined($priority{$idx[$a]}) ? 2905 defined($priority{$idx[$b]}) ? 2906 $priority{$idx[$a]} <=> $priority{$idx[$b]} : 2907 -1 : 2908 defined($priority{$idx[$b]}) ? 1 : 2909 $idx[$a] <=> $idx[$b] 2910 } 0 .. $#idx 2911 ]; 2912 @urls = @sorted; 2913 undef(@sorted); 2914 undef(@idx); 2915 2916 if ($Opts{HTML}) { 2917 2918 # Print a summary 2919 print <<'EOF'; 2920</h3> 2921<p><em>There are issues with the URLs listed below. The table summarizes the 2922issues and suggested actions by HTTP response status code.</em></p> 2923<table class="report" border="1" summary="List of issues and suggested actions."> 2924<thead> 2925<tr> 2926<th>Code</th> 2927<th>Occurrences</th> 2928<th>What to do</th> 2929</tr> 2930</thead> 2931<tbody> 2932EOF 2933 for my $code (sort(keys(%code_summary))) { 2934 printf('<tr%s>', &bgcolor($code)); 2935 printf('<td><a href="#d%scode_%s">%s</a></td>', 2936 $doc_count, $code, http_rc($code)); 2937 printf('<td>%s</td>', $code_summary{$code}); 2938 printf('<td>%s</td>', $todo{$code}); 2939 print "</tr>\n"; 2940 } 2941 print "</tbody>\n</table>\n"; 2942 } 2943 else { 2944 print(':'); 2945 } 2946 &show_link_report($links, $results, $broken, $redirects, \@urls, 1, 2947 \%todo); 2948 } 2949 2950 # Show directory redirects 2951 if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { 2952 print_doc_header(); 2953 print('<h3>') if $Opts{HTML}; 2954 print("\nList of redirects"); 2955 print( 2956 "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>" 2957 ) if $Opts{HTML}; 2958 &show_link_report($links, $results, $broken, $redirects, 2959 \@dir_redirect_urls); 2960 } 2961 2962 return; 2963} 2964 2965############################################################################### 2966 2967################ 2968# Global stats # 2969################ 2970 2971sub global_stats () 2972{ 2973 my $stop = &get_timestamp(); 2974 my $n_docs = 2975 ($doc_count <= $Opts{Max_Documents}) ? $doc_count : 2976 $Opts{Max_Documents}; 2977 return sprintf( 2978 'Checked %d document%s in %s seconds.', 2979 $n_docs, 2980 ($n_docs == 1) ? '' : 's', 2981 &time_diff($timestamp, $stop) 2982 ); 2983} 2984 2985################## 2986# HTML interface # 2987################## 2988 2989sub html_header ($$) 2990{ 2991 my ($uri, $cookie) = @_; 2992 2993 my $title = defined($uri) ? $uri : ''; 2994 $title = ': ' . $title if ($title =~ /\S/); 2995 2996 my $headers = ''; 2997 if (!$Opts{Command_Line}) { 2998 $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri; 2999 $headers .= "Content-Type: text/html; charset=utf-8\n"; 3000 $headers .= "Set-Cookie: $cookie\n" if $cookie; 3001 3002 # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same 3003 # print() statement as the last header 3004 $headers .= "Content-Language: en\n\n"; 3005 } 3006 3007 my $onload = $uri ? '' : 3008 ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"'; 3009 3010 print $headers, $DocType, " 3011<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\"> 3012<head> 3013<title>W3C Link Checker", &encode($title), "</title> 3014", $Head, "</head> 3015<body", $onload, '>'; 3016 &banner($title); 3017 return; 3018} 3019 3020sub banner ($) 3021{ 3022 my $tagline = "Check links and anchors in Web pages or full Web sites"; 3023 3024 printf( 3025 <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline); 3026<div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" width="110" height="61" /></a> 3027<a href="checklink"><span>Link Checker</span></a></h1> 3028<p id="tagline">%s</p></div> 3029<div id="main"> 3030EOF 3031 return; 3032} 3033 3034sub status_icon($) 3035{ 3036 my ($code) = @_; 3037 my $icon_type; 3038 my $r = HTTP::Response->new($code); 3039 if ($r->is_success()) { 3040 $icon_type = 'error' 3041 ; # if is success but reported, it's because of broken frags => error 3042 } 3043 elsif (&informational($code)) { 3044 $icon_type = 'info'; 3045 } 3046 elsif ($code == 300) { 3047 $icon_type = 'info'; 3048 } 3049 elsif ($code == 401) { 3050 $icon_type = 'error'; 3051 } 3052 elsif ($r->is_redirect()) { 3053 $icon_type = 'warning'; 3054 } 3055 elsif ($r->is_error()) { 3056 $icon_type = 'error'; 3057 } 3058 else { 3059 $icon_type = 'error'; 3060 } 3061 return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>', 3062 URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}), 3063 $icon_type); 3064} 3065 3066sub bgcolor ($) 3067{ 3068 my ($code) = @_; 3069 my $class; 3070 my $r = HTTP::Response->new($code); 3071 if ($r->is_success()) { 3072 return ''; 3073 } 3074 elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) { 3075 $class = 'dubious'; 3076 } 3077 elsif ($code == 300) { 3078 $class = 'multiple'; 3079 } 3080 elsif ($code == 401) { 3081 $class = 'unauthorized'; 3082 } 3083 elsif ($r->is_redirect()) { 3084 $class = 'redirect'; 3085 } 3086 elsif ($r->is_error()) { 3087 $class = 'broken'; 3088 } 3089 else { 3090 $class = 'broken'; 3091 } 3092 return (' class="' . $class . '"'); 3093} 3094 3095sub show_url ($) 3096{ 3097 my ($url) = @_; 3098 return sprintf('<a href="%s">%s</a>', (&encode($url)) x 2); 3099} 3100 3101sub html_footer () 3102{ 3103 printf("<p>%s</p>\n", &global_stats()) 3104 if ($doc_count > 0 && !$Opts{Quiet}); 3105 if (!$doc_count) { 3106 print <<'EOF'; 3107<div class="intro"> 3108 <p> 3109 This Link Checker looks for issues in links, anchors and referenced objects 3110 in a Web page, CSS style sheet, or recursively on a whole Web site. For 3111 best results, it is recommended to first ensure that the documents checked 3112 use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and 3113 <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is 3114 part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and 3115 Quality Web tools</a>. 3116 </p> 3117</div> 3118EOF 3119 } 3120 printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION); 3121</div><!-- main --> 3122<ul class="navbar" id="menu"> 3123 <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li> 3124 <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li> 3125 <li><a href="%s#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li> 3126 <li><a href="http://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li> 3127</ul> 3128<div> 3129<address> 3130%s<br /> %s 3131</address> 3132</div> 3133</body> 3134</html> 3135EOF 3136 return; 3137} 3138 3139sub print_form (\%$$) 3140{ 3141 my ($params, $cookie, $check_num) = @_; 3142 3143 # Split params on \0, see CGI's docs on Vars() 3144 while (my ($key, $value) = each(%$params)) { 3145 if ($value) { 3146 my @vals = split(/\0/, $value, 2); 3147 $params->{$key} = $vals[0]; 3148 } 3149 } 3150 3151 # Override undefined values from the cookie, if we got one. 3152 my $valid_cookie = 0; 3153 if ($cookie) { 3154 my %cookie_values = $cookie->value(); 3155 if (!$cookie_values{clear}) 3156 { # XXX no easy way to check if cookie expired? 3157 $valid_cookie = 1; 3158 while (my ($key, $value) = each(%cookie_values)) { 3159 $params->{$key} = $value unless defined($params->{$key}); 3160 } 3161 } 3162 } 3163 3164 my $chk = ' checked="checked"'; 3165 $params->{hide_type} = 'all' unless $params->{hide_type}; 3166 3167 my $requested_uri = &encode($params->{uri} || ''); 3168 my $sum = $params->{summary} ? $chk : ''; 3169 my $red = $params->{hide_redirects} ? $chk : ''; 3170 my $all = ($params->{hide_type} ne 'dir') ? $chk : ''; 3171 my $dir = $all ? '' : $chk; 3172 my $acc = $params->{no_accept_language} ? $chk : ''; 3173 my $ref = $params->{no_referer} ? $chk : ''; 3174 my $rec = $params->{recursive} ? $chk : ''; 3175 my $dep = &encode($params->{depth} || ''); 3176 3177 my $cookie_options = ''; 3178 if ($valid_cookie) { 3179 $cookie_options = " 3180 <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label> 3181 <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label> 3182 <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>"; 3183 } 3184 else { 3185 $cookie_options = " 3186 <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>"; 3187 } 3188 3189 print "<form action=\"", $Opts{_Self_URI}, 3190 "\" method=\"get\" onsubmit=\"return uriOk($check_num)\" accept-charset=\"UTF-8\"> 3191<p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>) 3192of a document that you would like to check:</label></p> 3193<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"", 3194 $requested_uri, "\" /></p> 3195<fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\"> 3196 <legend class=\"toggletext\">More Options</legend> 3197 <div class=\"options\"> 3198 <p> 3199 <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"", 3200 $sum, " /> Summary only</label> 3201 <br /> 3202 <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"", 3203 $red, 3204 " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label> 3205 <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"", 3206 $all, " /> all</label> 3207 <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"", 3208 $dir, " /> for directories only</label> 3209 <br /> 3210 <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"", 3211 $acc, 3212 " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label> 3213 <br /> 3214 <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"", 3215 $ref, 3216 " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label> 3217 <br /> 3218 <label title=\"Check linked documents recursively (maximum: ", 3219 $Opts{Max_Documents}, 3220 " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"", 3221 $rec, " /> Check linked documents recursively</label>, 3222 <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"", 3223 $dep, "\" /></label> 3224 <br /><br />", $cookie_options, " 3225 </p> 3226 </div> 3227</fieldset> 3228<p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p> 3229</form> 3230<div class=\"intro\" id=\"don_program\"></div> 3231<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script> 3232"; 3233 return; 3234} 3235 3236sub encode (@) 3237{ 3238 return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; 3239} 3240 3241sub hprintf (@) 3242{ 3243 print_doc_header(); 3244 if (!$Opts{HTML}) { 3245 printf(@_); 3246 } 3247 else { 3248 print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1])); 3249 } 3250 return; 3251} 3252 3253# Print the document header, if it hasn't been printed already. 3254# This is invoked before most other output operations, in order 3255# to enable quiet processing that doesn't clutter the output with 3256# "Processing..." messages when nothing else will be reported. 3257sub print_doc_header () 3258{ 3259 if (defined($doc_header)) { 3260 print $doc_header; 3261 undef($doc_header); 3262 } 3263} 3264 3265# Local Variables: 3266# mode: perl 3267# indent-tabs-mode: nil 3268# cperl-indent-level: 4 3269# cperl-continued-statement-offset: 4 3270# cperl-brace-offset: -4 3271# perl-indent-level: 4 3272# End: 3273# ex: ts=4 sw=4 et 3274