1#!/usr/bin/perl -wT
2#
3# W3C Link Checker
4# by Hugo Haas <hugo@w3.org>
5# (c) 1999-2011 World Wide Web Consortium
6# based on Renaud Bruyeron's checklink.pl
7#
8# This program is licensed under the W3C(r) Software License:
9#       http://www.w3.org/Consortium/Legal/copyright-software
10#
11# The documentation is at:
12#       http://validator.w3.org/docs/checklink.html
13#
14# See the Mercurial interface at:
15#       http://dvcs.w3.org/hg/link-checker/
16#
17# An online version is available at:
18#       http://validator.w3.org/checklink
19#
20# Comments and suggestions should be sent to the www-validator mailing list:
21#       www-validator@w3.org (with 'checklink' in the subject)
22#       http://lists.w3.org/Archives/Public/www-validator/ (archives)
23
24use strict;
25use 5.008;
26
27# Get rid of potentially unsafe and unneeded environment variables.
28delete(@ENV{qw(IFS CDPATH ENV BASH_ENV)});
29$ENV{PATH} = undef;
30
31# ...but we want PERL5?LIB honored even in taint mode, see perlsec, perl5lib,
32# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
33use Config qw(%Config);
34use lib map { /(.*)/ }
35    defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
36    defined($ENV{PERLLIB})  ? split(/$Config{path_sep}/, $ENV{PERLLIB}) :
37                              ();
38
39# -----------------------------------------------------------------------------
40
41package W3C::UserAgent;
42
43use LWP::RobotUA 1.19 qw();
44use LWP::UserAgent qw();
45use Net::HTTP::Methods 5.833 qw();    # >= 5.833 for 4kB cookies (#6678)
46
47# if 0, ignore robots exclusion (useful for testing)
48use constant USE_ROBOT_UA => 1;
49
50if (USE_ROBOT_UA) {
51    @W3C::UserAgent::ISA = qw(LWP::RobotUA);
52}
53else {
54    @W3C::UserAgent::ISA = qw(LWP::UserAgent);
55}
56
57sub new
58{
59    my $proto = shift;
60    my $class = ref($proto) || $proto;
61    my ($name, $from, $rules) = @_;
62
63    # For security/privacy reasons, if $from was not given, do not send it.
64    # Cheat by defining something for the constructor, and resetting it later.
65    my $from_ok = $from;
66    $from ||= 'www-validator@w3.org';
67
68    my $self;
69    if (USE_ROBOT_UA) {
70        $self = $class->SUPER::new($name, $from, $rules);
71    }
72    else {
73        my %cnf;
74        @cnf{qw(agent from)} = ($name, $from);
75        $self = LWP::UserAgent->new(%cnf);
76        $self = bless $self, $class;
77    }
78
79    $self->from(undef) unless $from_ok;
80
81    $self->env_proxy();
82
83    $self->allow_private_ips(1);
84
85    $self->protocols_forbidden([qw(mailto javascript)]);
86
87    return $self;
88}
89
90sub allow_private_ips
91{
92    my $self = shift;
93    if (@_) {
94        $self->{Checklink_allow_private_ips} = shift;
95        if (!$self->{Checklink_allow_private_ips}) {
96
97            # Pull in dependencies
98            require Net::IP;
99            require Socket;
100            require Net::hostent;
101        }
102    }
103    return $self->{Checklink_allow_private_ips};
104}
105
106sub redirect_progress_callback
107{
108    my $self = shift;
109    $self->{Checklink_redirect_callback} = shift if @_;
110    return $self->{Checklink_redirect_callback};
111}
112
113sub simple_request
114{
115    my $self = shift;
116
117    my $response = $self->ip_disallowed($_[0]->uri());
118
119    # RFC 2616, section 15.1.3
120    $_[0]->remove_header("Referer")
121        if ($_[0]->referer() &&
122        (!$_[0]->uri()->secure() && URI->new($_[0]->referer())->secure()));
123
124    $response ||= do {
125        local $SIG{__WARN__} =
126            sub {    # Suppress some warnings, rt.cpan.org #18902
127            warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
128            };
129
130        # @@@ Why not just $self->SUPER::simple_request?
131        $self->W3C::UserAgent::SUPER::simple_request(@_);
132    };
133
134    if (!defined($self->{FirstResponse})) {
135        $self->{FirstResponse} = $response->code();
136        $self->{FirstMessage} = $response->message() || '(no message)';
137    }
138
139    return $response;
140}
141
142sub redirect_ok
143{
144    my ($self, $request, $response) = @_;
145
146    if (my $callback = $self->redirect_progress_callback()) {
147
148        # @@@ TODO: when an LWP internal robots.txt request gets redirected,
149        # this will a bit confusingly fire for it too.  Would need a robust
150        # way to determine whether the request is such a LWP "internal
151        # robots.txt" one.
152        &$callback($request->method(), $request->uri());
153    }
154
155    return 0 unless $self->SUPER::redirect_ok($request, $response);
156
157    if (my $res = $self->ip_disallowed($request->uri())) {
158        $response->previous($response->clone());
159        $response->request($request);
160        $response->code($res->code());
161        $response->message($res->message());
162        return 0;
163    }
164
165    return 1;
166}
167
168#
169# Checks whether we're allowed to retrieve the document based on its IP
170# address.  Takes an URI object and returns a HTTP::Response containing the
171# appropriate status and error message if the IP was disallowed, 0
172# otherwise.  URIs without hostname or IP address are always allowed,
173# including schemes where those make no sense (eg. data:, often javascript:).
174#
175sub ip_disallowed
176{
177    my ($self, $uri) = @_;
178    return 0 if $self->allow_private_ips();    # Short-circuit
179
180    my $hostname = undef;
181    eval { $hostname = $uri->host() };    # Not all URIs implement host()...
182    return 0 unless $hostname;
183
184    my $addr = my $iptype = my $resp = undef;
185    if (my $host = Net::hostent::gethostbyname($hostname)) {
186        $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
187        if ($addr && (my $ip = Net::IP->new($addr))) {
188            $iptype = $ip->iptype();
189        }
190    }
191    if ($iptype && $iptype ne 'PUBLIC') {
192        $resp = HTTP::Response->new(403,
193            'Checking non-public IP address disallowed by link checker configuration'
194        );
195        $resp->header('Client-Warning', 'Internal response');
196    }
197    return $resp;
198}
199
200# -----------------------------------------------------------------------------
201
202package W3C::LinkChecker;
203
204use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
205    $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
206
207use CSS::DOM 0.09 qw();    # >= 0.09 for many bugfixes
208use CSS::DOM::Constants qw(:rule);
209use CSS::DOM::Style qw();
210use CSS::DOM::Util qw();
211use Encode qw();
212use HTML::Entities qw();
213use HTML::Parser 3.40 qw();    # >= 3.40 for utf8_mode()
214use HTTP::Headers::Util qw();
215use HTTP::Message 5.827 qw();    # >= 5.827 for content_charset()
216use HTTP::Request 5.814 qw();    # >= 5.814 for accept_decodable()
217use HTTP::Response 1.50 qw();    # >= 1.50 for decoded_content()
218use Time::HiRes qw();
219use URI 1.53 qw();               # >= 1.53 for secure()
220use URI::Escape qw();
221use URI::Heuristic qw();
222
223# @@@ Needs also W3C::UserAgent but can't use() it here.
224
225use constant RC_ROBOTS_TXT          => -1;
226use constant RC_DNS_ERROR           => -2;
227use constant RC_IP_DISALLOWED       => -3;
228use constant RC_PROTOCOL_DISALLOWED => -4;
229
230use constant LINE_UNKNOWN => -1;
231
232use constant MP2 =>
233    (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
234
235# Tag=>attribute mapping of things we treat as links.
236# Note: meta/@http-equiv gets special treatment, see start() for details.
237use constant LINK_ATTRS => {
238    a => ['href'],
239
240    # base/@href intentionally not checked
241    # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
242    area       => ['href'],
243    audio      => ['src'],
244    blockquote => ['cite'],
245    body       => ['background'],
246    command    => ['icon'],
247
248    # button/@formaction not checked (side effects)
249    del => ['cite'],
250
251    # @pluginspage, @pluginurl, @href: pre-HTML5 proprietary
252    embed => ['href', 'pluginspage', 'pluginurl', 'src'],
253
254    # form/@action not checked (side effects)
255    frame  => ['longdesc', 'src'],
256    html   => ['manifest'],
257    iframe => ['longdesc', 'src'],
258    img    => ['longdesc', 'src'],
259
260    # input/@action, input/@formaction not checked (side effects)
261    input  => ['src'],
262    ins    => ['cite'],
263    link   => ['href'],
264    object => ['data'],
265    q      => ['cite'],
266    script => ['src'],
267    source => ['src'],
268    track  => ['src'],
269    video  => ['src', 'poster'],
270};
271
272# Tag=>[separator, attributes] mapping of things we treat as lists of links.
273use constant LINK_LIST_ATTRS => {
274    a      => [qr/\s+/,    ['ping']],
275    applet => [qr/[\s,]+/, ['archive']],
276    area   => [qr/\s+/,    ['ping']],
277    head   => [qr/\s+/,    ['profile']],
278    object => [qr/\s+/,    ['archive']],
279};
280
281# TBD/TODO:
282# - applet/@code?
283# - bgsound/@src?
284# - object/@classid?
285# - isindex/@action?
286# - layer/@background,@src?
287# - ilayer/@background?
288# - table,tr,td,th/@background?
289# - xmp/@href?
290
291@W3C::LinkChecker::ISA = qw(HTML::Parser);
292
293BEGIN {
294
295    # Version info
296    $PACKAGE  = 'W3C Link Checker';
297    $PROGRAM  = 'W3C-checklink';
298    $VERSION  = '4.81';
299    $REVISION = sprintf('version %s (c) 1999-2011 W3C', $VERSION);
300    $AGENT    = sprintf(
301        '%s/%s %s',
302        $PROGRAM, $VERSION,
303        (   W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() :
304                LWP::UserAgent->_agent()
305        )
306    );
307
308    # Pull in mod_perl modules if applicable.
309    eval {
310        local $SIG{__DIE__} = undef;
311        require Apache2::RequestUtil;
312    } if MP2();
313
314    my @content_types = qw(
315        text/html
316        application/xhtml+xml;q=0.9
317        application/vnd.wap.xhtml+xml;q=0.6
318    );
319    $Accept = join(', ', @content_types, '*/*;q=0.5');
320    push(@content_types, 'text/css', 'text/html-sandboxed');
321    my $re = join('|', map { s/;.*//; quotemeta } @content_types);
322    $ContentTypes = qr{\b(?:$re)\b}io;
323
324    # Regexp for matching URL values in CSS.
325    $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
326
327    #
328    # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
329    # been set or the default contains a non-empty file, read it.  Otherwise,
330    # skip silently.
331    #
332    my $defaultconfig = '/etc/w3c/checklink.conf';
333    if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
334
335        require Config::General;
336        Config::General->require_version(2.06);    # Need 2.06 for -SplitPolicy
337
338        my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
339        eval {
340            my %config_opts = (
341                -ConfigFile        => $conffile,
342                -SplitPolicy       => 'equalsign',
343                -AllowMultiOptions => 'no',
344            );
345            %Cfg = Config::General->new(%config_opts)->getall();
346        };
347        if ($@) {
348            die <<"EOF";
349Failed to read configuration from '$conffile':
350$@
351EOF
352        }
353    }
354    $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s';
355    $Cfg{CSS_Validator_URI} ||=
356        'http://jigsaw.w3.org/css-validator/validator?uri=%s';
357    $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html';
358
359    # Untaint config params that are used as the format argument to (s)printf(),
360    # Perl 5.10 does not want to see that in taint mode.
361    ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
362    ($Cfg{CSS_Validator_URI})    = ($Cfg{CSS_Validator_URI}    =~ /^(.*)$/);
363
364    $DocType =
365        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
366    my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
367    my $js_url  = URI->new_abs('linkchecker.js',  $Cfg{Doc_URI});
368    $Head =
369        sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url, $js_url);
370<meta http-equiv="Content-Script-Type" content="text/javascript" />
371<meta name="generator" content="%s" />
372<link rel="stylesheet" type="text/css" href="%s" />
373<script type="text/javascript" src="%s"></script>
374EOF
375
376    # Trusted environment variables that need laundering in taint mode.
377    for (qw(NNTPSERVER NEWSHOST)) {
378        ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
379    }
380
381    # Use passive FTP by default, see Net::FTP(3).
382    $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
383}
384
385# Autoflush
386$| = 1;
387
388# Different options specified by the user
389my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
390my %Opts = (
391    Command_Line    => $cmdline,
392    Quiet           => 0,
393    Summary_Only    => 0,
394    Verbose         => 0,
395    Progress        => 0,
396    HTML            => 0,
397    Timeout         => 30,
398    Redirects       => 1,
399    Dir_Redirects   => 1,
400    Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
401    Cookies         => undef,
402    No_Referer      => 0,
403    Hide_Same_Realm => 0,
404    Depth                    => 0,             # < 0 means unlimited recursion.
405    Sleep_Time               => 1,
406    Connection_Cache_Size    => 2,
407    Max_Documents            => 150,           # For the online version.
408    User                     => undef,
409    Password                 => undef,
410    Base_Locations           => [],
411    Exclude                  => undef,
412    Exclude_Docs             => undef,
413    Suppress_Redirect        => [],
414    Suppress_Redirect_Prefix => [],
415    Suppress_Redirect_Regexp => [],
416    Suppress_Temp_Redirects  => 1,
417    Suppress_Broken          => [],
418    Suppress_Fragment        => [],
419    Masquerade               => 0,
420    Masquerade_From          => '',
421    Masquerade_To            => '',
422    Trusted                  => $Cfg{Trusted},
423    Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
424        $Cfg{Allow_Private_IPs} :
425        $cmdline,
426);
427undef $cmdline;
428
429# Global variables
430# What URI's did we process? (used for recursive mode)
431my %processed;
432
433# Result of the HTTP query
434my %results;
435
436# List of redirects
437my %redirects;
438
439# Count of the number of documents checked
440my $doc_count = 0;
441
442# Time stamp
443my $timestamp = &get_timestamp();
444
445# Per-document header; undefined if already printed.  See print_doc_header().
446my $doc_header;
447
448&parse_arguments() if $Opts{Command_Line};
449
450my $ua = W3C::UserAgent->new($AGENT);    # @@@ TODO: admin address
451
452$ua->conn_cache({total_capacity => $Opts{Connection_Cache_Size}});
453if ($ua->can('delay')) {
454    $ua->delay($Opts{Sleep_Time} / 60);
455}
456$ua->timeout($Opts{Timeout});
457
458# Set up cookie stash if requested
459if (defined($Opts{Cookies})) {
460    require HTTP::Cookies;
461    my $cookie_file = $Opts{Cookies};
462    if ($cookie_file eq 'tmp') {
463        $cookie_file = undef;
464    }
465    elsif ($cookie_file =~ /^(.*)$/) {
466        $cookie_file = $1;    # untaint
467    }
468    $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
469}
470eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); };
471if ($@) {
472    die <<"EOF";
473Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
474Net::hostent modules:
475$@
476EOF
477}
478
479# Add configured forbidden protocols
480if ($Cfg{Forbidden_Protocols}) {
481    my $forbidden = $ua->protocols_forbidden();
482    push(@$forbidden, split(/[,\s]+/, lc($Cfg{Forbidden_Protocols})));
483    $ua->protocols_forbidden($forbidden);
484}
485
486if ($Opts{Command_Line}) {
487
488    require Text::Wrap;
489    Text::Wrap->import('wrap');
490
491    require URI::file;
492
493    &usage(1) unless scalar(@ARGV);
494
495    $Opts{_Self_URI} = 'http://validator.w3.org/checklink';   # For HTML output
496
497    &ask_password() if ($Opts{User} && !$Opts{Password});
498
499    if (!$Opts{Summary_Only}) {
500        printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
501    }
502    else {
503        $Opts{Verbose}  = 0;
504        $Opts{Progress} = 0;
505    }
506
507    # Populate data for print_form()
508    my %params = (
509        summary            => $Opts{Summary_Only},
510        hide_redirects     => !$Opts{Redirects},
511        hide_type          => $Opts{Dir_Redirects} ? 'dir' : 'all',
512        no_accept_language => !(
513            defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto'
514        ),
515        no_referer => $Opts{No_Referer},
516        recursive  => ($Opts{Depth} != 0),
517        depth      => $Opts{Depth},
518    );
519
520    my $check_num = 1;
521    my @bases     = @{$Opts{Base_Locations}};
522    for my $uri (@ARGV) {
523
524        # Reset base locations so that previous URI's given on the command line
525        # won't affect the recursion scope for this URI (see check_uri())
526        @{$Opts{Base_Locations}} = @bases;
527
528        # Transform the parameter into a URI
529        $uri = &urize($uri);
530        $params{uri} = $uri;
531        &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
532        $check_num++;
533    }
534    undef $check_num;
535
536    if ($Opts{HTML}) {
537        &html_footer();
538    }
539    elsif ($doc_count > 0 && !$Opts{Summary_Only}) {
540        printf("\n%s\n", &global_stats());
541    }
542
543}
544else {
545
546    require CGI;
547    require CGI::Carp;
548    CGI::Carp->import(qw(fatalsToBrowser));
549    require CGI::Cookie;
550
551    # file: URIs are not allowed in CGI mode
552    my $forbidden = $ua->protocols_forbidden();
553    push(@$forbidden, 'file');
554    $ua->protocols_forbidden($forbidden);
555
556    my $query = CGI->new();
557
558    for my $param ($query->param()) {
559        my @values = map { Encode::decode_utf8($_) } $query->param($param);
560        $query->param($param, @values);
561    }
562
563    # Set a few parameters in CGI mode
564    $Opts{Verbose}   = 0;
565    $Opts{Progress}  = 0;
566    $Opts{HTML}      = 1;
567    $Opts{_Self_URI} = $query->url(-relative => 1);
568
569    # Backwards compatibility
570    my $uri = undef;
571    if ($uri = $query->param('url')) {
572        $query->param('uri', $uri) unless $query->param('uri');
573        $query->delete('url');
574    }
575    $uri = $query->param('uri');
576
577    if (!$uri) {
578        &html_header('', undef);    # Set cookie only from results page.
579        my %cookies = CGI::Cookie->fetch();
580        &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
581        &html_footer();
582        exit;
583    }
584
585    # Backwards compatibility
586    if ($query->param('hide_dir_redirects')) {
587        $query->param('hide_redirects', 'on');
588        $query->param('hide_type',      'dir');
589        $query->delete('hide_dir_redirects');
590    }
591
592    $Opts{Summary_Only} = 1 if $query->param('summary');
593
594    if ($query->param('hide_redirects')) {
595        $Opts{Dir_Redirects} = 0;
596        if (my $type = $query->param('hide_type')) {
597            $Opts{Redirects} = 0 if ($type ne 'dir');
598        }
599        else {
600            $Opts{Redirects} = 0;
601        }
602    }
603
604    $Opts{Accept_Language} = undef if $query->param('no_accept_language');
605    $Opts{No_Referer} = $query->param('no_referer');
606
607    $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
608    if (my $depth = $query->param('depth')) {
609
610        # @@@ Ignore invalid depth silently for now.
611        $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
612    }
613
614    # Save, clear or leave cookie as is.
615    my $cookie = undef;
616    if (my $action = $query->param('cookie')) {
617        if ($action eq 'clear') {
618
619            # Clear the cookie.
620            $cookie = CGI::Cookie->new(-name => $PROGRAM);
621            $cookie->value({clear => 1});
622            $cookie->expires('-1M');
623        }
624        elsif ($action eq 'set') {
625
626            # Set the options.
627            $cookie = CGI::Cookie->new(-name => $PROGRAM);
628            my %options = $query->Vars();
629            delete($options{$_})
630                for qw(url uri check cookie);    # Non-persistent.
631            $cookie->value(\%options);
632        }
633    }
634    if (!$cookie) {
635        my %cookies = CGI::Cookie->fetch();
636        $cookie = $cookies{$PROGRAM};
637    }
638
639    # Always refresh cookie expiration time.
640    $cookie->expires('+1M') if ($cookie && !$cookie->expires());
641
642    # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
643    # If we're under mod_perl, there is a way around it...
644    eval {
645        local $SIG{__DIE__} = undef;
646        my $auth =
647            Apache2::RequestUtil->request()->headers_in()->{Authorization};
648        $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
649    } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
650
651    $uri =~ s/^\s+//g;
652    if ($uri =~ /:/) {
653        $uri = URI->new($uri);
654    }
655    else {
656        if ($uri =~ m|^//|) {
657            $uri = URI->new("http:$uri");
658        }
659        else {
660            local $ENV{URL_GUESS_PATTERN} = '';
661            my $guess = URI::Heuristic::uf_uri($uri);
662            if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
663                $uri = $guess;
664            }
665            else {
666                $uri = URI->new("http://$uri");
667            }
668        }
669    }
670    $uri = $uri->canonical();
671    $query->param("uri", $uri);
672
673    &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
674    undef $query;    # Not needed any more.
675    &html_footer();
676}
677
678###############################################################################
679
680################################
681# Command line and usage stuff #
682################################
683
684sub parse_arguments ()
685{
686    require Encode::Locale;
687    Encode::Locale::decode_argv();
688
689    require Getopt::Long;
690    Getopt::Long->require_version(2.17);
691    Getopt::Long->import('GetOptions');
692    Getopt::Long::Configure('bundling', 'no_ignore_case');
693    my $masq = '';
694    my @locs = ();
695
696    GetOptions(
697        'help|h|?' => sub { usage(0) },
698        'q|quiet'  => sub {
699            $Opts{Quiet}        = 1;
700            $Opts{Summary_Only} = 1;
701        },
702        's|summary' => \$Opts{Summary_Only},
703        'b|broken'  => sub {
704            $Opts{Redirects}     = 0;
705            $Opts{Dir_Redirects} = 0;
706        },
707        'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
708        'v|verbose'       => \$Opts{Verbose},
709        'i|indicator'     => \$Opts{Progress},
710        'H|html'          => \$Opts{HTML},
711        'r|recursive'     => sub {
712            $Opts{Depth} = -1
713                if $Opts{Depth} == 0;
714        },
715        'l|location=s'                => \@locs,
716        'X|exclude=s'                 => \$Opts{Exclude},
717        'exclude-docs=s@'             => \@{$Opts{Exclude_Docs}},
718        'suppress-redirect=s@'        => \@{$Opts{Suppress_Redirect}},
719        'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
720        'suppress-temp-redirects'     => \$Opts{Suppress_Temp_Redirects},
721        'suppress-broken=s@'          => \@{$Opts{Suppress_Broken}},
722        'suppress-fragment=s@'        => \@{$Opts{Suppress_Fragment}},
723        'u|user=s'                    => \$Opts{User},
724        'p|password=s'                => \$Opts{Password},
725        't|timeout=i'                 => \$Opts{Timeout},
726        'C|connection-cache=i'        => \$Opts{Connection_Cache_Size},
727        'S|sleep=i'                   => \$Opts{Sleep_Time},
728        'L|languages=s'               => \$Opts{Accept_Language},
729        'c|cookies=s'                 => \$Opts{Cookies},
730        'R|no-referer'                => \$Opts{No_Referer},
731        'D|depth=i'                   => sub {
732            $Opts{Depth} = $_[1]
733                unless $_[1] == 0;
734        },
735        'd|domain=s'      => \$Opts{Trusted},
736        'masquerade=s'    => \$masq,
737        'hide-same-realm' => \$Opts{Hide_Same_Realm},
738        'V|version'       => \&version,
739        ) ||
740        usage(1);
741
742    if ($masq) {
743        $Opts{Masquerade} = 1;
744        my @masq = split(/\s+/, $masq);
745        if (scalar(@masq) != 2 ||
746            !defined($masq[0]) ||
747            $masq[0] !~ /\S/ ||
748            !defined($masq[1]) ||
749            $masq[1] !~ /\S/)
750        {
751            usage(1,
752                "Error: --masquerade takes two whitespace separated URIs.");
753        }
754        else {
755            require URI::file;
756            $Opts{Masquerade_From} = $masq[0];
757            my $u = URI->new($masq[1]);
758            $Opts{Masquerade_To} =
759                $u->scheme() ? $u : URI::file->new_abs($masq[1]);
760        }
761    }
762
763    if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
764        $Opts{Accept_Language} = &guess_language();
765    }
766
767    if (($Opts{Sleep_Time} || 0) < 1) {
768        warn(
769            "*** Warning: minimum allowed sleep time is 1 second, resetting.\n"
770        );
771        $Opts{Sleep_Time} = 1;
772    }
773
774    push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
775
776    $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
777
778    # Precompile/error-check regular expressions.
779    if (defined($Opts{Exclude})) {
780        eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
781        &usage(1, "Error in exclude regexp: $@") if $@;
782    }
783    for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
784        eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
785        &usage(1, "Error in exclude-docs regexp: $@") if $@;
786    }
787    if (defined($Opts{Trusted})) {
788        eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
789        &usage(1, "Error in trusted domains regexp: $@") if $@;
790    }
791
792    # Sanity-check error-suppression arguments
793    for my $i (0 .. $#{$Opts{Suppress_Redirect}}) {
794        ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
795        my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
796        if ($sr_arg !~ /.->./) {
797            &usage(1,
798                "Bad suppress-redirect argument, should contain \"->\": $sr_arg"
799            );
800        }
801    }
802    for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) {
803        my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
804        $srp_arg =~ s/ /->/;
805        if ($srp_arg !~ /^(.*)->(.*)$/) {
806            &usage(1,
807                "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"
808            );
809        }
810
811        # Turn prefixes into a regexp.
812        ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
813    }
814    for my $i (0 .. $#{$Opts{Suppress_Broken}}) {
815        ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
816        my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
817        if ($sb_arg !~ /^(-1|[0-9]+):./) {
818            &usage(1,
819                "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"
820            );
821        }
822    }
823    for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
824        if ($sf_arg !~ /.#./) {
825            &usage(1,
826                "Bad suppress-fragment argument, should contain \"#\": $sf_arg"
827            );
828        }
829    }
830
831    return;
832}
833
834sub version ()
835{
836    print "$PACKAGE $REVISION\n";
837    exit 0;
838}
839
840sub usage ()
841{
842    my ($exitval, $msg) = @_;
843    $exitval = 0 unless defined($exitval);
844    $msg ||= '';
845    $msg =~ s/[\r\n]*$/\n\n/ if $msg;
846
847    die($msg) unless $Opts{Command_Line};
848
849    my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
850
851    select(STDERR) if $exitval;
852    print "$msg$PACKAGE $REVISION
853
854Usage: checklink <options> <uris>
855Options:
856 -s, --summary              Result summary only.
857 -b, --broken               Show only the broken links, not the redirects.
858 -e, --directory            Hide directory redirects, for example
859                            http://www.w3.org/TR -> http://www.w3.org/TR/
860 -r, --recursive            Check the documents linked from the first one.
861 -D, --depth N              Check the documents linked from the first one to
862                            depth N (implies --recursive).
863 -l, --location URI         Scope of the documents checked in recursive mode
864                            (implies --recursive).  Can be specified multiple
865                            times.  If not specified, the default eg. for
866                            http://www.w3.org/TR/html4/Overview.html
867                            would be http://www.w3.org/TR/html4/
868 -X, --exclude REGEXP       Do not check links whose full, canonical URIs
869                            match REGEXP; also limits recursion the same way
870                            as --exclude-docs with the same regexp would.
871 --exclude-docs REGEXP      In recursive mode, do not check links in documents
872                            whose full, canonical URIs match REGEXP.  This
873                            option may be specified multiple times.
874 --suppress-redirect URI->URI  Do not report a redirect from the first to the
875                            second URI.  This option may be specified multiple
876                            times.
877 --suppress-redirect-prefix URI->URI  Do not report a redirect from a child of
878                            the first URI to the same child of the second URI.
879                            This option may be specified multiple times.
880 --suppress-temp-redirects  Suppress warnings about temporary redirects.
881 --suppress-broken CODE:URI  Do not report a broken link with the given CODE.
882                            CODE is HTTP response, or -1 for robots exclusion.
883                            This option may be specified multiple times.
884 --suppress-fragment URI    Do not report the given broken fragment URI.
885                            A fragment URI contains \"#\".  This option may be
886                            specified multiple times.
887 -L, --languages LANGS      Accept-Language header to send.  The special value
888                            'auto' causes autodetection from the environment.
889 -c, --cookies FILE         Use cookies, load/save them in FILE.  The special
890                            value 'tmp' causes non-persistent use of cookies.
891 -R, --no-referer           Do not send the Referer HTTP header.
892 -q, --quiet                No output if no errors are found (implies -s).
893 -v, --verbose              Verbose mode.
894 -i, --indicator            Show percentage of lines processed while parsing.
895 -u, --user USERNAME        Specify a username for authentication.
896 -p, --password PASSWORD    Specify a password.
897 --hide-same-realm          Hide 401's that are in the same realm as the
898                            document checked.
899 -S, --sleep SECS           Sleep SECS seconds between requests to each server
900                            (default and minimum: 1 second).
901 -t, --timeout SECS         Timeout for requests in seconds (default: 30).
902 -d, --domain DOMAIN        Regular expression describing the domain to which
903                            authentication information will be sent
904                            (default: $trust).
905 --masquerade \"BASE1 BASE2\" Masquerade base URI BASE1 as BASE2.  See the
906                            manual page for more information.
907 -H, --html                 HTML output.
908 -?, -h, --help             Show this message and exit.
909 -V, --version              Output version information and exit.
910
911See \"perldoc LWP\" for information about proxy server support,
912\"perldoc Net::FTP\" for information about various environment variables
913affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
914NNTP server for news: URIs.
915
916The W3C_CHECKLINK_CFG environment variable can be used to set the
917configuration file to use.  See details in the full manual page, it can
918be displayed with: perldoc checklink
919
920More documentation at: $Cfg{Doc_URI}
921Please send bug reports and comments to the www-validator mailing list:
922  www-validator\@w3.org (with 'checklink' in the subject)
923  Archives are at: http://lists.w3.org/Archives/Public/www-validator/
924";
925    exit $exitval;
926}
927
928sub ask_password ()
929{
930    eval {
931        local $SIG{__DIE__} = undef;
932        require Term::ReadKey;
933        Term::ReadKey->require_version(2.00);
934        Term::ReadKey->import(qw(ReadMode));
935    };
936    if ($@) {
937        warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
938                "password input disabled.\n");
939        return;
940    }
941    printf(STDERR 'Enter the password for user %s: ', $Opts{User});
942    ReadMode('noecho', *STDIN);
943    chomp($Opts{Password} = <STDIN>);
944    ReadMode('restore', *STDIN);
945    print(STDERR "ok.\n");
946    return;
947}
948
949###############################################################################
950
951###########################################################################
952# Guess an Accept-Language header based on the $LANG environment variable #
953###########################################################################
954
955sub guess_language ()
956{
957    my $lang = $ENV{LANG} or return;
958
959    $lang =~ s/[\.@].*$//;    # en_US.UTF-8, fi_FI@euro...
960
961    return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
962
963    my $res = undef;
964    eval {
965        require Locale::Language;
966        if (my $tmp = Locale::Language::language2code($lang)) {
967            $lang = $tmp;
968        }
969        if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
970            if (Locale::Language::code2language($l)) {
971                $res = $l;
972                if ($c) {
973                    require Locale::Country;
974                    $res .= "-$c" if Locale::Country::code2country($c);
975                }
976            }
977        }
978    };
979    return $res;
980}
981
982############################
983# Transform foo into a URI #
984############################
985
986sub urize ($)
987{
988    my $arg  = shift;
989    my $uarg = URI::Escape::uri_unescape($arg);
990    my $uri;
991    if (-d $uarg) {
992
993        # look for an "index" file in dir, return it if found
994        require File::Spec;
995        for my $index (map { File::Spec->catfile($uarg, $_) }
996            qw(index.html index.xhtml index.htm index.xhtm))
997        {
998            if (-e $index) {
999                $uri = URI::file->new_abs($index);
1000                last;
1001            }
1002        }
1003
1004        # return dir itself if an index file was not found
1005        $uri ||= URI::file->new_abs($uarg);
1006    }
1007    elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
1008        $uri = URI::file->new_abs($uarg);
1009    }
1010    else {
1011        my $newuri = URI->new($arg);
1012        if ($newuri->scheme()) {
1013            $uri = $newuri;
1014        }
1015        else {
1016            local $ENV{URL_GUESS_PATTERN} = '';
1017            $uri = URI::Heuristic::uf_uri($arg);
1018            $uri = URI::file->new_abs($uri) unless $uri->scheme();
1019        }
1020    }
1021    return $uri->canonical();
1022}
1023
1024########################################
1025# Check for broken links in a resource #
1026########################################
1027
1028sub check_uri (\%\$$$$;\$$)
1029{
1030    my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
1031    $is_start ||= ($check_num == 1);
1032
1033    my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
1034
1035    # Get and parse the document
1036    my $response = &get_document(
1037        'GET',   $uri,    $doc_count, \%redirects, $referer,
1038        $cookie, $params, $check_num, $is_start
1039    );
1040
1041    # Can we check the resource? If not, we exit here...
1042    return if defined($response->{Stop});
1043
1044    if ($Opts{HTML}) {
1045        &html_header($uri, $cookie) if ($check_num == 1);
1046        &print_form($params, $cookie, $check_num) if $is_start;
1047    }
1048
1049    if ($is_start) { # Starting point of a new check, eg. from the command line
1050          # Use the first URI as the recursion base unless specified otherwise.
1051        push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
1052            unless @{$Opts{Base_Locations}};
1053    }
1054    else {
1055
1056        # Before fetching the document, we don't know if we'll be within the
1057        # recursion scope or not (think redirects).
1058        if (!&in_recursion_scope($response->{absolute_uri})) {
1059            hprintf("Not in recursion scope: %s\n")
1060                if ($Opts{Verbose});
1061            $response->content("");
1062            return;
1063        }
1064    }
1065
1066    # Define the document header, and perhaps print it.
1067    # (It might still be defined if the previous document had no errors;
1068    # just redefine it in that case.)
1069
1070    if ($check_num != 1) {
1071        if ($Opts{HTML}) {
1072            $doc_header = "\n<hr />\n";
1073        }
1074        else {
1075            $doc_header = "\n" . ('-' x 40) . "\n";
1076        }
1077    }
1078
1079    if ($Opts{HTML}) {
1080        $doc_header .=
1081            ("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
1082                "\n</h2>\n\n");
1083    }
1084    else {
1085        $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
1086    }
1087
1088    if (!$Opts{Quiet}) {
1089        print_doc_header();
1090    }
1091
1092    # We are checking a new document
1093    $doc_count++;
1094
1095    my $result_anchor = 'results' . $doc_count;
1096
1097    if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
1098        my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
1099        my $acclang = $Opts{Accept_Language} || '(not sent)';
1100        my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
1101        my $cookies = 'not used';
1102        if (defined($Opts{Cookies})) {
1103            $cookies = 'used, ';
1104            if ($Opts{Cookies} eq 'tmp') {
1105                $cookies .= 'non-persistent';
1106            }
1107            else {
1108                $cookies .= "file $Opts{Cookies}";
1109            }
1110        }
1111        printf(
1112            <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
1113
1114Settings used:
1115- Accept: %s
1116- Accept-Language: %s
1117- Referer: %s
1118- Cookies: %s
1119- Sleeping %d second%s between requests to each server
1120EOF
1121        printf("- Excluding links matching %s\n", $Opts{Exclude})
1122            if defined($Opts{Exclude});
1123        printf("- Excluding links in documents whose URIs match %s\n",
1124            join(', ', @{$Opts{Exclude_Docs}}))
1125            if @{$Opts{Exclude_Docs}};
1126    }
1127
1128    if ($Opts{HTML}) {
1129        if (!$Opts{Summary_Only}) {
1130            my $accept       = &encode($Accept);
1131            my $acclang      = &encode($Opts{Accept_Language} || '(not sent)');
1132            my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
1133            my $s            = $Opts{Sleep_Time} == 1 ? '' : 's';
1134            printf(
1135                <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
1136<div class="settings">
1137Settings used:
1138 <ul>
1139  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li>
1140  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li>
1141  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li>
1142  <li>Sleeping %d second%s between requests to each server</li>
1143 </ul>
1144</div>
1145EOF
1146            printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
1147                $result_anchor);
1148            my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
1149                "^A-Za-z0-9.");
1150            print "<p>For reliable link checking results, check ";
1151
1152            if (!$response->{IsCss}) {
1153                printf("<a href=\"%s\">HTML validity</a> and ",
1154                    &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
1155            }
1156            printf(
1157                "<a href=\"%s\">CSS validity</a> first.</p>
1158<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
1159                &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
1160                &encode($Opts{_Self_URI})
1161            );
1162
1163            printf(<<'EOF', $result_anchor);
1164<div class="progress" id="progress%s">
1165<h3>Status: <span></span></h3>
1166<div class="progressbar"><div></div></div>
1167<pre>
1168EOF
1169        }
1170    }
1171
1172    if ($Opts{Summary_Only} && !$Opts{Quiet}) {
1173        print '<p>' if $Opts{HTML};
1174        print 'This may take some time';
1175        print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>"
1176            if $Opts{HTML};
1177        print " if the document has many links to check.\n" unless $Opts{HTML};
1178    }
1179
1180    # Record that we have processed this resource
1181    $processed{$response->{absolute_uri}} = 1;
1182
1183    # Parse the document
1184    my $p =
1185        &parse_document($uri, $response->base(), $response, 1, ($depth != 0));
1186    my $base = URI->new($p->{base});
1187
1188    # Check anchors
1189    ###############
1190
1191    print "Checking anchors...\n" unless $Opts{Summary_Only};
1192
1193    my %errors;
1194    while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
1195        if (!length($anchor)) {
1196
1197            # Empty IDREF's are not allowed
1198            $errors{$anchor} = 1;
1199        }
1200        else {
1201            my $times = 0;
1202            $times += $_ for values(%$lines);
1203
1204            # They should appear only once
1205            $errors{$anchor} = 1 if ($times > 1);
1206        }
1207    }
1208    print " done.\n" unless $Opts{Summary_Only};
1209
1210    # Check links
1211    #############
1212
1213    &hprintf("Recording all the links found: %d\n",
1214        scalar(keys %{$p->{Links}}))
1215        if ($Opts{Verbose});
1216    my %links;
1217    my %hostlinks;
1218
1219    # Record all the links found
1220    while (my ($link, $lines) = each(%{$p->{Links}})) {
1221        my $link_uri = URI->new($link);
1222        my $abs_link_uri = URI->new_abs($link_uri, $base);
1223
1224        if ($Opts{Masquerade}) {
1225            if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
1226                print_doc_header();
1227                printf("processing %s in base %s\n",
1228                    $abs_link_uri, $Opts{Masquerade_To});
1229                my $nlink = $abs_link_uri;
1230                $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
1231                $abs_link_uri = URI->new($nlink);
1232            }
1233        }
1234
1235        my $canon_uri = URI->new($abs_link_uri->canonical());
1236        my $fragment  = $canon_uri->fragment(undef);
1237        if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
1238            if (!exists($links{$canon_uri})) {
1239                my $hostport;
1240                $hostport = $canon_uri->host_port()
1241                    if $canon_uri->can('host_port');
1242                $hostport = '' unless defined $hostport;
1243                push(@{$hostlinks{$hostport}}, $canon_uri);
1244            }
1245            for my $line_num (keys(%$lines)) {
1246                if (!defined($fragment) || !length($fragment)) {
1247
1248                    # Document without fragment
1249                    $links{$canon_uri}{location}{$line_num} = 1;
1250                }
1251                else {
1252
1253                    # Resource with a fragment
1254                    $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
1255                }
1256            }
1257        }
1258    }
1259
1260    my @order = &distribute_links(\%hostlinks);
1261    undef %hostlinks;
1262
1263    # Build the list of broken URI's
1264
1265    my $nlinks = scalar(@order);
1266
1267    &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
1268        if ($Opts{Verbose});
1269
1270    my %broken;
1271    my $link_num = 0;
1272    for my $u (@order) {
1273        my $ulinks = $links{$u};
1274
1275        if ($Opts{Summary_Only}) {
1276
1277            # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
1278            print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
1279        }
1280        else {
1281            &hprintf("\nChecking link %s\n", $u);
1282            my $progress = ($link_num / $nlinks) * 100;
1283            printf(
1284                '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>',
1285                $result_anchor, &encode($u), $progress)
1286                if (!$Opts{Command_Line} &&
1287                $Opts{HTML} &&
1288                !$Opts{Summary_Only});
1289        }
1290        $link_num++;
1291
1292        # Check that a link is valid
1293        &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)),
1294            \%links, \%redirects);
1295        &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
1296            if ($Opts{Verbose});
1297        if ($results{$u}{location}{success}) {
1298
1299            # Even though it was not broken, we might want to display it
1300            # on the results page (e.g. because it required authentication)
1301            $broken{$u}{location} = 1
1302                if ($results{$u}{location}{display} >= 400);
1303
1304            # List the broken fragments
1305            while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
1306
1307                my $fragment_ok = $results{$u}{fragments}{$fragment};
1308
1309                if ($Opts{Verbose}) {
1310                    my @line_nums = sort { $a <=> $b } keys(%$lines);
1311                    &hprintf(
1312                        "\t\t%s %s - Line%s: %s\n",
1313                        $fragment,
1314                        $fragment_ok             ? 'OK' : 'Not found',
1315                        (scalar(@line_nums) > 1) ? 's'  : '',
1316                        join(', ', @line_nums)
1317                    );
1318                }
1319
1320                # A broken fragment?
1321                $broken{$u}{fragments}{$fragment} += 2 unless $fragment_ok;
1322            }
1323        }
1324        elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code})))
1325        {
1326
1327            # Couldn't find the document
1328            $broken{$u}{location} = 1;
1329
1330            # All the fragments associated are hence broken
1331            for my $fragment (keys %{$ulinks->{fragments}}) {
1332                $broken{$u}{fragments}{$fragment}++;
1333            }
1334        }
1335    }
1336    &hprintf(
1337        "\nProcessed in %s seconds.\n",
1338        &time_diff($start, &get_timestamp())
1339    ) unless $Opts{Summary_Only};
1340    printf(
1341        '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>',
1342        $result_anchor, &time_diff($start, &get_timestamp()))
1343        if ($Opts{HTML} && !$Opts{Summary_Only});
1344
1345    # Display results
1346    if ($Opts{HTML} && !$Opts{Summary_Only}) {
1347        print("</pre>\n</div>\n");
1348        printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
1349    }
1350    print "\n" unless $Opts{Quiet};
1351
1352    &links_summary(\%links, \%results, \%broken, \%redirects);
1353    &anchors_summary($p->{Anchors}, \%errors);
1354
1355    # Do we want to process other documents?
1356    if ($depth != 0) {
1357
1358        for my $u (map { URI->new($_) } keys %links) {
1359
1360            next unless $results{$u}{location}{success};    # Broken link?
1361
1362            next unless &in_recursion_scope($u);
1363
1364            # Do we understand its content type?
1365            next unless ($results{$u}{location}{type} =~ $ContentTypes);
1366
1367            # Have we already processed this URI?
1368            next if &already_processed($u, $uri);
1369
1370            # Do the job
1371            print "\n" unless $Opts{Quiet};
1372            if ($Opts{HTML}) {
1373                if (!$Opts{Command_Line}) {
1374                    if ($doc_count == $Opts{Max_Documents}) {
1375                        print(
1376                            "<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"
1377                        );
1378                    }
1379                    if ($doc_count >= $Opts{Max_Documents}) {
1380                        $doc_count++;
1381                        print("<p>Not checking <strong>$u</strong></p>\n");
1382                        $processed{$u} = 1;
1383                        next;
1384                    }
1385                }
1386            }
1387
1388            # This is an inherently recursive algorithm, so Perl's warning is not
1389            # helpful.  You may wish to comment this out when debugging, though.
1390            no warnings 'recursion';
1391
1392            if ($depth < 0) {
1393                &check_uri($params, $u, 0, -1, $cookie, $uri);
1394            }
1395            else {
1396                &check_uri($params, $u, 0, $depth - 1, $cookie, $uri);
1397            }
1398        }
1399    }
1400    return;
1401}
1402
1403###############################################################
1404# Distribute links based on host:port to avoid RobotUA delays #
1405###############################################################
1406
1407sub distribute_links(\%)
1408{
1409    my $hostlinks = shift;
1410
1411    # Hosts ordered by weight (number of links), descending
1412    my @order =
1413        sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) }
1414        keys %$hostlinks;
1415
1416    # All link list flattened into one, in host weight order
1417    my @all;
1418    push(@all, @{$hostlinks->{$_}}) for @order;
1419
1420    return @all if (scalar(@order) < 2);
1421
1422    # Indexes and chunk size for "zipping" the end result list
1423    my $num = scalar(@{$hostlinks->{$order[0]}});
1424    my @indexes = map { $_ * $num } (0 .. $num - 1);
1425
1426    # Distribute them
1427    my @result;
1428    while (my @chunk = splice(@all, 0, $num)) {
1429        @result[@indexes] = @chunk;
1430        @indexes = map { $_ + 1 } @indexes;
1431    }
1432
1433    # Weed out undefs
1434    @result = grep(defined, @result);
1435
1436    return @result;
1437}
1438
1439##########################################
1440# Decode Content-Encodings in a response #
1441##########################################
1442
1443sub decode_content ($)
1444{
1445    my $response = shift;
1446    my $error    = undef;
1447
1448    my $docref = $response->decoded_content(ref => 1);
1449    if (defined($docref)) {
1450        utf8::encode($$docref);
1451        $response->content_ref($docref);
1452
1453        # Remove Content-Encoding so it won't be decoded again later.
1454        $response->remove_header('Content-Encoding');
1455    }
1456    else {
1457        my $ce = $response->header('Content-Encoding');
1458        $ce = defined($ce) ? "'$ce'" : 'undefined';
1459        my $ct = $response->header('Content-Type');
1460        $ct = defined($ct) ? "'$ct'" : 'undefined';
1461        my $request_uri = $response->request->url;
1462
1463        my $cs = $response->content_charset();
1464        $cs = defined($cs) ? "'$cs'" : 'unknown';
1465        $error =
1466            "Error decoding document at <$request_uri>, Content-Type $ct, " .
1467            "Content-Encoding $ce, content charset $cs: '$@'";
1468    }
1469    return $error;
1470}
1471
1472#######################################
1473# Get and parse a resource to process #
1474#######################################
1475
1476sub get_document ($\$$;\%\$$$$$)
1477{
1478    my ($method, $uri,    $in_recursion, $redirects, $referer,
1479        $cookie, $params, $check_num,    $is_start
1480    ) = @_;
1481
1482    # $method contains the HTTP method the use (GET or HEAD)
1483    # $uri object contains the identifier of the resource
1484    # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
1485    #                        the second resource checked)
1486    # $redirects is a pointer to the hash containing the map of the redirects
1487    # $referer is the URI object of the referring document
1488    # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
1489    #                  and the form if $in_recursion == 0 and not authenticating
1490
1491    # Get the resource
1492    my $response;
1493    if (defined($results{$uri}{response}) &&
1494        !($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
1495    {
1496        $response = $results{$uri}{response};
1497    }
1498    else {
1499        $response = &get_uri($method, $uri, $referer);
1500        &record_results($uri, $method, $response, $referer);
1501        &record_redirects($redirects, $response);
1502    }
1503    if (!$response->is_success()) {
1504        if (!$in_recursion) {
1505
1506            # Is it too late to request authentication?
1507            if ($response->code() == 401) {
1508                &authentication($response, $cookie, $params, $check_num,
1509                    $is_start);
1510            }
1511            else {
1512                if ($Opts{HTML}) {
1513                    &html_header($uri, $cookie) if ($check_num == 1);
1514                    &print_form($params, $cookie, $check_num) if $is_start;
1515                    print "<p>", &status_icon($response->code());
1516                }
1517                &hprintf("\nError: %d %s\n",
1518                    $response->code(), $response->message() || '(no message)');
1519                print "</p>\n" if $Opts{HTML};
1520            }
1521        }
1522        $response->{Stop} = 1;
1523        $response->content("");
1524        return ($response);
1525    }
1526
1527    # What is the URI of the resource that we are processing by the way?
1528    my $base_uri    = $response->base();
1529    my $request_uri = URI->new($response->request->url);
1530    $response->{absolute_uri} = $request_uri->abs($base_uri);
1531
1532    # Can we parse the document?
1533    my $failed_reason;
1534    my $ct = $response->header('Content-Type');
1535    if (!$ct || $ct !~ $ContentTypes) {
1536        $failed_reason = "Content-Type for <$request_uri> is " .
1537            (defined($ct) ? "'$ct'" : 'undefined');
1538    }
1539    else {
1540        $failed_reason = decode_content($response);
1541    }
1542    if ($failed_reason) {
1543
1544        # No, there is a problem...
1545        if (!$in_recursion) {
1546            if ($Opts{HTML}) {
1547                &html_header($uri, $cookie) if ($check_num == 1);
1548                &print_form($params, $cookie, $check_num) if $is_start;
1549                print "<p>", &status_icon(406);
1550
1551            }
1552            &hprintf("Can't check links: %s.\n", $failed_reason);
1553            print "</p>\n" if $Opts{HTML};
1554        }
1555        $response->{Stop} = 1;
1556        $response->content("");
1557    }
1558
1559    # Ok, return the information
1560    return ($response);
1561}
1562
1563#########################################################
1564# Check whether a URI is within the scope of recursion. #
1565#########################################################
1566
1567sub in_recursion_scope (\$)
1568{
1569    my ($uri) = @_;
1570    return 0 unless $uri;
1571
1572    my $candidate = $uri->canonical();
1573
1574    return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
1575
1576    for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
1577        return 0 if ($candidate =~ $excluded_doc);
1578    }
1579
1580    for my $base (@{$Opts{Base_Locations}}) {
1581        my $rel = $candidate->rel($base);
1582        next if ($candidate eq $rel);    # Relative path not possible?
1583        next if ($rel =~ m|^(\.\.)?/|);  # Relative path upwards?
1584        return 1;
1585    }
1586
1587    return 0;    # We always have at least one base location, but none matched.
1588}
1589
1590#################################
1591# Check for content type match. #
1592#################################
1593
1594sub is_content_type ($$)
1595{
1596    my ($candidate, $type) = @_;
1597    return 0 unless ($candidate && $type);
1598    my @v = HTTP::Headers::Util::split_header_words($candidate);
1599    return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;
1600}
1601
1602##################################################
1603# Check whether a URI has already been processed #
1604##################################################
1605
1606sub already_processed (\$\$)
1607{
1608    my ($uri, $referer) = @_;
1609
1610    # Don't be verbose for that part...
1611    my $summary_value = $Opts{Summary_Only};
1612    $Opts{Summary_Only} = 1;
1613
1614    # Do a GET: if it fails, we stop, if not, the results are cached
1615    my $response = &get_document('GET', $uri, 1, undef, $referer);
1616
1617    # ... but just for that part
1618    $Opts{Summary_Only} = $summary_value;
1619
1620    # Can we process the resource?
1621    return -1 if defined($response->{Stop});
1622
1623    # Have we already processed it?
1624    return 1 if defined($processed{$response->{absolute_uri}->as_string()});
1625
1626    # It's not processed yet and it is processable: return 0
1627    return 0;
1628}
1629
1630############################
1631# Get the content of a URI #
1632############################
1633
1634sub get_uri ($\$;\$$\%$$$$)
1635{
1636
1637    # Here we have a lot of extra parameters in order not to lose information
1638    # if the function is called several times (401's)
1639    my ($method, $uri,   $referer, $start, $redirects,
1640        $code,   $realm, $message, $auth
1641    ) = @_;
1642
1643    # $method contains the method used
1644    # $uri object contains the target of the request
1645    # $referer is the URI object of the referring document
1646    # $start is a timestamp (not defined the first time the function is called)
1647    # $redirects is a map of redirects
1648    # $code is the first HTTP return code
1649    # $realm is the realm of the request
1650    # $message is the HTTP message received
1651    # $auth equals 1 if we want to send out authentication information
1652
1653    # For timing purposes
1654    $start = &get_timestamp() unless defined($start);
1655
1656    # Prepare the query
1657
1658    # Do we want printouts of progress?
1659    my $verbose_progress =
1660        !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
1661
1662    &hprintf("%s %s ", $method, $uri) if $verbose_progress;
1663
1664    my $request = HTTP::Request->new($method, $uri);
1665
1666    $request->header('Accept-Language' => $Opts{Accept_Language})
1667        if $Opts{Accept_Language};
1668    $request->header('Accept', $Accept);
1669    $request->accept_decodable();
1670
1671    # Are we providing authentication info?
1672    if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
1673        if (defined($ENV{HTTP_AUTHORIZATION})) {
1674            $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
1675        }
1676        elsif (defined($Opts{User}) && defined($Opts{Password})) {
1677            $request->authorization_basic($Opts{User}, $Opts{Password});
1678        }
1679    }
1680
1681    # Tell the user agent if we want progress reports for redirects or not.
1682    $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
1683        if $verbose_progress;
1684
1685    # Set referer
1686    $request->referer($referer) if (!$Opts{No_Referer} && $referer);
1687
1688    # Telling caches in the middle we want a fresh copy (Bug 4998)
1689    $request->header(Cache_Control => "max-age=0");
1690
1691    # Do the query
1692    my $response = $ua->request($request);
1693
1694    # Get the results
1695    # Record the very first response
1696    if (!defined($code)) {
1697        ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
1698    }
1699
1700    # Authentication requested?
1701    if ($response->code() == 401 &&
1702        !defined($auth) &&
1703        (defined($ENV{HTTP_AUTHORIZATION}) ||
1704            (defined($Opts{User}) && defined($Opts{Password})))
1705        )
1706    {
1707
1708        # Set host as trusted domain unless we already have one.
1709        if (!$Opts{Trusted}) {
1710            my $re = sprintf('^%s$', quotemeta($response->base()->host()));
1711            $Opts{Trusted} = qr/$re/io;
1712        }
1713
1714        # Deal with authentication and avoid loops
1715        if (!defined($realm) &&
1716            $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/)
1717        {
1718            $realm = $1;
1719        }
1720
1721        print "\n" if $verbose_progress;
1722        return &get_uri($method, $response->request()->url(),
1723            $referer, $start, $redirects, $code, $realm, $message, 1);
1724    }
1725
1726    # @@@ subtract robot delay from the "fetched in" time?
1727    &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
1728        if $verbose_progress;
1729
1730    $response->{IsCss} =
1731        is_content_type($response->content_type(), "text/css");
1732    $response->{Realm} = $realm if defined($realm);
1733
1734    return $response;
1735}
1736
1737#########################################
1738# Record the results of an HTTP request #
1739#########################################
1740
1741sub record_results (\$$$$)
1742{
1743    my ($uri, $method, $response, $referer) = @_;
1744    $results{$uri}{referer}        = $referer;
1745    $results{$uri}{response}       = $response;
1746    $results{$uri}{method}         = $method;
1747    $results{$uri}{location}{code} = $response->code();
1748    $results{$uri}{location}{code} = RC_ROBOTS_TXT()
1749        if ($results{$uri}{location}{code} == 403 &&
1750        $response->message() =~ /Forbidden by robots\.txt/);
1751    $results{$uri}{location}{code} = RC_IP_DISALLOWED()
1752        if ($results{$uri}{location}{code} == 403 &&
1753        $response->message() =~ /non-public IP/);
1754    $results{$uri}{location}{code} = RC_DNS_ERROR()
1755        if ($results{$uri}{location}{code} == 500 &&
1756        $response->message() =~ /Bad hostname '[^\']*'/);
1757    $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
1758        if ($results{$uri}{location}{code} == 500 &&
1759        $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
1760    $results{$uri}{location}{type}    = $response->header('Content-type');
1761    $results{$uri}{location}{display} = $results{$uri}{location}{code};
1762
1763    # Rewind, check for the original code and message.
1764    for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
1765        $results{$uri}{location}{orig}         = $tmp->code();
1766        $results{$uri}{location}{orig_message} = $tmp->message() ||
1767            '(no message)';
1768    }
1769    $results{$uri}{location}{success} = $response->is_success();
1770
1771    # If a suppressed broken link, fill the data structure like a typical success.
1772    # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
1773    if (!$results{$uri}{location}{success}) {
1774        my $code = $results{$uri}{location}{code};
1775        my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
1776        if ($match) {
1777            $results{$uri}{location}{success} = 1;
1778            $results{$uri}{location}{code}    = 100;
1779            $results{$uri}{location}{display} = 100;
1780        }
1781    }
1782
1783    # Stores the authentication information
1784    if (defined($response->{Realm})) {
1785        $results{$uri}{location}{realm} = $response->{Realm};
1786        $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
1787    }
1788
1789    # What type of broken link is it? (stored in {record} - the {display}
1790    #              information is just for visual use only)
1791    if ($results{$uri}{location}{display} == 401 &&
1792        $results{$uri}{location}{code} == 404)
1793    {
1794        $results{$uri}{location}{record} = 404;
1795    }
1796    else {
1797        $results{$uri}{location}{record} = $results{$uri}{location}{display};
1798    }
1799
1800    # Did it fail?
1801    $results{$uri}{location}{message} = $response->message() || '(no message)';
1802    if (!$results{$uri}{location}{success}) {
1803        &hprintf(
1804            "Error: %d %s\n",
1805            $results{$uri}{location}{code},
1806            $results{$uri}{location}{message}
1807        ) if ($Opts{Verbose});
1808    }
1809    return;
1810}
1811
1812####################
1813# Parse a document #
1814####################
1815
1816sub parse_document (\$\$$$$)
1817{
1818    my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
1819
1820    print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
1821        if $Opts{Verbose};
1822
1823    my $p;
1824
1825    if (defined($results{$uri}{parsing})) {
1826
1827        # We have already done the job. Woohoo!
1828        $p->{base}    = $results{$uri}{parsing}{base};
1829        $p->{Anchors} = $results{$uri}{parsing}{Anchors};
1830        $p->{Links}   = $results{$uri}{parsing}{Links};
1831        return $p;
1832    }
1833
1834    $p = W3C::LinkChecker->new();
1835    $p->{base} = $base_uri;
1836
1837    my $stype = $response->header("Content-Style-Type");
1838    $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
1839
1840    my $start;
1841    if (!$Opts{Summary_Only}) {
1842        $start = &get_timestamp();
1843        print("Parsing...\n");
1844    }
1845
1846    # Content-Encoding etc already decoded in get_document().
1847    my $docref = $response->content_ref();
1848
1849    # Count lines beforehand if needed (for progress indicator, or CSS while
1850    # we don't get any line context out of the parser).  In case of HTML, the
1851    # actual final number of lines processed shown is populated by our
1852    # end_document handler.
1853    $p->{Total} = ($$docref =~ tr/\n//)
1854        if ($response->{IsCss} || $Opts{Progress});
1855
1856    # We only look for anchors if we are not interested in the links
1857    # obviously, or if we are running a recursive checking because we
1858    # might need this information later
1859    $p->{only_anchors} = !($links || $rec_needs_links);
1860
1861    if ($response->{IsCss}) {
1862
1863        # Parse as CSS
1864
1865        $p->parse_css($$docref, LINE_UNKNOWN());
1866    }
1867    else {
1868
1869        # Parse as HTML
1870
1871        # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
1872        # Processing instructions are not parsed by process, but in this case
1873        # it should be. It's expensive, it's horrible, but it's the easiest way
1874        # for right now.
1875        $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
1876            unless $p->{only_anchors};
1877
1878        $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
1879
1880        $p->parse($$docref)->eof();
1881    }
1882
1883    $response->content("");
1884
1885    if (!$Opts{Summary_Only}) {
1886        my $stop = &get_timestamp();
1887        print "\r" if $Opts{Progress};
1888        &hprintf(" done (%d lines in %s seconds).\n",
1889            $p->{Total}, &time_diff($start, $stop));
1890    }
1891
1892    # Save the results before exiting
1893    $results{$uri}{parsing}{base}    = $p->{base};
1894    $results{$uri}{parsing}{Anchors} = $p->{Anchors};
1895    $results{$uri}{parsing}{Links}   = $p->{Links};
1896
1897    return $p;
1898}
1899
1900####################################
1901# Constructor for W3C::LinkChecker #
1902####################################
1903
1904sub new
1905{
1906    my $p = HTML::Parser::new(@_, api_version => 3);
1907    $p->utf8_mode(1);
1908
1909    # Set up handlers
1910
1911    $p->handler(start => 'start', 'self, tagname, attr, line');
1912    $p->handler(end   => 'end',   'self, tagname, line');
1913    $p->handler(text  => 'text',  'self, dtext, line');
1914    $p->handler(
1915        declaration => sub {
1916            my $self = shift;
1917            $self->declaration(substr($_[0], 2, -1));
1918        },
1919        'self, text, line'
1920    );
1921    $p->handler(end_document => 'end_document', 'self, line');
1922    if ($Opts{Progress}) {
1923        $p->handler(default => 'parse_progress', 'self, line');
1924        $p->{last_percentage} = 0;
1925    }
1926
1927    # Check <a [..] name="...">?
1928    $p->{check_name} = 1;
1929
1930    # Check <[..] id="..">?
1931    $p->{check_id} = 1;
1932
1933    # Don't interpret comment loosely
1934    $p->strict_comment(1);
1935
1936    return $p;
1937}
1938
1939#################################################
1940# Record or return  the doctype of the document #
1941#################################################
1942
1943sub doctype
1944{
1945    my ($self, $dc) = @_;
1946    return $self->{doctype} unless $dc;
1947    $_ = $self->{doctype} = $dc;
1948
1949    # What to look for depending on the doctype
1950
1951    # Check for <a name="...">?
1952    $self->{check_name} = 0
1953        if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
1954
1955    # Check for <* id="...">?
1956    $self->{check_id} = 0
1957        if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
1958
1959    # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
1960    $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
1961
1962    return;
1963}
1964
1965###################################
1966# Print parse progress indication #
1967###################################
1968
1969sub parse_progress
1970{
1971    my ($self, $line) = @_;
1972    return unless defined($line) && $line > 0 && $self->{Total} > 0;
1973
1974    my $percentage = int($line / $self->{Total} * 100);
1975    if ($percentage != $self->{last_percentage}) {
1976        printf("\r%4d%%", $percentage);
1977        $self->{last_percentage} = $percentage;
1978    }
1979
1980    return;
1981}
1982
1983#############################
1984# Extraction of the anchors #
1985#############################
1986
1987sub get_anchor
1988{
1989    my ($self, $tag, $attr) = @_;
1990
1991    my $anchor = $self->{check_id} ? $attr->{id} : undef;
1992    if ($self->{check_name} && ($tag eq 'a')) {
1993
1994        # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
1995        # Force an error if it's not the case (or if id's and name's values
1996        #                                      are different)
1997        # If id is defined, name if defined must have the same value
1998        $anchor ||= $attr->{name};
1999    }
2000
2001    return $anchor;
2002}
2003
2004#############################
2005# W3C::LinkChecker handlers #
2006#############################
2007
2008sub add_link
2009{
2010    my ($self, $uri, $base, $line) = @_;
2011    if (defined($uri)) {
2012
2013        # Remove repeated slashes after the . or .. in relative links, to avoid
2014        # duplicated checking or infinite recursion.
2015        $uri =~ s|^(\.\.?/)/+|$1|o;
2016        $uri = Encode::decode_utf8($uri);
2017        $uri = URI->new_abs($uri, $base) if defined($base);
2018        $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
2019    }
2020    return;
2021}
2022
2023sub start
2024{
2025    my ($self, $tag, $attr, $line) = @_;
2026    $line = LINE_UNKNOWN() unless defined($line);
2027
2028    # Anchors
2029    my $anchor = $self->get_anchor($tag, $attr);
2030    $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
2031
2032    # Links
2033    if (!$self->{only_anchors}) {
2034
2035        my $tag_local_base = undef;
2036
2037        # Special case: base/@href
2038        # @@@TODO: The reason for handling <base href> ourselves is that LWP's
2039        # head parsing magic fails at least for responses that have
2040        # Content-Encodings: https://rt.cpan.org/Ticket/Display.html?id=54361
2041        if ($tag eq 'base') {
2042
2043            # Ignore <base> with missing/empty href.
2044            $self->{base} = $attr->{href}
2045                if (defined($attr->{href}) && length($attr->{href}));
2046        }
2047
2048        # Special case: meta[@http-equiv=Refresh]/@content
2049        elsif ($tag eq 'meta') {
2050            if ($attr->{'http-equiv'} &&
2051                lc($attr->{'http-equiv'}) eq 'refresh')
2052            {
2053                my $content = $attr->{content};
2054                if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
2055                    $self->add_link($1, undef, $line);
2056                }
2057            }
2058        }
2059
2060        # Special case: tags that have "local base"
2061        elsif ($tag eq 'applet' || $tag eq 'object') {
2062            if (my $codebase = $attr->{codebase}) {
2063
2064                # Applet codebases are directories, append trailing slash
2065                # if it's not there so that new_abs does the right thing.
2066                $codebase .= "/" if ($tag eq 'applet' && $codebase !~ m|/$|);
2067
2068                # TODO: HTML 4 spec says applet/@codebase may only point to
2069                # subdirs of the directory containing the current document.
2070                # Should we do something about that?
2071                $tag_local_base = URI->new_abs($codebase, $self->{base});
2072            }
2073        }
2074
2075        # Link attributes:
2076        if (my $link_attrs = LINK_ATTRS()->{$tag}) {
2077            for my $la (@$link_attrs) {
2078                $self->add_link($attr->{$la}, $tag_local_base, $line);
2079            }
2080        }
2081
2082        # List of links attributes:
2083        if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
2084            my ($sep, $attrs) = @$link_attrs;
2085            for my $la (@$attrs) {
2086                if (defined(my $value = $attr->{$la})) {
2087                    for my $link (split($sep, $value)) {
2088                        $self->add_link($link, $tag_local_base, $line);
2089                    }
2090                }
2091            }
2092        }
2093
2094        # Inline CSS:
2095        delete $self->{csstext};
2096        if ($tag eq 'style') {
2097            $self->{csstext} = ''
2098                if ((!$attr->{type} && $self->{style_is_css}) ||
2099                is_content_type($attr->{type}, "text/css"));
2100        }
2101        elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
2102            $style = CSS::DOM::Style::parse($style);
2103            $self->parse_style($style, $line);
2104        }
2105    }
2106
2107    $self->parse_progress($line) if $Opts{Progress};
2108    return;
2109}
2110
2111sub end
2112{
2113    my ($self, $tagname, $line) = @_;
2114
2115    $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');
2116    delete $self->{csstext};
2117
2118    $self->parse_progress($line) if $Opts{Progress};
2119    return;
2120}
2121
2122sub parse_css
2123{
2124    my ($self, $css, $line) = @_;
2125    return unless $css;
2126
2127    my $sheet = CSS::DOM::parse($css);
2128    for my $rule (@{$sheet->cssRules()}) {
2129        if ($rule->type() == IMPORT_RULE()) {
2130            $self->add_link($rule->href(), $self->{base}, $line);
2131        }
2132        elsif ($rule->type == STYLE_RULE()) {
2133            $self->parse_style($rule->style(), $line);
2134        }
2135    }
2136    return;
2137}
2138
2139sub parse_style
2140{
2141    my ($self, $style, $line) = @_;
2142    return unless $style;
2143
2144    for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
2145        my $prop = $style->item($i);
2146        my $val  = $style->getPropertyValue($prop);
2147
2148        while ($val =~ /$CssUrl/go) {
2149            my $url = CSS::DOM::Util::unescape($2);
2150            $self->add_link($url, $self->{base}, $line);
2151        }
2152    }
2153
2154    return;
2155}
2156
2157sub declaration
2158{
2159    my ($self, $text, $line) = @_;
2160
2161    # Extract the doctype
2162    my @declaration = split(/\s+/, $text, 4);
2163    if ($#declaration >= 3 &&
2164        $declaration[0] eq 'DOCTYPE' &&
2165        lc($declaration[1]) eq 'html')
2166    {
2167
2168        # Parse the doctype declaration
2169        if ($text =~
2170            m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i
2171            )
2172        {
2173
2174            # Store the doctype
2175            $self->doctype($1) if $1;
2176
2177            # If there is a link to the DTD, record it
2178            $self->add_link($3, undef, $line)
2179                if (!$self->{only_anchors} && $3);
2180        }
2181    }
2182
2183    $self->text($text) unless $self->{only_anchors};
2184
2185    return;
2186}
2187
2188sub text
2189{
2190    my ($self, $text, $line) = @_;
2191    $self->{csstext} .= $text if defined($self->{csstext});
2192    $self->parse_progress($line) if $Opts{Progress};
2193    return;
2194}
2195
2196sub end_document
2197{
2198    my ($self, $line) = @_;
2199    $self->{Total} = $line;
2200    delete $self->{csstext};
2201    return;
2202}
2203
2204################################
2205# Check the validity of a link #
2206################################
2207
2208sub check_validity (\$\$$\%\%)
2209{
2210    my ($referer, $uri, $want_links, $links, $redirects) = @_;
2211
2212    # $referer is the URI object of the document checked
2213    # $uri is the URI object of the target that we are verifying
2214    # $want_links is true if we're interested in links in the target doc
2215    # $links is a hash of the links in the documents checked
2216    # $redirects is a map of the redirects encountered
2217
2218    # Get the document with the appropriate method: GET if there are
2219    # fragments to check or links are wanted, HEAD is enough otherwise.
2220    my $fragments = $links->{$uri}{fragments} || {};
2221    my $method = ($want_links || %$fragments) ? 'GET' : 'HEAD';
2222
2223    my $response;
2224    my $being_processed = 0;
2225    if (!defined($results{$uri}) ||
2226        ($method eq 'GET' && $results{$uri}{method} eq 'HEAD'))
2227    {
2228        $being_processed = 1;
2229        $response = &get_uri($method, $uri, $referer);
2230
2231        # Get the information back from get_uri()
2232        &record_results($uri, $method, $response, $referer);
2233
2234        # Record the redirects
2235        &record_redirects($redirects, $response);
2236    }
2237    elsif (!($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}))) {
2238        my $ref = $results{$uri}{referer};
2239        &hprintf("Already checked%s\n", $ref ? ", referrer $ref" : ".");
2240    }
2241
2242    # We got the response of the HTTP request. Stop here if it was a HEAD.
2243    return if ($method eq 'HEAD');
2244
2245    # There are fragments. Parse the document.
2246    my $p;
2247    if ($being_processed) {
2248
2249        # Can we really parse the document?
2250        if (!defined($results{$uri}{location}{type}) ||
2251            $results{$uri}{location}{type} !~ $ContentTypes)
2252        {
2253            &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
2254                $uri, $results{$uri}{location}{type})
2255                if ($Opts{Verbose});
2256            $response->content("");
2257            return;
2258        }
2259
2260        # Do it then
2261        if (my $error = decode_content($response)) {
2262            &hprintf("%s\n.", $error);
2263        }
2264
2265        # @@@TODO: this isn't the best thing to do if a decode error occurred
2266        $p =
2267            &parse_document($uri, $response->base(), $response, 0,
2268            $want_links);
2269    }
2270    else {
2271
2272        # We already had the information
2273        $p->{Anchors} = $results{$uri}{parsing}{Anchors};
2274    }
2275
2276    # Check that the fragments exist
2277    for my $fragment (keys %$fragments) {
2278        if (defined($p->{Anchors}{$fragment}) ||
2279            &escape_match($fragment, $p->{Anchors}) ||
2280            grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}})
2281        {
2282            $results{$uri}{fragments}{$fragment} = 1;
2283        }
2284        else {
2285            $results{$uri}{fragments}{$fragment} = 0;
2286        }
2287    }
2288    return;
2289}
2290
2291sub escape_match ($\%)
2292{
2293    my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
2294    for my $b (keys %$hash) {
2295        return 1 if ($a eq URI::Escape::uri_unescape($b));
2296    }
2297    return 0;
2298}
2299
2300##########################
2301# Ask for authentication #
2302##########################
2303
2304sub authentication ($;$$$$)
2305{
2306    my ($response, $cookie, $params, $check_num, $is_start) = @_;
2307
2308    my $realm = '';
2309    if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
2310        $realm = $1;
2311    }
2312
2313    if ($Opts{Command_Line}) {
2314        printf STDERR <<'EOF', $response->request()->url(), $realm;
2315
2316Authentication is required for %s.
2317The realm is "%s".
2318Use the -u and -p options to specify a username and password and the -d option
2319to specify trusted domains.
2320EOF
2321    }
2322    else {
2323
2324        printf(
2325            "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
2326            $response->www_authenticate(),
2327            $cookie ? "Set-Cookie: $cookie\n" : "",
2328        );
2329
2330        printf(
2331            "%s
2332<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
2333<head>
2334<title>W3C Link Checker: 401 Authorization Required</title>
2335%s</head>
2336<body>", $DocType, $Head
2337        );
2338        &banner(': 401 Authorization Required');
2339        &print_form($params, $cookie, $check_num) if $is_start;
2340        printf(
2341            '<p>
2342  %s
2343  You need "%s" access to <a href="%s">%s</a> to perform link checking.<br />
2344',
2345            &status_icon(401),
2346            &encode($realm), (&encode($response->request()->url())) x 2
2347        );
2348
2349        my $host = $response->request()->url()->host();
2350        if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
2351            printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
2352  This service has been configured to send authentication only to hostnames
2353  matching the regular expression <code>%s</code>, but the hostname
2354  <code>%s</code> does not match it.
2355EOF
2356        }
2357
2358        print "</p>\n";
2359    }
2360    return;
2361}
2362
2363##################
2364# Get statistics #
2365##################
2366
2367sub get_timestamp ()
2368{
2369    return pack('LL', Time::HiRes::gettimeofday());
2370}
2371
2372sub time_diff ($$)
2373{
2374    my @start = unpack('LL', $_[0]);
2375    my @stop  = unpack('LL', $_[1]);
2376    for ($start[1], $stop[1]) {
2377        $_ /= 1_000_000;
2378    }
2379    return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1])));
2380}
2381
2382########################
2383# Handle the redirects #
2384########################
2385
2386# Record the redirects in a hash
2387sub record_redirects (\%$)
2388{
2389    my ($redirects, $response) = @_;
2390    for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
2391
2392        # Check for redirect match.
2393        my $from = $prev->request()->url();
2394        my $to   = $response->request()->url();  # same on every loop iteration
2395        my $from_to = $from . '->' . $to;
2396        my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
2397
2398        # print STDERR "Result $match of redirect checking $from_to\n";
2399        if ($match) { next; }
2400
2401        $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
2402
2403        # print STDERR "Result $match of regexp checking $from_to\n";
2404        if ($match) { next; }
2405
2406        my $c = $prev->code();
2407        if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
2408            next;
2409        }
2410
2411        $redirects->{$prev->request()->url()} = $response->request()->url();
2412    }
2413    return;
2414}
2415
2416# Determine if a request is redirected
2417sub is_redirected ($%)
2418{
2419    my ($uri, %redirects) = @_;
2420    return (defined($redirects{$uri}));
2421}
2422
2423# Get a list of redirects for a URI
2424sub get_redirects ($%)
2425{
2426    my ($uri, %redirects) = @_;
2427    my @history = ($uri);
2428    my %seen    = ($uri => 1);    # for tracking redirect loops
2429    my $loop    = 0;
2430    while ($redirects{$uri}) {
2431        $uri = $redirects{$uri};
2432        push(@history, $uri);
2433        if ($seen{$uri}) {
2434            $loop = 1;
2435            last;
2436        }
2437        else {
2438            $seen{$uri}++;
2439        }
2440    }
2441    return ($loop, @history);
2442}
2443
2444####################################################
2445# Tool for sorting the unique elements of an array #
2446####################################################
2447
2448sub sort_unique (@)
2449{
2450    my %saw;
2451    @saw{@_} = ();
2452    return (sort { $a <=> $b } keys %saw);
2453}
2454
2455#####################
2456# Print the results #
2457#####################
2458
2459sub line_number ($)
2460{
2461    my $line = shift;
2462    return $line if ($line >= 0);
2463    return "(N/A)";
2464}
2465
2466sub http_rc ($)
2467{
2468    my $rc = shift;
2469    return $rc if ($rc >= 0);
2470    return "(N/A)";
2471}
2472
2473# returns true if the given code is informational
2474sub informational ($)
2475{
2476    my $rc = shift;
2477    return $rc == RC_ROBOTS_TXT() ||
2478        $rc == RC_IP_DISALLOWED() ||
2479        $rc == RC_PROTOCOL_DISALLOWED();
2480}
2481
2482sub anchors_summary (\%\%)
2483{
2484    my ($anchors, $errors) = @_;
2485
2486    # Number of anchors found.
2487    my $n = scalar(keys(%$anchors));
2488    if (!$Opts{Quiet}) {
2489        if ($Opts{HTML}) {
2490            print("<h3>Anchors</h3>\n<p>");
2491        }
2492        else {
2493            print("Anchors\n\n");
2494        }
2495        &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
2496        print("</p>\n") if $Opts{HTML};
2497    }
2498
2499    # List of the duplicates, if any.
2500    my @errors = keys %{$errors};
2501    if (!scalar(@errors)) {
2502        print("<p>Valid anchors!</p>\n")
2503            if (!$Opts{Quiet} && $Opts{HTML} && $n);
2504        return;
2505    }
2506    undef $n;
2507
2508    print_doc_header();
2509    print('<p>') if $Opts{HTML};
2510    print('List of duplicate and empty anchors');
2511    print <<'EOF' if $Opts{HTML};
2512</p>
2513<table class="report" border="1" summary="List of duplicate and empty anchors.">
2514<thead>
2515<tr>
2516<th>Anchor</th>
2517<th>Lines</th>
2518</tr>
2519</thead>
2520<tbody>
2521EOF
2522    print("\n");
2523
2524    for my $anchor (@errors) {
2525        my $format;
2526        my @unique = &sort_unique(
2527            map { line_number($_) }
2528                keys %{$anchors->{$anchor}}
2529        );
2530        if ($Opts{HTML}) {
2531            $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
2532        }
2533        else {
2534            my $s = (scalar(@unique) > 1) ? 's' : '';
2535            $format = "\t%s\tLine$s: %s\n";
2536        }
2537        printf($format,
2538            &encode(length($anchor) ? $anchor : 'Empty anchor'),
2539            join(', ', @unique));
2540    }
2541
2542    print("</tbody>\n</table>\n") if $Opts{HTML};
2543
2544    return;
2545}
2546
2547sub show_link_report (\%\%\%\%\@;$\%)
2548{
2549    my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
2550
2551    print("\n<dl class=\"report\">") if $Opts{HTML};
2552    print("\n") if (!$Opts{Quiet});
2553
2554    # Process each URL
2555    my ($c, $previous_c);
2556    for my $u (@$urls) {
2557        my @fragments = keys %{$broken->{$u}{fragments}};
2558
2559        # Did we get a redirect?
2560        my $redirected = &is_redirected($u, %$redirects);
2561
2562        # List of lines
2563        my @total_lines;
2564        push(@total_lines, keys(%{$links->{$u}{location}}));
2565        for my $f (@fragments) {
2566            push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
2567                unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
2568        }
2569
2570        my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
2571        my $currloc = $results->{$u}{location};
2572
2573        # Error type
2574        $c = &code_shown($u, $results);
2575
2576        # What to do
2577        my $whattodo;
2578        my $redirect_too;
2579        if ($todo) {
2580            if ($u =~ m/^javascript:/) {
2581                if ($Opts{HTML}) {
2582                    $whattodo =
2583                        'You must change this link: people using a browser without JavaScript support
2584will <em>not</em> be able to follow this link. See the
2585<a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content
2586Accessibility Guidelines on the use of scripting on the Web</a> and the
2587<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
2588on how to solve this</a>.';
2589                }
2590                else {
2591                    $whattodo =
2592                        'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
2593                }
2594            }
2595            elsif ($c == RC_ROBOTS_TXT()) {
2596                $whattodo =
2597                    'The link was not checked due to robots exclusion ' .
2598                    'rules. Check the link manually.';
2599            }
2600            elsif ($redirect_loop) {
2601                $whattodo =
2602                    'Retrieving the URI results in a redirect loop, that should be '
2603                    . 'fixed.  Examine the redirect sequence to see where the loop '
2604                    . 'occurs.';
2605            }
2606            else {
2607                $whattodo = $todo->{$c};
2608            }
2609        }
2610        elsif (defined($redirects{$u})) {
2611
2612            # Redirects
2613            if (($u . '/') eq $redirects{$u}) {
2614                $whattodo =
2615                    'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
2616            }
2617            elsif ($c == 307 || $c == 302) {
2618                $whattodo =
2619                    'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
2620            }
2621            elsif ($c == 301) {
2622                $whattodo =
2623                    'This is a permanent redirect. The link should be updated.';
2624            }
2625        }
2626
2627        my @unique = &sort_unique(map { line_number($_) } @total_lines);
2628        my $lines_list = join(', ', @unique);
2629        my $s = (scalar(@unique) > 1) ? 's' : '';
2630        undef @unique;
2631
2632        my @http_codes = ($currloc->{code});
2633        unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
2634        @http_codes = map { http_rc($_) } @http_codes;
2635
2636        if ($Opts{HTML}) {
2637
2638            # Style stuff
2639            my $idref = '';
2640            if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
2641                $idref      = ' id="d' . $doc_count . 'code_' . $c . '"';
2642                $previous_c = $c;
2643            }
2644
2645            # Main info
2646            for (@redirects_urls) {
2647                $_ = &show_url($_);
2648            }
2649
2650            # HTTP message
2651            my $http_message;
2652            if ($currloc->{message}) {
2653                $http_message = &encode($currloc->{message});
2654                if ($c == 404 || $c == 500) {
2655                    $http_message =
2656                        '<span class="broken">' . $http_message . '</span>';
2657                }
2658            }
2659            my $redirmsg =
2660                $redirect_loop ? ' <em>redirect loop detected</em>' : '';
2661            printf("
2662<dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
2663<dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
2664<dd class='message_explanation'><p>%s %s</p></dd>\n",
2665
2666                # Anchor for return codes
2667                $idref,
2668
2669                # Color
2670                &status_icon($c),
2671                $s,
2672
2673                # List of lines
2674                $lines_list,
2675
2676                # List of redirects
2677                $redirected ?
2678                    join(' redirected to ', @redirects_urls) . $redirmsg :
2679                    &show_url($u),
2680
2681                # Realm
2682                defined($currloc->{realm}) ?
2683                    sprintf('Realm: %s<br />', &encode($currloc->{realm})) :
2684                    '',
2685
2686                # HTTP original message
2687                # defined($currloc->{orig_message})
2688                # ? &encode($currloc->{orig_message}).
2689                # ' <span title="redirected to">-&gt;</span> '
2690                # : '',
2691
2692                # Response code chain
2693                join(
2694                    ' <span class="redirected_to" title="redirected to">-&gt;</span> ',
2695                    map { &encode($_) } @http_codes),
2696
2697                # HTTP final message
2698                $http_message,
2699
2700                # What to do
2701                $whattodo,
2702
2703                # Redirect too?
2704                $redirect_too ?
2705                    sprintf(' <span %s>%s</span>',
2706                    &bgcolor(301), $redirect_too) :
2707                    '',
2708            );
2709            if ($#fragments >= 0) {
2710                printf("<dd>Broken fragments: <ul>\n");
2711            }
2712        }
2713        else {
2714            my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
2715            printf(
2716                "\n%s\t%s\n  Code: %s %s\n%s\n",
2717
2718                # List of redirects
2719                $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
2720
2721                # List of lines
2722                $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
2723
2724                # Response code chain
2725                join(' -> ', @http_codes),
2726
2727                # HTTP message
2728                $currloc->{message} || '',
2729
2730                # What to do
2731                wrap(' To do: ', '        ', $whattodo)
2732            );
2733            if ($#fragments >= 0) {
2734                if ($currloc->{code} == 200) {
2735                    print("The following fragments need to be fixed:\n");
2736                }
2737                else {
2738                    print("Fragments:\n");
2739                }
2740            }
2741        }
2742
2743        # Fragments
2744        for my $f (@fragments) {
2745            my @unique_lines =
2746                &sort_unique(keys %{$links->{$u}{fragments}{$f}});
2747            my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
2748            my $unique_lines = join(', ', @unique_lines);
2749            if ($Opts{HTML}) {
2750                printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
2751                    &encode($u), &encode($f), $plural, $unique_lines);
2752            }
2753            else {
2754                printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
2755            }
2756        }
2757
2758        print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
2759    }
2760
2761    # End of the table
2762    print("</dl>\n") if $Opts{HTML};
2763
2764    return;
2765}
2766
2767sub code_shown ($$)
2768{
2769    my ($u, $results) = @_;
2770
2771    if ($results->{$u}{location}{record} == 200) {
2772        return $results->{$u}{location}{orig} ||
2773            $results->{$u}{location}{record};
2774    }
2775    else {
2776        return $results->{$u}{location}{record};
2777    }
2778}
2779
2780sub links_summary (\%\%\%\%)
2781{
2782
2783    # Advices to fix the problems
2784
2785    my %todo = (
2786        200 =>
2787            'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
2788        300 =>
2789            'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
2790        301 =>
2791            'This is a permanent redirect. The link should be updated to point to the more recent URI.',
2792        302 =>
2793            'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
2794        303 =>
2795            'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
2796        307 =>
2797            'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
2798        400 =>
2799            'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
2800        401 =>
2801            "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
2802        403 =>
2803            'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
2804        404 =>
2805            'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
2806        405 =>
2807            'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically.  Check the link manually.',
2808        406 =>
2809            "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
2810        407 => 'The link is a proxy, but requires Authentication.',
2811        408 => 'The request timed out.',
2812        410 => 'The resource is gone. You should remove this link.',
2813        415 => 'The media type is not supported.',
2814        500 => 'This is a server side problem. Check the URI.',
2815        501 =>
2816            'Could not check this link: method not implemented or scheme not supported.',
2817        503 =>
2818            'The server cannot service the request, for some unknown reason.',
2819
2820        # Non-HTTP codes:
2821        RC_ROBOTS_TXT() => sprintf(
2822            'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
2823            $Opts{HTML} ? (
2824                '<a href="http://www.robotstxt.org/robotstxt.html">', '</a>',
2825                "<a href=\"$Cfg{Doc_URI}#bot\">",                     '</a>'
2826                ) : ('') x 4
2827        ),
2828        RC_DNS_ERROR() =>
2829            'The hostname could not be resolved. Check the link for typos.',
2830        RC_IP_DISALLOWED() =>
2831            sprintf(
2832            'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
2833            $Opts{HTML} ?
2834                ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') :
2835                ('') x 2),
2836        RC_PROTOCOL_DISALLOWED() =>
2837            'Accessing links with this URI scheme has been disabled in link checker.',
2838    );
2839    my %priority = (
2840        410 => 1,
2841        404 => 2,
2842        403 => 5,
2843        200 => 10,
2844        300 => 15,
2845        401 => 20
2846    );
2847
2848    my ($links, $results, $broken, $redirects) = @_;
2849
2850    # List of the broken links
2851    my @urls              = keys %{$broken};
2852    my @dir_redirect_urls = ();
2853    if ($Opts{Redirects}) {
2854
2855        # Add the redirected URI's to the report
2856        for my $l (keys %$redirects) {
2857            next
2858                unless (defined($results->{$l}) &&
2859                defined($links->{$l}) &&
2860                !defined($broken->{$l}));
2861
2862            # Check whether we have a "directory redirect"
2863            # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
2864            my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
2865            if ($#redirects == 1) {
2866                push(@dir_redirect_urls, $l);
2867                next;
2868            }
2869            push(@urls, $l);
2870        }
2871    }
2872
2873    # Broken links and redirects
2874    if ($#urls < 0) {
2875        if (!$Opts{Quiet}) {
2876            print_doc_header();
2877            if ($Opts{HTML}) {
2878                print "<h3>Links</h3>\n<p>Valid links!</p>\n";
2879            }
2880            else {
2881                print "\nValid links.\n";
2882            }
2883        }
2884    }
2885    else {
2886        print_doc_header();
2887        print('<h3>') if $Opts{HTML};
2888        print("\nList of broken links and other issues");
2889
2890        #print(' and redirects') if $Opts{Redirects};
2891
2892        # Sort the URI's by HTTP Code
2893        my %code_summary;
2894        my @idx;
2895        for my $u (@urls) {
2896            if (defined($results->{$u}{location}{record})) {
2897                my $c = &code_shown($u, $results);
2898                $code_summary{$c}++;
2899                push(@idx, $c);
2900            }
2901        }
2902        my @sorted = @urls[
2903            sort {
2904                defined($priority{$idx[$a]}) ?
2905                    defined($priority{$idx[$b]}) ?
2906                    $priority{$idx[$a]} <=> $priority{$idx[$b]} :
2907                        -1 :
2908                    defined($priority{$idx[$b]}) ? 1 :
2909                    $idx[$a] <=> $idx[$b]
2910            } 0 .. $#idx
2911        ];
2912        @urls = @sorted;
2913        undef(@sorted);
2914        undef(@idx);
2915
2916        if ($Opts{HTML}) {
2917
2918            # Print a summary
2919            print <<'EOF';
2920</h3>
2921<p><em>There are issues with the URLs listed below. The table summarizes the
2922issues and suggested actions by HTTP response status code.</em></p>
2923<table class="report" border="1" summary="List of issues and suggested actions.">
2924<thead>
2925<tr>
2926<th>Code</th>
2927<th>Occurrences</th>
2928<th>What to do</th>
2929</tr>
2930</thead>
2931<tbody>
2932EOF
2933            for my $code (sort(keys(%code_summary))) {
2934                printf('<tr%s>', &bgcolor($code));
2935                printf('<td><a href="#d%scode_%s">%s</a></td>',
2936                    $doc_count, $code, http_rc($code));
2937                printf('<td>%s</td>', $code_summary{$code});
2938                printf('<td>%s</td>', $todo{$code});
2939                print "</tr>\n";
2940            }
2941            print "</tbody>\n</table>\n";
2942        }
2943        else {
2944            print(':');
2945        }
2946        &show_link_report($links, $results, $broken, $redirects, \@urls, 1,
2947            \%todo);
2948    }
2949
2950    # Show directory redirects
2951    if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
2952        print_doc_header();
2953        print('<h3>') if $Opts{HTML};
2954        print("\nList of redirects");
2955        print(
2956            "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>"
2957        ) if $Opts{HTML};
2958        &show_link_report($links, $results, $broken, $redirects,
2959            \@dir_redirect_urls);
2960    }
2961
2962    return;
2963}
2964
2965###############################################################################
2966
2967################
2968# Global stats #
2969################
2970
2971sub global_stats ()
2972{
2973    my $stop = &get_timestamp();
2974    my $n_docs =
2975        ($doc_count <= $Opts{Max_Documents}) ? $doc_count :
2976                                               $Opts{Max_Documents};
2977    return sprintf(
2978        'Checked %d document%s in %s seconds.',
2979        $n_docs,
2980        ($n_docs == 1) ? '' : 's',
2981        &time_diff($timestamp, $stop)
2982    );
2983}
2984
2985##################
2986# HTML interface #
2987##################
2988
2989sub html_header ($$)
2990{
2991    my ($uri, $cookie) = @_;
2992
2993    my $title = defined($uri) ? $uri : '';
2994    $title = ': ' . $title if ($title =~ /\S/);
2995
2996    my $headers = '';
2997    if (!$Opts{Command_Line}) {
2998        $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $uri;
2999        $headers .= "Content-Type: text/html; charset=utf-8\n";
3000        $headers .= "Set-Cookie: $cookie\n"                       if $cookie;
3001
3002        # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
3003        # print() statement as the last header
3004        $headers .= "Content-Language: en\n\n";
3005    }
3006
3007    my $onload = $uri ? '' :
3008          ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
3009
3010    print $headers, $DocType, "
3011<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
3012<head>
3013<title>W3C Link Checker", &encode($title), "</title>
3014",      $Head,   "</head>
3015<body", $onload, '>';
3016    &banner($title);
3017    return;
3018}
3019
3020sub banner ($)
3021{
3022    my $tagline = "Check links and anchors in Web pages or full Web sites";
3023
3024    printf(
3025        <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
3026<div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" width="110" height="61" /></a>
3027<a href="checklink"><span>Link Checker</span></a></h1>
3028<p id="tagline">%s</p></div>
3029<div id="main">
3030EOF
3031    return;
3032}
3033
3034sub status_icon($)
3035{
3036    my ($code) = @_;
3037    my $icon_type;
3038    my $r = HTTP::Response->new($code);
3039    if ($r->is_success()) {
3040        $icon_type = 'error'
3041            ; # if is success but reported, it's because of broken frags => error
3042    }
3043    elsif (&informational($code)) {
3044        $icon_type = 'info';
3045    }
3046    elsif ($code == 300) {
3047        $icon_type = 'info';
3048    }
3049    elsif ($code == 401) {
3050        $icon_type = 'error';
3051    }
3052    elsif ($r->is_redirect()) {
3053        $icon_type = 'warning';
3054    }
3055    elsif ($r->is_error()) {
3056        $icon_type = 'error';
3057    }
3058    else {
3059        $icon_type = 'error';
3060    }
3061    return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
3062        URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}),
3063        $icon_type);
3064}
3065
3066sub bgcolor ($)
3067{
3068    my ($code) = @_;
3069    my $class;
3070    my $r = HTTP::Response->new($code);
3071    if ($r->is_success()) {
3072        return '';
3073    }
3074    elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
3075        $class = 'dubious';
3076    }
3077    elsif ($code == 300) {
3078        $class = 'multiple';
3079    }
3080    elsif ($code == 401) {
3081        $class = 'unauthorized';
3082    }
3083    elsif ($r->is_redirect()) {
3084        $class = 'redirect';
3085    }
3086    elsif ($r->is_error()) {
3087        $class = 'broken';
3088    }
3089    else {
3090        $class = 'broken';
3091    }
3092    return (' class="' . $class . '"');
3093}
3094
3095sub show_url ($)
3096{
3097    my ($url) = @_;
3098    return sprintf('<a href="%s">%s</a>', (&encode($url)) x 2);
3099}
3100
3101sub html_footer ()
3102{
3103    printf("<p>%s</p>\n", &global_stats())
3104        if ($doc_count > 0 && !$Opts{Quiet});
3105    if (!$doc_count) {
3106        print <<'EOF';
3107<div class="intro">
3108  <p>
3109    This Link Checker looks for issues in links, anchors and referenced objects
3110    in a Web page, CSS style sheet, or recursively on a whole Web site. For
3111    best results, it is recommended to first ensure that the documents checked
3112    use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
3113    <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
3114    part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
3115    Quality Web tools</a>.
3116  </p>
3117</div>
3118EOF
3119    }
3120    printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
3121</div><!-- main -->
3122<ul class="navbar" id="menu">
3123  <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
3124  <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/" accesskey="2" title="Download the source / Install this service">Download</a></li>
3125  <li><a href="%s#csb" title="feedback: comments, suggestions and bugs" accesskey="4">Feedback</a></li>
3126  <li><a href="http://validator.w3.org/" title="Validate your markup with the W3C Markup Validation Service">Validator</a></li>
3127</ul>
3128<div>
3129<address>
3130%s<br /> %s
3131</address>
3132</div>
3133</body>
3134</html>
3135EOF
3136    return;
3137}
3138
3139sub print_form (\%$$)
3140{
3141    my ($params, $cookie, $check_num) = @_;
3142
3143    # Split params on \0, see CGI's docs on Vars()
3144    while (my ($key, $value) = each(%$params)) {
3145        if ($value) {
3146            my @vals = split(/\0/, $value, 2);
3147            $params->{$key} = $vals[0];
3148        }
3149    }
3150
3151    # Override undefined values from the cookie, if we got one.
3152    my $valid_cookie = 0;
3153    if ($cookie) {
3154        my %cookie_values = $cookie->value();
3155        if (!$cookie_values{clear})
3156        {    # XXX no easy way to check if cookie expired?
3157            $valid_cookie = 1;
3158            while (my ($key, $value) = each(%cookie_values)) {
3159                $params->{$key} = $value unless defined($params->{$key});
3160            }
3161        }
3162    }
3163
3164    my $chk = ' checked="checked"';
3165    $params->{hide_type} = 'all' unless $params->{hide_type};
3166
3167    my $requested_uri = &encode($params->{uri} || '');
3168    my $sum = $params->{summary}        ? $chk : '';
3169    my $red = $params->{hide_redirects} ? $chk : '';
3170    my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
3171    my $dir = $all                            ? ''   : $chk;
3172    my $acc = $params->{no_accept_language}   ? $chk : '';
3173    my $ref = $params->{no_referer}           ? $chk : '';
3174    my $rec = $params->{recursive}            ? $chk : '';
3175    my $dep = &encode($params->{depth} || '');
3176
3177    my $cookie_options = '';
3178    if ($valid_cookie) {
3179        $cookie_options = "
3180    <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label>
3181    <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label>
3182    <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>";
3183    }
3184    else {
3185        $cookie_options = "
3186    <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
3187    }
3188
3189    print "<form action=\"", $Opts{_Self_URI},
3190        "\" method=\"get\" onsubmit=\"return uriOk($check_num)\" accept-charset=\"UTF-8\">
3191<p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>)
3192of a document that you would like to check:</label></p>
3193<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",
3194        $requested_uri, "\" /></p>
3195<fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
3196	<legend class=\"toggletext\">More Options</legend>
3197	<div class=\"options\">
3198  <p>
3199    <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"",
3200        $sum, " /> Summary only</label>
3201    <br />
3202    <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"",
3203        $red,
3204        " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
3205    <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"",
3206        $all, " /> all</label>
3207    <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"",
3208        $dir, " /> for directories only</label>
3209    <br />
3210    <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"",
3211        $acc,
3212        " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
3213    <br />
3214    <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"",
3215        $ref,
3216        " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
3217    <br />
3218    <label title=\"Check linked documents recursively (maximum: ",
3219        $Opts{Max_Documents},
3220        " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"",
3221        $rec, " /> Check linked documents recursively</label>,
3222    <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"",
3223        $dep, "\" /></label>
3224    <br /><br />", $cookie_options, "
3225  </p>
3226  </div>
3227</fieldset>
3228<p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p>
3229</form>
3230<div class=\"intro\" id=\"don_program\"></div>
3231<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
3232";
3233    return;
3234}
3235
3236sub encode (@)
3237{
3238    return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
3239}
3240
3241sub hprintf (@)
3242{
3243    print_doc_header();
3244    if (!$Opts{HTML}) {
3245        printf(@_);
3246    }
3247    else {
3248        print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1]));
3249    }
3250    return;
3251}
3252
3253# Print the document header, if it hasn't been printed already.
3254# This is invoked before most other output operations, in order
3255# to enable quiet processing that doesn't clutter the output with
3256# "Processing..." messages when nothing else will be reported.
3257sub print_doc_header ()
3258{
3259    if (defined($doc_header)) {
3260        print $doc_header;
3261        undef($doc_header);
3262    }
3263}
3264
3265# Local Variables:
3266# mode: perl
3267# indent-tabs-mode: nil
3268# cperl-indent-level: 4
3269# cperl-continued-statement-offset: 4
3270# cperl-brace-offset: -4
3271# perl-indent-level: 4
3272# End:
3273# ex: ts=4 sw=4 et
3274