1#!/usr/local/bin/perl -T
2#
3# W3C Markup Validation Service
4# A CGI script to retrieve and validate a markup file
5#
6# Copyright 1995-2012 World Wide Web Consortium, (Massachusetts
7# Institute of Technology, European Research Consortium for Informatics
8# and Mathematics, Keio University). All Rights Reserved.
9#
10# Originally written by Gerald Oskoboiny <gerald@w3.org>
11# for additional contributors, see
12# http://dvcs.w3.org/hg/markup-validator/shortlog/tip and
13# http://validator.w3.org/about.html#credits
14#
15# This source code is available under the license at:
16#     http://www.w3.org/Consortium/Legal/copyright-software
17
18#
19# We need Perl 5.8.0+.
20use 5.008;
21
22###############################################################################
23#### Load modules. ############################################################
24###############################################################################
25
26#
27# Pragmas.
28use strict;
29use warnings;
30use utf8;
31
32package W3C::Validator::MarkupValidator;
33
34#
35# Modules.  See also the BEGIN block further down below.
36#
37# Version numbers given where we absolutely need a minimum version of a given
38# module (gives nicer error messages). By default, add an empty import list
39# when loading modules to prevent non-OO or poorly written modules from
40# polluting our namespace.
41#
42
43# Need 3.40 for query string and path info fixes, #4365
44use CGI 3.40 qw(-newstyle_urls -private_tempfiles redirect);
45use CGI::Carp qw(carp croak fatalsToBrowser);
46use Config qw(%Config);
47use Config::General 2.32 qw();    # Need 2.32 for <msg 0>, rt.cpan.org#17852
48use Encode qw();
49use Encode::Alias qw();
50use Encode::HanExtra qw();        # for some chinese character encodings,
51                                  # e.g gb18030
52use File::Spec::Functions qw(catfile rel2abs tmpdir);
53use HTML::Encoding 0.52 qw();
54use HTML::HeadParser 3.60 qw();    # Needed for HTML5 meta charset workaround
55use HTML::Parser 3.24 qw();        # Need 3.24 for $p->parse($code_ref)
56use HTML::Template qw();           # Need 2.6 for path param, other things.
57                                   # Specifying 2.6 would break with 2.10,
58                                   # rt.cpan.org#70190
59use HTTP::Headers::Util qw();
60use HTTP::Message 1.52 qw();       # Need 1.52 for decoded_content()
61use HTTP::Request qw();
62use HTTP::Headers::Auth qw();      # Needs to be imported after other HTTP::*.
63use JSON 2.00 qw();
64use SGML::Parser::OpenSP 0.991 qw();
65use URI 1.53 qw();                 # Need 1.53 for secure()
66use URI::Escape qw(uri_escape);
67use URI::file;
68use URI::Heuristic qw();
69
70###############################################################################
71#### Constant definitions. ####################################################
72###############################################################################
73
74#
75# Define global constants
76use constant TRUE  => 1;
77use constant FALSE => 0;
78
79#
80# Tentative Validation Severities.
81use constant T_WARN  => 4;    # 0000 0100
82use constant T_ERROR => 8;    # 0000 1000
83
84#
85# Define global variables.
86use vars qw($DEBUG $CFG %RSRC $VERSION);
87$VERSION = '1.3';
88
89use constant IS_MODPERL2 =>
90    (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
91
92#
93# Things inside BEGIN don't happen on every request in persistent environments
94# (such as mod_perl); so let's do the globals, eg. read config, here.
95BEGIN {
96
97    my $base = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/www/validator';
98
99    # Launder data for -T; -AutoLaunder doesn't catch this one.
100    if ($base =~ /^(.*)$/) {
101        $base = $1;
102    }
103
104    #
105    # Read Config Files.
106    eval {
107        my %config_opts = (
108            -ConfigFile =>
109                ($ENV{W3C_VALIDATOR_CFG} || '/usr/local/www/validator/htdocs/config/validator.conf'),
110            -MergeDuplicateOptions => TRUE,
111            -MergeDuplicateBlocks  => TRUE,
112            -SplitPolicy           => 'equalsign',
113            -UseApacheInclude      => TRUE,
114            -IncludeRelative       => TRUE,
115            -InterPolateVars       => TRUE,
116            -AutoLaunder           => TRUE,
117            -AutoTrue              => TRUE,
118            -CComments             => FALSE,
119            -DefaultConfig         => {
120                Protocols => {Allow => 'http,https'},
121                Paths     => {
122                    Base  => $base,
123                    Cache => '',
124                },
125                External => {HTML5 => FALSE,},
126            },
127        );
128        my %cfg = Config::General->new(%config_opts)->getall();
129        $CFG = \%cfg;
130    };
131    if ($@) {
132        die <<"EOF";
133Could not read configuration.  Set the W3C_VALIDATOR_CFG environment variable
134or copy conf/* to /etc/w3c/. Make sure that the configuration file and all
135included files are readable by the web server user. The error was:\n'$@'
136EOF
137    }
138
139    #
140    # Check paths in config
141    # @@FIXME: This does not do a very good job error-message-wise if
142    # a path is missing...
143    {
144        my %paths = map { $_ => [-d $_, -r _] } $CFG->{Paths}->{Base},
145            $CFG->{Paths}->{Templates}, $CFG->{Paths}->{SGML}->{Library};
146        my @_d = grep { not $paths{$_}->[0] } keys %paths;
147        my @_r = grep { not $paths{$_}->[1] } keys %paths;
148        die "Does not exist or is not a directory: @_d\n"       if scalar(@_d);
149        die "Directory not readable (permission denied): @_r\n" if scalar(@_r);
150    }
151
152    #
153    # Split allowed protocols into a list.
154    if (my $allowed = delete($CFG->{Protocols}->{Allow})) {
155        $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)];
156    }
157
158    # Split available languages into a list
159    if (my $langs = delete($CFG->{Languages})) {
160        $CFG->{Languages} = [split(/\s+/, $langs)];
161    }
162    else {
163
164        # Default to english
165        $CFG->{Languages} = ["en"];
166    }
167
168    {    # Make types config indexed by FPI.
169        my $types = {};
170        while (my ($key, $value) = each %{$CFG->{Types}}) {
171            $types->{$CFG->{Types}->{$key}->{PubID}} = $value;
172        }
173        $CFG->{Types} = $types;
174    }
175
176    #
177    # Change strings to internal constants in MIME type mapping.
178    while (my ($key, $value) = each %{$CFG->{MIME}}) {
179        $CFG->{MIME}->{$key} = 'TBD'
180            unless ($value eq 'SGML' || $value eq 'XML');
181    }
182
183    #
184    # Register Encode aliases.
185    while (my ($key, $value) = each %{$CFG->{Charsets}}) {
186        Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/);
187    }
188
189    #
190    # Set debug flag.
191    if ($CFG->{'Allow Debug'}) {
192        $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'};
193    }
194    else {
195        $DEBUG = FALSE;
196    }
197
198    # Read friendly error message file
199    # 'en_US' should be replaced by $lang for lang-neg
200    %RSRC = Config::General->new(
201        -MergeDuplicateBlocks => 1,
202        -ConfigFile =>
203            catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'),
204    )->getall();
205
206    eval {
207        local $SIG{__DIE__} = undef;
208        require Encode::JIS2K;    # for optional extra Japanese encodings
209    };
210
211    # Tell libxml to load _only_ our XML catalog.  This is because our entity
212    # load jailing may trap the libxml internal default catalog (which is
213    # automatically loaded).  Preventing loading that from the input callback
214    # will cause libxml to not see the document content at all but to throw
215    # weird "Document is empty" errors, at least as of XML::LibXML 1.70 and
216    # libxml 2.7.7.  XML_CATALOG_FILES needs to be in effect at XML::LibXML
217    # load time which is why we're using "require" here instead of pulling it
218    # in with "use" as usual.  And finally, libxml should have support for
219    # SGML open catalogs but they don't seem to work (again as of 1.70 and
220    # 2.7.7); if we use xml.soc here, no entities seem to end up being resolved
221    # from it - so we use a (redundant) XML catalog which works.
222    # Note that setting XML_CATALOG_FILES here does not seem to work with
223    # mod_perl (it doesn't end up being used by XML::LibXML), therefore we do
224    # it in the mod_perl/startup.pl startup file for it too.
225    local $ENV{XML_CATALOG_FILES} =
226        catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml');
227    require XML::LibXML;
228    XML::LibXML->VERSION(1.73);    # Need 1.73 for rt.cpan.org #66642
229
230}    # end of BEGIN block.
231
232#
233# Get rid of (possibly insecure) $PATH.
234delete $ENV{PATH};
235
236#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
237#use Data::Dumper qw(Dumper);
238#print Dumper($CFG);
239#exit;
240#@@DEBUG;
241
242###############################################################################
243#### Process CGI variables and initialize. ####################################
244###############################################################################
245
246#
247# Create a new CGI object.
248my $q = CGI->new();
249
250#
251# The data structure that will hold all session data.
252# @@FIXME This can't be my() as $File will sooner or
253# later be undef and add_warning will cause the script
254# to die. our() seems to work but has other problems.
255# @@FIXME Apparently, this must be set to {} also,
256# otherwise the script might pick up an old object
257# after abort_if_error_flagged under mod_perl.
258our $File = {};
259
260#################################
261# Initialize the datastructure. #
262#################################
263
264#
265# Charset data (casing policy: lowercase early).
266$File->{Charset}->{Use}      = ''; # The charset used for validation.
267$File->{Charset}->{Auto}     = ''; # Autodetection using XML rules (Appendix F)
268$File->{Charset}->{HTTP}     = ''; # From HTTP's "charset" parameter.
269$File->{Charset}->{META}     = ''; # From HTML's <meta http-equiv>.
270$File->{Charset}->{XML}      = ''; # From the XML Declaration.
271$File->{Charset}->{Override} = ''; # From CGI/user override.
272
273#
274# Misc simple types.
275$File->{Mode} =
276    'DTD+SGML';    # Default parse mode is  DTD validation in SGML mode.
277
278# By default, perform validation (we may perform only xml-wf in some cases)
279$File->{XMLWF_ONLY} = FALSE;
280
281#
282# Listrefs.
283$File->{Warnings}   = [];    # Warnings...
284$File->{Namespaces} = [];    # Other (non-root) Namespaces.
285$File->{Parsers}    = [];    # Parsers used {name, link, type, options}
286
287# By default, doctype-less documents cannot be valid
288$File->{"DOCTYPEless OK"}             = FALSE;
289$File->{"Default DOCTYPE"}->{"HTML"}  = 'HTML 4.01 Transitional';
290$File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional';
291
292###############################################################################
293#### Generate Template for Result. ############################################
294###############################################################################
295
296# first we determine the chosen language based on
297# 1) lang argument given as parameter (if this language is available)
298# 2) HTTP language negotiation between variants available and user-agent choices
299# 3) English by default
300my $lang = $q->param('lang') || '';
301my @localizations;
302foreach my $lang_available (@{$CFG->{Languages}}) {
303    if ($lang eq $lang_available) {
304
305        # Requested language (from parameters) is available, just use it
306        undef @localizations;
307        last;
308    }
309    push @localizations,
310        [
311        $lang_available, 1,               'text/html', undef,
312        'utf-8',         $lang_available, undef
313        ];
314}
315
316# If language is not chosen yet, use HTTP-based negotiation
317if (@localizations) {
318    require HTTP::Negotiate;
319    $lang = HTTP::Negotiate::choose(\@localizations);
320}
321
322# HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0
323$lang ||= 'en_US';
324
325if ($lang eq "en") {
326    $lang = 'en_US';    # legacy
327}
328
329$File->{Template_Defaults} = {
330    die_on_bad_params => FALSE,
331    loop_context_vars => TRUE,
332    global_vars       => TRUE,
333    case_sensitive    => TRUE,
334    path              => [catfile($CFG->{Paths}->{Templates}, $lang)],
335    filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); },
336};
337if (IS_MODPERL2()) {
338    $File->{Template_Defaults}->{cache} = TRUE;
339}
340elsif ($CFG->{Paths}->{Cache}) {
341    $File->{Template_Defaults}->{file_cache} = TRUE;
342    $File->{Template_Defaults}->{file_cache_dir} =
343        rel2abs($CFG->{Paths}->{Cache}, tmpdir());
344}
345
346undef $lang;
347
348#########################################
349# Populate $File->{Opt} -- CGI Options. #
350#########################################
351
352#
353# Preprocess the CGI parameters.
354$q = &prepCGI($File, $q);
355
356#
357# Set session switches.
358$File->{Opt}->{Outline}        = $q->param('outline') ? TRUE : FALSE;
359$File->{Opt}->{'Show Source'}  = $q->param('ss')      ? TRUE : FALSE;
360$File->{Opt}->{'Show Tidy'}    = $q->param('st')      ? TRUE : FALSE;
361$File->{Opt}->{Verbose}        = $q->param('verbose') ? TRUE : FALSE;
362$File->{Opt}->{'Group Errors'} = $q->param('group')   ? TRUE : FALSE;
363$File->{Opt}->{Debug}          = $q->param('debug')   ? TRUE : FALSE;
364$File->{Opt}->{No200}          = $q->param('No200')   ? TRUE : FALSE;
365$File->{Opt}->{Prefill}        = $q->param('prefill') ? TRUE : FALSE;
366$File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401';
367$File->{Opt}->{Charset} = lc($q->param('charset') || '');
368$File->{Opt}->{DOCTYPE} = $q->param('doctype') || '';
369
370$File->{Opt}->{'User Agent'} =
371    $q->param('user-agent') &&
372    $q->param('user-agent') ne "1" ? $q->param('user-agent') :
373                                     "W3C_Validator/$VERSION";
374$File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d;
375
376if ($File->{Opt}->{'User Agent'} eq 'mobileok') {
377    $File->{Opt}->{'User Agent'} =
378        'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)';
379}
380
381$File->{Opt}->{'Accept Header'}          = $q->param('accept')          || '';
382$File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || '';
383$File->{Opt}->{'Accept-Charset Header'}  = $q->param('accept-charset')  || '';
384$File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d
385    for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header');
386
387#
388# "Fallback" info for Character Encoding (fbc), Content-Type (fbt),
389# and DOCTYPE (fbd). If TRUE, the Override values are treated as
390# Fallbacks instead of Overrides.
391$File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE;
392$File->{Opt}->{FB}->{Type}    = $q->param('fbt') ? TRUE : FALSE;
393$File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE;
394
395#
396# If ";debug" was given, let it overrule the value from the config file,
397# regardless of whether it's "0" or "1" (on or off), but only if config
398# allows the debugging options.
399if ($CFG->{'Allow Debug'}) {
400    $DEBUG = $q->param('debug') if defined $q->param('debug');
401    $File->{Opt}->{Verbose} = TRUE if $DEBUG;
402}
403else {
404    $DEBUG = FALSE;    # The default.
405}
406$File->{Opt}->{Debug} = $DEBUG;
407
408&abort_if_error_flagged($File);
409
410#
411# Get the file and metadata.
412if ($q->param('uploaded_file')) {
413    $File = &handle_file($q, $File);
414}
415elsif ($q->param('fragment')) {
416    $File = &handle_frag($q, $File);
417}
418elsif ($q->param('uri')) {
419    $File = &handle_uri($q, $File);
420}
421
422#
423# Abort if an error was flagged during initialization.
424&abort_if_error_flagged($File);
425
426#
427# Get rid of the CGI object.
428undef $q;
429
430#
431# We don't need STDIN any more, so get rid of it to avoid getting clobbered
432# by Apache::Registry's idiotic interference under mod_perl.
433untie *STDIN;
434
435###############################################################################
436#### Output validation results. ###############################################
437###############################################################################
438
439if (!$File->{ContentType} && !$File->{'Direct Input'} && !$File->{'Is Upload'})
440{
441    &add_warning('W08', {});
442}
443
444$File = find_encodings($File);
445
446#
447# Decide on a charset to use (first part)
448#
449if ($File->{Charset}->{HTTP}) {    # HTTP, if given, is authoritative.
450    $File->{Charset}->{Use} = $File->{Charset}->{HTTP};
451}
452elsif ($File->{ContentType} =~ m(^text/(?:[-.a-zA-Z0-9]\+)?xml$)) {
453
454    # Act as if $http_charset was 'us-ascii'. (MIME rules)
455    $File->{Charset}->{Use} = 'us-ascii';
456
457    &add_warning(
458        'W01',
459        {   W01_upload => $File->{'Is Upload'},
460            W01_agent  => $File->{Server},
461            W01_ct     => $File->{ContentType},
462        }
463    );
464
465}
466elsif ($File->{Charset}->{XML}) {
467    $File->{Charset}->{Use} = $File->{Charset}->{XML};
468}
469elsif ($File->{BOM} &&
470    $File->{BOM} == 2 &&
471    $File->{Charset}->{Auto} =~ /^utf-16[bl]e$/)
472{
473    $File->{Charset}->{Use} = 'utf-16';
474}
475elsif ($File->{ContentType} =~ m(^application/(?:[-.a-zA-Z0-9]+\+)?xml$)) {
476    $File->{Charset}->{Use} = "utf-8";
477}
478elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
479    $File->{Charset}->{Use} = 'utf-8';    # UTF-8 (image/svg+xml etc.)
480}
481$File->{Charset}->{Use} ||= $File->{Charset}->{META};
482
483#
484# Handle any Fallback or Override for the charset.
485if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
486
487    # charset=foo was given to the CGI and it wasn't "autodetect" or empty.
488    #
489    # Extract the user-requested charset from CGI param.
490    my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2);
491    $File->{Charset}->{Override} = lc($override);
492
493    if ($File->{Opt}->{FB}->{Charset}) {    # charset fallback mode
494        unless ($File->{Charset}->{Use})
495        {    # no charset detected, actual fallback
496            &add_warning('W02', {W02_charset => $File->{Charset}->{Override}});
497            $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
498            $File->{Charset}->{Use} = $File->{Charset}->{Override};
499        }
500    }
501    else {                                    # charset "hard override" mode
502        if (!$File->{Charset}->{Use}) {       # overriding "nothing"
503            &add_warning(
504                'W04',
505                {   W04_charset  => $File->{Charset}->{Override},
506                    W04_override => TRUE
507                }
508            );
509            $File->{Tentative} |= T_ERROR;
510            $File->{Charset}->{Use} = $File->{Charset}->{Override};
511        }
512        elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) {
513
514            # Actually overriding something; warn about override.
515            &add_warning(
516                'W03',
517                {   W03_use => $File->{Charset}->{Use},
518                    W03_opt => $File->{Charset}->{Override}
519                }
520            );
521            $File->{Tentative} |= T_ERROR;
522            $File->{Charset}->{Use} = $File->{Charset}->{Override};
523        }
524    }
525}
526
527if ($File->{'Direct Input'}) {    #explain why UTF-8 is forced
528    &add_warning('W28', {});
529}
530unless ($File->{Charset}->{XML} || $File->{Charset}->{META})
531{                                 #suggest character encoding info within doc
532    &add_warning('W27', {});
533}
534
535#
536# Abort if an error was flagged while finding the encoding.
537&abort_if_error_flagged($File);
538
539$File->{Charset}->{Default} = FALSE;
540unless ($File->{Charset}->{Use}) {    # No charset given...
541    $File->{Charset}->{Use}     = 'utf-8';
542    $File->{Charset}->{Default} = TRUE;
543    $File->{Tentative} |= T_ERROR;    # Can never be valid.
544    &add_warning('W04', {W04_charset => "UTF-8"});
545}
546
547# Always transcode, even if the content claims to be UTF-8
548$File = transcode($File);
549
550# Try guessing if it didn't work out
551if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) {
552    my $also_tried = 'UTF-8';
553    for my $cs (qw(windows-1252 iso-8859-1)) {
554        last unless $File->{'Error Flagged'};
555        $File->{'Error Flagged'} = FALSE;    # reset
556        $File->{Charset}->{Use} = $cs;
557        &add_warning('W04',
558            {W04_charset => $cs, W04_also_tried => $also_tried});
559        $File = transcode($File);
560        $also_tried .= ", $cs";
561    }
562}
563
564# if it still does not work, we abandon hope here
565&abort_if_error_flagged($File);
566
567#
568# Add a warning if doc is UTF-8 and contains a BOM.
569if ($File->{Charset}->{Use} eq 'utf-8' &&
570    @{$File->{Content}} &&
571    $File->{Content}->[0] =~ m(^\x{FEFF}))
572{
573    &add_warning('W21', {});
574}
575
576#
577# Overall parsing algorithm for documents returned as text/html:
578#
579# For documents that come to us as text/html,
580#
581#  1. check if there's a doctype
582#  2. if there is a doctype, parse/validate against that DTD
583#  3. if no doctype, check for an xmlns= attribute on the first element, or
584#     XML declaration
585#  4. if no doctype and XML mode, check for XML well-formedness
586#  5. otherwise, punt.
587#
588
589#
590# Override DOCTYPE if user asked for it.
591if ($File->{Opt}->{DOCTYPE}) {
592    if ($File->{Opt}->{DOCTYPE} !~ /(?:Inline|detect)/i) {
593        $File = &override_doctype($File);
594    }
595    else {
596
597        # Get rid of inline|detect for easy truth value checking later
598        $File->{Opt}->{DOCTYPE} = '';
599    }
600}
601
602# Try to extract a DOCTYPE or xmlns.
603$File = &preparse_doctype($File);
604
605if ($File->{Opt}->{DOCTYPE} eq "HTML5") {
606    $File->{DOCTYPE} = "HTML5";
607    $File->{Version} = $File->{DOCTYPE};
608}
609
610set_parse_mode($File, $CFG);
611
612#
613# Sanity check Charset information and add any warnings necessary.
614$File = &charset_conflicts($File);
615
616# before we start the parsing, clean slate
617$File->{'Is Valid'} = TRUE;
618$File->{Errors}     = [];
619$File->{WF_Errors}  = [];
620
621if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) {
622    if ($CFG->{External}->{HTML5}) {
623        $File = &html5_validate($File);
624        &add_warning(
625            'W00',
626            {   W00_experimental_name => "HTML5 Conformance Checker",
627                W00_experimental_URI  => "feedback.html"
628            }
629        );
630    }
631    else {
632        $File->{'Error Flagged'} = TRUE;
633        my $tmpl = &get_error_template($File);
634        $tmpl->param(fatal_no_checker      => TRUE);
635        $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
636    }
637}
638elsif (($File->{DOCTYPE} eq '') and
639    (($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1))
640{
641
642    # we send doctypeless SVG, or any doctypeless XML document with multiple
643    # namespaces found, to a different engine. WARNING this is experimental.
644    if ($CFG->{External}->{CompoundXML}) {
645        $File = &compoundxml_validate($File);
646        &add_warning(
647            'W00',
648            {   W00_experimental_name => "validator.nu Conformance Checker",
649                W00_experimental_URI  => "feedback.html"
650            }
651        );
652    }
653}
654else {
655    $File = &dtd_validate($File);
656}
657&abort_if_error_flagged($File);
658if (&is_xml($File)) {
659    if ($File->{DOCTYPE} eq "HTML5") {
660
661        # $File->{DOCTYPE} = "XHTML5";
662        # $File->{Version} = "XHTML5";
663    }
664    else {
665
666        # XMLWF check can be slow, skip if we already know the doc can't pass.
667        # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9899
668        $File = &xmlwf($File) if $File->{'Is Valid'};
669    }
670    &abort_if_error_flagged($File);
671}
672
673#
674# Force "XML" if type is an XML type and an FPI was not found.
675# Otherwise set the type to be the FPI.
676if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') {
677    $File->{Version} = 'XML';
678}
679else {
680    $File->{Version} ||= $File->{DOCTYPE};
681}
682
683#
684# Get the pretty text version of the FPI if a mapping exists.
685if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) {
686    $File->{Version} = $prettyver;
687}
688
689#
690# check the received mime type against Allowed mime types
691if ($File->{ContentType}) {
692    my @allowedMediaType =
693        split(/\s+/,
694        $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || '');
695    my $usedCTisAllowed;
696    if (scalar @allowedMediaType) {
697        $usedCTisAllowed = FALSE;
698        foreach (@allowedMediaType) {
699            $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType});
700        }
701    }
702    else {
703
704        # wedon't know what media type is recommended, so better shut up
705        $usedCTisAllowed = TRUE;
706    }
707    if (!$usedCTisAllowed) {
708        &add_warning(
709            'W23',
710            {   W23_type => $File->{ContentType},
711                W23_type_pref =>
712                    $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred},
713                w23_doctype => $File->{Version}
714            }
715        );
716    }
717}
718
719#
720# Warn about unknown, incorrect, or missing Namespaces.
721if ($File->{Namespace}) {
722    my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;
723
724    if (&is_xml($File)) {
725        if ($ns eq $File->{Namespace}) {
726            &add_warning(
727                'W10',
728                {   W10_ns   => $File->{Namespace},
729                    W10_type => $File->{Type},
730                }
731            );
732        }
733    }
734    elsif ($File->{DOCTYPE} ne 'HTML5') {
735        &add_warning(
736            'W11',
737            {   W11_ns      => $File->{Namespace},
738                w11_doctype => $File->{DOCTYPE}
739            }
740        );
741    }
742}
743else {
744    if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
745        &add_warning('W12', {});
746    }
747}
748
749## if invalid content, AND if requested, pass through tidy
750if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) {
751    eval {
752        local $SIG{__DIE__} = undef;
753        require HTML::Tidy;
754        my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}});
755        my $cleaned = $tidy->clean(join("\n", @{$File->{Content}}));
756        $cleaned = Encode::decode_utf8($cleaned);
757        $File->{Tidy} = $cleaned;
758    };
759    if ($@) {
760        (my $errmsg = $@) =~ s/ at .*//s;
761        &add_warning('W29', {W29_msg => $errmsg});
762    }
763}
764
765my %templates = (
766    earl => ['earl_xml.tmpl', default_escape => 'HTML'],
767    n3   => ['earl_n3.tmpl'],
768    json => ['json_output.tmpl'],
769    ucn  => ['ucn_output.tmpl'],
770);
771my $template = $templates{$File->{Opt}->{Output}};
772if ($template) {
773    my $tname = shift(@$template);
774    my $tmpl = &get_template($File, $tname, @$template);
775    $template = $tmpl;
776}
777elsif ($File->{Opt}->{Output} eq 'soap12') {
778    if ($CFG->{'Enable SOAP'} != 1) {
779
780        # API disabled - ideally this should have been sent before performing
781        # validation...
782        print CGI::header(
783            -status           => 503,
784            -content_language => "en",
785            -type             => "text/html",
786            -charset          => "utf-8"
787        );
788        $template = &get_template($File, 'soap_disabled.tmpl');
789    }
790    else {
791        $template = &get_template($File, 'soap_output.tmpl');
792    }
793}
794else {
795    $template = &get_template($File, 'result.tmpl');
796}
797
798&prep_template($File, $template);
799&fin_template($File, $template);
800
801$template->param(tidy_output   => $File->{Tidy});
802$template->param(file_source   => &source($File))
803    if ($template->param('opt_show_source') or
804    ($File->{'Is Upload'}) or
805    ($File->{'Direct Input'}));
806
807if ($File->{Opt}->{Output} eq 'json') {
808
809    # No JSON escaping in HTML::Template (and "JS" is not the right thing here)
810    my $json = JSON->new();
811    $json->allow_nonref(TRUE);
812    if (my $msgs = $template->param("file_errors")) {
813        for my $msg (@$msgs) {
814            for my $key (qw(msg expl)) {
815                $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key};
816            }
817
818            # Drop non-numeric char indicators from output, e.g.
819            # "> 80" for some XML parse error ones (see the non-structured
820            # XML::LibXML code branch in XML preparsing below).
821            if ($msg->{char} && $msg->{char} !~ /^\d+$/) {
822                delete($msg->{char});
823            }
824        }
825    }
826}
827
828# transcode output from perl's internal to utf-8 and output
829print Encode::encode('UTF-8', $template->output);
830
831#
832# Get rid of $File object and exit.
833undef $File;
834exit;
835
836#############################################################################
837# Subroutine definitions
838#############################################################################
839
840sub get_template ($$;@)
841{
842    my ($File, $fname, @opts) = @_;
843    if (!$File->{_Templates}->{$fname}) {
844        my $tmpl = HTML::Template->new(
845            %{$File->{Template_Defaults}},
846            filename => $fname,
847            @opts
848        );
849        $tmpl->param(env_home_page     => $File->{Env}->{'Home Page'});
850        $tmpl->param(validator_version => $VERSION);
851        $File->{_Templates}->{$fname} = $tmpl;
852    }
853    return $File->{_Templates}->{$fname};
854}
855
856sub get_error_template ($;@)
857{
858    my ($File, @opts) = @_;
859    my $fname = 'fatal-error.tmpl';
860    if ($File->{Opt}->{Output} eq 'soap12') {
861        $fname = 'soap_fault.tmpl';
862    }
863    elsif ($File->{Opt}->{Output} eq 'ucn') {
864        $fname = 'ucn_fault.tmpl';
865    }
866    return &get_template($File, $fname, @opts);
867}
868
869# TODO: need to bring in fixes from html5_validate() here
870sub compoundxml_validate (\$)
871{
872    my $File = shift;
873    my $ua = W3C::Validator::UserAgent->new($CFG, $File);
874
875    push(
876        @{$File->{Parsers}},
877        {   name    => "Compound XML",
878            link    => "http://qa-dev.w3.org/",    # TODO?
879            type    => "",
880            options => ""
881        }
882    );
883
884    my $url = URI->new($CFG->{External}->{CompoundXML});
885    $url->query("out=xml");
886
887    my $req = HTTP::Request->new(POST => $url);
888
889    if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
890
891        # Doctype or charset overridden, need to use $File->{Content} in UTF-8
892        # because $File->{Bytes} is not affected by the overrides.  This will
893        # most likely be a source of errors about internal/actual charset
894        # differences as long as our transcoding process does not "fix" the
895        # charset info in XML declaration and meta http-equiv (any others?).
896        if ($File->{'Direct Input'})
897        {    # sane default when using html5 validator by direct input
898            $req->content_type("application/xml; charset=UTF-8");
899        }
900        else {
901            $req->content_type("$File->{ContentType}; charset=UTF-8");
902        }
903        $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
904    }
905    else {
906
907        # Pass original bytes, Content-Type and charset as-is.
908        # We trust that our and validator.nu's interpretation of line numbers
909        # is the same (regardless of EOL chars used in the document).
910
911        my @content_type = ($File->{ContentType} => undef);
912        push(@content_type, charset => $File->{Charset}->{HTTP})
913            if $File->{Charset}->{HTTP};
914
915        $req->content_type(
916            HTTP::Headers::Util::join_header_words(@content_type));
917        $req->content_ref(\$File->{Bytes});
918    }
919
920    $req->content_language($File->{ContentLang}) if $File->{ContentLang};
921
922    # Intentionally using direct header access instead of $req->last_modified
923    $req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
924
925    # If not in debug mode, gzip the request (LWP >= 5.817)
926    eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug};
927
928    my $res = $ua->request($req);
929    if (!$res->is_success()) {
930        $File->{'Error Flagged'} = TRUE;
931        my $tmpl = &get_error_template($File);
932        $tmpl->param(fatal_no_checker      => TRUE);
933        $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
934        $tmpl->param(fatal_checker_error   => $res->status_line());
935    }
936    else {
937        my $content = &get_content($File, $res);
938        return $File if $File->{'Error Flagged'};
939
940        # and now we parse according to
941        # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
942        # I wish we could use XML::LibXML::Reader here. but SHAME on those
943        # major unix distributions still shipping with libxml2 2.6.16… 4 years
944        # after its release
945        # …and we could use now as we require libxml2 >= 2.6.21 anyway…
946        my $xml_reader = XML::LibXML->new();
947        $xml_reader->base_uri($res->base());
948
949        my $xmlDOM;
950        eval { $xmlDOM = $xml_reader->parse_string($content); };
951        if ($@) {
952            my $errmsg = $@;
953            $File->{'Error Flagged'} = TRUE;
954            my $tmpl = &get_error_template($File);
955            $tmpl->param(fatal_no_checker      => TRUE);
956            $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
957            $tmpl->param(fatal_checker_error   => $errmsg);
958            return $File;
959        }
960        my @nodelist      = $xmlDOM->getElementsByTagName("messages");
961        my $messages_node = $nodelist[0];
962        my @message_nodes = $messages_node->childNodes;
963        foreach my $message_node (@message_nodes) {
964            my $message_type = $message_node->localname;
965            my ($err, $xml_error_msg, $xml_error_expl);
966
967            if ($message_type eq "error") {
968                $err->{type} = "E";
969                $File->{'Is Valid'} = FALSE;
970            }
971            elsif ($message_type eq "info") {
972
973                # by default - we find warnings in the type attribute (below)
974                $err->{type} = "I";
975            }
976            if ($message_node->hasAttributes()) {
977                my @attributelist = $message_node->attributes();
978                foreach my $attribute (@attributelist) {
979                    if ($attribute->name eq "type") {
980                        if (($attribute->getValue() eq "warning") and
981                            ($message_type eq "info"))
982                        {
983                            $err->{type} = "W";
984                        }
985
986                    }
987                    if ($attribute->name eq "last-column") {
988                        $err->{char} = $attribute->getValue();
989                    }
990                    if ($attribute->name eq "last-line") {
991                        $err->{line} = $attribute->getValue();
992                    }
993
994                }
995            }
996            my @child_nodes = $message_node->childNodes;
997            foreach my $child_node (@child_nodes) {
998                if ($child_node->localname eq "message") {
999                    $xml_error_msg = $child_node->toString();
1000                    $xml_error_msg =~ s,</?[^>]*>,,gsi;
1001                }
1002                if ($child_node->localname eq "elaboration") {
1003                    $xml_error_expl = $child_node->toString();
1004                    $xml_error_expl =~ s,</?elaboration>,,gi;
1005                    $xml_error_expl =
1006                        "\n<div class=\"ve xml\">$xml_error_expl</div>\n";
1007                }
1008            }
1009
1010            # formatting the error message for output
1011            $err->{src}  = "" if $err->{uri};    # TODO...
1012            $err->{num}  = 'validator.nu';
1013            $err->{msg}  = $xml_error_msg;
1014            $err->{expl} = $xml_error_expl;
1015
1016            if ($err->{msg} =~
1017                /Using the preset for (.*) based on the root namespace/)
1018            {
1019                $File->{DOCTYPE} = $1;
1020            }
1021            else {
1022                push @{$File->{Errors}}, $err;
1023            }
1024
1025            # @@ TODO message explanation / elaboration
1026        }
1027    }
1028    return $File;
1029}
1030
1031sub html5_validate (\$)
1032{
1033    my $File = shift;
1034    my $ua = W3C::Validator::UserAgent->new($CFG, $File);
1035
1036    push(
1037        @{$File->{Parsers}},
1038        {   name    => "validator.nu",
1039            link    => "http://validator.nu/",
1040            type    => "HTML5",
1041            options => ""
1042        }
1043    );
1044
1045    my $url = URI->new($CFG->{External}->{HTML5});
1046    $url->query("out=xml");
1047
1048    my $req = HTTP::Request->new(POST => $url);
1049    my $ct = &is_xml($File) ? "application/xhtml+xml" : "text/html";
1050
1051    if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override} ||
1052        $File->{'Direct Input'})
1053    {
1054
1055        # Doctype or charset overridden, need to use $File->{Content} in UTF-8
1056        # because $File->{Bytes} is not affected by the  overrides.  Note that
1057        # direct input is always considered an override here.
1058
1059        &override_charset($File, "UTF-8");
1060
1061        $ct = $File->{ContentType} unless $File->{'Direct Input'};
1062        my @ct = ($ct => undef, charset => "UTF-8");
1063        $ct = HTTP::Headers::Util::join_header_words(@ct);
1064
1065        $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
1066    }
1067    else {
1068
1069        # Pass original bytes, Content-Type and charset as-is.
1070        # We trust that our and validator.nu's interpretation of line numbers
1071        # is the same later when displaying error contexts (regardless of EOL
1072        # chars used in the document).
1073
1074        my @ct = ($File->{ContentType} => undef);
1075        push(@ct, charset => $File->{Charset}->{HTTP})
1076            if $File->{Charset}->{HTTP};
1077        $ct = HTTP::Headers::Util::join_header_words(@ct);
1078
1079        $req->content_ref(\$File->{Bytes});
1080    }
1081    $req->content_type($ct);
1082
1083    $req->content_language($File->{ContentLang}) if $File->{ContentLang};
1084
1085    # Intentionally using direct header access instead of $req->last_modified
1086    # (the latter takes seconds since epoch, but $File->{Modified} is an already
1087    # formatted string).
1088    $req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
1089
1090    # Use gzip in non-debug, remote HTML5 validator mode (LWP >= 5.817).
1091    if (!$File->{Opt}->{Debug} &&
1092        $url->host() !~ /^(?:localhost|127(?:\.\d+){3}|.*\.localdomain)$/i)
1093    {
1094        eval { $req->encode("gzip"); };
1095    }
1096    else {
1097        $req->header('Accept-Encoding', 'identity');
1098    }
1099
1100    my $res = $ua->request($req);
1101    if (!$res->is_success()) {
1102        $File->{'Error Flagged'} = TRUE;
1103        my $tmpl = &get_error_template($File);
1104        $tmpl->param(fatal_no_checker      => TRUE);
1105        $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
1106        $tmpl->param(fatal_checker_error   => $res->status_line());
1107    }
1108    else {
1109        my $content = &get_content($File, $res);
1110        return $File if $File->{'Error Flagged'};
1111
1112        # and now we parse according to
1113        # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
1114        # I wish we could use XML::LibXML::Reader here. but SHAME on those
1115        # major unix distributions still shipping with libxml2 2.6.16… 4 years
1116        # after its release
1117        my $xml_reader = XML::LibXML->new();
1118        $xml_reader->base_uri($res->base());
1119
1120        my $xmlDOM;
1121        eval { $xmlDOM = $xml_reader->parse_string($content); };
1122        if ($@) {
1123            my $errmsg = $@;
1124            $File->{'Error Flagged'} = TRUE;
1125            my $tmpl = &get_error_template($File);
1126            $tmpl->param(fatal_no_checker      => TRUE);
1127            $tmpl->param(fatal_missing_checker => 'HTML5 Validator');
1128            $tmpl->param(fatal_checker_error   => $errmsg);
1129            return $File;
1130        }
1131        my @nodelist      = $xmlDOM->getElementsByTagName("messages");
1132        my $messages_node = $nodelist[0];
1133        my @message_nodes = $messages_node->childNodes;
1134        foreach my $message_node (@message_nodes) {
1135            my $message_type = $message_node->localname;
1136            my ($html5_error_msg, $html5_error_expl);
1137            my $err = {};
1138
1139            # TODO: non-document errors should receive different/better
1140            # treatment, but this is better than hiding all problems for now
1141            # (#6747)
1142            if ($message_type eq "error" ||
1143                $message_type eq "non-document-error")
1144            {
1145                $err->{type} = "E";
1146                $File->{'Is Valid'} = FALSE;
1147            }
1148            elsif ($message_type eq "info") {
1149
1150                # by default - we find warnings in the type attribute (below)
1151                $err->{type} = "I";
1152            }
1153            if ($message_node->hasAttributes()) {
1154                my @attributelist = $message_node->attributes();
1155                foreach my $attribute (@attributelist) {
1156                    if ($attribute->name eq "type") {
1157                        if (($attribute->getValue() eq "warning") and
1158                            ($message_type eq "info"))
1159                        {
1160                            $err->{type} = "W";
1161                        }
1162
1163                    }
1164                    elsif ($attribute->name eq "last-column") {
1165                        $err->{char} = $attribute->getValue();
1166                    }
1167                    elsif ($attribute->name eq "last-line") {
1168                        $err->{line} = $attribute->getValue();
1169                    }
1170                    elsif ($attribute->name eq "url") {
1171                        &set_error_uri($err, $attribute->getValue());
1172                    }
1173                }
1174            }
1175            my @child_nodes = $message_node->childNodes;
1176            foreach my $child_node (@child_nodes) {
1177                if ($child_node->localname eq "message") {
1178                    $html5_error_msg = $child_node->textContent();
1179                }
1180                elsif ($child_node->localname eq "elaboration") {
1181                    $html5_error_expl = $child_node->toString();
1182                    $html5_error_expl =~ s,</?elaboration>,,gi;
1183                    $html5_error_expl =
1184                        "\n<div class=\"ve html5\">$html5_error_expl</div>\n";
1185                }
1186            }
1187
1188            # formatting the error message for output
1189
1190            # TODO: set $err->{src} from extract if we got an URI for the error:
1191            # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output#The_extract_Element
1192            # For now, set it directly to empty to prevent report_errors() from
1193            # trying to populate it from our doc.
1194            $err->{src} = "" if $err->{uri};
1195
1196            $err->{num}  = 'html5';
1197            $err->{msg}  = $html5_error_msg;
1198            $err->{expl} = $html5_error_expl;
1199            push @{$File->{Errors}}, $err;
1200
1201            # @@ TODO message explanation / elaboration
1202        }
1203    }
1204    return $File;
1205}
1206
1207sub dtd_validate (\$)
1208{
1209    my $File   = shift;
1210    my $opensp = SGML::Parser::OpenSP->new();
1211
1212    #
1213    # By default, use SGML catalog file and SGML Declaration.
1214    my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc');
1215
1216    # default parsing options
1217    my @spopt = qw(valid non-sgml-char-ref no-duplicate);
1218
1219    #
1220    # Switch to XML semantics if file is XML.
1221    if (&is_xml($File)) {
1222        $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc');
1223        push(@spopt, 'xml');
1224    }
1225    else {
1226
1227        # add warnings for shorttags
1228        push(@spopt, 'min-tag');
1229    }
1230
1231    push(
1232        @{$File->{Parsers}},
1233        {   name    => "OpenSP",
1234            link    => "http://openjade.sourceforge.net/",
1235            type    => "SGML/XML",
1236            options => join(" ", @spopt)
1237        }
1238    );
1239
1240    #
1241    # Parser configuration
1242    $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
1243    $opensp->catalogs($catalog);
1244    $opensp->show_error_numbers(1);
1245    $opensp->warnings(@spopt);
1246
1247    #
1248    # Restricted file reading is disabled on Win32 for the time
1249    # being since neither SGML::Parser::OpenSP nor check auto-
1250    # magically set search_dirs to include the temp directory
1251    # so restricted file reading would defunct the Validator.
1252    $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32';
1253
1254    my $h;    # event handler
1255    if ($File->{Opt}->{Outline}) {
1256        $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG);
1257    }
1258    else {
1259        $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG);
1260    }
1261
1262    $opensp->handler($h);
1263    $opensp->parse_string(join "\n", @{$File->{Content}});
1264
1265    # Make sure there are no circular references, otherwise the script
1266    # would leak memory until mod_perl unloads it which could take some
1267    # time. @@FIXME It's probably overly careful though.
1268    $opensp->handler(undef);
1269    undef $h->{_parser};
1270    undef $h->{_file};
1271    undef $h;
1272    undef $opensp;
1273
1274    #
1275    # Set Version to be the FPI initially.
1276    $File->{Version} = $File->{DOCTYPE};
1277    return $File;
1278}
1279
1280sub xmlwf (\$)
1281{
1282
1283    # we should really be using a SAX ErrorHandler, but I can't find a way to
1284    # make it work with XML::LibXML::SAX::Parser... ** FIXME **
1285    # ditto, we should try using W3C::Validator::EventHandler, but it's badly
1286    # linked to opensp at the moment
1287
1288    my $File      = shift;
1289    my $xmlparser = XML::LibXML->new();
1290    $xmlparser->line_numbers(1);
1291    $xmlparser->validation(0);
1292    $xmlparser->base_uri($File->{URI})
1293        unless ($File->{'Direct Input'} || $File->{'Is Upload'});
1294
1295    push(
1296        @{$File->{Parsers}},
1297        {   name    => "libxml2",
1298            link    => "http://xmlsoft.org/",
1299            type    => "XML",
1300            options => ""
1301        }
1302    );
1303
1304    # Restrict file reading similar to what SGML::Parser::OpenSP does.  Note
1305    # that all inputs go through the callback so if we were passing a
1306    # URI/filename to the parser, it would be affected as well and would break
1307    # fetching the initial document.  As long as we pass the doc as string,
1308    # this should work.
1309    my $cb = XML::LibXML::InputCallback->new();
1310    $cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]);
1311    $xmlparser->input_callbacks($cb);
1312
1313    &override_charset($File, "UTF-8");
1314
1315    eval { $xmlparser->parse_string(join("\n", @{$File->{Content}})); };
1316
1317    if (ref($@)) {
1318
1319        # handle a structured error (XML::LibXML::Error object)
1320
1321        my $err_obj = $@;
1322        while ($err_obj) {
1323            my $err = {};
1324            &set_error_uri($err, $err_obj->file());
1325            $err->{src}  = &ent($err_obj->context()) if $err->{uri};
1326            $err->{line} = $err_obj->line();
1327            $err->{char} = $err_obj->column();
1328            $err->{num}  = "libxml2-" . $err_obj->code();
1329            $err->{type} = "E";
1330            $err->{msg}  = $err_obj->message();
1331
1332            $err_obj = $err_obj->_prev();
1333
1334            unshift(@{$File->{WF_Errors}}, $err);
1335        }
1336    }
1337    elsif ($@) {
1338        my $xmlwf_errors      = $@;
1339        my $xmlwf_error_line  = undef;
1340        my $xmlwf_error_col   = undef;
1341        my $xmlwf_error_msg   = undef;
1342        my $got_error_message = undef;
1343        my $got_quoted_line   = undef;
1344        foreach my $msg_line (split "\n", $xmlwf_errors) {
1345
1346            $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g;
1347            $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{};
1348
1349            # first we get the actual error message
1350            if (!$got_error_message &&
1351                $msg_line =~ /^(:\d+:)( parser error : .*)/)
1352            {
1353                $xmlwf_error_line = $1;
1354                $xmlwf_error_msg  = $2;
1355                $xmlwf_error_line =~ s/:(\d+):/$1/;
1356                $xmlwf_error_msg  =~ s/ parser error :/XML Parsing Error: /;
1357                $got_error_message = 1;
1358            }
1359
1360            # then we skip the second line, which shows the context
1361            # (we don't use that)
1362            elsif ($got_error_message && !$got_quoted_line) {
1363                $got_quoted_line = 1;
1364            }
1365
1366            # we now take the third line, with the pointer to the error's
1367            # column
1368            elsif (($msg_line =~ /(\s+)\^/) and
1369                $got_error_message and
1370                $got_quoted_line)
1371            {
1372                $xmlwf_error_col = length($1);
1373            }
1374
1375            #  cleanup for a number of bugs for the column number
1376            if (defined($xmlwf_error_col)) {
1377                if ((   my $l =
1378                        length($File->{Content}->[$xmlwf_error_line - 1])
1379                    ) < $xmlwf_error_col
1380                    )
1381                {
1382
1383                    # http://bugzilla.gnome.org/show_bug.cgi?id=434196
1384                    #warn("Warning: reported error column larger than line length " .
1385                    #     "($xmlwf_error_col > $l) in $File->{URI} line " .
1386                    #     "$xmlwf_error_line, libxml2 bug? Resetting to line length.");
1387                    $xmlwf_error_col = $l;
1388                }
1389                elsif ($xmlwf_error_col == 79) {
1390
1391                    # working around an apparent odd limitation of libxml which
1392                    # only gives context for lines up to 80 chars
1393                    # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420
1394                    # http://bugzilla.gnome.org/show_bug.cgi?id=424017
1395                    $xmlwf_error_col = "> 80";
1396
1397                    # non-int line number will trigger the proper behavior in
1398                    # report_error
1399                }
1400            }
1401
1402            # when we have all the info (one full error message), proceed
1403            # and move on to the next error
1404            if ((defined $xmlwf_error_line) and
1405                (defined $xmlwf_error_col) and
1406                (defined $xmlwf_error_msg))
1407            {
1408
1409                # Reinitializing for the next batch of 3 lines
1410                $got_error_message = undef;
1411                $got_quoted_line   = undef;
1412
1413                # formatting the error message for output
1414                my $err = {};
1415
1416                # TODO: set_error_uri() (need test case)
1417                $err->{src}  = "" if $err->{uri};    # TODO...
1418                $err->{line} = $xmlwf_error_line;
1419                $err->{char} = $xmlwf_error_col;
1420                $err->{num}  = 'xmlwf';
1421                $err->{type} = "E";
1422                $err->{msg}  = $xmlwf_error_msg;
1423
1424                push(@{$File->{WF_Errors}}, $err);
1425                $xmlwf_error_line = undef;
1426                $xmlwf_error_col  = undef;
1427                $xmlwf_error_msg  = undef;
1428            }
1429        }
1430    }
1431
1432    $File->{'Is Valid'} = FALSE if @{$File->{WF_Errors}};
1433    return $File;
1434}
1435
1436#
1437# Generate HTML report.
1438sub prep_template ($$)
1439{
1440    my $File = shift;
1441    my $T    = shift;
1442
1443    #
1444    # XML mode...
1445    $T->param(is_xml => &is_xml($File));
1446
1447    #
1448    # Upload?
1449    $T->param(is_upload => $File->{'Is Upload'});
1450
1451    #
1452    # Direct Input?
1453    $T->param(is_direct_input => $File->{'Direct Input'});
1454
1455    #
1456    # The URI...
1457    $T->param(file_uri => $File->{URI});
1458
1459    #
1460    # HTTPS note?
1461    $T->param(file_https_note => $File->{'Is Upload'} ||
1462            $File->{'Direct Input'} ||
1463            URI->new($File->{URI})->secure());
1464
1465    #
1466    # Set URL for page title.
1467    $T->param(page_title_url => $File->{URI});
1468
1469    #
1470    # Metadata...
1471    $T->param(file_modified    => $File->{Modified});
1472    $T->param(file_server      => $File->{Server});
1473    $T->param(file_size        => $File->{Size});
1474    $T->param(file_contenttype => $File->{ContentType});
1475    $T->param(file_charset     => $File->{Charset}->{Use});
1476    $T->param(file_doctype     => $File->{DOCTYPE});
1477
1478    #
1479    # Output options...
1480    $T->param(opt_show_source  => $File->{Opt}->{'Show Source'});
1481    $T->param(opt_show_tidy    => $File->{Opt}->{'Show Tidy'});
1482    $T->param(opt_show_outline => $File->{Opt}->{Outline});
1483    $T->param(opt_verbose      => $File->{Opt}->{Verbose});
1484    $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'});
1485    $T->param(opt_no200        => $File->{Opt}->{No200});
1486
1487    # Root Element
1488    $T->param(root_element => $File->{Root});
1489
1490    # Namespaces...
1491    $T->param(file_namespace => $File->{Namespace});
1492
1493    # Non-root ones; unique, preserving occurrence order
1494    my %seen_ns = ();
1495    $seen_ns{$File->{Namespace}}++ if defined($File->{Namespace});
1496    my @nss =
1497        map { $seen_ns{$_}++ == 0 ? {uri => $_} : () } @{$File->{Namespaces}};
1498    $T->param(file_namespaces => \@nss) if @nss;
1499
1500    if ($File->{Opt}->{DOCTYPE}) {
1501        my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}";
1502        $T->param($over_doctype_param => TRUE);
1503    }
1504
1505    if ($File->{Opt}->{Charset}) {
1506        my $over_charset_param = "override charset $File->{Opt}->{Charset}";
1507        $T->param($over_charset_param => TRUE);
1508    }
1509
1510    # Allow content-negotiation
1511    if ($File->{Opt}->{'Accept Header'}) {
1512        $T->param('accept' => $File->{Opt}->{'Accept Header'});
1513    }
1514    if ($File->{Opt}->{'Accept-Language Header'}) {
1515        $T->param(
1516            'accept-language' => $File->{Opt}->{'Accept-Language Header'});
1517    }
1518    if ($File->{Opt}->{'Accept-Charset Header'}) {
1519        $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'});
1520    }
1521    if ($File->{Opt}->{'User Agent'}) {
1522        $T->param('user-agent' => $File->{Opt}->{'User Agent'});
1523    }
1524    if ($File->{'Error Flagged'}) {
1525        $T->param(fatal_error => TRUE);
1526    }
1527}
1528
1529sub fin_template ($$)
1530{
1531    my $File = shift;
1532    my $T    = shift;
1533
1534    #
1535    # Set debug info for HTML and SOAP reports.
1536    if ($DEBUG) {
1537        my @parsers;
1538        for my $parser (@{$File->{Parsers}}) {
1539            my $p = $parser->{name};
1540            $p .= " (" . $parser->{options} . ")" if $parser->{options};
1541            push(@parsers, $p);
1542        }
1543        $T->param(
1544            debug => [
1545                map({name => $_, value => $ENV{$_}},
1546                    qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
1547                {name => 'Content-Encoding',  value => $File->{ContentEnc}},
1548                {name => 'Content-Language',  value => $File->{ContentLang}},
1549                {name => 'Content-Location',  value => $File->{ContentLoc}},
1550                {name => 'Transfer-Encoding', value => $File->{TransferEnc}},
1551                {name => 'Parse Mode',        value => $File->{Mode}},
1552                {name => 'Parse Mode Factor', value => $File->{ModeChoice}},
1553                {name => 'Parsers Used',      value => join(", ", @parsers)},
1554            ],
1555        );
1556    }
1557
1558    $T->param(parsers => $File->{Parsers});
1559
1560    if (!$File->{Doctype} &&
1561        (!$File->{Version} ||
1562            $File->{Version} eq 'unknown' ||
1563            $File->{Version} eq 'SGML')
1564        )
1565    {
1566        my $default_doctype =
1567            $File->{"Default DOCTYPE"}->{&is_xml($File) ? "XHTML" : "HTML"};
1568        $T->param(file_version => "$default_doctype");
1569    }
1570    else {
1571        $T->param(file_version => $File->{Version});
1572    }
1573    my ($num_errors, $num_warnings, $num_info, $reported_errors) =
1574        &report_errors($File);
1575    if ($num_errors + $num_warnings > 0) {
1576        $T->param(has_errors => 1);
1577    }
1578    $T->param(valid_errors_num => $num_errors);
1579    $num_warnings += scalar @{$File->{Warnings}};
1580    $T->param(valid_warnings_num => $num_warnings);
1581    my $number_of_errors   = "";    # textual form of $num_errors
1582    my $number_of_warnings = "";    # textual form of $num_errors
1583
1584    # The following is a bit hack-ish, but will enable us to have some logic
1585    # for a human-readable display of the number, with cases for 0, 1, 2 and
1586    # above (the case of 2 appears to be useful for localization in some
1587    # languages where the plural is different for 2, and above)
1588
1589    if ($num_errors > 1) {
1590        $T->param(number_of_errors_is_0 => FALSE);
1591        $T->param(number_of_errors_is_1 => FALSE);
1592        if ($num_errors == 2) {
1593            $T->param(number_of_errors_is_2 => TRUE);
1594        }
1595        else {
1596            $T->param(number_of_errors_is_2 => FALSE);
1597        }
1598        $T->param(number_of_errors_is_plural => TRUE);
1599    }
1600    elsif ($num_errors == 1) {
1601        $T->param(number_of_errors_is_0      => FALSE);
1602        $T->param(number_of_errors_is_1      => TRUE);
1603        $T->param(number_of_errors_is_2      => FALSE);
1604        $T->param(number_of_errors_is_plural => FALSE);
1605    }
1606    else {    # 0
1607        $T->param(number_of_errors_is_0      => TRUE);
1608        $T->param(number_of_errors_is_1      => FALSE);
1609        $T->param(number_of_errors_is_2      => FALSE);
1610        $T->param(number_of_errors_is_plural => FALSE);
1611    }
1612
1613    if ($num_warnings > 1) {
1614        $T->param(number_of_warnings_is_0 => FALSE);
1615        $T->param(number_of_warnings_is_1 => FALSE);
1616        if ($num_warnings == 2) {
1617            $T->param(number_of_warnings_is_2 => TRUE);
1618        }
1619        else {
1620            $T->param(number_of_warnings_is_2 => FALSE);
1621        }
1622        $T->param(number_of_warnings_is_plural => TRUE);
1623    }
1624    elsif ($num_warnings == 1) {
1625        $T->param(number_of_warnings_is_0      => FALSE);
1626        $T->param(number_of_warnings_is_1      => TRUE);
1627        $T->param(number_of_warnings_is_2      => FALSE);
1628        $T->param(number_of_warnings_is_plural => FALSE);
1629    }
1630    else {    # 0
1631        $T->param(number_of_warnings_is_0      => TRUE);
1632        $T->param(number_of_warnings_is_1      => FALSE);
1633        $T->param(number_of_warnings_is_2      => FALSE);
1634        $T->param(number_of_warnings_is_plural => FALSE);
1635    }
1636
1637    $T->param(file_outline => $File->{heading_outline})
1638        if $File->{Opt}->{Outline};
1639
1640    $T->param(file_errors => $reported_errors);
1641    if ($File->{'Is Valid'}) {
1642        $T->param(VALID        => TRUE);
1643        $T->param(valid_status => 'Valid');
1644        &report_valid($File, $T);
1645    }
1646    else {
1647        $T->param(VALID        => FALSE);
1648        $T->param(valid_status => 'Invalid');
1649    }
1650}
1651
1652#
1653# Output "This page is Valid" report.
1654sub report_valid
1655{
1656    my $File = shift;
1657    my $T    = shift;
1658
1659    unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) {
1660
1661        if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) {
1662            my $cfg = $CFG->{Types}->{$File->{DOCTYPE}};
1663            $T->param(badge_uri           => $cfg->{Badge}->{URI});
1664            $T->param(local_badge_uri     => $cfg->{Badge}->{'Local URI'});
1665            $T->param(badge_alt_uri       => $cfg->{Badge}->{'Alt URI'});
1666            $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'});
1667            $T->param(badge_alt           => $cfg->{Badge}->{Alt});
1668            $T->param(badge_rdfa          => $cfg->{Badge}->{RDFa});
1669            $T->param(badge_h             => $cfg->{Badge}->{Height});
1670            $T->param(badge_w             => $cfg->{Badge}->{Width});
1671            $T->param(badge_onclick       => $cfg->{Badge}->{OnClick});
1672            $T->param(badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : '');
1673        }
1674    }
1675    elsif (defined $File->{Tentative}) {
1676        $T->param(is_tentative => TRUE);
1677    }
1678
1679    if ($File->{XMLWF_ONLY}) {
1680        $T->param(xmlwf_only => TRUE);
1681    }
1682    my $thispage = self_url_file($File);
1683    $T->param(file_thispage => $thispage);
1684}
1685
1686#
1687# Add a warning message to the output.
1688sub add_warning ($$)
1689{
1690    my $WID    = shift;
1691    my $params = shift;
1692
1693    push @{$File->{Warnings}}, $WID;
1694
1695    my %tmplparams = (
1696        $WID          => TRUE,
1697        have_warnings => TRUE,
1698        %$params,
1699    );
1700    for my $tmpl (qw(result fatal-error soap_output ucn_output)) {
1701        &get_template($File, "$tmpl.tmpl")->param(%tmplparams);
1702    }
1703}
1704
1705#
1706# Proxy authentication requests.
1707# Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth).
1708sub authenticate
1709{
1710    my $File       = shift;
1711    my $resource   = shift;
1712    my $authHeader = shift || {};
1713
1714    my $realm = $resource;
1715    $realm =~ s([^\w\d.-]*){}g;
1716
1717    while (my ($scheme, $header) = each %$authHeader) {
1718        my $origrealm = $header->{realm};
1719        if (not defined $origrealm or $scheme !~ /^(?:basic|digest)$/i) {
1720            delete($authHeader->{$scheme});
1721            next;
1722        }
1723        $header->{realm} = "$realm-$origrealm";
1724    }
1725
1726    my $headers = HTTP::Headers->new(Connection => 'close');
1727    $headers->www_authenticate(%$authHeader);
1728    $headers = $headers->as_string();
1729    chomp($headers);
1730
1731    my $tmpl = &get_template($File, 'http_401_authrequired.tmpl');
1732    $tmpl->param(http_401_headers => $headers);
1733    $tmpl->param(http_401_url     => $resource);
1734
1735    print Encode::encode('UTF-8', $tmpl->output);
1736    exit;    # Further interaction will be a new HTTP request.
1737}
1738
1739#
1740# Fetch an URL and return the content and selected meta-info.
1741sub handle_uri
1742{
1743    my $q    = shift;    # The CGI object.
1744    my $File = shift;    # The master datastructure.
1745
1746    my $ua = W3C::Validator::UserAgent->new($CFG, $File);
1747
1748    my $uri = URI->new(ref $q ? $q->param('uri') : $q)->canonical();
1749    $uri->fragment(undef);
1750
1751    if (!$uri->scheme()) {
1752        local $ENV{URL_GUESS_PATTERN} = '';
1753        my $guess = URI::Heuristic::uf_uri($uri);
1754        if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
1755            $uri = $guess;
1756        }
1757        else {
1758            $uri = URI->new("http://$uri");
1759        }
1760    }
1761
1762    unless ($ua->is_protocol_supported($uri)) {
1763        $File->{'Error Flagged'} = TRUE;
1764        my $tmpl = &get_error_template($File);
1765
1766        # If uri param is empty (also for empty direct or upload), it's been
1767        # set to TRUE in sub prepCGI()
1768        if ($uri->canonical() eq "1") {
1769            $tmpl->param(fatal_no_content => TRUE);
1770        }
1771        else {
1772            $tmpl->param(fatal_uri_error  => TRUE);
1773            $tmpl->param(fatal_uri_scheme => $uri->scheme());
1774        }
1775        return $File;
1776    }
1777
1778    return $File unless $ua->uri_ok($uri);
1779
1780    my $req = HTTP::Request->new(GET => $uri);
1781
1782    # if one wants to use the accept, accept-charset and accept-language params
1783    # in order to trigger specific negotiation
1784    if ($File->{Opt}->{'Accept Header'}) {
1785        $req->header(Accept => $File->{Opt}->{'Accept Header'});
1786    }
1787    if ($File->{Opt}->{'Accept-Language Header'}) {
1788        $req->header(
1789            Accept_Language => $File->{Opt}->{'Accept-Language Header'});
1790    }
1791    if ($File->{Opt}->{'Accept-Charset Header'}) {
1792        $req->header(
1793            Accept_Charset => $File->{Opt}->{'Accept-Charset Header'});
1794    }
1795
1796    # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
1797    # If we're under mod_perl, there is a way around it...
1798    my $http_auth = $ENV{HTTP_AUTHORIZATION};
1799    eval {
1800        local $SIG{__DIE__} = undef;
1801        my $auth =
1802            Apache2::RequestUtil->request()->headers_in()->{Authorization};
1803        $http_auth = $auth if $auth;
1804    } if (IS_MODPERL2() && !$http_auth);
1805
1806    # If we got a Authorization header, the client is back at it after being
1807    # prompted for a password so we insert the header as is in the request.
1808    $req->headers->header(Authorization => $http_auth) if $http_auth;
1809
1810    my $res = $ua->request($req);
1811
1812    return $File if $File->{'Error Flagged'};    # Redirect IP rejected?
1813
1814    unless ($res->code == 200 or $File->{Opt}->{'No200'}) {
1815        if ($res->code == 401) {
1816            my %auth = $res->www_authenticate();    # HTTP::Headers::Auth
1817            &authenticate($File, $res->request->uri, \%auth);
1818        }
1819        else {
1820            $File->{'Error Flagged'} = TRUE;
1821
1822            my $no200url = undef;
1823            if (!$File->{Opt}->{No200}) {
1824
1825                # $File->{URI} not set yet; setting it non-local has side
1826                # effects
1827                local $File->{URI} = $uri->as_string;
1828                local $File->{Opt}->{No200} = TRUE;
1829                $no200url = &self_url_file($File);
1830            }
1831
1832            my $warning = $res->header("Client-Warning");
1833            if ($warning && $warning =~ /Internal response/i) {
1834
1835                # Response doc generated internally by LWP, no need to show
1836                # that info nor to provide error doc validation link to it.
1837                $warning  = undef;
1838                $no200url = undef;
1839            }
1840
1841            my $tmpl = &get_error_template($File);
1842            $tmpl->param(fatal_http_error => TRUE);
1843            $tmpl->param(fatal_http_uri   => $uri->as_string);
1844            $tmpl->param(fatal_http_code  => $res->code);
1845            $tmpl->param(fatal_http_msg   => $res->message);
1846            $tmpl->param(fatal_http_warn  => $warning);
1847            $tmpl->param(fatal_http_no200 => $no200url);
1848            $tmpl->param(fatal_http_dns   => TRUE) if ($res->code == 500);
1849        }
1850
1851        return $File;
1852    }
1853
1854    #
1855    # Enforce Max Recursion level.
1856    &check_recursion($File, $res);
1857
1858    my ($mode, $ct, $charset) = &parse_content_type(
1859        $File,
1860        scalar($res->header('Content-Type')),
1861        scalar($res->request->uri),
1862    );
1863
1864    my $content = &get_content($File, $res);
1865    return $File if $File->{'Error Flagged'};
1866
1867    $File->{Bytes}           = $content;
1868    $File->{Mode}            = $mode;
1869    $File->{ContentType}     = $ct;
1870    $File->{ContentEnc}      = $res->content_encoding;
1871    $File->{ContentLang}     = $res->content_language;
1872    $File->{ContentLoc}      = $res->header('Content-Location');
1873    $File->{TransferEnc}     = $res->header('Client-Transfer-Encoding');
1874    $File->{Charset}->{HTTP} = lc $charset if defined $charset;
1875    $File->{Modified}        = $res->header('Last-Modified');
1876    $File->{Server}          = scalar $res->server;
1877
1878    # TODO: Content-Length is not always set, so either this should
1879    # be renamed to 'Content-Length' or it should consider more than
1880    # the Content-Length header.
1881    $File->{Size}           = scalar $res->content_length;
1882    $File->{URI}            = scalar $res->request->uri->canonical;
1883    $File->{'Is Upload'}    = FALSE;
1884    $File->{'Direct Input'} = FALSE;
1885
1886    return $File;
1887}
1888
1889#
1890# Handle uploaded file and return the content and selected meta-info.
1891sub handle_file
1892{
1893    my $q    = shift;    # The CGI object.
1894    my $File = shift;    # The master datastructure.
1895
1896    my $p = $q->param('uploaded_file');
1897    my $f = $q->upload('uploaded_file');
1898    if (!defined($f)) {
1899
1900        # Probably not an uploaded file as far as CGI is concerned,
1901        # treat as a fragment.
1902        $q->param('fragment', $p);
1903        return &handle_frag($q, $File);
1904    }
1905
1906    my $h = $q->uploadInfo($p);
1907
1908    local $/ = undef;    # set line delimiter so that <> reads rest of file
1909    my $file = <$f>;
1910
1911    my ($mode, $ct, $charset) =
1912        &parse_content_type($File, $h->{'Content-Type'});
1913
1914    $File->{Bytes}           = $file;
1915    $File->{Mode}            = $mode;
1916    $File->{ContentType}     = $ct;
1917    $File->{Charset}->{HTTP} = lc $charset if defined $charset;
1918    $File->{Modified}        = $q->http('Last-Modified');
1919    $File->{Server}          = $q->http('User-Agent');   # Fake a "server". :-)
1920    $File->{Size}           = $q->http('Content-Length');
1921    $File->{URI}            = "$p";
1922    $File->{'Is Upload'}    = TRUE;
1923    $File->{'Direct Input'} = FALSE;
1924
1925    return $File;
1926}
1927
1928#
1929# Handle uploaded file and return the content and selected meta-info.
1930sub handle_frag
1931{
1932    my $q    = shift;    # The CGI object.
1933    my $File = shift;    # The master datastructure.
1934
1935    $File->{Bytes}          = $q->param('fragment');
1936    $File->{Mode}           = 'TBD';
1937    $File->{Modified}       = '';
1938    $File->{Server}         = '';
1939    $File->{Size}           = '';
1940    $File->{ContentType}    = '';                           # @@TODO?
1941    $File->{URI}            = 'upload://Form Submission';
1942    $File->{'Is Upload'}    = FALSE;
1943    $File->{'Direct Input'} = TRUE;
1944    $File->{Charset}->{HTTP} =
1945        "utf-8";    # by default, the form accepts utf-8 chars
1946
1947    if ($File->{Opt}->{Prefill}) {
1948
1949        # we surround the HTML fragment with some basic document structure
1950        my $prefill_Template;
1951        if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') {
1952            $prefill_Template = &get_template($File, 'prefill_html401.tmpl');
1953        }
1954        else {
1955            $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl');
1956        }
1957        $prefill_Template->param(fragment => $File->{Bytes});
1958        $File->{Bytes} = $prefill_Template->output();
1959
1960        # Let's force the view source so that the user knows what we've put
1961        # around their code.
1962        $File->{Opt}->{'Show Source'} = TRUE;
1963
1964        # Ignore doctype overrides (#5132).
1965        $File->{Opt}->{DOCTYPE} = 'Inline';
1966    }
1967
1968    return $File;
1969}
1970
1971#
1972# Parse a Content-Type and parameters. Return document type and charset.
1973sub parse_content_type
1974{
1975    my $File         = shift;
1976    my $Content_Type = shift;
1977    my $url          = shift;
1978    my $charset      = '';
1979
1980    my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g;
1981
1982    my $mode = $CFG->{MIME}->{$ct} || $ct;
1983
1984    $charset = HTML::Encoding::encoding_from_content_type($Content_Type);
1985
1986    if (index($mode, '/') != -1) {   # a "/" means it's unknown or we'd have a mode here.
1987        if ($ct eq 'text/css' and defined $url) {
1988            print redirect
1989                'http://jigsaw.w3.org/css-validator/validator?uri=' .
1990                uri_escape $url;
1991            exit;
1992        }
1993        elsif ($ct eq 'application/atom+xml' and defined $url) {
1994            print redirect 'http://validator.w3.org/feed/check.cgi?url=' .
1995                uri_escape $url;
1996            exit;
1997        }
1998        elsif ($ct =~ m(^application/.+\+xml$)) {
1999
2000            # unknown media types which should be XML - we give these a try
2001            $mode = "XML";
2002        }
2003        else {
2004            $File->{'Error Flagged'} = TRUE;
2005            my $tmpl = &get_error_template($File);
2006            $tmpl->param(fatal_mime_error => TRUE);
2007            $tmpl->param(fatal_mime_ct    => $ct);
2008        }
2009    }
2010
2011    return $mode, $ct, $charset;
2012}
2013
2014#
2015# Get content with Content-Encodings decoded from a response.
2016sub get_content ($$)
2017{
2018    my $File = shift;
2019    my $res  = shift;
2020
2021    my $content;
2022    eval {
2023        $content = $res->decoded_content(charset => 'none', raise_error => 1);
2024    };
2025    if ($@) {
2026        (my $errmsg = $@) =~ s/ at .*//s;
2027        my $cenc = $res->header("Content-Encoding");
2028        my $uri  = $res->request->uri;
2029        $File->{'Error Flagged'} = TRUE;
2030        my $tmpl = &get_error_template($File);
2031        $tmpl->param(fatal_decode_error  => TRUE);
2032        $tmpl->param(fatal_decode_errmsg => $errmsg);
2033        $tmpl->param(fatal_decode_cenc   => $cenc);
2034
2035        # Include URI because it might be a subsystem (eg. HTML5 validator) one
2036        $tmpl->param(fatal_decode_uri => $uri);
2037    }
2038
2039    return $content;
2040}
2041
2042#
2043# Check recursion level and enforce Max Recursion limit.
2044sub check_recursion ($$)
2045{
2046    my $File = shift;
2047    my $res  = shift;
2048
2049    # Not looking at our own output.
2050    return unless defined $res->header('X-W3C-Validator-Recursion');
2051
2052    my $lvl = $res->header('X-W3C-Validator-Recursion');
2053    return unless $lvl =~ m/^\d+$/;    # Non-digit, i.e. garbage, ignore.
2054
2055    if ($lvl >= $CFG->{'Max Recursion'}) {
2056        print redirect $File->{Env}->{'Home Page'};
2057    }
2058    else {
2059
2060        # Increase recursion level in output.
2061        &get_template($File, 'result.tmpl')->param(depth => $lvl++);
2062    }
2063}
2064
2065#
2066# XML::LibXML::InputCallback matcher using our SGML search path jail.
2067sub xml_jail_match
2068{
2069    my $arg = shift;
2070
2071    # Ensure we have a file:// URI if we get a file.
2072    my $uri = URI->new($arg);
2073    if (!$uri->scheme()) {
2074        $uri = URI::file->new_abs($arg);
2075    }
2076    $uri = $uri->canonical();
2077
2078    # Do not trap non-file URIs.
2079    return 0 unless ($uri->scheme() eq "file");
2080
2081    # Do not trap file URIs within our jail.
2082    for my $dir ($CFG->{Paths}->{SGML}->{Library},
2083        split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || ''))
2084    {
2085        next unless $dir;
2086        my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string();
2087        $dir_uri =~ s|/*$|/|;    # ensure it ends with a slash
2088        return 0 if ($uri =~ /^\Q$dir_uri\E/);
2089    }
2090
2091    # We have a match (a file outside the jail).
2092    return 1;
2093}
2094
2095#
2096# Escape text to be included in markup comment.
2097sub escape_comment
2098{
2099    local $_ = shift;
2100    return '' unless defined;
2101    s/--/- /g;
2102    return $_;
2103}
2104
2105#
2106# Return $_[0] encoded for HTML entities (cribbed from merlyn).
2107#
2108# Note that this is used both for HTML and XML escaping (so e.g. no &apos;).
2109#
2110sub ent
2111{
2112    my $str = shift;
2113    return '' unless defined($str);    # Eliminate warnings
2114
2115    # should switch to hex sooner or later
2116    $str =~ s/&/&#38;/g;
2117    $str =~ s/</&#60;/g;
2118    $str =~ s/>/&#62;/g;
2119    $str =~ s/"/&#34;/g;
2120    $str =~ s/'/&#39;/g;
2121
2122    return $str;
2123}
2124
2125#
2126# Truncate source lines for report.
2127# Expects 1-based column indexes.
2128sub truncate_line
2129{
2130    my $line   = shift;
2131    my $col    = shift;
2132    my $maxlen = 80;      # max line length to truncate to
2133
2134    my $diff = length($line) - $maxlen;
2135
2136    # Don't truncate at all if it fits.
2137    return ($line, $col) if ($diff <= 0);
2138
2139    my $start = $col - int($maxlen / 2);
2140    if ($start < 0) {
2141
2142        # Truncate only from end of line.
2143        $start = 0;
2144        $line = substr($line, $start, $maxlen - 1) . '…';
2145    }
2146    elsif ($start > $diff) {
2147
2148        # Truncate only from beginning of line.
2149        $start = $diff;
2150        $line = '…' . substr($line, $start + 1);
2151    }
2152    else {
2153
2154        # Truncate from both beginning and end of line.
2155        $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…';
2156    }
2157
2158    # Shift column if we truncated from beginning of line.
2159    $col -= $start;
2160
2161    return ($line, $col);
2162}
2163
2164#
2165# Suppress any existing DOCTYPE by commenting it out.
2166sub override_doctype
2167{
2168    my $File = shift;
2169
2170    my ($dt) =
2171        grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} }
2172        values %{$CFG->{Types}};
2173
2174    # @@TODO: abort/whine about unrecognized doctype if $dt is undef.;
2175    my $pubid = $dt->{PubID};
2176    my $sysid = $dt->{SysID};
2177    my $name  = $dt->{Name};
2178
2179    # The HTML5 PubID is a fake, reset it out of the way.
2180    $pubid = undef if ($pubid eq 'HTML5');
2181
2182    # We don't have public/system ids for all types.
2183    my $dtd = "<!DOCTYPE $name";
2184    if ($pubid) {
2185        $dtd .= qq( PUBLIC "$pubid");
2186        $dtd .= qq( "$sysid") if $sysid;
2187    }
2188    elsif ($sysid) {
2189        $dtd .= qq( SYSTEM "$sysid");
2190    }
2191    $dtd .= '>';
2192
2193    my $org_dtd      = '';
2194    my $HTML         = '';
2195    my $seen_doctype = FALSE;
2196
2197    my $declaration = sub {
2198        my ($tag, $text) = @_;
2199        if ($seen_doctype || uc($tag) ne '!DOCTYPE') {
2200            $HTML .= $text;
2201            return;
2202        }
2203
2204        $seen_doctype = TRUE;
2205
2206        $org_dtd = &ent($text);
2207        ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~
2208            /<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si;
2209
2210        $File->{DOCTYPE} = 'HTML5'
2211            if (
2212            lc($File->{Root} || '') eq 'html' &&
2213            (!defined($File->{DOCTYPE}) ||
2214                $File->{DOCTYPE} eq 'about:legacy-compat')
2215            );
2216
2217        # No Override if Fallback was requested, or if override is the same as
2218        # detected
2219        my $known = $CFG->{Types}->{$File->{DOCTYPE}};
2220        if ($File->{Opt}->{FB}->{DOCTYPE} or
2221            ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
2222        {
2223            $HTML .= $text;    # Stash it as is...
2224        }
2225        else {
2226            $HTML .= "$dtd<!-- " . &escape_comment($text) . " -->";
2227        }
2228    };
2229
2230    my $start_element = sub {
2231        my $p = shift;
2232        # Sneak chosen doctype before the root elt if none replaced thus far.
2233        $HTML .= $dtd unless $seen_doctype;
2234        $HTML .= shift;
2235        # We're done with this handler.
2236        $p->handler(start => undef);
2237    };
2238
2239    HTML::Parser->new(
2240        default_h => [sub { $HTML .= shift }, 'text'],
2241        declaration_h => [$declaration,   'tag,text'],
2242        start_h       => [$start_element, 'self,text']
2243    )->parse(join "\n", @{$File->{Content}})->eof();
2244
2245    $File->{Content} = [split /\n/, $HTML];
2246
2247    if ($seen_doctype) {
2248        my $known = $CFG->{Types}->{$File->{DOCTYPE}};
2249        unless ($File->{Opt}->{FB}->{DOCTYPE} or
2250            ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
2251        {
2252            &add_warning(
2253                'W13',
2254                {   W13_org => $org_dtd,
2255                    W13_new => $File->{Opt}->{DOCTYPE},
2256                }
2257            );
2258            $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
2259        }
2260    }
2261    else {
2262        if ($File->{"DOCTYPEless OK"}) {
2263            &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}});
2264        }
2265        elsif ($File->{Opt}->{FB}->{DOCTYPE}) {
2266            &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}});
2267            $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
2268        }
2269        else {
2270            &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}});
2271            $File->{Tentative} |= T_ERROR;    # Tag it as Invalid.
2272        }
2273    }
2274
2275    return $File;
2276}
2277
2278#
2279# Override inline charset declarations, for use e.g. when passing
2280# transcoded results to external parsers that use them.
2281sub override_charset ($$)
2282{
2283    my ($File, $charset) = @_;
2284
2285    my $ws = qr/[\x20\x09\x0D\x0A]/o;
2286    my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o;
2287
2288    my $content = join("\n", @{$File->{Content}});
2289
2290    # Flatten newlines (so that we don't end up changing line numbers while
2291    # overriding) and comment-escape a string.
2292    sub escape_original ($)
2293    {
2294        my $str = shift;
2295        $str =~ tr/\r\n/ /;
2296        return &escape_comment($str);
2297    }
2298
2299    # <?xml encoding="charset"?>
2300    $content =~ s/(
2301              (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
2302              (${cs})
2303              (\3.*?\?>)
2304          )/lc($4) eq lc($charset) ?
2305              "$1" : "$2$charset$5<!-- " . &escape_original($1) . " -->"/esx;
2306
2307    # <meta charset="charset">
2308    $content =~ s/(
2309              (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
2310              (${cs})
2311              (.*?>)
2312          )/lc($3) eq lc($charset) ?
2313              "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
2314
2315    # <meta http-equiv="content-type" content="some/type; charset=charset">
2316    $content =~ s/(
2317              (<meta\b[^>]*${ws}
2318                  http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
2319                  content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
2320              (${cs})
2321              (.*?>)
2322          )/lc($3) eq lc($charset) ?
2323              "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
2324
2325    # <meta content="some/type; charset=charset" http-equiv="content-type">
2326    $content =~ s/(
2327              (<meta\b[^>]*${ws}
2328                  content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
2329              (${cs})
2330              ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
2331          )/lc($3) eq lc($charset) ?
2332              "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
2333
2334    $File->{Content} = [split /\n/, $content];
2335}
2336
2337sub set_error_uri ($$)
2338{
2339    my ($err, $uri) = @_;
2340
2341    # We want errors in the doc that was validated to appear without
2342    # $err->{uri}, and non-doc errors with it pointing to the external entity
2343    # or the like where the error is.  This usually works as long as we're
2344    # passing docs to parsers as strings, but S::P::O (at least as of 0.994)
2345    # seems to give us "3" as the FileName in those cases so we try to filter
2346    # out everything that doesn't look like a useful URI.
2347    if ($uri && index($uri, '/') != -1) {
2348
2349        # Mask local file paths
2350        my $euri = URI->new($uri);
2351        if (!$euri->scheme() || $euri->scheme() eq 'file') {
2352            $err->{uri_is_file} = TRUE;
2353            $err->{uri}         = ($euri->path_segments())[-1];
2354        }
2355        else {
2356            $err->{uri} = $euri->canonical();
2357        }
2358    }
2359}
2360
2361#
2362# Generate a HTML report of detected errors.
2363sub report_errors ($)
2364{
2365    my $File   = shift;
2366    my $Errors = [];
2367    my %Errors_bytype;
2368    my $number_of_errors   = 0;
2369    my $number_of_warnings = 0;
2370    my $number_of_info     = 0;
2371
2372    # for the sake of readability, at least until the xmlwf errors have
2373    # explanations, we push the errors from the XML parser at the END of the
2374    # error list.
2375    push @{$File->{Errors}}, @{$File->{WF_Errors}};
2376
2377    if (scalar @{$File->{Errors}}) {
2378        foreach my $err (@{$File->{Errors}}) {
2379            my $col = 0;
2380
2381            # Populate source/context for errors in our doc that don't have it
2382            # already.  Checkers should always have populated $err->{src} with
2383            # _something_ for non-doc errors.
2384            if (!defined($err->{src})) {
2385                my $line = undef;
2386
2387                # Avoid truncating lines that do not exist.
2388                if (defined($err->{line}) &&
2389                    $File->{Content}->[$err->{line} - 1])
2390                {
2391                    if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) {
2392                        ($line, $col) =
2393                            &truncate_line(
2394                            $File->{Content}->[$err->{line} - 1],
2395                            $err->{char});
2396                        $line = &mark_error($line, $col);
2397                    }
2398                    elsif (defined($err->{line})) {
2399                        $col = length($File->{Content}->[$err->{line} - 1]);
2400                        $col = 80 if ($col > 80);
2401                        ($line, $col) =
2402                            &truncate_line(
2403                            $File->{Content}->[$err->{line} - 1], $col);
2404                        $line = &ent($line);
2405                        $col  = 0;
2406                    }
2407                }
2408                else {
2409                    $col = 0;
2410                }
2411                $err->{src} = $line;
2412            }
2413
2414            my $explanation = "";
2415            if ($err->{expl}) {
2416
2417            }
2418            else {
2419                if ($err->{num}) {
2420                    my $num = $err->{num};
2421                    $explanation .= Encode::decode_utf8(
2422                        "\n    $RSRC{msg}->{$num}->{verbose}\n")
2423                        if exists $RSRC{msg}->{$num} &&
2424                            exists $RSRC{msg}->{$num}->{verbose};
2425                    my $_msg = $RSRC{msg}->{nomsg}->{verbose};
2426                    $_msg =~ s/<!--MID-->/$num/g;
2427                    if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) {
2428                        $_msg =~ s/<!--URI-->//g;
2429                    }
2430                    else {
2431                        my $escaped_uri = uri_escape($File->{URI});
2432                        $_msg =~ s/<!--URI-->/$escaped_uri/g;
2433                    }
2434
2435                    # The send feedback plea.
2436                    $explanation = "    $_msg\n$explanation";
2437                    $explanation =~ s/<!--#echo\s+var="relroot"\s*-->//g;
2438                }
2439                $err->{expl} = $explanation;
2440            }
2441
2442            $err->{col} = ' ' x $col;
2443            if ($err->{type} eq 'I') {
2444                $err->{class}         = 'msg_info';
2445                $err->{err_type_err}  = 0;
2446                $err->{err_type_warn} = 0;
2447                $err->{err_type_info} = 1;
2448                $number_of_info += 1;
2449            }
2450            elsif ($err->{type} eq 'E') {
2451                $err->{class}         = 'msg_err';
2452                $err->{err_type_err}  = 1;
2453                $err->{err_type_warn} = 0;
2454                $err->{err_type_info} = 0;
2455                $number_of_errors += 1;
2456            }
2457            elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) {
2458                $err->{class}         = 'msg_warn';
2459                $err->{err_type_err}  = 0;
2460                $err->{err_type_warn} = 1;
2461                $err->{err_type_info} = 0;
2462                $number_of_warnings += 1;
2463            }
2464
2465            # TODO other classes for "X" etc? FIXME find all types of message.
2466
2467            push @{$Errors}, $err;
2468
2469            if (($File->{Opt}->{'Group Errors'}) and
2470                (($err->{type} eq 'E') or
2471                    ($err->{type} eq 'W') or
2472                    ($err->{type} eq 'X'))
2473                )
2474            {
2475
2476                # index by num for errors and warnings only - info usually
2477                # gives context of error or warning
2478                if (!exists $Errors_bytype{$err->{num}}) {
2479                    $Errors_bytype{$err->{num}}->{instances} = [];
2480                    my $msg_text;
2481                    if ($err->{num} eq 'xmlwf') {
2482
2483                        # FIXME need a catalog of errors from XML::LibXML
2484                        $msg_text = "XML Parsing Error";
2485                    }
2486                    elsif ($err->{num} eq 'html5') {
2487                        $msg_text = "HTML5 Validator Error";
2488                    }
2489                    else {
2490                        $msg_text = $RSRC{msg}->{$err->{num}}->{original};
2491                        $msg_text =~ s/%1/X/;
2492                        $msg_text =~ s/%2/Y/;
2493                    }
2494                    $Errors_bytype{$err->{num}}->{expl}        = $err->{expl};
2495                    $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text;
2496                    $Errors_bytype{$err->{num}}->{msg}         = $err->{msg};
2497                    $Errors_bytype{$err->{num}}->{type}        = $err->{type};
2498                    $Errors_bytype{$err->{num}}->{class}       = $err->{class};
2499                    $Errors_bytype{$err->{num}}->{err_type_err} =
2500                        $err->{err_type_err};
2501                    $Errors_bytype{$err->{num}}->{err_type_warn} =
2502                        $err->{err_type_warn};
2503                    $Errors_bytype{$err->{num}}->{err_type_info} =
2504                        $err->{err_type_info};
2505                }
2506                push @{$Errors_bytype{$err->{num}}->{instances}}, $err;
2507            }
2508        }
2509    }
2510
2511    @$Errors = values(%Errors_bytype) if $File->{Opt}->{'Group Errors'};
2512
2513    # we are not sorting errors by line, as it would break the position
2514    # of auxiliary messages such as "start tag was here". We'll have to live
2515    # with the fact that XML well-formedness errors are listed first, then
2516    # validation errors
2517    #else {
2518    #   sort error by lines
2519    #  @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors};
2520    #}
2521    return $number_of_errors, $number_of_warnings, $number_of_info, $Errors;
2522}
2523
2524#
2525# Chop the source line into 3 pieces; the character at which the error
2526# was detected, and everything to the left and right of that position.
2527# That way we can add markup to the relevant char without breaking &ent().
2528# Expects 1-based column indexes.
2529sub mark_error ($$)
2530{
2531    my $line    = shift;
2532    my $col     = shift;
2533    my $linelen = length($line);
2534
2535    # Coerce column into an index valid within the line.
2536    if ($col < 1) {
2537        $col = 1;
2538    }
2539    elsif ($col > $linelen) {
2540        $col = $linelen;
2541    }
2542    $col--;
2543
2544    my $left = substr($line, 0,    $col);
2545    my $char = substr($line, $col, 1);
2546    my $right = substr($line, $col + 1);
2547
2548    $char = &ent($char);
2549    $char =
2550        qq(<strong title="Position where error was detected.">$char</strong>);
2551    $line = &ent($left) . $char . &ent($right);
2552
2553    return $line;
2554}
2555
2556#
2557# Create a HTML representation of the document.
2558sub source
2559{
2560    my $File = shift;
2561
2562    # Remove any BOM since we're not at BOT anymore...
2563    $File->{Content}->[0] = substr($File->{Content}->[0], 1)
2564        if ($File->{BOM} && scalar(@{$File->{Content}}));
2565
2566    my @source = map({file_source_line => $_}, @{$File->{Content}});
2567    return \@source;
2568}
2569
2570sub match_DTD_FPI_SI
2571{
2572    my ($File, $FPI, $SI) = @_;
2573    if ($CFG->{Types}->{$FPI}) {
2574        if ($CFG->{Types}->{$FPI}->{SysID}) {
2575            if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) {
2576                &add_warning(
2577                    'W26',
2578                    {   W26_dtd_pub => $FPI,
2579                        W26_dtd_pub_display =>
2580                            $CFG->{Types}->{$FPI}->{Display},
2581                        W26_dtd_sys           => $SI,
2582                        W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID}
2583                    }
2584                );
2585            }
2586        }
2587    }
2588    else {    # FPI not known, checking if the SI is
2589        while (my ($proper_FPI, $value) = each %{$CFG->{Types}}) {
2590            if ($value->{SysID} && $value->{SysID} eq $SI) {
2591                &add_warning(
2592                    'W26',
2593                    {   W26_dtd_pub           => $FPI,
2594                        W26_dtd_pub_display   => $value->{Display},
2595                        W26_dtd_sys           => $SI,
2596                        W26_dtd_pub_recommend => $proper_FPI
2597                    }
2598                );
2599            }
2600        }
2601    }
2602}
2603
2604#
2605# Do an initial parse of the Document Entity to extract FPI.
2606sub preparse_doctype
2607{
2608    my $File = shift;
2609
2610    #
2611    # Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
2612    $File->{DOCTYPE} = '';
2613    $File->{Root}    = '';
2614
2615    my $dtd = sub {
2616        return if $File->{Root};
2617
2618        # TODO: The \s and \w are probably wrong now that the strings are
2619        # utf8_on
2620        my $declaration = shift;
2621        my $doctype_type;
2622        my $doctype_secondpart;
2623        if ($declaration =~
2624            /<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si
2625            )
2626        {
2627            $File->{Root}    = "html";
2628            $File->{DOCTYPE} = "HTML5";
2629        }
2630        elsif ($declaration =~
2631            m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si
2632            )
2633        {
2634            (   $File->{Root},    $doctype_type,
2635                $File->{DOCTYPE}, $doctype_secondpart
2636            ) = ($1, $2, $3, $4);
2637            if (($doctype_type eq "PUBLIC") and
2638                (($doctype_secondpart) =
2639                    $doctype_secondpart =~
2640                    m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si)
2641                )
2642            {
2643                &match_DTD_FPI_SI($File, $File->{DOCTYPE},
2644                    $doctype_secondpart);
2645            }
2646        }
2647    };
2648
2649    my $start = sub {
2650        my ($p, $tag, $attr) = @_;
2651
2652        if ($File->{Root}) {
2653            return unless $tag eq $File->{Root};
2654        }
2655        else {
2656            $File->{Root} = $tag;
2657        }
2658        if ($attr->{xmlns}) {
2659            $File->{Namespace} = $attr->{xmlns};
2660        }
2661        if ($attr->{version}) {
2662            $File->{'Root Version'} = $attr->{version};
2663        }
2664        if ($attr->{baseProfile}) {
2665            $File->{'Root BaseProfile'} = $attr->{baseProfile};
2666        }
2667
2668        # We're done parsing.
2669        $p->eof();
2670    };
2671
2672    # we use HTML::Parser as pre-parser. May use html5lib or other in the future
2673    my $p = HTML::Parser->new(api_version => 3);
2674
2675    # if content-type has shown we should pre-parse with XML mode, use that
2676    # otherwise (mostly text/html cases) use default mode
2677    $p->xml_mode(&is_xml($File));
2678    $p->handler(declaration => $dtd,   'text');
2679    $p->handler(start       => $start, 'self,tag,attr');
2680
2681    my $line = 0;
2682    my $max  = scalar(@{$File->{Content}});
2683    $p->parse(
2684        sub {
2685            return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef;
2686        }
2687    );
2688    $p->eof();
2689
2690    # TODO: These \s here are probably wrong now that the strings are utf8_on
2691    $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE};
2692    $File->{DOCTYPE} =~ s(^\s+){ }g;
2693    $File->{DOCTYPE} =~ s(\s+$){ }g;
2694    $File->{DOCTYPE} =~ s(\s+) { }g;
2695
2696    # Some document types actually need no doctype to be identified,
2697    # root element and some version attribute is enough
2698    # TODO applicable doctypes should be migrated to a config file?
2699
2700    # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) {
2701    #   if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'}))
2702    #   {
2703    #     if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; }
2704    #     if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; }
2705    #     if ($File->{'Root Version'} eq "1.0"){
2706    #       $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN";
2707    #       $File->{"DOCTYPEless OK"} = TRUE;
2708    #       $File->{Opt}->{DOCTYPE} = "SVG 1.0";
2709    #     }
2710    #     if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) {
2711    #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN";
2712    #         $File->{"DOCTYPEless OK"} = TRUE;
2713    #         $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny";
2714    #     }
2715    #     elsif ((($File->{'Root Version'} eq "1.1")  or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) {
2716    #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN";
2717    #         $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic";
2718    #         $File->{"DOCTYPEless OK"} = TRUE;
2719    #     }
2720    #     elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) {
2721    #         $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
2722    #         $File->{Opt}->{DOCTYPE} = "SVG 1.1";
2723    #         $File->{"DOCTYPEless OK"} = TRUE;
2724    #     }
2725    #     if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; }
2726    #     if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; }
2727    #   }
2728    #   else {
2729    #     # by default for an svg root elt, we use SVG 1.1
2730    #     $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
2731    #     $File->{Opt}->{DOCTYPE} = "SVG 1.1";
2732    #     $File->{"DOCTYPEless OK"} = TRUE;
2733    #   }
2734    # }
2735    if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) {
2736
2737        # doctypeless document type found, we fake the override
2738        # so that the parser will have something to validate against
2739        $File = &override_doctype($File);
2740    }
2741    return $File;
2742}
2743
2744#
2745# Preprocess CGI parameters.
2746sub prepCGI
2747{
2748    my $File = shift;
2749    my $q    = shift;
2750
2751    # The URL to this CGI script.
2752    $File->{Env}->{'Self URI'} = $q->url();
2753
2754    # Decode parameter values, set booleans the way we expect them.
2755    foreach my $param ($q->param()) {
2756
2757        # 'uploaded_file' and 'fragment' contain data we treat as is.
2758        next if ($param eq 'uploaded_file' || $param eq 'fragment');
2759
2760        # Decode all other defined values as UTF-8.
2761        my @values = map { Encode::decode_utf8($_) } $q->param($param);
2762        $q->param($param, @values);
2763
2764        # Skip parameters that should not be treated as booleans.
2765        next if $param =~ /^(?:accept(?:-(?:language|charset))?|ur[il])$/;
2766
2767        # Keep false-but-set params.
2768        next if $q->param($param) eq '0';
2769
2770        # Parameters that are given to us without specifying a value get set
2771        # to a true value.
2772        $q->param($param, TRUE) unless $q->param($param);
2773    }
2774
2775    $File->{Env}->{'Home Page'} =
2776        URI->new_abs(".", $File->{Env}->{'Self URI'});
2777
2778    # Use "url" unless a "uri" was also given.
2779    if ($q->param('url') and not $q->param('uri')) {
2780        $q->param('uri', $q->param('url'));
2781    }
2782
2783    # Set output mode; needed in get_error_template if we end up there.
2784    $File->{Opt}->{Output} = $q->param('output') || 'html';
2785
2786    # Issue a redirect for uri=referer.
2787    if ($q->param('uri') and $q->param('uri') eq 'referer') {
2788        if ($q->referer) {
2789            $q->param('uri', $q->referer);
2790            $q->param('accept', $q->http('Accept')) if ($q->http('Accept'));
2791            $q->param('accept-language', $q->http('Accept-Language'))
2792                if ($q->http('Accept-Language'));
2793            $q->param('accept-charset', $q->http('Accept-Charset'))
2794                if ($q->http('Accept-Charset'));
2795            print redirect(-uri => &self_url_q($q, $File), -vary => 'Referer');
2796            exit;
2797        }
2798        else {
2799
2800            # No Referer header was found.
2801            $File->{'Error Flagged'} = TRUE;
2802            &get_error_template($File)->param(fatal_referer_error => TRUE);
2803        }
2804    }
2805
2806    # Supersede URL with an uploaded file.
2807    if ($q->param('uploaded_file')) {
2808        $q->param('uri', 'upload://' . $q->param('uploaded_file'));
2809        $File->{'Is Upload'} = TRUE;    # Tag it for later use.
2810    }
2811
2812    # Supersede URL with an uploaded fragment.
2813    if ($q->param('fragment')) {
2814        $q->param('uri', 'upload://Form Submission');
2815        $File->{'Direct Input'} = TRUE;    # Tag it for later use.
2816    }
2817
2818    # Redirect to a GETable URL if method is POST without a file upload.
2819    if (defined $q->request_method and
2820        $q->request_method eq 'POST' and
2821        not($File->{'Is Upload'} or $File->{'Direct Input'}))
2822    {
2823        my $thispage = &self_url_q($q, $File);
2824        print redirect $thispage;
2825        exit;
2826    }
2827
2828    #
2829    # Flag an error if we didn't get a file to validate.
2830    unless ($q->param('uri')) {
2831        $File->{'Error Flagged'} = TRUE;
2832        my $tmpl = &get_error_template($File);
2833        $tmpl->param(fatal_uri_error  => TRUE);
2834        $tmpl->param(fatal_uri_scheme => 'undefined');
2835    }
2836
2837    return $q;
2838}
2839
2840#
2841# Set parse mode (SGML or XML) based on a number of preparsed factors:
2842# * HTTP Content-Type
2843# * Doctype Declaration
2844# * XML Declaration
2845# * XML namespaces
2846sub set_parse_mode
2847{
2848    my $File = shift;
2849    my $CFG  = shift;
2850    my $fpi  = $File->{DOCTYPE};
2851    $File->{ModeChoice} = '';
2852    my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD';
2853
2854    my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
2855
2856    # $File->{Mode} may have been set in parse_content_type
2857    # and it would come from the Media Type
2858    my $parseModeFromMimeType = $File->{Mode};
2859    my $begincontent          = join "\x20",
2860        @{$File->{Content}};    # for the sake of xml decl detection,
2861                                # the 10 first lines should be safe
2862    my $parseModeFromXMLDecl = (
2863        $begincontent =~
2864            /^ ${xmlws}*                # whitespace before the decl should not be happening
2865                                        # but we are greedy for the sake of detection, not validation
2866      <\?xml ${xmlws}+                  # start matching an XML Declaration
2867      version ${xmlws}* =               # for documents, version info is mandatory
2868      ${xmlws}* (["'])1.[01]\1          # hardcoding the existing XML versions.
2869                                        # Maybe we should use \d\.\d
2870      (?:${xmlws}+ encoding
2871       ${xmlws}* = ${xmlws}*
2872       (["'])[A-Za-z][a-zA-Z0-9_-]+\2
2873      )?                                # encoding info is optional
2874      (?:${xmlws}+ standalone
2875       ${xmlws}* = ${xmlws}*
2876       (["'])(?:yes|no)\3
2877      )?                                # ditto standalone info, optional
2878      ${xmlws}* \?>                     # end of XML Declaration
2879    /ox
2880        ?
2881            'XML' :
2882            'TBD'
2883    );
2884
2885    my $parseModeFromNamespace = 'TBD';
2886    # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9967
2887    $parseModeFromNamespace = 'XML'
2888        if ($File->{Namespace} && $parseModeFromDoctype ne 'HTML5');
2889
2890    if (($parseModeFromMimeType eq 'TBD') and
2891        ($parseModeFromXMLDecl   eq 'TBD') and
2892        ($parseModeFromNamespace eq 'TBD') and
2893        (!exists $CFG->{Types}->{$fpi}))
2894    {
2895
2896        # if the mime type is text/html (ambiguous, hence TBD mode)
2897        # and the doctype isn't in the catalogue
2898        # and XML prolog detection was unsuccessful
2899        # and we found no namespace at the root
2900        # ... throw in a warning
2901        &add_warning(
2902            'W06',
2903            {   W06_mime    => $File->{ContentType},
2904                w06_doctype => $File->{DOCTYPE}
2905            }
2906        );
2907        return;
2908    }
2909
2910    $parseModeFromDoctype = 'TBD'
2911        unless $parseModeFromDoctype eq 'SGML' or
2912            $parseModeFromDoctype eq 'HTML5' or
2913            $parseModeFromDoctype eq 'XML'   or
2914            $parseModeFromNamespace eq 'XML';
2915
2916    if (($parseModeFromDoctype eq 'TBD') and
2917        ($parseModeFromXMLDecl  eq 'TBD') and
2918        ($parseModeFromMimeType eq 'TBD') and
2919        ($parseModeFromNamespace eq 'TBD'))
2920    {
2921
2922        # if all factors are useless to give us a parse mode
2923        # => we use SGML-based DTD validation as a default
2924        $File->{Mode}       = 'DTD+SGML';
2925        $File->{ModeChoice} = 'Fallback';
2926
2927        # and send warning about the fallback
2928        &add_warning(
2929            'W06',
2930            {   W06_mime    => $File->{ContentType},
2931                w06_doctype => $File->{DOCTYPE}
2932            }
2933        );
2934        return;
2935    }
2936
2937    if ($parseModeFromMimeType ne 'TBD') {
2938
2939        # if The mime type gives clear indication of whether the document is
2940        # XML or not
2941        if (($parseModeFromDoctype ne 'TBD') and
2942            ($parseModeFromDoctype ne 'HTML5') and
2943            ($parseModeFromMimeType ne $parseModeFromDoctype))
2944        {
2945
2946            # if document-type recommended mode and content-type recommended
2947            # mode clash, shoot a warning
2948            # unknown doctypes will not trigger this
2949            # neither will html5 documents, which can be XML or not
2950            &add_warning(
2951                'W07',
2952                {   W07_mime => $File->{ContentType},
2953                    W07_ct   => $parseModeFromMimeType,
2954                    W07_dtd  => $parseModeFromDoctype,
2955                }
2956            );
2957        }
2958
2959        # mime type has precedence, we stick to it
2960        $File->{ModeChoice} = 'Mime';
2961        if ($parseModeFromDoctype eq "HTML5") {
2962            $File->{Mode} = 'HTML5+' . $File->{Mode};
2963        }
2964        else {
2965            $File->{Mode} = 'DTD+' . $File->{Mode};
2966        }
2967        return;
2968    }
2969
2970    if ($parseModeFromDoctype ne 'TBD') {
2971
2972        # the mime type is ambiguous (hence we didn't stop at the previous test)
2973        # but by now we're sure that the document type is a good indication
2974        # so we use that.
2975        if ($parseModeFromDoctype eq "HTML5") {
2976            if ($parseModeFromXMLDecl eq "XML" or
2977                $parseModeFromNamespace eq "XML")
2978            {
2979                $File->{Mode} = "HTML5+XML";
2980            }
2981            else {
2982                $File->{Mode} = "HTML5";
2983            }
2984        }
2985        else {    # not HTML5
2986            $File->{Mode} = "DTD+" . $parseModeFromDoctype;
2987        }
2988        $File->{ModeChoice} = 'Doctype';
2989        return;
2990    }
2991
2992    if ($parseModeFromXMLDecl ne 'TBD') {
2993
2994        # the mime type is ambiguous (hence we didn't stop at the previous test)
2995        # and so was the doctype
2996        # but we found an XML declaration so we use that.
2997        if ($File->{Mode} eq "") {
2998            $File->{Mode} = "DTD+" . $parseModeFromXMLDecl;
2999        }
3000        elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
3001            substr($File->{Mode}, $ix + 1) = $parseModeFromXMLDecl;
3002        }
3003        else {
3004            $File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl;
3005        }
3006        $File->{ModeChoice} = 'XMLDecl';
3007        return;
3008    }
3009
3010    # this is the last case. We know that all  modes are not TBD,
3011    # yet mime type, doctype AND XML DECL tests have failed => we are saved
3012    # by the presence of namespaces
3013    if ($File->{Mode} eq "") {
3014        $File->{Mode} = "DTD+" . $parseModeFromNamespace;
3015    }
3016    elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
3017        substr($File->{Mode}, $ix + 1) = $parseModeFromNamespace;
3018    }
3019    else {
3020        $File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace;
3021    }
3022    $File->{ModeChoice} = 'Namespace';
3023}
3024
3025#
3026# Utility sub to tell if mode "is" XML.
3027sub is_xml
3028{
3029    index(shift->{Mode}, 'XML') != -1;
3030}
3031
3032#
3033# Check charset conflicts and add any warnings necessary.
3034sub charset_conflicts
3035{
3036    my $File = shift;
3037
3038    #
3039    # Handle the case where there was no charset to be found.
3040    unless ($File->{Charset}->{Use}) {
3041        &add_warning('W17', {});
3042        $File->{Tentative} |= T_WARN;
3043    }
3044
3045    #
3046    # Add a warning if there was charset info conflict (HTTP header,
3047    # XML declaration, or <meta> element).
3048    # filtering out some of the warnings in direct input mode where HTTP
3049    # encoding is a "fake"
3050    if ((   charset_not_equal(
3051                $File->{Charset}->{HTTP},
3052                $File->{Charset}->{XML}
3053            )
3054        ) and
3055        not($File->{'Direct Input'})
3056        )
3057    {
3058        &add_warning(
3059            'W18',
3060            {   W18_http => $File->{Charset}->{HTTP},
3061                W18_xml  => $File->{Charset}->{XML},
3062                W18_use  => $File->{Charset}->{Use},
3063            }
3064        );
3065    }
3066    elsif (
3067        charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META})
3068        and
3069        not($File->{'Direct Input'}))
3070    {
3071        &add_warning(
3072            'W19',
3073            {   W19_http => $File->{Charset}->{HTTP},
3074                W19_meta => $File->{Charset}->{META},
3075                W19_use  => $File->{Charset}->{Use},
3076            }
3077        );
3078    }
3079    elsif (
3080        charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META}))
3081    {
3082        &add_warning(
3083            'W20',
3084            {   W20_http => $File->{Charset}->{XML},
3085                W20_xml  => $File->{Charset}->{META},
3086            }
3087        );
3088        $File->{Tentative} |= T_WARN;
3089    }
3090
3091    return $File;
3092}
3093
3094#
3095# Transcode to UTF-8
3096sub transcode
3097{
3098    my $File = shift;
3099
3100    my $general_charset = $File->{Charset}->{Use};
3101    my $exact_charset   = $general_charset;
3102
3103    # TODO: This should be done before transcode()
3104    if ($general_charset eq 'utf-16') {
3105        if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
3106            $exact_charset = $File->{Charset}->{Auto};
3107        }
3108        else { $exact_charset = 'utf-16be'; }
3109    }
3110
3111    my $cs = $exact_charset;
3112
3113    if ($CFG->{Charsets}->{$cs}) {
3114        if (index($CFG->{Charsets}->{$cs}, 'ERR ') != -1) {
3115
3116            # The encoding is not supported due to policy
3117
3118            $File->{'Error Flagged'} = TRUE;
3119            my $tmpl = &get_error_template($File);
3120            $tmpl->param(fatal_transcode_error   => TRUE);
3121            $tmpl->param(fatal_transcode_charset => $cs);
3122
3123            # @@FIXME might need better text
3124            $tmpl->param(fatal_transcode_errmsg =>
3125                    'This encoding is not supported by the validator.');
3126            return $File;
3127        }
3128        elsif (index($CFG->{Charsets}->{$cs}, 'X ') != -1) {
3129
3130            # possibly problematic, we recommend another alias
3131            my $recommended_charset = $CFG->{Charsets}->{$cs};
3132            $recommended_charset =~ s/X //;
3133            &add_warning(
3134                'W22',
3135                {   W22_declared  => $cs,
3136                    W22_suggested => $recommended_charset,
3137                }
3138            );
3139        }
3140    }
3141
3142    # Does the system support decoding this encoding?
3143    my $enc = Encode::find_encoding($cs);
3144
3145    if (!$enc) {
3146
3147        # This system's Encode installation does not support
3148        # the character encoding; might need additional modules
3149
3150        $File->{'Error Flagged'} = TRUE;
3151        my $tmpl = &get_error_template($File);
3152        $tmpl->param(fatal_transcode_error   => TRUE);
3153        $tmpl->param(fatal_transcode_charset => $cs);
3154
3155        # @@FIXME might need better text
3156        $tmpl->param(fatal_transcode_errmsg => 'Encoding not supported.');
3157        return $File;
3158    }
3159    elsif (!$CFG->{Charsets}->{$cs}) {
3160
3161        # not in the list, but technically OK -> we warn
3162        &add_warning('W24', {W24_declared => $cs,});
3163
3164    }
3165
3166    my $output;
3167    my $input = $File->{Bytes};
3168
3169    # Try to transcode
3170    eval { $output = $enc->decode($input, Encode::FB_CROAK); };
3171
3172    if ($@) {
3173
3174        # Transcoding failed - do it again line by line to find out exactly
3175        # where
3176        my $line_num = 0;
3177        while ($input =~ /(.*?)(?:\r\n|\n|\r|\z)/g) {
3178            $line_num++;
3179            eval { $enc->decode($1, Encode::FB_CROAK); };
3180            if ($@) {
3181                my $croak_message = $@;
3182                $croak_message =~ s/ at .*//;
3183                $File->{'Error Flagged'} = TRUE;
3184                my $tmpl = &get_error_template($File);
3185                $tmpl->param(fatal_byte_error     => TRUE);
3186                $tmpl->param(fatal_byte_lines     => $line_num);
3187                $tmpl->param(fatal_byte_charset   => $cs);
3188                $tmpl->param(fatal_byte_error_msg => $croak_message);
3189                last;
3190            }
3191        }
3192        return $File;
3193    }
3194
3195    # @@FIXME is this what we want?
3196    $output =~ s/\015?\012/\n/g;
3197
3198    # make sure we deal only with unix newlines
3199    # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992
3200    $output =~ s/(\r\n|\n|\r)/\n/g;
3201
3202    #debug: we could check if the content has utf8 bit on with
3203    #$output= utf8::is_utf8($output) ? 1 : 0;
3204    $File->{Content} = [split /\n/, $output];
3205
3206    return $File;
3207}
3208
3209sub find_encodings
3210{
3211    my $File  = shift;
3212    my $bom   = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes});
3213    my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes});
3214
3215    if (defined $bom) {
3216
3217        # @@FIXME this BOM entry should not be needed at all!
3218        $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}"));
3219        $File->{Charset}->{Auto} = lc $bom;
3220    }
3221    else {
3222        $File->{Charset}->{Auto} = lc($first[0]) if @first;
3223    }
3224
3225    my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes});
3226    $File->{Charset}->{XML} = lc $xml if defined $xml;
3227
3228    my %metah;
3229    foreach my $try (@first) {
3230
3231        # @@FIXME I think the old code used HTML::Parser xml mode, check if ok
3232        my $meta =
3233            HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try);
3234        $metah{lc($meta)}++ if defined $meta and length $meta;
3235    }
3236
3237    if (!%metah) {
3238
3239        # HTML::Encoding doesn't support HTML5 <meta charset> as of 0.60,
3240        # check it ourselves.  HTML::HeadParser >= 3.60 is required for this.
3241
3242        my $hp           = HTML::HeadParser->new();
3243        my $seen_doctype = FALSE;
3244        my $is_html5     = FALSE;
3245        $hp->handler(
3246            declaration => sub {
3247                my ($tag, $text) = @_;
3248                return if ($seen_doctype || uc($tag) ne '!DOCTYPE');
3249                $seen_doctype = TRUE;
3250                $is_html5     = TRUE
3251                    if (
3252                    $text =~ /<!DOCTYPE\s+html
3253                                    (\s+SYSTEM\s+(['"])about:legacy-compat\2)?
3254                                    \s*>/six
3255                    );
3256            },
3257            'tag,text'
3258        );
3259        $hp->parse($File->{Bytes});
3260        if ($is_html5) {
3261            my $cs = $hp->header('X-Meta-Charset');
3262            $metah{lc($cs)}++ if (defined($cs) && length($cs));
3263        }
3264    }
3265
3266    if (%metah) {
3267        my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah;
3268        $File->{Charset}->{META} = $meta[0];
3269    }
3270
3271    return $File;
3272}
3273
3274#
3275# Abort with a message if an error was flagged at point.
3276sub abort_if_error_flagged
3277{
3278    my $File = shift;
3279
3280    return unless $File->{'Error Flagged'};
3281    return if $File->{'Error Handled'};    # Previous error, keep going.
3282
3283    my $tmpl = &get_error_template($File);
3284    $tmpl->param(fatal_error => TRUE);
3285
3286    &prep_template($File, $tmpl);
3287
3288    # transcode output from perl's internal to utf-8 and output
3289    print Encode::encode('UTF-8', $tmpl->output);
3290    exit;
3291}
3292
3293#
3294# conflicting encodings
3295sub charset_not_equal
3296{
3297    my $encodingA = shift;
3298    my $encodingB = shift;
3299    return $encodingA && $encodingB && ($encodingA ne $encodingB);
3300}
3301
3302#
3303# Construct a self-referential URL from a CGI.pm $q object.
3304sub self_url_q
3305{
3306    my ($q, $File) = @_;
3307    my $thispage = $File->{Env}->{'Self URI'} . '?';
3308
3309    # Pass-through parameters
3310    for my $param (qw(uri accept accept-language accept-charset)) {
3311        $thispage .= "$param=" . uri_escape($q->param($param)) . ';'
3312            if $q->param($param);
3313    }
3314
3315    # Boolean parameters
3316    for my $param (qw(ss outline No200 verbose group)) {
3317        $thispage .= "$param=1;" if $q->param($param);
3318    }
3319
3320    # Others
3321    if ($q->param('doctype') and $q->param('doctype') !~ /(?:Inline|detect)/i)
3322    {
3323        $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';';
3324    }
3325    if ($q->param('charset') and $q->param('charset') !~ /detect/i) {
3326        $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';';
3327    }
3328
3329    $thispage =~ s/[\?;]$//;
3330    return $thispage;
3331}
3332
3333#
3334# Construct a self-referential URL from a $File object.
3335sub self_url_file
3336{
3337    my $File = shift;
3338
3339    my $thispage    = $File->{Env}->{'Self URI'};
3340    my $escaped_uri = uri_escape($File->{URI});
3341    $thispage .= qq(?uri=$escaped_uri);
3342    $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'};
3343    $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'};
3344    $thispage .= ';outline=1' if $File->{Opt}->{Outline};
3345    $thispage .= ';No200=1' if $File->{Opt}->{No200};
3346    $thispage .= ';verbose=1' if $File->{Opt}->{Verbose};
3347    $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'};
3348    $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'})
3349        if $File->{Opt}->{'Accept Header'};
3350    $thispage .=
3351        ';accept-language=' .
3352        uri_escape($File->{Opt}->{'Accept-Language Header'})
3353        if $File->{Opt}->{'Accept-Language Header'};
3354    $thispage .=
3355        ';accept-charset=' .
3356        uri_escape($File->{Opt}->{'Accept-Charset Header'})
3357        if $File->{Opt}->{'Accept-Charset Header'};
3358
3359    return $thispage;
3360}
3361
3362#####
3363
3364package W3C::Validator::EventHandler;
3365
3366#
3367# Define global constants
3368use constant TRUE  => 1;
3369use constant FALSE => 0;
3370
3371#
3372# Tentative Validation Severities.
3373use constant T_WARN  => 4;    # 0000 0100
3374use constant T_ERROR => 8;    # 0000 1000
3375
3376sub new
3377{
3378    my $class  = shift;
3379    my $parser = shift;
3380    my $File   = shift;
3381    my $CFG    = shift;
3382    my $self   = {_file => $File, CFG => $CFG, _parser => $parser};
3383    bless $self, $class;
3384}
3385
3386sub start_element
3387{
3388    my ($self, $element) = @_;
3389
3390    my $has_xmlns   = FALSE;
3391    my $xmlns_value = undef;
3392
3393    # If in XML mode, find namespace used for each element.
3394    if ((my $attr = $element->{Attributes}->{xmlns}) &&
3395        &W3C::Validator::MarkupValidator::is_xml($self->{_file}))
3396    {
3397        $xmlns_value = "";
3398
3399        # Try with SAX method
3400        if ($attr->{Value}) {
3401            $has_xmlns   = TRUE;
3402            $xmlns_value = $attr->{Value};
3403        }
3404
3405        #next if $has_xmlns;
3406
3407        # The following is not SAX, but OpenSP specific.
3408        my $defaulted = $attr->{Defaulted} || '';
3409        if ($defaulted eq "specified") {
3410            $has_xmlns = TRUE;
3411            $xmlns_value .=
3412                join("", map { $_->{Data} } @{$attr->{CdataChunks}});
3413        }
3414    }
3415
3416    my $doctype = $self->{_file}->{DOCTYPE};
3417
3418    if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) ||
3419        $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name})
3420    {
3421
3422        # add to list of non-root namespaces
3423        push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns;
3424    }
3425    elsif (!$has_xmlns &&
3426        $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"})
3427    {
3428
3429        # whine if the root xmlns attribute is noted as required by spec,
3430        # but not present
3431        my $err      = {};
3432        my $location = $self->{_parser}->get_location();
3433        &W3C::Validator::MarkupValidator::set_error_uri($err,
3434            $location->{FileName});
3435
3436        # S::P::O does not provide src context, set to empty for non-doc errors.
3437        $err->{src}  = "" if $err->{uri};
3438        $err->{line} = $location->{LineNumber};
3439        $err->{char} = $location->{ColumnNumber};
3440        $err->{num}  = "no-xmlns";
3441        $err->{type} = "E";
3442        $err->{msg} =
3443            "Missing xmlns attribute for element $element->{Name}. The " .
3444            "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}";
3445
3446        # ...
3447        $self->{_file}->{'Is Valid'} = FALSE;
3448        push @{$self->{_file}->{Errors}}, $err;
3449    }
3450    elsif ($has_xmlns and
3451        (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and
3452        ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace}))
3453    {
3454
3455        # whine if root xmlns element is not the one specificed by the spec
3456        my $err      = {};
3457        my $location = $self->{_parser}->get_location();
3458        &W3C::Validator::MarkupValidator::set_error_uri($err,
3459            $location->{FileName});
3460
3461        # S::P::O does not provide src context, set to empty for non-doc errors.
3462        $err->{line} = $location->{LineNumber};
3463        $err->{char} = $location->{ColumnNumber};
3464        $err->{num}  = "wrong-xmlns";
3465        $err->{type} = "E";
3466        $err->{msg} =
3467            "Wrong xmlns attribute for element $element->{Name}. The " .
3468            "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}";
3469
3470        # ...
3471        $self->{_file}->{'Is Valid'} = FALSE;
3472        push @{$self->{_file}->{Errors}}, $err;
3473    }
3474}
3475
3476sub error
3477{
3478    my $self  = shift;
3479    my $error = shift;
3480    my $mess;
3481    eval { $mess = $self->{_parser}->split_message($error); };
3482    if ($@) {
3483
3484        # this is a message that S:P:O could not handle, we skip its croaking
3485        return;
3486    }
3487    my $File = $self->{_file};
3488
3489    my $err = {};
3490    &W3C::Validator::MarkupValidator::set_error_uri($err,
3491        $self->{_parser}->get_location()->{FileName});
3492
3493    # S::P::O does not provide src context, set to empty for non-doc errors.
3494    $err->{src}  = "" if $err->{uri};
3495    $err->{line} = $mess->{primary_message}{LineNumber};
3496    $err->{char} = $mess->{primary_message}{ColumnNumber} + 1;
3497    $err->{num}  = $mess->{primary_message}{Number};
3498    $err->{type} = $mess->{primary_message}{Severity};
3499    $err->{msg}  = $mess->{primary_message}{Text};
3500
3501    # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware,
3502    # so we filter out a few errors for now
3503
3504    my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File);
3505
3506    if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) {
3507
3508        # the error is about a missing xmlns: attribute definition"
3509        return;    # this is not an error, 'cause we said so
3510    }
3511
3512    if ($err->{num} eq '187')
3513
3514        # filtering out no "document type declaration; will parse without
3515        # validation" if root element is not html and mode is xml...
3516    {
3517
3518        # since parsing was done without validation, result can only be
3519        # "well-formed"
3520        if ($is_xml and lc($File->{Root}) ne 'html') {
3521            $File->{XMLWF_ONLY} = TRUE;
3522            W3C::Validator::MarkupValidator::add_warning('W09xml', {});
3523            return;    # don't report this as an error, just proceed
3524        }
3525
3526        # if mode is not XML, we do report the error. It should not happen in
3527        # the case of <html> without doctype, in that case the error message
3528        # will be #344
3529    }
3530
3531    if (($err->{num} eq '113') and index($err->{msg}, 'xml:space') != -1) {
3532
3533        # FIXME
3534        # this is a problem with some of the "flattened" W3C DTDs, filtering
3535        # them out to not confuse users. hoping to get the DTDs fixed, see
3536        # http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html
3537        return;    # don't report this, just proceed
3538    }
3539
3540    if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) {
3541
3542        # we are in XML mode, we have a namespace, but no doctype.
3543        # the validator will already have said "no doctype, falling back to
3544        # default" above
3545        # no need to report this.
3546        return;    # don't report this, just proceed
3547    }
3548
3549    if (($err->{num} eq '248') or
3550        ($err->{num} eq '247') or
3551        ($err->{num} eq '246'))
3552    {
3553
3554        # these two errors should be triggered by -wmin-tag to report shorttag
3555        # used, but we're making them warnings, not errors
3556        # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7
3557        $err->{type} = "W";
3558    }
3559
3560    # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors
3561    # beyond EOL.  If you see this warning in your web server logs, please
3562    # let the validator developers know, see http://validator.w3.org/feedback.html
3563    # As long as $err may be from somewhere else than the document (such as
3564    # from a DTD) and we have no way of identifying these cases, this
3565    # produces bogus results and error log spewage, so commented out for now.
3566    #  if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) {
3567    #    warn("Warning: reported error column larger than line length " .
3568    #         "($err->{char} > $l) in $File->{URI} line $err->{line}, " .
3569    #         "OpenSP bug? Resetting to line length.");
3570    #    $err->{char} = $l;
3571    #  }
3572
3573    # No or unknown FPI and a relative SI.
3574    if ($err->{msg} =~ m(cannot (?:open|find))) {
3575        $File->{'Error Flagged'} = TRUE;
3576        my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File);
3577        $tmpl->param(fatal_parse_extid_error => TRUE);
3578        $tmpl->param(fatal_parse_extid_msg   => $err->{msg});
3579    }
3580
3581    # No DOCTYPE found! We are falling back to vanilla DTD
3582    if (index($err->{msg}, "prolog can't be omitted") != -1) {
3583        if (lc($File->{Root}) eq 'html') {
3584            my $dtd = $File->{"Default DOCTYPE"}->{$is_xml ? "XHTML" : "HTML"};
3585            W3C::Validator::MarkupValidator::add_warning('W09',
3586                {W09_dtd => $dtd});
3587        }
3588        else {    # not html root element, we are not using fallback
3589            unless ($is_xml) {
3590                $File->{'Is Valid'} = FALSE;
3591                W3C::Validator::MarkupValidator::add_warning('W09nohtml', {});
3592            }
3593        }
3594
3595        return;    # Don't report this as a normal error.
3596    }
3597
3598    # TODO: calling exit() here is probably a bad idea
3599    W3C::Validator::MarkupValidator::abort_if_error_flagged($File);
3600
3601    push @{$File->{Errors}}, $err;
3602
3603    # ...
3604    $File->{'Is Valid'} = FALSE if $err->{type} eq 'E';
3605
3606    if (defined $mess->{aux_message}) {
3607
3608        # "duplicate id ... first defined here" style messages
3609        push @{$File->{Errors}},
3610            {
3611            line => $mess->{aux_message}{LineNumber},
3612            char => $mess->{aux_message}{ColumnNumber} + 1,
3613            msg  => $mess->{aux_message}{Text},
3614            type => 'I',
3615            };
3616    }
3617}
3618
3619package W3C::Validator::EventHandler::Outliner;
3620
3621#
3622# Define global constants
3623use constant TRUE  => 1;
3624use constant FALSE => 0;
3625
3626#
3627# Tentative Validation Severities.
3628use constant T_WARN  => 4;    # 0000 0100
3629use constant T_ERROR => 8;    # 0000 1000
3630
3631use base qw(W3C::Validator::EventHandler);
3632
3633sub new
3634{
3635    my $class  = shift;
3636    my $parser = shift;
3637    my $File   = shift;
3638    my $CFG    = shift;
3639    my $self   = $class->SUPER::new($parser, $File, $CFG);
3640    $self->{am_in_heading} = 0;
3641    $self->{heading_text}  = [];
3642    bless $self, $class;
3643}
3644
3645sub data
3646{
3647    my ($self, $chars) = @_;
3648    push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading};
3649}
3650
3651sub start_element
3652{
3653    my ($self, $element) = @_;
3654    if ($element->{Name} =~ /^h([1-6])$/i) {
3655        $self->{_file}->{heading_outline} ||= "";
3656        $self->{_file}->{heading_outline} .=
3657            "    " x int($1) . "[$element->{Name}] ";
3658        $self->{am_in_heading} = 1;
3659    }
3660
3661    return $self->SUPER::start_element($element);
3662}
3663
3664sub end_element
3665{
3666    my ($self, $element) = @_;
3667    if ($element->{Name} =~ /^h[1-6]$/i) {
3668        my $text = join("", @{$self->{heading_text}});
3669        $text =~ s/^\s+//g;
3670        $text =~ s/\s+/ /g;
3671        $text =~ s/\s+$//g;
3672        $self->{_file}->{heading_outline} .= "$text\n";
3673        $self->{am_in_heading} = 0;
3674        $self->{heading_text}  = [];
3675    }
3676}
3677
3678#####
3679
3680package W3C::Validator::UserAgent;
3681
3682use HTTP::Message qw();
3683use LWP::UserAgent 2.032 qw();    # Need 2.032 for default_header()
3684use Net::hostent qw(gethostbyname);
3685use Net::IP qw();
3686use Socket qw(inet_ntoa);
3687
3688use base qw(LWP::UserAgent);
3689
3690BEGIN {
3691
3692    # The 4k default line length in LWP <= 5.832 isn't enough for example to
3693    # accommodate 4kB cookies (RFC 2985); bump it (#6678).
3694    require LWP::Protocol::http;
3695    push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024);
3696}
3697
3698sub new
3699{
3700    my ($proto, $CFG, $File, @rest) = @_;
3701    my $class = ref($proto) || $proto;
3702    my $self = $class->SUPER::new(@rest);
3703
3704    $self->{'W3C::Validator::CFG'}  = $CFG;
3705    $self->{'W3C::Validator::File'} = $File;
3706
3707    $self->env_proxy();
3708    $self->agent($File->{Opt}->{'User Agent'});
3709    $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']);
3710
3711    # Don't parse the http-equiv stuff.
3712    $self->parse_head(0);
3713
3714    # Tell caches in the middle we want a fresh copy (Bug 4998).
3715    $self->default_header('Cache-Control' => 'max-age=0');
3716
3717    # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle
3718    $self->default_header(
3719        'Accept-Encoding' => scalar HTTP::Message::decodable())
3720        if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable'));
3721
3722    # Our timeout should be set to something lower than the web server's,
3723    # remembering to give some head room for the actual validation to take
3724    # place after the document has been fetched (something like 15 seconds
3725    # should be plenty).  validator.w3.org instances have their timeout set
3726    # to 60 seconds as of writing this (#4985, #6950).
3727    $self->timeout(45);
3728
3729    return $self;
3730}
3731
3732sub redirect_ok
3733{
3734    my ($self, $req, $res) = @_;
3735    return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri());
3736}
3737
3738sub uri_ok
3739{
3740    my ($self, $uri) = @_;
3741
3742    return 1
3743        if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} ||
3744        !$uri->can('host'));
3745
3746    my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5};
3747    if ($h5uri) {
3748        my $clone = $uri->clone();
3749        $clone->query(undef);
3750        $clone->fragment(undef);
3751        $h5uri = URI->new($h5uri);
3752        $h5uri->query(undef);
3753        $h5uri->fragment(undef);
3754        return 1 if $clone->eq($h5uri);
3755    }
3756
3757    my $addr = my $iptype = undef;
3758    if (my $host = gethostbyname($uri->host())) {
3759        $addr = inet_ntoa($host->addr()) if $host->addr();
3760        if ($addr && (my $ip = Net::IP->new($addr))) {
3761            $iptype = $ip->iptype();
3762        }
3763    }
3764    if ($iptype && $iptype ne 'PUBLIC') {
3765        my $File = $self->{'W3C::Validator::File'};
3766        $File->{'Error Flagged'} = 1;
3767        my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File);
3768        $tmpl->param(fatal_ip_error    => 1);
3769        $tmpl->param(fatal_ip_host     => $uri->host() || 'undefined');
3770        $tmpl->param(fatal_ip_hostname => 1)
3771            if ($addr and $uri->host() ne $addr);
3772        return 0;
3773    }
3774    return 1;
3775}
3776
3777# Local Variables:
3778# mode: perl
3779# indent-tabs-mode: nil
3780# cperl-indent-level: 4
3781# cperl-continued-statement-offset: 4
3782# cperl-brace-offset: -4
3783# perl-indent-level: 4
3784# End:
3785# ex: ts=4 sw=4 et
3786