1#!/usr/local/bin/perl -T 2# 3# W3C Markup Validation Service 4# A CGI script to retrieve and validate a markup file 5# 6# Copyright 1995-2012 World Wide Web Consortium, (Massachusetts 7# Institute of Technology, European Research Consortium for Informatics 8# and Mathematics, Keio University). All Rights Reserved. 9# 10# Originally written by Gerald Oskoboiny <gerald@w3.org> 11# for additional contributors, see 12# http://dvcs.w3.org/hg/markup-validator/shortlog/tip and 13# http://validator.w3.org/about.html#credits 14# 15# This source code is available under the license at: 16# http://www.w3.org/Consortium/Legal/copyright-software 17 18# 19# We need Perl 5.8.0+. 20use 5.008; 21 22############################################################################### 23#### Load modules. ############################################################ 24############################################################################### 25 26# 27# Pragmas. 28use strict; 29use warnings; 30use utf8; 31 32package W3C::Validator::MarkupValidator; 33 34# 35# Modules. See also the BEGIN block further down below. 36# 37# Version numbers given where we absolutely need a minimum version of a given 38# module (gives nicer error messages). By default, add an empty import list 39# when loading modules to prevent non-OO or poorly written modules from 40# polluting our namespace. 41# 42 43# Need 3.40 for query string and path info fixes, #4365 44use CGI 3.40 qw(-newstyle_urls -private_tempfiles redirect); 45use CGI::Carp qw(carp croak fatalsToBrowser); 46use Config qw(%Config); 47use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 48use Encode qw(); 49use Encode::Alias qw(); 50use Encode::HanExtra qw(); # for some chinese character encodings, 51 # e.g gb18030 52use File::Spec::Functions qw(catfile rel2abs tmpdir); 53use HTML::Encoding 0.52 qw(); 54use HTML::HeadParser 3.60 qw(); # Needed for HTML5 meta charset workaround 55use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref) 56use HTML::Template qw(); # Need 2.6 for path param, other things. 57 # Specifying 2.6 would break with 2.10, 58 # rt.cpan.org#70190 59use HTTP::Headers::Util qw(); 60use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content() 61use HTTP::Request qw(); 62use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. 63use JSON 2.00 qw(); 64use SGML::Parser::OpenSP 0.991 qw(); 65use URI 1.53 qw(); # Need 1.53 for secure() 66use URI::Escape qw(uri_escape); 67use URI::file; 68use URI::Heuristic qw(); 69 70############################################################################### 71#### Constant definitions. #################################################### 72############################################################################### 73 74# 75# Define global constants 76use constant TRUE => 1; 77use constant FALSE => 0; 78 79# 80# Tentative Validation Severities. 81use constant T_WARN => 4; # 0000 0100 82use constant T_ERROR => 8; # 0000 1000 83 84# 85# Define global variables. 86use vars qw($DEBUG $CFG %RSRC $VERSION); 87$VERSION = '1.3'; 88 89use constant IS_MODPERL2 => 90 (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); 91 92# 93# Things inside BEGIN don't happen on every request in persistent environments 94# (such as mod_perl); so let's do the globals, eg. read config, here. 95BEGIN { 96 97 my $base = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/www/validator'; 98 99 # Launder data for -T; -AutoLaunder doesn't catch this one. 100 if ($base =~ /^(.*)$/) { 101 $base = $1; 102 } 103 104 # 105 # Read Config Files. 106 eval { 107 my %config_opts = ( 108 -ConfigFile => 109 ($ENV{W3C_VALIDATOR_CFG} || '/usr/local/www/validator/htdocs/config/validator.conf'), 110 -MergeDuplicateOptions => TRUE, 111 -MergeDuplicateBlocks => TRUE, 112 -SplitPolicy => 'equalsign', 113 -UseApacheInclude => TRUE, 114 -IncludeRelative => TRUE, 115 -InterPolateVars => TRUE, 116 -AutoLaunder => TRUE, 117 -AutoTrue => TRUE, 118 -CComments => FALSE, 119 -DefaultConfig => { 120 Protocols => {Allow => 'http,https'}, 121 Paths => { 122 Base => $base, 123 Cache => '', 124 }, 125 External => {HTML5 => FALSE,}, 126 }, 127 ); 128 my %cfg = Config::General->new(%config_opts)->getall(); 129 $CFG = \%cfg; 130 }; 131 if ($@) { 132 die <<"EOF"; 133Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable 134or copy conf/* to /etc/w3c/. Make sure that the configuration file and all 135included files are readable by the web server user. The error was:\n'$@' 136EOF 137 } 138 139 # 140 # Check paths in config 141 # @@FIXME: This does not do a very good job error-message-wise if 142 # a path is missing... 143 { 144 my %paths = map { $_ => [-d $_, -r _] } $CFG->{Paths}->{Base}, 145 $CFG->{Paths}->{Templates}, $CFG->{Paths}->{SGML}->{Library}; 146 my @_d = grep { not $paths{$_}->[0] } keys %paths; 147 my @_r = grep { not $paths{$_}->[1] } keys %paths; 148 die "Does not exist or is not a directory: @_d\n" if scalar(@_d); 149 die "Directory not readable (permission denied): @_r\n" if scalar(@_r); 150 } 151 152 # 153 # Split allowed protocols into a list. 154 if (my $allowed = delete($CFG->{Protocols}->{Allow})) { 155 $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; 156 } 157 158 # Split available languages into a list 159 if (my $langs = delete($CFG->{Languages})) { 160 $CFG->{Languages} = [split(/\s+/, $langs)]; 161 } 162 else { 163 164 # Default to english 165 $CFG->{Languages} = ["en"]; 166 } 167 168 { # Make types config indexed by FPI. 169 my $types = {}; 170 while (my ($key, $value) = each %{$CFG->{Types}}) { 171 $types->{$CFG->{Types}->{$key}->{PubID}} = $value; 172 } 173 $CFG->{Types} = $types; 174 } 175 176 # 177 # Change strings to internal constants in MIME type mapping. 178 while (my ($key, $value) = each %{$CFG->{MIME}}) { 179 $CFG->{MIME}->{$key} = 'TBD' 180 unless ($value eq 'SGML' || $value eq 'XML'); 181 } 182 183 # 184 # Register Encode aliases. 185 while (my ($key, $value) = each %{$CFG->{Charsets}}) { 186 Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/); 187 } 188 189 # 190 # Set debug flag. 191 if ($CFG->{'Allow Debug'}) { 192 $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; 193 } 194 else { 195 $DEBUG = FALSE; 196 } 197 198 # Read friendly error message file 199 # 'en_US' should be replaced by $lang for lang-neg 200 %RSRC = Config::General->new( 201 -MergeDuplicateBlocks => 1, 202 -ConfigFile => 203 catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), 204 )->getall(); 205 206 eval { 207 local $SIG{__DIE__} = undef; 208 require Encode::JIS2K; # for optional extra Japanese encodings 209 }; 210 211 # Tell libxml to load _only_ our XML catalog. This is because our entity 212 # load jailing may trap the libxml internal default catalog (which is 213 # automatically loaded). Preventing loading that from the input callback 214 # will cause libxml to not see the document content at all but to throw 215 # weird "Document is empty" errors, at least as of XML::LibXML 1.70 and 216 # libxml 2.7.7. XML_CATALOG_FILES needs to be in effect at XML::LibXML 217 # load time which is why we're using "require" here instead of pulling it 218 # in with "use" as usual. And finally, libxml should have support for 219 # SGML open catalogs but they don't seem to work (again as of 1.70 and 220 # 2.7.7); if we use xml.soc here, no entities seem to end up being resolved 221 # from it - so we use a (redundant) XML catalog which works. 222 # Note that setting XML_CATALOG_FILES here does not seem to work with 223 # mod_perl (it doesn't end up being used by XML::LibXML), therefore we do 224 # it in the mod_perl/startup.pl startup file for it too. 225 local $ENV{XML_CATALOG_FILES} = 226 catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml'); 227 require XML::LibXML; 228 XML::LibXML->VERSION(1.73); # Need 1.73 for rt.cpan.org #66642 229 230} # end of BEGIN block. 231 232# 233# Get rid of (possibly insecure) $PATH. 234delete $ENV{PATH}; 235 236#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. 237#use Data::Dumper qw(Dumper); 238#print Dumper($CFG); 239#exit; 240#@@DEBUG; 241 242############################################################################### 243#### Process CGI variables and initialize. #################################### 244############################################################################### 245 246# 247# Create a new CGI object. 248my $q = CGI->new(); 249 250# 251# The data structure that will hold all session data. 252# @@FIXME This can't be my() as $File will sooner or 253# later be undef and add_warning will cause the script 254# to die. our() seems to work but has other problems. 255# @@FIXME Apparently, this must be set to {} also, 256# otherwise the script might pick up an old object 257# after abort_if_error_flagged under mod_perl. 258our $File = {}; 259 260################################# 261# Initialize the datastructure. # 262################################# 263 264# 265# Charset data (casing policy: lowercase early). 266$File->{Charset}->{Use} = ''; # The charset used for validation. 267$File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F) 268$File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter. 269$File->{Charset}->{META} = ''; # From HTML's <meta http-equiv>. 270$File->{Charset}->{XML} = ''; # From the XML Declaration. 271$File->{Charset}->{Override} = ''; # From CGI/user override. 272 273# 274# Misc simple types. 275$File->{Mode} = 276 'DTD+SGML'; # Default parse mode is DTD validation in SGML mode. 277 278# By default, perform validation (we may perform only xml-wf in some cases) 279$File->{XMLWF_ONLY} = FALSE; 280 281# 282# Listrefs. 283$File->{Warnings} = []; # Warnings... 284$File->{Namespaces} = []; # Other (non-root) Namespaces. 285$File->{Parsers} = []; # Parsers used {name, link, type, options} 286 287# By default, doctype-less documents cannot be valid 288$File->{"DOCTYPEless OK"} = FALSE; 289$File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional'; 290$File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional'; 291 292############################################################################### 293#### Generate Template for Result. ############################################ 294############################################################################### 295 296# first we determine the chosen language based on 297# 1) lang argument given as parameter (if this language is available) 298# 2) HTTP language negotiation between variants available and user-agent choices 299# 3) English by default 300my $lang = $q->param('lang') || ''; 301my @localizations; 302foreach my $lang_available (@{$CFG->{Languages}}) { 303 if ($lang eq $lang_available) { 304 305 # Requested language (from parameters) is available, just use it 306 undef @localizations; 307 last; 308 } 309 push @localizations, 310 [ 311 $lang_available, 1, 'text/html', undef, 312 'utf-8', $lang_available, undef 313 ]; 314} 315 316# If language is not chosen yet, use HTTP-based negotiation 317if (@localizations) { 318 require HTTP::Negotiate; 319 $lang = HTTP::Negotiate::choose(\@localizations); 320} 321 322# HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0 323$lang ||= 'en_US'; 324 325if ($lang eq "en") { 326 $lang = 'en_US'; # legacy 327} 328 329$File->{Template_Defaults} = { 330 die_on_bad_params => FALSE, 331 loop_context_vars => TRUE, 332 global_vars => TRUE, 333 case_sensitive => TRUE, 334 path => [catfile($CFG->{Paths}->{Templates}, $lang)], 335 filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); }, 336}; 337if (IS_MODPERL2()) { 338 $File->{Template_Defaults}->{cache} = TRUE; 339} 340elsif ($CFG->{Paths}->{Cache}) { 341 $File->{Template_Defaults}->{file_cache} = TRUE; 342 $File->{Template_Defaults}->{file_cache_dir} = 343 rel2abs($CFG->{Paths}->{Cache}, tmpdir()); 344} 345 346undef $lang; 347 348######################################### 349# Populate $File->{Opt} -- CGI Options. # 350######################################### 351 352# 353# Preprocess the CGI parameters. 354$q = &prepCGI($File, $q); 355 356# 357# Set session switches. 358$File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE; 359$File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; 360$File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; 361$File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE; 362$File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; 363$File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE; 364$File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE; 365$File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE; 366$File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401'; 367$File->{Opt}->{Charset} = lc($q->param('charset') || ''); 368$File->{Opt}->{DOCTYPE} = $q->param('doctype') || ''; 369 370$File->{Opt}->{'User Agent'} = 371 $q->param('user-agent') && 372 $q->param('user-agent') ne "1" ? $q->param('user-agent') : 373 "W3C_Validator/$VERSION"; 374$File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d; 375 376if ($File->{Opt}->{'User Agent'} eq 'mobileok') { 377 $File->{Opt}->{'User Agent'} = 378 'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)'; 379} 380 381$File->{Opt}->{'Accept Header'} = $q->param('accept') || ''; 382$File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || ''; 383$File->{Opt}->{'Accept-Charset Header'} = $q->param('accept-charset') || ''; 384$File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d 385 for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header'); 386 387# 388# "Fallback" info for Character Encoding (fbc), Content-Type (fbt), 389# and DOCTYPE (fbd). If TRUE, the Override values are treated as 390# Fallbacks instead of Overrides. 391$File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE; 392$File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE; 393$File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; 394 395# 396# If ";debug" was given, let it overrule the value from the config file, 397# regardless of whether it's "0" or "1" (on or off), but only if config 398# allows the debugging options. 399if ($CFG->{'Allow Debug'}) { 400 $DEBUG = $q->param('debug') if defined $q->param('debug'); 401 $File->{Opt}->{Verbose} = TRUE if $DEBUG; 402} 403else { 404 $DEBUG = FALSE; # The default. 405} 406$File->{Opt}->{Debug} = $DEBUG; 407 408&abort_if_error_flagged($File); 409 410# 411# Get the file and metadata. 412if ($q->param('uploaded_file')) { 413 $File = &handle_file($q, $File); 414} 415elsif ($q->param('fragment')) { 416 $File = &handle_frag($q, $File); 417} 418elsif ($q->param('uri')) { 419 $File = &handle_uri($q, $File); 420} 421 422# 423# Abort if an error was flagged during initialization. 424&abort_if_error_flagged($File); 425 426# 427# Get rid of the CGI object. 428undef $q; 429 430# 431# We don't need STDIN any more, so get rid of it to avoid getting clobbered 432# by Apache::Registry's idiotic interference under mod_perl. 433untie *STDIN; 434 435############################################################################### 436#### Output validation results. ############################################### 437############################################################################### 438 439if (!$File->{ContentType} && !$File->{'Direct Input'} && !$File->{'Is Upload'}) 440{ 441 &add_warning('W08', {}); 442} 443 444$File = find_encodings($File); 445 446# 447# Decide on a charset to use (first part) 448# 449if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. 450 $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; 451} 452elsif ($File->{ContentType} =~ m(^text/(?:[-.a-zA-Z0-9]\+)?xml$)) { 453 454 # Act as if $http_charset was 'us-ascii'. (MIME rules) 455 $File->{Charset}->{Use} = 'us-ascii'; 456 457 &add_warning( 458 'W01', 459 { W01_upload => $File->{'Is Upload'}, 460 W01_agent => $File->{Server}, 461 W01_ct => $File->{ContentType}, 462 } 463 ); 464 465} 466elsif ($File->{Charset}->{XML}) { 467 $File->{Charset}->{Use} = $File->{Charset}->{XML}; 468} 469elsif ($File->{BOM} && 470 $File->{BOM} == 2 && 471 $File->{Charset}->{Auto} =~ /^utf-16[bl]e$/) 472{ 473 $File->{Charset}->{Use} = 'utf-16'; 474} 475elsif ($File->{ContentType} =~ m(^application/(?:[-.a-zA-Z0-9]+\+)?xml$)) { 476 $File->{Charset}->{Use} = "utf-8"; 477} 478elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { 479 $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) 480} 481$File->{Charset}->{Use} ||= $File->{Charset}->{META}; 482 483# 484# Handle any Fallback or Override for the charset. 485if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { 486 487 # charset=foo was given to the CGI and it wasn't "autodetect" or empty. 488 # 489 # Extract the user-requested charset from CGI param. 490 my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); 491 $File->{Charset}->{Override} = lc($override); 492 493 if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode 494 unless ($File->{Charset}->{Use}) 495 { # no charset detected, actual fallback 496 &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); 497 $File->{Tentative} |= T_ERROR; # Tag it as Invalid. 498 $File->{Charset}->{Use} = $File->{Charset}->{Override}; 499 } 500 } 501 else { # charset "hard override" mode 502 if (!$File->{Charset}->{Use}) { # overriding "nothing" 503 &add_warning( 504 'W04', 505 { W04_charset => $File->{Charset}->{Override}, 506 W04_override => TRUE 507 } 508 ); 509 $File->{Tentative} |= T_ERROR; 510 $File->{Charset}->{Use} = $File->{Charset}->{Override}; 511 } 512 elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) { 513 514 # Actually overriding something; warn about override. 515 &add_warning( 516 'W03', 517 { W03_use => $File->{Charset}->{Use}, 518 W03_opt => $File->{Charset}->{Override} 519 } 520 ); 521 $File->{Tentative} |= T_ERROR; 522 $File->{Charset}->{Use} = $File->{Charset}->{Override}; 523 } 524 } 525} 526 527if ($File->{'Direct Input'}) { #explain why UTF-8 is forced 528 &add_warning('W28', {}); 529} 530unless ($File->{Charset}->{XML} || $File->{Charset}->{META}) 531{ #suggest character encoding info within doc 532 &add_warning('W27', {}); 533} 534 535# 536# Abort if an error was flagged while finding the encoding. 537&abort_if_error_flagged($File); 538 539$File->{Charset}->{Default} = FALSE; 540unless ($File->{Charset}->{Use}) { # No charset given... 541 $File->{Charset}->{Use} = 'utf-8'; 542 $File->{Charset}->{Default} = TRUE; 543 $File->{Tentative} |= T_ERROR; # Can never be valid. 544 &add_warning('W04', {W04_charset => "UTF-8"}); 545} 546 547# Always transcode, even if the content claims to be UTF-8 548$File = transcode($File); 549 550# Try guessing if it didn't work out 551if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) { 552 my $also_tried = 'UTF-8'; 553 for my $cs (qw(windows-1252 iso-8859-1)) { 554 last unless $File->{'Error Flagged'}; 555 $File->{'Error Flagged'} = FALSE; # reset 556 $File->{Charset}->{Use} = $cs; 557 &add_warning('W04', 558 {W04_charset => $cs, W04_also_tried => $also_tried}); 559 $File = transcode($File); 560 $also_tried .= ", $cs"; 561 } 562} 563 564# if it still does not work, we abandon hope here 565&abort_if_error_flagged($File); 566 567# 568# Add a warning if doc is UTF-8 and contains a BOM. 569if ($File->{Charset}->{Use} eq 'utf-8' && 570 @{$File->{Content}} && 571 $File->{Content}->[0] =~ m(^\x{FEFF})) 572{ 573 &add_warning('W21', {}); 574} 575 576# 577# Overall parsing algorithm for documents returned as text/html: 578# 579# For documents that come to us as text/html, 580# 581# 1. check if there's a doctype 582# 2. if there is a doctype, parse/validate against that DTD 583# 3. if no doctype, check for an xmlns= attribute on the first element, or 584# XML declaration 585# 4. if no doctype and XML mode, check for XML well-formedness 586# 5. otherwise, punt. 587# 588 589# 590# Override DOCTYPE if user asked for it. 591if ($File->{Opt}->{DOCTYPE}) { 592 if ($File->{Opt}->{DOCTYPE} !~ /(?:Inline|detect)/i) { 593 $File = &override_doctype($File); 594 } 595 else { 596 597 # Get rid of inline|detect for easy truth value checking later 598 $File->{Opt}->{DOCTYPE} = ''; 599 } 600} 601 602# Try to extract a DOCTYPE or xmlns. 603$File = &preparse_doctype($File); 604 605if ($File->{Opt}->{DOCTYPE} eq "HTML5") { 606 $File->{DOCTYPE} = "HTML5"; 607 $File->{Version} = $File->{DOCTYPE}; 608} 609 610set_parse_mode($File, $CFG); 611 612# 613# Sanity check Charset information and add any warnings necessary. 614$File = &charset_conflicts($File); 615 616# before we start the parsing, clean slate 617$File->{'Is Valid'} = TRUE; 618$File->{Errors} = []; 619$File->{WF_Errors} = []; 620 621if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) { 622 if ($CFG->{External}->{HTML5}) { 623 $File = &html5_validate($File); 624 &add_warning( 625 'W00', 626 { W00_experimental_name => "HTML5 Conformance Checker", 627 W00_experimental_URI => "feedback.html" 628 } 629 ); 630 } 631 else { 632 $File->{'Error Flagged'} = TRUE; 633 my $tmpl = &get_error_template($File); 634 $tmpl->param(fatal_no_checker => TRUE); 635 $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); 636 } 637} 638elsif (($File->{DOCTYPE} eq '') and 639 (($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1)) 640{ 641 642 # we send doctypeless SVG, or any doctypeless XML document with multiple 643 # namespaces found, to a different engine. WARNING this is experimental. 644 if ($CFG->{External}->{CompoundXML}) { 645 $File = &compoundxml_validate($File); 646 &add_warning( 647 'W00', 648 { W00_experimental_name => "validator.nu Conformance Checker", 649 W00_experimental_URI => "feedback.html" 650 } 651 ); 652 } 653} 654else { 655 $File = &dtd_validate($File); 656} 657&abort_if_error_flagged($File); 658if (&is_xml($File)) { 659 if ($File->{DOCTYPE} eq "HTML5") { 660 661 # $File->{DOCTYPE} = "XHTML5"; 662 # $File->{Version} = "XHTML5"; 663 } 664 else { 665 666 # XMLWF check can be slow, skip if we already know the doc can't pass. 667 # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9899 668 $File = &xmlwf($File) if $File->{'Is Valid'}; 669 } 670 &abort_if_error_flagged($File); 671} 672 673# 674# Force "XML" if type is an XML type and an FPI was not found. 675# Otherwise set the type to be the FPI. 676if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { 677 $File->{Version} = 'XML'; 678} 679else { 680 $File->{Version} ||= $File->{DOCTYPE}; 681} 682 683# 684# Get the pretty text version of the FPI if a mapping exists. 685if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { 686 $File->{Version} = $prettyver; 687} 688 689# 690# check the received mime type against Allowed mime types 691if ($File->{ContentType}) { 692 my @allowedMediaType = 693 split(/\s+/, 694 $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); 695 my $usedCTisAllowed; 696 if (scalar @allowedMediaType) { 697 $usedCTisAllowed = FALSE; 698 foreach (@allowedMediaType) { 699 $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); 700 } 701 } 702 else { 703 704 # wedon't know what media type is recommended, so better shut up 705 $usedCTisAllowed = TRUE; 706 } 707 if (!$usedCTisAllowed) { 708 &add_warning( 709 'W23', 710 { W23_type => $File->{ContentType}, 711 W23_type_pref => 712 $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, 713 w23_doctype => $File->{Version} 714 } 715 ); 716 } 717} 718 719# 720# Warn about unknown, incorrect, or missing Namespaces. 721if ($File->{Namespace}) { 722 my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; 723 724 if (&is_xml($File)) { 725 if ($ns eq $File->{Namespace}) { 726 &add_warning( 727 'W10', 728 { W10_ns => $File->{Namespace}, 729 W10_type => $File->{Type}, 730 } 731 ); 732 } 733 } 734 elsif ($File->{DOCTYPE} ne 'HTML5') { 735 &add_warning( 736 'W11', 737 { W11_ns => $File->{Namespace}, 738 w11_doctype => $File->{DOCTYPE} 739 } 740 ); 741 } 742} 743else { 744 if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { 745 &add_warning('W12', {}); 746 } 747} 748 749## if invalid content, AND if requested, pass through tidy 750if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) { 751 eval { 752 local $SIG{__DIE__} = undef; 753 require HTML::Tidy; 754 my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); 755 my $cleaned = $tidy->clean(join("\n", @{$File->{Content}})); 756 $cleaned = Encode::decode_utf8($cleaned); 757 $File->{Tidy} = $cleaned; 758 }; 759 if ($@) { 760 (my $errmsg = $@) =~ s/ at .*//s; 761 &add_warning('W29', {W29_msg => $errmsg}); 762 } 763} 764 765my %templates = ( 766 earl => ['earl_xml.tmpl', default_escape => 'HTML'], 767 n3 => ['earl_n3.tmpl'], 768 json => ['json_output.tmpl'], 769 ucn => ['ucn_output.tmpl'], 770); 771my $template = $templates{$File->{Opt}->{Output}}; 772if ($template) { 773 my $tname = shift(@$template); 774 my $tmpl = &get_template($File, $tname, @$template); 775 $template = $tmpl; 776} 777elsif ($File->{Opt}->{Output} eq 'soap12') { 778 if ($CFG->{'Enable SOAP'} != 1) { 779 780 # API disabled - ideally this should have been sent before performing 781 # validation... 782 print CGI::header( 783 -status => 503, 784 -content_language => "en", 785 -type => "text/html", 786 -charset => "utf-8" 787 ); 788 $template = &get_template($File, 'soap_disabled.tmpl'); 789 } 790 else { 791 $template = &get_template($File, 'soap_output.tmpl'); 792 } 793} 794else { 795 $template = &get_template($File, 'result.tmpl'); 796} 797 798&prep_template($File, $template); 799&fin_template($File, $template); 800 801$template->param(tidy_output => $File->{Tidy}); 802$template->param(file_source => &source($File)) 803 if ($template->param('opt_show_source') or 804 ($File->{'Is Upload'}) or 805 ($File->{'Direct Input'})); 806 807if ($File->{Opt}->{Output} eq 'json') { 808 809 # No JSON escaping in HTML::Template (and "JS" is not the right thing here) 810 my $json = JSON->new(); 811 $json->allow_nonref(TRUE); 812 if (my $msgs = $template->param("file_errors")) { 813 for my $msg (@$msgs) { 814 for my $key (qw(msg expl)) { 815 $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key}; 816 } 817 818 # Drop non-numeric char indicators from output, e.g. 819 # "> 80" for some XML parse error ones (see the non-structured 820 # XML::LibXML code branch in XML preparsing below). 821 if ($msg->{char} && $msg->{char} !~ /^\d+$/) { 822 delete($msg->{char}); 823 } 824 } 825 } 826} 827 828# transcode output from perl's internal to utf-8 and output 829print Encode::encode('UTF-8', $template->output); 830 831# 832# Get rid of $File object and exit. 833undef $File; 834exit; 835 836############################################################################# 837# Subroutine definitions 838############################################################################# 839 840sub get_template ($$;@) 841{ 842 my ($File, $fname, @opts) = @_; 843 if (!$File->{_Templates}->{$fname}) { 844 my $tmpl = HTML::Template->new( 845 %{$File->{Template_Defaults}}, 846 filename => $fname, 847 @opts 848 ); 849 $tmpl->param(env_home_page => $File->{Env}->{'Home Page'}); 850 $tmpl->param(validator_version => $VERSION); 851 $File->{_Templates}->{$fname} = $tmpl; 852 } 853 return $File->{_Templates}->{$fname}; 854} 855 856sub get_error_template ($;@) 857{ 858 my ($File, @opts) = @_; 859 my $fname = 'fatal-error.tmpl'; 860 if ($File->{Opt}->{Output} eq 'soap12') { 861 $fname = 'soap_fault.tmpl'; 862 } 863 elsif ($File->{Opt}->{Output} eq 'ucn') { 864 $fname = 'ucn_fault.tmpl'; 865 } 866 return &get_template($File, $fname, @opts); 867} 868 869# TODO: need to bring in fixes from html5_validate() here 870sub compoundxml_validate (\$) 871{ 872 my $File = shift; 873 my $ua = W3C::Validator::UserAgent->new($CFG, $File); 874 875 push( 876 @{$File->{Parsers}}, 877 { name => "Compound XML", 878 link => "http://qa-dev.w3.org/", # TODO? 879 type => "", 880 options => "" 881 } 882 ); 883 884 my $url = URI->new($CFG->{External}->{CompoundXML}); 885 $url->query("out=xml"); 886 887 my $req = HTTP::Request->new(POST => $url); 888 889 if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { 890 891 # Doctype or charset overridden, need to use $File->{Content} in UTF-8 892 # because $File->{Bytes} is not affected by the overrides. This will 893 # most likely be a source of errors about internal/actual charset 894 # differences as long as our transcoding process does not "fix" the 895 # charset info in XML declaration and meta http-equiv (any others?). 896 if ($File->{'Direct Input'}) 897 { # sane default when using html5 validator by direct input 898 $req->content_type("application/xml; charset=UTF-8"); 899 } 900 else { 901 $req->content_type("$File->{ContentType}; charset=UTF-8"); 902 } 903 $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); 904 } 905 else { 906 907 # Pass original bytes, Content-Type and charset as-is. 908 # We trust that our and validator.nu's interpretation of line numbers 909 # is the same (regardless of EOL chars used in the document). 910 911 my @content_type = ($File->{ContentType} => undef); 912 push(@content_type, charset => $File->{Charset}->{HTTP}) 913 if $File->{Charset}->{HTTP}; 914 915 $req->content_type( 916 HTTP::Headers::Util::join_header_words(@content_type)); 917 $req->content_ref(\$File->{Bytes}); 918 } 919 920 $req->content_language($File->{ContentLang}) if $File->{ContentLang}; 921 922 # Intentionally using direct header access instead of $req->last_modified 923 $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; 924 925 # If not in debug mode, gzip the request (LWP >= 5.817) 926 eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; 927 928 my $res = $ua->request($req); 929 if (!$res->is_success()) { 930 $File->{'Error Flagged'} = TRUE; 931 my $tmpl = &get_error_template($File); 932 $tmpl->param(fatal_no_checker => TRUE); 933 $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); 934 $tmpl->param(fatal_checker_error => $res->status_line()); 935 } 936 else { 937 my $content = &get_content($File, $res); 938 return $File if $File->{'Error Flagged'}; 939 940 # and now we parse according to 941 # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output 942 # I wish we could use XML::LibXML::Reader here. but SHAME on those 943 # major unix distributions still shipping with libxml2 2.6.16… 4 years 944 # after its release 945 # …and we could use now as we require libxml2 >= 2.6.21 anyway… 946 my $xml_reader = XML::LibXML->new(); 947 $xml_reader->base_uri($res->base()); 948 949 my $xmlDOM; 950 eval { $xmlDOM = $xml_reader->parse_string($content); }; 951 if ($@) { 952 my $errmsg = $@; 953 $File->{'Error Flagged'} = TRUE; 954 my $tmpl = &get_error_template($File); 955 $tmpl->param(fatal_no_checker => TRUE); 956 $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); 957 $tmpl->param(fatal_checker_error => $errmsg); 958 return $File; 959 } 960 my @nodelist = $xmlDOM->getElementsByTagName("messages"); 961 my $messages_node = $nodelist[0]; 962 my @message_nodes = $messages_node->childNodes; 963 foreach my $message_node (@message_nodes) { 964 my $message_type = $message_node->localname; 965 my ($err, $xml_error_msg, $xml_error_expl); 966 967 if ($message_type eq "error") { 968 $err->{type} = "E"; 969 $File->{'Is Valid'} = FALSE; 970 } 971 elsif ($message_type eq "info") { 972 973 # by default - we find warnings in the type attribute (below) 974 $err->{type} = "I"; 975 } 976 if ($message_node->hasAttributes()) { 977 my @attributelist = $message_node->attributes(); 978 foreach my $attribute (@attributelist) { 979 if ($attribute->name eq "type") { 980 if (($attribute->getValue() eq "warning") and 981 ($message_type eq "info")) 982 { 983 $err->{type} = "W"; 984 } 985 986 } 987 if ($attribute->name eq "last-column") { 988 $err->{char} = $attribute->getValue(); 989 } 990 if ($attribute->name eq "last-line") { 991 $err->{line} = $attribute->getValue(); 992 } 993 994 } 995 } 996 my @child_nodes = $message_node->childNodes; 997 foreach my $child_node (@child_nodes) { 998 if ($child_node->localname eq "message") { 999 $xml_error_msg = $child_node->toString(); 1000 $xml_error_msg =~ s,</?[^>]*>,,gsi; 1001 } 1002 if ($child_node->localname eq "elaboration") { 1003 $xml_error_expl = $child_node->toString(); 1004 $xml_error_expl =~ s,</?elaboration>,,gi; 1005 $xml_error_expl = 1006 "\n<div class=\"ve xml\">$xml_error_expl</div>\n"; 1007 } 1008 } 1009 1010 # formatting the error message for output 1011 $err->{src} = "" if $err->{uri}; # TODO... 1012 $err->{num} = 'validator.nu'; 1013 $err->{msg} = $xml_error_msg; 1014 $err->{expl} = $xml_error_expl; 1015 1016 if ($err->{msg} =~ 1017 /Using the preset for (.*) based on the root namespace/) 1018 { 1019 $File->{DOCTYPE} = $1; 1020 } 1021 else { 1022 push @{$File->{Errors}}, $err; 1023 } 1024 1025 # @@ TODO message explanation / elaboration 1026 } 1027 } 1028 return $File; 1029} 1030 1031sub html5_validate (\$) 1032{ 1033 my $File = shift; 1034 my $ua = W3C::Validator::UserAgent->new($CFG, $File); 1035 1036 push( 1037 @{$File->{Parsers}}, 1038 { name => "validator.nu", 1039 link => "http://validator.nu/", 1040 type => "HTML5", 1041 options => "" 1042 } 1043 ); 1044 1045 my $url = URI->new($CFG->{External}->{HTML5}); 1046 $url->query("out=xml"); 1047 1048 my $req = HTTP::Request->new(POST => $url); 1049 my $ct = &is_xml($File) ? "application/xhtml+xml" : "text/html"; 1050 1051 if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override} || 1052 $File->{'Direct Input'}) 1053 { 1054 1055 # Doctype or charset overridden, need to use $File->{Content} in UTF-8 1056 # because $File->{Bytes} is not affected by the overrides. Note that 1057 # direct input is always considered an override here. 1058 1059 &override_charset($File, "UTF-8"); 1060 1061 $ct = $File->{ContentType} unless $File->{'Direct Input'}; 1062 my @ct = ($ct => undef, charset => "UTF-8"); 1063 $ct = HTTP::Headers::Util::join_header_words(@ct); 1064 1065 $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); 1066 } 1067 else { 1068 1069 # Pass original bytes, Content-Type and charset as-is. 1070 # We trust that our and validator.nu's interpretation of line numbers 1071 # is the same later when displaying error contexts (regardless of EOL 1072 # chars used in the document). 1073 1074 my @ct = ($File->{ContentType} => undef); 1075 push(@ct, charset => $File->{Charset}->{HTTP}) 1076 if $File->{Charset}->{HTTP}; 1077 $ct = HTTP::Headers::Util::join_header_words(@ct); 1078 1079 $req->content_ref(\$File->{Bytes}); 1080 } 1081 $req->content_type($ct); 1082 1083 $req->content_language($File->{ContentLang}) if $File->{ContentLang}; 1084 1085 # Intentionally using direct header access instead of $req->last_modified 1086 # (the latter takes seconds since epoch, but $File->{Modified} is an already 1087 # formatted string). 1088 $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; 1089 1090 # Use gzip in non-debug, remote HTML5 validator mode (LWP >= 5.817). 1091 if (!$File->{Opt}->{Debug} && 1092 $url->host() !~ /^(?:localhost|127(?:\.\d+){3}|.*\.localdomain)$/i) 1093 { 1094 eval { $req->encode("gzip"); }; 1095 } 1096 else { 1097 $req->header('Accept-Encoding', 'identity'); 1098 } 1099 1100 my $res = $ua->request($req); 1101 if (!$res->is_success()) { 1102 $File->{'Error Flagged'} = TRUE; 1103 my $tmpl = &get_error_template($File); 1104 $tmpl->param(fatal_no_checker => TRUE); 1105 $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); 1106 $tmpl->param(fatal_checker_error => $res->status_line()); 1107 } 1108 else { 1109 my $content = &get_content($File, $res); 1110 return $File if $File->{'Error Flagged'}; 1111 1112 # and now we parse according to 1113 # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output 1114 # I wish we could use XML::LibXML::Reader here. but SHAME on those 1115 # major unix distributions still shipping with libxml2 2.6.16… 4 years 1116 # after its release 1117 my $xml_reader = XML::LibXML->new(); 1118 $xml_reader->base_uri($res->base()); 1119 1120 my $xmlDOM; 1121 eval { $xmlDOM = $xml_reader->parse_string($content); }; 1122 if ($@) { 1123 my $errmsg = $@; 1124 $File->{'Error Flagged'} = TRUE; 1125 my $tmpl = &get_error_template($File); 1126 $tmpl->param(fatal_no_checker => TRUE); 1127 $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); 1128 $tmpl->param(fatal_checker_error => $errmsg); 1129 return $File; 1130 } 1131 my @nodelist = $xmlDOM->getElementsByTagName("messages"); 1132 my $messages_node = $nodelist[0]; 1133 my @message_nodes = $messages_node->childNodes; 1134 foreach my $message_node (@message_nodes) { 1135 my $message_type = $message_node->localname; 1136 my ($html5_error_msg, $html5_error_expl); 1137 my $err = {}; 1138 1139 # TODO: non-document errors should receive different/better 1140 # treatment, but this is better than hiding all problems for now 1141 # (#6747) 1142 if ($message_type eq "error" || 1143 $message_type eq "non-document-error") 1144 { 1145 $err->{type} = "E"; 1146 $File->{'Is Valid'} = FALSE; 1147 } 1148 elsif ($message_type eq "info") { 1149 1150 # by default - we find warnings in the type attribute (below) 1151 $err->{type} = "I"; 1152 } 1153 if ($message_node->hasAttributes()) { 1154 my @attributelist = $message_node->attributes(); 1155 foreach my $attribute (@attributelist) { 1156 if ($attribute->name eq "type") { 1157 if (($attribute->getValue() eq "warning") and 1158 ($message_type eq "info")) 1159 { 1160 $err->{type} = "W"; 1161 } 1162 1163 } 1164 elsif ($attribute->name eq "last-column") { 1165 $err->{char} = $attribute->getValue(); 1166 } 1167 elsif ($attribute->name eq "last-line") { 1168 $err->{line} = $attribute->getValue(); 1169 } 1170 elsif ($attribute->name eq "url") { 1171 &set_error_uri($err, $attribute->getValue()); 1172 } 1173 } 1174 } 1175 my @child_nodes = $message_node->childNodes; 1176 foreach my $child_node (@child_nodes) { 1177 if ($child_node->localname eq "message") { 1178 $html5_error_msg = $child_node->textContent(); 1179 } 1180 elsif ($child_node->localname eq "elaboration") { 1181 $html5_error_expl = $child_node->toString(); 1182 $html5_error_expl =~ s,</?elaboration>,,gi; 1183 $html5_error_expl = 1184 "\n<div class=\"ve html5\">$html5_error_expl</div>\n"; 1185 } 1186 } 1187 1188 # formatting the error message for output 1189 1190 # TODO: set $err->{src} from extract if we got an URI for the error: 1191 # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output#The_extract_Element 1192 # For now, set it directly to empty to prevent report_errors() from 1193 # trying to populate it from our doc. 1194 $err->{src} = "" if $err->{uri}; 1195 1196 $err->{num} = 'html5'; 1197 $err->{msg} = $html5_error_msg; 1198 $err->{expl} = $html5_error_expl; 1199 push @{$File->{Errors}}, $err; 1200 1201 # @@ TODO message explanation / elaboration 1202 } 1203 } 1204 return $File; 1205} 1206 1207sub dtd_validate (\$) 1208{ 1209 my $File = shift; 1210 my $opensp = SGML::Parser::OpenSP->new(); 1211 1212 # 1213 # By default, use SGML catalog file and SGML Declaration. 1214 my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); 1215 1216 # default parsing options 1217 my @spopt = qw(valid non-sgml-char-ref no-duplicate); 1218 1219 # 1220 # Switch to XML semantics if file is XML. 1221 if (&is_xml($File)) { 1222 $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); 1223 push(@spopt, 'xml'); 1224 } 1225 else { 1226 1227 # add warnings for shorttags 1228 push(@spopt, 'min-tag'); 1229 } 1230 1231 push( 1232 @{$File->{Parsers}}, 1233 { name => "OpenSP", 1234 link => "http://openjade.sourceforge.net/", 1235 type => "SGML/XML", 1236 options => join(" ", @spopt) 1237 } 1238 ); 1239 1240 # 1241 # Parser configuration 1242 $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); 1243 $opensp->catalogs($catalog); 1244 $opensp->show_error_numbers(1); 1245 $opensp->warnings(@spopt); 1246 1247 # 1248 # Restricted file reading is disabled on Win32 for the time 1249 # being since neither SGML::Parser::OpenSP nor check auto- 1250 # magically set search_dirs to include the temp directory 1251 # so restricted file reading would defunct the Validator. 1252 $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; 1253 1254 my $h; # event handler 1255 if ($File->{Opt}->{Outline}) { 1256 $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG); 1257 } 1258 else { 1259 $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG); 1260 } 1261 1262 $opensp->handler($h); 1263 $opensp->parse_string(join "\n", @{$File->{Content}}); 1264 1265 # Make sure there are no circular references, otherwise the script 1266 # would leak memory until mod_perl unloads it which could take some 1267 # time. @@FIXME It's probably overly careful though. 1268 $opensp->handler(undef); 1269 undef $h->{_parser}; 1270 undef $h->{_file}; 1271 undef $h; 1272 undef $opensp; 1273 1274 # 1275 # Set Version to be the FPI initially. 1276 $File->{Version} = $File->{DOCTYPE}; 1277 return $File; 1278} 1279 1280sub xmlwf (\$) 1281{ 1282 1283 # we should really be using a SAX ErrorHandler, but I can't find a way to 1284 # make it work with XML::LibXML::SAX::Parser... ** FIXME ** 1285 # ditto, we should try using W3C::Validator::EventHandler, but it's badly 1286 # linked to opensp at the moment 1287 1288 my $File = shift; 1289 my $xmlparser = XML::LibXML->new(); 1290 $xmlparser->line_numbers(1); 1291 $xmlparser->validation(0); 1292 $xmlparser->base_uri($File->{URI}) 1293 unless ($File->{'Direct Input'} || $File->{'Is Upload'}); 1294 1295 push( 1296 @{$File->{Parsers}}, 1297 { name => "libxml2", 1298 link => "http://xmlsoft.org/", 1299 type => "XML", 1300 options => "" 1301 } 1302 ); 1303 1304 # Restrict file reading similar to what SGML::Parser::OpenSP does. Note 1305 # that all inputs go through the callback so if we were passing a 1306 # URI/filename to the parser, it would be affected as well and would break 1307 # fetching the initial document. As long as we pass the doc as string, 1308 # this should work. 1309 my $cb = XML::LibXML::InputCallback->new(); 1310 $cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]); 1311 $xmlparser->input_callbacks($cb); 1312 1313 &override_charset($File, "UTF-8"); 1314 1315 eval { $xmlparser->parse_string(join("\n", @{$File->{Content}})); }; 1316 1317 if (ref($@)) { 1318 1319 # handle a structured error (XML::LibXML::Error object) 1320 1321 my $err_obj = $@; 1322 while ($err_obj) { 1323 my $err = {}; 1324 &set_error_uri($err, $err_obj->file()); 1325 $err->{src} = &ent($err_obj->context()) if $err->{uri}; 1326 $err->{line} = $err_obj->line(); 1327 $err->{char} = $err_obj->column(); 1328 $err->{num} = "libxml2-" . $err_obj->code(); 1329 $err->{type} = "E"; 1330 $err->{msg} = $err_obj->message(); 1331 1332 $err_obj = $err_obj->_prev(); 1333 1334 unshift(@{$File->{WF_Errors}}, $err); 1335 } 1336 } 1337 elsif ($@) { 1338 my $xmlwf_errors = $@; 1339 my $xmlwf_error_line = undef; 1340 my $xmlwf_error_col = undef; 1341 my $xmlwf_error_msg = undef; 1342 my $got_error_message = undef; 1343 my $got_quoted_line = undef; 1344 foreach my $msg_line (split "\n", $xmlwf_errors) { 1345 1346 $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; 1347 $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; 1348 1349 # first we get the actual error message 1350 if (!$got_error_message && 1351 $msg_line =~ /^(:\d+:)( parser error : .*)/) 1352 { 1353 $xmlwf_error_line = $1; 1354 $xmlwf_error_msg = $2; 1355 $xmlwf_error_line =~ s/:(\d+):/$1/; 1356 $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; 1357 $got_error_message = 1; 1358 } 1359 1360 # then we skip the second line, which shows the context 1361 # (we don't use that) 1362 elsif ($got_error_message && !$got_quoted_line) { 1363 $got_quoted_line = 1; 1364 } 1365 1366 # we now take the third line, with the pointer to the error's 1367 # column 1368 elsif (($msg_line =~ /(\s+)\^/) and 1369 $got_error_message and 1370 $got_quoted_line) 1371 { 1372 $xmlwf_error_col = length($1); 1373 } 1374 1375 # cleanup for a number of bugs for the column number 1376 if (defined($xmlwf_error_col)) { 1377 if (( my $l = 1378 length($File->{Content}->[$xmlwf_error_line - 1]) 1379 ) < $xmlwf_error_col 1380 ) 1381 { 1382 1383 # http://bugzilla.gnome.org/show_bug.cgi?id=434196 1384 #warn("Warning: reported error column larger than line length " . 1385 # "($xmlwf_error_col > $l) in $File->{URI} line " . 1386 # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); 1387 $xmlwf_error_col = $l; 1388 } 1389 elsif ($xmlwf_error_col == 79) { 1390 1391 # working around an apparent odd limitation of libxml which 1392 # only gives context for lines up to 80 chars 1393 # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 1394 # http://bugzilla.gnome.org/show_bug.cgi?id=424017 1395 $xmlwf_error_col = "> 80"; 1396 1397 # non-int line number will trigger the proper behavior in 1398 # report_error 1399 } 1400 } 1401 1402 # when we have all the info (one full error message), proceed 1403 # and move on to the next error 1404 if ((defined $xmlwf_error_line) and 1405 (defined $xmlwf_error_col) and 1406 (defined $xmlwf_error_msg)) 1407 { 1408 1409 # Reinitializing for the next batch of 3 lines 1410 $got_error_message = undef; 1411 $got_quoted_line = undef; 1412 1413 # formatting the error message for output 1414 my $err = {}; 1415 1416 # TODO: set_error_uri() (need test case) 1417 $err->{src} = "" if $err->{uri}; # TODO... 1418 $err->{line} = $xmlwf_error_line; 1419 $err->{char} = $xmlwf_error_col; 1420 $err->{num} = 'xmlwf'; 1421 $err->{type} = "E"; 1422 $err->{msg} = $xmlwf_error_msg; 1423 1424 push(@{$File->{WF_Errors}}, $err); 1425 $xmlwf_error_line = undef; 1426 $xmlwf_error_col = undef; 1427 $xmlwf_error_msg = undef; 1428 } 1429 } 1430 } 1431 1432 $File->{'Is Valid'} = FALSE if @{$File->{WF_Errors}}; 1433 return $File; 1434} 1435 1436# 1437# Generate HTML report. 1438sub prep_template ($$) 1439{ 1440 my $File = shift; 1441 my $T = shift; 1442 1443 # 1444 # XML mode... 1445 $T->param(is_xml => &is_xml($File)); 1446 1447 # 1448 # Upload? 1449 $T->param(is_upload => $File->{'Is Upload'}); 1450 1451 # 1452 # Direct Input? 1453 $T->param(is_direct_input => $File->{'Direct Input'}); 1454 1455 # 1456 # The URI... 1457 $T->param(file_uri => $File->{URI}); 1458 1459 # 1460 # HTTPS note? 1461 $T->param(file_https_note => $File->{'Is Upload'} || 1462 $File->{'Direct Input'} || 1463 URI->new($File->{URI})->secure()); 1464 1465 # 1466 # Set URL for page title. 1467 $T->param(page_title_url => $File->{URI}); 1468 1469 # 1470 # Metadata... 1471 $T->param(file_modified => $File->{Modified}); 1472 $T->param(file_server => $File->{Server}); 1473 $T->param(file_size => $File->{Size}); 1474 $T->param(file_contenttype => $File->{ContentType}); 1475 $T->param(file_charset => $File->{Charset}->{Use}); 1476 $T->param(file_doctype => $File->{DOCTYPE}); 1477 1478 # 1479 # Output options... 1480 $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); 1481 $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); 1482 $T->param(opt_show_outline => $File->{Opt}->{Outline}); 1483 $T->param(opt_verbose => $File->{Opt}->{Verbose}); 1484 $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); 1485 $T->param(opt_no200 => $File->{Opt}->{No200}); 1486 1487 # Root Element 1488 $T->param(root_element => $File->{Root}); 1489 1490 # Namespaces... 1491 $T->param(file_namespace => $File->{Namespace}); 1492 1493 # Non-root ones; unique, preserving occurrence order 1494 my %seen_ns = (); 1495 $seen_ns{$File->{Namespace}}++ if defined($File->{Namespace}); 1496 my @nss = 1497 map { $seen_ns{$_}++ == 0 ? {uri => $_} : () } @{$File->{Namespaces}}; 1498 $T->param(file_namespaces => \@nss) if @nss; 1499 1500 if ($File->{Opt}->{DOCTYPE}) { 1501 my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; 1502 $T->param($over_doctype_param => TRUE); 1503 } 1504 1505 if ($File->{Opt}->{Charset}) { 1506 my $over_charset_param = "override charset $File->{Opt}->{Charset}"; 1507 $T->param($over_charset_param => TRUE); 1508 } 1509 1510 # Allow content-negotiation 1511 if ($File->{Opt}->{'Accept Header'}) { 1512 $T->param('accept' => $File->{Opt}->{'Accept Header'}); 1513 } 1514 if ($File->{Opt}->{'Accept-Language Header'}) { 1515 $T->param( 1516 'accept-language' => $File->{Opt}->{'Accept-Language Header'}); 1517 } 1518 if ($File->{Opt}->{'Accept-Charset Header'}) { 1519 $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'}); 1520 } 1521 if ($File->{Opt}->{'User Agent'}) { 1522 $T->param('user-agent' => $File->{Opt}->{'User Agent'}); 1523 } 1524 if ($File->{'Error Flagged'}) { 1525 $T->param(fatal_error => TRUE); 1526 } 1527} 1528 1529sub fin_template ($$) 1530{ 1531 my $File = shift; 1532 my $T = shift; 1533 1534 # 1535 # Set debug info for HTML and SOAP reports. 1536 if ($DEBUG) { 1537 my @parsers; 1538 for my $parser (@{$File->{Parsers}}) { 1539 my $p = $parser->{name}; 1540 $p .= " (" . $parser->{options} . ")" if $parser->{options}; 1541 push(@parsers, $p); 1542 } 1543 $T->param( 1544 debug => [ 1545 map({name => $_, value => $ENV{$_}}, 1546 qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), 1547 {name => 'Content-Encoding', value => $File->{ContentEnc}}, 1548 {name => 'Content-Language', value => $File->{ContentLang}}, 1549 {name => 'Content-Location', value => $File->{ContentLoc}}, 1550 {name => 'Transfer-Encoding', value => $File->{TransferEnc}}, 1551 {name => 'Parse Mode', value => $File->{Mode}}, 1552 {name => 'Parse Mode Factor', value => $File->{ModeChoice}}, 1553 {name => 'Parsers Used', value => join(", ", @parsers)}, 1554 ], 1555 ); 1556 } 1557 1558 $T->param(parsers => $File->{Parsers}); 1559 1560 if (!$File->{Doctype} && 1561 (!$File->{Version} || 1562 $File->{Version} eq 'unknown' || 1563 $File->{Version} eq 'SGML') 1564 ) 1565 { 1566 my $default_doctype = 1567 $File->{"Default DOCTYPE"}->{&is_xml($File) ? "XHTML" : "HTML"}; 1568 $T->param(file_version => "$default_doctype"); 1569 } 1570 else { 1571 $T->param(file_version => $File->{Version}); 1572 } 1573 my ($num_errors, $num_warnings, $num_info, $reported_errors) = 1574 &report_errors($File); 1575 if ($num_errors + $num_warnings > 0) { 1576 $T->param(has_errors => 1); 1577 } 1578 $T->param(valid_errors_num => $num_errors); 1579 $num_warnings += scalar @{$File->{Warnings}}; 1580 $T->param(valid_warnings_num => $num_warnings); 1581 my $number_of_errors = ""; # textual form of $num_errors 1582 my $number_of_warnings = ""; # textual form of $num_errors 1583 1584 # The following is a bit hack-ish, but will enable us to have some logic 1585 # for a human-readable display of the number, with cases for 0, 1, 2 and 1586 # above (the case of 2 appears to be useful for localization in some 1587 # languages where the plural is different for 2, and above) 1588 1589 if ($num_errors > 1) { 1590 $T->param(number_of_errors_is_0 => FALSE); 1591 $T->param(number_of_errors_is_1 => FALSE); 1592 if ($num_errors == 2) { 1593 $T->param(number_of_errors_is_2 => TRUE); 1594 } 1595 else { 1596 $T->param(number_of_errors_is_2 => FALSE); 1597 } 1598 $T->param(number_of_errors_is_plural => TRUE); 1599 } 1600 elsif ($num_errors == 1) { 1601 $T->param(number_of_errors_is_0 => FALSE); 1602 $T->param(number_of_errors_is_1 => TRUE); 1603 $T->param(number_of_errors_is_2 => FALSE); 1604 $T->param(number_of_errors_is_plural => FALSE); 1605 } 1606 else { # 0 1607 $T->param(number_of_errors_is_0 => TRUE); 1608 $T->param(number_of_errors_is_1 => FALSE); 1609 $T->param(number_of_errors_is_2 => FALSE); 1610 $T->param(number_of_errors_is_plural => FALSE); 1611 } 1612 1613 if ($num_warnings > 1) { 1614 $T->param(number_of_warnings_is_0 => FALSE); 1615 $T->param(number_of_warnings_is_1 => FALSE); 1616 if ($num_warnings == 2) { 1617 $T->param(number_of_warnings_is_2 => TRUE); 1618 } 1619 else { 1620 $T->param(number_of_warnings_is_2 => FALSE); 1621 } 1622 $T->param(number_of_warnings_is_plural => TRUE); 1623 } 1624 elsif ($num_warnings == 1) { 1625 $T->param(number_of_warnings_is_0 => FALSE); 1626 $T->param(number_of_warnings_is_1 => TRUE); 1627 $T->param(number_of_warnings_is_2 => FALSE); 1628 $T->param(number_of_warnings_is_plural => FALSE); 1629 } 1630 else { # 0 1631 $T->param(number_of_warnings_is_0 => TRUE); 1632 $T->param(number_of_warnings_is_1 => FALSE); 1633 $T->param(number_of_warnings_is_2 => FALSE); 1634 $T->param(number_of_warnings_is_plural => FALSE); 1635 } 1636 1637 $T->param(file_outline => $File->{heading_outline}) 1638 if $File->{Opt}->{Outline}; 1639 1640 $T->param(file_errors => $reported_errors); 1641 if ($File->{'Is Valid'}) { 1642 $T->param(VALID => TRUE); 1643 $T->param(valid_status => 'Valid'); 1644 &report_valid($File, $T); 1645 } 1646 else { 1647 $T->param(VALID => FALSE); 1648 $T->param(valid_status => 'Invalid'); 1649 } 1650} 1651 1652# 1653# Output "This page is Valid" report. 1654sub report_valid 1655{ 1656 my $File = shift; 1657 my $T = shift; 1658 1659 unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { 1660 1661 if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { 1662 my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; 1663 $T->param(badge_uri => $cfg->{Badge}->{URI}); 1664 $T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'}); 1665 $T->param(badge_alt_uri => $cfg->{Badge}->{'Alt URI'}); 1666 $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}); 1667 $T->param(badge_alt => $cfg->{Badge}->{Alt}); 1668 $T->param(badge_rdfa => $cfg->{Badge}->{RDFa}); 1669 $T->param(badge_h => $cfg->{Badge}->{Height}); 1670 $T->param(badge_w => $cfg->{Badge}->{Width}); 1671 $T->param(badge_onclick => $cfg->{Badge}->{OnClick}); 1672 $T->param(badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : ''); 1673 } 1674 } 1675 elsif (defined $File->{Tentative}) { 1676 $T->param(is_tentative => TRUE); 1677 } 1678 1679 if ($File->{XMLWF_ONLY}) { 1680 $T->param(xmlwf_only => TRUE); 1681 } 1682 my $thispage = self_url_file($File); 1683 $T->param(file_thispage => $thispage); 1684} 1685 1686# 1687# Add a warning message to the output. 1688sub add_warning ($$) 1689{ 1690 my $WID = shift; 1691 my $params = shift; 1692 1693 push @{$File->{Warnings}}, $WID; 1694 1695 my %tmplparams = ( 1696 $WID => TRUE, 1697 have_warnings => TRUE, 1698 %$params, 1699 ); 1700 for my $tmpl (qw(result fatal-error soap_output ucn_output)) { 1701 &get_template($File, "$tmpl.tmpl")->param(%tmplparams); 1702 } 1703} 1704 1705# 1706# Proxy authentication requests. 1707# Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). 1708sub authenticate 1709{ 1710 my $File = shift; 1711 my $resource = shift; 1712 my $authHeader = shift || {}; 1713 1714 my $realm = $resource; 1715 $realm =~ s([^\w\d.-]*){}g; 1716 1717 while (my ($scheme, $header) = each %$authHeader) { 1718 my $origrealm = $header->{realm}; 1719 if (not defined $origrealm or $scheme !~ /^(?:basic|digest)$/i) { 1720 delete($authHeader->{$scheme}); 1721 next; 1722 } 1723 $header->{realm} = "$realm-$origrealm"; 1724 } 1725 1726 my $headers = HTTP::Headers->new(Connection => 'close'); 1727 $headers->www_authenticate(%$authHeader); 1728 $headers = $headers->as_string(); 1729 chomp($headers); 1730 1731 my $tmpl = &get_template($File, 'http_401_authrequired.tmpl'); 1732 $tmpl->param(http_401_headers => $headers); 1733 $tmpl->param(http_401_url => $resource); 1734 1735 print Encode::encode('UTF-8', $tmpl->output); 1736 exit; # Further interaction will be a new HTTP request. 1737} 1738 1739# 1740# Fetch an URL and return the content and selected meta-info. 1741sub handle_uri 1742{ 1743 my $q = shift; # The CGI object. 1744 my $File = shift; # The master datastructure. 1745 1746 my $ua = W3C::Validator::UserAgent->new($CFG, $File); 1747 1748 my $uri = URI->new(ref $q ? $q->param('uri') : $q)->canonical(); 1749 $uri->fragment(undef); 1750 1751 if (!$uri->scheme()) { 1752 local $ENV{URL_GUESS_PATTERN} = ''; 1753 my $guess = URI::Heuristic::uf_uri($uri); 1754 if ($guess->scheme() && $ua->is_protocol_supported($guess)) { 1755 $uri = $guess; 1756 } 1757 else { 1758 $uri = URI->new("http://$uri"); 1759 } 1760 } 1761 1762 unless ($ua->is_protocol_supported($uri)) { 1763 $File->{'Error Flagged'} = TRUE; 1764 my $tmpl = &get_error_template($File); 1765 1766 # If uri param is empty (also for empty direct or upload), it's been 1767 # set to TRUE in sub prepCGI() 1768 if ($uri->canonical() eq "1") { 1769 $tmpl->param(fatal_no_content => TRUE); 1770 } 1771 else { 1772 $tmpl->param(fatal_uri_error => TRUE); 1773 $tmpl->param(fatal_uri_scheme => $uri->scheme()); 1774 } 1775 return $File; 1776 } 1777 1778 return $File unless $ua->uri_ok($uri); 1779 1780 my $req = HTTP::Request->new(GET => $uri); 1781 1782 # if one wants to use the accept, accept-charset and accept-language params 1783 # in order to trigger specific negotiation 1784 if ($File->{Opt}->{'Accept Header'}) { 1785 $req->header(Accept => $File->{Opt}->{'Accept Header'}); 1786 } 1787 if ($File->{Opt}->{'Accept-Language Header'}) { 1788 $req->header( 1789 Accept_Language => $File->{Opt}->{'Accept-Language Header'}); 1790 } 1791 if ($File->{Opt}->{'Accept-Charset Header'}) { 1792 $req->header( 1793 Accept_Charset => $File->{Opt}->{'Accept-Charset Header'}); 1794 } 1795 1796 # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. 1797 # If we're under mod_perl, there is a way around it... 1798 my $http_auth = $ENV{HTTP_AUTHORIZATION}; 1799 eval { 1800 local $SIG{__DIE__} = undef; 1801 my $auth = 1802 Apache2::RequestUtil->request()->headers_in()->{Authorization}; 1803 $http_auth = $auth if $auth; 1804 } if (IS_MODPERL2() && !$http_auth); 1805 1806 # If we got a Authorization header, the client is back at it after being 1807 # prompted for a password so we insert the header as is in the request. 1808 $req->headers->header(Authorization => $http_auth) if $http_auth; 1809 1810 my $res = $ua->request($req); 1811 1812 return $File if $File->{'Error Flagged'}; # Redirect IP rejected? 1813 1814 unless ($res->code == 200 or $File->{Opt}->{'No200'}) { 1815 if ($res->code == 401) { 1816 my %auth = $res->www_authenticate(); # HTTP::Headers::Auth 1817 &authenticate($File, $res->request->uri, \%auth); 1818 } 1819 else { 1820 $File->{'Error Flagged'} = TRUE; 1821 1822 my $no200url = undef; 1823 if (!$File->{Opt}->{No200}) { 1824 1825 # $File->{URI} not set yet; setting it non-local has side 1826 # effects 1827 local $File->{URI} = $uri->as_string; 1828 local $File->{Opt}->{No200} = TRUE; 1829 $no200url = &self_url_file($File); 1830 } 1831 1832 my $warning = $res->header("Client-Warning"); 1833 if ($warning && $warning =~ /Internal response/i) { 1834 1835 # Response doc generated internally by LWP, no need to show 1836 # that info nor to provide error doc validation link to it. 1837 $warning = undef; 1838 $no200url = undef; 1839 } 1840 1841 my $tmpl = &get_error_template($File); 1842 $tmpl->param(fatal_http_error => TRUE); 1843 $tmpl->param(fatal_http_uri => $uri->as_string); 1844 $tmpl->param(fatal_http_code => $res->code); 1845 $tmpl->param(fatal_http_msg => $res->message); 1846 $tmpl->param(fatal_http_warn => $warning); 1847 $tmpl->param(fatal_http_no200 => $no200url); 1848 $tmpl->param(fatal_http_dns => TRUE) if ($res->code == 500); 1849 } 1850 1851 return $File; 1852 } 1853 1854 # 1855 # Enforce Max Recursion level. 1856 &check_recursion($File, $res); 1857 1858 my ($mode, $ct, $charset) = &parse_content_type( 1859 $File, 1860 scalar($res->header('Content-Type')), 1861 scalar($res->request->uri), 1862 ); 1863 1864 my $content = &get_content($File, $res); 1865 return $File if $File->{'Error Flagged'}; 1866 1867 $File->{Bytes} = $content; 1868 $File->{Mode} = $mode; 1869 $File->{ContentType} = $ct; 1870 $File->{ContentEnc} = $res->content_encoding; 1871 $File->{ContentLang} = $res->content_language; 1872 $File->{ContentLoc} = $res->header('Content-Location'); 1873 $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); 1874 $File->{Charset}->{HTTP} = lc $charset if defined $charset; 1875 $File->{Modified} = $res->header('Last-Modified'); 1876 $File->{Server} = scalar $res->server; 1877 1878 # TODO: Content-Length is not always set, so either this should 1879 # be renamed to 'Content-Length' or it should consider more than 1880 # the Content-Length header. 1881 $File->{Size} = scalar $res->content_length; 1882 $File->{URI} = scalar $res->request->uri->canonical; 1883 $File->{'Is Upload'} = FALSE; 1884 $File->{'Direct Input'} = FALSE; 1885 1886 return $File; 1887} 1888 1889# 1890# Handle uploaded file and return the content and selected meta-info. 1891sub handle_file 1892{ 1893 my $q = shift; # The CGI object. 1894 my $File = shift; # The master datastructure. 1895 1896 my $p = $q->param('uploaded_file'); 1897 my $f = $q->upload('uploaded_file'); 1898 if (!defined($f)) { 1899 1900 # Probably not an uploaded file as far as CGI is concerned, 1901 # treat as a fragment. 1902 $q->param('fragment', $p); 1903 return &handle_frag($q, $File); 1904 } 1905 1906 my $h = $q->uploadInfo($p); 1907 1908 local $/ = undef; # set line delimiter so that <> reads rest of file 1909 my $file = <$f>; 1910 1911 my ($mode, $ct, $charset) = 1912 &parse_content_type($File, $h->{'Content-Type'}); 1913 1914 $File->{Bytes} = $file; 1915 $File->{Mode} = $mode; 1916 $File->{ContentType} = $ct; 1917 $File->{Charset}->{HTTP} = lc $charset if defined $charset; 1918 $File->{Modified} = $q->http('Last-Modified'); 1919 $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) 1920 $File->{Size} = $q->http('Content-Length'); 1921 $File->{URI} = "$p"; 1922 $File->{'Is Upload'} = TRUE; 1923 $File->{'Direct Input'} = FALSE; 1924 1925 return $File; 1926} 1927 1928# 1929# Handle uploaded file and return the content and selected meta-info. 1930sub handle_frag 1931{ 1932 my $q = shift; # The CGI object. 1933 my $File = shift; # The master datastructure. 1934 1935 $File->{Bytes} = $q->param('fragment'); 1936 $File->{Mode} = 'TBD'; 1937 $File->{Modified} = ''; 1938 $File->{Server} = ''; 1939 $File->{Size} = ''; 1940 $File->{ContentType} = ''; # @@TODO? 1941 $File->{URI} = 'upload://Form Submission'; 1942 $File->{'Is Upload'} = FALSE; 1943 $File->{'Direct Input'} = TRUE; 1944 $File->{Charset}->{HTTP} = 1945 "utf-8"; # by default, the form accepts utf-8 chars 1946 1947 if ($File->{Opt}->{Prefill}) { 1948 1949 # we surround the HTML fragment with some basic document structure 1950 my $prefill_Template; 1951 if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { 1952 $prefill_Template = &get_template($File, 'prefill_html401.tmpl'); 1953 } 1954 else { 1955 $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl'); 1956 } 1957 $prefill_Template->param(fragment => $File->{Bytes}); 1958 $File->{Bytes} = $prefill_Template->output(); 1959 1960 # Let's force the view source so that the user knows what we've put 1961 # around their code. 1962 $File->{Opt}->{'Show Source'} = TRUE; 1963 1964 # Ignore doctype overrides (#5132). 1965 $File->{Opt}->{DOCTYPE} = 'Inline'; 1966 } 1967 1968 return $File; 1969} 1970 1971# 1972# Parse a Content-Type and parameters. Return document type and charset. 1973sub parse_content_type 1974{ 1975 my $File = shift; 1976 my $Content_Type = shift; 1977 my $url = shift; 1978 my $charset = ''; 1979 1980 my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; 1981 1982 my $mode = $CFG->{MIME}->{$ct} || $ct; 1983 1984 $charset = HTML::Encoding::encoding_from_content_type($Content_Type); 1985 1986 if (index($mode, '/') != -1) { # a "/" means it's unknown or we'd have a mode here. 1987 if ($ct eq 'text/css' and defined $url) { 1988 print redirect 1989 'http://jigsaw.w3.org/css-validator/validator?uri=' . 1990 uri_escape $url; 1991 exit; 1992 } 1993 elsif ($ct eq 'application/atom+xml' and defined $url) { 1994 print redirect 'http://validator.w3.org/feed/check.cgi?url=' . 1995 uri_escape $url; 1996 exit; 1997 } 1998 elsif ($ct =~ m(^application/.+\+xml$)) { 1999 2000 # unknown media types which should be XML - we give these a try 2001 $mode = "XML"; 2002 } 2003 else { 2004 $File->{'Error Flagged'} = TRUE; 2005 my $tmpl = &get_error_template($File); 2006 $tmpl->param(fatal_mime_error => TRUE); 2007 $tmpl->param(fatal_mime_ct => $ct); 2008 } 2009 } 2010 2011 return $mode, $ct, $charset; 2012} 2013 2014# 2015# Get content with Content-Encodings decoded from a response. 2016sub get_content ($$) 2017{ 2018 my $File = shift; 2019 my $res = shift; 2020 2021 my $content; 2022 eval { 2023 $content = $res->decoded_content(charset => 'none', raise_error => 1); 2024 }; 2025 if ($@) { 2026 (my $errmsg = $@) =~ s/ at .*//s; 2027 my $cenc = $res->header("Content-Encoding"); 2028 my $uri = $res->request->uri; 2029 $File->{'Error Flagged'} = TRUE; 2030 my $tmpl = &get_error_template($File); 2031 $tmpl->param(fatal_decode_error => TRUE); 2032 $tmpl->param(fatal_decode_errmsg => $errmsg); 2033 $tmpl->param(fatal_decode_cenc => $cenc); 2034 2035 # Include URI because it might be a subsystem (eg. HTML5 validator) one 2036 $tmpl->param(fatal_decode_uri => $uri); 2037 } 2038 2039 return $content; 2040} 2041 2042# 2043# Check recursion level and enforce Max Recursion limit. 2044sub check_recursion ($$) 2045{ 2046 my $File = shift; 2047 my $res = shift; 2048 2049 # Not looking at our own output. 2050 return unless defined $res->header('X-W3C-Validator-Recursion'); 2051 2052 my $lvl = $res->header('X-W3C-Validator-Recursion'); 2053 return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. 2054 2055 if ($lvl >= $CFG->{'Max Recursion'}) { 2056 print redirect $File->{Env}->{'Home Page'}; 2057 } 2058 else { 2059 2060 # Increase recursion level in output. 2061 &get_template($File, 'result.tmpl')->param(depth => $lvl++); 2062 } 2063} 2064 2065# 2066# XML::LibXML::InputCallback matcher using our SGML search path jail. 2067sub xml_jail_match 2068{ 2069 my $arg = shift; 2070 2071 # Ensure we have a file:// URI if we get a file. 2072 my $uri = URI->new($arg); 2073 if (!$uri->scheme()) { 2074 $uri = URI::file->new_abs($arg); 2075 } 2076 $uri = $uri->canonical(); 2077 2078 # Do not trap non-file URIs. 2079 return 0 unless ($uri->scheme() eq "file"); 2080 2081 # Do not trap file URIs within our jail. 2082 for my $dir ($CFG->{Paths}->{SGML}->{Library}, 2083 split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || '')) 2084 { 2085 next unless $dir; 2086 my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string(); 2087 $dir_uri =~ s|/*$|/|; # ensure it ends with a slash 2088 return 0 if ($uri =~ /^\Q$dir_uri\E/); 2089 } 2090 2091 # We have a match (a file outside the jail). 2092 return 1; 2093} 2094 2095# 2096# Escape text to be included in markup comment. 2097sub escape_comment 2098{ 2099 local $_ = shift; 2100 return '' unless defined; 2101 s/--/- /g; 2102 return $_; 2103} 2104 2105# 2106# Return $_[0] encoded for HTML entities (cribbed from merlyn). 2107# 2108# Note that this is used both for HTML and XML escaping (so e.g. no '). 2109# 2110sub ent 2111{ 2112 my $str = shift; 2113 return '' unless defined($str); # Eliminate warnings 2114 2115 # should switch to hex sooner or later 2116 $str =~ s/&/&/g; 2117 $str =~ s/</</g; 2118 $str =~ s/>/>/g; 2119 $str =~ s/"/"/g; 2120 $str =~ s/'/'/g; 2121 2122 return $str; 2123} 2124 2125# 2126# Truncate source lines for report. 2127# Expects 1-based column indexes. 2128sub truncate_line 2129{ 2130 my $line = shift; 2131 my $col = shift; 2132 my $maxlen = 80; # max line length to truncate to 2133 2134 my $diff = length($line) - $maxlen; 2135 2136 # Don't truncate at all if it fits. 2137 return ($line, $col) if ($diff <= 0); 2138 2139 my $start = $col - int($maxlen / 2); 2140 if ($start < 0) { 2141 2142 # Truncate only from end of line. 2143 $start = 0; 2144 $line = substr($line, $start, $maxlen - 1) . '…'; 2145 } 2146 elsif ($start > $diff) { 2147 2148 # Truncate only from beginning of line. 2149 $start = $diff; 2150 $line = '…' . substr($line, $start + 1); 2151 } 2152 else { 2153 2154 # Truncate from both beginning and end of line. 2155 $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…'; 2156 } 2157 2158 # Shift column if we truncated from beginning of line. 2159 $col -= $start; 2160 2161 return ($line, $col); 2162} 2163 2164# 2165# Suppress any existing DOCTYPE by commenting it out. 2166sub override_doctype 2167{ 2168 my $File = shift; 2169 2170 my ($dt) = 2171 grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } 2172 values %{$CFG->{Types}}; 2173 2174 # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; 2175 my $pubid = $dt->{PubID}; 2176 my $sysid = $dt->{SysID}; 2177 my $name = $dt->{Name}; 2178 2179 # The HTML5 PubID is a fake, reset it out of the way. 2180 $pubid = undef if ($pubid eq 'HTML5'); 2181 2182 # We don't have public/system ids for all types. 2183 my $dtd = "<!DOCTYPE $name"; 2184 if ($pubid) { 2185 $dtd .= qq( PUBLIC "$pubid"); 2186 $dtd .= qq( "$sysid") if $sysid; 2187 } 2188 elsif ($sysid) { 2189 $dtd .= qq( SYSTEM "$sysid"); 2190 } 2191 $dtd .= '>'; 2192 2193 my $org_dtd = ''; 2194 my $HTML = ''; 2195 my $seen_doctype = FALSE; 2196 2197 my $declaration = sub { 2198 my ($tag, $text) = @_; 2199 if ($seen_doctype || uc($tag) ne '!DOCTYPE') { 2200 $HTML .= $text; 2201 return; 2202 } 2203 2204 $seen_doctype = TRUE; 2205 2206 $org_dtd = &ent($text); 2207 ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~ 2208 /<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si; 2209 2210 $File->{DOCTYPE} = 'HTML5' 2211 if ( 2212 lc($File->{Root} || '') eq 'html' && 2213 (!defined($File->{DOCTYPE}) || 2214 $File->{DOCTYPE} eq 'about:legacy-compat') 2215 ); 2216 2217 # No Override if Fallback was requested, or if override is the same as 2218 # detected 2219 my $known = $CFG->{Types}->{$File->{DOCTYPE}}; 2220 if ($File->{Opt}->{FB}->{DOCTYPE} or 2221 ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) 2222 { 2223 $HTML .= $text; # Stash it as is... 2224 } 2225 else { 2226 $HTML .= "$dtd<!-- " . &escape_comment($text) . " -->"; 2227 } 2228 }; 2229 2230 my $start_element = sub { 2231 my $p = shift; 2232 # Sneak chosen doctype before the root elt if none replaced thus far. 2233 $HTML .= $dtd unless $seen_doctype; 2234 $HTML .= shift; 2235 # We're done with this handler. 2236 $p->handler(start => undef); 2237 }; 2238 2239 HTML::Parser->new( 2240 default_h => [sub { $HTML .= shift }, 'text'], 2241 declaration_h => [$declaration, 'tag,text'], 2242 start_h => [$start_element, 'self,text'] 2243 )->parse(join "\n", @{$File->{Content}})->eof(); 2244 2245 $File->{Content} = [split /\n/, $HTML]; 2246 2247 if ($seen_doctype) { 2248 my $known = $CFG->{Types}->{$File->{DOCTYPE}}; 2249 unless ($File->{Opt}->{FB}->{DOCTYPE} or 2250 ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) 2251 { 2252 &add_warning( 2253 'W13', 2254 { W13_org => $org_dtd, 2255 W13_new => $File->{Opt}->{DOCTYPE}, 2256 } 2257 ); 2258 $File->{Tentative} |= T_ERROR; # Tag it as Invalid. 2259 } 2260 } 2261 else { 2262 if ($File->{"DOCTYPEless OK"}) { 2263 &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}}); 2264 } 2265 elsif ($File->{Opt}->{FB}->{DOCTYPE}) { 2266 &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); 2267 $File->{Tentative} |= T_ERROR; # Tag it as Invalid. 2268 } 2269 else { 2270 &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); 2271 $File->{Tentative} |= T_ERROR; # Tag it as Invalid. 2272 } 2273 } 2274 2275 return $File; 2276} 2277 2278# 2279# Override inline charset declarations, for use e.g. when passing 2280# transcoded results to external parsers that use them. 2281sub override_charset ($$) 2282{ 2283 my ($File, $charset) = @_; 2284 2285 my $ws = qr/[\x20\x09\x0D\x0A]/o; 2286 my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o; 2287 2288 my $content = join("\n", @{$File->{Content}}); 2289 2290 # Flatten newlines (so that we don't end up changing line numbers while 2291 # overriding) and comment-escape a string. 2292 sub escape_original ($) 2293 { 2294 my $str = shift; 2295 $str =~ tr/\r\n/ /; 2296 return &escape_comment($str); 2297 } 2298 2299 # <?xml encoding="charset"?> 2300 $content =~ s/( 2301 (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["'])) 2302 (${cs}) 2303 (\3.*?\?>) 2304 )/lc($4) eq lc($charset) ? 2305 "$1" : "$2$charset$5<!-- " . &escape_original($1) . " -->"/esx; 2306 2307 # <meta charset="charset"> 2308 $content =~ s/( 2309 (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*) 2310 (${cs}) 2311 (.*?>) 2312 )/lc($3) eq lc($charset) ? 2313 "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix; 2314 2315 # <meta http-equiv="content-type" content="some/type; charset=charset"> 2316 $content =~ s/( 2317 (<meta\b[^>]*${ws} 2318 http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws} 2319 content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) 2320 (${cs}) 2321 (.*?>) 2322 )/lc($3) eq lc($charset) ? 2323 "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix; 2324 2325 # <meta content="some/type; charset=charset" http-equiv="content-type"> 2326 $content =~ s/( 2327 (<meta\b[^>]*${ws} 2328 content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) 2329 (${cs}) 2330 ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>) 2331 )/lc($3) eq lc($charset) ? 2332 "$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix; 2333 2334 $File->{Content} = [split /\n/, $content]; 2335} 2336 2337sub set_error_uri ($$) 2338{ 2339 my ($err, $uri) = @_; 2340 2341 # We want errors in the doc that was validated to appear without 2342 # $err->{uri}, and non-doc errors with it pointing to the external entity 2343 # or the like where the error is. This usually works as long as we're 2344 # passing docs to parsers as strings, but S::P::O (at least as of 0.994) 2345 # seems to give us "3" as the FileName in those cases so we try to filter 2346 # out everything that doesn't look like a useful URI. 2347 if ($uri && index($uri, '/') != -1) { 2348 2349 # Mask local file paths 2350 my $euri = URI->new($uri); 2351 if (!$euri->scheme() || $euri->scheme() eq 'file') { 2352 $err->{uri_is_file} = TRUE; 2353 $err->{uri} = ($euri->path_segments())[-1]; 2354 } 2355 else { 2356 $err->{uri} = $euri->canonical(); 2357 } 2358 } 2359} 2360 2361# 2362# Generate a HTML report of detected errors. 2363sub report_errors ($) 2364{ 2365 my $File = shift; 2366 my $Errors = []; 2367 my %Errors_bytype; 2368 my $number_of_errors = 0; 2369 my $number_of_warnings = 0; 2370 my $number_of_info = 0; 2371 2372 # for the sake of readability, at least until the xmlwf errors have 2373 # explanations, we push the errors from the XML parser at the END of the 2374 # error list. 2375 push @{$File->{Errors}}, @{$File->{WF_Errors}}; 2376 2377 if (scalar @{$File->{Errors}}) { 2378 foreach my $err (@{$File->{Errors}}) { 2379 my $col = 0; 2380 2381 # Populate source/context for errors in our doc that don't have it 2382 # already. Checkers should always have populated $err->{src} with 2383 # _something_ for non-doc errors. 2384 if (!defined($err->{src})) { 2385 my $line = undef; 2386 2387 # Avoid truncating lines that do not exist. 2388 if (defined($err->{line}) && 2389 $File->{Content}->[$err->{line} - 1]) 2390 { 2391 if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) { 2392 ($line, $col) = 2393 &truncate_line( 2394 $File->{Content}->[$err->{line} - 1], 2395 $err->{char}); 2396 $line = &mark_error($line, $col); 2397 } 2398 elsif (defined($err->{line})) { 2399 $col = length($File->{Content}->[$err->{line} - 1]); 2400 $col = 80 if ($col > 80); 2401 ($line, $col) = 2402 &truncate_line( 2403 $File->{Content}->[$err->{line} - 1], $col); 2404 $line = &ent($line); 2405 $col = 0; 2406 } 2407 } 2408 else { 2409 $col = 0; 2410 } 2411 $err->{src} = $line; 2412 } 2413 2414 my $explanation = ""; 2415 if ($err->{expl}) { 2416 2417 } 2418 else { 2419 if ($err->{num}) { 2420 my $num = $err->{num}; 2421 $explanation .= Encode::decode_utf8( 2422 "\n $RSRC{msg}->{$num}->{verbose}\n") 2423 if exists $RSRC{msg}->{$num} && 2424 exists $RSRC{msg}->{$num}->{verbose}; 2425 my $_msg = $RSRC{msg}->{nomsg}->{verbose}; 2426 $_msg =~ s/<!--MID-->/$num/g; 2427 if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) { 2428 $_msg =~ s/<!--URI-->//g; 2429 } 2430 else { 2431 my $escaped_uri = uri_escape($File->{URI}); 2432 $_msg =~ s/<!--URI-->/$escaped_uri/g; 2433 } 2434 2435 # The send feedback plea. 2436 $explanation = " $_msg\n$explanation"; 2437 $explanation =~ s/<!--#echo\s+var="relroot"\s*-->//g; 2438 } 2439 $err->{expl} = $explanation; 2440 } 2441 2442 $err->{col} = ' ' x $col; 2443 if ($err->{type} eq 'I') { 2444 $err->{class} = 'msg_info'; 2445 $err->{err_type_err} = 0; 2446 $err->{err_type_warn} = 0; 2447 $err->{err_type_info} = 1; 2448 $number_of_info += 1; 2449 } 2450 elsif ($err->{type} eq 'E') { 2451 $err->{class} = 'msg_err'; 2452 $err->{err_type_err} = 1; 2453 $err->{err_type_warn} = 0; 2454 $err->{err_type_info} = 0; 2455 $number_of_errors += 1; 2456 } 2457 elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) { 2458 $err->{class} = 'msg_warn'; 2459 $err->{err_type_err} = 0; 2460 $err->{err_type_warn} = 1; 2461 $err->{err_type_info} = 0; 2462 $number_of_warnings += 1; 2463 } 2464 2465 # TODO other classes for "X" etc? FIXME find all types of message. 2466 2467 push @{$Errors}, $err; 2468 2469 if (($File->{Opt}->{'Group Errors'}) and 2470 (($err->{type} eq 'E') or 2471 ($err->{type} eq 'W') or 2472 ($err->{type} eq 'X')) 2473 ) 2474 { 2475 2476 # index by num for errors and warnings only - info usually 2477 # gives context of error or warning 2478 if (!exists $Errors_bytype{$err->{num}}) { 2479 $Errors_bytype{$err->{num}}->{instances} = []; 2480 my $msg_text; 2481 if ($err->{num} eq 'xmlwf') { 2482 2483 # FIXME need a catalog of errors from XML::LibXML 2484 $msg_text = "XML Parsing Error"; 2485 } 2486 elsif ($err->{num} eq 'html5') { 2487 $msg_text = "HTML5 Validator Error"; 2488 } 2489 else { 2490 $msg_text = $RSRC{msg}->{$err->{num}}->{original}; 2491 $msg_text =~ s/%1/X/; 2492 $msg_text =~ s/%2/Y/; 2493 } 2494 $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; 2495 $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; 2496 $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; 2497 $Errors_bytype{$err->{num}}->{type} = $err->{type}; 2498 $Errors_bytype{$err->{num}}->{class} = $err->{class}; 2499 $Errors_bytype{$err->{num}}->{err_type_err} = 2500 $err->{err_type_err}; 2501 $Errors_bytype{$err->{num}}->{err_type_warn} = 2502 $err->{err_type_warn}; 2503 $Errors_bytype{$err->{num}}->{err_type_info} = 2504 $err->{err_type_info}; 2505 } 2506 push @{$Errors_bytype{$err->{num}}->{instances}}, $err; 2507 } 2508 } 2509 } 2510 2511 @$Errors = values(%Errors_bytype) if $File->{Opt}->{'Group Errors'}; 2512 2513 # we are not sorting errors by line, as it would break the position 2514 # of auxiliary messages such as "start tag was here". We'll have to live 2515 # with the fact that XML well-formedness errors are listed first, then 2516 # validation errors 2517 #else { 2518 # sort error by lines 2519 # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; 2520 #} 2521 return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; 2522} 2523 2524# 2525# Chop the source line into 3 pieces; the character at which the error 2526# was detected, and everything to the left and right of that position. 2527# That way we can add markup to the relevant char without breaking &ent(). 2528# Expects 1-based column indexes. 2529sub mark_error ($$) 2530{ 2531 my $line = shift; 2532 my $col = shift; 2533 my $linelen = length($line); 2534 2535 # Coerce column into an index valid within the line. 2536 if ($col < 1) { 2537 $col = 1; 2538 } 2539 elsif ($col > $linelen) { 2540 $col = $linelen; 2541 } 2542 $col--; 2543 2544 my $left = substr($line, 0, $col); 2545 my $char = substr($line, $col, 1); 2546 my $right = substr($line, $col + 1); 2547 2548 $char = &ent($char); 2549 $char = 2550 qq(<strong title="Position where error was detected.">$char</strong>); 2551 $line = &ent($left) . $char . &ent($right); 2552 2553 return $line; 2554} 2555 2556# 2557# Create a HTML representation of the document. 2558sub source 2559{ 2560 my $File = shift; 2561 2562 # Remove any BOM since we're not at BOT anymore... 2563 $File->{Content}->[0] = substr($File->{Content}->[0], 1) 2564 if ($File->{BOM} && scalar(@{$File->{Content}})); 2565 2566 my @source = map({file_source_line => $_}, @{$File->{Content}}); 2567 return \@source; 2568} 2569 2570sub match_DTD_FPI_SI 2571{ 2572 my ($File, $FPI, $SI) = @_; 2573 if ($CFG->{Types}->{$FPI}) { 2574 if ($CFG->{Types}->{$FPI}->{SysID}) { 2575 if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) { 2576 &add_warning( 2577 'W26', 2578 { W26_dtd_pub => $FPI, 2579 W26_dtd_pub_display => 2580 $CFG->{Types}->{$FPI}->{Display}, 2581 W26_dtd_sys => $SI, 2582 W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID} 2583 } 2584 ); 2585 } 2586 } 2587 } 2588 else { # FPI not known, checking if the SI is 2589 while (my ($proper_FPI, $value) = each %{$CFG->{Types}}) { 2590 if ($value->{SysID} && $value->{SysID} eq $SI) { 2591 &add_warning( 2592 'W26', 2593 { W26_dtd_pub => $FPI, 2594 W26_dtd_pub_display => $value->{Display}, 2595 W26_dtd_sys => $SI, 2596 W26_dtd_pub_recommend => $proper_FPI 2597 } 2598 ); 2599 } 2600 } 2601 } 2602} 2603 2604# 2605# Do an initial parse of the Document Entity to extract FPI. 2606sub preparse_doctype 2607{ 2608 my $File = shift; 2609 2610 # 2611 # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). 2612 $File->{DOCTYPE} = ''; 2613 $File->{Root} = ''; 2614 2615 my $dtd = sub { 2616 return if $File->{Root}; 2617 2618 # TODO: The \s and \w are probably wrong now that the strings are 2619 # utf8_on 2620 my $declaration = shift; 2621 my $doctype_type; 2622 my $doctype_secondpart; 2623 if ($declaration =~ 2624 /<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si 2625 ) 2626 { 2627 $File->{Root} = "html"; 2628 $File->{DOCTYPE} = "HTML5"; 2629 } 2630 elsif ($declaration =~ 2631 m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si 2632 ) 2633 { 2634 ( $File->{Root}, $doctype_type, 2635 $File->{DOCTYPE}, $doctype_secondpart 2636 ) = ($1, $2, $3, $4); 2637 if (($doctype_type eq "PUBLIC") and 2638 (($doctype_secondpart) = 2639 $doctype_secondpart =~ 2640 m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si) 2641 ) 2642 { 2643 &match_DTD_FPI_SI($File, $File->{DOCTYPE}, 2644 $doctype_secondpart); 2645 } 2646 } 2647 }; 2648 2649 my $start = sub { 2650 my ($p, $tag, $attr) = @_; 2651 2652 if ($File->{Root}) { 2653 return unless $tag eq $File->{Root}; 2654 } 2655 else { 2656 $File->{Root} = $tag; 2657 } 2658 if ($attr->{xmlns}) { 2659 $File->{Namespace} = $attr->{xmlns}; 2660 } 2661 if ($attr->{version}) { 2662 $File->{'Root Version'} = $attr->{version}; 2663 } 2664 if ($attr->{baseProfile}) { 2665 $File->{'Root BaseProfile'} = $attr->{baseProfile}; 2666 } 2667 2668 # We're done parsing. 2669 $p->eof(); 2670 }; 2671 2672 # we use HTML::Parser as pre-parser. May use html5lib or other in the future 2673 my $p = HTML::Parser->new(api_version => 3); 2674 2675 # if content-type has shown we should pre-parse with XML mode, use that 2676 # otherwise (mostly text/html cases) use default mode 2677 $p->xml_mode(&is_xml($File)); 2678 $p->handler(declaration => $dtd, 'text'); 2679 $p->handler(start => $start, 'self,tag,attr'); 2680 2681 my $line = 0; 2682 my $max = scalar(@{$File->{Content}}); 2683 $p->parse( 2684 sub { 2685 return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef; 2686 } 2687 ); 2688 $p->eof(); 2689 2690 # TODO: These \s here are probably wrong now that the strings are utf8_on 2691 $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; 2692 $File->{DOCTYPE} =~ s(^\s+){ }g; 2693 $File->{DOCTYPE} =~ s(\s+$){ }g; 2694 $File->{DOCTYPE} =~ s(\s+) { }g; 2695 2696 # Some document types actually need no doctype to be identified, 2697 # root element and some version attribute is enough 2698 # TODO applicable doctypes should be migrated to a config file? 2699 2700 # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) { 2701 # if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'})) 2702 # { 2703 # if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; } 2704 # if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; } 2705 # if ($File->{'Root Version'} eq "1.0"){ 2706 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN"; 2707 # $File->{"DOCTYPEless OK"} = TRUE; 2708 # $File->{Opt}->{DOCTYPE} = "SVG 1.0"; 2709 # } 2710 # if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) { 2711 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN"; 2712 # $File->{"DOCTYPEless OK"} = TRUE; 2713 # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny"; 2714 # } 2715 # elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) { 2716 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN"; 2717 # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic"; 2718 # $File->{"DOCTYPEless OK"} = TRUE; 2719 # } 2720 # elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) { 2721 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; 2722 # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; 2723 # $File->{"DOCTYPEless OK"} = TRUE; 2724 # } 2725 # if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; } 2726 # if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; } 2727 # } 2728 # else { 2729 # # by default for an svg root elt, we use SVG 1.1 2730 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; 2731 # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; 2732 # $File->{"DOCTYPEless OK"} = TRUE; 2733 # } 2734 # } 2735 if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) { 2736 2737 # doctypeless document type found, we fake the override 2738 # so that the parser will have something to validate against 2739 $File = &override_doctype($File); 2740 } 2741 return $File; 2742} 2743 2744# 2745# Preprocess CGI parameters. 2746sub prepCGI 2747{ 2748 my $File = shift; 2749 my $q = shift; 2750 2751 # The URL to this CGI script. 2752 $File->{Env}->{'Self URI'} = $q->url(); 2753 2754 # Decode parameter values, set booleans the way we expect them. 2755 foreach my $param ($q->param()) { 2756 2757 # 'uploaded_file' and 'fragment' contain data we treat as is. 2758 next if ($param eq 'uploaded_file' || $param eq 'fragment'); 2759 2760 # Decode all other defined values as UTF-8. 2761 my @values = map { Encode::decode_utf8($_) } $q->param($param); 2762 $q->param($param, @values); 2763 2764 # Skip parameters that should not be treated as booleans. 2765 next if $param =~ /^(?:accept(?:-(?:language|charset))?|ur[il])$/; 2766 2767 # Keep false-but-set params. 2768 next if $q->param($param) eq '0'; 2769 2770 # Parameters that are given to us without specifying a value get set 2771 # to a true value. 2772 $q->param($param, TRUE) unless $q->param($param); 2773 } 2774 2775 $File->{Env}->{'Home Page'} = 2776 URI->new_abs(".", $File->{Env}->{'Self URI'}); 2777 2778 # Use "url" unless a "uri" was also given. 2779 if ($q->param('url') and not $q->param('uri')) { 2780 $q->param('uri', $q->param('url')); 2781 } 2782 2783 # Set output mode; needed in get_error_template if we end up there. 2784 $File->{Opt}->{Output} = $q->param('output') || 'html'; 2785 2786 # Issue a redirect for uri=referer. 2787 if ($q->param('uri') and $q->param('uri') eq 'referer') { 2788 if ($q->referer) { 2789 $q->param('uri', $q->referer); 2790 $q->param('accept', $q->http('Accept')) if ($q->http('Accept')); 2791 $q->param('accept-language', $q->http('Accept-Language')) 2792 if ($q->http('Accept-Language')); 2793 $q->param('accept-charset', $q->http('Accept-Charset')) 2794 if ($q->http('Accept-Charset')); 2795 print redirect(-uri => &self_url_q($q, $File), -vary => 'Referer'); 2796 exit; 2797 } 2798 else { 2799 2800 # No Referer header was found. 2801 $File->{'Error Flagged'} = TRUE; 2802 &get_error_template($File)->param(fatal_referer_error => TRUE); 2803 } 2804 } 2805 2806 # Supersede URL with an uploaded file. 2807 if ($q->param('uploaded_file')) { 2808 $q->param('uri', 'upload://' . $q->param('uploaded_file')); 2809 $File->{'Is Upload'} = TRUE; # Tag it for later use. 2810 } 2811 2812 # Supersede URL with an uploaded fragment. 2813 if ($q->param('fragment')) { 2814 $q->param('uri', 'upload://Form Submission'); 2815 $File->{'Direct Input'} = TRUE; # Tag it for later use. 2816 } 2817 2818 # Redirect to a GETable URL if method is POST without a file upload. 2819 if (defined $q->request_method and 2820 $q->request_method eq 'POST' and 2821 not($File->{'Is Upload'} or $File->{'Direct Input'})) 2822 { 2823 my $thispage = &self_url_q($q, $File); 2824 print redirect $thispage; 2825 exit; 2826 } 2827 2828 # 2829 # Flag an error if we didn't get a file to validate. 2830 unless ($q->param('uri')) { 2831 $File->{'Error Flagged'} = TRUE; 2832 my $tmpl = &get_error_template($File); 2833 $tmpl->param(fatal_uri_error => TRUE); 2834 $tmpl->param(fatal_uri_scheme => 'undefined'); 2835 } 2836 2837 return $q; 2838} 2839 2840# 2841# Set parse mode (SGML or XML) based on a number of preparsed factors: 2842# * HTTP Content-Type 2843# * Doctype Declaration 2844# * XML Declaration 2845# * XML namespaces 2846sub set_parse_mode 2847{ 2848 my $File = shift; 2849 my $CFG = shift; 2850 my $fpi = $File->{DOCTYPE}; 2851 $File->{ModeChoice} = ''; 2852 my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; 2853 2854 my $xmlws = qr/[\x20\x09\x0D\x0A]/o; 2855 2856 # $File->{Mode} may have been set in parse_content_type 2857 # and it would come from the Media Type 2858 my $parseModeFromMimeType = $File->{Mode}; 2859 my $begincontent = join "\x20", 2860 @{$File->{Content}}; # for the sake of xml decl detection, 2861 # the 10 first lines should be safe 2862 my $parseModeFromXMLDecl = ( 2863 $begincontent =~ 2864 /^ ${xmlws}* # whitespace before the decl should not be happening 2865 # but we are greedy for the sake of detection, not validation 2866 <\?xml ${xmlws}+ # start matching an XML Declaration 2867 version ${xmlws}* = # for documents, version info is mandatory 2868 ${xmlws}* (["'])1.[01]\1 # hardcoding the existing XML versions. 2869 # Maybe we should use \d\.\d 2870 (?:${xmlws}+ encoding 2871 ${xmlws}* = ${xmlws}* 2872 (["'])[A-Za-z][a-zA-Z0-9_-]+\2 2873 )? # encoding info is optional 2874 (?:${xmlws}+ standalone 2875 ${xmlws}* = ${xmlws}* 2876 (["'])(?:yes|no)\3 2877 )? # ditto standalone info, optional 2878 ${xmlws}* \?> # end of XML Declaration 2879 /ox 2880 ? 2881 'XML' : 2882 'TBD' 2883 ); 2884 2885 my $parseModeFromNamespace = 'TBD'; 2886 # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9967 2887 $parseModeFromNamespace = 'XML' 2888 if ($File->{Namespace} && $parseModeFromDoctype ne 'HTML5'); 2889 2890 if (($parseModeFromMimeType eq 'TBD') and 2891 ($parseModeFromXMLDecl eq 'TBD') and 2892 ($parseModeFromNamespace eq 'TBD') and 2893 (!exists $CFG->{Types}->{$fpi})) 2894 { 2895 2896 # if the mime type is text/html (ambiguous, hence TBD mode) 2897 # and the doctype isn't in the catalogue 2898 # and XML prolog detection was unsuccessful 2899 # and we found no namespace at the root 2900 # ... throw in a warning 2901 &add_warning( 2902 'W06', 2903 { W06_mime => $File->{ContentType}, 2904 w06_doctype => $File->{DOCTYPE} 2905 } 2906 ); 2907 return; 2908 } 2909 2910 $parseModeFromDoctype = 'TBD' 2911 unless $parseModeFromDoctype eq 'SGML' or 2912 $parseModeFromDoctype eq 'HTML5' or 2913 $parseModeFromDoctype eq 'XML' or 2914 $parseModeFromNamespace eq 'XML'; 2915 2916 if (($parseModeFromDoctype eq 'TBD') and 2917 ($parseModeFromXMLDecl eq 'TBD') and 2918 ($parseModeFromMimeType eq 'TBD') and 2919 ($parseModeFromNamespace eq 'TBD')) 2920 { 2921 2922 # if all factors are useless to give us a parse mode 2923 # => we use SGML-based DTD validation as a default 2924 $File->{Mode} = 'DTD+SGML'; 2925 $File->{ModeChoice} = 'Fallback'; 2926 2927 # and send warning about the fallback 2928 &add_warning( 2929 'W06', 2930 { W06_mime => $File->{ContentType}, 2931 w06_doctype => $File->{DOCTYPE} 2932 } 2933 ); 2934 return; 2935 } 2936 2937 if ($parseModeFromMimeType ne 'TBD') { 2938 2939 # if The mime type gives clear indication of whether the document is 2940 # XML or not 2941 if (($parseModeFromDoctype ne 'TBD') and 2942 ($parseModeFromDoctype ne 'HTML5') and 2943 ($parseModeFromMimeType ne $parseModeFromDoctype)) 2944 { 2945 2946 # if document-type recommended mode and content-type recommended 2947 # mode clash, shoot a warning 2948 # unknown doctypes will not trigger this 2949 # neither will html5 documents, which can be XML or not 2950 &add_warning( 2951 'W07', 2952 { W07_mime => $File->{ContentType}, 2953 W07_ct => $parseModeFromMimeType, 2954 W07_dtd => $parseModeFromDoctype, 2955 } 2956 ); 2957 } 2958 2959 # mime type has precedence, we stick to it 2960 $File->{ModeChoice} = 'Mime'; 2961 if ($parseModeFromDoctype eq "HTML5") { 2962 $File->{Mode} = 'HTML5+' . $File->{Mode}; 2963 } 2964 else { 2965 $File->{Mode} = 'DTD+' . $File->{Mode}; 2966 } 2967 return; 2968 } 2969 2970 if ($parseModeFromDoctype ne 'TBD') { 2971 2972 # the mime type is ambiguous (hence we didn't stop at the previous test) 2973 # but by now we're sure that the document type is a good indication 2974 # so we use that. 2975 if ($parseModeFromDoctype eq "HTML5") { 2976 if ($parseModeFromXMLDecl eq "XML" or 2977 $parseModeFromNamespace eq "XML") 2978 { 2979 $File->{Mode} = "HTML5+XML"; 2980 } 2981 else { 2982 $File->{Mode} = "HTML5"; 2983 } 2984 } 2985 else { # not HTML5 2986 $File->{Mode} = "DTD+" . $parseModeFromDoctype; 2987 } 2988 $File->{ModeChoice} = 'Doctype'; 2989 return; 2990 } 2991 2992 if ($parseModeFromXMLDecl ne 'TBD') { 2993 2994 # the mime type is ambiguous (hence we didn't stop at the previous test) 2995 # and so was the doctype 2996 # but we found an XML declaration so we use that. 2997 if ($File->{Mode} eq "") { 2998 $File->{Mode} = "DTD+" . $parseModeFromXMLDecl; 2999 } 3000 elsif ((my $ix = index($File->{Mode}, '+')) != -1) { 3001 substr($File->{Mode}, $ix + 1) = $parseModeFromXMLDecl; 3002 } 3003 else { 3004 $File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl; 3005 } 3006 $File->{ModeChoice} = 'XMLDecl'; 3007 return; 3008 } 3009 3010 # this is the last case. We know that all modes are not TBD, 3011 # yet mime type, doctype AND XML DECL tests have failed => we are saved 3012 # by the presence of namespaces 3013 if ($File->{Mode} eq "") { 3014 $File->{Mode} = "DTD+" . $parseModeFromNamespace; 3015 } 3016 elsif ((my $ix = index($File->{Mode}, '+')) != -1) { 3017 substr($File->{Mode}, $ix + 1) = $parseModeFromNamespace; 3018 } 3019 else { 3020 $File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace; 3021 } 3022 $File->{ModeChoice} = 'Namespace'; 3023} 3024 3025# 3026# Utility sub to tell if mode "is" XML. 3027sub is_xml 3028{ 3029 index(shift->{Mode}, 'XML') != -1; 3030} 3031 3032# 3033# Check charset conflicts and add any warnings necessary. 3034sub charset_conflicts 3035{ 3036 my $File = shift; 3037 3038 # 3039 # Handle the case where there was no charset to be found. 3040 unless ($File->{Charset}->{Use}) { 3041 &add_warning('W17', {}); 3042 $File->{Tentative} |= T_WARN; 3043 } 3044 3045 # 3046 # Add a warning if there was charset info conflict (HTTP header, 3047 # XML declaration, or <meta> element). 3048 # filtering out some of the warnings in direct input mode where HTTP 3049 # encoding is a "fake" 3050 if (( charset_not_equal( 3051 $File->{Charset}->{HTTP}, 3052 $File->{Charset}->{XML} 3053 ) 3054 ) and 3055 not($File->{'Direct Input'}) 3056 ) 3057 { 3058 &add_warning( 3059 'W18', 3060 { W18_http => $File->{Charset}->{HTTP}, 3061 W18_xml => $File->{Charset}->{XML}, 3062 W18_use => $File->{Charset}->{Use}, 3063 } 3064 ); 3065 } 3066 elsif ( 3067 charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) 3068 and 3069 not($File->{'Direct Input'})) 3070 { 3071 &add_warning( 3072 'W19', 3073 { W19_http => $File->{Charset}->{HTTP}, 3074 W19_meta => $File->{Charset}->{META}, 3075 W19_use => $File->{Charset}->{Use}, 3076 } 3077 ); 3078 } 3079 elsif ( 3080 charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) 3081 { 3082 &add_warning( 3083 'W20', 3084 { W20_http => $File->{Charset}->{XML}, 3085 W20_xml => $File->{Charset}->{META}, 3086 } 3087 ); 3088 $File->{Tentative} |= T_WARN; 3089 } 3090 3091 return $File; 3092} 3093 3094# 3095# Transcode to UTF-8 3096sub transcode 3097{ 3098 my $File = shift; 3099 3100 my $general_charset = $File->{Charset}->{Use}; 3101 my $exact_charset = $general_charset; 3102 3103 # TODO: This should be done before transcode() 3104 if ($general_charset eq 'utf-16') { 3105 if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { 3106 $exact_charset = $File->{Charset}->{Auto}; 3107 } 3108 else { $exact_charset = 'utf-16be'; } 3109 } 3110 3111 my $cs = $exact_charset; 3112 3113 if ($CFG->{Charsets}->{$cs}) { 3114 if (index($CFG->{Charsets}->{$cs}, 'ERR ') != -1) { 3115 3116 # The encoding is not supported due to policy 3117 3118 $File->{'Error Flagged'} = TRUE; 3119 my $tmpl = &get_error_template($File); 3120 $tmpl->param(fatal_transcode_error => TRUE); 3121 $tmpl->param(fatal_transcode_charset => $cs); 3122 3123 # @@FIXME might need better text 3124 $tmpl->param(fatal_transcode_errmsg => 3125 'This encoding is not supported by the validator.'); 3126 return $File; 3127 } 3128 elsif (index($CFG->{Charsets}->{$cs}, 'X ') != -1) { 3129 3130 # possibly problematic, we recommend another alias 3131 my $recommended_charset = $CFG->{Charsets}->{$cs}; 3132 $recommended_charset =~ s/X //; 3133 &add_warning( 3134 'W22', 3135 { W22_declared => $cs, 3136 W22_suggested => $recommended_charset, 3137 } 3138 ); 3139 } 3140 } 3141 3142 # Does the system support decoding this encoding? 3143 my $enc = Encode::find_encoding($cs); 3144 3145 if (!$enc) { 3146 3147 # This system's Encode installation does not support 3148 # the character encoding; might need additional modules 3149 3150 $File->{'Error Flagged'} = TRUE; 3151 my $tmpl = &get_error_template($File); 3152 $tmpl->param(fatal_transcode_error => TRUE); 3153 $tmpl->param(fatal_transcode_charset => $cs); 3154 3155 # @@FIXME might need better text 3156 $tmpl->param(fatal_transcode_errmsg => 'Encoding not supported.'); 3157 return $File; 3158 } 3159 elsif (!$CFG->{Charsets}->{$cs}) { 3160 3161 # not in the list, but technically OK -> we warn 3162 &add_warning('W24', {W24_declared => $cs,}); 3163 3164 } 3165 3166 my $output; 3167 my $input = $File->{Bytes}; 3168 3169 # Try to transcode 3170 eval { $output = $enc->decode($input, Encode::FB_CROAK); }; 3171 3172 if ($@) { 3173 3174 # Transcoding failed - do it again line by line to find out exactly 3175 # where 3176 my $line_num = 0; 3177 while ($input =~ /(.*?)(?:\r\n|\n|\r|\z)/g) { 3178 $line_num++; 3179 eval { $enc->decode($1, Encode::FB_CROAK); }; 3180 if ($@) { 3181 my $croak_message = $@; 3182 $croak_message =~ s/ at .*//; 3183 $File->{'Error Flagged'} = TRUE; 3184 my $tmpl = &get_error_template($File); 3185 $tmpl->param(fatal_byte_error => TRUE); 3186 $tmpl->param(fatal_byte_lines => $line_num); 3187 $tmpl->param(fatal_byte_charset => $cs); 3188 $tmpl->param(fatal_byte_error_msg => $croak_message); 3189 last; 3190 } 3191 } 3192 return $File; 3193 } 3194 3195 # @@FIXME is this what we want? 3196 $output =~ s/\015?\012/\n/g; 3197 3198 # make sure we deal only with unix newlines 3199 # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 3200 $output =~ s/(\r\n|\n|\r)/\n/g; 3201 3202 #debug: we could check if the content has utf8 bit on with 3203 #$output= utf8::is_utf8($output) ? 1 : 0; 3204 $File->{Content} = [split /\n/, $output]; 3205 3206 return $File; 3207} 3208 3209sub find_encodings 3210{ 3211 my $File = shift; 3212 my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); 3213 my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); 3214 3215 if (defined $bom) { 3216 3217 # @@FIXME this BOM entry should not be needed at all! 3218 $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); 3219 $File->{Charset}->{Auto} = lc $bom; 3220 } 3221 else { 3222 $File->{Charset}->{Auto} = lc($first[0]) if @first; 3223 } 3224 3225 my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); 3226 $File->{Charset}->{XML} = lc $xml if defined $xml; 3227 3228 my %metah; 3229 foreach my $try (@first) { 3230 3231 # @@FIXME I think the old code used HTML::Parser xml mode, check if ok 3232 my $meta = 3233 HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); 3234 $metah{lc($meta)}++ if defined $meta and length $meta; 3235 } 3236 3237 if (!%metah) { 3238 3239 # HTML::Encoding doesn't support HTML5 <meta charset> as of 0.60, 3240 # check it ourselves. HTML::HeadParser >= 3.60 is required for this. 3241 3242 my $hp = HTML::HeadParser->new(); 3243 my $seen_doctype = FALSE; 3244 my $is_html5 = FALSE; 3245 $hp->handler( 3246 declaration => sub { 3247 my ($tag, $text) = @_; 3248 return if ($seen_doctype || uc($tag) ne '!DOCTYPE'); 3249 $seen_doctype = TRUE; 3250 $is_html5 = TRUE 3251 if ( 3252 $text =~ /<!DOCTYPE\s+html 3253 (\s+SYSTEM\s+(['"])about:legacy-compat\2)? 3254 \s*>/six 3255 ); 3256 }, 3257 'tag,text' 3258 ); 3259 $hp->parse($File->{Bytes}); 3260 if ($is_html5) { 3261 my $cs = $hp->header('X-Meta-Charset'); 3262 $metah{lc($cs)}++ if (defined($cs) && length($cs)); 3263 } 3264 } 3265 3266 if (%metah) { 3267 my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; 3268 $File->{Charset}->{META} = $meta[0]; 3269 } 3270 3271 return $File; 3272} 3273 3274# 3275# Abort with a message if an error was flagged at point. 3276sub abort_if_error_flagged 3277{ 3278 my $File = shift; 3279 3280 return unless $File->{'Error Flagged'}; 3281 return if $File->{'Error Handled'}; # Previous error, keep going. 3282 3283 my $tmpl = &get_error_template($File); 3284 $tmpl->param(fatal_error => TRUE); 3285 3286 &prep_template($File, $tmpl); 3287 3288 # transcode output from perl's internal to utf-8 and output 3289 print Encode::encode('UTF-8', $tmpl->output); 3290 exit; 3291} 3292 3293# 3294# conflicting encodings 3295sub charset_not_equal 3296{ 3297 my $encodingA = shift; 3298 my $encodingB = shift; 3299 return $encodingA && $encodingB && ($encodingA ne $encodingB); 3300} 3301 3302# 3303# Construct a self-referential URL from a CGI.pm $q object. 3304sub self_url_q 3305{ 3306 my ($q, $File) = @_; 3307 my $thispage = $File->{Env}->{'Self URI'} . '?'; 3308 3309 # Pass-through parameters 3310 for my $param (qw(uri accept accept-language accept-charset)) { 3311 $thispage .= "$param=" . uri_escape($q->param($param)) . ';' 3312 if $q->param($param); 3313 } 3314 3315 # Boolean parameters 3316 for my $param (qw(ss outline No200 verbose group)) { 3317 $thispage .= "$param=1;" if $q->param($param); 3318 } 3319 3320 # Others 3321 if ($q->param('doctype') and $q->param('doctype') !~ /(?:Inline|detect)/i) 3322 { 3323 $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; 3324 } 3325 if ($q->param('charset') and $q->param('charset') !~ /detect/i) { 3326 $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; 3327 } 3328 3329 $thispage =~ s/[\?;]$//; 3330 return $thispage; 3331} 3332 3333# 3334# Construct a self-referential URL from a $File object. 3335sub self_url_file 3336{ 3337 my $File = shift; 3338 3339 my $thispage = $File->{Env}->{'Self URI'}; 3340 my $escaped_uri = uri_escape($File->{URI}); 3341 $thispage .= qq(?uri=$escaped_uri); 3342 $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; 3343 $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; 3344 $thispage .= ';outline=1' if $File->{Opt}->{Outline}; 3345 $thispage .= ';No200=1' if $File->{Opt}->{No200}; 3346 $thispage .= ';verbose=1' if $File->{Opt}->{Verbose}; 3347 $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; 3348 $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'}) 3349 if $File->{Opt}->{'Accept Header'}; 3350 $thispage .= 3351 ';accept-language=' . 3352 uri_escape($File->{Opt}->{'Accept-Language Header'}) 3353 if $File->{Opt}->{'Accept-Language Header'}; 3354 $thispage .= 3355 ';accept-charset=' . 3356 uri_escape($File->{Opt}->{'Accept-Charset Header'}) 3357 if $File->{Opt}->{'Accept-Charset Header'}; 3358 3359 return $thispage; 3360} 3361 3362##### 3363 3364package W3C::Validator::EventHandler; 3365 3366# 3367# Define global constants 3368use constant TRUE => 1; 3369use constant FALSE => 0; 3370 3371# 3372# Tentative Validation Severities. 3373use constant T_WARN => 4; # 0000 0100 3374use constant T_ERROR => 8; # 0000 1000 3375 3376sub new 3377{ 3378 my $class = shift; 3379 my $parser = shift; 3380 my $File = shift; 3381 my $CFG = shift; 3382 my $self = {_file => $File, CFG => $CFG, _parser => $parser}; 3383 bless $self, $class; 3384} 3385 3386sub start_element 3387{ 3388 my ($self, $element) = @_; 3389 3390 my $has_xmlns = FALSE; 3391 my $xmlns_value = undef; 3392 3393 # If in XML mode, find namespace used for each element. 3394 if ((my $attr = $element->{Attributes}->{xmlns}) && 3395 &W3C::Validator::MarkupValidator::is_xml($self->{_file})) 3396 { 3397 $xmlns_value = ""; 3398 3399 # Try with SAX method 3400 if ($attr->{Value}) { 3401 $has_xmlns = TRUE; 3402 $xmlns_value = $attr->{Value}; 3403 } 3404 3405 #next if $has_xmlns; 3406 3407 # The following is not SAX, but OpenSP specific. 3408 my $defaulted = $attr->{Defaulted} || ''; 3409 if ($defaulted eq "specified") { 3410 $has_xmlns = TRUE; 3411 $xmlns_value .= 3412 join("", map { $_->{Data} } @{$attr->{CdataChunks}}); 3413 } 3414 } 3415 3416 my $doctype = $self->{_file}->{DOCTYPE}; 3417 3418 if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) || 3419 $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name}) 3420 { 3421 3422 # add to list of non-root namespaces 3423 push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; 3424 } 3425 elsif (!$has_xmlns && 3426 $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"}) 3427 { 3428 3429 # whine if the root xmlns attribute is noted as required by spec, 3430 # but not present 3431 my $err = {}; 3432 my $location = $self->{_parser}->get_location(); 3433 &W3C::Validator::MarkupValidator::set_error_uri($err, 3434 $location->{FileName}); 3435 3436 # S::P::O does not provide src context, set to empty for non-doc errors. 3437 $err->{src} = "" if $err->{uri}; 3438 $err->{line} = $location->{LineNumber}; 3439 $err->{char} = $location->{ColumnNumber}; 3440 $err->{num} = "no-xmlns"; 3441 $err->{type} = "E"; 3442 $err->{msg} = 3443 "Missing xmlns attribute for element $element->{Name}. The " . 3444 "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; 3445 3446 # ... 3447 $self->{_file}->{'Is Valid'} = FALSE; 3448 push @{$self->{_file}->{Errors}}, $err; 3449 } 3450 elsif ($has_xmlns and 3451 (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and 3452 ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace})) 3453 { 3454 3455 # whine if root xmlns element is not the one specificed by the spec 3456 my $err = {}; 3457 my $location = $self->{_parser}->get_location(); 3458 &W3C::Validator::MarkupValidator::set_error_uri($err, 3459 $location->{FileName}); 3460 3461 # S::P::O does not provide src context, set to empty for non-doc errors. 3462 $err->{line} = $location->{LineNumber}; 3463 $err->{char} = $location->{ColumnNumber}; 3464 $err->{num} = "wrong-xmlns"; 3465 $err->{type} = "E"; 3466 $err->{msg} = 3467 "Wrong xmlns attribute for element $element->{Name}. The " . 3468 "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; 3469 3470 # ... 3471 $self->{_file}->{'Is Valid'} = FALSE; 3472 push @{$self->{_file}->{Errors}}, $err; 3473 } 3474} 3475 3476sub error 3477{ 3478 my $self = shift; 3479 my $error = shift; 3480 my $mess; 3481 eval { $mess = $self->{_parser}->split_message($error); }; 3482 if ($@) { 3483 3484 # this is a message that S:P:O could not handle, we skip its croaking 3485 return; 3486 } 3487 my $File = $self->{_file}; 3488 3489 my $err = {}; 3490 &W3C::Validator::MarkupValidator::set_error_uri($err, 3491 $self->{_parser}->get_location()->{FileName}); 3492 3493 # S::P::O does not provide src context, set to empty for non-doc errors. 3494 $err->{src} = "" if $err->{uri}; 3495 $err->{line} = $mess->{primary_message}{LineNumber}; 3496 $err->{char} = $mess->{primary_message}{ColumnNumber} + 1; 3497 $err->{num} = $mess->{primary_message}{Number}; 3498 $err->{type} = $mess->{primary_message}{Severity}; 3499 $err->{msg} = $mess->{primary_message}{Text}; 3500 3501 # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, 3502 # so we filter out a few errors for now 3503 3504 my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File); 3505 3506 if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { 3507 3508 # the error is about a missing xmlns: attribute definition" 3509 return; # this is not an error, 'cause we said so 3510 } 3511 3512 if ($err->{num} eq '187') 3513 3514 # filtering out no "document type declaration; will parse without 3515 # validation" if root element is not html and mode is xml... 3516 { 3517 3518 # since parsing was done without validation, result can only be 3519 # "well-formed" 3520 if ($is_xml and lc($File->{Root}) ne 'html') { 3521 $File->{XMLWF_ONLY} = TRUE; 3522 W3C::Validator::MarkupValidator::add_warning('W09xml', {}); 3523 return; # don't report this as an error, just proceed 3524 } 3525 3526 # if mode is not XML, we do report the error. It should not happen in 3527 # the case of <html> without doctype, in that case the error message 3528 # will be #344 3529 } 3530 3531 if (($err->{num} eq '113') and index($err->{msg}, 'xml:space') != -1) { 3532 3533 # FIXME 3534 # this is a problem with some of the "flattened" W3C DTDs, filtering 3535 # them out to not confuse users. hoping to get the DTDs fixed, see 3536 # http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html 3537 return; # don't report this, just proceed 3538 } 3539 3540 if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) { 3541 3542 # we are in XML mode, we have a namespace, but no doctype. 3543 # the validator will already have said "no doctype, falling back to 3544 # default" above 3545 # no need to report this. 3546 return; # don't report this, just proceed 3547 } 3548 3549 if (($err->{num} eq '248') or 3550 ($err->{num} eq '247') or 3551 ($err->{num} eq '246')) 3552 { 3553 3554 # these two errors should be triggered by -wmin-tag to report shorttag 3555 # used, but we're making them warnings, not errors 3556 # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 3557 $err->{type} = "W"; 3558 } 3559 3560 # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors 3561 # beyond EOL. If you see this warning in your web server logs, please 3562 # let the validator developers know, see http://validator.w3.org/feedback.html 3563 # As long as $err may be from somewhere else than the document (such as 3564 # from a DTD) and we have no way of identifying these cases, this 3565 # produces bogus results and error log spewage, so commented out for now. 3566 # if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { 3567 # warn("Warning: reported error column larger than line length " . 3568 # "($err->{char} > $l) in $File->{URI} line $err->{line}, " . 3569 # "OpenSP bug? Resetting to line length."); 3570 # $err->{char} = $l; 3571 # } 3572 3573 # No or unknown FPI and a relative SI. 3574 if ($err->{msg} =~ m(cannot (?:open|find))) { 3575 $File->{'Error Flagged'} = TRUE; 3576 my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); 3577 $tmpl->param(fatal_parse_extid_error => TRUE); 3578 $tmpl->param(fatal_parse_extid_msg => $err->{msg}); 3579 } 3580 3581 # No DOCTYPE found! We are falling back to vanilla DTD 3582 if (index($err->{msg}, "prolog can't be omitted") != -1) { 3583 if (lc($File->{Root}) eq 'html') { 3584 my $dtd = $File->{"Default DOCTYPE"}->{$is_xml ? "XHTML" : "HTML"}; 3585 W3C::Validator::MarkupValidator::add_warning('W09', 3586 {W09_dtd => $dtd}); 3587 } 3588 else { # not html root element, we are not using fallback 3589 unless ($is_xml) { 3590 $File->{'Is Valid'} = FALSE; 3591 W3C::Validator::MarkupValidator::add_warning('W09nohtml', {}); 3592 } 3593 } 3594 3595 return; # Don't report this as a normal error. 3596 } 3597 3598 # TODO: calling exit() here is probably a bad idea 3599 W3C::Validator::MarkupValidator::abort_if_error_flagged($File); 3600 3601 push @{$File->{Errors}}, $err; 3602 3603 # ... 3604 $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; 3605 3606 if (defined $mess->{aux_message}) { 3607 3608 # "duplicate id ... first defined here" style messages 3609 push @{$File->{Errors}}, 3610 { 3611 line => $mess->{aux_message}{LineNumber}, 3612 char => $mess->{aux_message}{ColumnNumber} + 1, 3613 msg => $mess->{aux_message}{Text}, 3614 type => 'I', 3615 }; 3616 } 3617} 3618 3619package W3C::Validator::EventHandler::Outliner; 3620 3621# 3622# Define global constants 3623use constant TRUE => 1; 3624use constant FALSE => 0; 3625 3626# 3627# Tentative Validation Severities. 3628use constant T_WARN => 4; # 0000 0100 3629use constant T_ERROR => 8; # 0000 1000 3630 3631use base qw(W3C::Validator::EventHandler); 3632 3633sub new 3634{ 3635 my $class = shift; 3636 my $parser = shift; 3637 my $File = shift; 3638 my $CFG = shift; 3639 my $self = $class->SUPER::new($parser, $File, $CFG); 3640 $self->{am_in_heading} = 0; 3641 $self->{heading_text} = []; 3642 bless $self, $class; 3643} 3644 3645sub data 3646{ 3647 my ($self, $chars) = @_; 3648 push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading}; 3649} 3650 3651sub start_element 3652{ 3653 my ($self, $element) = @_; 3654 if ($element->{Name} =~ /^h([1-6])$/i) { 3655 $self->{_file}->{heading_outline} ||= ""; 3656 $self->{_file}->{heading_outline} .= 3657 " " x int($1) . "[$element->{Name}] "; 3658 $self->{am_in_heading} = 1; 3659 } 3660 3661 return $self->SUPER::start_element($element); 3662} 3663 3664sub end_element 3665{ 3666 my ($self, $element) = @_; 3667 if ($element->{Name} =~ /^h[1-6]$/i) { 3668 my $text = join("", @{$self->{heading_text}}); 3669 $text =~ s/^\s+//g; 3670 $text =~ s/\s+/ /g; 3671 $text =~ s/\s+$//g; 3672 $self->{_file}->{heading_outline} .= "$text\n"; 3673 $self->{am_in_heading} = 0; 3674 $self->{heading_text} = []; 3675 } 3676} 3677 3678##### 3679 3680package W3C::Validator::UserAgent; 3681 3682use HTTP::Message qw(); 3683use LWP::UserAgent 2.032 qw(); # Need 2.032 for default_header() 3684use Net::hostent qw(gethostbyname); 3685use Net::IP qw(); 3686use Socket qw(inet_ntoa); 3687 3688use base qw(LWP::UserAgent); 3689 3690BEGIN { 3691 3692 # The 4k default line length in LWP <= 5.832 isn't enough for example to 3693 # accommodate 4kB cookies (RFC 2985); bump it (#6678). 3694 require LWP::Protocol::http; 3695 push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024); 3696} 3697 3698sub new 3699{ 3700 my ($proto, $CFG, $File, @rest) = @_; 3701 my $class = ref($proto) || $proto; 3702 my $self = $class->SUPER::new(@rest); 3703 3704 $self->{'W3C::Validator::CFG'} = $CFG; 3705 $self->{'W3C::Validator::File'} = $File; 3706 3707 $self->env_proxy(); 3708 $self->agent($File->{Opt}->{'User Agent'}); 3709 $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); 3710 3711 # Don't parse the http-equiv stuff. 3712 $self->parse_head(0); 3713 3714 # Tell caches in the middle we want a fresh copy (Bug 4998). 3715 $self->default_header('Cache-Control' => 'max-age=0'); 3716 3717 # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle 3718 $self->default_header( 3719 'Accept-Encoding' => scalar HTTP::Message::decodable()) 3720 if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable')); 3721 3722 # Our timeout should be set to something lower than the web server's, 3723 # remembering to give some head room for the actual validation to take 3724 # place after the document has been fetched (something like 15 seconds 3725 # should be plenty). validator.w3.org instances have their timeout set 3726 # to 60 seconds as of writing this (#4985, #6950). 3727 $self->timeout(45); 3728 3729 return $self; 3730} 3731 3732sub redirect_ok 3733{ 3734 my ($self, $req, $res) = @_; 3735 return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); 3736} 3737 3738sub uri_ok 3739{ 3740 my ($self, $uri) = @_; 3741 3742 return 1 3743 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} || 3744 !$uri->can('host')); 3745 3746 my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5}; 3747 if ($h5uri) { 3748 my $clone = $uri->clone(); 3749 $clone->query(undef); 3750 $clone->fragment(undef); 3751 $h5uri = URI->new($h5uri); 3752 $h5uri->query(undef); 3753 $h5uri->fragment(undef); 3754 return 1 if $clone->eq($h5uri); 3755 } 3756 3757 my $addr = my $iptype = undef; 3758 if (my $host = gethostbyname($uri->host())) { 3759 $addr = inet_ntoa($host->addr()) if $host->addr(); 3760 if ($addr && (my $ip = Net::IP->new($addr))) { 3761 $iptype = $ip->iptype(); 3762 } 3763 } 3764 if ($iptype && $iptype ne 'PUBLIC') { 3765 my $File = $self->{'W3C::Validator::File'}; 3766 $File->{'Error Flagged'} = 1; 3767 my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); 3768 $tmpl->param(fatal_ip_error => 1); 3769 $tmpl->param(fatal_ip_host => $uri->host() || 'undefined'); 3770 $tmpl->param(fatal_ip_hostname => 1) 3771 if ($addr and $uri->host() ne $addr); 3772 return 0; 3773 } 3774 return 1; 3775} 3776 3777# Local Variables: 3778# mode: perl 3779# indent-tabs-mode: nil 3780# cperl-indent-level: 4 3781# cperl-continued-statement-offset: 4 3782# cperl-brace-offset: -4 3783# perl-indent-level: 4 3784# End: 3785# ex: ts=4 sw=4 et 3786