#!/usr/bin/perl -T # # W3C Markup Validation Service # A CGI script to retrieve and validate a markup file # # Copyright 1995-2013 World Wide Web Consortium, (Massachusetts # Institute of Technology, European Research Consortium for Informatics # and Mathematics, Keio University). All Rights Reserved. # # Originally written by Gerald Oskoboiny # for additional contributors, see # http://dvcs.w3.org/hg/markup-validator/shortlog/tip and # http://validator.w3.org/about.html#credits # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # We need Perl 5.8.0+. use 5.008; ############################################################################### #### Load modules. ############################################################ ############################################################################### # # Pragmas. use strict; use warnings; use utf8; use MIME::Base64 (); package W3C::Validator::MarkupValidator; # # Modules. See also the BEGIN block further down below. # # Version numbers given where we absolutely need a minimum version of a given # module (gives nicer error messages). By default, add an empty import list # when loading modules to prevent non-OO or poorly written modules from # polluting our namespace. # # Need 3.40 for query string and path info fixes, #4365 use CGI 3.40 qw(-newstyle_urls -private_tempfiles redirect); use CGI::Carp qw(carp croak fatalsToBrowser); use Config qw(%Config); use Config::General 2.32 qw(); # Need 2.32 for , rt.cpan.org#17852 use Encode qw(); use Encode::Alias qw(); use Encode::HanExtra qw(); # for some chinese character encodings, # e.g gb18030 use File::Spec::Functions qw(catfile rel2abs tmpdir); use HTML::Encoding 0.52 qw(); use HTML::HeadParser 3.60 qw(); # Needed for HTML5 meta charset workaround use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref) use HTML::Template qw(); # Need 2.6 for path param, other things. # Specifying 2.6 would break with 2.10, # rt.cpan.org#70190 use HTTP::Headers::Util qw(); use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content() use HTTP::Request qw(); use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. use JSON 2.00 qw(); use SGML::Parser::OpenSP 0.991 qw(); use URI 1.53 qw(); # Need 1.53 for secure() use URI::Escape qw(uri_escape); use URI::file; use URI::Heuristic qw(); ############################################################################### #### Constant definitions. #################################################### ############################################################################### # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Define global variables. use vars qw($DEBUG $CFG %RSRC $VERSION); $VERSION = '1.3'; use constant IS_MODPERL2 => (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); # # Things inside BEGIN don't happen on every request in persistent environments # (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { my $base = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'; # Launder data for -T; -AutoLaunder doesn't catch this one. if ($base =~ /^(.*)$/) { $base = $1; } # # Read Config Files. eval { my %config_opts = ( -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), -MergeDuplicateOptions => TRUE, -MergeDuplicateBlocks => TRUE, -SplitPolicy => 'equalsign', -UseApacheInclude => TRUE, -IncludeRelative => TRUE, -InterPolateVars => TRUE, -AutoLaunder => TRUE, -AutoTrue => TRUE, -CComments => FALSE, -DefaultConfig => { Protocols => {Allow => 'http,https'}, Paths => { Base => $base, Cache => '', }, External => {HTML5 => FALSE,}, }, ); my %cfg = Config::General->new(%config_opts)->getall(); $CFG = \%cfg; }; if ($@) { die <<"EOF"; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' EOF } # # Check paths in config # @@FIXME: This does not do a very good job error-message-wise if # a path is missing... { my %paths = map { $_ => [-d $_, -r _] } $CFG->{Paths}->{Base}, $CFG->{Paths}->{Templates}, $CFG->{Paths}->{SGML}->{Library}; my @_d = grep { not $paths{$_}->[0] } keys %paths; my @_r = grep { not $paths{$_}->[1] } keys %paths; die "Does not exist or is not a directory: @_d\n" if scalar(@_d); die "Directory not readable (permission denied): @_r\n" if scalar(@_r); } # # Split allowed protocols into a list. if (my $allowed = delete($CFG->{Protocols}->{Allow})) { $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; } # Split available languages into a list if (my $langs = delete($CFG->{Languages})) { $CFG->{Languages} = [split(/\s+/, $langs)]; } else { # Default to english $CFG->{Languages} = ["en"]; } { # Make types config indexed by FPI. my $types = {}; while (my ($key, $value) = each %{$CFG->{Types}}) { $types->{$CFG->{Types}->{$key}->{PubID}} = $value; } $CFG->{Types} = $types; } # # Change strings to internal constants in MIME type mapping. while (my ($key, $value) = each %{$CFG->{MIME}}) { $CFG->{MIME}->{$key} = 'TBD' unless ($value eq 'SGML' || $value eq 'XML'); } # # Register Encode aliases. while (my ($key, $value) = each %{$CFG->{Charsets}}) { Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/); } # # Set debug flag. if ($CFG->{'Allow Debug'}) { $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; } else { $DEBUG = FALSE; } # Read friendly error message file # 'en_US' should be replaced by $lang for lang-neg %RSRC = Config::General->new( -MergeDuplicateBlocks => 1, -ConfigFile => catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), )->getall(); eval { local $SIG{__DIE__} = undef; require Encode::JIS2K; # for optional extra Japanese encodings }; # Tell libxml to load _only_ our XML catalog. This is because our entity # load jailing may trap the libxml internal default catalog (which is # automatically loaded). Preventing loading that from the input callback # will cause libxml to not see the document content at all but to throw # weird "Document is empty" errors, at least as of XML::LibXML 1.70 and # libxml 2.7.7. XML_CATALOG_FILES needs to be in effect at XML::LibXML # load time which is why we're using "require" here instead of pulling it # in with "use" as usual. And finally, libxml should have support for # SGML open catalogs but they don't seem to work (again as of 1.70 and # 2.7.7); if we use xml.soc here, no entities seem to end up being resolved # from it - so we use a (redundant) XML catalog which works. # Note that setting XML_CATALOG_FILES here does not seem to work with # mod_perl (it doesn't end up being used by XML::LibXML), therefore we do # it in the mod_perl/startup.pl startup file for it too. local $ENV{XML_CATALOG_FILES} = catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml'); require XML::LibXML; XML::LibXML->VERSION(1.73); # Need 1.73 for rt.cpan.org #66642 } # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; #@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. #use Data::Dumper qw(Dumper); #print Dumper($CFG); #exit; #@@DEBUG; ############################################################################### #### Process CGI variables and initialize. #################################### ############################################################################### # # Create a new CGI object. my $q = CGI->new(); # # The data structure that will hold all session data. # @@FIXME This can't be my() as $File will sooner or # later be undef and add_warning will cause the script # to die. our() seems to work but has other problems. # @@FIXME Apparently, this must be set to {} also, # otherwise the script might pick up an old object # after abort_if_error_flagged under mod_perl. our $File = {}; ################################# # Initialize the datastructure. # ################################# # # Charset data (casing policy: lowercase early). $File->{Charset}->{Use} = ''; # The charset used for validation. $File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F) $File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter. $File->{Charset}->{META} = ''; # From HTML's . $File->{Charset}->{XML} = ''; # From the XML Declaration. $File->{Charset}->{Override} = ''; # From CGI/user override. # # Misc simple types. $File->{Mode} = 'DTD+SGML'; # Default parse mode is DTD validation in SGML mode. # By default, perform validation (we may perform only xml-wf in some cases) $File->{XMLWF_ONLY} = FALSE; # # Listrefs. $File->{Warnings} = []; # Warnings... $File->{Namespaces} = []; # Other (non-root) Namespaces. $File->{Parsers} = []; # Parsers used {name, link, type, options} # By default, doctype-less documents cannot be valid $File->{"DOCTYPEless OK"} = FALSE; $File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional'; $File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional'; ############################################################################### #### Generate Template for Result. ############################################ ############################################################################### # first we determine the chosen language based on # 1) lang argument given as parameter (if this language is available) # 2) HTTP language negotiation between variants available and user-agent choices # 3) English by default my $lang = $q->param('lang') || ''; my @localizations; foreach my $lang_available (@{$CFG->{Languages}}) { if ($lang eq $lang_available) { # Requested language (from parameters) is available, just use it undef @localizations; last; } push @localizations, [ $lang_available, 1, 'text/html', undef, 'utf-8', $lang_available, undef ]; } # If language is not chosen yet, use HTTP-based negotiation if (@localizations) { require HTTP::Negotiate; $lang = HTTP::Negotiate::choose(\@localizations); } # HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0 $lang ||= 'en_US'; if ($lang eq "en") { $lang = 'en_US'; # legacy } $File->{Template_Defaults} = { die_on_bad_params => FALSE, loop_context_vars => TRUE, global_vars => TRUE, case_sensitive => TRUE, path => [catfile($CFG->{Paths}->{Templates}, $lang)], filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); }, }; if (IS_MODPERL2()) { $File->{Template_Defaults}->{cache} = TRUE; } elsif ($CFG->{Paths}->{Cache}) { $File->{Template_Defaults}->{file_cache} = TRUE; $File->{Template_Defaults}->{file_cache_dir} = rel2abs($CFG->{Paths}->{Cache}, tmpdir()); } undef $lang; ######################################### # Populate $File->{Opt} -- CGI Options. # ######################################### # # Preprocess the CGI parameters. $q = &prepCGI($File, $q); # # Set session switches. $File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE; $File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; $File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; $File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE; $File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; $File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE; $File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE; $File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE; $File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401'; $File->{Opt}->{Charset} = lc($q->param('charset') || ''); $File->{Opt}->{DOCTYPE} = $q->param('doctype') || ''; $File->{Opt}->{'User Agent'} = $q->param('user-agent') && $q->param('user-agent') ne "1" ? $q->param('user-agent') : "W3C_Validator/$VERSION " . $CFG->{'User Agent Info'}; $File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d; if ($File->{Opt}->{'User Agent'} eq 'mobileok') { $File->{Opt}->{'User Agent'} = 'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)'; } $File->{Opt}->{'Accept Header'} = $q->param('accept') || ''; $File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || ''; $File->{Opt}->{'Accept-Charset Header'} = $q->param('accept-charset') || ''; $File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header'); # # "Fallback" info for Character Encoding (fbc), Content-Type (fbt), # and DOCTYPE (fbd). If TRUE, the Override values are treated as # Fallbacks instead of Overrides. $File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE; $File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE; $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; # # If ";debug" was given, let it overrule the value from the config file, # regardless of whether it's "0" or "1" (on or off), but only if config # allows the debugging options. if ($CFG->{'Allow Debug'}) { $DEBUG = $q->param('debug') if defined $q->param('debug'); $File->{Opt}->{Verbose} = TRUE if $DEBUG; } else { $DEBUG = FALSE; # The default. } $File->{Opt}->{Debug} = $DEBUG; &abort_if_error_flagged($File); # # Get the file and metadata. if ($q->param('uploaded_file')) { $File = &handle_file($q, $File); } elsif ($q->param('fragment')) { $File = &handle_frag($q, $File); } elsif ($q->param('uri')) { $File = &handle_uri($q, $File); } # # Abort if an error was flagged during initialization. &abort_if_error_flagged($File); # # Get rid of the CGI object. undef $q; # # We don't need STDIN any more, so get rid of it to avoid getting clobbered # by Apache::Registry's idiotic interference under mod_perl. untie *STDIN; ############################################################################### #### Output validation results. ############################################### ############################################################################### if (!$File->{ContentType} && !$File->{'Direct Input'} && !$File->{'Is Upload'}) { &add_warning('W08', {}); } $File = find_encodings($File); # # Decide on a charset to use (first part) # if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; } elsif ($File->{ContentType} =~ m(^text/(?:[-.a-zA-Z0-9]\+)?xml$)) { # Act as if $http_charset was 'us-ascii'. (MIME rules) $File->{Charset}->{Use} = 'us-ascii'; &add_warning( 'W01', { W01_upload => $File->{'Is Upload'}, W01_agent => $File->{Server}, W01_ct => $File->{ContentType}, } ); } elsif ($File->{Charset}->{XML}) { $File->{Charset}->{Use} = $File->{Charset}->{XML}; } elsif ($File->{BOM} && $File->{BOM} == 2 && $File->{Charset}->{Auto} =~ /^utf-16[bl]e$/) { $File->{Charset}->{Use} = 'utf-16'; } elsif ($File->{ContentType} =~ m(^application/(?:[-.a-zA-Z0-9]+\+)?xml$)) { $File->{Charset}->{Use} = "utf-8"; } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) } $File->{Charset}->{Use} ||= $File->{Charset}->{META}; # # Handle any Fallback or Override for the charset. if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { # charset=foo was given to the CGI and it wasn't "autodetect" or empty. # # Extract the user-requested charset from CGI param. my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); $File->{Charset}->{Override} = lc($override); if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode unless ($File->{Charset}->{Use}) { # no charset detected, actual fallback &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } else { # charset "hard override" mode if (!$File->{Charset}->{Use}) { # overriding "nothing" &add_warning( 'W04', { W04_charset => $File->{Charset}->{Override}, W04_override => TRUE } ); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) { # Actually overriding something; warn about override. &add_warning( 'W03', { W03_use => $File->{Charset}->{Use}, W03_opt => $File->{Charset}->{Override} } ); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } } if ($File->{'Direct Input'}) { #explain why UTF-8 is forced &add_warning('W28', {}); } unless ($File->{Charset}->{XML} || $File->{Charset}->{META}) { #suggest character encoding info within doc &add_warning('W27', {}); } # # Abort if an error was flagged while finding the encoding. &abort_if_error_flagged($File); $File->{Charset}->{Default} = FALSE; unless ($File->{Charset}->{Use}) { # No charset given... $File->{Charset}->{Use} = 'utf-8'; $File->{Charset}->{Default} = TRUE; $File->{Tentative} |= T_ERROR; # Can never be valid. &add_warning('W04', {W04_charset => "UTF-8"}); } # Always transcode, even if the content claims to be UTF-8 $File = transcode($File); # Try guessing if it didn't work out if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) { my $also_tried = 'UTF-8'; for my $cs (qw(windows-1252 iso-8859-1)) { last unless $File->{'Error Flagged'}; $File->{'Error Flagged'} = FALSE; # reset $File->{Charset}->{Use} = $cs; &add_warning('W04', {W04_charset => $cs, W04_also_tried => $also_tried}); $File = transcode($File); $also_tried .= ", $cs"; } } # if it still does not work, we abandon hope here &abort_if_error_flagged($File); # # Add a warning if doc is UTF-8 and contains a BOM. if ($File->{Charset}->{Use} eq 'utf-8' && @{$File->{Content}} && $File->{Content}->[0] =~ m(^\x{FEFF})) { &add_warning('W21', {}); } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element, or # XML declaration # 4. if no doctype and XML mode, check for XML well-formedness # 5. otherwise, punt. # # # Override DOCTYPE if user asked for it. if ($File->{Opt}->{DOCTYPE}) { if ($File->{Opt}->{DOCTYPE} !~ /(?:Inline|detect)/i) { $File = &override_doctype($File); } else { # Get rid of inline|detect for easy truth value checking later $File->{Opt}->{DOCTYPE} = ''; } } # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); if ($File->{Opt}->{DOCTYPE} eq "HTML5") { $File->{DOCTYPE} = "HTML5"; $File->{Version} = $File->{DOCTYPE}; } set_parse_mode($File, $CFG); # # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); # before we start the parsing, clean slate $File->{'Is Valid'} = TRUE; $File->{Errors} = []; $File->{WF_Errors} = []; if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) { if ($CFG->{External}->{HTML5}) { $File = &html5_validate($File); &add_warning( 'W00', { W00_experimental_name => "HTML5 Conformance Checker", W00_experimental_URI => "feedback.html" } ); } else { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_no_checker => TRUE); $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); } } elsif (($File->{DOCTYPE} eq '') and (($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1)) { # we send doctypeless SVG, or any doctypeless XML document with multiple # namespaces found, to a different engine. WARNING this is experimental. if ($CFG->{External}->{CompoundXML}) { $File = &compoundxml_validate($File); &add_warning( 'W00', { W00_experimental_name => "validator.nu Conformance Checker", W00_experimental_URI => "feedback.html" } ); } } else { $File = &dtd_validate($File); } &abort_if_error_flagged($File); if (&is_xml($File)) { if ($File->{DOCTYPE} eq "HTML5") { # $File->{DOCTYPE} = "XHTML5"; # $File->{Version} = "XHTML5"; } else { # XMLWF check can be slow, skip if we already know the doc can't pass. # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9899 $File = &xmlwf($File) if $File->{'Is Valid'}; } &abort_if_error_flagged($File); } # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { $File->{Version} = 'XML'; } else { $File->{Version} ||= $File->{DOCTYPE}; } # # Get the pretty text version of the FPI if a mapping exists. if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { $File->{Version} = $prettyver; } # # check the received mime type against Allowed mime types if ($File->{ContentType}) { my @allowedMediaType = split(/\s+/, $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); my $usedCTisAllowed; if (scalar @allowedMediaType) { $usedCTisAllowed = FALSE; foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); } } else { # wedon't know what media type is recommended, so better shut up $usedCTisAllowed = TRUE; } if (!$usedCTisAllowed) { &add_warning( 'W23', { W23_type => $File->{ContentType}, W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, w23_doctype => $File->{Version} } ); } } # # Warn about unknown, incorrect, or missing Namespaces. if ($File->{Namespace}) { my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; if (&is_xml($File)) { if ($ns eq $File->{Namespace}) { &add_warning( 'W10', { W10_ns => $File->{Namespace}, W10_type => $File->{Type}, } ); } } elsif ($File->{DOCTYPE} ne 'HTML5') { &add_warning( 'W11', { W11_ns => $File->{Namespace}, w11_doctype => $File->{DOCTYPE} } ); } } else { if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { &add_warning('W12', {}); } } ## if invalid content, AND if requested, pass through tidy if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) { eval { local $SIG{__DIE__} = undef; require HTML::Tidy; my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); my $cleaned = $tidy->clean(join("\n", @{$File->{Content}})); $cleaned = Encode::decode_utf8($cleaned); $File->{Tidy} = $cleaned; }; if ($@) { (my $errmsg = $@) =~ s/ at .*//s; &add_warning('W29', {W29_msg => $errmsg}); } } my %templates = ( earl => ['earl_xml.tmpl', default_escape => 'HTML'], n3 => ['earl_n3.tmpl'], json => ['json_output.tmpl'], ucn => ['ucn_output.tmpl'], ); my $template = $templates{$File->{Opt}->{Output}}; if ($template) { my $tname = shift(@$template); my $tmpl = &get_template($File, $tname, @$template); $template = $tmpl; } elsif ($File->{Opt}->{Output} eq 'soap12') { if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing # validation... print CGI::header( -status => 503, -content_language => "en", -type => "text/html", -charset => "utf-8" ); $template = &get_template($File, 'soap_disabled.tmpl'); } else { $template = &get_template($File, 'soap_output.tmpl'); } } else { $template = &get_template($File, 'result.tmpl'); } &prep_template($File, $template); &fin_template($File, $template); $template->param(tidy_output => $File->{Tidy}); $template->param(file_source => &source($File)) if ($template->param('opt_show_source') or ($File->{'Is Upload'}) or ($File->{'Direct Input'})); if ($File->{Opt}->{Output} eq 'json') { # No JSON escaping in HTML::Template (and "JS" is not the right thing here) my $json = JSON->new(); $json->allow_nonref(TRUE); if (my $msgs = $template->param("file_errors")) { for my $msg (@$msgs) { for my $key (qw(msg expl)) { $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key}; } # Drop non-numeric char indicators from output, e.g. # "> 80" for some XML parse error ones (see the non-structured # XML::LibXML code branch in XML preparsing below). if ($msg->{char} && $msg->{char} !~ /^\d+$/) { delete($msg->{char}); } } } } # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8', $template->output); # # Get rid of $File object and exit. undef $File; exit; ############################################################################# # Subroutine definitions ############################################################################# sub get_template ($$;@) { my ($File, $fname, @opts) = @_; if (!$File->{_Templates}->{$fname}) { my $tmpl = HTML::Template->new( %{$File->{Template_Defaults}}, filename => $fname, @opts ); $tmpl->param(env_home_page => $File->{Env}->{'Home Page'}); $tmpl->param(validator_version => $VERSION); $File->{_Templates}->{$fname} = $tmpl; } return $File->{_Templates}->{$fname}; } sub get_error_template ($;@) { my ($File, @opts) = @_; my $fname = 'fatal-error.tmpl'; if ($File->{Opt}->{Output} eq 'soap12') { $fname = 'soap_fault.tmpl'; } elsif ($File->{Opt}->{Output} eq 'ucn') { $fname = 'ucn_fault.tmpl'; } return &get_template($File, $fname, @opts); } # TODO: need to bring in fixes from html5_validate() here sub compoundxml_validate (\$) { my $File = shift; my $ua = W3C::Validator::UserAgent->new($CFG, $File); push( @{$File->{Parsers}}, { name => "Compound XML", link => "http://qa-dev.w3.org/", # TODO? type => "", options => "" } ); my $url = URI->new($CFG->{External}->{CompoundXML}); $url->query("out=xml"); my $req = HTTP::Request->new(POST => $url); if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { # Doctype or charset overridden, need to use $File->{Content} in UTF-8 # because $File->{Bytes} is not affected by the overrides. This will # most likely be a source of errors about internal/actual charset # differences as long as our transcoding process does not "fix" the # charset info in XML declaration and meta http-equiv (any others?). if ($File->{'Direct Input'}) { # sane default when using html5 validator by direct input $req->content_type("application/xml; charset=UTF-8"); } else { $req->content_type("$File->{ContentType}; charset=UTF-8"); } $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { # Pass original bytes, Content-Type and charset as-is. # We trust that our and validator.nu's interpretation of line numbers # is the same (regardless of EOL chars used in the document). my @content_type = ($File->{ContentType} => undef); push(@content_type, charset => $File->{Charset}->{HTTP}) if $File->{Charset}->{HTTP}; $req->content_type( HTTP::Headers::Util::join_header_words(@content_type)); $req->content_ref(\$File->{Bytes}); } $req->content_language($File->{ContentLang}) if $File->{ContentLang}; # Intentionally using direct header access instead of $req->last_modified $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; # If not in debug mode, gzip the request (LWP >= 5.817) eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; my $res = $ua->request($req); if (!$res->is_success()) { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_no_checker => TRUE); $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); $tmpl->param(fatal_checker_error => $res->status_line()); } else { my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; # and now we parse according to # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output # I wish we could use XML::LibXML::Reader here. but SHAME on those # major unix distributions still shipping with libxml2 2.6.16… 4 years # after its release # …and we could use now as we require libxml2 >= 2.6.21 anyway… my $xml_reader = XML::LibXML->new(); $xml_reader->base_uri($res->base()); my $xmlDOM; eval { $xmlDOM = $xml_reader->parse_string($content); }; if ($@) { my $errmsg = $@; $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_no_checker => TRUE); $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); $tmpl->param(fatal_checker_error => $errmsg); return $File; } my @nodelist = $xmlDOM->getElementsByTagName("messages"); my $messages_node = $nodelist[0]; my @message_nodes = $messages_node->childNodes; foreach my $message_node (@message_nodes) { my $message_type = $message_node->localname; my ($err, $xml_error_msg, $xml_error_expl); if ($message_type eq "error") { $err->{type} = "E"; $File->{'Is Valid'} = FALSE; } elsif ($message_type eq "info") { # by default - we find warnings in the type attribute (below) $err->{type} = "I"; } if ($message_node->hasAttributes()) { my @attributelist = $message_node->attributes(); foreach my $attribute (@attributelist) { if ($attribute->name eq "type") { if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { $err->{type} = "W"; } } if ($attribute->name eq "last-column") { $err->{char} = $attribute->getValue(); } if ($attribute->name eq "last-line") { $err->{line} = $attribute->getValue(); } } } my @child_nodes = $message_node->childNodes; foreach my $child_node (@child_nodes) { if ($child_node->localname eq "message") { $xml_error_msg = $child_node->toString(); $xml_error_msg =~ s,]*>,,gsi; } if ($child_node->localname eq "elaboration") { $xml_error_expl = $child_node->toString(); $xml_error_expl =~ s,,,gi; $xml_error_expl = "\n
$xml_error_expl
\n"; } } # formatting the error message for output $err->{src} = "" if $err->{uri}; # TODO... $err->{num} = 'validator.nu'; $err->{msg} = $xml_error_msg; $err->{expl} = $xml_error_expl; if ($err->{msg} =~ /Using the preset for (.*) based on the root namespace/) { $File->{DOCTYPE} = $1; } else { push @{$File->{Errors}}, $err; } # @@ TODO message explanation / elaboration } } return $File; } sub html5_validate (\$) { my $File = shift; my $ua = W3C::Validator::UserAgent->new($CFG, $File); push( @{$File->{Parsers}}, { name => "validator.nu", link => "http://validator.nu/", type => "HTML5", options => "" } ); my $url = URI->new($CFG->{External}->{HTML5}); $url->query("out=xml"); my $req = HTTP::Request->new(POST => $url); my $ct = &is_xml($File) ? "application/xhtml+xml" : "text/html"; if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override} || $File->{'Direct Input'}) { # Doctype or charset overridden, need to use $File->{Content} in UTF-8 # because $File->{Bytes} is not affected by the overrides. Note that # direct input is always considered an override here. &override_charset($File, "UTF-8"); $ct = $File->{ContentType} unless $File->{'Direct Input'}; my @ct = ($ct => undef, charset => "UTF-8"); $ct = HTTP::Headers::Util::join_header_words(@ct); $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { # Pass original bytes, Content-Type and charset as-is. # We trust that our and validator.nu's interpretation of line numbers # is the same later when displaying error contexts (regardless of EOL # chars used in the document). my @ct = ($File->{ContentType} => undef); push(@ct, charset => $File->{Charset}->{HTTP}) if $File->{Charset}->{HTTP}; $ct = HTTP::Headers::Util::join_header_words(@ct); $req->content_ref(\$File->{Bytes}); } $req->content_type($ct); $req->content_language($File->{ContentLang}) if $File->{ContentLang}; # Intentionally using direct header access instead of $req->last_modified # (the latter takes seconds since epoch, but $File->{Modified} is an already # formatted string). $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; # Use gzip in non-debug, remote HTML5 validator mode (LWP >= 5.817). if (!$File->{Opt}->{Debug} && $url->host() !~ /^(?:localhost|127(?:\.\d+){3}|.*\.localdomain)$/i) { eval { $req->encode("gzip"); }; } else { $req->header('Accept-Encoding', 'identity'); } my $source_option = $File->{Opt}->{'Show Source'} ? "&showsource=yes" : ""; my $outline_option = $File->{Opt}->{Outline} ? "&showoutline=yes" : ""; my $output_option = $File->{Opt}->{Output} eq 'json' ? "&out=json" : ""; my $uri = uri_escape($File->{'URI'}); if ($File->{'Direct Input'}) { # if $req isn't actually encoded, this decode() call does nothing $req->decode("gzip"); $uri = "data:text/html;charset=utf-8;base64," . uri_escape(MIME::Base64::encode_base64($req->content)); } if (!$File->{'Is Upload'}) { print redirect 'https://validator.w3.org/nu/?doc=' . $uri . $source_option . $outline_option . $output_option; } my $res = $ua->request($req); if (!$res->is_success()) { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_no_checker => TRUE); $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); $tmpl->param(fatal_checker_error => $res->status_line()); } else { my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; # and now we parse according to # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output # I wish we could use XML::LibXML::Reader here. but SHAME on those # major unix distributions still shipping with libxml2 2.6.16… 4 years # after its release my $xml_reader = XML::LibXML->new(); $xml_reader->base_uri($res->base()); my $xmlDOM; eval { $xmlDOM = $xml_reader->parse_string($content); }; if ($@) { my $errmsg = $@; $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_no_checker => TRUE); $tmpl->param(fatal_missing_checker => 'HTML5 Validator'); $tmpl->param(fatal_checker_error => $errmsg); return $File; } my @nodelist = $xmlDOM->getElementsByTagName("messages"); my $messages_node = $nodelist[0]; my @message_nodes = $messages_node->childNodes; foreach my $message_node (@message_nodes) { my $message_type = $message_node->localname; my ($html5_error_msg, $html5_error_expl); my $err = {}; # TODO: non-document errors should receive different/better # treatment, but this is better than hiding all problems for now # (#6747) if ($message_type eq "error" || $message_type eq "non-document-error") { $err->{type} = "E"; $File->{'Is Valid'} = FALSE; } elsif ($message_type eq "info") { # by default - we find warnings in the type attribute (below) $err->{type} = "I"; } if ($message_node->hasAttributes()) { my @attributelist = $message_node->attributes(); foreach my $attribute (@attributelist) { if ($attribute->name eq "type") { if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { $err->{type} = "W"; } } elsif ($attribute->name eq "last-column") { $err->{char} = $attribute->getValue(); } elsif ($attribute->name eq "last-line") { $err->{line} = $attribute->getValue(); } elsif ($attribute->name eq "url") { &set_error_uri($err, $attribute->getValue()); } } } my @child_nodes = $message_node->childNodes; foreach my $child_node (@child_nodes) { if ($child_node->localname eq "message") { $html5_error_msg = $child_node->textContent(); } elsif ($child_node->localname eq "elaboration") { $html5_error_expl = $child_node->toString(); $html5_error_expl =~ s,,,gi; $html5_error_expl = "\n
$html5_error_expl
\n"; } } # formatting the error message for output # TODO: set $err->{src} from extract if we got an URI for the error: # http://wiki.whatwg.org/wiki/Validator.nu_XML_Output#The_extract_Element # For now, set it directly to empty to prevent report_errors() from # trying to populate it from our doc. $err->{src} = "" if $err->{uri}; $err->{num} = 'html5'; $err->{msg} = $html5_error_msg; $err->{expl} = $html5_error_expl; push @{$File->{Errors}}, $err; # @@ TODO message explanation / elaboration } } return $File; } sub dtd_validate (\$) { my $File = shift; my $opensp = SGML::Parser::OpenSP->new(); # # By default, use SGML catalog file and SGML Declaration. my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); # default parsing options my @spopt = qw(valid non-sgml-char-ref no-duplicate); # # Switch to XML semantics if file is XML. if (&is_xml($File)) { $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); push(@spopt, 'xml'); } else { # add warnings for shorttags push(@spopt, 'min-tag'); } push( @{$File->{Parsers}}, { name => "OpenSP", link => "http://openjade.sourceforge.net/", type => "SGML/XML", options => join(" ", @spopt) } ); # # Parser configuration $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); $opensp->catalogs($catalog); $opensp->show_error_numbers(1); $opensp->warnings(@spopt); # # Restricted file reading is disabled on Win32 for the time # being since neither SGML::Parser::OpenSP nor check auto- # magically set search_dirs to include the temp directory # so restricted file reading would defunct the Validator. $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; my $h; # event handler if ($File->{Opt}->{Outline}) { $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG); } else { $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG); } $opensp->handler($h); $opensp->parse_string(join "\n", @{$File->{Content}}); # Make sure there are no circular references, otherwise the script # would leak memory until mod_perl unloads it which could take some # time. @@FIXME It's probably overly careful though. $opensp->handler(undef); undef $h->{_parser}; undef $h->{_file}; undef $h; undef $opensp; # # Set Version to be the FPI initially. $File->{Version} = $File->{DOCTYPE}; return $File; } sub xmlwf (\$) { # we should really be using a SAX ErrorHandler, but I can't find a way to # make it work with XML::LibXML::SAX::Parser... ** FIXME ** # ditto, we should try using W3C::Validator::EventHandler, but it's badly # linked to opensp at the moment my $File = shift; my $xmlparser = XML::LibXML->new(); $xmlparser->line_numbers(1); $xmlparser->validation(0); $xmlparser->base_uri($File->{URI}) unless ($File->{'Direct Input'} || $File->{'Is Upload'}); push( @{$File->{Parsers}}, { name => "libxml2", link => "http://xmlsoft.org/", type => "XML", options => "" } ); # Restrict file reading similar to what SGML::Parser::OpenSP does. Note # that all inputs go through the callback so if we were passing a # URI/filename to the parser, it would be affected as well and would break # fetching the initial document. As long as we pass the doc as string, # this should work. my $cb = XML::LibXML::InputCallback->new(); $cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]); $xmlparser->input_callbacks($cb); &override_charset($File, "UTF-8"); eval { $xmlparser->parse_string(join("\n", @{$File->{Content}})); }; if (ref($@)) { # handle a structured error (XML::LibXML::Error object) my $err_obj = $@; while ($err_obj) { my $err = {}; &set_error_uri($err, $err_obj->file()); $err->{src} = &ent($err_obj->context()) if $err->{uri}; $err->{line} = $err_obj->line(); $err->{char} = $err_obj->column(); $err->{num} = "libxml2-" . $err_obj->code(); $err->{type} = "E"; $err->{msg} = $err_obj->message(); $err_obj = $err_obj->_prev(); unshift(@{$File->{WF_Errors}}, $err); } } elsif ($@) { my $xmlwf_errors = $@; my $xmlwf_error_line = undef; my $xmlwf_error_col = undef; my $xmlwf_error_msg = undef; my $got_error_message = undef; my $got_quoted_line = undef; foreach my $msg_line (split "\n", $xmlwf_errors) { $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; # first we get the actual error message if (!$got_error_message && $msg_line =~ /^(:\d+:)( parser error : .*)/) { $xmlwf_error_line = $1; $xmlwf_error_msg = $2; $xmlwf_error_line =~ s/:(\d+):/$1/; $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; $got_error_message = 1; } # then we skip the second line, which shows the context # (we don't use that) elsif ($got_error_message && !$got_quoted_line) { $got_quoted_line = 1; } # we now take the third line, with the pointer to the error's # column elsif (($msg_line =~ /(\s+)\^/) and $got_error_message and $got_quoted_line) { $xmlwf_error_col = length($1); } # cleanup for a number of bugs for the column number if (defined($xmlwf_error_col)) { if (( my $l = length($File->{Content}->[$xmlwf_error_line - 1]) ) < $xmlwf_error_col ) { # http://bugzilla.gnome.org/show_bug.cgi?id=434196 #warn("Warning: reported error column larger than line length " . # "($xmlwf_error_col > $l) in $File->{URI} line " . # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); $xmlwf_error_col = $l; } elsif ($xmlwf_error_col == 79) { # working around an apparent odd limitation of libxml which # only gives context for lines up to 80 chars # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 # http://bugzilla.gnome.org/show_bug.cgi?id=424017 $xmlwf_error_col = "> 80"; # non-int line number will trigger the proper behavior in # report_error } } # when we have all the info (one full error message), proceed # and move on to the next error if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)) { # Reinitializing for the next batch of 3 lines $got_error_message = undef; $got_quoted_line = undef; # formatting the error message for output my $err = {}; # TODO: set_error_uri() (need test case) $err->{src} = "" if $err->{uri}; # TODO... $err->{line} = $xmlwf_error_line; $err->{char} = $xmlwf_error_col; $err->{num} = 'xmlwf'; $err->{type} = "E"; $err->{msg} = $xmlwf_error_msg; push(@{$File->{WF_Errors}}, $err); $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; } } } $File->{'Is Valid'} = FALSE if @{$File->{WF_Errors}}; return $File; } # # Generate HTML report. sub prep_template ($$) { my $File = shift; my $T = shift; # # XML mode... $T->param(is_xml => &is_xml($File)); # # Upload? $T->param(is_upload => $File->{'Is Upload'}); # # Direct Input? $T->param(is_direct_input => $File->{'Direct Input'}); # # The URI... $T->param(file_uri => $File->{URI}); # # HTTPS note? $T->param(file_https_note => $File->{'Is Upload'} || $File->{'Direct Input'} || URI->new($File->{URI})->secure()); # # Set URL for page title. $T->param(page_title_url => $File->{URI}); # # Metadata... $T->param(file_modified => $File->{Modified}); $T->param(file_server => $File->{Server}); $T->param(file_size => $File->{Size}); $T->param(file_contenttype => $File->{ContentType}); $T->param(file_charset => $File->{Charset}->{Use}); $T->param(file_doctype => $File->{DOCTYPE}); # # Output options... $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); $T->param(opt_show_outline => $File->{Opt}->{Outline}); $T->param(opt_verbose => $File->{Opt}->{Verbose}); $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); $T->param(opt_no200 => $File->{Opt}->{No200}); # Root Element $T->param(root_element => $File->{Root}); # Namespaces... $T->param(file_namespace => $File->{Namespace}); # Non-root ones; unique, preserving occurrence order my %seen_ns = (); $seen_ns{$File->{Namespace}}++ if defined($File->{Namespace}); my @nss = map { $seen_ns{$_}++ == 0 ? {uri => $_} : () } @{$File->{Namespaces}}; $T->param(file_namespaces => \@nss) if @nss; if ($File->{Opt}->{DOCTYPE}) { my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; $T->param($over_doctype_param => TRUE); } if ($File->{Opt}->{Charset}) { my $over_charset_param = "override charset $File->{Opt}->{Charset}"; $T->param($over_charset_param => TRUE); } # Allow content-negotiation if ($File->{Opt}->{'Accept Header'}) { $T->param('accept' => $File->{Opt}->{'Accept Header'}); } if ($File->{Opt}->{'Accept-Language Header'}) { $T->param( 'accept-language' => $File->{Opt}->{'Accept-Language Header'}); } if ($File->{Opt}->{'Accept-Charset Header'}) { $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'}); } if ($File->{Opt}->{'User Agent'}) { $T->param('user-agent' => $File->{Opt}->{'User Agent'}); } if ($File->{'Error Flagged'}) { $T->param(fatal_error => TRUE); } } sub fin_template ($$) { my $File = shift; my $T = shift; # # Set debug info for HTML and SOAP reports. if ($DEBUG) { my @parsers; for my $parser (@{$File->{Parsers}}) { my $p = $parser->{name}; $p .= " (" . $parser->{options} . ")" if $parser->{options}; push(@parsers, $p); } $T->param( debug => [ map({name => $_, value => $ENV{$_}}, qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), {name => 'Content-Encoding', value => $File->{ContentEnc}}, {name => 'Content-Language', value => $File->{ContentLang}}, {name => 'Content-Location', value => $File->{ContentLoc}}, {name => 'Transfer-Encoding', value => $File->{TransferEnc}}, {name => 'Parse Mode', value => $File->{Mode}}, {name => 'Parse Mode Factor', value => $File->{ModeChoice}}, {name => 'Parsers Used', value => join(", ", @parsers)}, ], ); } $T->param(parsers => $File->{Parsers}); if (!$File->{Doctype} && (!$File->{Version} || $File->{Version} eq 'unknown' || $File->{Version} eq 'SGML') ) { my $default_doctype = $File->{"Default DOCTYPE"}->{&is_xml($File) ? "XHTML" : "HTML"}; $T->param(file_version => "$default_doctype"); } else { $T->param(file_version => $File->{Version}); if ($File->{Version} eq "HTML5") { $T->param(is_html5 => TRUE); } } my ($num_errors, $num_warnings, $num_info, $reported_errors) = &report_errors($File); if ($num_errors + $num_warnings > 0) { $T->param(has_errors => 1); } $T->param(valid_errors_num => $num_errors); $num_warnings += scalar @{$File->{Warnings}}; $T->param(valid_warnings_num => $num_warnings); my $number_of_errors = ""; # textual form of $num_errors my $number_of_warnings = ""; # textual form of $num_errors # The following is a bit hack-ish, but will enable us to have some logic # for a human-readable display of the number, with cases for 0, 1, 2 and # above (the case of 2 appears to be useful for localization in some # languages where the plural is different for 2, and above) if ($num_errors > 1) { $T->param(number_of_errors_is_0 => FALSE); $T->param(number_of_errors_is_1 => FALSE); if ($num_errors == 2) { $T->param(number_of_errors_is_2 => TRUE); } else { $T->param(number_of_errors_is_2 => FALSE); } $T->param(number_of_errors_is_plural => TRUE); } elsif ($num_errors == 1) { $T->param(number_of_errors_is_0 => FALSE); $T->param(number_of_errors_is_1 => TRUE); $T->param(number_of_errors_is_2 => FALSE); $T->param(number_of_errors_is_plural => FALSE); } else { # 0 $T->param(number_of_errors_is_0 => TRUE); $T->param(number_of_errors_is_1 => FALSE); $T->param(number_of_errors_is_2 => FALSE); $T->param(number_of_errors_is_plural => FALSE); } if ($num_warnings > 1) { $T->param(number_of_warnings_is_0 => FALSE); $T->param(number_of_warnings_is_1 => FALSE); if ($num_warnings == 2) { $T->param(number_of_warnings_is_2 => TRUE); } else { $T->param(number_of_warnings_is_2 => FALSE); } $T->param(number_of_warnings_is_plural => TRUE); } elsif ($num_warnings == 1) { $T->param(number_of_warnings_is_0 => FALSE); $T->param(number_of_warnings_is_1 => TRUE); $T->param(number_of_warnings_is_2 => FALSE); $T->param(number_of_warnings_is_plural => FALSE); } else { # 0 $T->param(number_of_warnings_is_0 => TRUE); $T->param(number_of_warnings_is_1 => FALSE); $T->param(number_of_warnings_is_2 => FALSE); $T->param(number_of_warnings_is_plural => FALSE); } $T->param(file_outline => $File->{heading_outline}) if $File->{Opt}->{Outline}; $T->param(file_errors => $reported_errors); if ($File->{'Is Valid'}) { $T->param(VALID => TRUE); $T->param(valid_status => 'Valid'); &report_valid($File, $T); } else { $T->param(VALID => FALSE); $T->param(valid_status => 'Invalid'); } } # # Output "This page is Valid" report. sub report_valid { my $File = shift; my $T = shift; unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; $T->param(badge_uri => $cfg->{Badge}->{URI}); $T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'}); $T->param(badge_alt_uri => $cfg->{Badge}->{'Alt URI'}); $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}); $T->param(badge_alt => $cfg->{Badge}->{Alt}); $T->param(badge_rdfa => $cfg->{Badge}->{RDFa}); $T->param(badge_h => $cfg->{Badge}->{Height}); $T->param(badge_w => $cfg->{Badge}->{Width}); $T->param(badge_onclick => $cfg->{Badge}->{OnClick}); $T->param(badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : ''); } } elsif (defined $File->{Tentative}) { $T->param(is_tentative => TRUE); } if ($File->{XMLWF_ONLY}) { $T->param(xmlwf_only => TRUE); } my $thispage = self_url_file($File); $T->param(file_thispage => $thispage); } # # Add a warning message to the output. sub add_warning ($$) { my $WID = shift; my $params = shift; push @{$File->{Warnings}}, $WID; my %tmplparams = ( $WID => TRUE, have_warnings => TRUE, %$params, ); for my $tmpl (qw(result fatal-error soap_output ucn_output)) { &get_template($File, "$tmpl.tmpl")->param(%tmplparams); } } # # Proxy authentication requests. # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). sub authenticate { my $File = shift; my $resource = shift; my $authHeader = shift || {}; my $realm = $resource; $realm =~ s([^\w\d.-]*){}g; while (my ($scheme, $header) = each %$authHeader) { my $origrealm = $header->{realm}; if (not defined $origrealm or $scheme !~ /^(?:basic|digest)$/i) { delete($authHeader->{$scheme}); next; } $header->{realm} = "$realm-$origrealm"; } my $headers = HTTP::Headers->new(Connection => 'close'); $headers->www_authenticate(%$authHeader); $headers = $headers->as_string(); chomp($headers); my $tmpl = &get_template($File, 'http_401_authrequired.tmpl'); $tmpl->param(http_401_headers => $headers); $tmpl->param(http_401_url => $resource); print Encode::encode('UTF-8', $tmpl->output); exit; # Further interaction will be a new HTTP request. } # # Fetch an URL and return the content and selected meta-info. sub handle_uri { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $ua = W3C::Validator::UserAgent->new($CFG, $File); my $uri = URI->new(ref $q ? $q->param('uri') : $q)->canonical(); $uri->fragment(undef); if (!$uri->scheme()) { local $ENV{URL_GUESS_PATTERN} = ''; my $guess = URI::Heuristic::uf_uri($uri); if ($guess->scheme() && $ua->is_protocol_supported($guess)) { $uri = $guess; } else { $uri = URI->new("http://$uri"); } } unless ($ua->is_protocol_supported($uri)) { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); # If uri param is empty (also for empty direct or upload), it's been # set to TRUE in sub prepCGI() if ($uri->canonical() eq "1") { $tmpl->param(fatal_no_content => TRUE); } else { $tmpl->param(fatal_uri_error => TRUE); $tmpl->param(fatal_uri_scheme => $uri->scheme()); } return $File; } return $File unless $ua->uri_ok($uri); my $req = HTTP::Request->new(GET => $uri); # if one wants to use the accept, accept-charset and accept-language params # in order to trigger specific negotiation if ($File->{Opt}->{'Accept Header'}) { $req->header(Accept => $File->{Opt}->{'Accept Header'}); } if ($File->{Opt}->{'Accept-Language Header'}) { $req->header( Accept_Language => $File->{Opt}->{'Accept-Language Header'}); } if ($File->{Opt}->{'Accept-Charset Header'}) { $req->header( Accept_Charset => $File->{Opt}->{'Accept-Charset Header'}); } # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. # If we're under mod_perl, there is a way around it... my $http_auth = $ENV{HTTP_AUTHORIZATION}; eval { local $SIG{__DIE__} = undef; my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; $http_auth = $auth if $auth; } if (IS_MODPERL2() && !$http_auth); # If we got a Authorization header, the client is back at it after being # prompted for a password so we insert the header as is in the request. $req->headers->header(Authorization => $http_auth) if $http_auth; my $res = $ua->request($req); return $File if $File->{'Error Flagged'}; # Redirect IP rejected? unless ($res->code == 200 or $File->{Opt}->{'No200'}) { if ($res->code == 401) { my %auth = $res->www_authenticate(); # HTTP::Headers::Auth &authenticate($File, $res->request->uri, \%auth); } else { $File->{'Error Flagged'} = TRUE; my $no200url = undef; if (!$File->{Opt}->{No200}) { # $File->{URI} not set yet; setting it non-local has side # effects local $File->{URI} = $uri->as_string; local $File->{Opt}->{No200} = TRUE; $no200url = &self_url_file($File); } my $warning = $res->header("Client-Warning"); if ($warning && $warning =~ /Internal response/i) { # Response doc generated internally by LWP, no need to show # that info nor to provide error doc validation link to it. $warning = undef; $no200url = undef; } my $tmpl = &get_error_template($File); $tmpl->param(fatal_http_error => TRUE); $tmpl->param(fatal_http_uri => $uri->as_string); $tmpl->param(fatal_http_code => $res->code); $tmpl->param(fatal_http_msg => $res->message); $tmpl->param(fatal_http_warn => $warning); $tmpl->param(fatal_http_no200 => $no200url); $tmpl->param(fatal_http_dns => TRUE) if ($res->code == 500); } return $File; } # # Enforce Max Recursion level. &check_recursion($File, $res); my ($mode, $ct, $charset) = &parse_content_type( $File, scalar($res->header('Content-Type')), scalar($res->request->uri), ); my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; $File->{Bytes} = $content; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{ContentEnc} = $res->content_encoding; $File->{ContentLang} = $res->content_language; $File->{ContentLoc} = $res->header('Content-Location'); $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); $File->{Charset}->{HTTP} = lc $charset if defined $charset; $File->{Modified} = $res->header('Last-Modified'); $File->{Server} = scalar $res->server; # TODO: Content-Length is not always set, so either this should # be renamed to 'Content-Length' or it should consider more than # the Content-Length header. $File->{Size} = scalar $res->content_length; $File->{URI} = scalar $res->request->uri->canonical; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_file { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $p = $q->param('uploaded_file'); my $f = $q->upload('uploaded_file'); if (!defined($f)) { # Probably not an uploaded file as far as CGI is concerned, # treat as a fragment. $q->param('fragment', $p); return &handle_frag($q, $File); } my $h = $q->uploadInfo($p); local $/ = undef; # set line delimiter so that <> reads rest of file my $file = <$f>; my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); $File->{Bytes} = $file; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{Charset}->{HTTP} = lc $charset if defined $charset; $File->{Modified} = $q->http('Last-Modified'); $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) $File->{Size} = $q->http('Content-Length'); $File->{URI} = "$p"; $File->{'Is Upload'} = TRUE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_frag { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. $File->{Bytes} = $q->param('fragment'); $File->{Mode} = 'TBD'; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; $File->{ContentType} = ''; # @@TODO? $File->{URI} = 'upload://Form Submission'; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = TRUE; $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars if ($File->{Opt}->{Prefill}) { # we surround the HTML fragment with some basic document structure my $prefill_Template; if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { $prefill_Template = &get_template($File, 'prefill_html401.tmpl'); } else { $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl'); } $prefill_Template->param(fragment => $File->{Bytes}); $File->{Bytes} = $prefill_Template->output(); # Let's force the view source so that the user knows what we've put # around their code. $File->{Opt}->{'Show Source'} = TRUE; # Ignore doctype overrides (#5132). $File->{Opt}->{DOCTYPE} = 'Inline'; } return $File; } # # Parse a Content-Type and parameters. Return document type and charset. sub parse_content_type { my $File = shift; my $Content_Type = shift; my $url = shift; my $charset = ''; my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; my $mode = $CFG->{MIME}->{$ct} || $ct; $charset = HTML::Encoding::encoding_from_content_type($Content_Type); if (index($mode, '/') != -1) { # a "/" means it's unknown or we'd have a mode here. if ($ct eq 'text/css' and defined $url) { print redirect 'http://jigsaw.w3.org/css-validator/validator?uri=' . uri_escape $url; exit; } elsif ($ct eq 'application/atom+xml' and defined $url) { print redirect 'http://validator.w3.org/feed/check.cgi?url=' . uri_escape $url; exit; } elsif ($ct =~ m(^application/.+\+xml$)) { # unknown media types which should be XML - we give these a try $mode = "XML"; } else { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_mime_error => TRUE); $tmpl->param(fatal_mime_ct => $ct); } } return $mode, $ct, $charset; } # # Get content with Content-Encodings decoded from a response. sub get_content ($$) { my $File = shift; my $res = shift; my $content; eval { $content = $res->decoded_content(charset => 'none', raise_error => 1); }; if ($@) { (my $errmsg = $@) =~ s/ at .*//s; my $cenc = $res->header("Content-Encoding"); my $uri = $res->request->uri; $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_decode_error => TRUE); $tmpl->param(fatal_decode_errmsg => $errmsg); $tmpl->param(fatal_decode_cenc => $cenc); # Include URI because it might be a subsystem (eg. HTML5 validator) one $tmpl->param(fatal_decode_uri => $uri); } return $content; } # # Check recursion level and enforce Max Recursion limit. sub check_recursion ($$) { my $File = shift; my $res = shift; # Not looking at our own output. return unless defined $res->header('X-W3C-Validator-Recursion'); my $lvl = $res->header('X-W3C-Validator-Recursion'); return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. if ($lvl >= $CFG->{'Max Recursion'}) { print redirect $File->{Env}->{'Home Page'}; } else { # Increase recursion level in output. &get_template($File, 'result.tmpl')->param(depth => $lvl++); } } # # XML::LibXML::InputCallback matcher using our SGML search path jail. sub xml_jail_match { my $arg = shift; # Ensure we have a file:// URI if we get a file. my $uri = URI->new($arg); if (!$uri->scheme()) { $uri = URI::file->new_abs($arg); } $uri = $uri->canonical(); # Do not trap non-file URIs. return 0 unless ($uri->scheme() eq "file"); # Do not trap file URIs within our jail. for my $dir ($CFG->{Paths}->{SGML}->{Library}, split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || '')) { next unless $dir; my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string(); $dir_uri =~ s|/*$|/|; # ensure it ends with a slash return 0 if ($uri =~ /^\Q$dir_uri\E/); } # We have a match (a file outside the jail). return 1; } # # Escape text to be included in markup comment. sub escape_comment { local $_ = shift; return '' unless defined; s/--/- /g; return $_; } # # Return $_[0] encoded for HTML entities (cribbed from merlyn). # # Note that this is used both for HTML and XML escaping (so e.g. no '). # sub ent { my $str = shift; return '' unless defined($str); # Eliminate warnings # should switch to hex sooner or later $str =~ s/&/&/g; $str =~ s//>/g; $str =~ s/"/"/g; $str =~ s/'/'/g; return $str; } # # Truncate source lines for report. # Expects 1-based column indexes. sub truncate_line { my $line = shift; my $col = shift; my $maxlen = 80; # max line length to truncate to my $diff = length($line) - $maxlen; # Don't truncate at all if it fits. return ($line, $col) if ($diff <= 0); my $start = $col - int($maxlen / 2); if ($start < 0) { # Truncate only from end of line. $start = 0; $line = substr($line, $start, $maxlen - 1) . '…'; } elsif ($start > $diff) { # Truncate only from beginning of line. $start = $diff; $line = '…' . substr($line, $start + 1); } else { # Truncate from both beginning and end of line. $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…'; } # Shift column if we truncated from beginning of line. $col -= $start; return ($line, $col); } # # Suppress any existing DOCTYPE by commenting it out. sub override_doctype { my $File = shift; my ($dt) = grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; my $pubid = $dt->{PubID}; my $sysid = $dt->{SysID}; my $name = $dt->{Name}; # The HTML5 PubID is a fake, reset it out of the way. $pubid = undef if ($pubid eq 'HTML5'); # We don't have public/system ids for all types. my $dtd = "'; my $org_dtd = ''; my $HTML = ''; my $seen_doctype = FALSE; my $declaration = sub { my ($tag, $text) = @_; if ($seen_doctype || uc($tag) ne '!DOCTYPE') { $HTML .= $text; return; } $seen_doctype = TRUE; $org_dtd = &ent($text); ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~ //si; $File->{DOCTYPE} = 'HTML5' if ( lc($File->{Root} || '') eq 'html' && (!defined($File->{DOCTYPE}) || $File->{DOCTYPE} eq 'about:legacy-compat') ); # No Override if Fallback was requested, or if override is the same as # detected my $known = $CFG->{Types}->{$File->{DOCTYPE}}; if ($File->{Opt}->{FB}->{DOCTYPE} or ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) { $HTML .= $text; # Stash it as is... } else { $HTML .= "$dtd"; } }; my $start_element = sub { my $p = shift; # Sneak chosen doctype before the root elt if none replaced thus far. $HTML .= $dtd unless $seen_doctype; $HTML .= shift; # We're done with this handler. $p->handler(start => undef); }; HTML::Parser->new( default_h => [sub { $HTML .= shift }, 'text'], declaration_h => [$declaration, 'tag,text'], start_h => [$start_element, 'self,text'] )->parse(join "\n", @{$File->{Content}})->eof(); $File->{Content} = [split /\n/, $HTML]; if ($seen_doctype) { my $known = $CFG->{Types}->{$File->{DOCTYPE}}; unless ($File->{Opt}->{FB}->{DOCTYPE} or ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) { &add_warning( 'W13', { W13_org => $org_dtd, W13_new => $File->{Opt}->{DOCTYPE}, } ); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } else { if ($File->{"DOCTYPEless OK"}) { &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}}); } elsif ($File->{Opt}->{FB}->{DOCTYPE}) { &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } else { &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } return $File; } # # Override inline charset declarations, for use e.g. when passing # transcoded results to external parsers that use them. sub override_charset ($$) { my ($File, $charset) = @_; my $ws = qr/[\x20\x09\x0D\x0A]/o; my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o; my $content = join("\n", @{$File->{Content}}); # Flatten newlines (so that we don't end up changing line numbers while # overriding) and comment-escape a string. sub escape_original ($) { my $str = shift; $str =~ tr/\r\n/ /; return &escape_comment($str); } # $content =~ s/( (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["'])) (${cs}) (\3.*?\?>) )/lc($4) eq lc($charset) ? "$1" : "$2$charset$5"/esx; # $content =~ s/( (]*?${ws}charset${ws}*=${ws}*["']?${ws}*) (${cs}) (.*?>) )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4"/esix; # $content =~ s/( (]*${ws} http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws} content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) (${cs}) (.*?>) )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4"/esix; # $content =~ s/( (]*${ws} content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) (${cs}) ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>) )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4"/esix; $File->{Content} = [split /\n/, $content]; } sub set_error_uri ($$) { my ($err, $uri) = @_; # We want errors in the doc that was validated to appear without # $err->{uri}, and non-doc errors with it pointing to the external entity # or the like where the error is. This usually works as long as we're # passing docs to parsers as strings, but S::P::O (at least as of 0.994) # seems to give us "3" as the FileName in those cases so we try to filter # out everything that doesn't look like a useful URI. if ($uri && index($uri, '/') != -1) { # Mask local file paths my $euri = URI->new($uri); if (!$euri->scheme() || $euri->scheme() eq 'file') { $err->{uri_is_file} = TRUE; $err->{uri} = ($euri->path_segments())[-1]; } else { $err->{uri} = $euri->canonical(); } } } # # Generate a HTML report of detected errors. sub report_errors ($) { my $File = shift; my $Errors = []; my %Errors_bytype; my $number_of_errors = 0; my $number_of_warnings = 0; my $number_of_info = 0; # for the sake of readability, at least until the xmlwf errors have # explanations, we push the errors from the XML parser at the END of the # error list. push @{$File->{Errors}}, @{$File->{WF_Errors}}; if (scalar @{$File->{Errors}}) { foreach my $err (@{$File->{Errors}}) { my $col = 0; # Populate source/context for errors in our doc that don't have it # already. Checkers should always have populated $err->{src} with # _something_ for non-doc errors. if (!defined($err->{src})) { my $line = undef; # Avoid truncating lines that do not exist. if (defined($err->{line}) && $File->{Content}->[$err->{line} - 1]) { if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) { ($line, $col) = &truncate_line( $File->{Content}->[$err->{line} - 1], $err->{char}); $line = &mark_error($line, $col); } elsif (defined($err->{line})) { $col = length($File->{Content}->[$err->{line} - 1]); $col = 80 if ($col > 80); ($line, $col) = &truncate_line( $File->{Content}->[$err->{line} - 1], $col); $line = &ent($line); $col = 0; } } else { $col = 0; } $err->{src} = $line; } my $explanation = ""; if ($err->{expl}) { } else { if ($err->{num}) { my $num = $err->{num}; $explanation .= Encode::decode_utf8( "\n $RSRC{msg}->{$num}->{verbose}\n") if exists $RSRC{msg}->{$num} && exists $RSRC{msg}->{$num}->{verbose}; my $_msg = $RSRC{msg}->{nomsg}->{verbose}; $_msg =~ s//$num/g; if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) { $_msg =~ s///g; } else { my $escaped_uri = uri_escape($File->{URI}); $_msg =~ s//$escaped_uri/g; } # The send feedback plea. $explanation = " $_msg\n$explanation"; $explanation =~ s///g; } $err->{expl} = $explanation; } $err->{col} = ' ' x $col; if ($err->{type} eq 'I') { $err->{class} = 'msg_info'; $err->{err_type_err} = 0; $err->{err_type_warn} = 0; $err->{err_type_info} = 1; $number_of_info += 1; } elsif ($err->{type} eq 'E') { $err->{class} = 'msg_err'; $err->{err_type_err} = 1; $err->{err_type_warn} = 0; $err->{err_type_info} = 0; $number_of_errors += 1; } elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) { $err->{class} = 'msg_warn'; $err->{err_type_err} = 0; $err->{err_type_warn} = 1; $err->{err_type_info} = 0; $number_of_warnings += 1; } # TODO other classes for "X" etc? FIXME find all types of message. push @{$Errors}, $err; if (($File->{Opt}->{'Group Errors'}) and (($err->{type} eq 'E') or ($err->{type} eq 'W') or ($err->{type} eq 'X')) ) { # index by num for errors and warnings only - info usually # gives context of error or warning if (!exists $Errors_bytype{$err->{num}}) { $Errors_bytype{$err->{num}}->{instances} = []; my $msg_text; if ($err->{num} eq 'xmlwf') { # FIXME need a catalog of errors from XML::LibXML $msg_text = "XML Parsing Error"; } elsif ($err->{num} eq 'html5') { $msg_text = "HTML5 Validator Error"; } else { $msg_text = $RSRC{msg}->{$err->{num}}->{original}; $msg_text =~ s/%1/X/; $msg_text =~ s/%2/Y/; } $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; $Errors_bytype{$err->{num}}->{type} = $err->{type}; $Errors_bytype{$err->{num}}->{class} = $err->{class}; $Errors_bytype{$err->{num}}->{err_type_err} = $err->{err_type_err}; $Errors_bytype{$err->{num}}->{err_type_warn} = $err->{err_type_warn}; $Errors_bytype{$err->{num}}->{err_type_info} = $err->{err_type_info}; } push @{$Errors_bytype{$err->{num}}->{instances}}, $err; } } } @$Errors = values(%Errors_bytype) if $File->{Opt}->{'Group Errors'}; # we are not sorting errors by line, as it would break the position # of auxiliary messages such as "start tag was here". We'll have to live # with the fact that XML well-formedness errors are listed first, then # validation errors #else { # sort error by lines # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; #} return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; } # # Chop the source line into 3 pieces; the character at which the error # was detected, and everything to the left and right of that position. # That way we can add markup to the relevant char without breaking &ent(). # Expects 1-based column indexes. sub mark_error ($$) { my $line = shift; my $col = shift; my $linelen = length($line); # Coerce column into an index valid within the line. if ($col < 1) { $col = 1; } elsif ($col > $linelen) { $col = $linelen; } $col--; my $left = substr($line, 0, $col); my $char = substr($line, $col, 1); my $right = substr($line, $col + 1); $char = &ent($char); $char = qq($char); $line = &ent($left) . $char . &ent($right); return $line; } # # Create a HTML representation of the document. sub source { my $File = shift; # Remove any BOM since we're not at BOT anymore... $File->{Content}->[0] = substr($File->{Content}->[0], 1) if ($File->{BOM} && scalar(@{$File->{Content}})); my @source = map({file_source_line => $_}, @{$File->{Content}}); return \@source; } sub match_DTD_FPI_SI { my ($File, $FPI, $SI) = @_; if ($CFG->{Types}->{$FPI}) { if ($CFG->{Types}->{$FPI}->{SysID}) { if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) { &add_warning( 'W26', { W26_dtd_pub => $FPI, W26_dtd_pub_display => $CFG->{Types}->{$FPI}->{Display}, W26_dtd_sys => $SI, W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID} } ); } } } else { # FPI not known, checking if the SI is while (my ($proper_FPI, $value) = each %{$CFG->{Types}}) { if ($value->{SysID} && $value->{SysID} eq $SI) { &add_warning( 'W26', { W26_dtd_pub => $FPI, W26_dtd_pub_display => $value->{Display}, W26_dtd_sys => $SI, W26_dtd_pub_recommend => $proper_FPI } ); } } } } # # Do an initial parse of the Document Entity to extract FPI. sub preparse_doctype { my $File = shift; # # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). $File->{DOCTYPE} = ''; $File->{Root} = ''; my $dtd = sub { return if $File->{Root}; # TODO: The \s and \w are probably wrong now that the strings are # utf8_on my $declaration = shift; my $doctype_type; my $doctype_secondpart; if ($declaration =~ //si ) { $File->{Root} = "html"; $File->{DOCTYPE} = "HTML5"; } elsif ($declaration =~ m()si ) { ( $File->{Root}, $doctype_type, $File->{DOCTYPE}, $doctype_secondpart ) = ($1, $2, $3, $4); if (($doctype_type eq "PUBLIC") and (($doctype_secondpart) = $doctype_secondpart =~ m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si) ) { &match_DTD_FPI_SI($File, $File->{DOCTYPE}, $doctype_secondpart); } } }; my $start = sub { my ($p, $tag, $attr) = @_; if ($File->{Root}) { return unless $tag eq $File->{Root}; } else { $File->{Root} = $tag; } if ($attr->{xmlns}) { $File->{Namespace} = $attr->{xmlns}; } if ($attr->{version}) { $File->{'Root Version'} = $attr->{version}; } if ($attr->{baseProfile}) { $File->{'Root BaseProfile'} = $attr->{baseProfile}; } # We're done parsing. $p->eof(); }; # we use HTML::Parser as pre-parser. May use html5lib or other in the future my $p = HTML::Parser->new(api_version => 3); # if content-type has shown we should pre-parse with XML mode, use that # otherwise (mostly text/html cases) use default mode $p->xml_mode(&is_xml($File)); $p->handler(declaration => $dtd, 'text'); $p->handler(start => $start, 'self,tag,attr'); my $line = 0; my $max = scalar(@{$File->{Content}}); $p->parse( sub { return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef; } ); $p->eof(); # TODO: These \s here are probably wrong now that the strings are utf8_on $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; $File->{DOCTYPE} =~ s(^\s+){ }g; $File->{DOCTYPE} =~ s(\s+$){ }g; $File->{DOCTYPE} =~ s(\s+) { }g; # Some document types actually need no doctype to be identified, # root element and some version attribute is enough # TODO applicable doctypes should be migrated to a config file? # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) { # if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'})) # { # if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; } # if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; } # if ($File->{'Root Version'} eq "1.0"){ # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN"; # $File->{"DOCTYPEless OK"} = TRUE; # $File->{Opt}->{DOCTYPE} = "SVG 1.0"; # } # if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) { # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN"; # $File->{"DOCTYPEless OK"} = TRUE; # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny"; # } # elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) { # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic"; # $File->{"DOCTYPEless OK"} = TRUE; # } # elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) { # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; # $File->{"DOCTYPEless OK"} = TRUE; # } # if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; } # if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; } # } # else { # # by default for an svg root elt, we use SVG 1.1 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; # $File->{"DOCTYPEless OK"} = TRUE; # } # } if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) { # doctypeless document type found, we fake the override # so that the parser will have something to validate against $File = &override_doctype($File); } return $File; } # # Preprocess CGI parameters. sub prepCGI { my $File = shift; my $q = shift; # The URL to this CGI script. $File->{Env}->{'Self URI'} = $q->url(); # Decode parameter values, set booleans the way we expect them. foreach my $param ($q->param()) { # 'uploaded_file' and 'fragment' contain data we treat as is. next if ($param eq 'uploaded_file' || $param eq 'fragment'); # Decode all other defined values as UTF-8. my @values = map { Encode::decode_utf8($_) } $q->param($param); $q->param($param, @values); # Skip parameters that should not be treated as booleans. next if $param =~ /^(?:accept(?:-(?:language|charset))?|ur[il])$/; # Keep false-but-set params. next if $q->param($param) eq '0'; # Parameters that are given to us without specifying a value get set # to a true value. $q->param($param, TRUE) unless $q->param($param); } $File->{Env}->{'Home Page'} = URI->new_abs(".", $File->{Env}->{'Self URI'}); # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # Set output mode; needed in get_error_template if we end up there. $File->{Opt}->{Output} = $q->param('output') || 'html'; # Issue a redirect for uri=referer. if ($q->param('uri') and $q->param('uri') eq 'referer') { if ($q->referer) { $q->param('uri', $q->referer); $q->param('accept', $q->http('Accept')) if ($q->http('Accept')); $q->param('accept-language', $q->http('Accept-Language')) if ($q->http('Accept-Language')); $q->param('accept-charset', $q->http('Accept-Charset')) if ($q->http('Accept-Charset')); print redirect(-uri => &self_url_q($q, $File), -vary => 'Referer'); exit; } else { # No Referer header was found. $File->{'Error Flagged'} = TRUE; &get_error_template($File)->param(fatal_referer_error => TRUE); } } # Supersede URL with an uploaded file. if ($q->param('uploaded_file')) { $q->param('uri', 'upload://' . $q->param('uploaded_file')); $File->{'Is Upload'} = TRUE; # Tag it for later use. } # Supersede URL with an uploaded fragment. if ($q->param('fragment')) { $q->param('uri', 'upload://Form Submission'); $File->{'Direct Input'} = TRUE; # Tag it for later use. } # Redirect to a GETable URL if method is POST without a file upload. if (defined $q->request_method and $q->request_method eq 'POST' and not($File->{'Is Upload'} or $File->{'Direct Input'})) { my $thispage = &self_url_q($q, $File); print redirect $thispage; exit; } # # Flag an error if we didn't get a file to validate. unless ($q->param('uri')) { $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_uri_error => TRUE); $tmpl->param(fatal_uri_scheme => 'undefined'); } return $q; } # # Set parse mode (SGML or XML) based on a number of preparsed factors: # * HTTP Content-Type # * Doctype Declaration # * XML Declaration # * XML namespaces sub set_parse_mode { my $File = shift; my $CFG = shift; my $fpi = $File->{DOCTYPE}; $File->{ModeChoice} = ''; my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; my $xmlws = qr/[\x20\x09\x0D\x0A]/o; # $File->{Mode} may have been set in parse_content_type # and it would come from the Media Type my $parseModeFromMimeType = $File->{Mode}; my $begincontent = join "\x20", @{$File->{Content}}; # for the sake of xml decl detection, # the 10 first lines should be safe my $parseModeFromXMLDecl = ( $begincontent =~ /^ ${xmlws}* # whitespace before the decl should not be happening # but we are greedy for the sake of detection, not validation <\?xml ${xmlws}+ # start matching an XML Declaration version ${xmlws}* = # for documents, version info is mandatory ${xmlws}* (["'])1.[01]\1 # hardcoding the existing XML versions. # Maybe we should use \d\.\d (?:${xmlws}+ encoding ${xmlws}* = ${xmlws}* (["'])[A-Za-z][a-zA-Z0-9_-]+\2 )? # encoding info is optional (?:${xmlws}+ standalone ${xmlws}* = ${xmlws}* (["'])(?:yes|no)\3 )? # ditto standalone info, optional ${xmlws}* \?> # end of XML Declaration /ox ? 'XML' : 'TBD' ); my $parseModeFromNamespace = 'TBD'; # http://www.w3.org/Bugs/Public/show_bug.cgi?id=9967 $parseModeFromNamespace = 'XML' if ($File->{Namespace} && $parseModeFromDoctype ne 'HTML5'); if (($parseModeFromMimeType eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromNamespace eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { # if the mime type is text/html (ambiguous, hence TBD mode) # and the doctype isn't in the catalogue # and XML prolog detection was unsuccessful # and we found no namespace at the root # ... throw in a warning &add_warning( 'W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} } ); return; } $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'HTML5' or $parseModeFromDoctype eq 'XML' or $parseModeFromNamespace eq 'XML'; if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromMimeType eq 'TBD') and ($parseModeFromNamespace eq 'TBD')) { # if all factors are useless to give us a parse mode # => we use SGML-based DTD validation as a default $File->{Mode} = 'DTD+SGML'; $File->{ModeChoice} = 'Fallback'; # and send warning about the fallback &add_warning( 'W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} } ); return; } if ($parseModeFromMimeType ne 'TBD') { # if The mime type gives clear indication of whether the document is # XML or not if (($parseModeFromDoctype ne 'TBD') and ($parseModeFromDoctype ne 'HTML5') and ($parseModeFromMimeType ne $parseModeFromDoctype)) { # if document-type recommended mode and content-type recommended # mode clash, shoot a warning # unknown doctypes will not trigger this # neither will html5 documents, which can be XML or not &add_warning( 'W07', { W07_mime => $File->{ContentType}, W07_ct => $parseModeFromMimeType, W07_dtd => $parseModeFromDoctype, } ); } # mime type has precedence, we stick to it $File->{ModeChoice} = 'Mime'; if ($parseModeFromDoctype eq "HTML5") { $File->{Mode} = 'HTML5+' . $File->{Mode}; } else { $File->{Mode} = 'DTD+' . $File->{Mode}; } return; } if ($parseModeFromDoctype ne 'TBD') { # the mime type is ambiguous (hence we didn't stop at the previous test) # but by now we're sure that the document type is a good indication # so we use that. if ($parseModeFromDoctype eq "HTML5") { if ($parseModeFromXMLDecl eq "XML" or $parseModeFromNamespace eq "XML") { $File->{Mode} = "HTML5+XML"; } else { $File->{Mode} = "HTML5"; } } else { # not HTML5 $File->{Mode} = "DTD+" . $parseModeFromDoctype; } $File->{ModeChoice} = 'Doctype'; return; } if ($parseModeFromXMLDecl ne 'TBD') { # the mime type is ambiguous (hence we didn't stop at the previous test) # and so was the doctype # but we found an XML declaration so we use that. if ($File->{Mode} eq "") { $File->{Mode} = "DTD+" . $parseModeFromXMLDecl; } elsif ((my $ix = index($File->{Mode}, '+')) != -1) { substr($File->{Mode}, $ix + 1) = $parseModeFromXMLDecl; } else { $File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl; } $File->{ModeChoice} = 'XMLDecl'; return; } # this is the last case. We know that all modes are not TBD, # yet mime type, doctype AND XML DECL tests have failed => we are saved # by the presence of namespaces if ($File->{Mode} eq "") { $File->{Mode} = "DTD+" . $parseModeFromNamespace; } elsif ((my $ix = index($File->{Mode}, '+')) != -1) { substr($File->{Mode}, $ix + 1) = $parseModeFromNamespace; } else { $File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace; } $File->{ModeChoice} = 'Namespace'; } # # Utility sub to tell if mode "is" XML. sub is_xml { index(shift->{Mode}, 'XML') != -1; } # # Check charset conflicts and add any warnings necessary. sub charset_conflicts { my $File = shift; # # Handle the case where there was no charset to be found. unless ($File->{Charset}->{Use}) { &add_warning('W17', {}); $File->{Tentative} |= T_WARN; } # # Add a warning if there was charset info conflict (HTTP header, # XML declaration, or element). # filtering out some of the warnings in direct input mode where HTTP # encoding is a "fake" if (( charset_not_equal( $File->{Charset}->{HTTP}, $File->{Charset}->{XML} ) ) and not($File->{'Direct Input'}) ) { &add_warning( 'W18', { W18_http => $File->{Charset}->{HTTP}, W18_xml => $File->{Charset}->{XML}, W18_use => $File->{Charset}->{Use}, } ); } elsif ( charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not($File->{'Direct Input'})) { &add_warning( 'W19', { W19_http => $File->{Charset}->{HTTP}, W19_meta => $File->{Charset}->{META}, W19_use => $File->{Charset}->{Use}, } ); } elsif ( charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) { &add_warning( 'W20', { W20_xml => $File->{Charset}->{XML}, W20_meta => $File->{Charset}->{META}, } ); $File->{Tentative} |= T_WARN; } return $File; } # # Transcode to UTF-8 sub transcode { my $File = shift; my $general_charset = $File->{Charset}->{Use}; my $exact_charset = $general_charset; # TODO: This should be done before transcode() if ($general_charset eq 'utf-16') { if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { $exact_charset = $File->{Charset}->{Auto}; } else { $exact_charset = 'utf-16be'; } } my $cs = $exact_charset; if ($CFG->{Charsets}->{$cs}) { if (index($CFG->{Charsets}->{$cs}, 'ERR ') != -1) { # The encoding is not supported due to policy $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_transcode_error => TRUE); $tmpl->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $tmpl->param(fatal_transcode_errmsg => 'This encoding is not supported by the validator.'); return $File; } elsif (index($CFG->{Charsets}->{$cs}, 'X ') != -1) { # possibly problematic, we recommend another alias my $recommended_charset = $CFG->{Charsets}->{$cs}; $recommended_charset =~ s/X //; &add_warning( 'W22', { W22_declared => $cs, W22_suggested => $recommended_charset, } ); } } # Does the system support decoding this encoding? my $enc = Encode::find_encoding($cs); if (!$enc) { # This system's Encode installation does not support # the character encoding; might need additional modules $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_transcode_error => TRUE); $tmpl->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $tmpl->param(fatal_transcode_errmsg => 'Encoding not supported.'); return $File; } elsif (!$CFG->{Charsets}->{$cs}) { # not in the list, but technically OK -> we warn &add_warning('W24', {W24_declared => $cs,}); } my $output; my $input = $File->{Bytes}; # Try to transcode eval { $output = $enc->decode($input, Encode::FB_CROAK); }; if ($@) { # Transcoding failed - do it again line by line to find out exactly # where my $line_num = 0; while ($input =~ /(.*?)(?:\r\n|\n|\r|\z)/g) { $line_num++; eval { $enc->decode($1, Encode::FB_CROAK); }; if ($@) { my $croak_message = $@; $croak_message =~ s/ at .*//; $File->{'Error Flagged'} = TRUE; my $tmpl = &get_error_template($File); $tmpl->param(fatal_byte_error => TRUE); $tmpl->param(fatal_byte_lines => $line_num); $tmpl->param(fatal_byte_charset => $cs); $tmpl->param(fatal_byte_error_msg => $croak_message); last; } } return $File; } # @@FIXME is this what we want? $output =~ s/\015?\012/\n/g; # make sure we deal only with unix newlines # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 $output =~ s/(\r\n|\n|\r)/\n/g; #debug: we could check if the content has utf8 bit on with #$output= utf8::is_utf8($output) ? 1 : 0; $File->{Content} = [split /\n/, $output]; return $File; } sub find_encodings { my $File = shift; my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); if (defined $bom) { # @@FIXME this BOM entry should not be needed at all! $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); $File->{Charset}->{Auto} = lc $bom; } else { $File->{Charset}->{Auto} = lc($first[0]) if @first; } my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); $File->{Charset}->{XML} = lc $xml if defined $xml; my %metah; foreach my $try (@first) { # @@FIXME I think the old code used HTML::Parser xml mode, check if ok my $meta = HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); $metah{lc($meta)}++ if defined $meta and length $meta; } if (!%metah) { # HTML::Encoding doesn't support HTML5 as of 0.60, # check it ourselves. HTML::HeadParser >= 3.60 is required for this. my $hp = HTML::HeadParser->new(); my $seen_doctype = FALSE; my $is_html5 = FALSE; $hp->handler( declaration => sub { my ($tag, $text) = @_; return if ($seen_doctype || uc($tag) ne '!DOCTYPE'); $seen_doctype = TRUE; $is_html5 = TRUE if ( $text =~ //six ); }, 'tag,text' ); $hp->parse($File->{Bytes}); if ($is_html5) { my $cs = $hp->header('X-Meta-Charset'); $metah{lc($cs)}++ if (defined($cs) && length($cs)); } } if (%metah) { my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; $File->{Charset}->{META} = $meta[0]; } return $File; } # # Abort with a message if an error was flagged at point. sub abort_if_error_flagged { my $File = shift; return unless $File->{'Error Flagged'}; return if $File->{'Error Handled'}; # Previous error, keep going. my $tmpl = &get_error_template($File); $tmpl->param(fatal_error => TRUE); &prep_template($File, $tmpl); # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8', $tmpl->output); exit; } # # conflicting encodings sub charset_not_equal { my $encodingA = shift; my $encodingB = shift; return $encodingA && $encodingB && ($encodingA ne $encodingB); } # # Construct a self-referential URL from a CGI.pm $q object. sub self_url_q { my ($q, $File) = @_; my $thispage = $File->{Env}->{'Self URI'} . '?'; # Pass-through parameters for my $param (qw(uri accept accept-language accept-charset)) { $thispage .= "$param=" . uri_escape($q->param($param)) . ';' if $q->param($param); } # Boolean parameters for my $param (qw(ss outline No200 verbose group)) { $thispage .= "$param=1;" if $q->param($param); } # Others if ($q->param('doctype') and $q->param('doctype') !~ /(?:Inline|detect)/i) { $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; } if ($q->param('charset') and $q->param('charset') !~ /detect/i) { $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; } $thispage =~ s/[\?;]$//; return $thispage; } # # Construct a self-referential URL from a $File object. sub self_url_file { my $File = shift; my $thispage = $File->{Env}->{'Self URI'}; my $escaped_uri = uri_escape($File->{URI}); $thispage .= qq(?uri=$escaped_uri); $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; $thispage .= ';outline=1' if $File->{Opt}->{Outline}; $thispage .= ';No200=1' if $File->{Opt}->{No200}; $thispage .= ';verbose=1' if $File->{Opt}->{Verbose}; $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'}) if $File->{Opt}->{'Accept Header'}; $thispage .= ';accept-language=' . uri_escape($File->{Opt}->{'Accept-Language Header'}) if $File->{Opt}->{'Accept-Language Header'}; $thispage .= ';accept-charset=' . uri_escape($File->{Opt}->{'Accept-Charset Header'}) if $File->{Opt}->{'Accept-Charset Header'}; return $thispage; } ##### package W3C::Validator::EventHandler; # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 sub new { my $class = shift; my $parser = shift; my $File = shift; my $CFG = shift; my $self = {_file => $File, CFG => $CFG, _parser => $parser}; bless $self, $class; } sub start_element { my ($self, $element) = @_; my $has_xmlns = FALSE; my $xmlns_value = undef; # If in XML mode, find namespace used for each element. if ((my $attr = $element->{Attributes}->{xmlns}) && &W3C::Validator::MarkupValidator::is_xml($self->{_file})) { $xmlns_value = ""; # Try with SAX method if ($attr->{Value}) { $has_xmlns = TRUE; $xmlns_value = $attr->{Value}; } #next if $has_xmlns; # The following is not SAX, but OpenSP specific. my $defaulted = $attr->{Defaulted} || ''; if ($defaulted eq "specified") { $has_xmlns = TRUE; $xmlns_value .= join("", map { $_->{Data} } @{$attr->{CdataChunks}}); } } my $doctype = $self->{_file}->{DOCTYPE}; if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) || $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name}) { # add to list of non-root namespaces push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; } elsif (!$has_xmlns && $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"}) { # whine if the root xmlns attribute is noted as required by spec, # but not present my $err = {}; my $location = $self->{_parser}->get_location(); &W3C::Validator::MarkupValidator::set_error_uri($err, $location->{FileName}); # S::P::O does not provide src context, set to empty for non-doc errors. $err->{src} = "" if $err->{uri}; $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "no-xmlns"; $err->{type} = "E"; $err->{msg} = "Missing xmlns attribute for element $element->{Name}. The " . "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } elsif ($has_xmlns and (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace})) { # whine if root xmlns element is not the one specificed by the spec my $err = {}; my $location = $self->{_parser}->get_location(); &W3C::Validator::MarkupValidator::set_error_uri($err, $location->{FileName}); # S::P::O does not provide src context, set to empty for non-doc errors. $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "wrong-xmlns"; $err->{type} = "E"; $err->{msg} = "Wrong xmlns attribute for element $element->{Name}. The " . "value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } } sub error { my $self = shift; my $error = shift; my $mess; eval { $mess = $self->{_parser}->split_message($error); }; if ($@) { # this is a message that S:P:O could not handle, we skip its croaking return; } my $File = $self->{_file}; my $err = {}; &W3C::Validator::MarkupValidator::set_error_uri($err, $self->{_parser}->get_location()->{FileName}); # S::P::O does not provide src context, set to empty for non-doc errors. $err->{src} = "" if $err->{uri}; $err->{line} = $mess->{primary_message}{LineNumber}; $err->{char} = $mess->{primary_message}{ColumnNumber} + 1; $err->{num} = $mess->{primary_message}{Number}; $err->{type} = $mess->{primary_message}{Severity}; $err->{msg} = $mess->{primary_message}{Text}; # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, # so we filter out a few errors for now my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File); if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { # the error is about a missing xmlns: attribute definition" return; # this is not an error, 'cause we said so } if ($err->{num} eq '187') # filtering out no "document type declaration; will parse without # validation" if root element is not html and mode is xml... { # since parsing was done without validation, result can only be # "well-formed" if ($is_xml and lc($File->{Root}) ne 'html') { $File->{XMLWF_ONLY} = TRUE; W3C::Validator::MarkupValidator::add_warning('W09xml', {}); return; # don't report this as an error, just proceed } # if mode is not XML, we do report the error. It should not happen in # the case of without doctype, in that case the error message # will be #344 } if (($err->{num} eq '113') and index($err->{msg}, 'xml:space') != -1) { # FIXME # this is a problem with some of the "flattened" W3C DTDs, filtering # them out to not confuse users. hoping to get the DTDs fixed, see # http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html return; # don't report this, just proceed } if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) { # we are in XML mode, we have a namespace, but no doctype. # the validator will already have said "no doctype, falling back to # default" above # no need to report this. return; # don't report this, just proceed } if (($err->{num} eq '248') or ($err->{num} eq '247') or ($err->{num} eq '246')) { # these two errors should be triggered by -wmin-tag to report shorttag # used, but we're making them warnings, not errors # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 $err->{type} = "W"; } # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors # beyond EOL. If you see this warning in your web server logs, please # let the validator developers know, see http://validator.w3.org/feedback.html # As long as $err may be from somewhere else than the document (such as # from a DTD) and we have no way of identifying these cases, this # produces bogus results and error log spewage, so commented out for now. # if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { # warn("Warning: reported error column larger than line length " . # "($err->{char} > $l) in $File->{URI} line $err->{line}, " . # "OpenSP bug? Resetting to line length."); # $err->{char} = $l; # } # No or unknown FPI and a relative SI. if ($err->{msg} =~ m(cannot (?:open|find))) { $File->{'Error Flagged'} = TRUE; my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); $tmpl->param(fatal_parse_extid_error => TRUE); $tmpl->param(fatal_parse_extid_msg => $err->{msg}); } # No DOCTYPE found! We are falling back to vanilla DTD if (index($err->{msg}, "prolog can't be omitted") != -1) { if (lc($File->{Root}) eq 'html') { my $dtd = $File->{"Default DOCTYPE"}->{$is_xml ? "XHTML" : "HTML"}; W3C::Validator::MarkupValidator::add_warning('W09', {W09_dtd => $dtd}); } else { # not html root element, we are not using fallback unless ($is_xml) { $File->{'Is Valid'} = FALSE; W3C::Validator::MarkupValidator::add_warning('W09nohtml', {}); } } return; # Don't report this as a normal error. } # TODO: calling exit() here is probably a bad idea W3C::Validator::MarkupValidator::abort_if_error_flagged($File); push @{$File->{Errors}}, $err; # ... $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; if (defined $mess->{aux_message}) { # "duplicate id ... first defined here" style messages push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, char => $mess->{aux_message}{ColumnNumber} + 1, msg => $mess->{aux_message}{Text}, type => 'I', }; } } package W3C::Validator::EventHandler::Outliner; # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 use base qw(W3C::Validator::EventHandler); sub new { my $class = shift; my $parser = shift; my $File = shift; my $CFG = shift; my $self = $class->SUPER::new($parser, $File, $CFG); $self->{am_in_heading} = 0; $self->{heading_text} = []; bless $self, $class; } sub data { my ($self, $chars) = @_; push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading}; } sub start_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h([1-6])$/i) { $self->{_file}->{heading_outline} ||= ""; $self->{_file}->{heading_outline} .= " " x int($1) . "[$element->{Name}] "; $self->{am_in_heading} = 1; } return $self->SUPER::start_element($element); } sub end_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h[1-6]$/i) { my $text = join("", @{$self->{heading_text}}); $text =~ s/^\s+//g; $text =~ s/\s+/ /g; $text =~ s/\s+$//g; $self->{_file}->{heading_outline} .= "$text\n"; $self->{am_in_heading} = 0; $self->{heading_text} = []; } } ##### package W3C::Validator::UserAgent; use HTTP::Message qw(); use LWP::UserAgent 2.032 qw(); # Need 2.032 for default_header() use Net::hostent qw(gethostbyname); use Net::IP qw(); use Socket qw(inet_ntoa); use base qw(LWP::UserAgent); BEGIN { # The 4k default line length in LWP <= 5.832 isn't enough for example to # accommodate 4kB cookies (RFC 2985); bump it (#6678). require LWP::Protocol::http; push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024); } sub new { my ($proto, $CFG, $File, @rest) = @_; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@rest); $self->{'W3C::Validator::CFG'} = $CFG; $self->{'W3C::Validator::File'} = $File; $self->env_proxy(); $self->agent($File->{Opt}->{'User Agent'}); $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); # Don't parse the http-equiv stuff. $self->parse_head(0); # Tell caches in the middle we want a fresh copy (Bug 4998). $self->default_header('Cache-Control' => 'max-age=0'); # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle $self->default_header( 'Accept-Encoding' => scalar HTTP::Message::decodable()) if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable')); # Our timeout should be set to something lower than the web server's, # remembering to give some head room for the actual validation to take # place after the document has been fetched (something like 15 seconds # should be plenty). validator.w3.org instances have their timeout set # to 60 seconds as of writing this (#4985, #6950). $self->timeout(45); return $self; } sub redirect_ok { my ($self, $req, $res) = @_; return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); } sub uri_ok { my ($self, $uri) = @_; return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} || !$uri->can('host')); my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5}; if ($h5uri) { my $clone = $uri->clone(); $clone->query(undef); $clone->fragment(undef); $h5uri = URI->new($h5uri); $h5uri->query(undef); $h5uri->fragment(undef); return 1 if $clone->eq($h5uri); } my $addr = my $iptype = undef; if (my $host = gethostbyname($uri->host())) { $addr = inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); $tmpl->param(fatal_ip_error => 1); $tmpl->param(fatal_ip_host => $uri->host() || 'undefined'); $tmpl->param(fatal_ip_hostname => 1) if ($addr and $uri->host() ne $addr); return 0; } my $port = $uri->port(); if ($port !~ m/^(8[0-9]|443)$/ && $port < 1024) { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); $tmpl->param(fatal_port_error => 1); return 0; } my $scheme = $uri->scheme(); if ($scheme !~ m/^(http|https|ftp|urn)$/ ) { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; my $tmpl = &W3C::Validator::MarkupValidator::get_error_template($File); $tmpl->param(fatal_port_error => 1); return 0; } return 1; } # Local Variables: # mode: perl # indent-tabs-mode: nil # cperl-indent-level: 4 # cperl-continued-statement-offset: 4 # cperl-brace-offset: -4 # perl-indent-level: 4 # End: # ex: ts=4 sw=4 et