#!/usr/bin/perl -T # # W3C Markup Validation Service # A CGI script to retrieve and validate a markup file # # Copyright 1995-2009 World Wide Web Consortium, (Massachusetts # Institute of Technology, European Research Consortium for Informatics # and Mathematics, Keio University). All Rights Reserved. # # Originally written by Gerald Oskoboiny # for additional contributors, see http://dev.w3.org/cvsweb/validator/ # and http://validator.w3.org/about.html#credits # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # $Id: check,v 1.710 2009-10-05 19:49:45 ville Exp $ # # Disable buffering on STDOUT! $| = 1; # # We need Perl 5.8.0+. use 5.008; ############################################################################### #### Load modules. ############################################################ ############################################################################### # # Pragmas. use strict; use warnings; use utf8; package W3C::Validator::MarkupValidator; # # Modules. See also the BEGIN block further down below. # # Version numbers given where we absolutely need a minimum version of a given # module (gives nicer error messages). By default, add an empty import list # when loading modules to prevent non-OO or poorly written modules from # polluting our namespace. # use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect); use CGI::Carp qw(carp croak fatalsToBrowser); use Config::General 2.32 qw(); # Need 2.32 for , rt.cpan.org#17852 use Encode qw(); use Encode::Alias qw(); use Encode::HanExtra qw(); # for some chinese character encodings, # e.g gb18030 use File::Spec::Functions qw(catfile); use HTML::Encoding 0.52 qw(); use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref) use HTML::Template 2.6 qw(); # Need 2.6 for path param, other things. use HTTP::Headers::Util qw(); use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content() use HTTP::Request qw(); use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. use JSON 2.00 qw(); use SGML::Parser::OpenSP 0.991 qw(); use URI qw(); use URI::Escape qw(uri_escape); use XML::LibXML qw(); ############################################################################### #### Constant definitions. #################################################### ############################################################################### # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing use constant O_SOURCE => 1; # 0000 0001 use constant O_CHARSET => 2; # 0000 0010 use constant O_DOCTYPE => 4; # 0000 0100 use constant O_NONE => 8; # 0000 1000 # # Define global variables. use vars qw($DEBUG $CFG %RSRC $VERSION); use constant IS_MODPERL2 => (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); # # Things inside BEGIN don't happen on every request in persistent environments # (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { # Launder data for -T; -AutoLaunder doesn't catch this one. if (exists $ENV{W3C_VALIDATOR_HOME}) { $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; $ENV{W3C_VALIDATOR_HOME} = $1; } # # Read Config Files. eval { my %config_opts = ( -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), -MergeDuplicateOptions => TRUE, -MergeDuplicateBlocks => TRUE, -SplitPolicy => 'equalsign', -UseApacheInclude => TRUE, -IncludeRelative => TRUE, -InterPolateVars => TRUE, -AutoLaunder => TRUE, -AutoTrue => TRUE, -DefaultConfig => { Protocols => {Allow => 'http,https'}, Paths => { Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), }, External => { HTML5 => FALSE, }, }, ); my %cfg = Config::General->new(%config_opts)->getall(); $CFG = \%cfg; }; if ($@) { die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. } # # Check a filesystem path for existance and "readability". sub pathcheck (@) { my %paths = map { $_ => [-d $_, -r _] } @_; my @_d = grep {not $paths{$_}->[0]} keys %paths; my @_r = grep {not $paths{$_}->[1]} keys %paths; return TRUE if (scalar(@_d) + scalar(@_r) == 0); die <<".EOF." if scalar @_d; Does not exist or is not a directory: @_d .EOF. die <<".EOF." if scalar @_r; Directory not readable (permission denied): @_r .EOF. } # # Check paths in config... # @@FIXME: This does not do a very good job error-message-wise if a path is # @@FIXME: missing...; { my @dirs = (); push @dirs, $CFG->{Paths}->{Base}; push @dirs, $CFG->{Paths}->{Templates}; push @dirs, $CFG->{Paths}->{SGML}->{Library}; &pathcheck(@dirs); } # # Split allowed protocols into a list. if (my $allowed = delete($CFG->{Protocols}->{Allow})) { $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; } # Split available languages into a list if (my $langs = delete($CFG->{Languages})) { $CFG->{Languages} = [split(/\s+/, $langs)]; } else { # Default to english $CFG->{Languages} = ["en"]; } { # Make types config indexed by FPI. my $_types = {}; map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} } keys %{$CFG->{Types}}; $CFG->{Types} = $_types; } # # Change strings to internal constants in MIME type mapping. for (keys %{$CFG->{MIME}}) { $CFG->{MIME}->{$_} = 'TBD' unless $CFG->{MIME}->{$_} eq 'SGML' or $CFG->{MIME}->{$_} eq 'XML'; } # # Register Encode aliases. while (my ($key, $value) = each %{$CFG->{Charsets}}) { Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/); } # # Set debug flag. if ($CFG->{'Allow Debug'}) { $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; } else { $DEBUG = FALSE; } # # Strings $VERSION = q$Revision: 1.710 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # # Use passive FTP by default. $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); # Read friendly error message file # 'en_US' should be replaced by $lang for lang-neg %RSRC = Config::General->new( -MergeDuplicateBlocks => 1, -ConfigFile => catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), )->getall(); eval { local $SIG{__DIE__}; require Encode::JIS2K; # for optional extra Japanese encodings }; } # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; #@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. #use Data::Dumper qw(Dumper); #print Dumper($CFG); #exit; #@@DEBUG; ############################################################################### #### Process CGI variables and initialize. #################################### ############################################################################### # # Create a new CGI object. my $q = new CGI; # # The data structure that will hold all session data. # @@FIXME This can't be my() as $File will sooner or # later be undef and add_warning will cause the script # to die. our() seems to work but has other problems. # @@FIXME Apparently, this must be set to {} also, # otherwise the script might pick up an old object # after abort_if_error_flagged under mod_perl. our $File = {}; ################################# # Initialize the datastructure. # ################################# # # Charset data (casing policy: lowercase early). $File->{Charset}->{Use} = ''; # The charset used for validation. $File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F) $File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter. $File->{Charset}->{META} = ''; # From HTML's . $File->{Charset}->{XML} = ''; # From the XML Declaration. $File->{Charset}->{Override} = ''; # From CGI/user override. # # Misc simple types. $File->{Mode} = 'DTD+SGML'; # Default parse mode is DTD validation in SGML mode. # By default, perform validation (we may perform only xml-wf in some cases) $File->{XMLWF_ONLY} = FALSE; # # Listrefs. $File->{Warnings} = []; # Warnings... $File->{Namespaces} = []; # Other (non-root) Namespaces. # By default, doctype-less documents can not be valid $File->{"DOCTYPEless OK"} = FALSE; $File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional'; $File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional'; ############################################################################### #### Generate Template for Result. ############################################ ############################################################################### # first we determine the chosen language based on # 1) lang argument given as parameter (if this language is available) # 2) HTTP language negotiation between variants available and user-agent choices # 3) English by default my $lang = $q->param('lang') || ''; my @localizations; foreach my $lang_available (@{$CFG->{Languages}}) { if ($lang eq $lang_available) { # Requested language (from parameters) is available, just use it undef @localizations; last; } push @localizations, [$lang_available, 1, 'text/html', undef, 'utf-8', $lang_available, undef]; } # If language is not chosen yet, use HTTP-based negotiation if (@localizations) { require HTTP::Negotiate; $lang = HTTP::Negotiate::choose(\@localizations); } # HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0 $lang ||= 'en_US'; if ($lang eq "en") { $lang = 'en_US'; # legacy } my %template_defaults = ( die_on_bad_params => FALSE, cache => TRUE, path => [ catfile($CFG->{Paths}->{Templates}, $lang) ], filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); }, ); $File->{Templates}->{Result} = HTML::Template->new( %template_defaults, filename => 'result.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{Error} = HTML::Template->new( %template_defaults, filename => 'fatal-error.tmpl', ); $File->{Templates}->{AuthzReq} = HTML::Template->new( %template_defaults, filename => 'http_401_authrequired.tmpl', ); # templates for alternate output formats $File->{Templates}->{XML} = HTML::Template->new( %template_defaults, filename => 'xml_output.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{SOAP} = HTML::Template->new( %template_defaults, filename => 'soap_output.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{SOAPFault} = HTML::Template->new( %template_defaults, filename => 'soap_fault.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{SOAPDisabled} = HTML::Template->new( %template_defaults, filename => 'soap_disabled.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{UCN} = HTML::Template->new( %template_defaults, filename => 'ucn_output.tmpl', loop_context_vars => TRUE, ); $File->{Templates}->{EARLXML} = HTML::Template->new( %template_defaults, filename => 'earl_xml.tmpl', loop_context_vars => TRUE, global_vars => TRUE, ); $File->{Templates}->{EARLN3} = HTML::Template->new( %template_defaults, filename => 'earl_n3.tmpl', loop_context_vars => TRUE, global_vars => TRUE, ); $File->{Templates}->{JSON} = HTML::Template->new( %template_defaults, filename => 'json_output.tmpl', loop_context_vars => TRUE, global_vars => TRUE, ); $File->{Templates}->{PrefillHTML} = HTML::Template->new( %template_defaults, filename => 'prefill_html401.tmpl', loop_context_vars => TRUE, global_vars => TRUE, ); $File->{Templates}->{PrefillXHTML} = HTML::Template->new( %template_defaults, filename => 'prefill_xhtml10.tmpl', loop_context_vars => TRUE, global_vars => TRUE, ); $File->{Templates}->{Result}->param(cfg_home_page => $CFG->{'Home Page'}); $File->{Templates}->{SOAP}->param(cfg_home_page => $CFG->{'Home Page'}); $File->{Templates}->{UCN}->param(cfg_home_page => $CFG->{'Home Page'}); undef $lang; undef %template_defaults; ######################################### # Populate $File->{Opt} -- CGI Options. # ######################################### # # Preprocess the CGI parameters. $q = &prepCGI($File, $q); # # Set session switches. $File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE; $File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; $File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; $File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE; $File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; $File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE; $File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE; $File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE; $File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401'; $File->{Opt}->{Charset} = lc($q->param('charset') || ''); $File->{Opt}->{DOCTYPE} = $q->param('doctype') || ''; $File->{Opt}->{Output} = $q->param('output') || 'html'; $File->{Opt}->{'User Agent'} = $q->param('user-agent') && $q->param('user-agent') ne 1 ? $q->param('user-agent') : "W3C_Validator/$VERSION"; $File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d; if ($File->{Opt}->{'User Agent'} eq 'mobileok') { $File->{Opt}->{'User Agent'} = 'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)'; } $File->{Opt}->{'Accept Header'} = $q->param('accept') || ''; $File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || ''; $File->{Opt}->{'Accept-Charset Header'} = $q->param('accept-charset') || ''; $File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header'); # # "Fallback" info for Character Encoding (fbc), Content-Type (fbt), # and DOCTYPE (fbd). If TRUE, the Override values are treated as # Fallbacks instead of Overrides. $File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE; $File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE; $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; # # If ";debug" was given, let it overrule the value from the config file, # regardless of whether it's "0" or "1" (on or off), but only if config # allows the debugging options. if ($CFG->{'Allow Debug'}) { $DEBUG = $q->param('debug') if defined $q->param('debug'); $File->{Opt}->{Verbose} = TRUE if $DEBUG; } else { $DEBUG = FALSE; # The default. } $File->{Opt}->{Debug} = $DEBUG; &abort_if_error_flagged($File, O_NONE); # # Get the file and metadata. if ($q->param('uploaded_file')) { $File = &handle_file($q, $File); } elsif ($q->param('fragment')) { $File = &handle_frag($q, $File); } elsif ($q->param('uri')) { $File = &handle_uri($q, $File); } # # Abort if an error was flagged during initialization. &abort_if_error_flagged($File, 0); # # Get rid of the CGI object. undef $q; # # We don't need STDIN any more, so get rid of it to avoid getting clobbered # by Apache::Registry's idiotic interference under mod_perl. untie *STDIN; ############################################################################### #### Output validation results. ############################################### ############################################################################### $File = find_encodings($File); # # Decide on a charset to use (first part) # if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; } elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) { # Act as if $http_charset was 'us-ascii'. (MIME rules) $File->{Charset}->{Use} = 'us-ascii'; &add_warning('W01', { W01_upload => $File->{'Is Upload'}, W01_agent => $File->{Server}, W01_ct => $File->{ContentType}, }); } elsif ($File->{Charset}->{XML}) { $File->{Charset}->{Use} = $File->{Charset}->{XML}; } elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) { $File->{Charset}->{Use} = 'utf-16'; } elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) { $File->{Charset}->{Use} = "utf-8"; } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) } $File->{Charset}->{Use} ||= $File->{Charset}->{META}; # # Handle any Fallback or Override for the charset. if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { # charset=foo was given to the CGI and it wasn't "autodetect" or empty. # # Extract the user-requested charset from CGI param. my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); $File->{Charset}->{Override} = lc($override); if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode unless ($File->{Charset}->{Use}) { # no charset detected, actual fallback &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } else { # charset "hard override" mode if (! $File->{Charset}->{Use}) { # overriding "nothing" &add_warning('W04', { W04_charset => $File->{Charset}->{Override}, W04_override => TRUE}); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) { # Actually overriding something; warn about override. &add_warning('W03', { W03_use => $File->{Charset}->{Use}, W03_opt => $File->{Charset}->{Override}}); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } } if ($File->{'Direct Input'}) { #explain why UTF-8 is forced &add_warning('W28', {}); } unless ($File->{Charset}->{XML} || $File->{Charset}->{META}){ #suggest character encoding info within doc &add_warning('W27', {}); } # # Abort if an error was flagged while finding the encoding. &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE); $File->{Charset}->{Default} = FALSE; unless ($File->{Charset}->{Use}) { # No charset given... $File->{Charset}->{Use} = 'utf-8'; $File->{Charset}->{Default} = TRUE; $File->{Tentative} |= T_ERROR; # Can never be valid. &add_warning('W04', {W04_charset => "UTF-8"}); } # Always transcode, even if the content claims to be UTF-8 $File = transcode($File); if (($File->{ContentType} eq "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) { $File->{'Error Flagged'} = FALSE; # reset # we try again, this time with win-1252 $File->{Charset}->{Use} = 'windows-1252'; &add_warning('W04', {W04_charset => "windows-1252", W04_also_tried=> "UTF-8"}); $File = transcode($File); } if (($File->{ContentType} eq "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) { $File->{'Error Flagged'} = FALSE; # reset # we try again, this time with latin1... $File->{Charset}->{Use} = 'iso-8859-1'; &add_warning('W04', {W04_charset => "iso-8859-1", W04_also_tried => "UTF-8, windows-1252"}); $File = transcode($File); } # if it still does not work, we abandon hope here &abort_if_error_flagged($File, O_CHARSET); # # Add a warning if doc is UTF-8 and contains a BOM. if ($File->{Charset}->{Use} eq 'utf-8' && $File->{Content}->[0] =~ m(^\x{FEFF})) { &add_warning('W21', {}); } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element, or XML declaration # 4. if no doctype and XML mode, check for XML well-formedness # 5. otherwise , punt. # # # Override DOCTYPE if user asked for it. if ($File->{Opt}->{DOCTYPE}) { if ($File->{Opt}->{DOCTYPE} !~ /(Inline|detect)/i) { $File = &override_doctype($File); } else { # Get rid of inline|detect for easy truth value checking later $File->{Opt}->{DOCTYPE} = ''; } } # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); if ($File->{Opt}->{DOCTYPE} eq "HTML5") { $File->{DOCTYPE} = "HTML5"; $File->{Version} = $File->{DOCTYPE}; } set_parse_mode($File, $CFG); # # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); # before we start the parsing, clean slate $File->{'Is Valid'} = TRUE; $File->{Errors} = []; # preparse with XML parser if necessary # we should really be using a SAX ErrorHandler, but I can't find # a way to make it work with XML::LibXML::SAX::Parser... ** FIXME ** # ditto, we should try using W3C::Validator::EventHandler, # but it's badly linked to opensp at the moment if (&is_xml($File)) { if ($File->{DOCTYPE} eq "HTML5") { # $File->{DOCTYPE} = "XHTML5"; # $File->{Version} = "XHTML5"; } else { my $xmlparser = XML::LibXML->new(); $xmlparser->line_numbers(1); $xmlparser->validation(0); $xmlparser->load_ext_dtd(0); # [NOT] loading the XML catalog for entities resolution as it seems to cause a lot of unnecessary DTD/entities fetching (requires >= 1.53 if enabled) #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc')); my $xml_string = join"\n",@{$File->{Content}}; # the XML parser will check the value of encoding attribute in XML declaration # so we have to amend it to reflect transcoding. see Bug 4867 $xml_string =~ s/(<\?xml.*) (encoding[\x20|\x09|\x0D|\x0A]*=[\x20|\x09|\x0D|\x0A]*(?:"[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+')) (.*\?>)/$1encoding="utf-8"$3/sx; eval { $xmlparser->parse_string($xml_string); }; $xml_string = undef; my $xml_parse_errors_line = undef; my @xmlwf_error_list; my @xmlwf_obj_error_list; if (ref($@)) { # handle a structured error (XML::LibXML::Error object) # (lib XML::LibXML > 0.66, but will work MUCH better > 0.69 ) push (@xmlwf_obj_error_list, $@); my $err_obj = $@; while($err_obj->_prev()) { $err_obj = $err_obj->_prev(); unshift(@xmlwf_obj_error_list, $err_obj); } my $num_xmlwf_error = 0; foreach my $err_obj (@xmlwf_obj_error_list){ my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $err_obj->line(); # -> column() is available in XML::LibXML >= 1.69_2 $err->{char} = eval { $err_obj->column() }; $err->{num} = "libxml2-".$err_obj->code(); $err->{type} = "E"; $err->{msg} = $err_obj->message(); # The validator will sometimes fail to dereference entities files # we're filtering the bogus resulting error if ($err->{msg} =~ /Entity '\w+' not defined/) { $err = undef; next; } push (@xmlwf_error_list, $err); $num_xmlwf_error++; } } elsif ($@) { my $xmlwf_errors = $@; my $xmlwf_error_line = undef; my $xmlwf_error_col = undef; my $xmlwf_error_msg = undef; my $got_error_message = 0; my $got_quoted_line = 0; my $num_xmlwf_error = 0; foreach my $msg_line (split "\n", $xmlwf_errors){ $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; # first we get the actual error message if (($got_error_message eq 0) and ($msg_line =~ /^(:\d+:)( parser error : .*)/ )) { $xmlwf_error_line = $1; $xmlwf_error_msg = $2; $xmlwf_error_line =~ s/:(\d+):/$1/; $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; $got_error_message = 1; } # then we skip the second line, which shows the context (we don't use that) elsif (($got_error_message eq 1) and ($got_quoted_line eq 0)) { $got_quoted_line = 1; } # we now take the third line, with the pointer to the error's column elsif (($msg_line =~ /(\s+)\^/) and ($got_error_message eq 1) and ($got_quoted_line eq 1)) { $xmlwf_error_col = length($1); } # cleanup for a number of bugs for the column number if (defined($xmlwf_error_col)) { if ((my $l = length($File->{Content}->[$xmlwf_error_line-1])) < $xmlwf_error_col) { # http://bugzilla.gnome.org/show_bug.cgi?id=434196 #warn("Warning: reported error column larger than line length " . # "($xmlwf_error_col > $l) in $File->{URI} line " . # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); $xmlwf_error_col = $l; } elsif ($xmlwf_error_col == 79) { # working around an apparent odd limitation of libxml # which only gives context for lines up to 80 chars # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 # http://bugzilla.gnome.org/show_bug.cgi?id=424017 $xmlwf_error_col = "> 80"; # non-int line number will trigger the proper behavior in report_error } } # when we have all the info (one full error message), proceed and move on to the next error if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){ # Reinitializing for the next batch of 3 lines $got_error_message = 0; $got_quoted_line = 0; # formatting the error message for output my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $xmlwf_error_line; $err->{char} = $xmlwf_error_col; $err->{num} = 'xmlwf'; $err->{type} = "E"; $err->{msg} = $xmlwf_error_msg; # The validator will sometimes fail to dereference entities files # we're filtering the bogus resulting error if ($err->{msg} =~ /Entity '\w+' not defined/) { $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; next; } push (@xmlwf_error_list, $err); $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; $num_xmlwf_error++; } } } foreach my $errmsg (@xmlwf_error_list){ $File->{'Is Valid'} = FALSE; push @{$File->{WF_Errors}}, $errmsg; } } } if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) { if ($CFG->{External}->{HTML5}) { $File = &html5_validate($File); &add_warning('W00', { W00_experimental_name => "HTML5 Conformance Checker", W00_experimental_URI => "feedback.html" }); } else { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_no_checker => TRUE); $File->{Templates}->{Error}->param(fatal_missing_checker => "HTML5 Validator"); } } elsif(($File->{DOCTYPE} eq '') and (($File->{Root} eq "svg") or @{$File->{Namespaces}} >1)){ # we send doctypeless SVG, or any doctypeless XML document with multiple namespaces found, to a different engine # WARNING this is experimental. if ($CFG->{External}->{CompoundXML}) { $File = &compoundxml_validate($File); &add_warning('W00', { W00_experimental_name => "validator.nu Conformance Checker", W00_experimental_URI => "feedback.html" }); } } else { $File = &dtd_validate($File); } &abort_if_error_flagged($File, 0); # TODO: need to bring in fixes from html5_validate() here sub compoundxml_validate (\$) { my $File = shift; my $ua = new W3C::Validator::UserAgent ($CFG, $File); $File->{ParserName} = "validator.nu"; $File->{ParserOpts} = ""; my $url = URI->new($CFG->{External}->{CompoundXML}); $url->query_form(out => "xml"); my $req = HTTP::Request->new(POST => $url); if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { # Doctype or charset overridden, need to use $File->{Content} in UTF-8 # because $File->{Bytes} is not affected by the overrides. This will # most likely be a source of errors about internal/actual charset # differences as long as our transcoding process does not "fix" the # charset info in XML declaration and meta http-equiv (any others?). if($File->{'Direct Input'}) { # sane default when using html5 validator by direct input $req->content_type("application/xml; charset=UTF-8"); } else { $req->content_type("$File->{ContentType}; charset=UTF-8"); } $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { # Pass original bytes, Content-Type and charset as-is. # We trust that our and validator.nu's interpretation of line numbers # is the same (regardless of EOL chars used in the document). my @content_type = ($File->{ContentType} => undef); push(@content_type, charset => $File->{Charset}->{HTTP}) if $File->{Charset}->{HTTP}; $req->content_type(HTTP::Headers::Util::join_header_words(@content_type)); $req->content_ref(\$File->{Bytes}); } $req->content_language($File->{ContentLang}) if $File->{ContentLang}; # Intentionally using direct header access instead of $req->last_modified $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; # If not in debug mode, gzip the request (LWP >= 5.817) eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; my $res = $ua->request($req); if (! $res->is_success()) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_no_checker => TRUE); $File->{Templates}->{Error}->param(fatal_missing_checker => "HTML5 Validator"); } else { my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output # I wish we could use XML::LibXML::Reader here. but SHAME on those major # unix distributions still shipping with libxml2 2.6.16… 4 years after its release my $xml_reader = XML::LibXML->new(); my $xmlDOM; eval { $xmlDOM = $xml_reader->parse_string( $content);}; if ($@) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_no_checker => TRUE); $File->{Templates}->{Error}->param(fatal_missing_checker => "HTML5 Validator"); return $File; } my @nodelist = $xmlDOM->getElementsByTagName("messages"); my $messages_node = $nodelist[0]; my @message_nodes = $messages_node->childNodes; foreach my $message_node (@message_nodes) { my $message_type = $message_node->localname; my $err; my ($xml_error_line, $xml_error_col, $xml_error_msg, $xml_error_expl); if ($message_type eq "error") { $err->{type} = "E"; $File->{'Is Valid'} = FALSE; } elsif ($message_type eq "info") { $err->{type} = "I"; # by default - we find warnings in the type attribute (below) } if ($message_node->hasAttributes()) { my @attributelist = $message_node->attributes(); foreach my $attribute (@attributelist) { if($attribute->name eq "type"){ if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { $err->{type} = "W"; } } if($attribute->name eq "last-column") { $xml_error_col = $attribute->getValue(); } if($attribute->name eq "last-line") { $xml_error_line = $attribute->getValue(); } } } my @child_nodes = $message_node->childNodes; foreach my $child_node (@child_nodes) { if ($child_node->localname eq "message") { $xml_error_msg= $child_node->toString(); $xml_error_msg =~ s,]*>,,gsi; } if ($child_node->localname eq "elaboration") { $xml_error_expl = $child_node->toString(); $xml_error_expl =~ s,,,gi; $xml_error_expl = "\n
$xml_error_expl
\n"; } } # formatting the error message for output $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $xml_error_line; $err->{char} = $xml_error_col; $err->{num} = 'validator.nu'; $err->{msg} = $xml_error_msg; $err->{expl} = $xml_error_expl; if ($err->{msg} =~ /Using the preset for (.*) based on the root namespace/ ) { $File->{DOCTYPE} = $1; } else { push @{$File->{Errors}}, $err; } # @@ TODO message explanation / elaboration } } return $File; } sub html5_validate (\$) { my $File = shift; my $ua = new W3C::Validator::UserAgent ($CFG, $File); $File->{ParserName} = "validator.nu"; $File->{ParserOpts} = ""; my $url = URI->new($CFG->{External}->{HTML5}); $url->query_form(out => "xml"); my $req = HTTP::Request->new(POST => $url); if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { # Doctype or charset overridden, need to use $File->{Content} in UTF-8 # because $File->{Bytes} is not affected by the overrides. This will # most likely be a source of errors about internal/actual charset # differences as long as our transcoding process does not "fix" the # charset info in XML declaration, meta http-equiv/charset and/or BOM # (any others?). my $ct = $File->{'Direct Input'} ? "text/html" : $File->{ContentType}; $req->content_type("$ct; charset=UTF-8"); $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { # Pass original bytes, Content-Type and charset as-is. # We trust that our and validator.nu's interpretation of line numbers # is the same later when displaying error contexts (regardless of EOL chars # used in the document). if ($File->{'Direct Input'}) { $req->content_type("text/html; charset=UTF-8"); } else { my @ct = ($File->{ContentType} => undef); push(@ct, charset => $File->{Charset}->{HTTP}) if $File->{Charset}->{HTTP}; $req->content_type(HTTP::Headers::Util::join_header_words(@ct)); } $req->content_ref(\$File->{Bytes}); } $req->content_language($File->{ContentLang}) if $File->{ContentLang}; # Intentionally using direct header access instead of $req->last_modified # (the latter takes seconds since epoch, but $File->{Modified} is an already # formatted string). $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; # If not in debug mode, gzip the request (LWP >= 5.817) eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; my $res = $ua->request($req); if (! $res->is_success()) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_no_checker => TRUE); $File->{Templates}->{Error}->param(fatal_missing_checker => "HTML5 Validator"); } else { my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output # I wish we could use XML::LibXML::Reader here. but SHAME on those major # unix distributions still shipping with libxml2 2.6.16… 4 years after its release my $xml_reader = XML::LibXML->new(); my $xmlDOM; eval { $xmlDOM = $xml_reader->parse_string( $content);}; if ($@) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_no_checker => TRUE); $File->{Templates}->{Error}->param(fatal_missing_checker => "HTML5 Validator"); return $File; } my @nodelist = $xmlDOM->getElementsByTagName("messages"); my $messages_node = $nodelist[0]; my @message_nodes = $messages_node->childNodes; foreach my $message_node (@message_nodes) { my $message_type = $message_node->localname; my $err; my ($html5_error_line, $html5_error_col, $html5_error_msg, $html5_error_expl); # TODO: non-document errors should receive different/better treatment, # but this is better than hiding all problems for now (#6747) if ($message_type eq "error" || $message_type eq "non-document-error") { $err->{type} = "E"; $File->{'Is Valid'} = FALSE; } elsif ($message_type eq "info") { $err->{type} = "I"; # by default - we find warnings in the type attribute (below) } if ($message_node->hasAttributes()) { my @attributelist = $message_node->attributes(); foreach my $attribute (@attributelist) { if($attribute->name eq "type"){ if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { $err->{type} = "W"; } } elsif ($attribute->name eq "last-column") { $html5_error_col = $attribute->getValue(); } elsif ($attribute->name eq "last-line") { $html5_error_line = $attribute->getValue(); } } } my @child_nodes = $message_node->childNodes; foreach my $child_node (@child_nodes) { if ($child_node->localname eq "message") { $html5_error_msg = $child_node->textContent(); } elsif ($child_node->localname eq "elaboration") { $html5_error_expl = $child_node->toString(); $html5_error_expl =~ s,,,gi; $html5_error_expl = "\n
$html5_error_expl
\n"; } } # formatting the error message for output $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $html5_error_line; $err->{char} = $html5_error_col; $err->{num} = 'html5'; $err->{msg} = $html5_error_msg; $err->{expl} = $html5_error_expl; push @{$File->{Errors}}, $err; # @@ TODO message explanation / elaboration } } return $File; } sub dtd_validate (\$) { my $File = shift; my $opensp = SGML::Parser::OpenSP->new(); my $parser_name = "SGML::Parser::OpenSP"; # # By default, use SGML catalog file and SGML Declaration. my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); # default parsing options my @spopt = qw(valid non-sgml-char-ref no-duplicate); # # Switch to XML semantics if file is XML. if (&is_xml($File)) { $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); push(@spopt, 'xml'); } else { # add warnings for shorttags push(@spopt, 'min-tag'); } $File->{ParserName} = $parser_name; $File->{ParserOpts} = join(" ", @spopt); # # Parser configuration $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); $opensp->catalogs($catalog); $opensp->show_error_numbers(1); $opensp->warnings(@spopt); # # Restricted file reading is disabled on Win32 for the time # beeing since neither SGML::Parser::OpenSP nor check auto- # magically set search_dirs to include the temp directory # so restricted file reading would defunct the Validator. $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; my $h; # event handler if ($File->{Opt}->{Outline}) { $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG); } else { $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG); } $opensp->handler($h); $opensp->parse_string(join"\n",@{$File->{Content}}); # Make sure there are no circular references, otherwise the script # would leak memory until mod_perl unloads it which could take some # time. @@FIXME It's probably overly careful though. $opensp->handler(undef); undef $h->{_parser}; undef $h->{_file}; undef $h; undef $opensp; # # Set Version to be the FPI initially. $File->{Version} = $File->{DOCTYPE}; return $File; } # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { $File->{Version} = 'XML'; } else { $File->{Version} ||= $File->{DOCTYPE}; } # # Get the pretty text version of the FPI if a mapping exists. if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { $File->{Version} = $prettyver; } # # check the received mime type against Allowed mime types if ($File->{ContentType}){ my @allowedMediaType = split(/\s+/, $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); my $usedCTisAllowed; if (scalar @allowedMediaType) { $usedCTisAllowed = FALSE; foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); } } else { # wedon't know what media type is recommended, so better shut up $usedCTisAllowed = TRUE; } if(! $usedCTisAllowed ){ &add_warning('W23', { W23_type => $File->{ContentType}, W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, w23_doctype => $File->{Version} }); } } # # Warn about unknown, incorrect, or missing Namespaces. if ($File->{Namespace}) { my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; if (&is_xml($File)) { if ($ns eq $File->{Namespace}) { &add_warning('W10', { W10_ns => $File->{Namespace}, W10_type => $File->{Type}, }); } } else { &add_warning('W11', {W11_ns => $File->{Namespace}, w11_doctype => $File->{DOCTYPE}}); } } else { if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { &add_warning('W12', {}); } } ## if invalid content, AND if requested, pass through tidy if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) { eval { local $SIG{__DIE__}; require HTML::Tidy; my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); my $cleaned = $tidy->clean(join("\n", @{$File->{Content}})); $cleaned = Encode::decode_utf8($cleaned); $File->{Tidy} = $cleaned; }; if ($@) { (my $errmsg = $@) =~ s/ at .*//s; &add_warning('W29', { W29_msg => $errmsg }); } } my $template; if ($File->{Opt}->{Output} eq 'xml') { $template = $File->{Templates}->{XML}; } elsif ($File->{Opt}->{Output} eq 'earl') { $template = $File->{Templates}->{EARLXML}; } elsif ($File->{Opt}->{Output} eq 'n3') { $template = $File->{Templates}->{EARLN3}; } elsif ($File->{Opt}->{Output} eq 'json') { $template = $File->{Templates}->{JSON}; } elsif ($File->{Opt}->{Output} eq 'ucn') { $template = $File->{Templates}->{UCN}; } elsif ($File->{Opt}->{Output} eq 'soap12') { if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation... print CGI::header(-status => 503, -content_language => "en", -type => "text/html", -charset => "utf-8" ); $template = $File->{Templates}->{SOAPDisabled}; } elsif ($File->{'Error Flagged'}) { # should send SOAP fault message $template = $File->{Templates}->{SOAPFault}; # we fill the soap fault template #with the variables that had been passed to the HTML fatal error template foreach my $fault_param ($File->{Templates}->{Error}->param()) { $template->param($fault_param => $File->{Templates}->{Error}->param($fault_param)); } } else { $template = $File->{Templates}->{SOAP}; } } else { $template = $File->{Templates}->{Result}; } &prep_template($File, $template); &fin_template($File, $template); $template->param(file_warnings => $File->{Warnings}); $template->param(tidy_output => $File->{Tidy}); $template->param(file_source => &source($File)) if ($template->param('opt_show_source') or ($File->{'Is Upload'}) or ($File->{'Direct Input'})); if ($File->{Opt}->{Output} eq 'json') { # No JSON escaping in HTML::Template (and "JS" is not the right thing here) my $json = JSON->new(); $json->allow_nonref(TRUE); for my $msgs ($template->param("file_errors"), $template->param("file_warnings")) { next unless $msgs; for my $msg (@$msgs) { for my $key (qw(msg expl)) { $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key}; } } } } # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8', $template->output); # # Get rid of $File object and exit. undef $File; exit; ############################################################################# # Subroutine definitions ############################################################################# # # Generate HTML report. sub prep_template ($$) { my $File = shift; my $T = shift; # # XML mode... $T->param(is_xml => &is_xml($File)); # # Upload? $T->param(is_upload => $File->{'Is Upload'}); # # Direct Input? $T->param(is_direct_input => $File->{'Direct Input'}); # # The URI... $T->param(file_uri => $File->{URI}); $T->param(file_uri_param => uri_escape($File->{URI})); # # Set URL for page title. $T->param(page_title_url => $File->{URI}); # # Metadata... $T->param(file_modified => $File->{Modified}); $T->param(file_server => $File->{Server}); $T->param(file_size => $File->{Size}); $T->param(file_contenttype => $File->{ContentType}); $T->param(file_charset => $File->{Charset}->{Use}); $T->param(file_doctype => $File->{DOCTYPE}); # # Output options... $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); $T->param(opt_show_outline => $File->{Opt}->{Outline}); $T->param(opt_verbose => $File->{Opt}->{Verbose}); $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); $T->param(opt_no200 => $File->{Opt}->{No200}); # # Tip of the Day... my $tip = &get_tip(); $T->param(tip_uri => $tip->[0]); $T->param(tip_slug => $tip->[1]); # Root Element $T->param(root_element => $File->{Root}); # Namespaces... $T->param(file_namespace => $File->{Namespace}); my %seen_ns = (); my @bulk_ns = @{$File->{Namespaces}}; $File->{Namespaces} = []; # reinitialize the list of non-root namespaces # ... and then get a uniq version of it foreach my $single_namespace (@bulk_ns) { push(@{$File->{Namespaces}}, $single_namespace) unless (($single_namespace eq $File->{Namespace}) or $seen_ns{$single_namespace}++); } my @nss = map({uri => $_}, @{$File->{Namespaces}}); $T->param(file_namespaces => \@nss) if @nss; if ($File->{Opt}->{DOCTYPE}) { my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; $T->param($over_doctype_param => TRUE); } if ($File->{Opt}->{Charset}) { my $over_charset_param = "override charset $File->{Opt}->{Charset}"; $T->param($over_charset_param => TRUE); } # Allow content-negotiation if ($File->{Opt}->{'Accept Header'}) { $T->param('accept' => $File->{Opt}->{'Accept Header'}); } if ($File->{Opt}->{'Accept-Language Header'}) { $T->param('accept-language' => $File->{Opt}->{'Accept-Language Header'}); } if ($File->{Opt}->{'Accept-Charset Header'}) { $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'}); } if ($File->{Opt}->{'User Agent'}) { $T->param('user-agent' => $File->{Opt}->{'User Agent'}); } if ($File->{'Error Flagged'}) { $T->param(fatal_error => TRUE); } } sub fin_template ($$) { my $File = shift; my $T = shift; # # Set debug info for HTML report. $T->param(opt_debug => $DEBUG); $T->param(debug => [ map({name => $_, value => $ENV{$_}}, qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), { name => 'Content-Encoding', value => $File->{ContentEnc} }, { name => 'Content-Language', value => $File->{ContentLang} }, { name => 'Content-Location', value => $File->{ContentLoc} }, { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, { name => 'Parse Mode', value => $File->{Mode} }, { name => 'Parse Mode Factor', value => $File->{ModeChoice} }, { name => 'Parser', value => $File->{ParserName} }, { name => 'Parser Options', value => $File->{ParserOpts} }, ], ); if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) { my $default_doctype = (&is_xml($File) ? $File->{"Default DOCTYPE"}->{"XHTML"} : $File->{"Default DOCTYPE"}->{"HTML"}); $T->param(file_version => "$default_doctype"); } else { $T->param(file_version => $File->{Version}); } my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File); if ($num_errors+$num_warnings > 0) { $T->param(has_errors => 1); } $T->param(valid_errors_num => $num_errors); $num_warnings += scalar @{$File->{Warnings}}; $T->param(valid_warnings_num => $num_warnings); my $number_of_errors = ""; # textual form of $num_errors my $number_of_warnings = ""; # textual form of $num_errors # The following is a bit hack-ish, but will enable us to have some logic # for a human-readable display of the number, with cases for 0, 1, 2 and above # (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above) if ($num_errors > 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => FALSE); if ($num_errors eq 2) { $T->param(number_of_errors_is_2 => TRUE); } else { $T->param(number_of_errors_is_2 => FALSE ); } $T->param(number_of_errors_is_plural => TRUE ); } elsif ($num_errors eq 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => TRUE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } else { # 0 $T->param(number_of_errors_is_0 => TRUE ); $T->param(number_of_errors_is_1 => FALSE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } if ($num_warnings > 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => FALSE); if ($num_warnings eq 2) { $T->param(number_of_warnings_is_2 => TRUE); } else { $T->param(number_of_warnings_is_2 => FALSE); } $T->param(number_of_warnings_is_plural => TRUE ); } elsif ($num_warnings eq 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => TRUE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } else { # 0 $T->param(number_of_warnings_is_0 => TRUE ); $T->param(number_of_warnings_is_1 => FALSE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } $T->param(file_outline => $File->{heading_outline}) if $File->{Opt}->{Outline}; $T->param(file_errors => $reported_errors); if ($File->{'Is Valid'}) { $T->param(VALID => TRUE); $T->param(valid_status => 'Valid'); &report_valid($File, $T); } else { $T->param(VALID => FALSE); $T->param(valid_status => 'Invalid'); } } # # Output "This page is Valid" report. sub report_valid { my $File = shift; my $T = shift; unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; $T->param(have_badge => TRUE); $T->param(badge_uri => $cfg->{Badge}->{URI}); if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local URI'}) { $T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'}); $T->param(have_local_badge => TRUE); } if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'ALT URI'}) { $T->param(badge_alt_uri => $cfg->{Badge}->{'ALT URI'}); if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local ALT URI'}) { $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}); } $T->param(have_alt_badge => TRUE); } if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{RDFa}) { $T->param(have_badge_rdfa => TRUE); $T->param(badge_rdfa => $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{RDFa}); } else { $T->param(have_badge_rdfa => FALSE); } $T->param(badge_alt => $cfg->{Badge}->{Alt}); $T->param(badge_h => $cfg->{Badge}->{Height}); $T->param(badge_w => $cfg->{Badge}->{Width}); $T->param(badge_tagc => ($cfg->{'Parse Mode'} eq 'XML' ? ' /' : '')); } } elsif (defined $File->{Tentative}) { $T->param(is_tentative => TRUE); } if ($File->{XMLWF_ONLY}){ $T->param(xmlwf_only => TRUE); } my $thispage = self_url_file($File); $T->param(file_thispage => $thispage); } # # Add a warning message to the output. sub add_warning ($$) { my $WID = shift; my $params = shift; push @{$File->{Warnings}}, $WID; $File->{Templates}->{Result}->param($WID => TRUE, %{$params}); $File->{Templates}->{Result}->param(have_warnings => TRUE); $File->{Templates}->{Error}->param($WID => TRUE, %{$params}); $File->{Templates}->{Error}->param(have_warnings => TRUE); $File->{Templates}->{SOAP}->param($WID => TRUE, %{$params}); $File->{Templates}->{SOAP}->param(have_warnings => TRUE); } # # Proxy authentication requests. # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). sub authenticate { my $File = shift; my $resource = shift; my $authHeader = shift || {}; my $realm = $resource; $realm =~ s([^\w\d.-]*){}g; for my $scheme (keys(%$authHeader)) { my $origrealm = $authHeader->{$scheme}->{realm}; if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) { delete($authHeader->{$scheme}); next; } $authHeader->{$scheme}->{realm} = "$realm-$origrealm"; } my $headers = HTTP::Headers->new(Connection => 'close'); $headers->www_authenticate(%$authHeader); $headers = $headers->as_string(); chomp($headers); $File->{Templates}->{AuthzReq}->param(http_401_headers => $headers); $File->{Templates}->{AuthzReq}->param(http_401_url => $resource); print Encode::encode('UTF-8', $File->{Templates}->{AuthzReq}->output); exit; # Further interaction will be a new HTTP request. } # # Fetch an URL and return the content and selected meta-info. sub handle_uri { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical(); $uri->fragment(undef); my $ua = new W3C::Validator::UserAgent ($CFG, $File); unless ($ua->is_protocol_supported($uri)) { $File->{'Error Flagged'} = TRUE; if (($uri->canonical() eq "1") ) #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI() { $File->{Templates}->{Error}->param(fatal_no_content => TRUE); } else { $File->{Templates}->{Error}->param(fatal_uri_error => TRUE); $File->{Templates}->{Error}->param(fatal_uri_scheme => $uri->scheme()); } return $File; } return $File unless $ua->uri_ok($uri); my $req = new HTTP::Request(GET => $uri); # if one wants to use the accept, accept-charset and accept-language params # in order to trigger specific negotiation if ($File->{Opt}->{'Accept Header'}) { $req->header(Accept => $File->{Opt}->{'Accept Header'}); } if ($File->{Opt}->{'Accept-Language Header'}) { $req->header(Accept_Language => $File->{Opt}->{'Accept-Language Header'}); } if ($File->{Opt}->{'Accept-Charset Header'}) { $req->header(Accept_Charset => $File->{Opt}->{'Accept-Charset Header'}); } # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. # If we're under mod_perl, there is a way around it... eval { local $SIG{__DIE__}; my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; $ENV{HTTP_AUTHORIZATION} = $auth if $auth; } if (IS_MODPERL2() && !$ENV{HTTP_AUTHORIZATION}); # If we got a Authorization header, the client is back at it after being # prompted for a password so we insert the header as is in the request. $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}) if $ENV{HTTP_AUTHORIZATION}; my $res = $ua->request($req); return $File if $File->{'Error Flagged'}; # Redirect IP rejected? unless ($res->code == 200 or $File->{Opt}->{'No200'}) { if ($res->code == 401) { my %auth = $res->www_authenticate(); # HTTP::Headers::Auth &authenticate($File, $res->request->uri, \%auth); } else { $File->{'Error Flagged'} = TRUE; my $no200url = undef; if (!$File->{Opt}->{No200}) { # $File->{URI} not set yet; setting it non-local has side effects local $File->{URI} = $uri->as_string; local $File->{Opt}->{No200} = TRUE; $no200url = &self_url_file($File); } my $warning = $res->header("Client-Warning"); if ($warning && $warning =~ /Internal response/i) { # Response doc generated internally by LWP, no need to show that info # nor to provide error doc validation link to it. $warning = undef; $no200url = undef; } $File->{Templates}->{Error}->param(fatal_http_error => TRUE); $File->{Templates}->{Error}->param(fatal_http_uri => $uri->as_string); $File->{Templates}->{Error}->param(fatal_http_code => $res->code); $File->{Templates}->{Error}->param(fatal_http_msg => $res->message); $File->{Templates}->{Error}->param(fatal_http_warn => $warning); $File->{Templates}->{Error}->param(fatal_http_dns => TRUE) if $res->code == 500; $File->{Templates}->{Error}->param(fatal_http_no200 => $no200url); } return $File; } # # Enforce Max Recursion level. &check_recursion($File, $res); my ($mode, $ct, $charset) = &parse_content_type( $File, scalar($res->header('Content-Type')), scalar($res->request->uri), ); my $content = &get_content($File, $res); return $File if $File->{'Error Flagged'}; $File->{Bytes} = $content; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{ContentEnc} = $res->content_encoding; $File->{ContentLang} = $res->content_language; $File->{ContentLoc} = $res->header('Content-Location'); $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $res->header('Last-Modified'); $File->{Server} = scalar $res->server; # TODO: Content-Length is not always set, so either this should # be renamed to 'Content-Length' or it should consider more than # the Content-Length header. $File->{Size} = scalar $res->content_length; $File->{URI} = scalar $res->request->uri->canonical; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_file { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $f = $q->param('uploaded_file'); my $h = $q->uploadInfo($f); my $file; local $/ = undef; # set line delimiter so that <> reads rest of file $file = <$f>; my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); $File->{Bytes} = $file; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $q->http('Last-Modified'); $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) $File->{Size} = $q->http('Content-Length'); $File->{URI} = "$f"; $File->{'Is Upload'} = TRUE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_frag { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. $File->{Bytes} = $q->param('fragment'); $File->{Mode} = 'TBD'; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; $File->{ContentType} = ''; # @@TODO? $File->{URI} = 'upload://Form Submission'; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = TRUE; $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars if ($File->{Opt}->{Prefill}) { # we surround the HTML fragment with some basic document structure my $prefill_Template = undef; if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { $prefill_Template = $File->{Templates}->{PrefillHTML}; } else { $prefill_Template = $File->{Templates}->{PrefillXHTML}; } $prefill_Template->param(fragment => $File->{Bytes}); $File->{Bytes} = $prefill_Template->output(); # Let's force the view source so that the user knows what we've put around # their code. $File->{Opt}->{'Show Source'} = TRUE; # Ignore doctype overrides (#5132). $File->{Opt}->{DOCTYPE} = 'Inline'; } return $File; } # # Parse a Content-Type and parameters. Return document type and charset. sub parse_content_type { my $File = shift; my $Content_Type = shift; my $url = shift; my $charset = ''; my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; my $mode = $CFG->{MIME}->{$ct} || $ct; $charset = HTML::Encoding::encoding_from_content_type($Content_Type); if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here. if ($ct eq 'text/css' and defined $url) { print redirect 'http://jigsaw.w3.org/css-validator/validator?uri=' . uri_escape $url; exit; } elsif ($ct eq 'application/atom+xml' and defined $url) { print redirect 'http://validator.w3.org/feed/check.cgi?url=' . uri_escape $url; exit; } elsif ($ct =~ m(^application/.+\+xml$)) { # unknown media types which should be XML - we give these a try $mode = "XML"; } else { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_mime_error => TRUE); $File->{Templates}->{Error}->param(fatal_mime_ct => $ct); } } return $mode, $ct, $charset; } # # Get content with Content-Encodings decoded from a response. sub get_content ($$) { my $File = shift; my $res = shift; my $content; eval { $content = $res->decoded_content(charset => 'none', raise_error => 1); }; if ($@) { (my $errmsg = $@) =~ s/ at .*//s; my $cenc = $res->header("Content-Encoding"); my $uri = $res->request->uri; $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_decode_error => TRUE); $File->{Templates}->{Error}->param(fatal_decode_errmsg => $errmsg); $File->{Templates}->{Error}->param(fatal_decode_cenc => $cenc); # Include URI because it might be a subsystem (eg. HTML5 validator) one $File->{Templates}->{Error}->param(fatal_decode_uri => $uri); } return $content; } # # Check recursion level and enforce Max Recursion limit. sub check_recursion ($$) { my $File = shift; my $res = shift; # Not looking at our own output. return unless defined $res->header('X-W3C-Validator-Recursion'); my $lvl = $res->header('X-W3C-Validator-Recursion'); return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. if ($lvl >= $CFG->{'Max Recursion'}) { print redirect $CFG->{'Home Page'}; } else { # Increase recursion level in output. $File->{Templates}->{Result}->param(depth => $lvl++); } } # # Return $_[0] encoded for HTML entities (cribbed from merlyn). # # Note that this is used both for HTML and XML escaping. # sub ent { local $_ = shift; return '' unless defined; # Eliminate warnings # TODO: Err, why have " twice in the character class? ' maybe? s(["<&>"]){'&#' . ord($&) . ';'}ge; # should switch to hex sooner or later return $_; } # # Truncate source lines for report. # Expects 1-based column indexes. sub truncate_line { my $line = shift; my $col = shift; my $maxlen = 80; # max line length to truncate to my $diff = length($line) - $maxlen; # Don't truncate at all if it fits. return ($line, $col) if ($diff <= 0); my $start = $col - int($maxlen/2); if ($start < 0) { # Truncate only from end of line. $start = 0; $line = substr($line, $start, $maxlen - 1) . '…'; } elsif ($start > $diff) { # Truncate only from beginning of line. $start = $diff; $line = '…' . substr($line, $start + 1); } else { # Truncate from both beginning and end of line. $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…'; } # Shift column if we truncated from beginning of line. $col -= $start; return ($line, $col); } # # Suppress any existing DOCTYPE by commenting it out. sub override_doctype { my $File = shift; my ($dt) = grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; my $pubid = $dt->{PubID}; my $sysid = $dt->{SysID}; my $name = $dt->{Name}; # The HTML5 PubID is a fake, reset it out of the way. $pubid = undef if ($pubid eq 'HTML5'); # We don't have public/system ids for all types. my $dtd = "'; my $org_dtd = ''; my $HTML = ''; my $seen_doctype = FALSE; my $seen_root = FALSE; my $declaration = sub { my ($tag, $text) = @_; if ($seen_doctype || uc($tag) ne '!DOCTYPE') { $HTML .= $text; return; } $seen_doctype = TRUE; $org_dtd = &ent($text); ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~ //si; $File->{DOCTYPE} = 'HTML5' if (lc($File->{Root} || '') eq 'html' && (!defined($File->{DOCTYPE}) || $File->{DOCTYPE} eq 'about:legacy-compat')); # No Override if Fallback was requested, or if override is the same as detected my $known = $CFG->{Types}->{$File->{DOCTYPE}}; if ($File->{Opt}->{FB}->{DOCTYPE} or ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) { $HTML .= $text; # Stash it as is... } else { $HTML .= "$dtd"; } }; my $start_element = sub{ if ($seen_root) { $HTML .= $_[0]; # Stash it as is... moving on } else { $seen_root = TRUE; if ($seen_doctype) { # doctype addition aldready done, we move on $HTML .= $_[0]; } else { # no original doctype present, hence none replaced already # => we sneak the chosen doctype before the root elt $HTML .= "$dtd$_[0]"; } } }; HTML::Parser->new(default_h => [sub {$HTML .= shift}, 'text'], declaration_h => [$declaration, 'tag,text'], start_h => [$start_element, 'text'] )->parse(join "\n", @{$File->{Content}})->eof(); $File->{Content} = [split /\n/, $HTML]; if ($seen_doctype) { my $known = $CFG->{Types}->{$File->{DOCTYPE}}; unless ($File->{Opt}->{FB}->{DOCTYPE} or ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display} )) { &add_warning('W13', { W13_org => $org_dtd, W13_new => $File->{Opt}->{DOCTYPE}, }); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } else { if ($File->{"DOCTYPEless OK"}) { &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}}); } elsif ($File->{Opt}->{FB}->{DOCTYPE}) { &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } else { &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } return $File; } # # Generate a HTML report of detected errors. sub report_errors ($) { my $File = shift; my $Errors = []; my %Errors_bytype; my $number_of_errors = 0; my $number_of_warnings = 0; my $number_of_info = 0; # Hash to keep track of how many of each error is reported. my %Msgs; # Used to generate a UID for explanations. # for the sake of readability, at least until the xmlwf errors have explanations, # we push the errors from the XML parser at the END of the error list. foreach my $errmsg (@{$File->{WF_Errors}}){ push @{$File->{Errors}}, $errmsg; } if (scalar @{$File->{Errors}}) { foreach my $err (@{$File->{Errors}}) { my $line; my $col = 0; # avoid truncating lines that do not exist if (defined($err->{line}) && $File->{Content}->[$err->{line}-1]) { if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/ ){ ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char}); $line = &mark_error($line, $col); } elsif (defined($err->{line})) { $col = length($File->{Content}->[$err->{line}-1]); $col = 80 if ($col > 80); ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $col); $line = &ent($line); $col = 0; } } else { $col = 0; $line = ""; } my $explanation = ""; if ($err->{expl}) { } else { if ($err->{num}) { my $num = $err->{num}; $explanation .= Encode::decode_utf8("\n $RSRC{msg}->{$num}->{verbose}\n") if exists $RSRC{msg}->{$num} && exists $RSRC{msg}->{$num}->{verbose}; my $_msg = $RSRC{msg}->{nomsg}->{verbose}; $_msg =~ s//$num/g; if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) { $_msg =~ s///g } else { my $escaped_uri = uri_escape($File->{URI}); $_msg =~ s//$escaped_uri/g; } $explanation = " $_msg\n$explanation"; # The send feedback plea. $explanation =~ s//$CFG->{'Home Page'}/g; } $err->{expl} = $explanation; } $err->{src} = $line; $err->{col} = ' ' x $col; if ($err->{type} eq 'I') { $err->{class} = 'msg_info'; $err->{err_type_err} = 0; $err->{err_type_warn} = 0; $err->{err_type_info} = 1; $number_of_info += 1; } elsif ($err->{type} eq 'E') { $err->{class} = 'msg_err'; $err->{err_type_err} = 1; $err->{err_type_warn} = 0; $err->{err_type_info} = 0; $number_of_errors += 1; } elsif (($err->{type} eq 'W') or ($err->{type} eq 'X') ) { $err->{class} = 'msg_warn'; $err->{err_type_err} = 0; $err->{err_type_warn} = 1; $err->{err_type_info} = 0; $number_of_warnings += 1; } # TODO other classes for "X" etc? FIXME find all types of message. push @{$Errors}, $err; if (($File->{Opt}->{'Group Errors'}) and (($err->{type} eq 'E') or ($err->{type} eq 'W')or ($err->{type} eq 'X'))) { # index by num for errors and warnings only - info usually give context of error or warning if (! exists $Errors_bytype{$err->{num}}) { $Errors_bytype{$err->{num}}->{instances} = []; my $msg_text; if ($err->{num} ne 'xmlwf') { $msg_text = $RSRC{msg}->{$err->{num}}->{original}; $msg_text =~ s/%1/X/; $msg_text =~ s/%2/Y/; } else { ## FIXME ## we need a catalog of errors from our XML parser $msg_text = "XML Parsing Error"; } $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; $Errors_bytype{$err->{num}}->{type} = $err->{type}; $Errors_bytype{$err->{num}}->{class} = $err->{class}; $Errors_bytype{$err->{num}}->{err_type_err} = $err->{err_type_err}; $Errors_bytype{$err->{num}}->{err_type_warn} = $err->{err_type_warn}; $Errors_bytype{$err->{num}}->{err_type_info} = $err->{err_type_info}; } push @ { $Errors_bytype{$err->{num}}->{instances} }, $err; } } } if ($File->{Opt}->{'Group Errors'}) { $Errors = []; for my $err_num (keys %Errors_bytype){ push @{$Errors}, $Errors_bytype{$err_num}; } } # we are not sorting errors by line, as it would break the position # of auxiliary messages such as "start tag was here". We'll have to live with # the fact that XML well-formedness errors are listed first, then validation errors #else { # sort error by lines # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; #} return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; } # # Chop the source line into 3 pieces; the character at which the error # was detected, and everything to the left and right of that position. # That way we can add markup to the relevant char without breaking &ent(). # Expects 1-based column indexes. sub mark_error ($$) { my $line = shift; my $col = shift; my $linelen = length($line); # Coerce column into an index valid within the line. if ($col < 1) { $col = 1; } elsif ($col > $linelen) { $col = $linelen; } $col--; my $left = substr($line, 0, $col); my $char = substr($line, $col, 1); my $right = substr($line, $col + 1); $char = &ent($char); $char = qq($char); $line = &ent($left) . $char . &ent($right); return $line; } # # Create a HTML representation of the document. sub source { my $File = shift; # Remove any BOM since we're not at BOT anymore... $File->{Content}->[0] = substr($File->{Content}->[0], 1) if ($File->{BOM} && scalar(@{$File->{Content}})); my @source = map({file_source_line => $_}, @{$File->{Content}}); return \@source; } sub match_DTD_FPI_SI { my ($File, $FPI, $SI) = @_; if ($CFG->{Types}->{$FPI}) { if ($CFG->{Types}->{$FPI}->{SysID}){ if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) { &add_warning('W26', {W26_dtd_pub => $FPI, W26_dtd_pub_display =>$CFG->{Types}->{$FPI}->{Display}, W26_dtd_sys=> $SI, W26_dtd_sys_recommend=> $CFG->{Types}->{$FPI}->{SysID}}); } } } else { # FPI not known, checking if the SI is foreach my $proper_FPI (keys %{$CFG->{Types}}) { if ($CFG->{Types}->{$proper_FPI}->{SysID}) { if ($CFG->{Types}->{$proper_FPI}->{SysID} eq $SI) { &add_warning('W26', {W26_dtd_pub => $FPI, W26_dtd_pub_display =>$CFG->{Types}->{$proper_FPI}->{Display}, W26_dtd_sys => $SI, W26_dtd_pub_recommend=> $proper_FPI }); } } } } } # # Do an initial parse of the Document Entity to extract FPI. sub preparse_doctype { my $File = shift; # # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). $File->{DOCTYPE} = ''; $File->{Root} = ''; my $dtd = sub { return if $File->{Root}; # TODO: The \s and \w are probably wrong now that the strings are utf8_on my $declaration = shift; my $doctype_type; my $doctype_secondpart; if ($declaration =~ //si) { $File->{Root} = "html"; $File->{DOCTYPE} = "HTML5"; } else { ($File->{Root}, $doctype_type, $File->{DOCTYPE}, $doctype_secondpart) = $declaration =~ m()si; if (($doctype_type eq "PUBLIC") and (($doctype_secondpart) = $doctype_secondpart =~ m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si)){ &match_DTD_FPI_SI($File, $File->{DOCTYPE}, $doctype_secondpart); } } }; my $start = sub { my ($p, $tag, $attr) = @_; if ($File->{Root}) { return unless $tag eq $File->{Root}; } else { $File->{Root} = $tag; } if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; if ($attr->{version}) {$File->{'Root Version'} = $attr->{version}}; if ($attr->{baseProfile}) {$File->{'Root BaseProfile'} = $attr->{baseProfile}}; # We're done parsing. $p->eof(); }; # we use HTML::Parser as pre-parser. May use html5lib or other in the future my $p = HTML::Parser->new(api_version => 3); # if content-type has shown we should pre-parse with XML mode, use that # otherwise (mostly text/html cases) use default mode $p->xml_mode(&is_xml($File)); $p->handler(declaration => $dtd, 'text'); $p->handler(start => $start, 'self,tag,attr'); my $line = 0; my $max = scalar(@{$File->{Content}}); $p->parse(sub { return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef; }); $p->eof(); # TODO: These \s here are probably wrong now that the strings are utf8_on $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; $File->{DOCTYPE} =~ s(^\s+){ }g; $File->{DOCTYPE} =~ s(\s+$){ }g; $File->{DOCTYPE} =~ s(\s+) { }g; # Some document types actually need no doctype to be identified, # root element and some version attribute is enough # TODO applicable doctypes should be migrated to a config file? # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) { # if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'})) # { # if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; } # if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; } # if ($File->{'Root Version'} eq "1.0"){ # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN"; # $File->{"DOCTYPEless OK"} = TRUE; # $File->{Opt}->{DOCTYPE} = "SVG 1.0"; # } # if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) { # $File->{DOCTYPE} = "-//W3C//DTD SVG Tiny 1.1//EN"; # $File->{"DOCTYPEless OK"} = TRUE; # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny"; # } # elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) { # $File->{DOCTYPE} = "-//W3C//DTD SVG Basic 1.1//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic"; # $File->{"DOCTYPEless OK"} = TRUE; # } # elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) { # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; # $File->{"DOCTYPEless OK"} = TRUE; # } # if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; } # if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; } # } # else { # # by default for an svg root elt, we use SVG 1.1 # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; # $File->{"DOCTYPEless OK"} = TRUE; # } # } if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) { # doctypeless document type found, we fake the override # so that the parser will have something to validate against $File = &override_doctype($File); } return $File; } # # Preprocess CGI parameters. sub prepCGI { my $File = shift; my $q = shift; my $path_info; { # @@@HACK: # CGI.pm's _name_and_path_from_env has query string related issues; # just trump the query string for the duration of calling methods we # know we don't need it for and which have been affected in the past. # More info: http://www.w3.org/Bugs/Public/show_bug.cgi?id=4365 # ->url() reportedly fixed in CGI.pm 3.34, ->path_info() in 3.40. local $ENV{REQUEST_URI} = URI->new($ENV{REQUEST_URI})->path() if $ENV{REQUEST_URI}; # The URL to this CGI script. $File->{Env}->{'Self URI'} = $q->url(); $path_info = $q->path_info(); } # Avoid CGI.pm's "exists but undef" behaviour. if (scalar $q->param) { foreach my $param ($q->param) { next if $param eq 'uploaded_file'; # 'uploaded_file' contains data. next if $param eq 'fragment'; # Ditto 'fragment'. next if $param eq 'accept'; # Original checking had a specific Accept: header sent. next if $param eq 'accept-language'; # Ditto Accept-Language:. next if $param eq 'accept-charset'; # Ditto Accept-Charset:. next if $q->param($param) eq '0'; # Keep false-but-set params. # # Parameters that are given to us without specifying a value get # set to "1" (the "TRUE" constant). This is so we can test for the # boolean value of a parameter instead of first checking whether # the param was given and then testing its value. Needed because # CGI.pm sets ";param" and ";param=" to a boolean false value # (undef() or a null string, respectively). $q->param($param, TRUE) unless $q->param($param); } } # IIS reportedly does not provide the $path_info we expect - hack around it. $path_info =~ s|(.*)/check\.pl(.*)$|$2| if ($ENV{SERVER_SOFTWARE} and $ENV{SERVER_SOFTWARE} =~ /Microsoft-IIS/); # apparently, with mod_perl2, $path_info is empty even if it should be filled # working around that if (!$path_info && $File->{Env}->{'Self URI'} =~ /check\/referr?er$/) { $path_info = '/referer'; $File->{Env}->{'Self URI'} =~ s/\/referr?er$//; } # Futz the URL so "/referer" works. if ($path_info) { if ($path_info =~ m|^/referr?er$|) { if ($q->referer) { $q->param('uri', $q->referer); print redirect &self_url_q($q, $File); } else { print redirect $File->{Env}->{'Self URI'} . '?uri=referer'; } } else { print redirect &self_url_q($q, $File); } exit; } # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # Munge the URL to include commonly omitted prefix. my $u = $q->param('uri'); $q->param('uri', "http://$u") if $u && $u =~ m(^www)i; # Issue a redirect for uri=referer. if ($q->param('uri') and $q->param('uri') eq 'referer') { if ($q->referer) { $q->param('uri', $q->referer); $q->param('accept',$q->http('Accept')) if ($q->http('Accept')); $q->param('accept-language',$q->http('Accept-Language')) if ($q->http('Accept-Language')); $q->param('accept-charset',$q->http('Accept-Charset')) if ($q->http('Accept-Charset')); print redirect &self_url_q($q, $File); exit; } else { # Redirected from /check/referer to /check?uri=referer because # the browser didn't send a Referer header, or the request was # for /check?uri=referer but no Referer header was found. $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_referer_error => TRUE); } } # Supersede URL with an uploaded file. if ($q->param('uploaded_file')) { $q->param('uri', 'upload://' . $q->param('uploaded_file')); $File->{'Is Upload'} = TRUE; # Tag it for later use. } # Supersede URL with an uploaded fragment. if ($q->param('fragment')) { $q->param('uri', 'upload://Form Submission'); $File->{'Direct Input'} = TRUE; # Tag it for later use. } # Redirect to a GETable URL if method is POST without a file upload. if (defined $q->request_method and $q->request_method eq 'POST' and not ($File->{'Is Upload'} or $File->{'Direct Input'})) { my $thispage = &self_url_q($q, $File); print redirect $thispage; exit; } # # Flag an error if we didn't get a file to validate. unless ($q->param('uri')) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_uri_error => TRUE); $File->{Templates}->{Error}->param(fatal_uri_scheme => 'undefined'); } return $q; } # # Set parse mode (SGML or XML) based on a number of preparsed factors: # * HTTP Content-Type # * Doctype Declaration # * XML Declaration # * XML namespaces sub set_parse_mode { my $File = shift; my $CFG = shift; my $fpi = $File->{DOCTYPE}; $File->{ModeChoice} = ''; my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; # $File->{Mode} may have been set in parse_content_type # and it would come from the Media Type my $parseModeFromMimeType = $File->{Mode}; my $begincontent = join "\x20",@{$File->{Content}}; # for the sake of xml decl detection, # the 10 first lines should be safe my $parseModeFromXMLDecl = ( $begincontent =~ /^ [\x20|\x09|\x0D|\x0A]* # whitespace before the decl should not be happening # but we are greedy for the sake of detection, not validation <\?xml # start matching an XML Declaration [\x20|\x09|\x0D|\x0A]+ # x20, x09, xD and xA are the allowed "xml white space" version [\x20|\x09|\x0D|\x0A]* = # for documents, version info is mandatory [\x20|\x09|\x0D|\x0A]* ("1.0"|"1.1"|'1.0'|'1.1') # hardcoding the existing XML versions. # Maybe we should use \d\.\d ([\x20|\x09|\x0D|\x0A]+ encoding [\x20|\x09|\x0D|\x0A]* = [\x20|\x09|\x0D|\x0A]* ("[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+') )? # encoding info is optional ([\x20|\x09|\x0D|\x0A]+ standalone [\x20|\x09|\x0D|\x0A]* = [\x20|\x09|\x0D|\x0A]* ("yes"|"no"|'yes'|'no') )? # ditto standalone info, optional [\x20|\x09|\x0D|\x0A]* \?> # end of XML Declaration /x ? 'XML' : 'TBD' ); my $parseModeFromNamespace = 'TBD'; if ($File->{Namespace}) { $parseModeFromNamespace = 'XML'} if (($parseModeFromMimeType eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromNamespace eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { # if the mime type is text/html (ambiguous, hence TBD mode) # and the doctype isn't in the catalogue # and XML prolog detection was unsuccessful # and we found no namespace at the root # ... throw in a warning &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} }); return; } $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'HTML5' or $parseModeFromDoctype eq 'XML' or $parseModeFromNamespace eq 'XML'; if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromMimeType eq 'TBD') and ($parseModeFromNamespace eq 'TBD')) { # if all factors are useless to give us a parse mode # => we use SGML-based DTD validation as a default $File->{Mode} = 'DTD+SGML'; $File->{ModeChoice} = 'Fallback'; # and send warning about the fallback &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} }); return; } elsif ($parseModeFromMimeType ne 'TBD') { # if The mime type gives clear indication of whether the document is XML or not if (($parseModeFromDoctype ne 'TBD') and ($parseModeFromDoctype ne 'HTML5') and ($parseModeFromMimeType ne $parseModeFromDoctype)) { # if document-type recommended mode and content-type recommended mode clash, shoot a warning # unknown doctypes will not trigger this # neither will html5 documents, which can be XML or not &add_warning('W07', { W07_mime => $File->{ContentType}, W07_ct => $parseModeFromMimeType, W07_dtd => $parseModeFromDoctype, }); } # mime type has precedence, we stick to it $File->{ModeChoice} = 'Mime'; if ($parseModeFromDoctype eq "HTML5") { $File->{Mode} = 'HTML5+'.$File->{Mode}; } else { $File->{Mode} = 'DTD+'.$File->{Mode}; } return; } elsif ($parseModeFromDoctype ne 'TBD') { # the mime type is ambiguous (hence we didn't stop at the previous test) # but by now we're sure that the document type is a good indication # so we use that. if ($parseModeFromDoctype eq "HTML5") { if ($parseModeFromXMLDecl eq "XML" or $parseModeFromNamespace eq "XML") { $File->{Mode} = "HTML5+XML"; } else { $File->{Mode} = "HTML5"; } } else { # not HTML5 $File->{Mode} = "DTD+".$parseModeFromDoctype; } $File->{ModeChoice} = 'Doctype'; return; } elsif ($parseModeFromXMLDecl ne 'TBD') { # the mime type is ambiguous (hence we didn't stop at the previous test) # and so was the doctype # but we found an XML declaration # so we use that. if ($File->{Mode} eq "") { $File->{Mode} = "DTD+".$parseModeFromXMLDecl; } elsif ($File->{Mode} =~ /\+/ ) { $File->{Mode} =~ s/\+.*/\+$parseModeFromXMLDecl/; } else { $File->{Mode} = $File->{Mode}."+".$parseModeFromXMLDecl; } $File->{ModeChoice} = 'XMLDecl'; return; } else { # this is the last case. We know that all modes are not TBD, # yet mime type, doctype AND XML DECL tests have failed => we are saved by the presence of namespaces if ($File->{Mode} eq "") { $File->{Mode} = "DTD+".$parseModeFromNamespace; } elsif ($File->{Mode} =~ /\+/ ) { $File->{Mode} =~ s/\+.*/\+$parseModeFromNamespace/; } else { $File->{Mode} = $File->{Mode}."+".$parseModeFromNamespace; } $File->{ModeChoice} = 'Namespace'; } } # # Utility sub to tell if mode "is" XML. sub is_xml {shift->{Mode} =~ /XML/}; # # Check charset conflicts and add any warnings necessary. sub charset_conflicts { my $File = shift; # # Handle the case where there was no charset to be found. unless ($File->{Charset}->{Use}) { &add_warning('W17', {}); $File->{Tentative} |= T_WARN; } # # Add a warning if there was charset info conflict (HTTP header, # XML declaration, or element). # filtering out some of the warnings in direct input mode where HTTP encoding is a "fake" if ((charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) and not ($File->{'Direct Input'})) { &add_warning('W18', { W18_http => $File->{Charset}->{HTTP}, W18_xml => $File->{Charset}->{XML}, W18_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not ($File->{'Direct Input'})) { &add_warning('W19', { W19_http => $File->{Charset}->{HTTP}, W19_meta => $File->{Charset}->{META}, W19_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) { &add_warning('W20', { W20_http => $File->{Charset}->{XML}, W20_xml => $File->{Charset}->{META}, }); $File->{Tentative} |= T_WARN; } return $File; } # # Transcode to UTF-8 sub transcode { my $File = shift; my $general_charset = $File->{Charset}->{Use}; my $exact_charset = $general_charset; # TODO: This should be done before transcode() if ($general_charset eq 'utf-16') { if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { $exact_charset = $File->{Charset}->{Auto}; } else { $exact_charset = 'utf-16be'; } } my $cs = $exact_charset; if ($CFG->{Charsets}->{$cs}) { if ($CFG->{Charsets}->{$cs} =~ /ERR /) { # The encoding is not supported due to policy $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_transcode_error => TRUE); $File->{Templates}->{Error}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{Templates}->{Error}->param(fatal_transcode_errmsg => "This encoding is not supported by the validator."); return $File; } elsif ($CFG->{Charsets}->{$cs} =~ /X /) { # possibly problematic, we recommend another alias my $recommended_charset = $CFG->{Charsets}->{$cs}; $recommended_charset =~ s/X //; &add_warning('W22', { W22_declared => $cs, W22_suggested => $recommended_charset, }); } } # Does the system support decoding this encoding? eval { Encode::decode($cs, ''); }; if ($@) { # This system's Encode installation does not support # the character encoding; might need additional modules $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_transcode_error => TRUE); $File->{Templates}->{Error}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{Templates}->{Error}->param(fatal_transcode_errmsg => "Encoding not supported."); return $File; } elsif (!$CFG->{Charsets}->{$cs}) { # not in the list, but technically OK -> we warn &add_warning('W24', { W24_declared => $cs, }); } my $output; my $input = $File->{Bytes}; # Try to transcode eval { $output = Encode::decode($cs, $input, Encode::FB_CROAK); }; # Transcoding failed if ($@) { my $line_num = 0; foreach my $input_line (split /\r\n|\n|\r/, $input){ $line_num++; eval { Encode::decode($cs, $input_line, Encode::FB_CROAK); }; if ($@) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_byte_error => TRUE); $File->{Templates}->{Error}->param(fatal_byte_lines => $line_num); $File->{Templates}->{Error}->param(fatal_byte_charset => $cs); my $croak_message = $@; $croak_message =~ s/ at .*//; $File->{Templates}->{Error}->param(fatal_byte_error_msg => $croak_message); } } return $File; } # @@FIXME is this what we want? $output =~ s/\015?\012/\n/g; # make sure we deal only with unix newlines # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 $output =~ s/(\r\n|\n|\r)/\n/g; #debug: we could check if the content has utf8 bit on with #$output= utf8::is_utf8($output) ? 1 : 0; $File->{Content} = [split/\n/, $output]; return $File; } sub find_encodings { my $File = shift; my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); if (defined $bom) { # @@FIXME this BOM entry should not be needed at all! $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); $File->{Charset}->{Auto} = lc $bom; } else { $File->{Charset}->{Auto} = lc($first[0]) if @first; } my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); $File->{Charset}->{XML} = lc $xml if defined $xml; my %metah; foreach my $try (@first) { # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok my $meta = HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); $metah{lc($meta)}++ if defined $meta and length $meta; } my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; $File->{Charset}->{META} = $meta[0] if @meta; return $File; } # # Abort with a message if an error was flagged at point. sub abort_if_error_flagged { my $File = shift; my $Flags = shift; return unless $File->{'Error Flagged'}; return if $File->{'Error Handled'}; # Previous error, keep going. $File->{Templates}->{Error}->param(fatal_error => TRUE); if ($File->{Opt}->{Output} eq 'html') { &prep_template($File, $File->{Templates}->{Error}); # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8',$File->{Templates}->{Error}->output); exit; } else { #@@FIXME: This is borked after templatification. # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF."); # A fatal error has occurred while processing the requested document. Processing # has continued but any later output will be of dubious quality. Limitations of # this output mode prevent the full error message from being returned; please # retry this operation in interactive mode using the web interface to see the # actual error message. # .EOF. #@@FIXME; $File->{'Error Handled'} = TRUE; } } # # conflicting encodings sub charset_not_equal { my $encodingA = shift; my $encodingB = shift; return $encodingA && $encodingB && ($encodingA ne $encodingB); } # # Construct a self-referential URL from a CGI.pm $q object. sub self_url_q { my ($q, $File) = @_; my $thispage = $File->{Env}->{'Self URI'} . '?'; # Pass-through parameters for my $param (qw(uri accept accept-language accept-charset)) { $thispage .= "$param=" . uri_escape($q->param($param)) . ';' if $q->param($param); } # Boolean parameters for my $param (qw(ss outline No200 verbose group)) { $thispage .= "$param=1;" if $q->param($param); } # Others if ($q->param('doctype') and not $q->param('doctype') =~ /(Inline|detect)/i) { $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; } if ($q->param('charset') and not $q->param('charset') =~ /detect/i) { $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; } $thispage =~ s/[\?;]$//; return $thispage; } # # Return random tip and its URL. sub get_tip { my @tipAddrs = keys %{$CFG->{Tips}}; my $tipAddr = $tipAddrs[rand scalar @tipAddrs]; my $tipSlug = $CFG->{Tips}->{$tipAddr}; return [$tipAddr, $tipSlug]; } # # Construct a self-referential URL from a $File object. sub self_url_file { my $File = shift; my $thispage = $File->{Env}->{'Self URI'}; my $escaped_uri = uri_escape($File->{URI}); $thispage .= qq(?uri=$escaped_uri); $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; $thispage .= ';outline=1' if $File->{Opt}->{Outline}; $thispage .= ';No200=1' if $File->{Opt}->{No200}; $thispage .= ';verbose=1' if $File->{Opt}->{Verbose}; $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'}) if $File->{Opt}->{'Accept Header'}; $thispage .= ';accept-language=' . uri_escape($File->{Opt}->{'Accept-Language Header'}) if $File->{Opt}->{'Accept-Language Header'}; $thispage .= ';accept-charset=' . uri_escape($File->{Opt}->{'Accept-Charset Header'}) if $File->{Opt}->{'Accept-Charset Header'}; return $thispage; } ##### package W3C::Validator::EventHandler; # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing use constant O_SOURCE => 1; # 0000 0001 use constant O_CHARSET => 2; # 0000 0010 use constant O_DOCTYPE => 4; # 0000 0100 use constant O_NONE => 8; # 0000 1000 sub new { my $class = shift; my $parser = shift; my $File = shift; my $CFG = shift; my $self = { _file => $File, CFG => $CFG, _parser => $parser }; bless $self, $class; } sub start_element { my ($self, $element) = @_; my $has_xmlns = FALSE; my $xmlns_value = undef; # If in XML mode, find namespace used for each element. if (&W3C::Validator::MarkupValidator::is_xml($self->{_file})) { if (my $attr = $element->{Attributes}->{xmlns}) { $xmlns_value = ""; # Try with SAX method if ($attr->{Value}) { $has_xmlns = TRUE; $xmlns_value = $attr->{Value}; } #next if $has_xmlns; # The following is not SAX, but OpenSP specific. my $defaulted = $attr->{Defaulted} || ''; if ($defaulted eq "specified") { $has_xmlns = TRUE; $xmlns_value .= join("", map { $_->{Data} } @{$attr->{CdataChunks}}); } } } my $doctype = $self->{_file}->{DOCTYPE}; if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) || $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name}) { # add to list of non-root namespaces push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; } elsif (!$has_xmlns and $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"}) { # whine if the root xmlns attribute is noted as required by spec, # but not present my $err; my $location = $self->{_parser}->get_location(); $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "no-xmlns"; $err->{type} = "E"; $err->{msg} = "Missing xmlns attribute for element ".$element->{Name} . ". The value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } elsif ($has_xmlns and (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace}) ) { # whine if root xmlns element is not the one specificed by the spec my $err; my $location = $self->{_parser}->get_location(); $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "wrong-xmlns"; $err->{type} = "E"; $err->{msg} = "Wrong xmlns attribute for element $element->{Name}. ". "The value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } } sub error { my $self = shift; my $error = shift; my $mess; eval { $mess = $self->{_parser}->split_message($error); }; if ($@) { # this is a message that S:P:O could not handle, we skip its croaking return; } my $File = $self->{_file}; # TODO: this does not filter out errors in DTDs. my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $mess->{primary_message}{LineNumber}; $err->{char} = $mess->{primary_message}{ColumnNumber}+1; $err->{num} = $mess->{primary_message}{Number}; $err->{type} = $mess->{primary_message}{Severity}; $err->{msg} = $mess->{primary_message}{Text}; $err->{msg} =~ s/"&"/"&"/gsi; # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, # so we filter out a few errors for now my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File); if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { # the error is about a missing xmlns: attribute definition" return; # this is not an error, 'cause we said so } if ($err->{num} eq '187') # filtering out no "document type declaration; will parse without validation" # if root element is not html and mode is xml... { # since parsing was done without validation, result can only be "well-formed" if ($is_xml and lc($File->{Root}) ne 'html') { $File->{XMLWF_ONLY} = TRUE; W3C::Validator::MarkupValidator::add_warning('W09xml', {}); return; # don't report this as an error, just proceed } # if mode is not XML, we do report the error. It should not happen in the case of without doctype, # in that case the error message will be #344 } if (($err->{num} eq '113') and ($err->{msg} =~ /xml:space/)) { # FIXME # this is a problem with some of the "flattened" W3C DTDs, filtering them out to not confuse users. # hoping to get the DTDs fixed, see http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html return; # don't report this, just proceed } if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) { # we are in XML mode, we have a namespace, but no doctype. # the validator will already have said "no doctype, falling back to default" above # no need to report this. return; # don't report this, just proceed } if (($err->{num} eq '248') or ($err->{num} eq '247') or ($err->{num} eq '246')) { # these two errors should be triggered by -wmin-tag to report shorttag used, # but we're making them warnings, not errors # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 $err->{type} = "W"; } # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors # beyond EOL. If you see this warning in your web server logs, please # let the validator developers know, see http://validator.w3.org/feedback.html # As long as $err may be from somewhere else than the document (such as # from a DTD) and we have no way of identifying these cases, this # produces bogus results and error log spewage, so commented out for now. # if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { # warn("Warning: reported error column larger than line length " . # "($err->{char} > $l) in $File->{URI} line $err->{line}, " . # "OpenSP bug? Resetting to line length."); # $err->{char} = $l; # } # No or unknown FPI and a relative SI. if ($err->{msg} =~ m(cannot (open|find))) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_parse_extid_error => TRUE); $File->{Templates}->{Error}->param(fatal_parse_extid_msg => $err->{msg}); } # No DOCTYPE found! We are falling back to vanilla DTD if ($err->{msg} =~ m(prolog can\'t be omitted)) { if (lc($File->{Root}) eq 'html') { my $dtd = ($is_xml ? $File->{"Default DOCTYPE"}->{"XHTML"} : $File->{"Default DOCTYPE"}->{"HTML"} ); W3C::Validator::MarkupValidator::add_warning('W09', {W09_dtd => $dtd}); } else { # not html root element, we are not using fallback unless ($is_xml) { $File->{'Is Valid'} = FALSE; W3C::Validator::MarkupValidator::add_warning('W09nohtml', {}); } } return; # Don't report this as a normal error. } # TODO: calling exit() here is probably a bad idea W3C::Validator::MarkupValidator::abort_if_error_flagged($File, O_DOCTYPE); push @{$File->{Errors}}, $err; # ... $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; if (defined $mess->{aux_message}) { # "duplicate id ... first defined here" style messages push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, char => $mess->{aux_message}{ColumnNumber}+1, msg => $mess->{aux_message}{Text}, type => 'I', }; } } package W3C::Validator::EventHandler::Outliner; # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing use constant O_SOURCE => 1; # 0000 0001 use constant O_CHARSET => 2; # 0000 0010 use constant O_DOCTYPE => 4; # 0000 0100 use constant O_NONE => 8; # 0000 1000 use base qw(W3C::Validator::EventHandler); sub new { my $class = shift; my $parser = shift; my $File = shift; my $CFG = shift; my $self = $class->SUPER::new($parser, $File, $CFG); $self->{am_in_heading} = 0; $self->{heading_text} = []; bless $self, $class; } sub data { my ($self, $chars) = @_; push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading}; } sub start_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h([1-6])$/i) { $self->{_file}->{heading_outline} ||= ""; $self->{_file}->{heading_outline} .= " " x int($1) . "[$element->{Name}] "; $self->{am_in_heading} = 1; } return $self->SUPER::start_element($element); } sub end_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h[1-6]$/i) { my $text = join("", @{$self->{heading_text}}); $text =~ s/^\s+//g; $text =~ s/\s+/ /g; $text =~ s/\s+$//g; $self->{_file}->{heading_outline} .= "$text\n"; $self->{am_in_heading} = 0; $self->{heading_text} = []; } } ##### package W3C::Validator::UserAgent; use HTTP::Message qw(); use LWP::UserAgent 2.032 qw(); # Need 2.032 for default_header() use Net::hostent qw(gethostbyname); use Net::IP qw(); use Socket qw(inet_ntoa); use base qw(LWP::UserAgent); BEGIN { # The 4k default line length in LWP <= 5.832 isn't enough for example to # accommodate 4kB cookies (RFC 2985); bump it (#6678). require LWP::Protocol::http; push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8*1024); } sub new { my ($proto, $CFG, $File, @rest) = @_; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@rest); $self->{'W3C::Validator::CFG'} = $CFG; $self->{'W3C::Validator::File'} = $File; $self->env_proxy(); $self->agent($File->{Opt}->{'User Agent'}); $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); # Don't parse the http-equiv stuff. $self->parse_head(0); # Tell caches in the middle we want a fresh copy (Bug 4998). $self->default_header('Cache-Control' => 'max-age=0'); # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle $self->default_header('Accept-Encoding' => scalar HTTP::Message::decodable()) if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable')); return $self; } sub redirect_ok { my ($self, $req, $res) = @_; return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); } sub uri_ok { my ($self, $uri) = @_; return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or !$uri->can('host')); my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5}; if ($h5uri) { my $clone = $uri->clone(); $clone->query(undef); $clone->fragment(undef); $h5uri = URI->new($h5uri); $h5uri->query(undef); $h5uri->fragment(undef); return 1 if $clone->eq($h5uri); } my $addr = my $iptype = undef; if (my $host = gethostbyname($uri->host())) { $addr = inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; $File->{Templates}->{Error}->param(fatal_ip_error => 1); $File->{Templates}->{Error}->param(fatal_ip_hostname => 1) if $addr and $uri->host() ne $addr; $File->{Templates}->{Error}->param(fatal_ip_host => ($uri->host() || 'undefined')); return 0; } return 1; } # Local Variables: # mode: perl # indent-tabs-mode: nil # tab-width: 2 # perl-indent-level: 2 # End: # ex: ts=2 sw=2 et