#!/usr/bin/perl -T # # W3C Markup Validation Service # A CGI script to retrieve and validate a markup file # # Copyright 1995-2006 World Wide Web Consortium, (Massachusetts # Institute of Technology, European Research Consortium for Informatics # and Mathematics, Keio University). All Rights Reserved. # # Originally written by Gerald Oskoboiny # for additional contributors, see http://dev.w3.org/cvsweb/validator/ # and http://validator.w3.org/about.html#credits # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # $Id: check,v 1.487 2007-03-23 03:52:21 ot Exp $ # # Disable buffering on STDOUT! $| = 1; # # We need Perl 5.8.0+. use 5.008; ############################################################################### #### Load modules. ############################################################ ############################################################################### # # Pragmas. use strict; use warnings; # # Modules. See also the BEGIN block further down below. # # Version numbers given where we absolutely need a minimum version of a given # module (gives nicer error messages). By default, add an empty import list # when loading modules to prevent non-OO or poorly written modules from # polluting our namespace. # use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect); use CGI::Carp qw(carp croak fatalsToBrowser); use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder use File::Spec qw(); use HTML::Parser 3.25 qw(); # Need 3.25 for $p->ignore_elements. use HTML::Template 2.6 qw(); use HTTP::Request qw(); use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. use URI qw(); use URI::Escape qw(uri_escape); use Encode qw(); use Encode::Alias qw(); use HTML::Encoding 0.52 qw(); use SGML::Parser::OpenSP qw(); ############################################################################### #### Constant definitions. #################################################### ############################################################################### # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing use constant O_SOURCE => 1; # 0000 0001 use constant O_CHARSET => 2; # 0000 0010 use constant O_DOCTYPE => 4; # 0000 0100 use constant O_NONE => 8; # 0000 1000 # # Define global variables. use vars qw($DEBUG $CFG $RSRC $VERSION); # # Things inside BEGIN don't happen on every request in persistent environments # (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { # Launder data for -T; -AutoLaunder doesn't catch this one. if (exists $ENV{W3C_VALIDATOR_HOME}) { $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; $ENV{W3C_VALIDATOR_HOME} = $1; } # # Read Config Files. eval { my %config_opts = ( -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), -MergeDuplicateOptions => TRUE, -MergeDuplicateBlocks => TRUE, -SplitPolicy => 'equalsign', -UseApacheInclude => TRUE, -IncludeRelative => TRUE, -InterPolateVars => TRUE, -AutoLaunder => TRUE, -AutoTrue => TRUE, -DefaultConfig => { Protocols => {Allow => 'http,https'}, Paths => { Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), }, }, ); my %cfg = Config::General->new(%config_opts)->getall(); $CFG = \%cfg; }; if ($@) { die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. } # # Check a filesystem path for existance and "readability". sub pathcheck (@) { my %paths = map { $_ => [-d $_, -r _] } @_; my @_d = grep {not $paths{$_}->[0]} keys %paths; my @_r = grep {not $paths{$_}->[1]} keys %paths; return TRUE if (scalar(@_d) + scalar(@_r) == 0); die <<".EOF." if scalar @_d; Does not exist or is not a directory: @_d .EOF. die <<".EOF." if scalar @_r; Directory not readable (permission denied): @_r .EOF. } # # Check paths in config... # @@FIXME: This does not do a very good job error-message-wise if a path is # @@FIXME: missing...; { my @dirs = (); push @dirs, $CFG->{Paths}->{Base}; push @dirs, $CFG->{Paths}->{Templates}; push @dirs, $CFG->{Paths}->{SGML}->{Library}; &pathcheck(@dirs); } # # Split allowed protocols into a list. if (my $allowed = delete($CFG->{Protocols}->{Allow})) { $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; } { # Make types config indexed by FPI. my $_types = {}; map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} } keys %{$CFG->{Types}}; $CFG->{Types} = $_types; } # # Change strings to internal constants in MIME type mapping. for (keys %{$CFG->{MIME}}) { $CFG->{MIME}->{$_} = 'TBD' unless $CFG->{MIME}->{$_} eq 'SGML' or $CFG->{MIME}->{$_} eq 'XML'; } # # Set debug flag. if ($CFG->{'Allow Debug'} == TRUE) { $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; } else { $DEBUG = FALSE; } # # Strings $VERSION = q$Revision: 1.487 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # # Use passive FTP by default. $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); # Read friendly error message file my %rsrc = Config::General->new( -MergeDuplicateBlocks => 1, -ConfigFile => File::Spec->catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), )->getall(); # 'en_US' should be replaced by $lang for lang-neg # Config::General workarounds for issues: # http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0022.html # http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0025.html # https://rt.cpan.org/Public/Bug/Display.html?id=17852 $rsrc{msg}{0} ||= delete($rsrc{'msg 0'}) || # < 2.31 { original => delete($rsrc{msg}{original}), # 2.31 verbose => delete($rsrc{msg}{verbose}), }; $RSRC = \%rsrc; } # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; #@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. #use Data::Dumper qw(Dumper); #print Dumper($CFG); #exit; #@@DEBUG; ############################################################################### #### Process CGI variables and initialize. #################################### ############################################################################### # # Create a new CGI object. my $q = new CGI; # # The data structure that will hold all session data. # @@FIXME This can't be my() as $File will sooner or # later be undef and add_warning will cause the script # to die. our() seems to work but has other problems. # @@FIXME Apparently, this must be set to {} also, # otherwise the script might pick up an old object # after abort_if_error_flagged under mod_perl. our $File = {}; ################################# # Initialize the datastructure. # ################################# # # Charset data (casing policy: lowercase early). $File->{Charset}->{Use} = ''; # The charset used for validation. $File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F) $File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter. $File->{Charset}->{META} = ''; # From HTML's . $File->{Charset}->{XML} = ''; # From the XML Declaration. $File->{Charset}->{Override} = ''; # From CGI/user override. # # Misc simple types. $File->{Mode} = 'SGML'; # Default parse mode is SGML. # # Listrefs. $File->{Warnings} = []; # Warnings... $File->{Namespaces} = []; # Other (non-root) Namespaces. ######################################### # Populate $File->{Opt} -- CGI Options. # ######################################### # # Preprocess the CGI parameters. $q = &prepCGI($File, $q); ############################################################################### #### Generate Template for Result. ############################################ ############################################################################### # first we determine the chosen language based on # 1) lang argument given as parameter (if this language is available) # 2) HTTP language negotiation between variants available and user-agent choices # 3) English by default my $lang = $q->param('lang') ? $q->param('lang') : ''; my @localizations; my $lang_ok = FALSE; foreach my $lang_available ( split(" ", $CFG->{Languages}) ) { if ($lang eq $lang_available){ $lang_ok = TRUE; next } } if (($lang eq '') or (!$lang_ok)) { # use HTTP-based negotiation $lang = ''; use HTTP::Negotiate qw(choose); foreach my $lang_available ( split(" ", $CFG->{Languages}) ) { push @localizations, [$lang_available, 1.000, 'text/html', undef, 'utf-8', $lang_available , undef] } $lang = choose(\@localizations); } if ($lang eq "en") { $lang = 'en_US'; # legacy } my $T = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'result.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $E = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'fatal-error.tmpl'), die_on_bad_params => FALSE, cache => TRUE, ); my $H = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'http_401_authrequired.tmpl'), die_on_bad_params => FALSE, cache => TRUE, ); # templates for alternate output formats my $XMLT = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'xml_output.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $SOAPT = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_output.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $UCNT = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'ucn_output.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $SOAPFT = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_fault.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $SOAPDIS = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_disabled.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, ); my $EARLT = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'earl_xml.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, global_vars => TRUE, ); my $N3T = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'earl_n3.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, cache => TRUE, global_vars => TRUE, ); $File->{T} = $T; $File->{S} = $SOAPT; $File->{E} = $E; $File->{H} = $H; $T->param(cfg_home_page => $CFG->{'Home Page'}); $SOAPT->param(cfg_home_page => $CFG->{'Home Page'}); undef $lang; ##################################################### # Populate $File->{Opt} -- CGI Options. (continued) # ##################################################### # # Set session switches. $File->{Opt}->{'Outline'} = $q->param('outline') ? TRUE : FALSE; $File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; $File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; $File->{Opt}->{'Show Parsetree'} = $q->param('sp') ? TRUE : FALSE; $File->{Opt}->{'No Attributes'} = $q->param('noatt') ? TRUE : FALSE; $File->{Opt}->{'Show ESIS'} = $q->param('esis') ? TRUE : FALSE; $File->{Opt}->{'Show Errors'} = $q->param('errors') ? TRUE : FALSE; $File->{Opt}->{'Verbose'} = $q->param('verbose') ? TRUE : FALSE; $File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; $File->{Opt}->{'Debug'} = $q->param('debug') ? TRUE : FALSE; $File->{Opt}->{'No200'} = $q->param('No200') ? TRUE : FALSE; $File->{Opt}->{'Charset'} = $q->param('charset') ? lc $q->param('charset'): ''; $File->{Opt}->{'DOCTYPE'} = $q->param('doctype') ? $q->param('doctype') : ''; $File->{Opt}->{'Output'} = $q->param('output') ? $q->param('output') : 'html'; $File->{Opt}->{'Max Errors'} = $q->param('me') ? $q->param('me') : ''; # # "Fallback" info for Character Encoding (fbc), Content-Type (fbt), # and DOCTYPE (fbd). If TRUE, the Override values are treated as # Fallbacks instead of Overrides. $File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE; $File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE; $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; # # If ";debug" was given, let it overrule the value from the config file, # regardless of whether it's "0" or "1" (on or off), but only if config # allows the debugging options. if ($CFG->{'Allow Debug'}) { $DEBUG = $q->param('debug') if defined $q->param('debug'); $File->{Opt}->{Verbose} = TRUE if $DEBUG; } else { $DEBUG = FALSE; # The default. } &abort_if_error_flagged($File, O_NONE); # # Get the file and metadata. if ($q->param('uploaded_file')) { $File = &handle_file($q, $File); } elsif ($q->param('fragment')) { $File = &handle_frag($q, $File); } elsif ($q->param('uri')) { $File = &handle_uri($q, $File); } # # Abort if an error was flagged during initialization. &abort_if_error_flagged($File, 0); # # Get rid of the CGI object. undef $q; # # We don't need STDIN any more, so get rid of it to avoid getting clobbered # by Apache::Registry's idiotic interference under mod_perl. untie *STDIN; ############################################################################### #### Output validation results. ############################################### ############################################################################### $File = find_encodings($File); # # Decide on a charset to use (first part) # if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; } elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) { # Act as if $http_charset was 'us-ascii'. (MIME rules) $File->{Charset}->{Use} = 'us-ascii'; &add_warning('W01', { W01_upload => $File->{'Is Upload'}, W01_agent => $File->{Server}, W01_ct => $File->{ContentType}, }); } elsif ($File->{Charset}->{XML}) { $File->{Charset}->{Use} = $File->{Charset}->{XML}; } elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) { $File->{Charset}->{Use} = 'utf-16'; } elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) { $File->{Charset}->{Use} = "utf-8"; } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) } unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } # # Handle any Fallback or Override for the charset. if ( charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)') and charset_not_equal($File->{Opt}->{Charset}, '') ) { # charset=foo was given to the CGI and it wasn't "autodetect" or empty. # # Extract the user-requested charset from CGI param. my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); $File->{Charset}->{Override} = lc($override); if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode unless ($File->{Charset}->{Use}) { &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } else { # charset "hard override" mode if (! $File->{Charset}->{Use}) { # overriding "nothing" &add_warning('W04', {W04_charset => $File->{Charset}->{Override}, W04_override => TRUE}); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } else { #actually overriding something # Warn about Override unless it's the same as the real charset... unless ($File->{Charset}->{Override} eq $File->{Charset}->{Use}) { &add_warning('W03', { W03_use => $File->{Charset}->{Use}, W03_opt => $File->{Charset}->{Override}, }); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } } } unless ($File->{Charset}->{Use}) { # No charset given... &add_warning('W04', {W04_charset => 'UTF-8'}); $File->{Tentative} |= T_ERROR; # Can never be valid. $File->{Charset}->{Use} = 'utf-8'; } # # Abort if an error was flagged while finding the encoding. &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE); # # Encode alias definitions. This might not be the best # place for them, feel free to move them elsewhere. # implicit bidi, but character encoding is the same Encode::Alias::define_alias('iso-8859-6-i', 'iso-8859-6'); # implicit bidi, but character encoding is the same Encode::Alias::define_alias('iso-8859-8-i', 'iso-8859-8'); # 0xA0 is U+00A0 in ISO-8859-11 but undefined in tis-620 # other than that the character encodings are equivalent Encode::Alias::define_alias('tis-620', 'iso-8859-11'); # Encode::Byte does not know 'macintosh' but MacRoman Encode::Alias::define_alias('macintosh', 'MacRoman'); # x-mac-roman is the non-standard version of 'macintosh' Encode::Alias::define_alias('x-mac-roman', 'MacRoman'); # Encode only knows the long hand version of 'ksc_5601' Encode::Alias::define_alias('ksc_5601', 'KS_C_5601-1987'); # gb18030 requires Encode::HanExtra but no additional alias # # Always transcode, even if the content claims to be UTF-8 $File = transcode($File); &abort_if_error_flagged($File, O_CHARSET); # # Add a warning if doc is UTF-8 and contains a BOM. if ($File->{Charset}->{Use} eq 'utf-8' && $File->{Content}->[0] =~ m(^\x{FEFF})) { &add_warning('W21', {}); } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element # 4. if there is an xmlns= attribute, check for XML well-formedness # 5. if there is no xmlns= attribute, and no DOCTYPE, punt. # # # Override DOCTYPE if user asked for it. if ($File->{Opt}->{DOCTYPE} and not $File->{Opt}->{DOCTYPE} =~ /(Inline|detect)/i and $File->{Opt}->{DOCTYPE} ne '1' ) { $File = &override_doctype($File); } # # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); # # Set parse mode. set_parse_mode($File, $CFG) if $File->{DOCTYPE}; # # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); # before we start the parsing, clean slate $File->{'Is Valid'} = TRUE; $File->{Errors} = []; # preparse with XML parser if necessary # we should really be using a SAX ErrorHandler, but I can't find # a way to make it work with XML::LibXML::SAX::Parser... ** FIXME ** # ditto, we should try using W3C::Validator::SAXHandler, # but it's badly linked to opensp at the moment if (&is_xml($File)) { use XML::LibXML; my $xmlparser = XML::LibXML->new(); $xmlparser->line_numbers(1); eval { $xmlparser->parse_string(join"\n",@{$File->{Content}}); }; my $xml_parse_errors_line = undef; my @xmlwf_error_list; if ($@) { my $xmlwf_errors = $@; my $xmlwf_error_line = undef; my $xmlwf_error_col = undef; my $xmlwf_error_msg = undef; my $num_xmlwf_error = 0; my $last_err_msg = undef; my $err; foreach my $msg_line (split "\n", $xmlwf_errors){ $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; if ($msg_line =~ /(:\d+:)(.*)/ ){ $xmlwf_error_line = $1; $xmlwf_error_msg = $2; $xmlwf_error_line =~ s/:(\d+):/$1/; $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; } if ($msg_line =~ /(.+)\^/){ $xmlwf_error_col = length($1); } if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){ $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $xmlwf_error_line; $err->{char} = $xmlwf_error_col; $err->{num} = 'xmlwf'; $err->{type} = "E"; $err->{msg} = $xmlwf_error_msg; # ... $last_err_msg = $err; push (@xmlwf_error_list, $err); $err = undef; $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; $num_xmlwf_error++; } } foreach my $errmsg (@xmlwf_error_list){ $File->{'Is Valid'} = FALSE; push @{$File->{Errors}}, $errmsg; } } } # # Abandon all hope ye who enter here... $File = &parse($File); sub parse (\$) { my $File = shift; # TODO switch parser on the fly my $opensp = SGML::Parser::OpenSP->new(); my $parser_name = "SGML::Parser::OpenSP"; # # By default, use SGML catalog file and SGML Declaration. my $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); # default parsing options my @spopt = qw(valid non-sgml-char-ref no-duplicate); # # Switch to XML semantics if file is XML. if (&is_xml($File)) { $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); push(@spopt, 'xml'); # workaround for a bug in s:p:o 0.99 # see http://www.w3.org/Bugs/Public/show_bug.cgi?id=798#c5 push(@spopt, 'xml'); # FIXME when fixed s:p:o gets released } # # Parser configuration $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); $opensp->catalogs($catalog); $opensp->show_error_numbers(1); $opensp->warnings(@spopt); # # Restricted file reading is disabled on Win32 for the time # beeing since neither SGML::Parser::OpenSP nor check auto- # magically set search_dirs to include the temp directory # so restricted file reading would defunct the Validator. $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; # # Set debug info for HTML report. $T->param(opt_debug => $DEBUG); $T->param(debug => [ map({name => $_, value => $ENV{$_}}, qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), { name => 'Content-Encoding', value => $File->{ContentEnc} }, { name => 'Content-Language', value => $File->{ContentLang} }, { name => 'Content-Location', value => $File->{ContentLoc} }, { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, { name => 'Parse Mode', value => $File->{Mode} }, { name => 'Parser', value => $parser_name }, { name => 'Parser Options', value => join " ", @spopt }, ], ); my $h = W3C::Validator::SAXHandler->new($opensp, $File); $opensp->handler($h); $opensp->parse_string(join"\n",@{$File->{Content}}); # Make sure there are no circular references, otherwise the script # would leak memory until mod_perl unloads it which could take some # time. @@FIXME It's probably overly careful though. $opensp->handler(undef); undef $h->{_parser}; undef $h->{_file}; undef $h; undef $opensp; # # Set Version to be the FPI initially. $File->{Version} = $File->{DOCTYPE}; return $File; } # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. if (&is_xml($File) and not $File->{DOCTYPE}) { $File->{Version} = 'XML'; } else { $File->{Version} = $File->{DOCTYPE} unless $File->{Version}; } # # Get the pretty text version of the FPI if a mapping exists. if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { $File->{Version} = $prettyver; } # # check the received mime type against Allowed mime types if ($File->{ContentType}){ my @allowedMediaType = split(" ", $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed}); my $usedCTisAllowed; if (scalar @allowedMediaType) { $usedCTisAllowed = FALSE; foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); } } else { # wedon't know what media type is recommended, so better shut up $usedCTisAllowed = TRUE; } if(! $usedCTisAllowed ){ &add_warning('W23', { W23_type => $File->{ContentType}, W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, w23_doctype => $File->{Version} }); } } # # Warn about unknown, incorrect, or missing Namespaces. if ($File->{Namespace}) { my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; if (&is_xml($File)) { if ($ns eq $File->{Namespace}) { &add_warning('W10', { W10_ns => $File->{Namespace}, W10_type => $File->{Type}, }); } } else { &add_warning('W11', {W11_ns => $File->{Namespace}}); } } else { if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { &add_warning('W12', {}); } } ## if invalid content, pass through tidy if (! $File->{'Is Valid'}) { eval { local $SIG{__DIE__}; require HTML::Tidy; my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); $File->{'Tidy'} = $tidy->clean(join"\n",@{$File->{Content}}); $File->{'Tidy_OK'} = TRUE; }; if ($@) { $File->{'Tidy_OK'} = FALSE; } } my $template; if ($File->{Opt}->{Output} eq 'xml') { $template = $XMLT; } elsif ($File->{Opt}->{Output} eq 'earl') { $template = $EARLT; } elsif ($File->{Opt}->{Output} eq 'n3') { $template = $N3T; } elsif ($File->{Opt}->{Output} eq 'ucn') { $template = $UCNT; } elsif ($File->{Opt}->{Output} eq 'soap12') { if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation... print CGI::header(-status => 503, -content_language => "en", -type => "text/html", -charset => "utf-8" ); $template = $SOAPDIS; } elsif ($File->{'Error Flagged'}) { # should send SOAP fault message $template = $SOAPFT; } else { $template = $SOAPT; } } else { $template = $T; } &prep_template($File, $template); fin_template($File, $template); $template->param(file_warnings => $File->{Warnings}); $template->param(tidy_output => $File->{'Tidy'}); $template->param(file_source => &source($File)) if $template->param('opt_show_source'); #$template->param('opt_show_esis' => TRUE) # if $File->{Opt}->{'Show ESIS'}; #$template->param('opt_show_raw_errors' => TRUE) # if $File->{Opt}->{'Show Errors'}; #$template->param('file_raw_errors' => &show_errors($File)) # if $template->param('opt_show_raw_errors'); # $T->param(file_outline => &outline($File)) if $T->param('opt_show_outline'); print $template->output; # # Get rid of $File object and exit. undef $File; exit; ############################################################################# # Subroutine definitions ############################################################################# # # Generate HTML report. sub prep_template ($$) { my $File = shift; my $T = shift; # # XML mode... $T->param(is_xml => &is_xml($File)); # # Upload? $T->param(is_upload => $File->{'Is Upload'}); # # The URI... $T->param(file_uri => $File->{URI}); $T->param(file_uri_param => uri_escape($File->{URI})); # # Set URL for page title. $T->param(page_title_url => $File->{URI}); # # Metadata... $T->param(file_modified => $File->{Modified}); $T->param(file_server => $File->{Server}); $T->param(file_size => $File->{Size}); $T->param(file_contenttype => $File->{ContentType}); $T->param(file_charset => $File->{Charset}->{Use}); $T->param(file_doctype => $File->{DOCTYPE}); # # Output options... $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); $T->param(opt_show_outline => $File->{Opt}->{'Outline'}); $T->param(opt_show_parsetree => $File->{Opt}->{'Show Parsetree'}); $T->param(opt_show_noatt => $File->{Opt}->{'No Attributes'}); $T->param(opt_verbose => $File->{Opt}->{'Verbose'}); $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); $T->param(opt_no200 => $File->{Opt}->{'No200'}); # # Tip of the Day... my $tip = &get_tip(); $T->param(tip_uri => $tip->[0]); $T->param(tip_slug => $tip->[1]); # # Namespaces... $T->param(file_namespace => $File->{Namespace}); my %seen_ns = (); my @bulk_ns = @{$File->{Namespaces}}; $File->{Namespaces} = []; # reinitialize the list of non-root namespaces # ... and then get a uniq version of it foreach my $single_namespace (@bulk_ns) { push(@{$File->{Namespaces}}, $single_namespace) unless $seen_ns{$single_namespace}++; } my @nss = map({uri => $_}, @{$File->{Namespaces}}); $T->param(file_namespaces => \@nss) if @nss; if ($File->{Opt}->{DOCTYPE}) { my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; $T->param($over_doctype_param => TRUE); } if ($File->{Opt}->{Charset}) { my $over_charset_param = "override charset $File->{Opt}->{Charset}"; $T->param($over_charset_param => TRUE); } } sub fin_template ($$) { my $File = shift; my $T = shift; if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) { # @@TODO@@ we should try falling back on other version # info, such as the ones stored in Version_ESIS $T->param(file_version => '(no Doctype found)'); } else { $T->param(file_version => $File->{Version}); } my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File); if ($num_errors+$num_warnings > 0) { $T->param(has_errors => 1); } $T->param(valid_errors_num => $num_errors); $num_warnings += scalar @{$File->{Warnings}}; $T->param(valid_warnings_num => $num_warnings); my $number_of_errors = ""; # textual form of $num_errors my $number_of_warnings = ""; # textual form of $num_errors # The following is a bit hack-ish, but will enable us to have some logic # for a human-readable display of the number, with cases for 0, 1, 2 and above # (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above) if ($num_errors > 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => FALSE); if ($num_errors eq 2) { $T->param(number_of_errors_is_2 => TRUE); } else { $T->param(number_of_errors_is_2 => FALSE ); } $T->param(number_of_errors_is_plural => TRUE ); } elsif ($num_errors eq 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => TRUE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } else { # 0 $T->param(number_of_errors_is_0 => TRUE ); $T->param(number_of_errors_is_1 => FALSE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } if ($num_warnings > 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => FALSE); if ($num_warnings eq 2) { $T->param(number_of_warnings_is_2 => TRUE); } else { $T->param(number_of_warnings_is_2 => FALSE ); } $T->param(number_of_warnings_is_plural => TRUE ); } elsif ($num_warnings eq 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => TRUE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } else { # 0 $T->param(number_of_warnings_is_0 => TRUE ); $T->param(number_of_warnings_is_1 => FALSE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } $T->param(file_errors => $reported_errors); if ($File->{'Is Valid'}) { $T->param(VALID => TRUE); $T->param(valid_status => 'Valid'); &report_valid($File, $T); } else { $T->param(VALID => FALSE); $T->param(valid_status => 'Invalid'); } } # # Output "This page is Valid" report. sub report_valid { my $File = shift; my $T = shift; unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; $T->param(have_badge => TRUE); $T->param(badge_uri => $cfg->{Badge}->{URI}); if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'ALT URI'}) { $T->param(badge_alt_uri => $cfg->{Badge}->{'ALT URI'}); $T->param(have_alt_badge => TRUE); } $T->param(badge_alt => $cfg->{Badge}->{Alt}); $T->param(badge_h => $cfg->{Badge}->{Height}); $T->param(badge_w => $cfg->{Badge}->{Width}); $T->param(badge_tagc => ($cfg->{'Parse Mode'} eq 'XML' ? ' /' : '')); } } elsif (defined $File->{Tentative}) { $T->param(is_tentative => TRUE); } if ($File->{Opt}->{'Outline'}) { $T->param(file_outline => $File->{heading_outline}); } my $thispage = self_url_file($File); $T->param(file_thispage => $thispage); } # # Add a waring message to the output. sub add_warning ($$) { my $WID = shift; my $params = shift; push @{$File->{Warnings}}, $WID; $File->{T}->param($WID => TRUE, %{$params}); $File->{T}->param(have_warnings => TRUE); $File->{E}->param($WID => TRUE, %{$params}); $File->{E}->param(have_warnings => TRUE); $File->{S}->param($WID => TRUE, %{$params}); $File->{S}->param(have_warnings => TRUE); } # # Proxy authentication requests. # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). sub authenticate { my $File = shift; my $resource = shift; my $authHeader = shift || {}; my $realm = $resource; $realm =~ s([^\w\d.-]*){}g; for my $scheme (keys(%$authHeader)) { my $origrealm = $authHeader->{$scheme}->{realm}; if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) { delete($authHeader->{$scheme}); next; } $authHeader->{$scheme}->{realm} = "$realm-$origrealm"; } my $headers = HTTP::Headers->new(Connection => 'close'); $headers->www_authenticate(%$authHeader); $headers = $headers->as_string(); chomp($headers); $File->{H}->param(http_401_headers => $headers); $File->{H}->param(http_401_url => $resource); print $File->{H}->output; exit; # Further interaction will be a new HTTP request. } # # Fetch an URL and return the content and selected meta-info. sub handle_uri { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical(); $uri->fragment(undef); my $ua = new W3C::Validator::UserAgent ($CFG, $File); $ua->env_proxy(); $ua->agent("W3C_Validator/$VERSION"); $ua->parse_head(0); # Don't parse the http-equiv stuff. $ua->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); unless ($ua->is_protocol_supported($uri)) { $File->{'Error Flagged'} = TRUE; if (($uri->canonical() eq "1") ) #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI() { $File->{E}->param(fatal_no_content => TRUE); } else { $File->{E}->param(fatal_uri_error => TRUE); $File->{E}->param(fatal_uri_scheme => $uri->scheme()); } return $File; } return $File unless $ua->uri_ok($uri); my $req = new HTTP::Request(GET => $uri); # If we got a Authorization header, the client is back at it after being # prompted for a password so we insert the header as is in the request. if($ENV{HTTP_AUTHORIZATION}){ $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } my $res = $ua->request($req); return $File if $File->{'Error Flagged'}; # Redirect IP rejected? unless ($res->code == 200 or $File->{Opt}->{'No200'}) { if ($res->code == 401) { my %auth = $res->www_authenticate(); # HTTP::Headers::Auth &authenticate($File, $res->request->uri, \%auth); } else { $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_http_error => TRUE); $File->{E}->param(fatal_http_uri => $uri->as_string); $File->{E}->param(fatal_http_code => $res->code); $File->{E}->param(fatal_http_msg => $res->message); $File->{E}->param(fatal_http_dns => TRUE) if $res->code == 500; } return $File; } # # Enforce Max Recursion level. &check_recursion($File, $res); my ($mode, $ct, $charset) = &parse_content_type( $File, $res->header('Content-Type'), scalar($res->request->uri), ); my $lastmod = undef; if ( $res->last_modified ) { $lastmod = scalar(gmtime($res->last_modified)); } my $content = $res->can('decoded_content') ? $res->decoded_content(charset => 'none') : $res->content; $File->{Bytes} = $content; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{ContentEnc} = $res->content_encoding; $File->{ContentLang} = $res->content_language; $File->{ContentLoc} = $res->header('Content-Location'); $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $lastmod; $File->{Server} = scalar $res->server; # TODO: Content-Length is not always set, so either this should # be renamed to 'Content-Length' or it should consider more than # the Content-Length header. $File->{Size} = scalar $res->content_length; $File->{URI} = scalar $res->request->uri->canonical; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_file { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $f = $q->param('uploaded_file'); my $h = $q->uploadInfo($f); my $file; local $/ = undef; # set line delimiter so that <> reads rest of file $file = <$f>; my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); $File->{Bytes} = $file; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $q->http('Last-Modified'); $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) $File->{Size} = $q->http('Content-Length'); $File->{URI} = "$f"; $File->{'Is Upload'} = TRUE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_frag { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. $File->{Bytes} = $q->param('fragment'); $File->{Mode} = 'TBD'; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; $File->{ContentType} = ''; # @@TODO? $File->{URI} = 'upload://Form Submission'; $File->{'Is Upload'} = TRUE; $File->{'Direct Input'} = TRUE; $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars return $File; } # # Parse a Content-Type and parameters. Return document type and charset. sub parse_content_type { my $File = shift; my $Content_Type = shift; my $url = shift; my $charset = ''; my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; my $mode = $CFG->{MIME}->{$ct} || $ct; $charset = HTML::Encoding::encoding_from_content_type($Content_Type); if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here. if ($ct =~ m(text/css) and defined $url) { print redirect 'http://jigsaw.w3.org/css-validator/validator?uri=' . uri_escape $url; exit; } else { $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_mime_error => TRUE); $File->{E}->param(fatal_mime_ct => $ct); } } return $mode, $ct, $charset; } # # Check recursion level and enforce Max Recursion limit. sub check_recursion ($$) { my $File = shift; my $res = shift; # Not looking at our own output. return unless defined $res->header('X-W3C-Validator-Recursion'); my $lvl = $res->header('X-W3C-Validator-Recursion'); return unless $lvl =~ m(^\d+$); # Non-digit, i.e. garbage, ignore. if ($lvl >= $CFG->{'Max Recursion'}) { print redirect $CFG->{'Home Page'}; } else { $T->param(depth => $lvl++); # Increase recursion level in output. } } # # Return $_[0] encoded for HTML entities (cribbed from merlyn). # # Note that this is used both for HTML and XML escaping. # sub ent { local $_ = shift; return '' unless defined; # Eliminate warnings # TODO: Err, why have " twice in the character class? ' maybe? s(["<&>"]){'&#' . ord($&) . ';'}ge; # should switch to hex sooner or later return $_; } # # Truncate source lines for report. # sub truncate_line { my $line = shift; my $col = shift; my $start = $col; my $end = $col; for (1..40) { $start-- if ($start - 1 >= 0); # in/de-crement until... $end++ if ($end + 1 <= length $line); # ...we hit end of line. } unless ($end - $start == 80) { if ($start == 0) { # Hit start of line, maybe grab more at end. my $diff = 40 - $col; for (1..$diff) { $end++ if ($end + 1 <= length $line); } } elsif ($end == length $line) { # Hit end of line, maybe grab more at beginning. my $diff = 80 - $col; for (1..$diff) { $start-- if ($start - 1 >= 0); } } } # # Add elipsis at end if necessary. unless ($end == length $line) {substr $line, -3, 3, '...'}; $col = $col - $start; # New offset is diff from $col to $start. $line = substr $line, $start, $end - $start; # Truncate. # # Add elipsis at start if necessary. unless ($start == 0) {substr $line, 0, 3, '...'}; return $line, $col; } # # Suppress any existing DOCTYPE by commenting it out. sub override_doctype { no strict 'vars'; my $File = shift; my ($dt) = grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; my $pubid = $dt->{PubID}; my $sysid = $dt->{SysID}; my $name = $dt->{Name}; local $dtd = qq('; local $org_dtd = ''; local $HTML = ''; local $seen = FALSE; my $declaration = sub { $seen = TRUE; # No Override if Fallback was requested. if ($File->{Opt}->{FB}->{DOCTYPE}) { $HTML .= $_[0]; # Stash it as is... } else { # Comment it out and insert the new one... $HTML .= "$dtd\n" . ''; $org_dtd = &ent($_[0]); } }; HTML::Parser->new(default_h => [sub {$HTML .= shift}, 'text'], declaration_h => [$declaration, 'text'] )->parse(join "\n", @{$File->{Content}})->eof(); $File->{Content} = [split /\n/, $HTML]; if ($seen) { unless (($File->{Opt}->{FB}->{DOCTYPE}) or ($File->{Opt}->{DOCTYPE} eq $CFG->{Types}->{$File->{DOCTYPE}}->{Display} )) { &add_warning('W13', { W13_org => $org_dtd, W13_new => $File->{Opt}->{DOCTYPE}, }); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } else { unshift @{$File->{Content}}, $dtd; if ($File->{Opt}->{FB}->{DOCTYPE}) { &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } else { &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } return $File; } # # Generate a HTML report of detected errors. sub report_errors ($) { my $File = shift; my $Errors = []; my %Errors_bytype; my $number_of_errors = 0; my $number_of_warnings = 0; my $number_of_info = 0; # Hash to keep track of how many of each error is reported. my %Msgs; # Used to generate a UID for explanations. if (scalar @{$File->{Errors}}) { foreach my $err (@{$File->{Errors}}) { my ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char}); $line = &mark_error($line, $col); my $explanation; if ($err->{num}) { my $num = $err->{num}; # if (exists $Msgs{$num}) { # We've already seen this message... # if ($File->{Opt}->{Verbose}) { # ...so only repeat it in Verbose mode. # $explanation = qq(\n \n); # } # } else { # $Msgs{$num} = 1; $explanation .= "\n $RSRC->{msg}->{$num}->{verbose}\n" if exists $RSRC->{msg}->{$num} && exists $RSRC->{msg}->{$num}->{verbose}; # } my $_msg = $RSRC->{msg}->{nomsg}->{verbose}; $_msg =~ s//$num/g; if ($File->{'Is Upload'}) { $_msg =~ s///g } else { my $escaped_uri = uri_escape($File->{URI}); $_msg =~ s//$escaped_uri/g; } $explanation = " $_msg\n".$explanation; # The send feedback plea. } $err->{src} = $line; $err->{col} = ' ' x $col; $err->{expl} = $explanation; if ($err->{type} eq 'I') { $err->{class} = 'msg_info'; $err->{err_type_err} = 0; $err->{err_type_warn} = 0; $err->{err_type_info} = 1; $number_of_info += 1; } elsif ($err->{type} eq 'E') { $err->{class} = 'msg_err'; $err->{err_type_err} = 1; $err->{err_type_warn} = 0; $err->{err_type_info} = 0; $number_of_errors += 1; } elsif (($err->{type} eq 'W') or ($err->{type} eq 'X') ) { $err->{class} = 'msg_warn'; $err->{err_type_err} = 0; $err->{err_type_warn} = 1; $err->{err_type_info} = 0; $number_of_warnings += 1; } # TODO other classes for "X" etc? FIXME find all types of message. push @{$Errors}, $err; if (($File->{Opt}->{'Group Errors'}) and (($err->{type} eq 'E') or ($err->{type} eq 'W')or ($err->{type} eq 'X'))) { # index by num for errors and warnings only - info usually give context of error or warning if (! exists $Errors_bytype{$err->{num}}) { $Errors_bytype{$err->{num}}->{instances} = []; my $msg_text; if ($err->{num} ne 'xmlwf') { $msg_text = $RSRC->{msg}->{$err->{num}}->{original}; $msg_text =~ s/%1/X/; $msg_text =~ s/%2/Y/; } else { ## FIXME ## we need a catalog of errors from our XML parser $msg_text = "XML Parsing Error"; } $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; $Errors_bytype{$err->{num}}->{msg} = $msg_text; $Errors_bytype{$err->{num}}->{type} = $err->{type}; $Errors_bytype{$err->{num}}->{class} = $err->{class}; $Errors_bytype{$err->{num}}->{err_type_err} = $err->{err_type_err}; $Errors_bytype{$err->{num}}->{err_type_warn} = $err->{err_type_warn}; $Errors_bytype{$err->{num}}->{err_type_info} = $err->{err_type_info}; } push @ { $Errors_bytype{$err->{num}}->{instances} }, $err; } } } if ($File->{Opt}->{'Group Errors'}) { $Errors = []; for my $err_num (keys %Errors_bytype){ push @{$Errors}, $Errors_bytype{$err_num}; } } return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; } # # Chop the source line into 3 pieces; the character at which the error # was detected, and everything to the left and right of that position. # That way we can add markup to the relevant char without breaking &ent(). sub mark_error (\$\$) { my $line = shift; my $col = shift; # # Left side... my $left; { my $offset = 0; # Left side allways starts at 0. my $length; if ($col - 1 < 0) { # If error is at start of line... $length = 0; # ...floor to 0 (no negative offset). } elsif ($col == length $line) { # If error is at EOL... $length = $col - 1; # ...leave last char to indicate position. } else { # Otherwise grab everything up to pos of error. $length = $col; } $left = substr $line, $offset, $length; } # # The character where the error was detected. my $char; { my $offset; my $length = 1; # Length is always 1; the char where error was found. if ($col == length $line) { # If err is at EOL... $offset = $col - 1; # ...then grab last char on line instead. } else { $offset = $col; # Otherwise just grab the char. } $char = substr $line, $offset, $length; $char = &ent($char); } # # The right side up to the end of the line... my $right; { my $offset; my $length; # Offset... if ($col == length $line) { # If at EOL... $offset = 0; # Don't bother as there is nothing left to grab. } else { $offset = $col + 1; # Otherwise get everything from char-after-error. } # Length... if ($col == length $line) { # If at end of line... $length = 0; # ...then don't grab anything. } else { $length = length($line) - ($col - 1); # Otherwise get the rest of the line. } $right = substr $line, $offset, $length; } $char = qq($char); $line = &ent($left) . $char . &ent($right); return $line; } # # Create a HTML representation of the document. sub source { my $File = shift; # Remove any BOM since we're not at BOT anymore... $File->{Content}->[0] = substr $File->{Content}->[0], ($File->{BOM} ? 1 : 0); # remove BOM my @source = map({file_source_line => $_}, @{$File->{Content}}); return \@source; } # # Do an initial parse of the Document Entity to extract FPI. sub preparse_doctype { my $File = shift; # # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). $File->{DOCTYPE} = ''; $File->{Root} = ''; my $dtd = sub { return if $File->{Root}; # TODO: The \s and \w are probably wrong now that the strings are utf8_on ($File->{Root}, $File->{DOCTYPE}) = shift =~ m()si; }; my $start = sub { my $tag = shift; my $attr = shift; my %attr = map {lc($_) => $attr->{$_}} keys %{$attr}; if ($File->{Root}) { return unless $tag eq $File->{Root}; } else { $File->{Root} = $tag; } if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; }; # we use HTML::Parser as pre-parser. May use html5lib or other in the future my $p = HTML::Parser->new(api_version => 3); # if content-type has shown we should pre-parse with XML mode, use that # otherwise (mostly text/html cases) use default mode $p->xml_mode(TRUE) if ($File->{Mode} eq 'XML'); $p->ignore_elements('BODY'); $p->ignore_elements('body'); $p->handler(declaration => $dtd, 'text'); $p->handler(start => $start, 'tag,attr'); $p->parse(join "\n", @{$File->{Content}}); # TODO: These \s here are probably wrong now that the strings are utf8_on $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; $File->{DOCTYPE} =~ s(^\s+){ }g; $File->{DOCTYPE} =~ s(\s+$){ }g; $File->{DOCTYPE} =~ s(\s+) { }g; return $File; } # # Print out the raw error output for debugging. sub show_errors ($) { # @@FIXME This is broken with SGML::Parser::OpenSP my $file_raw_errors = ""; for (@{shift->{DEBUG}->{Errors}}) { $file_raw_errors .= ent $_; } return $file_raw_errors; } # # Preprocess CGI parameters. sub prepCGI { my $File = shift; my $q = shift; my $path_info; { # @@@HACK: # CGI.pm's _name_and_path_from_env has query string related issues; # just trump the query string for the duration of calling methods we # know we don't need it for and which have been affected in the past. # More info: http://www.w3.org/Bugs/Public/show_bug.cgi?id=4365 local $ENV{REQUEST_URI} = URI->new($ENV{REQUEST_URI})->path() if $ENV{REQUEST_URI}; # The URL to this CGI script. $File->{Env}->{'Self URI'} = $q->url(); $path_info = $q->path_info(); } # Avoid CGI.pm's "exists but undef" behaviour. if (scalar $q->param) { foreach my $param ($q->param) { next if $param eq 'uploaded_file'; # 'uploaded_file' contains data. next if $param eq 'fragment'; # Ditto 'fragment'. next if $q->param($param) eq '0'; # Keep false-but-set params. # # Parameters that are given to us without specifying a value get # set to "1" (the "TRUE" constant). This is so we can test for the # boolean value of a parameter instead of first checking whether # the param was given and then testing it's value. Needed because # CGI.pm sets ";param" and ";param=" to a boolean false value # (undef() or a null string, respectively). $q->param($param, TRUE) unless $q->param($param); } } # Futz the URL so "/referer" works. if ($path_info) { if ($path_info eq '/referer' or $path_info eq '/referrer') { if ($q->referer) { $q->param('uri', $q->referer); print redirect &self_url_q($q, $File); exit; } else { print redirect $File->{Env}->{'Self URI'} . '?uri=referer'; exit; } } else { print redirect &self_url_q($q, $File); exit; } } # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # Munge the URL to include commonly omitted prefix. my $u = $q->param('uri'); $q->param('uri', "http://$u") if $u && $u =~ m(^www)i; # Issue a redirect for uri=referer. if ($q->param('uri') and $q->param('uri') eq 'referer') { if ($q->referer) { $q->param('uri', $q->referer); print redirect &self_url_q($q, $File); exit; } else { # Redirected from /check/referer to /check?uri=referer because # the browser didn't send a Referer header, or the request was # for /check?uri=referer but no Referer header was found. $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_referer_error => TRUE); } } # Supersede URL with an uploaded file. if ($q->param('uploaded_file')) { $q->param('uri', 'upload://' . $q->param('uploaded_file')); $File->{'Is Upload'} = TRUE; # Tag it for later use. } # Supersede URL with an uploaded fragment. if ($q->param('fragment')) { $q->param('uri', 'upload://Form Submission'); $File->{'Is Upload'} = TRUE; # Tag it for later use. } # Redirect to a GETable URL if method is POST without a file upload. if (defined $q->request_method and $q->request_method eq 'POST' and not $File->{'Is Upload'}) { my $thispage = &self_url_q($q, $File); print redirect $thispage; exit; } # # Flag an error if we didn't get a file to validate. unless ($q->param('uri')) { $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_uri_error => TRUE); $File->{E}->param(fatal_uri_scheme => 'undefined'); } return $q; } # # Set parse mode. sub set_parse_mode { my $File = shift; my $CFG = shift; my $fpi = $File->{DOCTYPE}; my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'}; my $parseModeFromMimeType = $File->{Mode}; if (($parseModeFromMimeType eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { # the mime type is text/html (ambiguous, hence TBD mode) # and the doctype isn't in the catalogue... we scream &add_warning('W08', {W08_mime => $File->{ContentType}}); return; } $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'XML'; if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromMimeType eq 'TBD')) { # if both doctype and mime type are useless to give us a parse mode # => we use SGML as a default $File->{Mode} = 'SGML'; &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{Version} }); return; } elsif ($parseModeFromDoctype eq 'TBD') { # doctype does not give us anything clear (e.g custom DTD) # but mime type gives clear indication # => we just use what the content type tells us - move along return; } elsif ($parseModeFromMimeType eq 'TBD') { # the mime type is text/html (ambiguous, hence TBD mode) # but by now we're sure that the document type is a good indication # so we use that. $File->{Mode} = $parseModeFromDoctype; return; } elsif ($parseModeFromMimeType ne $parseModeFromDoctype) { # if document-type recommended mode and content-type recommended mode clash # shoot a warning &add_warning('W07', { W07_mime => $File->{ContentType}, W07_ct => $parseModeFromMimeType, W07_dtd => $parseModeFromDoctype, }); return; } } # # Utility sub to tell if mode "is" XML. sub is_xml {shift->{Mode} eq 'XML'}; # # Check charset conflicts and add any warnings necessary. sub charset_conflicts { my $File = shift; # # Handle the case where there was no charset to be found. unless ($File->{Charset}->{Use}) { &add_warning('W17', {}); $File->{Tentative} |= T_WARN; } # # Add a warning if there was charset info conflict (HTTP header, # XML declaration, or element). if (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) { &add_warning('W18', { W18_http => $File->{Charset}->{HTTP}, W18_xml => $File->{Charset}->{XML}, W18_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not ($File->{'Direct Input'})) { &add_warning('W19', { W19_http => $File->{Charset}->{HTTP}, W19_meta => $File->{Charset}->{META}, W19_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) { &add_warning('W20', { W20_http => $File->{Charset}->{XML}, W20_xml => $File->{Charset}->{META}, }); $File->{Tentative} |= T_WARN; } return $File; } # # Transcode to UTF-8 sub transcode { my $File = shift; my $general_charset = $File->{Charset}->{Use}; my $exact_charset = $general_charset; # TODO: This should be done before transcode() if ($general_charset eq 'utf-16') { if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { $exact_charset = $File->{Charset}->{Auto}; } else { $exact_charset = 'utf-16be'; } } my $cs = $exact_charset; if (!$CFG->{Charsets}->{$cs}) { # The encoding is not supported due to policy # and possibly other reasons $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_transcode_error => TRUE); $File->{E}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{E}->param(fatal_transcode_errmsg => "Encoding not supported."); return $File; } # Does the system support decoding this encoding? eval { Encode::decode($cs, ''); }; if ($@) { # This system's Encode installation does not support # the character encoding; might need additional modules $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_transcode_error => TRUE); $File->{E}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{E}->param(fatal_transcode_errmsg => "Encoding not supported."); return $File; } my $output; my $input = $File->{Bytes}; # Try to transcode eval { $output = Encode::decode($cs, $input, Encode::FB_CROAK); }; # Transcoding failed if ($@) { $File->{'Error Flagged'} = TRUE; # @@FIXME might need better text, in particular, this does not tell # where the error occured; it might be possible to emulate that # using a Encode CHECK parameter that modifies the input, then split # the decodable string to give line / column information, or don't # split and report the offset calculated from the result. $File->{E}->param(fatal_byte_error => TRUE); $File->{E}->param(fatal_byte_lines => 0); $File->{E}->param(fatal_byte_charset => $cs); return $File; } # @@FIXME is this what we want? $output =~ s/\015?\012/\n/g; # make sure we deal only with unix newlines # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 $output =~ s/(\r\n|\n|\r)/\n/g; $File->{Content} = [split/\n/, $output]; return $File; } sub find_encodings { my $File = shift; my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); if (defined $bom) { # @@FIXME this BOM entry should not be needed at all! $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); $File->{Charset}->{Auto} = lc $bom; } else { $File->{Charset}->{Auto} = lc($first[0]) if @first; } my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); $File->{Charset}->{XML} = lc $xml if defined $xml; my %metah; foreach my $try (@first) { # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok my $meta = lc HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); $metah{$meta}++ if defined $meta and length $meta; } my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; $File->{Charset}->{META} = lc $meta[0] if @meta; return $File; } # # Abort with a message if an error was flagged at point. sub abort_if_error_flagged { my $File = shift; my $Flags = shift; return unless $File->{'Error Flagged'}; return if $File->{'Error Handled'}; # Previous error, keep going. if ($File->{Opt}->{Output} eq 'html') { &prep_template($File, $E); print $E->output; exit; } else { #@@FIXME: This is borked after templatification. # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF."); # A fatal error has occurred while processing the requested document. Processing # has continued but any later output will be of dubious quality. Limitations of # this output mode prevent the full error message from being returned; please # retry this operation in interactive mode using the web interface to see the # actual error message. # .EOF. #@@FIXME; $File->{'Error Handled'} = TRUE; } } # # conflicting encodings sub charset_not_equal { my $encodingA = shift; my $encodingB = shift; return $encodingA && $encodingB && ($encodingA ne $encodingB); } # # Construct a self-referential URL from a CGI.pm $q object. sub self_url_q { my ($q, $File) = @_; my $thispage = $File->{Env}->{'Self URI'} . '?'; $thispage .= 'uri=' . uri_escape($q->param('uri')) . ';' if $q->param('uri'); $thispage .= 'ss=1;' if $q->param('ss'); $thispage .= 'sp=1;' if $q->param('sp'); $thispage .= 'noatt=1;' if $q->param('noatt'); $thispage .= 'outline=1;' if $q->param('outline'); $thispage .= 'No200=1;' if $q->param('No200'); $thispage .= 'verbose=1;' if $q->param('verbose'); $thispage .= 'group=1;' if $q->param('group'); if ($q->param('doctype') and not $q->param('doctype') =~ /(Inline|detect)/i) { $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; } if ($q->param('charset') and not $q->param('charset') =~ /detect/i) { $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; } $thispage =~ s/[\?;]$//; return $thispage; } # # Return random Tip with it's URL. sub get_tip { my @tipAddrs = keys %{$CFG->{Tips}}; my $tipAddr = $tipAddrs[rand scalar @tipAddrs]; my $tipSlug = $CFG->{Tips}->{$tipAddr}; return [$tipAddr, $tipSlug]; } # # Construct a self-referential URL from a $File object. sub self_url_file { my $File = shift; my $thispage = $File->{Env}->{'Self URI'}; my $escaped_uri = uri_escape($File->{URI}); $thispage .= qq(?uri=$escaped_uri); $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; $thispage .= ';sp=1' if $File->{Opt}->{'Show Parsetree'}; $thispage .= ';noatt=1' if $File->{Opt}->{'No Attributes'}; $thispage .= ';outline=1' if $File->{Opt}->{'Outline'}; # These were not added by report_valid; perhaps they should be? # $thispage .= ';verbose=1' if $File->{Opt}->{'Verbose'}; # $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; # $thispage .= ';No200=1' if $File->{Opt}->{'No200'}; return $thispage; } ##### sub W3C::Validator::SAXHandler::new { my $class = shift; my $parser = shift; my $File = shift; my $self = { _file => $File, _parser => $parser, current_heading_level => 0, am_in_heading => 0 }; bless $self, $class; } sub W3C::Validator::SAXHandler::characters { my ($self, $chars) = @_; if ($self->{am_in_heading} == 1) { my $data = $chars->{Data}; $data =~ s/[\r|\n]/ /g; $self->{_file}->{heading_outline} = $self->{_file}->{heading_outline} . $data; } } sub W3C::Validator::SAXHandler::data { my ($self, $chars) = @_; if ($self->{am_in_heading} == 1) { my $data = $chars->{Data}; $data =~ s/[\r|\n]/ /g; $self->{_file}->{heading_outline} = $self->{_file}->{heading_outline} . $data; } } sub W3C::Validator::SAXHandler::start_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h([1-6])$/) { $self->{_file}->{heading_outline} = $self->{_file}->{heading_outline} . " " x int($1) . "[". $element->{Name}."] "; $self->{am_in_heading} = 1; } } sub W3C::Validator::SAXHandler::end_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h[1-6]$/) { $self->{_file}->{heading_outline} = $self->{_file}->{heading_outline} . "\n"; $self->{am_in_heading} = 0; } } sub W3C::Validator::SAXHandler::error { my $self = shift; my $error = shift; my $mess = $self->{_parser}->split_message($error); my $File = $self->{_file}; # TODO: this does not filter out errors in DTDs. my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $mess->{primary_message}{LineNumber}; $err->{char} = $mess->{primary_message}{ColumnNumber}; $err->{num} = $mess->{primary_message}{Number}; $err->{type} = $mess->{primary_message}{Severity}; $err->{msg} = $mess->{primary_message}{Text}; # ... $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; # Workaround for onsgmls 1.5 sometimes reporting errors beyond EOL. # (How true is that? Test cases please.) if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { $err->{char} = $l; } # No or unknown FPI and a relative SI. if ($err->{msg} =~ m(cannot (open|find))) { $File->{'Error Flagged'} = TRUE; $File->{E}->param(fatal_parse_extid_error => TRUE); $File->{E}->param(fatal_parse_extid_msg => $err->{msg}); } # No DOCTYPE. if ($err->{msg} =~ m(prolog can\'t be omitted)) { my $dtd = ($File->{Mode} eq 'SGML' ? 'HTML 4.01 Transitional' : 'XHTML 1.0 Transitional'); add_warning('W09', {W09_dtd => $dtd}); return; # Don't report this as a normal error. } # TODO: calling exit() here is probably a bad idea abort_if_error_flagged($File, O_DOCTYPE); push @{$File->{Errors}}, $err; if (defined $mess->{aux_message}) { # "duplicate id ... first defined here" style messages push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, char => $mess->{aux_message}{ColumnNumber}, msg => $mess->{aux_message}{Text}, type => 'I', }; } } ##### package W3C::Validator::UserAgent; use LWP::UserAgent 1.90 qw(); # Need 1.90 for protocols_(allowed|forbidden) use Net::hostent qw(gethostbyname); use Net::IP qw(); use Socket qw(inet_ntoa); use base qw(LWP::UserAgent); sub new { my ($proto, $CFG, $File, @rest) = @_; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@rest); $self->{'W3C::Validator::CFG'} = $CFG; $self->{'W3C::Validator::File'} = $File; return $self; } sub redirect_ok { my ($self, $req, $res) = @_; return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); } sub uri_ok { my ($self, $uri) = @_; return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or !$uri->can('host')); my $addr = my $iptype = undef; if (my $host = gethostbyname($uri->host())) { $addr = inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; $File->{E}->param(fatal_ip_error => 1); $File->{E}->param(fatal_ip_hostname => 1) if $addr and $uri->host() ne $addr; $File->{E}->param(fatal_ip_host => ($uri->host() || 'undefined')); return 0; } return 1; } # Local Variables: # mode: perl # indent-tabs-mode: nil # tab-width: 2 # perl-indent-level: 2 # End: