#!/usr/bin/perl -T # # W3C Markup Validation Service # A CGI script to retrieve and validate a markup file # # Copyright 1995-2007 World Wide Web Consortium, (Massachusetts # Institute of Technology, European Research Consortium for Informatics # and Mathematics, Keio University). All Rights Reserved. # # Originally written by Gerald Oskoboiny # for additional contributors, see http://dev.w3.org/cvsweb/validator/ # and http://validator.w3.org/about.html#credits # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # $Id: check,v 1.571 2007-09-25 07:22:03 ot Exp $ # # Disable buffering on STDOUT! $| = 1; # # We need Perl 5.8.0+. use 5.008; ############################################################################### #### Load modules. ############################################################ ############################################################################### # # Pragmas. use strict; use warnings; use utf8; # # Modules. See also the BEGIN block further down below. # # Version numbers given where we absolutely need a minimum version of a given # module (gives nicer error messages). By default, add an empty import list # when loading modules to prevent non-OO or poorly written modules from # polluting our namespace. # use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect); use CGI::Carp qw(carp croak fatalsToBrowser); use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder use Encode qw(); use Encode::Alias qw(); use Encode::HanExtra qw(); # for some chinese character encodings, # e.g gb18030 use Encode::JIS2K qw(); # ditto extra japanese encodings use File::Spec qw(); use HTML::Encoding 0.52 qw(); use HTML::Parser 3.25 qw(); # Need 3.25 for $p->ignore_elements. use HTML::Template 2.6 qw(); use HTTP::Negotiate qw(); use HTTP::Request qw(); use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. use SGML::Parser::OpenSP qw(); use URI qw(); use URI::Escape qw(uri_escape); use XML::LibXML qw(); ############################################################################### #### Constant definitions. #################################################### ############################################################################### # # Define global constants use constant TRUE => 1; use constant FALSE => 0; # # Tentative Validation Severities. use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing use constant O_SOURCE => 1; # 0000 0001 use constant O_CHARSET => 2; # 0000 0010 use constant O_DOCTYPE => 4; # 0000 0100 use constant O_NONE => 8; # 0000 1000 # # Define global variables. use vars qw($DEBUG $CFG $RSRC $VERSION); # # Things inside BEGIN don't happen on every request in persistent environments # (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { # Launder data for -T; -AutoLaunder doesn't catch this one. if (exists $ENV{W3C_VALIDATOR_HOME}) { $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; $ENV{W3C_VALIDATOR_HOME} = $1; } # # Read Config Files. eval { my %config_opts = ( -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), -MergeDuplicateOptions => TRUE, -MergeDuplicateBlocks => TRUE, -SplitPolicy => 'equalsign', -UseApacheInclude => TRUE, -IncludeRelative => TRUE, -InterPolateVars => TRUE, -AutoLaunder => TRUE, -AutoTrue => TRUE, -DefaultConfig => { Protocols => {Allow => 'http,https'}, Paths => { Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), }, }, ); my %cfg = Config::General->new(%config_opts)->getall(); $CFG = \%cfg; }; if ($@) { die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. } # # Check a filesystem path for existance and "readability". sub pathcheck (@) { my %paths = map { $_ => [-d $_, -r _] } @_; my @_d = grep {not $paths{$_}->[0]} keys %paths; my @_r = grep {not $paths{$_}->[1]} keys %paths; return TRUE if (scalar(@_d) + scalar(@_r) == 0); die <<".EOF." if scalar @_d; Does not exist or is not a directory: @_d .EOF. die <<".EOF." if scalar @_r; Directory not readable (permission denied): @_r .EOF. } # # Check paths in config... # @@FIXME: This does not do a very good job error-message-wise if a path is # @@FIXME: missing...; { my @dirs = (); push @dirs, $CFG->{Paths}->{Base}; push @dirs, $CFG->{Paths}->{Templates}; push @dirs, $CFG->{Paths}->{SGML}->{Library}; &pathcheck(@dirs); } # # Split allowed protocols into a list. if (my $allowed = delete($CFG->{Protocols}->{Allow})) { $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; } { # Make types config indexed by FPI. my $_types = {}; map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} } keys %{$CFG->{Types}}; $CFG->{Types} = $_types; } # # Change strings to internal constants in MIME type mapping. for (keys %{$CFG->{MIME}}) { $CFG->{MIME}->{$_} = 'TBD' unless $CFG->{MIME}->{$_} eq 'SGML' or $CFG->{MIME}->{$_} eq 'XML'; } # # Set debug flag. if ($CFG->{'Allow Debug'}) { $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; } else { $DEBUG = FALSE; } # # Strings $VERSION = q$Revision: 1.571 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # # Use passive FTP by default. $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); # Read friendly error message file my %rsrc = Config::General->new( -MergeDuplicateBlocks => 1, -ConfigFile => File::Spec->catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), )->getall(); # 'en_US' should be replaced by $lang for lang-neg # Config::General workarounds for issues: # http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0022.html # http://lists.w3.org/Archives/Public/public-qa-dev/2006Feb/0025.html # https://rt.cpan.org/Public/Bug/Display.html?id=17852 $rsrc{msg}{0} ||= delete($rsrc{'msg 0'}) || # < 2.31 { original => delete($rsrc{msg}{original}), # 2.31 verbose => delete($rsrc{msg}{verbose}), }; $RSRC = \%rsrc; } # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; #@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. #use Data::Dumper qw(Dumper); #print Dumper($CFG); #exit; #@@DEBUG; ############################################################################### #### Process CGI variables and initialize. #################################### ############################################################################### # # Create a new CGI object. my $q = new CGI; # # The data structure that will hold all session data. # @@FIXME This can't be my() as $File will sooner or # later be undef and add_warning will cause the script # to die. our() seems to work but has other problems. # @@FIXME Apparently, this must be set to {} also, # otherwise the script might pick up an old object # after abort_if_error_flagged under mod_perl. our $File = {}; ################################# # Initialize the datastructure. # ################################# # # Charset data (casing policy: lowercase early). $File->{Charset}->{Use} = ''; # The charset used for validation. $File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F) $File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter. $File->{Charset}->{META} = ''; # From HTML's . $File->{Charset}->{XML} = ''; # From the XML Declaration. $File->{Charset}->{Override} = ''; # From CGI/user override. # # Misc simple types. $File->{Mode} = 'SGML'; # Default parse mode is SGML. # By default, perform validation (we may perform only xml-wf in some cases) $File->{XMLWF_ONLY} = FALSE; # # Listrefs. $File->{Warnings} = []; # Warnings... $File->{Namespaces} = []; # Other (non-root) Namespaces. ############################################################################### #### Generate Template for Result. ############################################ ############################################################################### # in case there is no language set up on the server, we'll use english as default: if (!defined $CFG->{Languages}) { $CFG->{Languages} = "en"; } # first we determine the chosen language based on # 1) lang argument given as parameter (if this language is available) # 2) HTTP language negotiation between variants available and user-agent choices # 3) English by default my $lang = $q->param('lang') ? $q->param('lang') : ''; my @localizations; my $lang_ok = FALSE; foreach my $lang_available (split(" ", $CFG->{Languages})) { if ($lang eq $lang_available) { $lang_ok = TRUE; next; } } if (($lang eq '') or (!$lang_ok)) { # use HTTP-based negotiation $lang = ''; foreach my $lang_available (split(" ", $CFG->{Languages})) { push @localizations, [$lang_available, 1.000, 'text/html', undef, 'utf-8', $lang_available, undef]; } $lang = HTTP::Negotiate::choose(\@localizations); } # HTTP::Negotiate::choose may return undef # e.g if sent Accept-Language: en;q=0 $lang = 'en_US' if (!defined($lang)); if ($lang eq "en") { $lang = 'en_US'; # legacy } my %template_defaults = ( die_on_bad_params => FALSE, cache => TRUE, ); $File->{Templates}->{Result} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'result.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); } ); $File->{Templates}->{Error} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'fatal-error.tmpl'), filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{AuthzReq} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'http_401_authrequired.tmpl'), filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); # templates for alternate output formats $File->{Templates}->{XML} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'xml_output.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{SOAP} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_output.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{UCN} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'ucn_output.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{SOAPFault} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_fault.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{SOAPDisabled} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'soap_disabled.tmpl'), loop_context_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{EARLXML} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'earl_xml.tmpl'), loop_context_vars => TRUE, global_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{EARLN3} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'earl_n3.tmpl'), loop_context_vars => TRUE, global_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{PrefillHTML} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'prefill_html401.tmpl'), loop_context_vars => TRUE, global_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{PrefillXHTML} = HTML::Template->new( %template_defaults, filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'prefill_xhtml10.tmpl'), loop_context_vars => TRUE, global_vars => TRUE, filter => sub { my $ref = shift;${$ref} = Encode::decode_utf8(${$ref});} ); $File->{Templates}->{Result}->param(cfg_home_page => $CFG->{'Home Page'}); $File->{Templates}->{SOAP}->param(cfg_home_page => $CFG->{'Home Page'}); undef $lang; undef %template_defaults; ######################################### # Populate $File->{Opt} -- CGI Options. # ######################################### # # Preprocess the CGI parameters. $q = &prepCGI($File, $q); # # Set session switches. $File->{Opt}->{'Outline'} = $q->param('outline') ? TRUE : FALSE; $File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; $File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; $File->{Opt}->{'Show Parsetree'} = $q->param('sp') ? TRUE : FALSE; $File->{Opt}->{'No Attributes'} = $q->param('noatt') ? TRUE : FALSE; $File->{Opt}->{'Show ESIS'} = $q->param('esis') ? TRUE : FALSE; $File->{Opt}->{'Show Errors'} = $q->param('errors') ? TRUE : FALSE; $File->{Opt}->{'Verbose'} = $q->param('verbose') ? TRUE : FALSE; $File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; $File->{Opt}->{'Debug'} = $q->param('debug') ? TRUE : FALSE; $File->{Opt}->{'No200'} = $q->param('No200') ? TRUE : FALSE; $File->{Opt}->{'Charset'} = $q->param('charset') ? lc $q->param('charset'): ''; $File->{Opt}->{'DOCTYPE'} = $q->param('doctype') ? $q->param('doctype') : ''; $File->{Opt}->{'Output'} = $q->param('output') ? $q->param('output') : 'html'; $File->{Opt}->{'Max Errors'} = $q->param('me') ? $q->param('me') : ''; $File->{Opt}->{'Prefill'} = $q->param('prefill') ? TRUE : FALSE; $File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') ? $q->param('prefill_doctype') : 'html401'; # # "Fallback" info for Character Encoding (fbc), Content-Type (fbt), # and DOCTYPE (fbd). If TRUE, the Override values are treated as # Fallbacks instead of Overrides. $File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE; $File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE; $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; # # If ";debug" was given, let it overrule the value from the config file, # regardless of whether it's "0" or "1" (on or off), but only if config # allows the debugging options. if ($CFG->{'Allow Debug'}) { $DEBUG = $q->param('debug') if defined $q->param('debug'); $File->{Opt}->{Verbose} = TRUE if $DEBUG; } else { $DEBUG = FALSE; # The default. } &abort_if_error_flagged($File, O_NONE); # # Get the file and metadata. if ($q->param('uploaded_file')) { $File = &handle_file($q, $File); } elsif ($q->param('fragment')) { $File = &handle_frag($q, $File); } elsif ($q->param('uri')) { $File = &handle_uri($q, $File); } # # Abort if an error was flagged during initialization. &abort_if_error_flagged($File, 0); # # Get rid of the CGI object. undef $q; # # We don't need STDIN any more, so get rid of it to avoid getting clobbered # by Apache::Registry's idiotic interference under mod_perl. untie *STDIN; ############################################################################### #### Output validation results. ############################################### ############################################################################### $File = find_encodings($File); # # Decide on a charset to use (first part) # if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; } elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) { # Act as if $http_charset was 'us-ascii'. (MIME rules) $File->{Charset}->{Use} = 'us-ascii'; &add_warning('W01', { W01_upload => $File->{'Is Upload'}, W01_agent => $File->{Server}, W01_ct => $File->{ContentType}, }); } elsif ($File->{Charset}->{XML}) { $File->{Charset}->{Use} = $File->{Charset}->{XML}; } elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) { $File->{Charset}->{Use} = 'utf-16'; } elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) { $File->{Charset}->{Use} = "utf-8"; } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) } unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } # # Handle any Fallback or Override for the charset. if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { # charset=foo was given to the CGI and it wasn't "autodetect" or empty. # # Extract the user-requested charset from CGI param. my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); $File->{Charset}->{Override} = lc($override); if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode unless ($File->{Charset}->{Use}) { &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } else { # charset "hard override" mode if (! $File->{Charset}->{Use}) { # overriding "nothing" &add_warning('W04', {W04_charset => $File->{Charset}->{Override}, W04_override => TRUE}); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } else { #actually overriding something # Warn about Override unless it's the same as the real charset... unless ($File->{Charset}->{Override} eq $File->{Charset}->{Use}) { &add_warning('W03', { W03_use => $File->{Charset}->{Use}, W03_opt => $File->{Charset}->{Override}, }); $File->{Tentative} |= T_ERROR; $File->{Charset}->{Use} = $File->{Charset}->{Override}; } } } } unless ($File->{Charset}->{Use}) { # No charset given... &add_warning('W04', {W04_charset => 'UTF-8'}); $File->{Tentative} |= T_ERROR; # Can never be valid. $File->{Charset}->{Use} = 'utf-8'; } # # Abort if an error was flagged while finding the encoding. &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE); # # Encode alias definitions. This might not be the best # place for them, feel free to move them elsewhere. # implicit bidi, but character encoding is the same Encode::Alias::define_alias('iso-8859-6-i', 'iso-8859-6'); # implicit bidi, but character encoding is the same Encode::Alias::define_alias('iso-8859-8-i', 'iso-8859-8'); # 0xA0 is U+00A0 in ISO-8859-11 but undefined in tis-620 # other than that the character encodings are equivalent Encode::Alias::define_alias('tis-620', 'iso-8859-11'); # Encode::Byte does not know 'macintosh' but MacRoman Encode::Alias::define_alias('macintosh', 'MacRoman'); # x-mac-roman is the non-standard version of 'macintosh' Encode::Alias::define_alias('x-mac-roman', 'MacRoman'); # Encode only knows the long hand version of 'ksc_5601' Encode::Alias::define_alias('ksc_5601', 'KS_C_5601-1987'); # gb18030 requires Encode::HanExtra but no additional alias # # Always transcode, even if the content claims to be UTF-8 $File = transcode($File); &abort_if_error_flagged($File, O_CHARSET); # # Add a warning if doc is UTF-8 and contains a BOM. if ($File->{Charset}->{Use} eq 'utf-8' && $File->{Content}->[0] =~ m(^\x{FEFF})) { &add_warning('W21', {}); } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element, or XML declaration # 4. if no doctype and XML mode, check for XML well-formedness # 5. otherwise , punt. # # # Override DOCTYPE if user asked for it. if ($File->{Opt}->{DOCTYPE} and not $File->{Opt}->{DOCTYPE} =~ /(Inline|detect)/i) { $File = &override_doctype($File); } # # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); # # Determine the parse mode (XML or SGML). ##set_parse_mode($File, $CFG) if $File->{DOCTYPE}; set_parse_mode($File, $CFG); # # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); # before we start the parsing, clean slate $File->{'Is Valid'} = TRUE; $File->{Errors} = []; # preparse with XML parser if necessary # we should really be using a SAX ErrorHandler, but I can't find # a way to make it work with XML::LibXML::SAX::Parser... ** FIXME ** # ditto, we should try using W3C::Validator::SAXHandler, # but it's badly linked to opensp at the moment if (&is_xml($File)) { my $xmlparser = XML::LibXML->new(); $xmlparser->line_numbers(1); $xmlparser->validation(0); $xmlparser->load_ext_dtd(0); # [NOT] loading the XML catalog for entities resolution as it seems to cause a lot of unnecessary DTD/entities fetching (requires >= 1.53 if enabled) #$xmlparser->load_catalog( File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc') ); my $xml_string = join"\n",@{$File->{Content}}; # the XML parser will check the value of encoding attribute in XML declaration # so we have to amend it to reflect transcoding. see Bug 4867 $xml_string =~ s/(<\?xml.*) (encoding[\x20|\x09|\x0D|\x0A]*=[\x20|\x09|\x0D|\x0A]*(?:"[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+')) (.*\?>)/$1encoding="utf-8"$3/sx; eval { $xmlparser->parse_string($xml_string); }; $xml_string = undef; my $xml_parse_errors_line = undef; my @xmlwf_error_list; if ($@) { my $xmlwf_errors = $@; my $xmlwf_error_line = undef; my $xmlwf_error_col = undef; my $xmlwf_error_msg = undef; my $got_error_message = 0; my $got_quoted_line = 0; my $num_xmlwf_error = 0; foreach my $msg_line (split "\n", $xmlwf_errors){ $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; # first we get the actual error message if (($got_error_message eq 0) and ($msg_line =~ /^(:\d+:)( parser error : .*)/ )) { $xmlwf_error_line = $1; $xmlwf_error_msg = $2; $xmlwf_error_line =~ s/:(\d+):/$1/; $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; $got_error_message = 1; } # then we skip the second line, which shows the context (we don't use that) elsif (($got_error_message eq 1) and ($got_quoted_line eq 0)) { $got_quoted_line = 1; } # we now take the third line, with the pointer to the error's column elsif (($msg_line =~ /(\s+)\^/) and ($got_error_message eq 1) and ($got_quoted_line eq 1)) { $xmlwf_error_col = length($1); } # cleanup for a number of bugs for the column number if (defined($xmlwf_error_col)) { if ((my $l = length($File->{Content}->[$xmlwf_error_line-1])) < $xmlwf_error_col) { # http://bugzilla.gnome.org/show_bug.cgi?id=434196 #warn("Warning: reported error column larger than line length " . # "($xmlwf_error_col > $l) in $File->{URI} line " . # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); $xmlwf_error_col = $l; } elsif ($xmlwf_error_col == 79) { # working around an apparent odd limitation of libxml # which only gives context for lines up to 80 chars # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 # http://bugzilla.gnome.org/show_bug.cgi?id=424017 $xmlwf_error_col = "> 80"; # non-int line number will trigger the proper behavior in report_error } } # when we have all the info (one full error message), proceed and move on to the next error if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){ # Reinitializing for the next batch of 3 lines $got_error_message = 0; $got_quoted_line = 0; # formatting the error message for output my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $xmlwf_error_line; $err->{char} = $xmlwf_error_col; $err->{num} = 'xmlwf'; $err->{type} = "E"; $err->{msg} = $xmlwf_error_msg; # The validator will sometimes fail to dereference entities files # we're filtering the bogus resulting error if ($err->{msg} =~ /Entity '\w+' not defined/) { $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; next; } push (@xmlwf_error_list, $err); $xmlwf_error_line = undef; $xmlwf_error_col = undef; $xmlwf_error_msg = undef; $num_xmlwf_error++; } } foreach my $errmsg (@xmlwf_error_list){ $File->{'Is Valid'} = FALSE; push @{$File->{WF_Errors}}, $errmsg; } } } # # Abandon all hope ye who enter here... $File = &parse($File); sub parse (\$) { my $File = shift; # TODO switch parser on the fly my $opensp = SGML::Parser::OpenSP->new(); my $parser_name = "SGML::Parser::OpenSP"; # # By default, use SGML catalog file and SGML Declaration. my $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); # default parsing options my @spopt = qw(valid non-sgml-char-ref no-duplicate); # # Switch to XML semantics if file is XML. if (&is_xml($File)) { $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); push(@spopt, 'xml'); # workaround for a bug in s:p:o 0.99 # see http://www.w3.org/Bugs/Public/show_bug.cgi?id=798#c5 push(@spopt, 'xml'); # FIXME when fixed s:p:o gets released } else { # add warnings for shorttags push(@spopt, 'min-tag'); # twice, ditto above re: s:p:o bug in 0.99 push(@spopt, 'min-tag'); } # # Parser configuration $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); $opensp->catalogs($catalog); $opensp->show_error_numbers(1); $opensp->warnings(@spopt); # # Restricted file reading is disabled on Win32 for the time # beeing since neither SGML::Parser::OpenSP nor check auto- # magically set search_dirs to include the temp directory # so restricted file reading would defunct the Validator. $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; # # Set debug info for HTML report. $File->{Templates}->{Result}->param(opt_debug => $DEBUG); $File->{Templates}->{Result}->param(debug => [ map({name => $_, value => $ENV{$_}}, qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), { name => 'Content-Encoding', value => $File->{ContentEnc} }, { name => 'Content-Language', value => $File->{ContentLang} }, { name => 'Content-Location', value => $File->{ContentLoc} }, { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, { name => 'Parse Mode', value => $File->{Mode} }, { name => 'Parse Mode Factor', value => $File->{ModeChoice} }, { name => 'Parser', value => $parser_name }, { name => 'Parser Options', value => join " ", @spopt }, ], ); $File->{Templates}->{SOAP}->param(opt_debug => $DEBUG); $File->{Templates}->{SOAP}->param(debug => [ map({name => $_, value => $ENV{$_}}, qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), { name => 'Content-Encoding', value => $File->{ContentEnc} }, { name => 'Content-Language', value => $File->{ContentLang} }, { name => 'Content-Location', value => $File->{ContentLoc} }, { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, { name => 'Parse Mode', value => $File->{Mode} }, { name => 'Parse Mode Factor', value => $File->{ModeChoice} }, { name => 'Parser', value => $parser_name }, { name => 'Parser Options', value => join " ", @spopt }, ], ); my $h = W3C::Validator::SAXHandler->new($opensp, $File); $opensp->handler($h); $opensp->parse_string(join"\n",@{$File->{Content}}); # Make sure there are no circular references, otherwise the script # would leak memory until mod_perl unloads it which could take some # time. @@FIXME It's probably overly careful though. $opensp->handler(undef); undef $h->{_parser}; undef $h->{_file}; undef $h; undef $opensp; # # Set Version to be the FPI initially. $File->{Version} = $File->{DOCTYPE}; return $File; } # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { $File->{Version} = 'XML'; } else { $File->{Version} = $File->{DOCTYPE} unless $File->{Version}; } # # Get the pretty text version of the FPI if a mapping exists. if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { $File->{Version} = $prettyver; } # # check the received mime type against Allowed mime types if ($File->{ContentType}){ my @allowedMediaType = split(/\s+/, $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); my $usedCTisAllowed; if (scalar @allowedMediaType) { $usedCTisAllowed = FALSE; foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); } } else { # wedon't know what media type is recommended, so better shut up $usedCTisAllowed = TRUE; } if(! $usedCTisAllowed ){ &add_warning('W23', { W23_type => $File->{ContentType}, W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, w23_doctype => $File->{Version} }); } } # # Warn about unknown, incorrect, or missing Namespaces. if ($File->{Namespace}) { my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; if (&is_xml($File)) { if ($ns eq $File->{Namespace}) { &add_warning('W10', { W10_ns => $File->{Namespace}, W10_type => $File->{Type}, }); } } else { &add_warning('W11', {W11_ns => $File->{Namespace}}); } } else { if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { &add_warning('W12', {}); } } ## if invalid content, AND if requested, pass through tidy if ((! $File->{'Is Valid'}) and ($File->{Opt}->{'Show Tidy'}) ) { eval { local $SIG{__DIE__}; require HTML::Tidy; my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); $File->{'Tidy'} = Encode::decode('utf-8', $tidy->clean(join"\n",@{$File->{Content}})); $File->{'Tidy_OK'} = TRUE; }; if ($@) { $File->{'Tidy_OK'} = FALSE; } } else { # if document is valid, we don't really need tidy, do we? $File->{'Tidy_OK'} = FALSE; } if (!$File->{'Tidy_OK'}) { # if tidy not available, disable $File->{Opt}->{'Show Tidy'} = FALSE; } my $template; if ($File->{Opt}->{Output} eq 'xml') { $template = $File->{Templates}->{XML}; } elsif ($File->{Opt}->{Output} eq 'earl') { $template = $File->{Templates}->{EARLXML}; } elsif ($File->{Opt}->{Output} eq 'n3') { $template = $File->{Templates}->{EARLN3}; } elsif ($File->{Opt}->{Output} eq 'ucn') { $template = $File->{Templates}->{UCN}; } elsif ($File->{Opt}->{Output} eq 'soap12') { if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation... print CGI::header(-status => 503, -content_language => "en", -type => "text/html", -charset => "utf-8" ); $template = $File->{Templates}->{SOAPDisabled}; } elsif ($File->{'Error Flagged'}) { # should send SOAP fault message $template = $File->{Templates}->{SOAPFault}; # we fill the soap fault template #with the variables that had been passed to the HTML fatal error template foreach my $fault_param ($File->{Templates}->{Error}->param()) { $template->param($fault_param => $File->{Templates}->{Error}->param($fault_param)); } } else { $template = $File->{Templates}->{SOAP}; } } else { $template = $File->{Templates}->{Result}; } &prep_template($File, $template); &fin_template($File, $template); $template->param(file_warnings => $File->{Warnings}); $template->param(tidy_output => $File->{'Tidy'}); $template->param(file_source => &source($File)) if ($template->param('opt_show_source') or ($File->{'Is Upload'}) or ($File->{'Direct Input'})); #$template->param('opt_show_esis' => TRUE) # if $File->{Opt}->{'Show ESIS'}; #$template->param('opt_show_raw_errors' => TRUE) # if $File->{Opt}->{'Show Errors'}; #$template->param('file_raw_errors' => &show_errors($File)) # if $template->param('opt_show_raw_errors'); # $T->param(file_outline => &outline($File)) if $T->param('opt_show_outline'); # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8', $template->output); # # Get rid of $File object and exit. undef $File; exit; ############################################################################# # Subroutine definitions ############################################################################# # # Generate HTML report. sub prep_template ($$) { my $File = shift; my $T = shift; # # XML mode... $T->param(is_xml => &is_xml($File)); # # Upload? $T->param(is_upload => $File->{'Is Upload'}); # # Direct Input? $T->param(is_direct_input => $File->{'Direct Input'}); # # The URI... $T->param(file_uri => $File->{URI}); $T->param(file_uri_param => uri_escape($File->{URI})); # # Set URL for page title. $T->param(page_title_url => $File->{URI}); # # Metadata... $T->param(file_modified => $File->{Modified}); $T->param(file_server => $File->{Server}); $T->param(file_size => $File->{Size}); $T->param(file_contenttype => $File->{ContentType}); $T->param(file_charset => $File->{Charset}->{Use}); $T->param(file_doctype => $File->{DOCTYPE}); # # Output options... $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); $T->param(opt_show_outline => $File->{Opt}->{'Outline'}); $T->param(opt_show_parsetree => $File->{Opt}->{'Show Parsetree'}); $T->param(opt_show_noatt => $File->{Opt}->{'No Attributes'}); $T->param(opt_verbose => $File->{Opt}->{'Verbose'}); $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); $T->param(opt_no200 => $File->{Opt}->{'No200'}); # # Tip of the Day... my $tip = &get_tip(); $T->param(tip_uri => $tip->[0]); $T->param(tip_slug => $tip->[1]); # Root Element $T->param(root_element => $File->{Root}); # Namespaces... $T->param(file_namespace => $File->{Namespace}); my %seen_ns = (); my @bulk_ns = @{$File->{Namespaces}}; $File->{Namespaces} = []; # reinitialize the list of non-root namespaces # ... and then get a uniq version of it foreach my $single_namespace (@bulk_ns) { push(@{$File->{Namespaces}}, $single_namespace) unless (($single_namespace eq $File->{Namespace}) or $seen_ns{$single_namespace}++); } my @nss = map({uri => $_}, @{$File->{Namespaces}}); $T->param(file_namespaces => \@nss) if @nss; if ($File->{Opt}->{DOCTYPE}) { my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; $T->param($over_doctype_param => TRUE); } if ($File->{Opt}->{Charset}) { my $over_charset_param = "override charset $File->{Opt}->{Charset}"; $T->param($over_charset_param => TRUE); } if ($File->{'Error Flagged'}) { $T->param(fatal_error => TRUE); } } sub fin_template ($$) { my $File = shift; my $T = shift; if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) { # @@TODO@@ we should try falling back on other version # info, such as the ones stored in Version_ESIS $T->param(file_version => '(no Doctype found)'); } else { $T->param(file_version => $File->{Version}); } my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File); if ($num_errors+$num_warnings > 0) { $T->param(has_errors => 1); } $T->param(valid_errors_num => $num_errors); $num_warnings += scalar @{$File->{Warnings}}; $T->param(valid_warnings_num => $num_warnings); my $number_of_errors = ""; # textual form of $num_errors my $number_of_warnings = ""; # textual form of $num_errors # The following is a bit hack-ish, but will enable us to have some logic # for a human-readable display of the number, with cases for 0, 1, 2 and above # (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above) if ($num_errors > 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => FALSE); if ($num_errors eq 2) { $T->param(number_of_errors_is_2 => TRUE); } else { $T->param(number_of_errors_is_2 => FALSE ); } $T->param(number_of_errors_is_plural => TRUE ); } elsif ($num_errors eq 1) { $T->param(number_of_errors_is_0 => FALSE ); $T->param(number_of_errors_is_1 => TRUE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } else { # 0 $T->param(number_of_errors_is_0 => TRUE ); $T->param(number_of_errors_is_1 => FALSE ); $T->param(number_of_errors_is_2 => FALSE ); $T->param(number_of_errors_is_plural => FALSE ); } if ($num_warnings > 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => FALSE); if ($num_warnings eq 2) { $T->param(number_of_warnings_is_2 => TRUE); } else { $T->param(number_of_warnings_is_2 => FALSE); } $T->param(number_of_warnings_is_plural => TRUE ); } elsif ($num_warnings eq 1) { $T->param(number_of_warnings_is_0 => FALSE ); $T->param(number_of_warnings_is_1 => TRUE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } else { # 0 $T->param(number_of_warnings_is_0 => TRUE ); $T->param(number_of_warnings_is_1 => FALSE ); $T->param(number_of_warnings_is_2 => FALSE ); $T->param(number_of_warnings_is_plural => FALSE ); } $T->param(file_errors => $reported_errors); if ($File->{'Is Valid'}) { $T->param(VALID => TRUE); $T->param(valid_status => 'Valid'); &report_valid($File, $T); } else { $T->param(VALID => FALSE); $T->param(valid_status => 'Invalid'); } } # # Output "This page is Valid" report. sub report_valid { my $File = shift; my $T = shift; unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; $T->param(have_badge => TRUE); $T->param(badge_uri => $cfg->{Badge}->{URI}); if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local URI'}) { $T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'}); $T->param(have_local_badge => TRUE); } if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'ALT URI'}) { $T->param(badge_alt_uri => $cfg->{Badge}->{'ALT URI'}); if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}->{'Local ALT URI'}) { $T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}); } $T->param(have_alt_badge => TRUE); } $T->param(badge_alt => $cfg->{Badge}->{Alt}); $T->param(badge_h => $cfg->{Badge}->{Height}); $T->param(badge_w => $cfg->{Badge}->{Width}); $T->param(badge_tagc => ($cfg->{'Parse Mode'} eq 'XML' ? ' /' : '')); } } elsif (defined $File->{Tentative}) { $T->param(is_tentative => TRUE); } if ($File->{Opt}->{'Outline'}) { $T->param(file_outline => $File->{heading_outline}); } if ($File->{XMLWF_ONLY}){ $T->param(xmlwf_only => TRUE); } my $thispage = self_url_file($File); $T->param(file_thispage => $thispage); } # # Add a waring message to the output. sub add_warning ($$) { my $WID = shift; my $params = shift; push @{$File->{Warnings}}, $WID; $File->{Templates}->{Result}->param($WID => TRUE, %{$params}); $File->{Templates}->{Result}->param(have_warnings => TRUE); $File->{Templates}->{Error}->param($WID => TRUE, %{$params}); $File->{Templates}->{Error}->param(have_warnings => TRUE); $File->{Templates}->{SOAP}->param($WID => TRUE, %{$params}); $File->{Templates}->{SOAP}->param(have_warnings => TRUE); } # # Proxy authentication requests. # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). sub authenticate { my $File = shift; my $resource = shift; my $authHeader = shift || {}; my $realm = $resource; $realm =~ s([^\w\d.-]*){}g; for my $scheme (keys(%$authHeader)) { my $origrealm = $authHeader->{$scheme}->{realm}; if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) { delete($authHeader->{$scheme}); next; } $authHeader->{$scheme}->{realm} = "$realm-$origrealm"; } my $headers = HTTP::Headers->new(Connection => 'close'); $headers->www_authenticate(%$authHeader); $headers = $headers->as_string(); chomp($headers); $File->{Templates}->{AuthzReq}->param(http_401_headers => $headers); $File->{Templates}->{AuthzReq}->param(http_401_url => $resource); print $File->{Templates}->{AuthzReq}->output; exit; # Further interaction will be a new HTTP request. } # # Fetch an URL and return the content and selected meta-info. sub handle_uri { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical(); $uri->fragment(undef); my $ua = new W3C::Validator::UserAgent ($CFG, $File); $ua->env_proxy(); $ua->agent("W3C_Validator/$VERSION"); $ua->parse_head(0); # Don't parse the http-equiv stuff. $ua->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); unless ($ua->is_protocol_supported($uri)) { $File->{'Error Flagged'} = TRUE; if (($uri->canonical() eq "1") ) #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI() { $File->{Templates}->{Error}->param(fatal_no_content => TRUE); } else { $File->{Templates}->{Error}->param(fatal_uri_error => TRUE); $File->{Templates}->{Error}->param(fatal_uri_scheme => $uri->scheme()); } return $File; } return $File unless $ua->uri_ok($uri); my $req = new HTTP::Request(GET => $uri); # telling caches in the middle we want a fresh copy (Bug 4998) $req->header(Cache_control=> "max-age=0"); # If we got a Authorization header, the client is back at it after being # prompted for a password so we insert the header as is in the request. if($ENV{HTTP_AUTHORIZATION}){ $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } my $res = $ua->request($req); return $File if $File->{'Error Flagged'}; # Redirect IP rejected? unless ($res->code == 200 or $File->{Opt}->{'No200'}) { if ($res->code == 401) { my %auth = $res->www_authenticate(); # HTTP::Headers::Auth &authenticate($File, $res->request->uri, \%auth); } else { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_http_error => TRUE); $File->{Templates}->{Error}->param(fatal_http_uri => $uri->as_string); $File->{Templates}->{Error}->param(fatal_http_code => $res->code); $File->{Templates}->{Error}->param(fatal_http_msg => $res->message); $File->{Templates}->{Error}->param(fatal_http_dns => TRUE) if $res->code == 500; } return $File; } # # Enforce Max Recursion level. &check_recursion($File, $res); my ($mode, $ct, $charset) = &parse_content_type( $File, $res->header('Content-Type'), scalar($res->request->uri), ); my $lastmod = undef; if ( $res->last_modified ) { $lastmod = scalar(gmtime($res->last_modified)); } my $content = $res->can('decoded_content') ? $res->decoded_content(charset => 'none') : $res->content; $File->{Bytes} = $content; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{ContentEnc} = $res->content_encoding; $File->{ContentLang} = $res->content_language; $File->{ContentLoc} = $res->header('Content-Location'); $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $lastmod; $File->{Server} = scalar $res->server; # TODO: Content-Length is not always set, so either this should # be renamed to 'Content-Length' or it should consider more than # the Content-Length header. $File->{Size} = scalar $res->content_length; $File->{URI} = scalar $res->request->uri->canonical; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_file { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $f = $q->param('uploaded_file'); my $h = $q->uploadInfo($f); my $file; local $/ = undef; # set line delimiter so that <> reads rest of file $file = <$f>; my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); $File->{Bytes} = $file; $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $q->http('Last-Modified'); $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) $File->{Size} = $q->http('Content-Length'); $File->{URI} = "$f"; $File->{'Is Upload'} = TRUE; $File->{'Direct Input'} = FALSE; return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_frag { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. $File->{Bytes} = $q->param('fragment'); $File->{Mode} = 'TBD'; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; $File->{ContentType} = ''; # @@TODO? $File->{URI} = 'upload://Form Submission'; $File->{'Is Upload'} = FALSE; $File->{'Direct Input'} = TRUE; $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars if ($File->{Opt}->{'Prefill'}) { # we surround the HTML fragment with some basic document structure my $prefill_Template = undef; if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { $prefill_Template = $File->{Templates}->{PrefillHTML}; } else { $prefill_Template = $File->{Templates}->{PrefillXHTML}; } $prefill_Template->param(fragment => $File->{Bytes}); $File->{Bytes} = $prefill_Template->output(); # let's force the view source so that the user knows what we've put around their code $File->{Opt}->{'Show Source'} = TRUE; } return $File; } # # Parse a Content-Type and parameters. Return document type and charset. sub parse_content_type { my $File = shift; my $Content_Type = shift; my $url = shift; my $charset = ''; my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; my $mode = $CFG->{MIME}->{$ct} || $ct; $charset = HTML::Encoding::encoding_from_content_type($Content_Type); if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here. if ($ct eq 'text/css' and defined $url) { print redirect 'http://jigsaw.w3.org/css-validator/validator?uri=' . uri_escape $url; exit; } elsif ($ct eq 'application/atom+xml' and defined $url) { print redirect 'http://validator.w3.org/feed/check.cgi?url=' . uri_escape $url; exit; } elsif ($ct =~ m(^application/.+\+xml$)) { # unknown media types which should be XML - we give these a try $mode = "XML"; } else { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_mime_error => TRUE); $File->{Templates}->{Error}->param(fatal_mime_ct => $ct); } } return $mode, $ct, $charset; } # # Check recursion level and enforce Max Recursion limit. sub check_recursion ($$) { my $File = shift; my $res = shift; # Not looking at our own output. return unless defined $res->header('X-W3C-Validator-Recursion'); my $lvl = $res->header('X-W3C-Validator-Recursion'); return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. if ($lvl >= $CFG->{'Max Recursion'}) { print redirect $CFG->{'Home Page'}; } else { # Increase recursion level in output. $File->{Templates}->{Result}->param(depth => $lvl++); } } # # Return $_[0] encoded for HTML entities (cribbed from merlyn). # # Note that this is used both for HTML and XML escaping. # sub ent { local $_ = shift; return '' unless defined; # Eliminate warnings # TODO: Err, why have " twice in the character class? ' maybe? s(["<&>"]){'&#' . ord($&) . ';'}ge; # should switch to hex sooner or later return $_; } # # Truncate source lines for report. # sub truncate_line { my $line = shift; my $col = shift; my $start = $col; my $end = $col; for (1..40) { $start-- if ($start - 1 >= 0); # in/de-crement until... $end++ if ($end + 1 <= length $line); # ...we hit end of line. } unless ($end - $start == 80) { if ($start == 0) { # Hit start of line, maybe grab more at end. my $diff = 40 - $col; for (1..$diff) { $end++ if ($end + 1 <= length $line); } } elsif ($end == length $line) { # Hit end of line, maybe grab more at beginning. my $diff = 80 - $col; for (1..$diff) { $start-- if ($start - 1 >= 0); } } } # # Add elipsis at end if necessary. unless ($end == length $line) {substr $line, -3, 3, '…'}; $col = $col - $start; # New offset is diff from $col to $start. $line = substr $line, $start, $end - $start; # Truncate. # # Add elipsis at start if necessary. unless ($start == 0) { substr $line, 0, 3, '…'; $col = $col - 2; }; return $line, $col; } # # Suppress any existing DOCTYPE by commenting it out. sub override_doctype { no strict 'vars'; my $File = shift; my ($dt) = grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; my $pubid = $dt->{PubID}; my $sysid = $dt->{SysID}; my $name = $dt->{Name}; local $dtd = qq('; local $org_dtd = ''; local $HTML = ''; local $seen = FALSE; local $seen_root = FALSE; my $declaration = sub { $seen = TRUE; # No Override if Fallback was requested. if ($File->{Opt}->{FB}->{DOCTYPE}) { $HTML .= $_[0]; # Stash it as is... } else { # Comment it out and insert the new one... $HTML .= "$dtd\n" . ''; $org_dtd = &ent($_[0]); } }; my $start_element = sub{ if ($seen_root) { $HTML .= $_[0]; # Stash it as is... moving on } else { $seen_root = TRUE; if ($seen) { # doctype addition aldready done, we move on $HTML .= $_[0]; } else { # no original doctype present, hence none replaced already # => we sneak the chosen doctype before the root elt $HTML .= "$dtd\n" . $_[0]; } } }; HTML::Parser->new(default_h => [sub {$HTML .= shift}, 'text'], declaration_h => [$declaration, 'text'], start_h => [$start_element, "text"] )->parse(join "\n", @{$File->{Content}})->eof(); $File->{Content} = [split /\n/, $HTML]; if ($seen) { unless (($File->{Opt}->{FB}->{DOCTYPE}) or ($File->{Opt}->{DOCTYPE} eq $CFG->{Types}->{$File->{DOCTYPE}}->{Display} )) { &add_warning('W13', { W13_org => $org_dtd, W13_new => $File->{Opt}->{DOCTYPE}, }); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } else { if ($File->{Opt}->{FB}->{DOCTYPE}) { &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } else { &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } } return $File; } # # Generate a HTML report of detected errors. sub report_errors ($) { my $File = shift; my $Errors = []; my %Errors_bytype; my $number_of_errors = 0; my $number_of_warnings = 0; my $number_of_info = 0; # Hash to keep track of how many of each error is reported. my %Msgs; # Used to generate a UID for explanations. # for the sake of readability, at least until the xmlwf errors have explanations, # we push the errors from the XML parser at the END of the error list. foreach my $errmsg (@{$File->{WF_Errors}}){ push @{$File->{Errors}}, $errmsg; } if (scalar @{$File->{Errors}}) { foreach my $err (@{$File->{Errors}}) { my $line; my $col; if ($err->{char} =~ /^[0-9]+$/ ){ ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char}); $line = &mark_error($line, $col); } else { $col = length($File->{Content}->[$err->{line}-1]); $col = 80 if ($col > 80); ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $col); $line = &ent($line) . "…"; $col = 0; } my $explanation = ""; if ($err->{num}) { my $num = $err->{num}; # if (exists $Msgs{$num}) { # We've already seen this message... # if ($File->{Opt}->{Verbose}) { # ...so only repeat it in Verbose mode. # $explanation = qq(\n \n); # } # } else { # $Msgs{$num} = 1; $explanation .= Encode::decode_utf8("\n $RSRC->{msg}->{$num}->{verbose}\n") if exists $RSRC->{msg}->{$num} && exists $RSRC->{msg}->{$num}->{verbose}; # } my $_msg = $RSRC->{msg}->{nomsg}->{verbose}; $_msg =~ s//$num/g; if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) { $_msg =~ s///g } else { my $escaped_uri = uri_escape($File->{URI}); $_msg =~ s//$escaped_uri/g; } $_msg =~ s//$CFG->{'Home Page'}/g; $explanation = " $_msg\n$explanation"; # The send feedback plea. } $err->{src} = $line; $err->{col} = ' ' x $col; $err->{expl} = $explanation; if ($err->{type} eq 'I') { $err->{class} = 'msg_info'; $err->{err_type_err} = 0; $err->{err_type_warn} = 0; $err->{err_type_info} = 1; $number_of_info += 1; } elsif ($err->{type} eq 'E') { $err->{class} = 'msg_err'; $err->{err_type_err} = 1; $err->{err_type_warn} = 0; $err->{err_type_info} = 0; $number_of_errors += 1; } elsif (($err->{type} eq 'W') or ($err->{type} eq 'X') ) { $err->{class} = 'msg_warn'; $err->{err_type_err} = 0; $err->{err_type_warn} = 1; $err->{err_type_info} = 0; $number_of_warnings += 1; } # TODO other classes for "X" etc? FIXME find all types of message. push @{$Errors}, $err; if (($File->{Opt}->{'Group Errors'}) and (($err->{type} eq 'E') or ($err->{type} eq 'W')or ($err->{type} eq 'X'))) { # index by num for errors and warnings only - info usually give context of error or warning if (! exists $Errors_bytype{$err->{num}}) { $Errors_bytype{$err->{num}}->{instances} = []; my $msg_text; if ($err->{num} ne 'xmlwf') { $msg_text = $RSRC->{msg}->{$err->{num}}->{original}; $msg_text =~ s/%1/X/; $msg_text =~ s/%2/Y/; } else { ## FIXME ## we need a catalog of errors from our XML parser $msg_text = "XML Parsing Error"; } $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; $Errors_bytype{$err->{num}}->{type} = $err->{type}; $Errors_bytype{$err->{num}}->{class} = $err->{class}; $Errors_bytype{$err->{num}}->{err_type_err} = $err->{err_type_err}; $Errors_bytype{$err->{num}}->{err_type_warn} = $err->{err_type_warn}; $Errors_bytype{$err->{num}}->{err_type_info} = $err->{err_type_info}; } push @ { $Errors_bytype{$err->{num}}->{instances} }, $err; } } } if ($File->{Opt}->{'Group Errors'}) { $Errors = []; for my $err_num (keys %Errors_bytype){ push @{$Errors}, $Errors_bytype{$err_num}; } } # we are not sorting errors by line, as it would break the position # of auxiliary messages such as "start tag was here". We'll have to live with # the fact that XML well-formedness errors are listed first, then validation errors #else { # sort error by lines # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; #} return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; } # # Chop the source line into 3 pieces; the character at which the error # was detected, and everything to the left and right of that position. # That way we can add markup to the relevant char without breaking &ent(). sub mark_error (\$\$) { my $line = shift; my $col = shift; # # Left side... my $left; { my $offset = 0; # Left side allways starts at 0. my $length; if ($col - 1 < 0) { # If error is at start of line... $length = 0; # ...floor to 0 (no negative offset). } elsif ($col == length $line) { # If error is at EOL... $length = $col - 1; # ...leave last char to indicate position. } else { # Otherwise grab everything up to pos of error. $length = $col; } $left = substr $line, $offset, $length; } # # The character where the error was detected. my $char; { my $offset; my $length = 1; # Length is always 1; the char where error was found. if ($col == length $line) { # If err is at EOL... $offset = $col - 1; # ...then grab last char on line instead. } else { $offset = $col; # Otherwise just grab the char. } $char = substr $line, $offset, $length; $char = &ent($char); } # # The right side up to the end of the line... my $right; { my $offset; my $length; # Offset... if ($col == length $line) { # If at EOL... $offset = 0; # Don't bother as there is nothing left to grab. } else { $offset = $col + 1; # Otherwise get everything from char-after-error. } # Length... if ($col == length $line) { # If at end of line... $length = 0; # ...then don't grab anything. } else { $length = length($line) - ($col - 1); # Otherwise get the rest of the line. } $right = substr $line, $offset, $length; } $char = qq($char); $line = &ent($left) . $char . &ent($right); return $line; } # # Create a HTML representation of the document. sub source { my $File = shift; # Remove any BOM since we're not at BOT anymore... $File->{Content}->[0] = substr $File->{Content}->[0], ($File->{BOM} ? 1 : 0); # remove BOM my @source = map({file_source_line => $_}, @{$File->{Content}}); return \@source; } # # Do an initial parse of the Document Entity to extract FPI. sub preparse_doctype { my $File = shift; # # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). $File->{DOCTYPE} = ''; $File->{Root} = ''; my $dtd = sub { return if $File->{Root}; # TODO: The \s and \w are probably wrong now that the strings are utf8_on ($File->{Root}, $File->{DOCTYPE}) = shift =~ m()si; }; my $start = sub { my $tag = shift; my $attr = shift; my %attr = map {lc($_) => $attr->{$_}} keys %{$attr}; if ($File->{Root}) { return unless $tag eq $File->{Root}; } else { $File->{Root} = $tag; } if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; }; # we use HTML::Parser as pre-parser. May use html5lib or other in the future my $p = HTML::Parser->new(api_version => 3); # if content-type has shown we should pre-parse with XML mode, use that # otherwise (mostly text/html cases) use default mode $p->xml_mode(TRUE) if ($File->{Mode} eq 'XML'); $p->ignore_elements('BODY'); $p->ignore_elements('body'); $p->handler(declaration => $dtd, 'text'); $p->handler(start => $start, 'tag,attr'); $p->parse(join "\n", @{$File->{Content}}); # TODO: These \s here are probably wrong now that the strings are utf8_on $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; $File->{DOCTYPE} =~ s(^\s+){ }g; $File->{DOCTYPE} =~ s(\s+$){ }g; $File->{DOCTYPE} =~ s(\s+) { }g; return $File; } # # Print out the raw error output for debugging. sub show_errors ($) { # @@FIXME This is broken with SGML::Parser::OpenSP my $file_raw_errors = ""; for (@{shift->{DEBUG}->{Errors}}) { $file_raw_errors .= ent $_; } return $file_raw_errors; } # # Preprocess CGI parameters. sub prepCGI { my $File = shift; my $q = shift; my $path_info; { # @@@HACK: # CGI.pm's _name_and_path_from_env has query string related issues; # just trump the query string for the duration of calling methods we # know we don't need it for and which have been affected in the past. # More info: http://www.w3.org/Bugs/Public/show_bug.cgi?id=4365 local $ENV{REQUEST_URI} = URI->new($ENV{REQUEST_URI})->path() if $ENV{REQUEST_URI}; # The URL to this CGI script. $File->{Env}->{'Self URI'} = $q->url(); $path_info = $q->path_info(); } # Avoid CGI.pm's "exists but undef" behaviour. if (scalar $q->param) { foreach my $param ($q->param) { next if $param eq 'uploaded_file'; # 'uploaded_file' contains data. next if $param eq 'fragment'; # Ditto 'fragment'. next if $q->param($param) eq '0'; # Keep false-but-set params. # # Parameters that are given to us without specifying a value get # set to "1" (the "TRUE" constant). This is so we can test for the # boolean value of a parameter instead of first checking whether # the param was given and then testing it's value. Needed because # CGI.pm sets ";param" and ";param=" to a boolean false value # (undef() or a null string, respectively). $q->param($param, TRUE) unless $q->param($param); } } # apparently, with mod_perl2, $path_info is empty even if it should be filled # working around that if (!$path_info) { if ($File->{Env}->{'Self URI'} =~ /check\/referer$/){ $path_info = '/referer'; $File->{Env}->{'Self URI'} =~ s/\/referer//; } } # Futz the URL so "/referer" works. if ($path_info) { if ($path_info eq '/referer' or $path_info eq '/referrer') { if ($q->referer) { $q->param('uri', $q->referer); print redirect &self_url_q($q, $File); exit; } else { print redirect $File->{Env}->{'Self URI'} . '?uri=referer'; exit; } } else { print redirect &self_url_q($q, $File); exit; } } # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # Munge the URL to include commonly omitted prefix. my $u = $q->param('uri'); $q->param('uri', "http://$u") if $u && $u =~ m(^www)i; # Issue a redirect for uri=referer. if ($q->param('uri') and $q->param('uri') eq 'referer') { if ($q->referer) { $q->param('uri', $q->referer); print redirect &self_url_q($q, $File); exit; } else { # Redirected from /check/referer to /check?uri=referer because # the browser didn't send a Referer header, or the request was # for /check?uri=referer but no Referer header was found. $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_referer_error => TRUE); } } # Supersede URL with an uploaded file. if ($q->param('uploaded_file')) { $q->param('uri', 'upload://' . $q->param('uploaded_file')); $File->{'Is Upload'} = TRUE; # Tag it for later use. } # Supersede URL with an uploaded fragment. if ($q->param('fragment')) { $q->param('uri', 'upload://Form Submission'); $File->{'Direct Input'} = TRUE; # Tag it for later use. } # Redirect to a GETable URL if method is POST without a file upload. if (defined $q->request_method and $q->request_method eq 'POST' and not ($File->{'Is Upload'} or $File->{'Direct Input'})) { my $thispage = &self_url_q($q, $File); print redirect $thispage; exit; } # # Flag an error if we didn't get a file to validate. unless ($q->param('uri')) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_uri_error => TRUE); $File->{Templates}->{Error}->param(fatal_uri_scheme => 'undefined'); } return $q; } # # Set parse mode (SGML or XML) based on a number of preparsed factors: # * HTTP Content-Type # * Doctype Declaration # * XML Declaration sub set_parse_mode { my $File = shift; my $CFG = shift; my $fpi = $File->{DOCTYPE}; $File->{ModeChoice} = ''; my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; my $parseModeFromMimeType = $File->{Mode}; my $begincontent = join "\x20",@{$File->{Content}}; # for the sake of xml decl detection, # the 10 first lines should be safe my $parseModeFromXMLDecl = ( $begincontent =~ /^ [\x20|\x09|\x0D|\x0A]* # whitespace before the decl should not be happening # but we are greedy for the sake of detection, not validation <\?xml # start matching an XML Declaration [\x20|\x09|\x0D|\x0A]+ # x20, x09, xD and xA are the allowed "xml white space" version [\x20|\x09|\x0D|\x0A]* = # for documents, version info is mandatory [\x20|\x09|\x0D|\x0A]* ("1.0"|"1.1"|'1.0'|'1.1') # hardcoding the existing XML versions. # Maybe we should use \d\.\d ([\x20|\x09|\x0D|\x0A]+ encoding [\x20|\x09|\x0D|\x0A]* = [\x20|\x09|\x0D|\x0A]* ("[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+') )? # encoding info is optional ([\x20|\x09|\x0D|\x0A]+ standalone [\x20|\x09|\x0D|\x0A]* = [\x20|\x09|\x0D|\x0A]* ("yes"|"no"|'yes'|'no') )? # ditto standalone info, optional [\x20|\x09|\x0D|\x0A]* \?> # end of XML Declaration /x ? 'XML' : 'TBD' ); if (($parseModeFromMimeType eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { # if the mime type is text/html (ambiguous, hence TBD mode) # and XML prolog detection was unsuccessful # and the doctype isn't in the catalogue... we scream &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} }); return; } $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'XML'; if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromMimeType eq 'TBD')) { # if all three factors are useless to give us a parse mode # => we use SGML as a default $File->{Mode} = 'SGML'; $File->{ModeChoice} = 'Fallback'; # and send warning about the fallback &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{DOCTYPE} }); return; } elsif ($parseModeFromMimeType ne 'TBD') { # if The mime type gives clear indication of the parse mode if (($parseModeFromDoctype ne 'TBD') and ($parseModeFromMimeType ne $parseModeFromDoctype)) { # if document-type recommended mode and content-type recommended mode clash # shoot a warning &add_warning('W07', { W07_mime => $File->{ContentType}, W07_ct => $parseModeFromMimeType, W07_dtd => $parseModeFromDoctype, }); } # mime type has precedence, we stick to it $File->{ModeChoice} = 'Mime'; return; } elsif ($parseModeFromDoctype ne 'TBD') { # the mime type is ambiguous (hence we didn't stop at the previous test) # but by now we're sure that the document type is a good indication # so we use that. $File->{Mode} = $parseModeFromDoctype; $File->{ModeChoice} = 'Doctype'; return; } else { # this is the last case. We know that all three modes are not TBD, # yet both mime type and doctype tests have failed => we are saved by the XML declaration $File->{Mode} = $parseModeFromXMLDecl; $File->{ModeChoice} = 'XMLDecl'; } } # # Utility sub to tell if mode "is" XML. sub is_xml {shift->{Mode} eq 'XML'}; # # Check charset conflicts and add any warnings necessary. sub charset_conflicts { my $File = shift; # # Handle the case where there was no charset to be found. unless ($File->{Charset}->{Use}) { &add_warning('W17', {}); $File->{Tentative} |= T_WARN; } # # Add a warning if there was charset info conflict (HTTP header, # XML declaration, or element). # filtering out some of the warnings in direct input mode where HTTP encoding is a "fake" if ((charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) and not ($File->{'Direct Input'})) { &add_warning('W18', { W18_http => $File->{Charset}->{HTTP}, W18_xml => $File->{Charset}->{XML}, W18_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not ($File->{'Direct Input'})) { &add_warning('W19', { W19_http => $File->{Charset}->{HTTP}, W19_meta => $File->{Charset}->{META}, W19_use => $File->{Charset}->{Use}, }); } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) { &add_warning('W20', { W20_http => $File->{Charset}->{XML}, W20_xml => $File->{Charset}->{META}, }); $File->{Tentative} |= T_WARN; } return $File; } # # Transcode to UTF-8 sub transcode { my $File = shift; my $general_charset = $File->{Charset}->{Use}; my $exact_charset = $general_charset; # TODO: This should be done before transcode() if ($general_charset eq 'utf-16') { if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { $exact_charset = $File->{Charset}->{Auto}; } else { $exact_charset = 'utf-16be'; } } my $cs = $exact_charset; if ($CFG->{Charsets}->{$cs}) { if ($CFG->{Charsets}->{$cs} =~ /ERR /) { # The encoding is not supported due to policy $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_transcode_error => TRUE); $File->{Templates}->{Error}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{Templates}->{Error}->param(fatal_transcode_errmsg => "This encoding is not supported by the validator."); return $File; } elsif ($CFG->{Charsets}->{$cs} =~ /X /) { # possibly problematic, we recommend another alias my $recommended_charset = $CFG->{Charsets}->{$cs}; $recommended_charset =~ s/X //; &add_warning('W22', { W22_declared => $cs, W22_suggested => $recommended_charset, }); } } # Does the system support decoding this encoding? eval { Encode::decode($cs, ''); }; if ($@) { # This system's Encode installation does not support # the character encoding; might need additional modules $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_transcode_error => TRUE); $File->{Templates}->{Error}->param(fatal_transcode_charset => $cs); # @@FIXME might need better text $File->{Templates}->{Error}->param(fatal_transcode_errmsg => "Encoding not supported."); return $File; } elsif (!$CFG->{Charsets}->{$cs}) { # not in the list, but technically OK -> we warn &add_warning('W24', { W24_declared => $cs, }); } my $output; my $input = $File->{Bytes}; # Try to transcode eval { $output = Encode::decode($cs, $input, Encode::FB_CROAK); }; # Transcoding failed if ($@) { my $line_num = 0; foreach my $input_line (split /\r\n|\n|\r/, $input){ $line_num++; eval { Encode::decode($cs, $input_line, Encode::FB_CROAK); }; if ($@) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_byte_error => TRUE); $File->{Templates}->{Error}->param(fatal_byte_lines => $line_num); $File->{Templates}->{Error}->param(fatal_byte_charset => $cs); my $croak_message = $@; $croak_message =~ s/ at .*//; $File->{Templates}->{Error}->param(fatal_byte_error_msg => $croak_message); } } return $File; } # @@FIXME is this what we want? $output =~ s/\015?\012/\n/g; # make sure we deal only with unix newlines # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 $output =~ s/(\r\n|\n|\r)/\n/g; #debug: we could check if the content has utf8 bit on with #$output= utf8::is_utf8($output) ? 1 : 0; $File->{Content} = [split/\n/, $output]; return $File; } sub find_encodings { my $File = shift; my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); if (defined $bom) { # @@FIXME this BOM entry should not be needed at all! $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); $File->{Charset}->{Auto} = lc $bom; } else { $File->{Charset}->{Auto} = lc($first[0]) if @first; } my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); $File->{Charset}->{XML} = lc $xml if defined $xml; my %metah; foreach my $try (@first) { # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok my $meta = lc HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); $metah{$meta}++ if defined $meta and length $meta; } my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; $File->{Charset}->{META} = lc $meta[0] if @meta; return $File; } # # Abort with a message if an error was flagged at point. sub abort_if_error_flagged { my $File = shift; my $Flags = shift; return unless $File->{'Error Flagged'}; return if $File->{'Error Handled'}; # Previous error, keep going. $File->{Templates}->{Error}->param(fatal_error => TRUE); if ($File->{Opt}->{Output} eq 'html') { &prep_template($File, $File->{Templates}->{Error}); # transcode output from perl's internal to utf-8 and output print Encode::encode('UTF-8',$File->{Templates}->{Error}->output); exit; } else { #@@FIXME: This is borked after templatification. # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF."); # A fatal error has occurred while processing the requested document. Processing # has continued but any later output will be of dubious quality. Limitations of # this output mode prevent the full error message from being returned; please # retry this operation in interactive mode using the web interface to see the # actual error message. # .EOF. #@@FIXME; $File->{'Error Handled'} = TRUE; } } # # conflicting encodings sub charset_not_equal { my $encodingA = shift; my $encodingB = shift; return $encodingA && $encodingB && ($encodingA ne $encodingB); } # # Construct a self-referential URL from a CGI.pm $q object. sub self_url_q { my ($q, $File) = @_; my $thispage = $File->{Env}->{'Self URI'} . '?'; $thispage .= 'uri=' . uri_escape($q->param('uri')) . ';' if $q->param('uri'); $thispage .= 'ss=1;' if $q->param('ss'); $thispage .= 'sp=1;' if $q->param('sp'); $thispage .= 'noatt=1;' if $q->param('noatt'); $thispage .= 'outline=1;' if $q->param('outline'); $thispage .= 'No200=1;' if $q->param('No200'); $thispage .= 'verbose=1;' if $q->param('verbose'); $thispage .= 'group=1;' if $q->param('group'); if ($q->param('doctype') and not $q->param('doctype') =~ /(Inline|detect)/i) { $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; } if ($q->param('charset') and not $q->param('charset') =~ /detect/i) { $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; } $thispage =~ s/[\?;]$//; return $thispage; } # # Return random Tip with it's URL. sub get_tip { my @tipAddrs = keys %{$CFG->{Tips}}; my $tipAddr = $tipAddrs[rand scalar @tipAddrs]; my $tipSlug = $CFG->{Tips}->{$tipAddr}; return [$tipAddr, $tipSlug]; } # # Construct a self-referential URL from a $File object. sub self_url_file { my $File = shift; my $thispage = $File->{Env}->{'Self URI'}; my $escaped_uri = uri_escape($File->{URI}); $thispage .= qq(?uri=$escaped_uri); $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; $thispage .= ';sp=1' if $File->{Opt}->{'Show Parsetree'}; $thispage .= ';noatt=1' if $File->{Opt}->{'No Attributes'}; $thispage .= ';outline=1' if $File->{Opt}->{'Outline'}; # These were not added by report_valid; perhaps they should be? # $thispage .= ';verbose=1' if $File->{Opt}->{'Verbose'}; # $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; # $thispage .= ';No200=1' if $File->{Opt}->{'No200'}; return $thispage; } ##### sub W3C::Validator::SAXHandler::new { my $class = shift; my $parser = shift; my $File = shift; my $self = { _file => $File, _parser => $parser, current_heading_level => 0, am_in_heading => 0 }; bless $self, $class; } sub W3C::Validator::SAXHandler::characters { my ($self, $chars) = @_; if ($self->{am_in_heading} == 1) { my $data = $chars->{Data}; $data =~ s/[\r|\n]/ /g; $self->{_file}->{heading_outline} .= $data; } } sub W3C::Validator::SAXHandler::data { my ($self, $chars) = @_; if ($self->{am_in_heading} == 1) { my $data = $chars->{Data}; $data =~ s/[\r|\n]/ /g; $self->{_file}->{heading_outline} .= $data; } } sub W3C::Validator::SAXHandler::start_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h([1-6])$/i) { $self->{_file}->{heading_outline} ||= ""; $self->{_file}->{heading_outline} .= " " x int($1) . "[$element->{Name}] "; $self->{am_in_heading} = 1; } my $has_xmlns = FALSE; my $xmlns_value = undef; if ( ($self->{_file}->{Mode} eq 'XML')){ # if in XML mode, find namespace used for each element foreach my $attr (keys %{$element->{Attributes}}) { if ($element->{Attributes}->{$attr}->{Name} eq "xmlns") { # Try with SAX method if($element->{Attributes}->{$attr}->{Value} ){ $has_xmlns = TRUE; $xmlns_value = $element->{Attributes}->{$attr}->{Value}; } #next if ($has_xmlns); # the following is not SAX, but OPENSP specific if ( $element->{Attributes}->{$attr}->{Defaulted}){ if ($element->{Attributes}->{$attr}->{Defaulted} eq "specified") { $has_xmlns = TRUE; foreach my $datachunk (@{$element->{Attributes}->{$attr}->{CdataChunks}}) { $xmlns_value = $xmlns_value.$datachunk->{Data}; } } } } } } my $doctype = $self->{_file}->{DOCTYPE}; if (!defined($CFG->{Types}->{$doctype}->{Name}) || $element->{Name} ne $CFG->{Types}->{$doctype}->{Name}) { # add to list of non-root namespaces push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; } elsif (!$has_xmlns and $CFG->{Types}->{$doctype}->{"Namespace Required"}) { # whine if the root xmlns attribute is noted as required by spec, # but not present my $err; my $location = $self->{_parser}->get_location(); $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "no-xmlns"; $err->{type} = "E"; $err->{msg} = "Missing xmlns attribute for element ".$element->{Name} . ". The value should be: $CFG->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } elsif ($has_xmlns and (defined $CFG->{Types}->{$doctype}->{Namespace}) and ($xmlns_value ne $CFG->{Types}->{$doctype}->{Namespace}) ) { # whine if root xmlns element is not the one specificed by the spec my $err; my $location = $self->{_parser}->get_location(); $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $location->{LineNumber}; $err->{char} = $location->{ColumnNumber}; $err->{num} = "wrong-xmlns"; $err->{type} = "E"; $err->{msg} = "Wrong xmlns attribute for element $element->{Name}. ". "The value should be: $CFG->{Types}->{$doctype}->{Namespace}"; # ... $self->{_file}->{'Is Valid'} = FALSE; push @{$self->{_file}->{Errors}}, $err; } } sub W3C::Validator::SAXHandler::end_element { my ($self, $element) = @_; if ($element->{Name} =~ /^h[1-6]$/i) { $self->{_file}->{heading_outline} .= "\n"; $self->{am_in_heading} = 0; } } sub W3C::Validator::SAXHandler::error { my $self = shift; my $error = shift; my $mess; eval { $mess = $self->{_parser}->split_message($error); }; if ($@) { # this is a message that S:P:O could not handle, we skip its croaking return; } my $File = $self->{_file}; # TODO: this does not filter out errors in DTDs. my $err; $err->{src} = '...'; # do this with show_open_entities()? $err->{line} = $mess->{primary_message}{LineNumber}; $err->{char} = $mess->{primary_message}{ColumnNumber}; $err->{num} = $mess->{primary_message}{Number}; $err->{type} = $mess->{primary_message}{Severity}; $err->{msg} = $mess->{primary_message}{Text}; # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, # so we filter out a few errors for now if ($File->{Mode} eq 'XML') { if ($err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { # the error is about a missing xmlns: attribute definition" return ; # this is not an error, 'cause we said so } } if ($err->{num} eq '187') # filtering out no "document type declaration; will parse without validation" # if root element is not html and mode is xml... { # since parsing was done without validation, result can only be "well-formed" if ($File->{Mode} eq 'XML' and lc($File->{Root}) ne 'html') { $File->{XMLWF_ONLY} = TRUE; add_warning('W09xml', {}); return; # don't report this as an error, just proceed } # if mode is not XML, we do report the error. It should not happen in the case of without doctype, # in that case the error message will be #344 } if (($err->{num} eq '113') and ($err->{msg} =~ /xml:space/)) { # FIXME # this is a problem with some of the "flattened" W3C DTDs, filtering them out to not confuse users. # hoping to get the DTDs fixed, see http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html return; # don't report this, just proceed } if (($err->{num} eq '248') or ($err->{num} eq '247') or ($err->{num} eq '246')) { # these two errors should be triggered by -wmin-tag to report shorttag used, # but we're making them warnings, not errors # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 $err->{type} = "W"; } # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors # beyond EOL. If you see this warning in your web server logs, please # let the validator developers know, see http://validator.w3.org/feedback.html # As long as $err may be from somewhere else than the document (such as # from a DTD) and we have no way of identifying these cases, this # produces bogus results and error log spewage, so commented out for now. # if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { # warn("Warning: reported error column larger than line length " . # "($err->{char} > $l) in $File->{URI} line $err->{line}, " . # "OpenSP bug? Resetting to line length."); # $err->{char} = $l; # } # No or unknown FPI and a relative SI. if ($err->{msg} =~ m(cannot (open|find))) { $File->{'Error Flagged'} = TRUE; $File->{Templates}->{Error}->param(fatal_parse_extid_error => TRUE); $File->{Templates}->{Error}->param(fatal_parse_extid_msg => $err->{msg}); } # No DOCTYPE found! We are falling back to vanilla DTD if ($err->{msg} =~ m(prolog can\'t be omitted)) { if (lc($File->{Root}) eq 'html') { my $dtd = ($File->{Mode} eq 'XML' ? 'XHTML 1.0 Transitional' : 'HTML 4.01 Transitional' ); add_warning('W09', {W09_dtd => $dtd}); } else { # not html root element, we are not using fallback if ($File->{Mode} ne 'XML') { $File->{'Is Valid'} = FALSE; add_warning('W09nohtml', {}); } } return; # Don't report this as a normal error. } # TODO: calling exit() here is probably a bad idea abort_if_error_flagged($File, O_DOCTYPE); push @{$File->{Errors}}, $err; # ... $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; if (defined $mess->{aux_message}) { # "duplicate id ... first defined here" style messages push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, char => $mess->{aux_message}{ColumnNumber}, msg => $mess->{aux_message}{Text}, type => 'I', }; } } ##### package W3C::Validator::UserAgent; use LWP::UserAgent 1.90 qw(); # Need 1.90 for protocols_(allowed|forbidden) use Net::hostent qw(gethostbyname); use Net::IP qw(); use Socket qw(inet_ntoa); use base qw(LWP::UserAgent); sub new { my ($proto, $CFG, $File, @rest) = @_; my $class = ref($proto) || $proto; my $self = $class->SUPER::new(@rest); $self->{'W3C::Validator::CFG'} = $CFG; $self->{'W3C::Validator::File'} = $File; return $self; } sub redirect_ok { my ($self, $req, $res) = @_; return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); } sub uri_ok { my ($self, $uri) = @_; return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or !$uri->can('host')); my $addr = my $iptype = undef; if (my $host = gethostbyname($uri->host())) { $addr = inet_ntoa($host->addr()) if $host->addr(); if ($addr && (my $ip = Net::IP->new($addr))) { $iptype = $ip->iptype(); } } if ($iptype && $iptype ne 'PUBLIC') { my $File = $self->{'W3C::Validator::File'}; $File->{'Error Flagged'} = 1; $File->{Templates}->{Error}->param(fatal_ip_error => 1); $File->{Templates}->{Error}->param(fatal_ip_hostname => 1) if $addr and $uri->host() ne $addr; $File->{Templates}->{Error}->param(fatal_ip_host => ($uri->host() || 'undefined')); return 0; } return 1; } # Local Variables: # mode: perl # indent-tabs-mode: nil # tab-width: 2 # perl-indent-level: 2 # End: # ex: ts=2 sw=2 et