diff options
author | ville <ville@localhost> | 2009-11-23 22:15:19 +0000 |
---|---|---|
committer | ville <ville@localhost> | 2009-11-23 22:15:19 +0000 |
commit | 5b60b407ff2794593e3ccffd7533d5e2449e08c8 (patch) | |
tree | 2eca829ccbc61f847db1be6f4d6cfa934055b073 | |
parent | 675a1c5355fc61459257de958e7090837e36507a (diff) | |
download | markup-validator-5b60b407ff2794593e3ccffd7533d5e2449e08c8.zip markup-validator-5b60b407ff2794593e3ccffd7533d5e2449e08c8.tar.gz markup-validator-5b60b407ff2794593e3ccffd7533d5e2449e08c8.tar.bz2 |
Run perltidy on perl sources.
-rwxr-xr-x | httpd/cgi-bin/check | 5439 | ||||
-rwxr-xr-x | httpd/cgi-bin/sendfeedback.pl | 154 | ||||
-rw-r--r-- | misc/bundle/Makefile.PL | 70 | ||||
-rwxr-xr-x | misc/docs_errors.pl | 183 | ||||
-rwxr-xr-x | misc/spmpp.pl | 18 |
5 files changed, 3124 insertions, 2740 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index 94cb85d..b0bf649 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -14,7 +14,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.719 2009-11-16 19:50:26 ville Exp $ +# $Id: check,v 1.720 2009-11-23 22:15:18 ville Exp $ # # Disable buffering on STDOUT! $| = 1; @@ -33,8 +33,8 @@ use strict; use warnings; use utf8; - package W3C::Validator::MarkupValidator; + # # Modules. See also the BEGIN block further down below. # @@ -44,26 +44,26 @@ package W3C::Validator::MarkupValidator; # polluting our namespace. # -use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect); -use CGI::Carp qw(carp croak fatalsToBrowser); -use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 -use Encode qw(); -use Encode::Alias qw(); -use Encode::HanExtra qw(); # for some chinese character encodings, - # e.g gb18030 -use File::Spec::Functions qw(catfile rel2abs tmpdir); -use HTML::Encoding 0.52 qw(); -use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref) -use HTML::Template 2.6 qw(); # Need 2.6 for path param, other things. -use HTTP::Headers::Util qw(); -use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content() -use HTTP::Request qw(); -use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. -use JSON 2.00 qw(); +use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect); +use CGI::Carp qw(carp croak fatalsToBrowser); +use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 +use Encode qw(); +use Encode::Alias qw(); +use Encode::HanExtra qw(); # for some chinese character encodings, + # e.g gb18030 +use File::Spec::Functions qw(catfile rel2abs tmpdir); +use HTML::Encoding 0.52 qw(); +use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref) +use HTML::Template 2.6 qw(); # Need 2.6 for path param, other things. +use HTTP::Headers::Util qw(); +use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content() +use HTTP::Request qw(); +use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*. +use JSON 2.00 qw(); use SGML::Parser::OpenSP 0.991 qw(); -use URI qw(); -use URI::Escape qw(uri_escape); -use XML::LibXML qw(); +use URI qw(); +use URI::Escape qw(uri_escape); +use XML::LibXML qw(); ############################################################################### #### Constant definitions. #################################################### @@ -76,160 +76,166 @@ use constant FALSE => 0; # # Tentative Validation Severities. -use constant T_WARN => 4; # 0000 0100 -use constant T_ERROR => 8; # 0000 1000 +use constant T_WARN => 4; # 0000 0100 +use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing -use constant O_SOURCE => 1; # 0000 0001 -use constant O_CHARSET => 2; # 0000 0010 -use constant O_DOCTYPE => 4; # 0000 0100 -use constant O_NONE => 8; # 0000 1000 +use constant O_SOURCE => 1; # 0000 0001 +use constant O_CHARSET => 2; # 0000 0010 +use constant O_DOCTYPE => 4; # 0000 0100 +use constant O_NONE => 8; # 0000 1000 # # Define global variables. use vars qw($DEBUG $CFG %RSRC $VERSION); use constant IS_MODPERL2 => - (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); + (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); # # Things inside BEGIN don't happen on every request in persistent environments # (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { - # Launder data for -T; -AutoLaunder doesn't catch this one. - if (exists $ENV{W3C_VALIDATOR_HOME}) { - $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; - $ENV{W3C_VALIDATOR_HOME} = $1; - } - - # - # Read Config Files. - eval { - my %config_opts = ( - -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), - -MergeDuplicateOptions => TRUE, - -MergeDuplicateBlocks => TRUE, - -SplitPolicy => 'equalsign', - -UseApacheInclude => TRUE, - -IncludeRelative => TRUE, - -InterPolateVars => TRUE, - -AutoLaunder => TRUE, - -AutoTrue => TRUE, - -DefaultConfig => { - Protocols => {Allow => 'http,https'}, - Paths => { - Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), - Cache => '', - }, - External => { - HTML5 => FALSE, - }, - }, - ); - my %cfg = Config::General->new(%config_opts)->getall(); - $CFG = \%cfg; - }; - if ($@) { - die <<".EOF."; + + # Launder data for -T; -AutoLaunder doesn't catch this one. + if (exists $ENV{W3C_VALIDATOR_HOME}) { + $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; + $ENV{W3C_VALIDATOR_HOME} = $1; + } + + # + # Read Config Files. + eval { + my %config_opts = ( + -ConfigFile => + ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), + -MergeDuplicateOptions => TRUE, + -MergeDuplicateBlocks => TRUE, + -SplitPolicy => 'equalsign', + -UseApacheInclude => TRUE, + -IncludeRelative => TRUE, + -InterPolateVars => TRUE, + -AutoLaunder => TRUE, + -AutoTrue => TRUE, + -DefaultConfig => { + Protocols => {Allow => 'http,https'}, + Paths => { + Base => + ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), + Cache => '', + }, + External => {HTML5 => FALSE,}, + }, + ); + my %cfg = Config::General->new(%config_opts)->getall(); + $CFG = \%cfg; + }; + if ($@) { + die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. - } - - # - # Check a filesystem path for existance and "readability". - sub pathcheck (@) { - my %paths = map { $_ => [-d $_, -r _] } @_; - my @_d = grep {not $paths{$_}->[0]} keys %paths; - my @_r = grep {not $paths{$_}->[1]} keys %paths; - return TRUE if (scalar(@_d) + scalar(@_r) == 0); - die <<".EOF." if scalar @_d; + } + + # + # Check a filesystem path for existance and "readability". + sub pathcheck (@) + { + my %paths = map { $_ => [-d $_, -r _] } @_; + my @_d = grep { not $paths{$_}->[0] } keys %paths; + my @_r = grep { not $paths{$_}->[1] } keys %paths; + return TRUE if (scalar(@_d) + scalar(@_r) == 0); + die <<".EOF." if scalar @_d; Does not exist or is not a directory: @_d .EOF. - die <<".EOF." if scalar @_r; + die <<".EOF." if scalar @_r; Directory not readable (permission denied): @_r .EOF. - } - - # - # Check paths in config... - # @@FIXME: This does not do a very good job error-message-wise if a path is - # @@FIXME: missing...; - { - my @dirs = (); - push @dirs, $CFG->{Paths}->{Base}; - push @dirs, $CFG->{Paths}->{Templates}; - push @dirs, $CFG->{Paths}->{SGML}->{Library}; - &pathcheck(@dirs); - } - - # - # Split allowed protocols into a list. - if (my $allowed = delete($CFG->{Protocols}->{Allow})) { - $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; - } - - # Split available languages into a list - if (my $langs = delete($CFG->{Languages})) { - $CFG->{Languages} = [split(/\s+/, $langs)]; - } else { - # Default to english - $CFG->{Languages} = ["en"]; - } - - { # Make types config indexed by FPI. - my $_types = {}; - map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} } - keys %{$CFG->{Types}}; - $CFG->{Types} = $_types; - } - - # - # Change strings to internal constants in MIME type mapping. - for (keys %{$CFG->{MIME}}) { - $CFG->{MIME}->{$_} = 'TBD' unless $CFG->{MIME}->{$_} eq 'SGML' - or $CFG->{MIME}->{$_} eq 'XML'; - } - - # - # Register Encode aliases. - while (my ($key, $value) = each %{$CFG->{Charsets}}) { - Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/); - } - - # - # Set debug flag. - if ($CFG->{'Allow Debug'}) { - $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; - } else { - $DEBUG = FALSE; - } - - # - # Strings - $VERSION = q$Revision: 1.719 $; - $VERSION =~ s/Revision: ([\d\.]+) /$1/; - - # - # Use passive FTP by default. - $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); - - # Read friendly error message file - # 'en_US' should be replaced by $lang for lang-neg - %RSRC = Config::General->new( - -MergeDuplicateBlocks => 1, - -ConfigFile => catfile($CFG->{Paths}->{Templates}, 'en_US', - 'error_messages.cfg'), - )->getall(); - - eval { - local $SIG{__DIE__}; - require Encode::JIS2K; # for optional extra Japanese encodings - }; - -} # end of BEGIN block. + } + + # + # Check paths in config... + # @@FIXME: This does not do a very good job error-message-wise if a path is + # @@FIXME: missing...; + { + my @dirs = (); + push @dirs, $CFG->{Paths}->{Base}; + push @dirs, $CFG->{Paths}->{Templates}; + push @dirs, $CFG->{Paths}->{SGML}->{Library}; + &pathcheck(@dirs); + } + + # + # Split allowed protocols into a list. + if (my $allowed = delete($CFG->{Protocols}->{Allow})) { + $CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)]; + } + + # Split available languages into a list + if (my $langs = delete($CFG->{Languages})) { + $CFG->{Languages} = [split(/\s+/, $langs)]; + } + else { + + # Default to english + $CFG->{Languages} = ["en"]; + } + + { # Make types config indexed by FPI. + my $_types = {}; + map { $_types->{$CFG->{Types}->{$_}->{PubID}} = $CFG->{Types}->{$_} } + keys %{$CFG->{Types}}; + $CFG->{Types} = $_types; + } + + # + # Change strings to internal constants in MIME type mapping. + for (keys %{$CFG->{MIME}}) { + $CFG->{MIME}->{$_} = 'TBD' + unless $CFG->{MIME}->{$_} eq 'SGML' or + $CFG->{MIME}->{$_} eq 'XML'; + } + + # + # Register Encode aliases. + while (my ($key, $value) = each %{$CFG->{Charsets}}) { + Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/); + } + + # + # Set debug flag. + if ($CFG->{'Allow Debug'}) { + $DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'}; + } + else { + $DEBUG = FALSE; + } + + # + # Strings + $VERSION = q$Revision: 1.720 $; + $VERSION =~ s/Revision: ([\d\.]+) /$1/; + + # + # Use passive FTP by default. + $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); + + # Read friendly error message file + # 'en_US' should be replaced by $lang for lang-neg + %RSRC = Config::General->new( + -MergeDuplicateBlocks => 1, + -ConfigFile => + catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'), + )->getall(); + + eval { + local $SIG{__DIE__}; + require Encode::JIS2K; # for optional extra Japanese encodings + }; + +} # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. @@ -274,18 +280,20 @@ $File->{Charset}->{Override} = ''; # From CGI/user override. # # Misc simple types. -$File->{Mode} = 'DTD+SGML'; # Default parse mode is DTD validation in SGML mode. +$File->{Mode} = + 'DTD+SGML'; # Default parse mode is DTD validation in SGML mode. # By default, perform validation (we may perform only xml-wf in some cases) $File->{XMLWF_ONLY} = FALSE; + # # Listrefs. -$File->{Warnings} = []; # Warnings... -$File->{Namespaces} = []; # Other (non-root) Namespaces. +$File->{Warnings} = []; # Warnings... +$File->{Namespaces} = []; # Other (non-root) Namespaces. # By default, doctype-less documents can not be valid -$File->{"DOCTYPEless OK"} = FALSE; -$File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional'; +$File->{"DOCTYPEless OK"} = FALSE; +$File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional'; $File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional'; ############################################################################### @@ -299,42 +307,46 @@ $File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional'; my $lang = $q->param('lang') || ''; my @localizations; foreach my $lang_available (@{$CFG->{Languages}}) { - if ($lang eq $lang_available) { - # Requested language (from parameters) is available, just use it - undef @localizations; - last; - } - push @localizations, - [$lang_available, 1, 'text/html', undef, 'utf-8', $lang_available, undef]; + if ($lang eq $lang_available) { + + # Requested language (from parameters) is available, just use it + undef @localizations; + last; + } + push @localizations, + [ + $lang_available, 1, 'text/html', undef, + 'utf-8', $lang_available, undef + ]; } # If language is not chosen yet, use HTTP-based negotiation if (@localizations) { - require HTTP::Negotiate; - $lang = HTTP::Negotiate::choose(\@localizations); + require HTTP::Negotiate; + $lang = HTTP::Negotiate::choose(\@localizations); } # HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0 $lang ||= 'en_US'; if ($lang eq "en") { - $lang = 'en_US'; # legacy + $lang = 'en_US'; # legacy } $File->{Template_Defaults} = { - die_on_bad_params => FALSE, - loop_context_vars => TRUE, - global_vars => TRUE, - path => [ catfile($CFG->{Paths}->{Templates}, $lang) ], - filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); }, + die_on_bad_params => FALSE, + loop_context_vars => TRUE, + global_vars => TRUE, + path => [catfile($CFG->{Paths}->{Templates}, $lang)], + filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); }, }; if (IS_MODPERL2()) { - $File->{Template_Defaults}->{cache} = TRUE; + $File->{Template_Defaults}->{cache} = TRUE; } elsif ($CFG->{Paths}->{Cache}) { - $File->{Template_Defaults}->{file_cache} = TRUE; - $File->{Template_Defaults}->{file_cache_dir} = - rel2abs($CFG->{Paths}->{Cache}, tmpdir()); + $File->{Template_Defaults}->{file_cache} = TRUE; + $File->{Template_Defaults}->{file_cache_dir} = + rel2abs($CFG->{Paths}->{Cache}, tmpdir()); } undef $lang; @@ -349,32 +361,35 @@ $q = &prepCGI($File, $q); # # Set session switches. -$File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE; -$File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; -$File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; -$File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE; -$File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; -$File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE; -$File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE; -$File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE; +$File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE; +$File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE; +$File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE; +$File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE; +$File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE; +$File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE; +$File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE; +$File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE; $File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401'; -$File->{Opt}->{Charset} = lc($q->param('charset') || ''); -$File->{Opt}->{DOCTYPE} = $q->param('doctype') || ''; -$File->{Opt}->{Output} = $q->param('output') || 'html'; - -$File->{Opt}->{'User Agent'} = $q->param('user-agent') && $q->param('user-agent') ne 1 ? $q->param('user-agent') : "W3C_Validator/$VERSION"; +$File->{Opt}->{Charset} = lc($q->param('charset') || ''); +$File->{Opt}->{DOCTYPE} = $q->param('doctype') || ''; +$File->{Opt}->{Output} = $q->param('output') || 'html'; + +$File->{Opt}->{'User Agent'} = + $q->param('user-agent') && + $q->param('user-agent') ne 1 ? $q->param('user-agent') : + "W3C_Validator/$VERSION"; $File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d; if ($File->{Opt}->{'User Agent'} eq 'mobileok') { - $File->{Opt}->{'User Agent'} = 'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)'; + $File->{Opt}->{'User Agent'} = + 'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)'; } - $File->{Opt}->{'Accept Header'} = $q->param('accept') || ''; $File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || ''; $File->{Opt}->{'Accept-Charset Header'} = $q->param('accept-charset') || ''; $File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d - for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header'); + for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header'); # # "Fallback" info for Character Encoding (fbc), Content-Type (fbt), @@ -389,10 +404,11 @@ $File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE; # regardless of whether it's "0" or "1" (on or off), but only if config # allows the debugging options. if ($CFG->{'Allow Debug'}) { - $DEBUG = $q->param('debug') if defined $q->param('debug'); - $File->{Opt}->{Verbose} = TRUE if $DEBUG; -} else { - $DEBUG = FALSE; # The default. + $DEBUG = $q->param('debug') if defined $q->param('debug'); + $File->{Opt}->{Verbose} = TRUE if $DEBUG; +} +else { + $DEBUG = FALSE; # The default. } $File->{Opt}->{Debug} = $DEBUG; @@ -401,11 +417,13 @@ $File->{Opt}->{Debug} = $DEBUG; # # Get the file and metadata. if ($q->param('uploaded_file')) { - $File = &handle_file($q, $File); -} elsif ($q->param('fragment')) { - $File = &handle_frag($q, $File); -} elsif ($q->param('uri')) { - $File = &handle_uri($q, $File); + $File = &handle_file($q, $File); +} +elsif ($q->param('fragment')) { + $File = &handle_frag($q, $File); +} +elsif ($q->param('uri')) { + $File = &handle_uri($q, $File); } # @@ -430,79 +448,98 @@ $File = find_encodings($File); # # Decide on a charset to use (first part) # -if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. - $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; -} elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) { - # Act as if $http_charset was 'us-ascii'. (MIME rules) - $File->{Charset}->{Use} = 'us-ascii'; - - &add_warning('W01', { - W01_upload => $File->{'Is Upload'}, - W01_agent => $File->{Server}, - W01_ct => $File->{ContentType}, - }); - -} elsif ($File->{Charset}->{XML}) { - $File->{Charset}->{Use} = $File->{Charset}->{XML}; -} elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) { - $File->{Charset}->{Use} = 'utf-16'; -} elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) { - $File->{Charset}->{Use} = "utf-8"; -} elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { - $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) +if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. + $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; +} +elsif ($File->{ContentType} =~ m(^text/([-.a-zA-Z0-9]\+)?xml$)) { + + # Act as if $http_charset was 'us-ascii'. (MIME rules) + $File->{Charset}->{Use} = 'us-ascii'; + + &add_warning( + 'W01', + { W01_upload => $File->{'Is Upload'}, + W01_agent => $File->{Server}, + W01_ct => $File->{ContentType}, + } + ); + +} +elsif ($File->{Charset}->{XML}) { + $File->{Charset}->{Use} = $File->{Charset}->{XML}; +} +elsif ($File->{Charset}->{Auto} =~ /^utf-16[bl]e$/ && $File->{BOM} == 2) { + $File->{Charset}->{Use} = 'utf-16'; +} +elsif ($File->{ContentType} =~ m(^application/([-.a-zA-Z0-9]+\+)?xml$)) { + $File->{Charset}->{Use} = "utf-8"; +} +elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { + $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) } $File->{Charset}->{Use} ||= $File->{Charset}->{META}; # # Handle any Fallback or Override for the charset. if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { - # charset=foo was given to the CGI and it wasn't "autodetect" or empty. - # - # Extract the user-requested charset from CGI param. - my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); - $File->{Charset}->{Override} = lc($override); - - if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode - unless ($File->{Charset}->{Use}) { # no charset detected, actual fallback - &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); - $File->{Tentative} |= T_ERROR; # Tag it as Invalid. - $File->{Charset}->{Use} = $File->{Charset}->{Override}; - } - } - else { # charset "hard override" mode - if (! $File->{Charset}->{Use}) { # overriding "nothing" - &add_warning('W04', { W04_charset => $File->{Charset}->{Override}, - W04_override => TRUE}); - $File->{Tentative} |= T_ERROR; - $File->{Charset}->{Use} = $File->{Charset}->{Override}; - } - elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) { - # Actually overriding something; warn about override. - &add_warning('W03', { W03_use => $File->{Charset}->{Use}, - W03_opt => $File->{Charset}->{Override}}); - $File->{Tentative} |= T_ERROR; - $File->{Charset}->{Use} = $File->{Charset}->{Override}; - } - } -} - -if ($File->{'Direct Input'}) { #explain why UTF-8 is forced + + # charset=foo was given to the CGI and it wasn't "autodetect" or empty. + # + # Extract the user-requested charset from CGI param. + my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2); + $File->{Charset}->{Override} = lc($override); + + if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode + unless ($File->{Charset}->{Use}) + { # no charset detected, actual fallback + &add_warning('W02', {W02_charset => $File->{Charset}->{Override}}); + $File->{Tentative} |= T_ERROR; # Tag it as Invalid. + $File->{Charset}->{Use} = $File->{Charset}->{Override}; + } + } + else { # charset "hard override" mode + if (!$File->{Charset}->{Use}) { # overriding "nothing" + &add_warning( + 'W04', + { W04_charset => $File->{Charset}->{Override}, + W04_override => TRUE + } + ); + $File->{Tentative} |= T_ERROR; + $File->{Charset}->{Use} = $File->{Charset}->{Override}; + } + elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) { + + # Actually overriding something; warn about override. + &add_warning( + 'W03', + { W03_use => $File->{Charset}->{Use}, + W03_opt => $File->{Charset}->{Override} + } + ); + $File->{Tentative} |= T_ERROR; + $File->{Charset}->{Use} = $File->{Charset}->{Override}; + } + } +} + +if ($File->{'Direct Input'}) { #explain why UTF-8 is forced &add_warning('W28', {}); } -unless ($File->{Charset}->{XML} || $File->{Charset}->{META}){ #suggest character encoding info within doc +unless ($File->{Charset}->{XML} || $File->{Charset}->{META}) +{ #suggest character encoding info within doc &add_warning('W27', {}); } - # # Abort if an error was flagged while finding the encoding. -&abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE); +&abort_if_error_flagged($File, O_CHARSET | O_DOCTYPE); $File->{Charset}->{Default} = FALSE; -unless ($File->{Charset}->{Use}) { # No charset given... - $File->{Charset}->{Use} = 'utf-8'; - $File->{Charset}->{Default} = TRUE; - $File->{Tentative} |= T_ERROR; # Can never be valid. +unless ($File->{Charset}->{Use}) { # No charset given... + $File->{Charset}->{Use} = 'utf-8'; + $File->{Charset}->{Default} = TRUE; + $File->{Tentative} |= T_ERROR; # Can never be valid. &add_warning('W04', {W04_charset => "UTF-8"}); } @@ -511,15 +548,16 @@ $File = transcode($File); # Try guessing if it didn't work out if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) { - my $also_tried = 'UTF-8'; - for my $cs (qw(windows-1252 iso-8859-1)) { - last unless $File->{'Error Flagged'}; - $File->{'Error Flagged'} = FALSE; # reset - $File->{Charset}->{Use} = $cs; - &add_warning('W04', { W04_charset => $cs, W04_also_tried => $also_tried }); - $File = transcode($File); - $also_tried .= ", $cs"; - } + my $also_tried = 'UTF-8'; + for my $cs (qw(windows-1252 iso-8859-1)) { + last unless $File->{'Error Flagged'}; + $File->{'Error Flagged'} = FALSE; # reset + $File->{Charset}->{Use} = $cs; + &add_warning('W04', + {W04_charset => $cs, W04_also_tried => $also_tried}); + $File = transcode($File); + $also_tried .= ", $cs"; + } } # if it still does not work, we abandon hope here @@ -528,8 +566,9 @@ if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) { # # Add a warning if doc is UTF-8 and contains a BOM. if ($File->{Charset}->{Use} eq 'utf-8' && - $File->{Content}->[0] =~ m(^\x{FEFF})) { - &add_warning('W21', {}); + $File->{Content}->[0] =~ m(^\x{FEFF})) +{ + &add_warning('W21', {}); } # @@ -547,30 +586,30 @@ if ($File->{Charset}->{Use} eq 'utf-8' && # # Override DOCTYPE if user asked for it. if ($File->{Opt}->{DOCTYPE}) { - if ($File->{Opt}->{DOCTYPE} !~ /(Inline|detect)/i) { - $File = &override_doctype($File); - } - else { - # Get rid of inline|detect for easy truth value checking later - $File->{Opt}->{DOCTYPE} = ''; - } + if ($File->{Opt}->{DOCTYPE} !~ /(Inline|detect)/i) { + $File = &override_doctype($File); + } + else { + + # Get rid of inline|detect for easy truth value checking later + $File->{Opt}->{DOCTYPE} = ''; + } } # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); if ($File->{Opt}->{DOCTYPE} eq "HTML5") { - $File->{DOCTYPE} = "HTML5"; - $File->{Version} = $File->{DOCTYPE}; + $File->{DOCTYPE} = "HTML5"; + $File->{Version} = $File->{DOCTYPE}; } set_parse_mode($File, $CFG); + # # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); - - # before we start the parsing, clean slate $File->{'Is Valid'} = TRUE; $File->{Errors} = []; @@ -581,312 +620,369 @@ $File->{Errors} = []; # ditto, we should try using W3C::Validator::EventHandler, # but it's badly linked to opensp at the moment if (&is_xml($File)) { - if ($File->{DOCTYPE} eq "HTML5") - { - # $File->{DOCTYPE} = "XHTML5"; - # $File->{Version} = "XHTML5"; - } - else { - my $xmlparser = XML::LibXML->new(); - $xmlparser->line_numbers(1); - $xmlparser->validation(0); - $xmlparser->load_ext_dtd(0); - # [NOT] loading the XML catalog for entities resolution as it seems to cause a lot of unnecessary DTD/entities fetching (requires >= 1.53 if enabled) - #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc')); - my $xml_string = join"\n",@{$File->{Content}}; - # the XML parser will check the value of encoding attribute in XML declaration - # so we have to amend it to reflect transcoding. see Bug 4867 - $xml_string =~ s/(<\?xml.*) + if ($File->{DOCTYPE} eq "HTML5") { + + # $File->{DOCTYPE} = "XHTML5"; + # $File->{Version} = "XHTML5"; + } + else { + my $xmlparser = XML::LibXML->new(); + $xmlparser->line_numbers(1); + $xmlparser->validation(0); + $xmlparser->load_ext_dtd(0); + + # [NOT] loading the XML catalog for entities resolution as it seems to cause a lot of unnecessary DTD/entities fetching (requires >= 1.53 if enabled) + #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc')); + my $xml_string = join "\n", @{$File->{Content}}; + + # the XML parser will check the value of encoding attribute in XML declaration + # so we have to amend it to reflect transcoding. see Bug 4867 + $xml_string =~ s/(<\?xml.*) (encoding[\x20|\x09|\x0D|\x0A]*=[\x20|\x09|\x0D|\x0A]*(?:"[A-Za-z][a-zA-Z0-9_-]+"|'[A-Za-z][a-zA-Z0-9_-]+')) (.*\?>)/$1encoding="utf-8"$3/sx; - eval { - $xmlparser->parse_string($xml_string); - }; - $xml_string = undef; - my $xml_parse_errors_line = undef; - my @xmlwf_error_list; - my @xmlwf_obj_error_list; - if (ref($@)) { - # handle a structured error (XML::LibXML::Error object) - # (lib XML::LibXML > 0.66, but will work MUCH better > 0.69 ) - push (@xmlwf_obj_error_list, $@); - my $err_obj = $@; - while($err_obj->_prev()) { - $err_obj = $err_obj->_prev(); - unshift(@xmlwf_obj_error_list, $err_obj); - } - my $num_xmlwf_error = 0; - foreach my $err_obj (@xmlwf_obj_error_list){ - my $err; - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $err_obj->line(); - # -> column() is available in XML::LibXML >= 1.69_2 - $err->{char} = eval { $err_obj->column() }; - $err->{num} = "libxml2-".$err_obj->code(); - $err->{type} = "E"; - $err->{msg} = $err_obj->message(); - # The validator will sometimes fail to dereference entities files - # we're filtering the bogus resulting error - if ($err->{msg} =~ /Entity '\w+' not defined/) { - $err = undef; - next; + eval { $xmlparser->parse_string($xml_string); }; + $xml_string = undef; + my $xml_parse_errors_line = undef; + my @xmlwf_error_list; + my @xmlwf_obj_error_list; + + if (ref($@)) { + + # handle a structured error (XML::LibXML::Error object) + # (lib XML::LibXML > 0.66, but will work MUCH better > 0.69 ) + push(@xmlwf_obj_error_list, $@); + my $err_obj = $@; + while ($err_obj->_prev()) { + $err_obj = $err_obj->_prev(); + unshift(@xmlwf_obj_error_list, $err_obj); + } + my $num_xmlwf_error = 0; + foreach my $err_obj (@xmlwf_obj_error_list) { + my $err; + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $err_obj->line(); + + # -> column() is available in XML::LibXML >= 1.69_2 + $err->{char} = eval { $err_obj->column() }; + $err->{num} = "libxml2-" . $err_obj->code(); + $err->{type} = "E"; + $err->{msg} = $err_obj->message(); + + # The validator will sometimes fail to dereference entities files + # we're filtering the bogus resulting error + if ($err->{msg} =~ /Entity '\w+' not defined/) { + $err = undef; + next; + } + push(@xmlwf_error_list, $err); + $num_xmlwf_error++; } - push (@xmlwf_error_list, $err); - $num_xmlwf_error++; - } - } - elsif ($@) { - my $xmlwf_errors = $@; - my $xmlwf_error_line = undef; - my $xmlwf_error_col = undef; - my $xmlwf_error_msg = undef; - my $got_error_message = 0; - my $got_quoted_line = 0; - my $num_xmlwf_error = 0; - foreach my $msg_line (split "\n", $xmlwf_errors){ - - $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; - $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; - - # first we get the actual error message - if (($got_error_message eq 0) and ($msg_line =~ /^(:\d+:)( parser error : .*)/ )) { - $xmlwf_error_line = $1; - $xmlwf_error_msg = $2; - $xmlwf_error_line =~ s/:(\d+):/$1/; - $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; - $got_error_message = 1; - } - # then we skip the second line, which shows the context (we don't use that) - elsif (($got_error_message eq 1) and ($got_quoted_line eq 0)) { - $got_quoted_line = 1; - } - # we now take the third line, with the pointer to the error's column - elsif (($msg_line =~ /(\s+)\^/) and ($got_error_message eq 1) and ($got_quoted_line eq 1)) { - $xmlwf_error_col = length($1); - } - - # cleanup for a number of bugs for the column number - if (defined($xmlwf_error_col)) { - if ((my $l = length($File->{Content}->[$xmlwf_error_line-1])) < $xmlwf_error_col) { - # http://bugzilla.gnome.org/show_bug.cgi?id=434196 - #warn("Warning: reported error column larger than line length " . - # "($xmlwf_error_col > $l) in $File->{URI} line " . - # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); - $xmlwf_error_col = $l; - } - elsif ($xmlwf_error_col == 79) { - # working around an apparent odd limitation of libxml - # which only gives context for lines up to 80 chars - # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 - # http://bugzilla.gnome.org/show_bug.cgi?id=424017 - $xmlwf_error_col = "> 80"; - # non-int line number will trigger the proper behavior in report_error - } - } - - # when we have all the info (one full error message), proceed and move on to the next error - if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){ - # Reinitializing for the next batch of 3 lines - $got_error_message = 0; - $got_quoted_line = 0; - - # formatting the error message for output - my $err; - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $xmlwf_error_line; - $err->{char} = $xmlwf_error_col; - $err->{num} = 'xmlwf'; - $err->{type} = "E"; - $err->{msg} = $xmlwf_error_msg; - - # The validator will sometimes fail to dereference entities files - # we're filtering the bogus resulting error - if ($err->{msg} =~ /Entity '\w+' not defined/) { - $xmlwf_error_line = undef; - $xmlwf_error_col = undef; - $xmlwf_error_msg = undef; - next; - } - push (@xmlwf_error_list, $err); - $xmlwf_error_line = undef; - $xmlwf_error_col = undef; - $xmlwf_error_msg = undef; - $num_xmlwf_error++; - } - } - } - foreach my $errmsg (@xmlwf_error_list){ - $File->{'Is Valid'} = FALSE; - push @{$File->{WF_Errors}}, $errmsg; - } - } -} + elsif ($@) { + my $xmlwf_errors = $@; + my $xmlwf_error_line = undef; + my $xmlwf_error_col = undef; + my $xmlwf_error_msg = undef; + my $got_error_message = 0; + my $got_quoted_line = 0; + my $num_xmlwf_error = 0; + foreach my $msg_line (split "\n", $xmlwf_errors) { + + $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; + $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; + + # first we get the actual error message + if (($got_error_message eq 0) and + ($msg_line =~ /^(:\d+:)( parser error : .*)/)) + { + $xmlwf_error_line = $1; + $xmlwf_error_msg = $2; + $xmlwf_error_line =~ s/:(\d+):/$1/; + $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; + $got_error_message = 1; + } + + # then we skip the second line, which shows the context (we don't use that) + elsif (($got_error_message eq 1) and ($got_quoted_line eq 0)) { + $got_quoted_line = 1; + } + + # we now take the third line, with the pointer to the error's column + elsif (($msg_line =~ /(\s+)\^/) and + ($got_error_message eq 1) and + ($got_quoted_line eq 1)) + { + $xmlwf_error_col = length($1); + } + # cleanup for a number of bugs for the column number + if (defined($xmlwf_error_col)) { + if (( my $l = + length($File->{Content}->[$xmlwf_error_line - 1]) + ) < $xmlwf_error_col + ) + { + + # http://bugzilla.gnome.org/show_bug.cgi?id=434196 + #warn("Warning: reported error column larger than line length " . + # "($xmlwf_error_col > $l) in $File->{URI} line " . + # "$xmlwf_error_line, libxml2 bug? Resetting to line length."); + $xmlwf_error_col = $l; + } + elsif ($xmlwf_error_col == 79) { + + # working around an apparent odd limitation of libxml + # which only gives context for lines up to 80 chars + # http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420 + # http://bugzilla.gnome.org/show_bug.cgi?id=424017 + $xmlwf_error_col = "> 80"; + + # non-int line number will trigger the proper behavior in report_error + } + } + # when we have all the info (one full error message), proceed and move on to the next error + if ((defined $xmlwf_error_line) and + (defined $xmlwf_error_col) and + (defined $xmlwf_error_msg)) + { + + # Reinitializing for the next batch of 3 lines + $got_error_message = 0; + $got_quoted_line = 0; + + # formatting the error message for output + my $err; + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $xmlwf_error_line; + $err->{char} = $xmlwf_error_col; + $err->{num} = 'xmlwf'; + $err->{type} = "E"; + $err->{msg} = $xmlwf_error_msg; + + # The validator will sometimes fail to dereference entities files + # we're filtering the bogus resulting error + if ($err->{msg} =~ /Entity '\w+' not defined/) { + $xmlwf_error_line = undef; + $xmlwf_error_col = undef; + $xmlwf_error_msg = undef; + next; + } + push(@xmlwf_error_list, $err); + $xmlwf_error_line = undef; + $xmlwf_error_col = undef; + $xmlwf_error_msg = undef; + $num_xmlwf_error++; + } + } + } + foreach my $errmsg (@xmlwf_error_list) { + $File->{'Is Valid'} = FALSE; + push @{$File->{WF_Errors}}, $errmsg; + } + } +} if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) { - if ($CFG->{External}->{HTML5}) { - $File = &html5_validate($File); - &add_warning('W00', { - W00_experimental_name => "HTML5 Conformance Checker", - W00_experimental_URI => "feedback.html" - }); - } - else { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_no_checker => TRUE, - fatal_missing_checker => 'HTML5 Validator', - ); - } + if ($CFG->{External}->{HTML5}) { + $File = &html5_validate($File); + &add_warning( + 'W00', + { W00_experimental_name => "HTML5 Conformance Checker", + W00_experimental_URI => "feedback.html" + } + ); + } + else { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_no_checker => TRUE, + fatal_missing_checker => 'HTML5 Validator', + ); + } } -elsif(($File->{DOCTYPE} eq '') and (($File->{Root} eq "svg") or @{$File->{Namespaces}} >1)){ +elsif (($File->{DOCTYPE} eq '') and + (($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1)) +{ + # we send doctypeless SVG, or any doctypeless XML document with multiple namespaces found, to a different engine # WARNING this is experimental. if ($CFG->{External}->{CompoundXML}) { - $File = &compoundxml_validate($File); - &add_warning('W00', { - W00_experimental_name => "validator.nu Conformance Checker", - W00_experimental_URI => "feedback.html" - }); + $File = &compoundxml_validate($File); + &add_warning( + 'W00', + { W00_experimental_name => "validator.nu Conformance Checker", + W00_experimental_URI => "feedback.html" + } + ); } } else { - $File = &dtd_validate($File); + $File = &dtd_validate($File); } &abort_if_error_flagged($File, 0); # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. -if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { - $File->{Version} = 'XML'; -} else { - $File->{Version} ||= $File->{DOCTYPE}; +if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') { + $File->{Version} = 'XML'; +} +else { + $File->{Version} ||= $File->{DOCTYPE}; } # # Get the pretty text version of the FPI if a mapping exists. if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) { - $File->{Version} = $prettyver; + $File->{Version} = $prettyver; } # # check the received mime type against Allowed mime types -if ($File->{ContentType}){ - my @allowedMediaType = - split(/\s+/, $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); - my $usedCTisAllowed; - if (scalar @allowedMediaType) { - $usedCTisAllowed = FALSE; - foreach (@allowedMediaType) { $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); } - } - else { - # wedon't know what media type is recommended, so better shut up - $usedCTisAllowed = TRUE; - } - if(! $usedCTisAllowed ){ - &add_warning('W23', { - W23_type => $File->{ContentType}, - W23_type_pref => $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, - w23_doctype => $File->{Version} - }); - } +if ($File->{ContentType}) { + my @allowedMediaType = + split(/\s+/, + $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || ''); + my $usedCTisAllowed; + if (scalar @allowedMediaType) { + $usedCTisAllowed = FALSE; + foreach (@allowedMediaType) { + $usedCTisAllowed = TRUE if ($_ eq $File->{ContentType}); + } + } + else { + + # wedon't know what media type is recommended, so better shut up + $usedCTisAllowed = TRUE; + } + if (!$usedCTisAllowed) { + &add_warning( + 'W23', + { W23_type => $File->{ContentType}, + W23_type_pref => + $CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred}, + w23_doctype => $File->{Version} + } + ); + } } # # Warn about unknown, incorrect, or missing Namespaces. if ($File->{Namespace}) { - my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; - - if (&is_xml($File)) { - if ($ns eq $File->{Namespace}) { - &add_warning('W10', { - W10_ns => $File->{Namespace}, - W10_type => $File->{Type}, - }); + my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; + + if (&is_xml($File)) { + if ($ns eq $File->{Namespace}) { + &add_warning( + 'W10', + { W10_ns => $File->{Namespace}, + W10_type => $File->{Type}, + } + ); + } + } + else { + &add_warning( + 'W11', + { W11_ns => $File->{Namespace}, + w11_doctype => $File->{DOCTYPE} + } + ); + } +} +else { + if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { + &add_warning('W12', {}); } - } else { - &add_warning('W11', {W11_ns => $File->{Namespace}, - w11_doctype => $File->{DOCTYPE}}); - } -} else { - if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { - &add_warning('W12', {}); - } } - ## if invalid content, AND if requested, pass through tidy if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) { - eval { - local $SIG{__DIE__}; - require HTML::Tidy; - my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); - my $cleaned = $tidy->clean(join("\n", @{$File->{Content}})); - $cleaned = Encode::decode_utf8($cleaned); - $File->{Tidy} = $cleaned; - }; - if ($@) { - (my $errmsg = $@) =~ s/ at .*//s; - &add_warning('W29', { W29_msg => $errmsg }); - } + eval { + local $SIG{__DIE__}; + require HTML::Tidy; + my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}}); + my $cleaned = $tidy->clean(join("\n", @{$File->{Content}})); + $cleaned = Encode::decode_utf8($cleaned); + $File->{Tidy} = $cleaned; + }; + if ($@) { + (my $errmsg = $@) =~ s/ at .*//s; + &add_warning('W29', {W29_msg => $errmsg}); + } } my $template; if ($File->{Opt}->{Output} eq 'earl') { - $template = &get_template($File, 'earl_xml.tmpl'); -} elsif ($File->{Opt}->{Output} eq 'n3') { - $template = &get_template($File, 'earl_n3.tmpl'); -} elsif ($File->{Opt}->{Output} eq 'json') { - $template = &get_template($File, 'json_output.tmpl'); -} elsif ($File->{Opt}->{Output} eq 'ucn') { - $template = &get_template($File, 'ucn_output.tmpl'); -} elsif ($File->{Opt}->{Output} eq 'soap12') { - if ($CFG->{'Enable SOAP'} != 1) { # API disabled - ideally this should have been sent before performing validation... - print CGI::header(-status => 503, -content_language => "en", - -type => "text/html", -charset => "utf-8" - ); - $template = &get_template($File, 'soap_disabled.tmpl'); - } elsif ($File->{'Error Flagged'}) { # should send SOAP fault message - $template = &get_template($File, 'soap_fault.tmpl'); - # we fill the soap fault template - #with the variables that had been passed to the HTML fatal error template - my $errtmpl = &get_template($File, 'fatal-error.tmpl'); - foreach my $fparam ($errtmpl->param()) { - $template->param($fparam => $errtmpl->param($fparam)); - } - } else { - $template = &get_template($File, 'soap_output.tmpl'); - } -} else { - $template = &get_template($File, 'result.tmpl'); + $template = &get_template($File, 'earl_xml.tmpl'); +} +elsif ($File->{Opt}->{Output} eq 'n3') { + $template = &get_template($File, 'earl_n3.tmpl'); +} +elsif ($File->{Opt}->{Output} eq 'json') { + $template = &get_template($File, 'json_output.tmpl'); +} +elsif ($File->{Opt}->{Output} eq 'ucn') { + $template = &get_template($File, 'ucn_output.tmpl'); +} +elsif ($File->{Opt}->{Output} eq 'soap12') { + if ($CFG->{'Enable SOAP'} != 1) + { # API disabled - ideally this should have been sent before performing validation... + print CGI::header( + -status => 503, + -content_language => "en", + -type => "text/html", + -charset => "utf-8" + ); + $template = &get_template($File, 'soap_disabled.tmpl'); + } + elsif ($File->{'Error Flagged'}) { # should send SOAP fault message + $template = &get_template($File, 'soap_fault.tmpl'); + + # we fill the soap fault template + #with the variables that had been passed to the HTML fatal error template + my $errtmpl = &get_template($File, 'fatal-error.tmpl'); + foreach my $fparam ($errtmpl->param()) { + $template->param($fparam => $errtmpl->param($fparam)); + } + } + else { + $template = &get_template($File, 'soap_output.tmpl'); + } +} +else { + $template = &get_template($File, 'result.tmpl'); } &prep_template($File, $template); &fin_template($File, $template); $template->param(file_warnings => $File->{Warnings}); -$template->param(tidy_output => $File->{Tidy}); -$template->param(file_source => &source($File)) - if ($template->param('opt_show_source') or ($File->{'Is Upload'}) or ($File->{'Direct Input'})); +$template->param(tidy_output => $File->{Tidy}); +$template->param(file_source => &source($File)) + if ($template->param('opt_show_source') or + ($File->{'Is Upload'}) or + ($File->{'Direct Input'})); if ($File->{Opt}->{Output} eq 'json') { - # No JSON escaping in HTML::Template (and "JS" is not the right thing here) - my $json = JSON->new(); - $json->allow_nonref(TRUE); - for my $msgs ($template->param("file_errors"), - $template->param("file_warnings")) { - next unless $msgs; - for my $msg (@$msgs) { - for my $key (qw(msg expl)) { - $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key}; - } + + # No JSON escaping in HTML::Template (and "JS" is not the right thing here) + my $json = JSON->new(); + $json->allow_nonref(TRUE); + for my $msgs ($template->param("file_errors"), + $template->param("file_warnings")) + { + next unless $msgs; + for my $msg (@$msgs) { + for my $key (qw(msg expl)) { + $msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key}; + } + } } - } } # transcode output from perl's internal to utf-8 and output @@ -897,935 +993,1005 @@ print Encode::encode('UTF-8', $template->output); undef $File; exit; - ############################################################################# # Subroutine definitions ############################################################################# -sub get_template ($$) { - my ($File, $fname) = @_; - if (!$File->{_Templates}->{$fname}) { - my $tmpl = - HTML::Template->new(%{$File->{Template_Defaults}}, filename => $fname); - $tmpl->param(cfg_home_page => $CFG->{'Home Page'}); - $File->{_Templates}->{$fname} = $tmpl; - } - return $File->{_Templates}->{$fname}; +sub get_template ($$) +{ + my ($File, $fname) = @_; + if (!$File->{_Templates}->{$fname}) { + my $tmpl = HTML::Template->new(%{$File->{Template_Defaults}}, + filename => $fname); + $tmpl->param(cfg_home_page => $CFG->{'Home Page'}); + $File->{_Templates}->{$fname} = $tmpl; + } + return $File->{_Templates}->{$fname}; } # TODO: need to bring in fixes from html5_validate() here -sub compoundxml_validate (\$) { - my $File = shift; - my $ua = new W3C::Validator::UserAgent ($CFG, $File); +sub compoundxml_validate (\$) +{ + my $File = shift; + my $ua = new W3C::Validator::UserAgent($CFG, $File); + + $File->{ParserName} = "validator.nu"; + $File->{ParserOpts} = ""; - $File->{ParserName} = "validator.nu"; - $File->{ParserOpts} = ""; + my $url = URI->new($CFG->{External}->{CompoundXML}); + $url->query_form(out => "xml"); - my $url = URI->new($CFG->{External}->{CompoundXML}); - $url->query_form(out => "xml"); + my $req = HTTP::Request->new(POST => $url); - my $req = HTTP::Request->new(POST => $url); + if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { - if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { - # Doctype or charset overridden, need to use $File->{Content} in UTF-8 - # because $File->{Bytes} is not affected by the overrides. This will - # most likely be a source of errors about internal/actual charset - # differences as long as our transcoding process does not "fix" the - # charset info in XML declaration and meta http-equiv (any others?). - if($File->{'Direct Input'}) { # sane default when using html5 validator by direct input - $req->content_type("application/xml; charset=UTF-8"); + # Doctype or charset overridden, need to use $File->{Content} in UTF-8 + # because $File->{Bytes} is not affected by the overrides. This will + # most likely be a source of errors about internal/actual charset + # differences as long as our transcoding process does not "fix" the + # charset info in XML declaration and meta http-equiv (any others?). + if ($File->{'Direct Input'}) + { # sane default when using html5 validator by direct input + $req->content_type("application/xml; charset=UTF-8"); + } + else { + $req->content_type("$File->{ContentType}; charset=UTF-8"); + } + $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { - $req->content_type("$File->{ContentType}; charset=UTF-8"); - } - $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); - } - else { - # Pass original bytes, Content-Type and charset as-is. - # We trust that our and validator.nu's interpretation of line numbers - # is the same (regardless of EOL chars used in the document). - - my @content_type = ($File->{ContentType} => undef); - push(@content_type, charset => $File->{Charset}->{HTTP}) - if $File->{Charset}->{HTTP}; - - $req->content_type(HTTP::Headers::Util::join_header_words(@content_type)); - $req->content_ref(\$File->{Bytes}); - } - - $req->content_language($File->{ContentLang}) if $File->{ContentLang}; - # Intentionally using direct header access instead of $req->last_modified - $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; - - # If not in debug mode, gzip the request (LWP >= 5.817) - eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; - - my $res = $ua->request($req); - if (! $res->is_success()) { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_no_checker => TRUE, - fatal_missing_checker => 'HTML5 Validator', - ); - } - else { - my $content = &get_content($File, $res); - return $File if $File->{'Error Flagged'}; - # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output - # I wish we could use XML::LibXML::Reader here. but SHAME on those major - # unix distributions still shipping with libxml2 2.6.16… 4 years after its release - my $xml_reader = XML::LibXML->new(); - my $xmlDOM; - eval { $xmlDOM = $xml_reader->parse_string( $content);}; - if ($@) { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_no_checker => TRUE, - fatal_missing_checker => 'HTML5 Validator', - ); - return $File; - } - my @nodelist = $xmlDOM->getElementsByTagName("messages"); - my $messages_node = $nodelist[0]; - my @message_nodes = $messages_node->childNodes; - foreach my $message_node (@message_nodes) { - my $message_type = $message_node->localname; - my $err; - my ($xml_error_line, $xml_error_col, $xml_error_msg, $xml_error_expl); - if ($message_type eq "error") { - $err->{type} = "E"; - $File->{'Is Valid'} = FALSE; - } - elsif ($message_type eq "info") { - $err->{type} = "I"; # by default - we find warnings in the type attribute (below) - } - if ($message_node->hasAttributes()) { - my @attributelist = $message_node->attributes(); - foreach my $attribute (@attributelist) { - if($attribute->name eq "type"){ - if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { - $err->{type} = "W"; + # Pass original bytes, Content-Type and charset as-is. + # We trust that our and validator.nu's interpretation of line numbers + # is the same (regardless of EOL chars used in the document). + + my @content_type = ($File->{ContentType} => undef); + push(@content_type, charset => $File->{Charset}->{HTTP}) + if $File->{Charset}->{HTTP}; + + $req->content_type( + HTTP::Headers::Util::join_header_words(@content_type)); + $req->content_ref(\$File->{Bytes}); + } + + $req->content_language($File->{ContentLang}) if $File->{ContentLang}; + + # Intentionally using direct header access instead of $req->last_modified + $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; + + # If not in debug mode, gzip the request (LWP >= 5.817) + eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; + + my $res = $ua->request($req); + if (!$res->is_success()) { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_no_checker => TRUE, + fatal_missing_checker => 'HTML5 Validator', + ); + } + else { + my $content = &get_content($File, $res); + return $File if $File->{'Error Flagged'}; + + # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output + # I wish we could use XML::LibXML::Reader here. but SHAME on those major + # unix distributions still shipping with libxml2 2.6.16… 4 years after its release + my $xml_reader = XML::LibXML->new(); + my $xmlDOM; + eval { $xmlDOM = $xml_reader->parse_string($content); }; + if ($@) { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_no_checker => TRUE, + fatal_missing_checker => 'HTML5 Validator', + ); + return $File; + } + my @nodelist = $xmlDOM->getElementsByTagName("messages"); + my $messages_node = $nodelist[0]; + my @message_nodes = $messages_node->childNodes; + foreach my $message_node (@message_nodes) { + my $message_type = $message_node->localname; + my $err; + my ($xml_error_line, $xml_error_col, + $xml_error_msg, $xml_error_expl + ); + if ($message_type eq "error") { + $err->{type} = "E"; + $File->{'Is Valid'} = FALSE; + } + elsif ($message_type eq "info") { + $err->{type} = "I" + ; # by default - we find warnings in the type attribute (below) + } + if ($message_node->hasAttributes()) { + my @attributelist = $message_node->attributes(); + foreach my $attribute (@attributelist) { + if ($attribute->name eq "type") { + if (($attribute->getValue() eq "warning") and + ($message_type eq "info")) + { + $err->{type} = "W"; + } + + } + if ($attribute->name eq "last-column") { + $xml_error_col = $attribute->getValue(); + } + if ($attribute->name eq "last-line") { + $xml_error_line = $attribute->getValue(); + } + + } + } + my @child_nodes = $message_node->childNodes; + foreach my $child_node (@child_nodes) { + if ($child_node->localname eq "message") { + $xml_error_msg = $child_node->toString(); + $xml_error_msg =~ s,</?[^>]*>,,gsi; + } + if ($child_node->localname eq "elaboration") { + $xml_error_expl = $child_node->toString(); + $xml_error_expl =~ s,</?elaboration>,,gi; + $xml_error_expl = + "\n<div class=\"ve xml\">$xml_error_expl</div>\n"; + } + } + + # formatting the error message for output + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $xml_error_line; + $err->{char} = $xml_error_col; + $err->{num} = 'validator.nu'; + $err->{msg} = $xml_error_msg; + $err->{expl} = $xml_error_expl; + + if ($err->{msg} =~ + /Using the preset for (.*) based on the root namespace/) + { + $File->{DOCTYPE} = $1; } + else { + push @{$File->{Errors}}, $err; + } + + # @@ TODO message explanation / elaboration + } + } + return $File; +} + +sub html5_validate (\$) +{ + my $File = shift; + my $ua = new W3C::Validator::UserAgent($CFG, $File); + + $File->{ParserName} = "validator.nu"; + $File->{ParserOpts} = ""; + + my $url = URI->new($CFG->{External}->{HTML5}); + $url->query_form(out => "xml"); + + my $req = HTTP::Request->new(POST => $url); - } - if($attribute->name eq "last-column") { - $xml_error_col = $attribute->getValue(); - } - if($attribute->name eq "last-line") { - $xml_error_line = $attribute->getValue(); - } - - } - } - my @child_nodes = $message_node->childNodes; - foreach my $child_node (@child_nodes) { - if ($child_node->localname eq "message") { - $xml_error_msg= $child_node->toString(); - $xml_error_msg =~ s,</?[^>]*>,,gsi; - } - if ($child_node->localname eq "elaboration") { - $xml_error_expl = $child_node->toString(); - $xml_error_expl =~ s,</?elaboration>,,gi; - $xml_error_expl = "\n<div class=\"ve xml\">$xml_error_expl</div>\n"; - } - } - # formatting the error message for output - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $xml_error_line; - $err->{char} = $xml_error_col; - $err->{num} = 'validator.nu'; - $err->{msg} = $xml_error_msg; - $err->{expl} = $xml_error_expl; - - if ($err->{msg} =~ /Using the preset for (.*) based on the root namespace/ ) { - $File->{DOCTYPE} = $1; - } - else { - push @{$File->{Errors}}, $err; - } - # @@ TODO message explanation / elaboration - } - } - return $File; -} - - -sub html5_validate (\$) { - my $File = shift; - my $ua = new W3C::Validator::UserAgent ($CFG, $File); - - $File->{ParserName} = "validator.nu"; - $File->{ParserOpts} = ""; - - my $url = URI->new($CFG->{External}->{HTML5}); - $url->query_form(out => "xml"); - - my $req = HTTP::Request->new(POST => $url); - - if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { - # Doctype or charset overridden, need to use $File->{Content} in UTF-8 - # because $File->{Bytes} is not affected by the overrides. This will - # most likely be a source of errors about internal/actual charset - # differences as long as our transcoding process does not "fix" the - # charset info in XML declaration, meta http-equiv/charset and/or BOM - # (any others?). - - my $ct = $File->{'Direct Input'} ? "text/html" : $File->{ContentType}; - $req->content_type("$ct; charset=UTF-8"); - $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); - } - else { - # Pass original bytes, Content-Type and charset as-is. - # We trust that our and validator.nu's interpretation of line numbers - # is the same later when displaying error contexts (regardless of EOL chars - # used in the document). - - if ($File->{'Direct Input'}) { - $req->content_type("text/html; charset=UTF-8"); + if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { + + # Doctype or charset overridden, need to use $File->{Content} in UTF-8 + # because $File->{Bytes} is not affected by the overrides. This will + # most likely be a source of errors about internal/actual charset + # differences as long as our transcoding process does not "fix" the + # charset info in XML declaration, meta http-equiv/charset and/or BOM + # (any others?). + + my $ct = $File->{'Direct Input'} ? "text/html" : $File->{ContentType}; + $req->content_type("$ct; charset=UTF-8"); + $req->content(Encode::encode_utf8(join("\n", @{$File->{Content}}))); } else { - my @ct = ($File->{ContentType} => undef); - push(@ct, charset => $File->{Charset}->{HTTP}) - if $File->{Charset}->{HTTP}; - $req->content_type(HTTP::Headers::Util::join_header_words(@ct)); - } - $req->content_ref(\$File->{Bytes}); - } - - $req->content_language($File->{ContentLang}) if $File->{ContentLang}; - # Intentionally using direct header access instead of $req->last_modified - # (the latter takes seconds since epoch, but $File->{Modified} is an already - # formatted string). - $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; - - # If not in debug mode, gzip the request (LWP >= 5.817) - eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; - - my $res = $ua->request($req); - if (! $res->is_success()) { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_no_checker => TRUE, - fatal_missing_checker => 'HTML5 Validator', - ); - } - else { - my $content = &get_content($File, $res); - return $File if $File->{'Error Flagged'}; - # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output - # I wish we could use XML::LibXML::Reader here. but SHAME on those major - # unix distributions still shipping with libxml2 2.6.16… 4 years after its release - my $xml_reader = XML::LibXML->new(); - my $xmlDOM; - eval { $xmlDOM = $xml_reader->parse_string( $content);}; - if ($@) { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_no_checker => TRUE, - fatal_missing_checker => 'HTML5 Validator', - ); - return $File; - } - my @nodelist = $xmlDOM->getElementsByTagName("messages"); - my $messages_node = $nodelist[0]; - my @message_nodes = $messages_node->childNodes; - foreach my $message_node (@message_nodes) { - my $message_type = $message_node->localname; - my $err; - my ($html5_error_line, $html5_error_col, $html5_error_msg, $html5_error_expl); - # TODO: non-document errors should receive different/better treatment, - # but this is better than hiding all problems for now (#6747) - if ($message_type eq "error" || $message_type eq "non-document-error") { - $err->{type} = "E"; - $File->{'Is Valid'} = FALSE; - } - elsif ($message_type eq "info") { - $err->{type} = "I"; # by default - we find warnings in the type attribute (below) - } - if ($message_node->hasAttributes()) { - my @attributelist = $message_node->attributes(); - foreach my $attribute (@attributelist) { - if($attribute->name eq "type"){ - if (($attribute->getValue() eq "warning") and ($message_type eq "info")) { - $err->{type} = "W"; + # Pass original bytes, Content-Type and charset as-is. + # We trust that our and validator.nu's interpretation of line numbers + # is the same later when displaying error contexts (regardless of EOL chars + # used in the document). + + if ($File->{'Direct Input'}) { + $req->content_type("text/html; charset=UTF-8"); + } + else { + my @ct = ($File->{ContentType} => undef); + push(@ct, charset => $File->{Charset}->{HTTP}) + if $File->{Charset}->{HTTP}; + $req->content_type(HTTP::Headers::Util::join_header_words(@ct)); + } + $req->content_ref(\$File->{Bytes}); + } + + $req->content_language($File->{ContentLang}) if $File->{ContentLang}; + + # Intentionally using direct header access instead of $req->last_modified + # (the latter takes seconds since epoch, but $File->{Modified} is an already + # formatted string). + $req->header('Last-Modified', $File->{Modified}) if $File->{Modified}; + + # If not in debug mode, gzip the request (LWP >= 5.817) + eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug}; + + my $res = $ua->request($req); + if (!$res->is_success()) { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_no_checker => TRUE, + fatal_missing_checker => 'HTML5 Validator', + ); + } + else { + my $content = &get_content($File, $res); + return $File if $File->{'Error Flagged'}; + + # and now we parse according to http://wiki.whatwg.org/wiki/Validator.nu_XML_Output + # I wish we could use XML::LibXML::Reader here. but SHAME on those major + # unix distributions still shipping with libxml2 2.6.16… 4 years after its release + my $xml_reader = XML::LibXML->new(); + my $xmlDOM; + eval { $xmlDOM = $xml_reader->parse_string($content); }; + if ($@) { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_no_checker => TRUE, + fatal_missing_checker => 'HTML5 Validator', + ); + return $File; + } + my @nodelist = $xmlDOM->getElementsByTagName("messages"); + my $messages_node = $nodelist[0]; + my @message_nodes = $messages_node->childNodes; + foreach my $message_node (@message_nodes) { + my $message_type = $message_node->localname; + my $err; + my ($html5_error_line, $html5_error_col, + $html5_error_msg, $html5_error_expl + ); + + # TODO: non-document errors should receive different/better treatment, + # but this is better than hiding all problems for now (#6747) + if ($message_type eq "error" || + $message_type eq "non-document-error") + { + $err->{type} = "E"; + $File->{'Is Valid'} = FALSE; } + elsif ($message_type eq "info") { + $err->{type} = "I" + ; # by default - we find warnings in the type attribute (below) + } + if ($message_node->hasAttributes()) { + my @attributelist = $message_node->attributes(); + foreach my $attribute (@attributelist) { + if ($attribute->name eq "type") { + if (($attribute->getValue() eq "warning") and + ($message_type eq "info")) + { + $err->{type} = "W"; + } + + } + elsif ($attribute->name eq "last-column") { + $html5_error_col = $attribute->getValue(); + } + elsif ($attribute->name eq "last-line") { + $html5_error_line = $attribute->getValue(); + } - } - elsif ($attribute->name eq "last-column") { - $html5_error_col = $attribute->getValue(); - } - elsif ($attribute->name eq "last-line") { - $html5_error_line = $attribute->getValue(); - } - - } - } - my @child_nodes = $message_node->childNodes; - foreach my $child_node (@child_nodes) { - if ($child_node->localname eq "message") { - $html5_error_msg = $child_node->textContent(); - } - elsif ($child_node->localname eq "elaboration") { - $html5_error_expl = $child_node->toString(); - $html5_error_expl =~ s,</?elaboration>,,gi; - $html5_error_expl = "\n<div class=\"ve html5\">$html5_error_expl</div>\n"; - } - } - # formatting the error message for output - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $html5_error_line; - $err->{char} = $html5_error_col; - $err->{num} = 'html5'; - $err->{msg} = $html5_error_msg; - $err->{expl} = $html5_error_expl; - push @{$File->{Errors}}, $err; - # @@ TODO message explanation / elaboration - } - } - return $File; -} - - -sub dtd_validate (\$) { - my $File = shift; - my $opensp = SGML::Parser::OpenSP->new(); - my $parser_name = "SGML::Parser::OpenSP"; - # - # By default, use SGML catalog file and SGML Declaration. - my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); - - # default parsing options - my @spopt = qw(valid non-sgml-char-ref no-duplicate); - - # - # Switch to XML semantics if file is XML. - if (&is_xml($File)) { - $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); - push(@spopt, 'xml'); - } - else { - # add warnings for shorttags - push(@spopt, 'min-tag'); - } - - $File->{ParserName} = $parser_name; - $File->{ParserOpts} = join(" ", @spopt); - - # - # Parser configuration - $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); - $opensp->catalogs($catalog); - $opensp->show_error_numbers(1); - $opensp->warnings(@spopt); - - # - # Restricted file reading is disabled on Win32 for the time - # beeing since neither SGML::Parser::OpenSP nor check auto- - # magically set search_dirs to include the temp directory - # so restricted file reading would defunct the Validator. - $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; - - - my $h; # event handler - if ($File->{Opt}->{Outline}) { - $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG); - } - else { - $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG); - } - - $opensp->handler($h); - $opensp->parse_string(join"\n",@{$File->{Content}}); - - # Make sure there are no circular references, otherwise the script - # would leak memory until mod_perl unloads it which could take some - # time. @@FIXME It's probably overly careful though. - $opensp->handler(undef); - undef $h->{_parser}; - undef $h->{_file}; - undef $h; - undef $opensp; - - # - # Set Version to be the FPI initially. - $File->{Version} = $File->{DOCTYPE}; - return $File; + } + } + my @child_nodes = $message_node->childNodes; + foreach my $child_node (@child_nodes) { + if ($child_node->localname eq "message") { + $html5_error_msg = $child_node->textContent(); + } + elsif ($child_node->localname eq "elaboration") { + $html5_error_expl = $child_node->toString(); + $html5_error_expl =~ s,</?elaboration>,,gi; + $html5_error_expl = + "\n<div class=\"ve html5\">$html5_error_expl</div>\n"; + } + } + + # formatting the error message for output + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $html5_error_line; + $err->{char} = $html5_error_col; + $err->{num} = 'html5'; + $err->{msg} = $html5_error_msg; + $err->{expl} = $html5_error_expl; + push @{$File->{Errors}}, $err; + + # @@ TODO message explanation / elaboration + } + } + return $File; +} + +sub dtd_validate (\$) +{ + my $File = shift; + my $opensp = SGML::Parser::OpenSP->new(); + my $parser_name = "SGML::Parser::OpenSP"; + + # + # By default, use SGML catalog file and SGML Declaration. + my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); + + # default parsing options + my @spopt = qw(valid non-sgml-char-ref no-duplicate); + + # + # Switch to XML semantics if file is XML. + if (&is_xml($File)) { + $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); + push(@spopt, 'xml'); + } + else { + + # add warnings for shorttags + push(@spopt, 'min-tag'); + } + + $File->{ParserName} = $parser_name; + $File->{ParserOpts} = join(" ", @spopt); + + # + # Parser configuration + $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); + $opensp->catalogs($catalog); + $opensp->show_error_numbers(1); + $opensp->warnings(@spopt); + + # + # Restricted file reading is disabled on Win32 for the time + # beeing since neither SGML::Parser::OpenSP nor check auto- + # magically set search_dirs to include the temp directory + # so restricted file reading would defunct the Validator. + $opensp->restrict_file_reading(1) unless $^O eq 'MSWin32'; + + my $h; # event handler + if ($File->{Opt}->{Outline}) { + $h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG); + } + else { + $h = W3C::Validator::EventHandler->new($opensp, $File, $CFG); + } + + $opensp->handler($h); + $opensp->parse_string(join "\n", @{$File->{Content}}); + + # Make sure there are no circular references, otherwise the script + # would leak memory until mod_perl unloads it which could take some + # time. @@FIXME It's probably overly careful though. + $opensp->handler(undef); + undef $h->{_parser}; + undef $h->{_file}; + undef $h; + undef $opensp; + + # + # Set Version to be the FPI initially. + $File->{Version} = $File->{DOCTYPE}; + return $File; } # # Generate HTML report. -sub prep_template ($$) { - my $File = shift; - my $T = shift; - - # - # XML mode... - $T->param(is_xml => &is_xml($File)); - - # - # Upload? - $T->param(is_upload => $File->{'Is Upload'}); - - # - # Direct Input? - $T->param(is_direct_input => $File->{'Direct Input'}); - - # - # The URI... - $T->param(file_uri => $File->{URI}); - $T->param(file_uri_param => uri_escape($File->{URI})); - - # - # Set URL for page title. - $T->param(page_title_url => $File->{URI}); - - # - # Metadata... - $T->param(file_modified => $File->{Modified}); - $T->param(file_server => $File->{Server}); - $T->param(file_size => $File->{Size}); - $T->param(file_contenttype => $File->{ContentType}); - $T->param(file_charset => $File->{Charset}->{Use}); - $T->param(file_doctype => $File->{DOCTYPE}); - - # - # Output options... - $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); - $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); - $T->param(opt_show_outline => $File->{Opt}->{Outline}); - $T->param(opt_verbose => $File->{Opt}->{Verbose}); - $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); - $T->param(opt_no200 => $File->{Opt}->{No200}); - - # - # Tip of the Day... - my $tip = &get_tip(); - $T->param(tip_uri => $tip->[0]); - $T->param(tip_slug => $tip->[1]); - - # Root Element - $T->param(root_element => $File->{Root}); - - # Namespaces... - $T->param(file_namespace => $File->{Namespace}); - my %seen_ns = (); - my @bulk_ns = @{$File->{Namespaces}}; - $File->{Namespaces} = []; # reinitialize the list of non-root namespaces - # ... and then get a uniq version of it - foreach my $single_namespace (@bulk_ns) { - push(@{$File->{Namespaces}}, $single_namespace) unless (($single_namespace eq $File->{Namespace}) or $seen_ns{$single_namespace}++); - } - my @nss = map({uri => $_}, @{$File->{Namespaces}}); - $T->param(file_namespaces => \@nss) if @nss; - - if ($File->{Opt}->{DOCTYPE}) { - my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; - $T->param($over_doctype_param => TRUE); - } - - if ($File->{Opt}->{Charset}) { - my $over_charset_param = "override charset $File->{Opt}->{Charset}"; - $T->param($over_charset_param => TRUE); - } - - # Allow content-negotiation - if ($File->{Opt}->{'Accept Header'}) { - $T->param('accept' => $File->{Opt}->{'Accept Header'}); - } - if ($File->{Opt}->{'Accept-Language Header'}) { - $T->param('accept-language' => $File->{Opt}->{'Accept-Language Header'}); - } - if ($File->{Opt}->{'Accept-Charset Header'}) { - $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'}); - } - if ($File->{Opt}->{'User Agent'}) { - $T->param('user-agent' => $File->{Opt}->{'User Agent'}); - } - if ($File->{'Error Flagged'}) { - $T->param(fatal_error => TRUE); - } -} - -sub fin_template ($$) { - my $File = shift; - my $T = shift; - - - - # - # Set debug info for HTML report. - $T->param(opt_debug => $DEBUG); - $T->param(debug => - [ - map({name => $_, value => $ENV{$_}}, - qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), - { name => 'Content-Encoding', value => $File->{ContentEnc} }, - { name => 'Content-Language', value => $File->{ContentLang} }, - { name => 'Content-Location', value => $File->{ContentLoc} }, - { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, - { name => 'Parse Mode', value => $File->{Mode} }, - { name => 'Parse Mode Factor', value => $File->{ModeChoice} }, - { name => 'Parser', value => $File->{ParserName} }, - { name => 'Parser Options', value => $File->{ParserOpts} }, - ], - ); - - if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) { - - my $default_doctype = (&is_xml($File) ? - $File->{"Default DOCTYPE"}->{"XHTML"} : $File->{"Default DOCTYPE"}->{"HTML"}); - $T->param(file_version => "$default_doctype"); - } - else { - $T->param(file_version => $File->{Version}); - } - my ($num_errors,$num_warnings, $num_info, $reported_errors) = &report_errors($File); - if ($num_errors+$num_warnings > 0) - { - $T->param(has_errors => 1); - } - $T->param(valid_errors_num => $num_errors); - $num_warnings += scalar @{$File->{Warnings}}; - $T->param(valid_warnings_num => $num_warnings); - my $number_of_errors = ""; # textual form of $num_errors - my $number_of_warnings = ""; # textual form of $num_errors - -# The following is a bit hack-ish, but will enable us to have some logic -# for a human-readable display of the number, with cases for 0, 1, 2 and above -# (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above) - - if ($num_errors > 1) { - $T->param(number_of_errors_is_0 => FALSE ); - $T->param(number_of_errors_is_1 => FALSE); - if ($num_errors eq 2) { - $T->param(number_of_errors_is_2 => TRUE); +sub prep_template ($$) +{ + my $File = shift; + my $T = shift; + + # + # XML mode... + $T->param(is_xml => &is_xml($File)); + + # + # Upload? + $T->param(is_upload => $File->{'Is Upload'}); + + # + # Direct Input? + $T->param(is_direct_input => $File->{'Direct Input'}); + + # + # The URI... + $T->param(file_uri => $File->{URI}); + $T->param(file_uri_param => uri_escape($File->{URI})); + + # + # Set URL for page title. + $T->param(page_title_url => $File->{URI}); + + # + # Metadata... + $T->param(file_modified => $File->{Modified}); + $T->param(file_server => $File->{Server}); + $T->param(file_size => $File->{Size}); + $T->param(file_contenttype => $File->{ContentType}); + $T->param(file_charset => $File->{Charset}->{Use}); + $T->param(file_doctype => $File->{DOCTYPE}); + + # + # Output options... + $T->param(opt_show_source => $File->{Opt}->{'Show Source'}); + $T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'}); + $T->param(opt_show_outline => $File->{Opt}->{Outline}); + $T->param(opt_verbose => $File->{Opt}->{Verbose}); + $T->param(opt_group_errors => $File->{Opt}->{'Group Errors'}); + $T->param(opt_no200 => $File->{Opt}->{No200}); + + # + # Tip of the Day... + my $tip = &get_tip(); + $T->param(tip_uri => $tip->[0]); + $T->param(tip_slug => $tip->[1]); + + # Root Element + $T->param(root_element => $File->{Root}); + + # Namespaces... + $T->param(file_namespace => $File->{Namespace}); + my %seen_ns = (); + my @bulk_ns = @{$File->{Namespaces}}; + $File->{Namespaces} = []; # reinitialize the list of non-root namespaces + # ... and then get a uniq version of it + foreach my $single_namespace (@bulk_ns) { + push(@{$File->{Namespaces}}, $single_namespace) + unless (($single_namespace eq $File->{Namespace}) or + $seen_ns{$single_namespace}++); + } + my @nss = map({uri => $_}, @{$File->{Namespaces}}); + $T->param(file_namespaces => \@nss) if @nss; + + if ($File->{Opt}->{DOCTYPE}) { + my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}"; + $T->param($over_doctype_param => TRUE); + } + + if ($File->{Opt}->{Charset}) { + my $over_charset_param = "override charset $File->{Opt}->{Charset}"; + $T->param($over_charset_param => TRUE); + } + + # Allow content-negotiation + if ($File->{Opt}->{'Accept Header'}) { + $T->param('accept' => $File->{Opt}->{'Accept Header'}); + } + if ($File->{Opt}->{'Accept-Language Header'}) { + $T->param( + 'accept-language' => $File->{Opt}->{'Accept-Language Header'}); + } + if ($File->{Opt}->{'Accept-Charset Header'}) { + $T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'}); + } + if ($File->{Opt}->{'User Agent'}) { + $T->param('user-agent' => $File->{Opt}->{'User Agent'}); + } + if ($File->{'Error Flagged'}) { + $T->param(fatal_error => TRUE); + } +} + +sub fin_template ($$) +{ + my $File = shift; + my $T = shift; + + # + # Set debug info for HTML report. + $T->param(opt_debug => $DEBUG); + $T->param( + debug => [ + map({name => $_, value => $ENV{$_}}, + qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)), + {name => 'Content-Encoding', value => $File->{ContentEnc}}, + {name => 'Content-Language', value => $File->{ContentLang}}, + {name => 'Content-Location', value => $File->{ContentLoc}}, + {name => 'Transfer-Encoding', value => $File->{TransferEnc}}, + {name => 'Parse Mode', value => $File->{Mode}}, + {name => 'Parse Mode Factor', value => $File->{ModeChoice}}, + {name => 'Parser', value => $File->{ParserName}}, + {name => 'Parser Options', value => $File->{ParserOpts}}, + ], + ); + + if (!$File->{Doctype} and + ($File->{Version} eq 'unknown' or + $File->{Version} eq 'SGML' or + (!$File->{Version})) + ) + { + + my $default_doctype = + (&is_xml($File) ? $File->{"Default DOCTYPE"}->{"XHTML"} : + $File->{"Default DOCTYPE"}->{"HTML"}); + $T->param(file_version => "$default_doctype"); } else { - $T->param(number_of_errors_is_2 => FALSE ); - } - $T->param(number_of_errors_is_plural => TRUE ); - } - elsif ($num_errors eq 1) { - $T->param(number_of_errors_is_0 => FALSE ); - $T->param(number_of_errors_is_1 => TRUE ); - $T->param(number_of_errors_is_2 => FALSE ); - $T->param(number_of_errors_is_plural => FALSE ); - } - else { # 0 - $T->param(number_of_errors_is_0 => TRUE ); - $T->param(number_of_errors_is_1 => FALSE ); - $T->param(number_of_errors_is_2 => FALSE ); - $T->param(number_of_errors_is_plural => FALSE ); - } - - if ($num_warnings > 1) { - $T->param(number_of_warnings_is_0 => FALSE ); - $T->param(number_of_warnings_is_1 => FALSE); - if ($num_warnings eq 2) { - $T->param(number_of_warnings_is_2 => TRUE); + $T->param(file_version => $File->{Version}); + } + my ($num_errors, $num_warnings, $num_info, $reported_errors) = + &report_errors($File); + if ($num_errors + $num_warnings > 0) { + $T->param(has_errors => 1); + } + $T->param(valid_errors_num => $num_errors); + $num_warnings += scalar @{$File->{Warnings}}; + $T->param(valid_warnings_num => $num_warnings); + my $number_of_errors = ""; # textual form of $num_errors + my $number_of_warnings = ""; # textual form of $num_errors + + # The following is a bit hack-ish, but will enable us to have some logic + # for a human-readable display of the number, with cases for 0, 1, 2 and above + # (the case of 2 appears to be useful for localization in some languages where the plural is different for 2, and above) + + if ($num_errors > 1) { + $T->param(number_of_errors_is_0 => FALSE); + $T->param(number_of_errors_is_1 => FALSE); + if ($num_errors eq 2) { + $T->param(number_of_errors_is_2 => TRUE); + } + else { + $T->param(number_of_errors_is_2 => FALSE); + } + $T->param(number_of_errors_is_plural => TRUE); + } + elsif ($num_errors eq 1) { + $T->param(number_of_errors_is_0 => FALSE); + $T->param(number_of_errors_is_1 => TRUE); + $T->param(number_of_errors_is_2 => FALSE); + $T->param(number_of_errors_is_plural => FALSE); + } + else { # 0 + $T->param(number_of_errors_is_0 => TRUE); + $T->param(number_of_errors_is_1 => FALSE); + $T->param(number_of_errors_is_2 => FALSE); + $T->param(number_of_errors_is_plural => FALSE); + } + + if ($num_warnings > 1) { + $T->param(number_of_warnings_is_0 => FALSE); + $T->param(number_of_warnings_is_1 => FALSE); + if ($num_warnings eq 2) { + $T->param(number_of_warnings_is_2 => TRUE); + } + else { + $T->param(number_of_warnings_is_2 => FALSE); + } + $T->param(number_of_warnings_is_plural => TRUE); + } + elsif ($num_warnings eq 1) { + $T->param(number_of_warnings_is_0 => FALSE); + $T->param(number_of_warnings_is_1 => TRUE); + $T->param(number_of_warnings_is_2 => FALSE); + $T->param(number_of_warnings_is_plural => FALSE); + } + else { # 0 + $T->param(number_of_warnings_is_0 => TRUE); + $T->param(number_of_warnings_is_1 => FALSE); + $T->param(number_of_warnings_is_2 => FALSE); + $T->param(number_of_warnings_is_plural => FALSE); + } + + $T->param(file_outline => $File->{heading_outline}) + if $File->{Opt}->{Outline}; + + $T->param(file_errors => $reported_errors); + if ($File->{'Is Valid'}) { + $T->param(VALID => TRUE); + $T->param(valid_status => 'Valid'); + &report_valid($File, $T); } else { - $T->param(number_of_warnings_is_2 => FALSE); - } - $T->param(number_of_warnings_is_plural => TRUE ); - } - elsif ($num_warnings eq 1) { - $T->param(number_of_warnings_is_0 => FALSE ); - $T->param(number_of_warnings_is_1 => TRUE ); - $T->param(number_of_warnings_is_2 => FALSE ); - $T->param(number_of_warnings_is_plural => FALSE ); - } - else { # 0 - $T->param(number_of_warnings_is_0 => TRUE ); - $T->param(number_of_warnings_is_1 => FALSE ); - $T->param(number_of_warnings_is_2 => FALSE ); - $T->param(number_of_warnings_is_plural => FALSE ); - } - - $T->param(file_outline => $File->{heading_outline}) - if $File->{Opt}->{Outline}; - - $T->param(file_errors => $reported_errors); - if ($File->{'Is Valid'}) { - $T->param(VALID => TRUE); - $T->param(valid_status => 'Valid'); - &report_valid($File, $T); - } else { - $T->param(VALID => FALSE); - $T->param(valid_status => 'Invalid'); - } + $T->param(VALID => FALSE); + $T->param(valid_status => 'Invalid'); + } } # # Output "This page is Valid" report. -sub report_valid { - my $File = shift; - my $T = shift; - - unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { - - if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { - my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; - $T->param( - badge_uri => $cfg->{Badge}->{URI}, - local_badge_uri => $cfg->{Badge}->{'Local URI'}, - badge_alt_uri => $cfg->{Badge}->{'Alt URI'}, - local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}, - badge_alt => $cfg->{Badge}->{Alt}, - badge_rdfa => $cfg->{Badge}->{RDFa}, - badge_h => $cfg->{Badge}->{Height}, - badge_w => $cfg->{Badge}->{Width}, - badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : '', - ); - } - } elsif (defined $File->{Tentative}) { - $T->param(is_tentative => TRUE); - } - - if ($File->{XMLWF_ONLY}){ - $T->param(xmlwf_only => TRUE); - } - my $thispage = self_url_file($File); - $T->param(file_thispage => $thispage); +sub report_valid +{ + my $File = shift; + my $T = shift; + + unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) { + + if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) { + my $cfg = $CFG->{Types}->{$File->{DOCTYPE}}; + $T->param( + badge_uri => $cfg->{Badge}->{URI}, + local_badge_uri => $cfg->{Badge}->{'Local URI'}, + badge_alt_uri => $cfg->{Badge}->{'Alt URI'}, + local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'}, + badge_alt => $cfg->{Badge}->{Alt}, + badge_rdfa => $cfg->{Badge}->{RDFa}, + badge_h => $cfg->{Badge}->{Height}, + badge_w => $cfg->{Badge}->{Width}, + badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : '', + ); + } + } + elsif (defined $File->{Tentative}) { + $T->param(is_tentative => TRUE); + } + + if ($File->{XMLWF_ONLY}) { + $T->param(xmlwf_only => TRUE); + } + my $thispage = self_url_file($File); + $T->param(file_thispage => $thispage); } # # Add a warning message to the output. -sub add_warning ($$) { - my $WID = shift; - my $params = shift; +sub add_warning ($$) +{ + my $WID = shift; + my $params = shift; - push @{$File->{Warnings}}, $WID; + push @{$File->{Warnings}}, $WID; - my %tmplparams = ( - $WID => TRUE, - have_warnings => TRUE, - %$params, - ); - for my $tmpl (qw(result fatal-error soap_output)) { - &get_template($File, "$tmpl.tmpl")->param(%tmplparams); - } + my %tmplparams = ( + $WID => TRUE, + have_warnings => TRUE, + %$params, + ); + for my $tmpl (qw(result fatal-error soap_output)) { + &get_template($File, "$tmpl.tmpl")->param(%tmplparams); + } } # # Proxy authentication requests. # Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth). -sub authenticate { - my $File = shift; - my $resource = shift; - my $authHeader = shift || {}; +sub authenticate +{ + my $File = shift; + my $resource = shift; + my $authHeader = shift || {}; - my $realm = $resource; - $realm =~ s([^\w\d.-]*){}g; + my $realm = $resource; + $realm =~ s([^\w\d.-]*){}g; - for my $scheme (keys(%$authHeader)) { - my $origrealm = $authHeader->{$scheme}->{realm}; - if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) { - delete($authHeader->{$scheme}); - next; + for my $scheme (keys(%$authHeader)) { + my $origrealm = $authHeader->{$scheme}->{realm}; + if (not defined $origrealm or lc($scheme) !~ /^(?:basic|digest)$/) { + delete($authHeader->{$scheme}); + next; + } + $authHeader->{$scheme}->{realm} = "$realm-$origrealm"; } - $authHeader->{$scheme}->{realm} = "$realm-$origrealm"; - } - my $headers = HTTP::Headers->new(Connection => 'close'); - $headers->www_authenticate(%$authHeader); - $headers = $headers->as_string(); - chomp($headers); + my $headers = HTTP::Headers->new(Connection => 'close'); + $headers->www_authenticate(%$authHeader); + $headers = $headers->as_string(); + chomp($headers); - my $tmpl = &get_template($File, 'http_401_authrequired.tmpl'); - $tmpl->param(http_401_headers => $headers, http_401_url => $resource); + my $tmpl = &get_template($File, 'http_401_authrequired.tmpl'); + $tmpl->param(http_401_headers => $headers, http_401_url => $resource); - print Encode::encode('UTF-8', $tmpl->output); - exit; # Further interaction will be a new HTTP request. + print Encode::encode('UTF-8', $tmpl->output); + exit; # Further interaction will be a new HTTP request. } # # Fetch an URL and return the content and selected meta-info. -sub handle_uri { - my $q = shift; # The CGI object. - my $File = shift; # The master datastructure. +sub handle_uri +{ + my $q = shift; # The CGI object. + my $File = shift; # The master datastructure. - my $uri = new URI (ref $q ? $q->param('uri') : $q)->canonical(); - $uri->fragment(undef); + my $uri = new URI(ref $q ? $q->param('uri') : $q)->canonical(); + $uri->fragment(undef); - my $ua = new W3C::Validator::UserAgent ($CFG, $File); + my $ua = new W3C::Validator::UserAgent($CFG, $File); - unless ($ua->is_protocol_supported($uri)) { - $File->{'Error Flagged'} = TRUE; - my $tmpl = &get_template($File, 'fatal-error.tmpl'); - if (($uri->canonical() eq "1") ) - #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI() - { - $tmpl->param(fatal_no_content => TRUE); + unless ($ua->is_protocol_supported($uri)) { + $File->{'Error Flagged'} = TRUE; + my $tmpl = &get_template($File, 'fatal-error.tmpl'); + if (($uri->canonical() eq "1")) + + #if uri param is empty (also for empty direct or upload), it's been set to TRUE in sub prepCGI() + { + $tmpl->param(fatal_no_content => TRUE); + } + else { + $tmpl->param( + fatal_uri_error => TRUE, + fatal_uri_scheme => $uri->scheme() + ); + } + return $File; } - else { - $tmpl->param(fatal_uri_error => TRUE, fatal_uri_scheme => $uri->scheme()); + + return $File unless $ua->uri_ok($uri); + + my $req = new HTTP::Request(GET => $uri); + + # if one wants to use the accept, accept-charset and accept-language params + # in order to trigger specific negotiation + if ($File->{Opt}->{'Accept Header'}) { + $req->header(Accept => $File->{Opt}->{'Accept Header'}); } - return $File; - } - - return $File unless $ua->uri_ok($uri); - - my $req = new HTTP::Request(GET => $uri); - - # if one wants to use the accept, accept-charset and accept-language params - # in order to trigger specific negotiation - if ($File->{Opt}->{'Accept Header'}) { - $req->header(Accept => $File->{Opt}->{'Accept Header'}); - } - if ($File->{Opt}->{'Accept-Language Header'}) { - $req->header(Accept_Language => $File->{Opt}->{'Accept-Language Header'}); - } - if ($File->{Opt}->{'Accept-Charset Header'}) { - $req->header(Accept_Charset => $File->{Opt}->{'Accept-Charset Header'}); - } - - # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. - # If we're under mod_perl, there is a way around it... - eval { - local $SIG{__DIE__}; - my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; - $ENV{HTTP_AUTHORIZATION} = $auth if $auth; - } if (IS_MODPERL2() && !$ENV{HTTP_AUTHORIZATION}); - - # If we got a Authorization header, the client is back at it after being - # prompted for a password so we insert the header as is in the request. - $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}) - if $ENV{HTTP_AUTHORIZATION}; - - my $res = $ua->request($req); - - return $File if $File->{'Error Flagged'}; # Redirect IP rejected? - - unless ($res->code == 200 or $File->{Opt}->{'No200'}) { - if ($res->code == 401) { - my %auth = $res->www_authenticate(); # HTTP::Headers::Auth - &authenticate($File, $res->request->uri, \%auth); - } else { - $File->{'Error Flagged'} = TRUE; - - my $no200url = undef; - if (!$File->{Opt}->{No200}) { - # $File->{URI} not set yet; setting it non-local has side effects - local $File->{URI} = $uri->as_string; - local $File->{Opt}->{No200} = TRUE; - $no200url = &self_url_file($File); - } - - my $warning = $res->header("Client-Warning"); - if ($warning && $warning =~ /Internal response/i) { - # Response doc generated internally by LWP, no need to show that info - # nor to provide error doc validation link to it. - $warning = undef; - $no200url = undef; - } - - my $tmpl = &get_template($File, 'fatal-error.tmpl'); - $tmpl->param( - fatal_http_error => TRUE, - fatal_http_uri => $uri->as_string, - fatal_http_code => $res->code, - fatal_http_msg => $res->message, - fatal_http_warn => $warning, - fatal_http_no200 => $no200url, - ); - $tmpl->param(fatal_http_dns => TRUE) if ($res->code == 500); + if ($File->{Opt}->{'Accept-Language Header'}) { + $req->header( + Accept_Language => $File->{Opt}->{'Accept-Language Header'}); + } + if ($File->{Opt}->{'Accept-Charset Header'}) { + $req->header( + Accept_Charset => $File->{Opt}->{'Accept-Charset Header'}); } - return $File; - } + # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. + # If we're under mod_perl, there is a way around it... + eval { + local $SIG{__DIE__}; + my $auth = + Apache2::RequestUtil->request()->headers_in()->{Authorization}; + $ENV{HTTP_AUTHORIZATION} = $auth if $auth; + } if (IS_MODPERL2() && !$ENV{HTTP_AUTHORIZATION}); - # - # Enforce Max Recursion level. - &check_recursion($File, $res); + # If we got a Authorization header, the client is back at it after being + # prompted for a password so we insert the header as is in the request. + $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}) + if $ENV{HTTP_AUTHORIZATION}; - my ($mode, $ct, $charset) - = &parse_content_type( - $File, - scalar($res->header('Content-Type')), - scalar($res->request->uri), - ); + my $res = $ua->request($req); - my $content = &get_content($File, $res); - return $File if $File->{'Error Flagged'}; + return $File if $File->{'Error Flagged'}; # Redirect IP rejected? - $File->{Bytes} = $content; - $File->{Mode} = $mode; - $File->{ContentType} = $ct; - $File->{ContentEnc} = $res->content_encoding; - $File->{ContentLang} = $res->content_language; - $File->{ContentLoc} = $res->header('Content-Location'); - $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); - $File->{Charset}->{HTTP} = lc $charset; - $File->{Modified} = $res->header('Last-Modified'); - $File->{Server} = scalar $res->server; + unless ($res->code == 200 or $File->{Opt}->{'No200'}) { + if ($res->code == 401) { + my %auth = $res->www_authenticate(); # HTTP::Headers::Auth + &authenticate($File, $res->request->uri, \%auth); + } + else { + $File->{'Error Flagged'} = TRUE; - # TODO: Content-Length is not always set, so either this should - # be renamed to 'Content-Length' or it should consider more than - # the Content-Length header. - $File->{Size} = scalar $res->content_length; - $File->{URI} = scalar $res->request->uri->canonical; - $File->{'Is Upload'} = FALSE; - $File->{'Direct Input'} = FALSE; + my $no200url = undef; + if (!$File->{Opt}->{No200}) { + # $File->{URI} not set yet; setting it non-local has side effects + local $File->{URI} = $uri->as_string; + local $File->{Opt}->{No200} = TRUE; + $no200url = &self_url_file($File); + } - return $File; -} + my $warning = $res->header("Client-Warning"); + if ($warning && $warning =~ /Internal response/i) { -# -# Handle uploaded file and return the content and selected meta-info. -sub handle_file { - my $q = shift; # The CGI object. - my $File = shift; # The master datastructure. + # Response doc generated internally by LWP, no need to show that info + # nor to provide error doc validation link to it. + $warning = undef; + $no200url = undef; + } - my $f = $q->param('uploaded_file'); - my $h = $q->uploadInfo($f); - my $file; + my $tmpl = &get_template($File, 'fatal-error.tmpl'); + $tmpl->param( + fatal_http_error => TRUE, + fatal_http_uri => $uri->as_string, + fatal_http_code => $res->code, + fatal_http_msg => $res->message, + fatal_http_warn => $warning, + fatal_http_no200 => $no200url, + ); + $tmpl->param(fatal_http_dns => TRUE) if ($res->code == 500); + } - local $/ = undef; # set line delimiter so that <> reads rest of file - $file = <$f>; + return $File; + } - my ($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); + # + # Enforce Max Recursion level. + &check_recursion($File, $res); - $File->{Bytes} = $file; - $File->{Mode} = $mode; - $File->{ContentType} = $ct; - $File->{Charset}->{HTTP} = lc $charset; - $File->{Modified} = $q->http('Last-Modified'); - $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) - $File->{Size} = $q->http('Content-Length'); - $File->{URI} = "$f"; - $File->{'Is Upload'} = TRUE; - $File->{'Direct Input'} = FALSE; + my ($mode, $ct, $charset) = &parse_content_type( + $File, + scalar($res->header('Content-Type')), + scalar($res->request->uri), + ); - return $File; + my $content = &get_content($File, $res); + return $File if $File->{'Error Flagged'}; + + $File->{Bytes} = $content; + $File->{Mode} = $mode; + $File->{ContentType} = $ct; + $File->{ContentEnc} = $res->content_encoding; + $File->{ContentLang} = $res->content_language; + $File->{ContentLoc} = $res->header('Content-Location'); + $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); + $File->{Charset}->{HTTP} = lc $charset; + $File->{Modified} = $res->header('Last-Modified'); + $File->{Server} = scalar $res->server; + + # TODO: Content-Length is not always set, so either this should + # be renamed to 'Content-Length' or it should consider more than + # the Content-Length header. + $File->{Size} = scalar $res->content_length; + $File->{URI} = scalar $res->request->uri->canonical; + $File->{'Is Upload'} = FALSE; + $File->{'Direct Input'} = FALSE; + + return $File; } # # Handle uploaded file and return the content and selected meta-info. -sub handle_frag { - my $q = shift; # The CGI object. - my $File = shift; # The master datastructure. - - $File->{Bytes} = $q->param('fragment'); - $File->{Mode} = 'TBD'; - $File->{Modified} = ''; - $File->{Server} = ''; - $File->{Size} = ''; - $File->{ContentType} = ''; # @@TODO? - $File->{URI} = 'upload://Form Submission'; - $File->{'Is Upload'} = FALSE; - $File->{'Direct Input'} = TRUE; - $File->{Charset}->{HTTP} = "utf-8"; # by default, the form accepts utf-8 chars - - if ($File->{Opt}->{Prefill}) { - # we surround the HTML fragment with some basic document structure - my $prefill_Template; - if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { - $prefill_Template = &get_template($File, 'prefill_html401.tmpl'); - } - else { - $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl'); +sub handle_file +{ + my $q = shift; # The CGI object. + my $File = shift; # The master datastructure. + + my $f = $q->param('uploaded_file'); + my $h = $q->uploadInfo($f); + my $file; + + local $/ = undef; # set line delimiter so that <> reads rest of file + $file = <$f>; + + my ($mode, $ct, $charset) = + &parse_content_type($File, $h->{'Content-Type'}); + + $File->{Bytes} = $file; + $File->{Mode} = $mode; + $File->{ContentType} = $ct; + $File->{Charset}->{HTTP} = lc $charset; + $File->{Modified} = $q->http('Last-Modified'); + $File->{Server} = $q->http('User-Agent'); # Fake a "server". :-) + $File->{Size} = $q->http('Content-Length'); + $File->{URI} = "$f"; + $File->{'Is Upload'} = TRUE; + $File->{'Direct Input'} = FALSE; + + return $File; +} + +# +# Handle uploaded file and return the content and selected meta-info. +sub handle_frag +{ + my $q = shift; # The CGI object. + my $File = shift; # The master datastructure. + + $File->{Bytes} = $q->param('fragment'); + $File->{Mode} = 'TBD'; + $File->{Modified} = ''; + $File->{Server} = ''; + $File->{Size} = ''; + $File->{ContentType} = ''; # @@TODO? + $File->{URI} = 'upload://Form Submission'; + $File->{'Is Upload'} = FALSE; + $File->{'Direct Input'} = TRUE; + $File->{Charset}->{HTTP} = + "utf-8"; # by default, the form accepts utf-8 chars + + if ($File->{Opt}->{Prefill}) { + + # we surround the HTML fragment with some basic document structure + my $prefill_Template; + if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') { + $prefill_Template = &get_template($File, 'prefill_html401.tmpl'); + } + else { + $prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl'); + } + $prefill_Template->param(fragment => $File->{Bytes}); + $File->{Bytes} = $prefill_Template->output(); + + # Let's force the view source so that the user knows what we've put around + # their code. + $File->{Opt}->{'Show Source'} = TRUE; + + # Ignore doctype overrides (#5132). + $File->{Opt}->{DOCTYPE} = 'Inline'; } - $prefill_Template->param(fragment => $File->{Bytes}); - $File->{Bytes} = $prefill_Template->output(); - # Let's force the view source so that the user knows what we've put around - # their code. - $File->{Opt}->{'Show Source'} = TRUE; - # Ignore doctype overrides (#5132). - $File->{Opt}->{DOCTYPE} = 'Inline'; - } - return $File; + return $File; } # # Parse a Content-Type and parameters. Return document type and charset. -sub parse_content_type { - my $File = shift; - my $Content_Type = shift; - my $url = shift; - my $charset = ''; +sub parse_content_type +{ + my $File = shift; + my $Content_Type = shift; + my $url = shift; + my $charset = ''; - my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; + my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g; - my $mode = $CFG->{MIME}->{$ct} || $ct; + my $mode = $CFG->{MIME}->{$ct} || $ct; - $charset = HTML::Encoding::encoding_from_content_type($Content_Type); + $charset = HTML::Encoding::encoding_from_content_type($Content_Type); - if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here. - if ($ct eq 'text/css' and defined $url) { - print redirect - 'http://jigsaw.w3.org/css-validator/validator?uri=' - . uri_escape $url; - exit; - } elsif ($ct eq 'application/atom+xml' and defined $url) { - print redirect - 'http://validator.w3.org/feed/check.cgi?url=' - . uri_escape $url; - exit; - } elsif ($ct =~ m(^application/.+\+xml$)) { - # unknown media types which should be XML - we give these a try - $mode = "XML"; - } else { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_mime_error => TRUE, - fatal_mime_ct => $ct, - ); + if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here. + if ($ct eq 'text/css' and defined $url) { + print redirect + 'http://jigsaw.w3.org/css-validator/validator?uri=' . + uri_escape $url; + exit; + } + elsif ($ct eq 'application/atom+xml' and defined $url) { + print redirect 'http://validator.w3.org/feed/check.cgi?url=' . + uri_escape $url; + exit; + } + elsif ($ct =~ m(^application/.+\+xml$)) { + + # unknown media types which should be XML - we give these a try + $mode = "XML"; + } + else { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_mime_error => TRUE, + fatal_mime_ct => $ct, + ); + } } - } - return $mode, $ct, $charset; + return $mode, $ct, $charset; } # # Get content with Content-Encodings decoded from a response. -sub get_content ($$) { - my $File = shift; - my $res = shift; - - my $content; - eval { - $content = $res->decoded_content(charset => 'none', raise_error => 1); - }; - if ($@) { - (my $errmsg = $@) =~ s/ at .*//s; - my $cenc = $res->header("Content-Encoding"); - my $uri = $res->request->uri; - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_decode_error => TRUE, - fatal_decode_errmsg => $errmsg, - fatal_decode_cenc => $cenc, - # Include URI because it might be a subsystem (eg. HTML5 validator) one - fatal_decode_uri => $uri, - ); - } +sub get_content ($$) +{ + my $File = shift; + my $res = shift; + + my $content; + eval { + $content = $res->decoded_content(charset => 'none', raise_error => 1); + }; + if ($@) { + (my $errmsg = $@) =~ s/ at .*//s; + my $cenc = $res->header("Content-Encoding"); + my $uri = $res->request->uri; + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_decode_error => TRUE, + fatal_decode_errmsg => $errmsg, + fatal_decode_cenc => $cenc, + + # Include URI because it might be a subsystem (eg. HTML5 validator) one + fatal_decode_uri => $uri, + ); + } - return $content; + return $content; } # # Check recursion level and enforce Max Recursion limit. -sub check_recursion ($$) { - my $File = shift; - my $res = shift; +sub check_recursion ($$) +{ + my $File = shift; + my $res = shift; - # Not looking at our own output. - return unless defined $res->header('X-W3C-Validator-Recursion'); + # Not looking at our own output. + return unless defined $res->header('X-W3C-Validator-Recursion'); - my $lvl = $res->header('X-W3C-Validator-Recursion'); - return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. + my $lvl = $res->header('X-W3C-Validator-Recursion'); + return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore. - if ($lvl >= $CFG->{'Max Recursion'}) { - print redirect $CFG->{'Home Page'}; - } else { - # Increase recursion level in output. - &get_template($File, 'result.tmpl')->param(depth => $lvl++); - } + if ($lvl >= $CFG->{'Max Recursion'}) { + print redirect $CFG->{'Home Page'}; + } + else { + + # Increase recursion level in output. + &get_template($File, 'result.tmpl')->param(depth => $lvl++); + } } # @@ -1833,300 +1999,335 @@ sub check_recursion ($$) { # # Note that this is used both for HTML and XML escaping. # -sub ent { - local $_ = shift; - return '' unless defined; # Eliminate warnings +sub ent +{ + local $_ = shift; + return '' unless defined; # Eliminate warnings - # TODO: Err, why have " twice in the character class? ' maybe? - s(["<&>"]){'&#' . ord($&) . ';'}ge; # should switch to hex sooner or later - return $_; + # TODO: Err, why have " twice in the character class? ' maybe? + s(["<&>"]){'&#' . ord($&) . ';'}ge; # should switch to hex sooner or later + return $_; } # # Truncate source lines for report. # Expects 1-based column indexes. -sub truncate_line { - my $line = shift; - my $col = shift; - my $maxlen = 80; # max line length to truncate to +sub truncate_line +{ + my $line = shift; + my $col = shift; + my $maxlen = 80; # max line length to truncate to + + my $diff = length($line) - $maxlen; - my $diff = length($line) - $maxlen; + # Don't truncate at all if it fits. + return ($line, $col) if ($diff <= 0); - # Don't truncate at all if it fits. - return ($line, $col) if ($diff <= 0); + my $start = $col - int($maxlen / 2); + if ($start < 0) { - my $start = $col - int($maxlen/2); - if ($start < 0) { - # Truncate only from end of line. - $start = 0; - $line = substr($line, $start, $maxlen - 1) . '…'; - } - elsif ($start > $diff) { - # Truncate only from beginning of line. - $start = $diff; - $line = '…' . substr($line, $start + 1); - } - else { - # Truncate from both beginning and end of line. - $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…'; - } + # Truncate only from end of line. + $start = 0; + $line = substr($line, $start, $maxlen - 1) . '…'; + } + elsif ($start > $diff) { - # Shift column if we truncated from beginning of line. - $col -= $start; + # Truncate only from beginning of line. + $start = $diff; + $line = '…' . substr($line, $start + 1); + } + else { - return ($line, $col); + # Truncate from both beginning and end of line. + $line = '…' . substr($line, $start + 1, $maxlen - 2) . '…'; + } + + # Shift column if we truncated from beginning of line. + $col -= $start; + + return ($line, $col); } # # Suppress any existing DOCTYPE by commenting it out. -sub override_doctype { - my $File = shift; - - my ($dt) = - grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; - - # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; - my $pubid = $dt->{PubID}; - my $sysid = $dt->{SysID}; - my $name = $dt->{Name}; - - # The HTML5 PubID is a fake, reset it out of the way. - $pubid = undef if ($pubid eq 'HTML5'); - - # We don't have public/system ids for all types. - my $dtd = "<!DOCTYPE $name"; - if ($pubid) { - $dtd .= qq( PUBLIC "$pubid"); - $dtd .= qq( "$sysid") if $sysid; - } - elsif ($sysid) { - $dtd .= qq( SYSTEM "$sysid"); - } - $dtd .= '>'; - - my $org_dtd = ''; - my $HTML = ''; - my $seen_doctype = FALSE; - my $seen_root = FALSE; - - my $declaration = sub { - my ($tag, $text) = @_; - if ($seen_doctype || uc($tag) ne '!DOCTYPE') { - $HTML .= $text; - return; - } - - $seen_doctype = TRUE; - - $org_dtd = &ent($text); - ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~ - /<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si; - - $File->{DOCTYPE} = 'HTML5' - if (lc($File->{Root} || '') eq 'html' && - (!defined($File->{DOCTYPE}) || - $File->{DOCTYPE} eq 'about:legacy-compat')); - - # No Override if Fallback was requested, or if override is the same as detected - my $known = $CFG->{Types}->{$File->{DOCTYPE}}; - if ($File->{Opt}->{FB}->{DOCTYPE} or - ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) { - $HTML .= $text; # Stash it as is... - } else { - $HTML .= "$dtd<!-- $text -->"; - } - }; - - my $start_element = sub{ - if ($seen_root) { - $HTML .= $_[0]; # Stash it as is... moving on +sub override_doctype +{ + my $File = shift; + + my ($dt) = + grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } + values %{$CFG->{Types}}; + + # @@TODO: abort/whine about unrecognized doctype if $dt is undef.; + my $pubid = $dt->{PubID}; + my $sysid = $dt->{SysID}; + my $name = $dt->{Name}; + + # The HTML5 PubID is a fake, reset it out of the way. + $pubid = undef if ($pubid eq 'HTML5'); + + # We don't have public/system ids for all types. + my $dtd = "<!DOCTYPE $name"; + if ($pubid) { + $dtd .= qq( PUBLIC "$pubid"); + $dtd .= qq( "$sysid") if $sysid; } - else { - $seen_root = TRUE; - if ($seen_doctype) { - # doctype addition aldready done, we move on - $HTML .= $_[0]; - } - else { - # no original doctype present, hence none replaced already - # => we sneak the chosen doctype before the root elt - $HTML .= "$dtd$_[0]"; - } + elsif ($sysid) { + $dtd .= qq( SYSTEM "$sysid"); } - }; + $dtd .= '>'; - HTML::Parser->new(default_h => [sub {$HTML .= shift}, 'text'], - declaration_h => [$declaration, 'tag,text'], - start_h => [$start_element, 'text'] - )->parse(join "\n", @{$File->{Content}})->eof(); + my $org_dtd = ''; + my $HTML = ''; + my $seen_doctype = FALSE; + my $seen_root = FALSE; - $File->{Content} = [split /\n/, $HTML]; + my $declaration = sub { + my ($tag, $text) = @_; + if ($seen_doctype || uc($tag) ne '!DOCTYPE') { + $HTML .= $text; + return; + } - if ($seen_doctype) { - my $known = $CFG->{Types}->{$File->{DOCTYPE}}; - unless ($File->{Opt}->{FB}->{DOCTYPE} or - ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display} )) { - &add_warning('W13', { - W13_org => $org_dtd, - W13_new => $File->{Opt}->{DOCTYPE}, - }); - $File->{Tentative} |= T_ERROR; # Tag it as Invalid. - } - } else { - if ($File->{"DOCTYPEless OK"}) { - &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}}); + $seen_doctype = TRUE; + + $org_dtd = &ent($text); + ($File->{Root}, undef, $File->{DOCTYPE}) = $text =~ + /<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si; + + $File->{DOCTYPE} = 'HTML5' + if ( + lc($File->{Root} || '') eq 'html' && + (!defined($File->{DOCTYPE}) || + $File->{DOCTYPE} eq 'about:legacy-compat') + ); + + # No Override if Fallback was requested, or if override is the same as detected + my $known = $CFG->{Types}->{$File->{DOCTYPE}}; + if ($File->{Opt}->{FB}->{DOCTYPE} or + ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) + { + $HTML .= $text; # Stash it as is... + } + else { + $HTML .= "$dtd<!-- $text -->"; + } + }; + + my $start_element = sub { + if ($seen_root) { + $HTML .= $_[0]; # Stash it as is... moving on + } + else { + $seen_root = TRUE; + if ($seen_doctype) { + + # doctype addition aldready done, we move on + $HTML .= $_[0]; + } + else { + + # no original doctype present, hence none replaced already + # => we sneak the chosen doctype before the root elt + $HTML .= "$dtd$_[0]"; + } + } + }; + + HTML::Parser->new( + default_h => [sub { $HTML .= shift }, 'text'], + declaration_h => [$declaration, 'tag,text'], + start_h => [$start_element, 'text'] + )->parse(join "\n", @{$File->{Content}})->eof(); + + $File->{Content} = [split /\n/, $HTML]; + + if ($seen_doctype) { + my $known = $CFG->{Types}->{$File->{DOCTYPE}}; + unless ($File->{Opt}->{FB}->{DOCTYPE} or + ($known && $File->{Opt}->{DOCTYPE} eq $known->{Display})) + { + &add_warning( + 'W13', + { W13_org => $org_dtd, + W13_new => $File->{Opt}->{DOCTYPE}, + } + ); + $File->{Tentative} |= T_ERROR; # Tag it as Invalid. + } } - elsif ($File->{Opt}->{FB}->{DOCTYPE}) { - &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); - $File->{Tentative} |= T_ERROR; # Tag it as Invalid. - } else { - &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); - $File->{Tentative} |= T_ERROR; # Tag it as Invalid. + else { + if ($File->{"DOCTYPEless OK"}) { + &add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}}); + } + elsif ($File->{Opt}->{FB}->{DOCTYPE}) { + &add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}}); + $File->{Tentative} |= T_ERROR; # Tag it as Invalid. + } + else { + &add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}}); + $File->{Tentative} |= T_ERROR; # Tag it as Invalid. + } } - } - return $File; + return $File; } # # Generate a HTML report of detected errors. -sub report_errors ($) { - my $File = shift; - my $Errors = []; - my %Errors_bytype; - my $number_of_errors = 0; - my $number_of_warnings = 0; - my $number_of_info = 0; - - # Hash to keep track of how many of each error is reported. - my %Msgs; # Used to generate a UID for explanations. - - # for the sake of readability, at least until the xmlwf errors have explanations, - # we push the errors from the XML parser at the END of the error list. - foreach my $errmsg (@{$File->{WF_Errors}}){ - push @{$File->{Errors}}, $errmsg; - } - - - if (scalar @{$File->{Errors}}) { - foreach my $err (@{$File->{Errors}}) { - my $line; - my $col = 0; - # avoid truncating lines that do not exist - if (defined($err->{line}) && $File->{Content}->[$err->{line}-1]) { - if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/ ){ - ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char}); - $line = &mark_error($line, $col); - } - elsif (defined($err->{line})) { - $col = length($File->{Content}->[$err->{line}-1]); - $col = 80 if ($col > 80); - ($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $col); - $line = &ent($line); - $col = 0; - } - } - else { - $col = 0; - $line = ""; - } - my $explanation = ""; - if ($err->{expl}) { - - } - else { - if ($err->{num}) { - my $num = $err->{num}; - $explanation .= Encode::decode_utf8("\n $RSRC{msg}->{$num}->{verbose}\n") - if exists $RSRC{msg}->{$num} - && exists $RSRC{msg}->{$num}->{verbose}; - my $_msg = $RSRC{msg}->{nomsg}->{verbose}; - $_msg =~ s/<!--MID-->/$num/g; - if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) - { - $_msg =~ s/<!--URI-->//g - } - else - { - my $escaped_uri = uri_escape($File->{URI}); - $_msg =~ s/<!--URI-->/$escaped_uri/g; - } - $explanation = " $_msg\n$explanation"; # The send feedback plea. - $explanation =~ s/<!--CFG_HOME_PAGE-->/$CFG->{'Home Page'}/g; - } - $err->{expl} = $explanation; - } - - $err->{src} = $line; - $err->{col} = ' ' x $col; - if ($err->{type} eq 'I') - { - $err->{class} = 'msg_info'; - $err->{err_type_err} = 0; - $err->{err_type_warn} = 0; - $err->{err_type_info} = 1; - $number_of_info += 1; - } - elsif ($err->{type} eq 'E') - { - $err->{class} = 'msg_err'; - $err->{err_type_err} = 1; - $err->{err_type_warn} = 0; - $err->{err_type_info} = 0; - $number_of_errors += 1; - } - elsif (($err->{type} eq 'W') or ($err->{type} eq 'X') ) - { - $err->{class} = 'msg_warn'; - $err->{err_type_err} = 0; - $err->{err_type_warn} = 1; - $err->{err_type_info} = 0; - $number_of_warnings += 1; - } - # TODO other classes for "X" etc? FIXME find all types of message. - - push @{$Errors}, $err; - - if (($File->{Opt}->{'Group Errors'}) and (($err->{type} eq 'E') or ($err->{type} eq 'W')or ($err->{type} eq 'X'))) { - # index by num for errors and warnings only - info usually give context of error or warning - if (! exists $Errors_bytype{$err->{num}}) { - $Errors_bytype{$err->{num}}->{instances} = []; - my $msg_text; - if ($err->{num} ne 'xmlwf') { - $msg_text = $RSRC{msg}->{$err->{num}}->{original}; - $msg_text =~ s/%1/X/; - $msg_text =~ s/%2/Y/; - } - else { ## FIXME ## we need a catalog of errors from our XML parser - $msg_text = "XML Parsing Error"; - } - $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; - $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; - $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; - $Errors_bytype{$err->{num}}->{type} = $err->{type}; - $Errors_bytype{$err->{num}}->{class} = $err->{class}; - $Errors_bytype{$err->{num}}->{err_type_err} = $err->{err_type_err}; - $Errors_bytype{$err->{num}}->{err_type_warn} = $err->{err_type_warn}; - $Errors_bytype{$err->{num}}->{err_type_info} = $err->{err_type_info}; - } - push @ { $Errors_bytype{$err->{num}}->{instances} }, $err; - } - } - } - if ($File->{Opt}->{'Group Errors'}) { - $Errors = []; - for my $err_num (keys %Errors_bytype){ - push @{$Errors}, $Errors_bytype{$err_num}; - } - } - # we are not sorting errors by line, as it would break the position - # of auxiliary messages such as "start tag was here". We'll have to live with - # the fact that XML well-formedness errors are listed first, then validation errors - #else { - # sort error by lines - # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; - #} - return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; +sub report_errors ($) +{ + my $File = shift; + my $Errors = []; + my %Errors_bytype; + my $number_of_errors = 0; + my $number_of_warnings = 0; + my $number_of_info = 0; + + # Hash to keep track of how many of each error is reported. + my %Msgs; # Used to generate a UID for explanations. + + # for the sake of readability, at least until the xmlwf errors have explanations, + # we push the errors from the XML parser at the END of the error list. + foreach my $errmsg (@{$File->{WF_Errors}}) { + push @{$File->{Errors}}, $errmsg; + } + + if (scalar @{$File->{Errors}}) { + foreach my $err (@{$File->{Errors}}) { + my $line; + my $col = 0; + + # avoid truncating lines that do not exist + if (defined($err->{line}) && $File->{Content}->[$err->{line} - 1]) + { + if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) { + ($line, $col) = + &truncate_line($File->{Content}->[$err->{line} - 1], + $err->{char}); + $line = &mark_error($line, $col); + } + elsif (defined($err->{line})) { + $col = length($File->{Content}->[$err->{line} - 1]); + $col = 80 if ($col > 80); + ($line, $col) = + &truncate_line($File->{Content}->[$err->{line} - 1], + $col); + $line = &ent($line); + $col = 0; + } + } + else { + $col = 0; + $line = ""; + } + my $explanation = ""; + if ($err->{expl}) { + + } + else { + if ($err->{num}) { + my $num = $err->{num}; + $explanation .= Encode::decode_utf8( + "\n $RSRC{msg}->{$num}->{verbose}\n") + if exists $RSRC{msg}->{$num} && + exists $RSRC{msg}->{$num}->{verbose}; + my $_msg = $RSRC{msg}->{nomsg}->{verbose}; + $_msg =~ s/<!--MID-->/$num/g; + if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) { + $_msg =~ s/<!--URI-->//g; + } + else { + my $escaped_uri = uri_escape($File->{URI}); + $_msg =~ s/<!--URI-->/$escaped_uri/g; + } + $explanation = + " $_msg\n$explanation"; # The send feedback plea. + $explanation =~ + s/<!--CFG_HOME_PAGE-->/$CFG->{'Home Page'}/g; + } + $err->{expl} = $explanation; + } + + $err->{src} = $line; + $err->{col} = ' ' x $col; + if ($err->{type} eq 'I') { + $err->{class} = 'msg_info'; + $err->{err_type_err} = 0; + $err->{err_type_warn} = 0; + $err->{err_type_info} = 1; + $number_of_info += 1; + } + elsif ($err->{type} eq 'E') { + $err->{class} = 'msg_err'; + $err->{err_type_err} = 1; + $err->{err_type_warn} = 0; + $err->{err_type_info} = 0; + $number_of_errors += 1; + } + elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) { + $err->{class} = 'msg_warn'; + $err->{err_type_err} = 0; + $err->{err_type_warn} = 1; + $err->{err_type_info} = 0; + $number_of_warnings += 1; + } + + # TODO other classes for "X" etc? FIXME find all types of message. + + push @{$Errors}, $err; + + if (($File->{Opt}->{'Group Errors'}) and + (($err->{type} eq 'E') or + ($err->{type} eq 'W') or + ($err->{type} eq 'X')) + ) + { + + # index by num for errors and warnings only - info usually give context of error or warning + if (!exists $Errors_bytype{$err->{num}}) { + $Errors_bytype{$err->{num}}->{instances} = []; + my $msg_text; + if ($err->{num} ne 'xmlwf') { + $msg_text = $RSRC{msg}->{$err->{num}}->{original}; + $msg_text =~ s/%1/X/; + $msg_text =~ s/%2/Y/; + } + else + { ## FIXME ## we need a catalog of errors from our XML parser + $msg_text = "XML Parsing Error"; + } + $Errors_bytype{$err->{num}}->{expl} = $err->{expl}; + $Errors_bytype{$err->{num}}->{generic_msg} = $msg_text; + $Errors_bytype{$err->{num}}->{msg} = $err->{msg}; + $Errors_bytype{$err->{num}}->{type} = $err->{type}; + $Errors_bytype{$err->{num}}->{class} = $err->{class}; + $Errors_bytype{$err->{num}}->{err_type_err} = + $err->{err_type_err}; + $Errors_bytype{$err->{num}}->{err_type_warn} = + $err->{err_type_warn}; + $Errors_bytype{$err->{num}}->{err_type_info} = + $err->{err_type_info}; + } + push @{$Errors_bytype{$err->{num}}->{instances}}, $err; + } + } + } + if ($File->{Opt}->{'Group Errors'}) { + $Errors = []; + for my $err_num (keys %Errors_bytype) { + push @{$Errors}, $Errors_bytype{$err_num}; + } + } + + # we are not sorting errors by line, as it would break the position + # of auxiliary messages such as "start tag was here". We'll have to live with + # the fact that XML well-formedness errors are listed first, then validation errors + #else { + # sort error by lines + # @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors}; + #} + return $number_of_errors, $number_of_warnings, $number_of_info, $Errors; } # @@ -2134,315 +2335,361 @@ sub report_errors ($) { # was detected, and everything to the left and right of that position. # That way we can add markup to the relevant char without breaking &ent(). # Expects 1-based column indexes. -sub mark_error ($$) { - my $line = shift; - my $col = shift; - my $linelen = length($line); +sub mark_error ($$) +{ + my $line = shift; + my $col = shift; + my $linelen = length($line); - # Coerce column into an index valid within the line. - if ($col < 1) { - $col = 1; - } elsif ($col > $linelen) { - $col = $linelen; - } - $col--; + # Coerce column into an index valid within the line. + if ($col < 1) { + $col = 1; + } + elsif ($col > $linelen) { + $col = $linelen; + } + $col--; - my $left = substr($line, 0, $col); - my $char = substr($line, $col, 1); - my $right = substr($line, $col + 1); + my $left = substr($line, 0, $col); + my $char = substr($line, $col, 1); + my $right = substr($line, $col + 1); - $char = &ent($char); - $char = qq(<strong title="Position where error was detected.">$char</strong>); - $line = &ent($left) . $char . &ent($right); + $char = &ent($char); + $char = + qq(<strong title="Position where error was detected.">$char</strong>); + $line = &ent($left) . $char . &ent($right); - return $line; + return $line; } # # Create a HTML representation of the document. -sub source { - my $File = shift; +sub source +{ + my $File = shift; - # Remove any BOM since we're not at BOT anymore... - $File->{Content}->[0] = substr($File->{Content}->[0], 1) - if ($File->{BOM} && scalar(@{$File->{Content}})); + # Remove any BOM since we're not at BOT anymore... + $File->{Content}->[0] = substr($File->{Content}->[0], 1) + if ($File->{BOM} && scalar(@{$File->{Content}})); - my @source = map({file_source_line => $_}, @{$File->{Content}}); - return \@source; + my @source = map({file_source_line => $_}, @{$File->{Content}}); + return \@source; } - -sub match_DTD_FPI_SI { +sub match_DTD_FPI_SI +{ my ($File, $FPI, $SI) = @_; if ($CFG->{Types}->{$FPI}) { - if ($CFG->{Types}->{$FPI}->{SysID}){ + if ($CFG->{Types}->{$FPI}->{SysID}) { if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) { - &add_warning('W26', {W26_dtd_pub => $FPI, - W26_dtd_pub_display =>$CFG->{Types}->{$FPI}->{Display}, - W26_dtd_sys=> $SI, - W26_dtd_sys_recommend=> $CFG->{Types}->{$FPI}->{SysID}}); + &add_warning( + 'W26', + { W26_dtd_pub => $FPI, + W26_dtd_pub_display => + $CFG->{Types}->{$FPI}->{Display}, + W26_dtd_sys => $SI, + W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID} + } + ); } } } - else { # FPI not known, checking if the SI is + else { # FPI not known, checking if the SI is foreach my $proper_FPI (keys %{$CFG->{Types}}) { if ($CFG->{Types}->{$proper_FPI}->{SysID}) { if ($CFG->{Types}->{$proper_FPI}->{SysID} eq $SI) { - &add_warning('W26', {W26_dtd_pub => $FPI, - W26_dtd_pub_display =>$CFG->{Types}->{$proper_FPI}->{Display}, - W26_dtd_sys => $SI, - W26_dtd_pub_recommend=> $proper_FPI }); + &add_warning( + 'W26', + { W26_dtd_pub => $FPI, + W26_dtd_pub_display => + $CFG->{Types}->{$proper_FPI}->{Display}, + W26_dtd_sys => $SI, + W26_dtd_pub_recommend => $proper_FPI + } + ); } } } } } + # # Do an initial parse of the Document Entity to extract FPI. -sub preparse_doctype { - my $File = shift; - - # - # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). - $File->{DOCTYPE} = ''; - $File->{Root} = ''; - - my $dtd = sub { - return if $File->{Root}; - # TODO: The \s and \w are probably wrong now that the strings are utf8_on - my $declaration = shift; - my $doctype_type; - my $doctype_secondpart; - if ($declaration =~ /<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si) { - $File->{Root} = "html"; - $File->{DOCTYPE} = "HTML5"; - } - else { - ($File->{Root}, $doctype_type, $File->{DOCTYPE}, $doctype_secondpart) = $declaration =~ m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si; - if (($doctype_type eq "PUBLIC") and (($doctype_secondpart) = $doctype_secondpart =~ m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si)){ - &match_DTD_FPI_SI($File, $File->{DOCTYPE}, $doctype_secondpart); - } - } - }; +sub preparse_doctype +{ + my $File = shift; + + # + # Reset DOCTYPE, Root (for second invocation, probably not needed anymore). + $File->{DOCTYPE} = ''; + $File->{Root} = ''; + + my $dtd = sub { + return if $File->{Root}; + + # TODO: The \s and \w are probably wrong now that the strings are utf8_on + my $declaration = shift; + my $doctype_type; + my $doctype_secondpart; + if ($declaration =~ + /<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si + ) + { + $File->{Root} = "html"; + $File->{DOCTYPE} = "HTML5"; + } + else { + ( $File->{Root}, $doctype_type, + $File->{DOCTYPE}, $doctype_secondpart + ) + = $declaration =~ + m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si; + if (($doctype_type eq "PUBLIC") and + (($doctype_secondpart) = + $doctype_secondpart =~ + m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si) + ) + { + &match_DTD_FPI_SI($File, $File->{DOCTYPE}, + $doctype_secondpart); + } + } + }; - my $start = sub { - my ($p, $tag, $attr) = @_; + my $start = sub { + my ($p, $tag, $attr) = @_; - if ($File->{Root}) { - return unless $tag eq $File->{Root}; - } else { - $File->{Root} = $tag; - } - if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; - if ($attr->{version}) {$File->{'Root Version'} = $attr->{version}}; - if ($attr->{baseProfile}) {$File->{'Root BaseProfile'} = $attr->{baseProfile}}; + if ($File->{Root}) { + return unless $tag eq $File->{Root}; + } + else { + $File->{Root} = $tag; + } + if ($attr->{xmlns}) { $File->{Namespace} = $attr->{xmlns} } + if ($attr->{version}) { $File->{'Root Version'} = $attr->{version} } + if ($attr->{baseProfile}) { + $File->{'Root BaseProfile'} = $attr->{baseProfile}; + } + + # We're done parsing. + $p->eof(); + }; + + # we use HTML::Parser as pre-parser. May use html5lib or other in the future + my $p = HTML::Parser->new(api_version => 3); + + # if content-type has shown we should pre-parse with XML mode, use that + # otherwise (mostly text/html cases) use default mode + $p->xml_mode(&is_xml($File)); + $p->handler(declaration => $dtd, 'text'); + $p->handler(start => $start, 'self,tag,attr'); - # We're done parsing. + my $line = 0; + my $max = scalar(@{$File->{Content}}); + $p->parse( + sub { + return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef; + } + ); $p->eof(); - }; - - # we use HTML::Parser as pre-parser. May use html5lib or other in the future - my $p = HTML::Parser->new(api_version => 3); - - # if content-type has shown we should pre-parse with XML mode, use that - # otherwise (mostly text/html cases) use default mode - $p->xml_mode(&is_xml($File)); - $p->handler(declaration => $dtd, 'text'); - $p->handler(start => $start, 'self,tag,attr'); - - my $line = 0; - my $max = scalar(@{$File->{Content}}); - $p->parse(sub { - return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef; - }); - $p->eof(); - - # TODO: These \s here are probably wrong now that the strings are utf8_on - $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; - $File->{DOCTYPE} =~ s(^\s+){ }g; - $File->{DOCTYPE} =~ s(\s+$){ }g; - $File->{DOCTYPE} =~ s(\s+) { }g; - - # Some document types actually need no doctype to be identified, - # root element and some version attribute is enough - # TODO applicable doctypes should be migrated to a config file? - - # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) { - # if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'})) - # { - # if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; } - # if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; } - # if ($File->{'Root Version'} eq "1.0"){ - # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN"; - # $File->{"DOCTYPEless OK"} = TRUE; - # $File->{Opt}->{DOCTYPE} = "SVG 1.0"; - # } - # if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) { - # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN"; - # $File->{"DOCTYPEless OK"} = TRUE; - # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny"; - # } - # elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) { - # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN"; - # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic"; - # $File->{"DOCTYPEless OK"} = TRUE; - # } - # elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) { - # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; - # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; - # $File->{"DOCTYPEless OK"} = TRUE; - # } - # if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; } - # if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; } - # } - # else { - # # by default for an svg root elt, we use SVG 1.1 - # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; - # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; - # $File->{"DOCTYPEless OK"} = TRUE; - # } - # } - if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) { - # doctypeless document type found, we fake the override - # so that the parser will have something to validate against - $File = &override_doctype($File); - } - return $File; -} + # TODO: These \s here are probably wrong now that the strings are utf8_on + $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; + $File->{DOCTYPE} =~ s(^\s+){ }g; + $File->{DOCTYPE} =~ s(\s+$){ }g; + $File->{DOCTYPE} =~ s(\s+) { }g; + + # Some document types actually need no doctype to be identified, + # root element and some version attribute is enough + # TODO applicable doctypes should be migrated to a config file? + + # if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) { + # if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'})) + # { + # if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; } + # if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; } + # if ($File->{'Root Version'} eq "1.0"){ + # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN"; + # $File->{"DOCTYPEless OK"} = TRUE; + # $File->{Opt}->{DOCTYPE} = "SVG 1.0"; + # } + # if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) { + # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN"; + # $File->{"DOCTYPEless OK"} = TRUE; + # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny"; + # } + # elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) { + # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN"; + # $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic"; + # $File->{"DOCTYPEless OK"} = TRUE; + # } + # elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) { + # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; + # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; + # $File->{"DOCTYPEless OK"} = TRUE; + # } + # if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; } + # if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; } + # } + # else { + # # by default for an svg root elt, we use SVG 1.1 + # $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN"; + # $File->{Opt}->{DOCTYPE} = "SVG 1.1"; + # $File->{"DOCTYPEless OK"} = TRUE; + # } + # } + if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) { + + # doctypeless document type found, we fake the override + # so that the parser will have something to validate against + $File = &override_doctype($File); + } + return $File; +} # # Preprocess CGI parameters. -sub prepCGI { - my $File = shift; - my $q = shift; - - my $path_info; - { - # @@@HACK: - # CGI.pm's _name_and_path_from_env has query string related issues; - # just trump the query string for the duration of calling methods we - # know we don't need it for and which have been affected in the past. - # More info: http://www.w3.org/Bugs/Public/show_bug.cgi?id=4365 - # ->url() reportedly fixed in CGI.pm 3.34, ->path_info() in 3.40. - - local $ENV{REQUEST_URI} = URI->new($ENV{REQUEST_URI})->path() - if $ENV{REQUEST_URI}; - - # The URL to this CGI script. - $File->{Env}->{'Self URI'} = $q->url(); - - $path_info = $q->path_info(); - } - - # Avoid CGI.pm's "exists but undef" behaviour. - if (scalar $q->param) { - foreach my $param ($q->param) { - next if $param eq 'uploaded_file'; # 'uploaded_file' contains data. - next if $param eq 'fragment'; # Ditto 'fragment'. - next if $param eq 'accept'; # Original checking had a specific Accept: header sent. - next if $param eq 'accept-language'; # Ditto Accept-Language:. - next if $param eq 'accept-charset'; # Ditto Accept-Charset:. - next if $q->param($param) eq '0'; # Keep false-but-set params. - - # - # Parameters that are given to us without specifying a value get - # set to "1" (the "TRUE" constant). This is so we can test for the - # boolean value of a parameter instead of first checking whether - # the param was given and then testing its value. Needed because - # CGI.pm sets ";param" and ";param=" to a boolean false value - # (undef() or a null string, respectively). - $q->param($param, TRUE) unless $q->param($param); - } - } - - # IIS reportedly does not provide the $path_info we expect - hack around it. - $path_info =~ s|(.*)/check\.pl(.*)$|$2| - if ($ENV{SERVER_SOFTWARE} and $ENV{SERVER_SOFTWARE} =~ /Microsoft-IIS/); - - # apparently, with mod_perl2, $path_info is empty even if it should be filled - # working around that - if (!$path_info && $File->{Env}->{'Self URI'} =~ /check\/referr?er$/) { - $path_info = '/referer'; - $File->{Env}->{'Self URI'} =~ s/\/referr?er$//; - } - - # Futz the URL so "/referer" works. - if ($path_info) { - if ($path_info =~ m|^/referr?er$|) { - if ($q->referer) { - $q->param('uri', $q->referer); - print redirect &self_url_q($q, $File); - } else { - print redirect $File->{Env}->{'Self URI'} . '?uri=referer'; - } - } else { - print redirect &self_url_q($q, $File); - } - exit; - } - - # Use "url" unless a "uri" was also given. - if ($q->param('url') and not $q->param('uri')) { - $q->param('uri', $q->param('url')); - } - - # Munge the URL to include commonly omitted prefix. - my $u = $q->param('uri'); - $q->param('uri', "http://$u") if $u && $u =~ m(^www)i; - - # Issue a redirect for uri=referer. - if ($q->param('uri') and $q->param('uri') eq 'referer') { - if ($q->referer) { - $q->param('uri', $q->referer); - $q->param('accept',$q->http('Accept')) if ($q->http('Accept')); - $q->param('accept-language',$q->http('Accept-Language')) if ($q->http('Accept-Language')); - $q->param('accept-charset',$q->http('Accept-Charset')) if ($q->http('Accept-Charset')); - print redirect &self_url_q($q, $File); - exit; - } else { - - # Redirected from /check/referer to /check?uri=referer because - # the browser didn't send a Referer header, or the request was - # for /check?uri=referer but no Referer header was found. - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_referer_error => TRUE, - ); - } - } - - # Supersede URL with an uploaded file. - if ($q->param('uploaded_file')) { - $q->param('uri', 'upload://' . $q->param('uploaded_file')); - $File->{'Is Upload'} = TRUE; # Tag it for later use. - } - - # Supersede URL with an uploaded fragment. - if ($q->param('fragment')) { - $q->param('uri', 'upload://Form Submission'); - $File->{'Direct Input'} = TRUE; # Tag it for later use. - } - - # Redirect to a GETable URL if method is POST without a file upload. - if (defined $q->request_method and $q->request_method eq 'POST' - and not ($File->{'Is Upload'} or $File->{'Direct Input'})) { - my $thispage = &self_url_q($q, $File); - print redirect $thispage; - exit; - } - - # - # Flag an error if we didn't get a file to validate. - unless ($q->param('uri')) { - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_uri_error => TRUE, - fatal_uri_scheme => 'undefined', - ); - } +sub prepCGI +{ + my $File = shift; + my $q = shift; - return $q; + my $path_info; + { + + # @@@HACK: + # CGI.pm's _name_and_path_from_env has query string related issues; + # just trump the query string for the duration of calling methods we + # know we don't need it for and which have been affected in the past. + # More info: http://www.w3.org/Bugs/Public/show_bug.cgi?id=4365 + # ->url() reportedly fixed in CGI.pm 3.34, ->path_info() in 3.40. + + local $ENV{REQUEST_URI} = URI->new($ENV{REQUEST_URI})->path() + if $ENV{REQUEST_URI}; + + # The URL to this CGI script. + $File->{Env}->{'Self URI'} = $q->url(); + + $path_info = $q->path_info(); + } + + # Avoid CGI.pm's "exists but undef" behaviour. + if (scalar $q->param) { + foreach my $param ($q->param) { + next if $param eq 'uploaded_file'; # 'uploaded_file' contains data. + next if $param eq 'fragment'; # Ditto 'fragment'. + next + if $param eq 'accept' + ; # Original checking had a specific Accept: header sent. + next if $param eq 'accept-language'; # Ditto Accept-Language:. + next if $param eq 'accept-charset'; # Ditto Accept-Charset:. + next if $q->param($param) eq '0'; # Keep false-but-set params. + + # + # Parameters that are given to us without specifying a value get + # set to "1" (the "TRUE" constant). This is so we can test for the + # boolean value of a parameter instead of first checking whether + # the param was given and then testing its value. Needed because + # CGI.pm sets ";param" and ";param=" to a boolean false value + # (undef() or a null string, respectively). + $q->param($param, TRUE) unless $q->param($param); + } + } + + # IIS reportedly does not provide the $path_info we expect - hack around it. + $path_info =~ s|(.*)/check\.pl(.*)$|$2| + if ($ENV{SERVER_SOFTWARE} and + $ENV{SERVER_SOFTWARE} =~ /Microsoft-IIS/); + + # apparently, with mod_perl2, $path_info is empty even if it should be filled + # working around that + if (!$path_info && $File->{Env}->{'Self URI'} =~ /check\/referr?er$/) { + $path_info = '/referer'; + $File->{Env}->{'Self URI'} =~ s/\/referr?er$//; + } + + # Futz the URL so "/referer" works. + if ($path_info) { + if ($path_info =~ m|^/referr?er$|) { + if ($q->referer) { + $q->param('uri', $q->referer); + print redirect &self_url_q($q, $File); + } + else { + print redirect $File->{Env}->{'Self URI'} . '?uri=referer'; + } + } + else { + print redirect &self_url_q($q, $File); + } + exit; + } + + # Use "url" unless a "uri" was also given. + if ($q->param('url') and not $q->param('uri')) { + $q->param('uri', $q->param('url')); + } + + # Munge the URL to include commonly omitted prefix. + my $u = $q->param('uri'); + $q->param('uri', "http://$u") if $u && $u =~ m(^www)i; + + # Issue a redirect for uri=referer. + if ($q->param('uri') and $q->param('uri') eq 'referer') { + if ($q->referer) { + $q->param('uri', $q->referer); + $q->param('accept', $q->http('Accept')) if ($q->http('Accept')); + $q->param('accept-language', $q->http('Accept-Language')) + if ($q->http('Accept-Language')); + $q->param('accept-charset', $q->http('Accept-Charset')) + if ($q->http('Accept-Charset')); + print redirect &self_url_q($q, $File); + exit; + } + else { + + # Redirected from /check/referer to /check?uri=referer because + # the browser didn't send a Referer header, or the request was + # for /check?uri=referer but no Referer header was found. + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl') + ->param(fatal_referer_error => TRUE,); + } + } + + # Supersede URL with an uploaded file. + if ($q->param('uploaded_file')) { + $q->param('uri', 'upload://' . $q->param('uploaded_file')); + $File->{'Is Upload'} = TRUE; # Tag it for later use. + } + + # Supersede URL with an uploaded fragment. + if ($q->param('fragment')) { + $q->param('uri', 'upload://Form Submission'); + $File->{'Direct Input'} = TRUE; # Tag it for later use. + } + + # Redirect to a GETable URL if method is POST without a file upload. + if (defined $q->request_method and + $q->request_method eq 'POST' and + not($File->{'Is Upload'} or $File->{'Direct Input'})) + { + my $thispage = &self_url_q($q, $File); + print redirect $thispage; + exit; + } + + # + # Flag an error if we didn't get a file to validate. + unless ($q->param('uri')) { + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_uri_error => TRUE, + fatal_uri_scheme => 'undefined', + ); + } + + return $q; } # @@ -2451,21 +2698,23 @@ sub prepCGI { # * Doctype Declaration # * XML Declaration # * XML namespaces -sub set_parse_mode { - my $File = shift; - my $CFG = shift; - my $fpi = $File->{DOCTYPE}; - $File->{ModeChoice} = ''; - my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; - - # $File->{Mode} may have been set in parse_content_type - # and it would come from the Media Type - my $parseModeFromMimeType = $File->{Mode}; - my $begincontent = join "\x20",@{$File->{Content}}; # for the sake of xml decl detection, - # the 10 first lines should be safe - my $parseModeFromXMLDecl = ( - $begincontent - =~ /^ [\x20|\x09|\x0D|\x0A]* # whitespace before the decl should not be happening +sub set_parse_mode +{ + my $File = shift; + my $CFG = shift; + my $fpi = $File->{DOCTYPE}; + $File->{ModeChoice} = ''; + my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; + + # $File->{Mode} may have been set in parse_content_type + # and it would come from the Media Type + my $parseModeFromMimeType = $File->{Mode}; + my $begincontent = join "\x20", + @{$File->{Content}}; # for the sake of xml decl detection, + # the 10 first lines should be safe + my $parseModeFromXMLDecl = ( + $begincontent =~ + /^ [\x20|\x09|\x0D|\x0A]* # whitespace before the decl should not be happening # but we are greedy for the sake of detection, not validation <\?xml # start matching an XML Declaration [\x20|\x09|\x0D|\x0A]+ # x20, x09, xD and xA are the allowed "xml white space" @@ -2482,402 +2731,480 @@ sub set_parse_mode { )? # ditto standalone info, optional [\x20|\x09|\x0D|\x0A]* \?> # end of XML Declaration /x - ? 'XML' : 'TBD' ); - - my $parseModeFromNamespace = 'TBD'; - if ($File->{Namespace}) { $parseModeFromNamespace = 'XML'} - - if (($parseModeFromMimeType eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromNamespace eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { - # if the mime type is text/html (ambiguous, hence TBD mode) - # and the doctype isn't in the catalogue - # and XML prolog detection was unsuccessful - # and we found no namespace at the root - # ... throw in a warning - &add_warning('W06', { - W06_mime => $File->{ContentType}, - w06_doctype => $File->{DOCTYPE} - }); - return; - } - - $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'HTML5' or $parseModeFromDoctype eq 'XML' or $parseModeFromNamespace eq 'XML'; - - if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromMimeType eq 'TBD') and ($parseModeFromNamespace eq 'TBD')) { - # if all factors are useless to give us a parse mode - # => we use SGML-based DTD validation as a default - $File->{Mode} = 'DTD+SGML'; - $File->{ModeChoice} = 'Fallback'; - # and send warning about the fallback - &add_warning('W06', { - W06_mime => $File->{ContentType}, - w06_doctype => $File->{DOCTYPE} - }); - return; - } - elsif ($parseModeFromMimeType ne 'TBD') { - # if The mime type gives clear indication of whether the document is XML or not - if (($parseModeFromDoctype ne 'TBD') and ($parseModeFromDoctype ne 'HTML5') and ($parseModeFromMimeType ne $parseModeFromDoctype)) { - # if document-type recommended mode and content-type recommended mode clash, shoot a warning - # unknown doctypes will not trigger this - # neither will html5 documents, which can be XML or not - &add_warning('W07', { - W07_mime => $File->{ContentType}, - W07_ct => $parseModeFromMimeType, - W07_dtd => $parseModeFromDoctype, - }); - } - # mime type has precedence, we stick to it - $File->{ModeChoice} = 'Mime'; - if ($parseModeFromDoctype eq "HTML5") { - $File->{Mode} = 'HTML5+'.$File->{Mode}; - } else { - $File->{Mode} = 'DTD+'.$File->{Mode}; - } - return; - } - elsif ($parseModeFromDoctype ne 'TBD') { - # the mime type is ambiguous (hence we didn't stop at the previous test) - # but by now we're sure that the document type is a good indication - # so we use that. - if ($parseModeFromDoctype eq "HTML5") { - if ($parseModeFromXMLDecl eq "XML" or $parseModeFromNamespace eq "XML") { - $File->{Mode} = "HTML5+XML"; - } - else { - $File->{Mode} = "HTML5"; - } - } - else { # not HTML5 - $File->{Mode} = "DTD+".$parseModeFromDoctype; - } - $File->{ModeChoice} = 'Doctype'; - return; - } - elsif ($parseModeFromXMLDecl ne 'TBD') { - # the mime type is ambiguous (hence we didn't stop at the previous test) - # and so was the doctype - # but we found an XML declaration - # so we use that. - if ($File->{Mode} eq "") { - $File->{Mode} = "DTD+".$parseModeFromXMLDecl; - } - elsif ($File->{Mode} =~ /\+/ ) { - $File->{Mode} =~ s/\+.*/\+$parseModeFromXMLDecl/; - } - else { - $File->{Mode} = $File->{Mode}."+".$parseModeFromXMLDecl; + ? + 'XML' : + 'TBD' + ); + + my $parseModeFromNamespace = 'TBD'; + if ($File->{Namespace}) { $parseModeFromNamespace = 'XML' } + + if (($parseModeFromMimeType eq 'TBD') and + ($parseModeFromXMLDecl eq 'TBD') and + ($parseModeFromNamespace eq 'TBD') and + (!exists $CFG->{Types}->{$fpi})) + { + + # if the mime type is text/html (ambiguous, hence TBD mode) + # and the doctype isn't in the catalogue + # and XML prolog detection was unsuccessful + # and we found no namespace at the root + # ... throw in a warning + &add_warning( + 'W06', + { W06_mime => $File->{ContentType}, + w06_doctype => $File->{DOCTYPE} + } + ); + return; } - $File->{ModeChoice} = 'XMLDecl'; - return; - } - else { - # this is the last case. We know that all modes are not TBD, - # yet mime type, doctype AND XML DECL tests have failed => we are saved by the presence of namespaces - if ($File->{Mode} eq "") { - $File->{Mode} = "DTD+".$parseModeFromNamespace; + + $parseModeFromDoctype = 'TBD' + unless $parseModeFromDoctype eq 'SGML' or + $parseModeFromDoctype eq 'HTML5' or + $parseModeFromDoctype eq 'XML' or + $parseModeFromNamespace eq 'XML'; + + if (($parseModeFromDoctype eq 'TBD') and + ($parseModeFromXMLDecl eq 'TBD') and + ($parseModeFromMimeType eq 'TBD') and + ($parseModeFromNamespace eq 'TBD')) + { + + # if all factors are useless to give us a parse mode + # => we use SGML-based DTD validation as a default + $File->{Mode} = 'DTD+SGML'; + $File->{ModeChoice} = 'Fallback'; + + # and send warning about the fallback + &add_warning( + 'W06', + { W06_mime => $File->{ContentType}, + w06_doctype => $File->{DOCTYPE} + } + ); + return; + } + elsif ($parseModeFromMimeType ne 'TBD') { + + # if The mime type gives clear indication of whether the document is XML or not + if (($parseModeFromDoctype ne 'TBD') and + ($parseModeFromDoctype ne 'HTML5') and + ($parseModeFromMimeType ne $parseModeFromDoctype)) + { + + # if document-type recommended mode and content-type recommended mode clash, shoot a warning + # unknown doctypes will not trigger this + # neither will html5 documents, which can be XML or not + &add_warning( + 'W07', + { W07_mime => $File->{ContentType}, + W07_ct => $parseModeFromMimeType, + W07_dtd => $parseModeFromDoctype, + } + ); + } + + # mime type has precedence, we stick to it + $File->{ModeChoice} = 'Mime'; + if ($parseModeFromDoctype eq "HTML5") { + $File->{Mode} = 'HTML5+' . $File->{Mode}; + } + else { + $File->{Mode} = 'DTD+' . $File->{Mode}; + } + return; + } + elsif ($parseModeFromDoctype ne 'TBD') { + + # the mime type is ambiguous (hence we didn't stop at the previous test) + # but by now we're sure that the document type is a good indication + # so we use that. + if ($parseModeFromDoctype eq "HTML5") { + if ($parseModeFromXMLDecl eq "XML" or + $parseModeFromNamespace eq "XML") + { + $File->{Mode} = "HTML5+XML"; + } + else { + $File->{Mode} = "HTML5"; + } + } + else { # not HTML5 + $File->{Mode} = "DTD+" . $parseModeFromDoctype; + } + $File->{ModeChoice} = 'Doctype'; + return; } - elsif ($File->{Mode} =~ /\+/ ) { - $File->{Mode} =~ s/\+.*/\+$parseModeFromNamespace/; + elsif ($parseModeFromXMLDecl ne 'TBD') { + + # the mime type is ambiguous (hence we didn't stop at the previous test) + # and so was the doctype + # but we found an XML declaration + # so we use that. + if ($File->{Mode} eq "") { + $File->{Mode} = "DTD+" . $parseModeFromXMLDecl; + } + elsif ($File->{Mode} =~ /\+/) { + $File->{Mode} =~ s/\+.*/\+$parseModeFromXMLDecl/; + } + else { + $File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl; + } + $File->{ModeChoice} = 'XMLDecl'; + return; } else { - $File->{Mode} = $File->{Mode}."+".$parseModeFromNamespace; + + # this is the last case. We know that all modes are not TBD, + # yet mime type, doctype AND XML DECL tests have failed => we are saved by the presence of namespaces + if ($File->{Mode} eq "") { + $File->{Mode} = "DTD+" . $parseModeFromNamespace; + } + elsif ($File->{Mode} =~ /\+/) { + $File->{Mode} =~ s/\+.*/\+$parseModeFromNamespace/; + } + else { + $File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace; + } + $File->{ModeChoice} = 'Namespace'; } - $File->{ModeChoice} = 'Namespace'; - } } - # # Utility sub to tell if mode "is" XML. -sub is_xml {shift->{Mode} =~ /XML/}; +sub is_xml { shift->{Mode} =~ /XML/ } # # Check charset conflicts and add any warnings necessary. -sub charset_conflicts { - my $File = shift; - - # - # Handle the case where there was no charset to be found. - unless ($File->{Charset}->{Use}) { - &add_warning('W17', {}); - $File->{Tentative} |= T_WARN; - } - - # - # Add a warning if there was charset info conflict (HTTP header, - # XML declaration, or <meta> element). - # filtering out some of the warnings in direct input mode where HTTP encoding is a "fake" - if ((charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) and not ($File->{'Direct Input'})) { - &add_warning('W18', { - W18_http => $File->{Charset}->{HTTP}, - W18_xml => $File->{Charset}->{XML}, - W18_use => $File->{Charset}->{Use}, - }); - } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) and not ($File->{'Direct Input'})) { - &add_warning('W19', { - W19_http => $File->{Charset}->{HTTP}, - W19_meta => $File->{Charset}->{META}, - W19_use => $File->{Charset}->{Use}, - }); - } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) { - &add_warning('W20', { - W20_http => $File->{Charset}->{XML}, - W20_xml => $File->{Charset}->{META}, - }); - $File->{Tentative} |= T_WARN; - } - - return $File; +sub charset_conflicts +{ + my $File = shift; + + # + # Handle the case where there was no charset to be found. + unless ($File->{Charset}->{Use}) { + &add_warning('W17', {}); + $File->{Tentative} |= T_WARN; + } + + # + # Add a warning if there was charset info conflict (HTTP header, + # XML declaration, or <meta> element). + # filtering out some of the warnings in direct input mode where HTTP encoding is a "fake" + if (( charset_not_equal( + $File->{Charset}->{HTTP}, + $File->{Charset}->{XML} + ) + ) and + not($File->{'Direct Input'}) + ) + { + &add_warning( + 'W18', + { W18_http => $File->{Charset}->{HTTP}, + W18_xml => $File->{Charset}->{XML}, + W18_use => $File->{Charset}->{Use}, + } + ); + } + elsif ( + charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META}) + and not($File->{'Direct Input'})) + { + &add_warning( + 'W19', + { W19_http => $File->{Charset}->{HTTP}, + W19_meta => $File->{Charset}->{META}, + W19_use => $File->{Charset}->{Use}, + } + ); + } + elsif ( + charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) + { + &add_warning( + 'W20', + { W20_http => $File->{Charset}->{XML}, + W20_xml => $File->{Charset}->{META}, + } + ); + $File->{Tentative} |= T_WARN; + } + + return $File; } # # Transcode to UTF-8 -sub transcode { - my $File = shift; - - my $general_charset = $File->{Charset}->{Use}; - my $exact_charset = $general_charset; - - # TODO: This should be done before transcode() - if ($general_charset eq 'utf-16') { - if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { - $exact_charset = $File->{Charset}->{Auto}; - } else { $exact_charset = 'utf-16be'; } - } - - my $cs = $exact_charset; - - if ($CFG->{Charsets}->{$cs}) { - if ($CFG->{Charsets}->{$cs} =~ /ERR /) { - # The encoding is not supported due to policy - - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_transcode_error => TRUE, - fatal_transcode_charset => $cs, - # @@FIXME might need better text - fatal_transcode_errmsg => 'This encoding is not supported by the validator.', - ); - return $File; - } - elsif ($CFG->{Charsets}->{$cs} =~ /X /) { - # possibly problematic, we recommend another alias - my $recommended_charset = $CFG->{Charsets}->{$cs}; - $recommended_charset =~ s/X //; - &add_warning('W22', { - W22_declared => $cs, - W22_suggested => $recommended_charset, - }); - } - } - - # Does the system support decoding this encoding? - my $enc = Encode::find_encoding($cs); - - if (!$enc) { - # This system's Encode installation does not support - # the character encoding; might need additional modules - - $File->{'Error Flagged'} = TRUE; - &get_template($File, 'fatal-error.tmpl')->param( - fatal_transcode_error => TRUE, - fatal_transcode_charset => $cs, - # @@FIXME might need better text - fatal_transcode_errmsg => 'Encoding not supported.', - ); - return $File; - } - elsif (!$CFG->{Charsets}->{$cs}) { - # not in the list, but technically OK -> we warn - &add_warning('W24', { - W24_declared => $cs, - }); - - } - - my $output; - my $input = $File->{Bytes}; - - # Try to transcode - eval { - $output = $enc->decode($input, Encode::FB_CROAK); - }; - - if ($@) { - # Transcoding failed - do it again line by line to find out exactly where - my $line_num = 0; - foreach my $input_line (split /\r\n|\n|\r/, $input) { - $line_num++; - eval { - $enc->decode($input_line, Encode::FB_CROAK); - }; - if ($@) { - my $croak_message = $@; - $croak_message =~ s/ at .*//; +sub transcode +{ + my $File = shift; + + my $general_charset = $File->{Charset}->{Use}; + my $exact_charset = $general_charset; + + # TODO: This should be done before transcode() + if ($general_charset eq 'utf-16') { + if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { + $exact_charset = $File->{Charset}->{Auto}; + } + else { $exact_charset = 'utf-16be'; } + } + + my $cs = $exact_charset; + + if ($CFG->{Charsets}->{$cs}) { + if ($CFG->{Charsets}->{$cs} =~ /ERR /) { + + # The encoding is not supported due to policy + + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_transcode_error => TRUE, + fatal_transcode_charset => $cs, + + # @@FIXME might need better text + fatal_transcode_errmsg => + 'This encoding is not supported by the validator.', + ); + return $File; + } + elsif ($CFG->{Charsets}->{$cs} =~ /X /) { + + # possibly problematic, we recommend another alias + my $recommended_charset = $CFG->{Charsets}->{$cs}; + $recommended_charset =~ s/X //; + &add_warning( + 'W22', + { W22_declared => $cs, + W22_suggested => $recommended_charset, + } + ); + } + } + + # Does the system support decoding this encoding? + my $enc = Encode::find_encoding($cs); + + if (!$enc) { + + # This system's Encode installation does not support + # the character encoding; might need additional modules + $File->{'Error Flagged'} = TRUE; &get_template($File, 'fatal-error.tmpl')->param( - fatal_byte_error => TRUE, - fatal_byte_lines => $line_num, - fatal_byte_charset => $cs, - fatal_byte_error_msg => $croak_message, + fatal_transcode_error => TRUE, + fatal_transcode_charset => $cs, + + # @@FIXME might need better text + fatal_transcode_errmsg => 'Encoding not supported.', ); - } + return $File; } - return $File; - } + elsif (!$CFG->{Charsets}->{$cs}) { + + # not in the list, but technically OK -> we warn + &add_warning('W24', {W24_declared => $cs,}); + + } + + my $output; + my $input = $File->{Bytes}; - # @@FIXME is this what we want? - $output =~ s/\015?\012/\n/g; + # Try to transcode + eval { $output = $enc->decode($input, Encode::FB_CROAK); }; - # make sure we deal only with unix newlines - # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 - $output =~ s/(\r\n|\n|\r)/\n/g; + if ($@) { + + # Transcoding failed - do it again line by line to find out exactly where + my $line_num = 0; + foreach my $input_line (split /\r\n|\n|\r/, $input) { + $line_num++; + eval { $enc->decode($input_line, Encode::FB_CROAK); }; + if ($@) { + my $croak_message = $@; + $croak_message =~ s/ at .*//; + $File->{'Error Flagged'} = TRUE; + &get_template($File, 'fatal-error.tmpl')->param( + fatal_byte_error => TRUE, + fatal_byte_lines => $line_num, + fatal_byte_charset => $cs, + fatal_byte_error_msg => $croak_message, + ); + } + } + return $File; + } + + # @@FIXME is this what we want? + $output =~ s/\015?\012/\n/g; - #debug: we could check if the content has utf8 bit on with - #$output= utf8::is_utf8($output) ? 1 : 0; - $File->{Content} = [split/\n/, $output]; + # make sure we deal only with unix newlines + # tentative fix for http://www.w3.org/Bugs/Public/show_bug.cgi?id=3992 + $output =~ s/(\r\n|\n|\r)/\n/g; - return $File; + #debug: we could check if the content has utf8 bit on with + #$output= utf8::is_utf8($output) ? 1 : 0; + $File->{Content} = [split /\n/, $output]; + + return $File; } sub find_encodings { - my $File = shift; - my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); - my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); + my $File = shift; + my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes}); + my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes}); - if (defined $bom) - { - # @@FIXME this BOM entry should not be needed at all! - $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); - $File->{Charset}->{Auto} = lc $bom; - } - else - { - $File->{Charset}->{Auto} = lc($first[0]) if @first; - } + if (defined $bom) { - my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); - $File->{Charset}->{XML} = lc $xml if defined $xml; + # @@FIXME this BOM entry should not be needed at all! + $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}")); + $File->{Charset}->{Auto} = lc $bom; + } + else { + $File->{Charset}->{Auto} = lc($first[0]) if @first; + } - my %metah; - foreach my $try (@first) - { - # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok - my $meta = HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); - $metah{lc($meta)}++ if defined $meta and length $meta; - } + my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes}); + $File->{Charset}->{XML} = lc $xml if defined $xml; - my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; - $File->{Charset}->{META} = $meta[0] if @meta; + my %metah; + foreach my $try (@first) { - return $File; + # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok + my $meta = + HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try); + $metah{lc($meta)}++ if defined $meta and length $meta; + } + + my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah; + $File->{Charset}->{META} = $meta[0] if @meta; + + return $File; } # # Abort with a message if an error was flagged at point. -sub abort_if_error_flagged { - my $File = shift; - my $Flags = shift; +sub abort_if_error_flagged +{ + my $File = shift; + my $Flags = shift; - return unless $File->{'Error Flagged'}; - return if $File->{'Error Handled'}; # Previous error, keep going. + return unless $File->{'Error Flagged'}; + return if $File->{'Error Handled'}; # Previous error, keep going. - my $tmpl = &get_template($File, 'fatal-error.tmpl'); - $tmpl->param(fatal_error => TRUE); + my $tmpl = &get_template($File, 'fatal-error.tmpl'); + $tmpl->param(fatal_error => TRUE); - if ($File->{Opt}->{Output} eq 'html') { - &prep_template($File, $tmpl); - # transcode output from perl's internal to utf-8 and output - print Encode::encode('UTF-8', $tmpl->output); - exit; - } else { + if ($File->{Opt}->{Output} eq 'html') { + &prep_template($File, $tmpl); - #@@FIXME: This is borked after templatification. - # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF."); - # A fatal error has occurred while processing the requested document. Processing - # has continued but any later output will be of dubious quality. Limitations of - # this output mode prevent the full error message from being returned; please - # retry this operation in interactive mode using the web interface to see the - # actual error message. - # .EOF. - #@@FIXME; - $File->{'Error Handled'} = TRUE; - } + # transcode output from perl's internal to utf-8 and output + print Encode::encode('UTF-8', $tmpl->output); + exit; + } + else { + + #@@FIXME: This is borked after templatification. + # &add_warning($File, 'fatal', 'Fatal Error', <<".EOF."); + # A fatal error has occurred while processing the requested document. Processing + # has continued but any later output will be of dubious quality. Limitations of + # this output mode prevent the full error message from being returned; please + # retry this operation in interactive mode using the web interface to see the + # actual error message. + # .EOF. + #@@FIXME; + $File->{'Error Handled'} = TRUE; + } } # # conflicting encodings -sub charset_not_equal { - my $encodingA = shift; - my $encodingB = shift; - return $encodingA && $encodingB && ($encodingA ne $encodingB); +sub charset_not_equal +{ + my $encodingA = shift; + my $encodingB = shift; + return $encodingA && $encodingB && ($encodingA ne $encodingB); } # # Construct a self-referential URL from a CGI.pm $q object. -sub self_url_q { - my ($q, $File) = @_; - my $thispage = $File->{Env}->{'Self URI'} . '?'; +sub self_url_q +{ + my ($q, $File) = @_; + my $thispage = $File->{Env}->{'Self URI'} . '?'; - # Pass-through parameters - for my $param (qw(uri accept accept-language accept-charset)) { - $thispage .= "$param=" . uri_escape($q->param($param)) . ';' - if $q->param($param); - } + # Pass-through parameters + for my $param (qw(uri accept accept-language accept-charset)) { + $thispage .= "$param=" . uri_escape($q->param($param)) . ';' + if $q->param($param); + } - # Boolean parameters - for my $param (qw(ss outline No200 verbose group)) { - $thispage .= "$param=1;" if $q->param($param); - } + # Boolean parameters + for my $param (qw(ss outline No200 verbose group)) { + $thispage .= "$param=1;" if $q->param($param); + } - # Others - if ($q->param('doctype') - and not $q->param('doctype') =~ /(Inline|detect)/i) { - $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; - } - if ($q->param('charset') and not $q->param('charset') =~ /detect/i) { - $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; - } + # Others + if ($q->param('doctype') and + not $q->param('doctype') =~ /(Inline|detect)/i) + { + $thispage .= 'doctype=' . uri_escape($q->param('doctype')) . ';'; + } + if ($q->param('charset') and not $q->param('charset') =~ /detect/i) { + $thispage .= 'charset=' . uri_escape($q->param('charset')) . ';'; + } - $thispage =~ s/[\?;]$//; - return $thispage; + $thispage =~ s/[\?;]$//; + return $thispage; } # # Return random tip and its URL. -sub get_tip { - my @tipAddrs = keys %{$CFG->{Tips}}; - my $tipAddr = $tipAddrs[rand scalar @tipAddrs]; - my $tipSlug = $CFG->{Tips}->{$tipAddr}; +sub get_tip +{ + my @tipAddrs = keys %{$CFG->{Tips}}; + my $tipAddr = $tipAddrs[rand scalar @tipAddrs]; + my $tipSlug = $CFG->{Tips}->{$tipAddr}; - return [$tipAddr, $tipSlug]; + return [$tipAddr, $tipSlug]; } # # Construct a self-referential URL from a $File object. -sub self_url_file { - my $File = shift; - - my $thispage = $File->{Env}->{'Self URI'}; - my $escaped_uri = uri_escape($File->{URI}); - $thispage .= qq(?uri=$escaped_uri); - $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; - $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; - $thispage .= ';outline=1' if $File->{Opt}->{Outline}; - $thispage .= ';No200=1' if $File->{Opt}->{No200}; - $thispage .= ';verbose=1' if $File->{Opt}->{Verbose}; - $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; - $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'}) if $File->{Opt}->{'Accept Header'}; - $thispage .= ';accept-language=' . uri_escape($File->{Opt}->{'Accept-Language Header'}) if $File->{Opt}->{'Accept-Language Header'}; - $thispage .= ';accept-charset=' . uri_escape($File->{Opt}->{'Accept-Charset Header'}) if $File->{Opt}->{'Accept-Charset Header'}; - - return $thispage; +sub self_url_file +{ + my $File = shift; + + my $thispage = $File->{Env}->{'Self URI'}; + my $escaped_uri = uri_escape($File->{URI}); + $thispage .= qq(?uri=$escaped_uri); + $thispage .= ';ss=1' if $File->{Opt}->{'Show Source'}; + $thispage .= ';st=1' if $File->{Opt}->{'Show Tidy'}; + $thispage .= ';outline=1' if $File->{Opt}->{Outline}; + $thispage .= ';No200=1' if $File->{Opt}->{No200}; + $thispage .= ';verbose=1' if $File->{Opt}->{Verbose}; + $thispage .= ';group=1' if $File->{Opt}->{'Group Errors'}; + $thispage .= ';accept=' . uri_escape($File->{Opt}->{'Accept Header'}) + if $File->{Opt}->{'Accept Header'}; + $thispage .= + ';accept-language=' . + uri_escape($File->{Opt}->{'Accept-Language Header'}) + if $File->{Opt}->{'Accept-Language Header'}; + $thispage .= + ';accept-charset=' . + uri_escape($File->{Opt}->{'Accept-Charset Header'}) + if $File->{Opt}->{'Accept-Charset Header'}; + + return $thispage; } ##### package W3C::Validator::EventHandler; + # # Define global constants use constant TRUE => 1; @@ -2885,230 +3212,255 @@ use constant FALSE => 0; # # Tentative Validation Severities. -use constant T_WARN => 4; # 0000 0100 -use constant T_ERROR => 8; # 0000 1000 +use constant T_WARN => 4; # 0000 0100 +use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing -use constant O_SOURCE => 1; # 0000 0001 -use constant O_CHARSET => 2; # 0000 0010 -use constant O_DOCTYPE => 4; # 0000 0100 -use constant O_NONE => 8; # 0000 1000 - +use constant O_SOURCE => 1; # 0000 0001 +use constant O_CHARSET => 2; # 0000 0010 +use constant O_DOCTYPE => 4; # 0000 0100 +use constant O_NONE => 8; # 0000 1000 sub new { - my $class = shift; - my $parser = shift; - my $File = shift; - my $CFG = shift; - my $self = { _file => $File, CFG => $CFG, _parser => $parser }; - bless $self, $class; + my $class = shift; + my $parser = shift; + my $File = shift; + my $CFG = shift; + my $self = {_file => $File, CFG => $CFG, _parser => $parser}; + bless $self, $class; } - sub start_element { - my ($self, $element) = @_; - - my $has_xmlns = FALSE; - my $xmlns_value = undef; - - # If in XML mode, find namespace used for each element. - if (&W3C::Validator::MarkupValidator::is_xml($self->{_file})) { - if (my $attr = $element->{Attributes}->{xmlns}) { - $xmlns_value = ""; - # Try with SAX method - if ($attr->{Value}) { - $has_xmlns = TRUE; - $xmlns_value = $attr->{Value}; - } - #next if $has_xmlns; - - # The following is not SAX, but OpenSP specific. - my $defaulted = $attr->{Defaulted} || ''; - if ($defaulted eq "specified") { - $has_xmlns = TRUE; - $xmlns_value .= join("", map { $_->{Data} } @{$attr->{CdataChunks}}); - } - } - } - - my $doctype = $self->{_file}->{DOCTYPE}; - - if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) || - $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name}) { - # add to list of non-root namespaces - push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; - } - elsif (!$has_xmlns and $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"}) { - # whine if the root xmlns attribute is noted as required by spec, - # but not present - my $err; - my $location = $self->{_parser}->get_location(); - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $location->{LineNumber}; - $err->{char} = $location->{ColumnNumber}; - $err->{num} = "no-xmlns"; - $err->{type} = "E"; - $err->{msg} = "Missing xmlns attribute for element ".$element->{Name} . ". + my ($self, $element) = @_; + + my $has_xmlns = FALSE; + my $xmlns_value = undef; + + # If in XML mode, find namespace used for each element. + if (&W3C::Validator::MarkupValidator::is_xml($self->{_file})) { + if (my $attr = $element->{Attributes}->{xmlns}) { + $xmlns_value = ""; + + # Try with SAX method + if ($attr->{Value}) { + $has_xmlns = TRUE; + $xmlns_value = $attr->{Value}; + } + + #next if $has_xmlns; + + # The following is not SAX, but OpenSP specific. + my $defaulted = $attr->{Defaulted} || ''; + if ($defaulted eq "specified") { + $has_xmlns = TRUE; + $xmlns_value .= + join("", map { $_->{Data} } @{$attr->{CdataChunks}}); + } + } + } + + my $doctype = $self->{_file}->{DOCTYPE}; + + if (!defined($self->{CFG}->{Types}->{$doctype}->{Name}) || + $element->{Name} ne $self->{CFG}->{Types}->{$doctype}->{Name}) + { + + # add to list of non-root namespaces + push(@{$self->{_file}->{Namespaces}}, $xmlns_value) if $has_xmlns; + } + elsif (!$has_xmlns and + $self->{CFG}->{Types}->{$doctype}->{"Namespace Required"}) + { + + # whine if the root xmlns attribute is noted as required by spec, + # but not present + my $err; + my $location = $self->{_parser}->get_location(); + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $location->{LineNumber}; + $err->{char} = $location->{ColumnNumber}; + $err->{num} = "no-xmlns"; + $err->{type} = "E"; + $err->{msg} = + "Missing xmlns attribute for element " . $element->{Name} . ". The value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; + # ... + $self->{_file}->{'Is Valid'} = FALSE; + push @{$self->{_file}->{Errors}}, $err; + } + elsif ($has_xmlns and + (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) and + ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace})) + { - # ... - $self->{_file}->{'Is Valid'} = FALSE; - push @{$self->{_file}->{Errors}}, $err; - } - elsif ($has_xmlns and (defined $self->{CFG}->{Types}->{$doctype}->{Namespace}) - and ($xmlns_value ne $self->{CFG}->{Types}->{$doctype}->{Namespace}) ) { - # whine if root xmlns element is not the one specificed by the spec - my $err; - my $location = $self->{_parser}->get_location(); - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $location->{LineNumber}; - $err->{char} = $location->{ColumnNumber}; - $err->{num} = "wrong-xmlns"; - $err->{type} = "E"; - $err->{msg} = "Wrong xmlns attribute for element $element->{Name}. ". - "The value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; + # whine if root xmlns element is not the one specificed by the spec + my $err; + my $location = $self->{_parser}->get_location(); + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $location->{LineNumber}; + $err->{char} = $location->{ColumnNumber}; + $err->{num} = "wrong-xmlns"; + $err->{type} = "E"; + $err->{msg} = + "Wrong xmlns attribute for element $element->{Name}. " . + "The value should be: $self->{CFG}->{Types}->{$doctype}->{Namespace}"; - # ... - $self->{_file}->{'Is Valid'} = FALSE; - push @{$self->{_file}->{Errors}}, $err; - } + # ... + $self->{_file}->{'Is Valid'} = FALSE; + push @{$self->{_file}->{Errors}}, $err; + } } - sub error { - my $self = shift; - my $error = shift; - my $mess; - eval { - $mess = $self->{_parser}->split_message($error); - }; - if ($@) { - # this is a message that S:P:O could not handle, we skip its croaking - return; - } - my $File = $self->{_file}; - - # TODO: this does not filter out errors in DTDs. - - my $err; - - $err->{src} = '...'; # do this with show_open_entities()? - $err->{line} = $mess->{primary_message}{LineNumber}; - $err->{char} = $mess->{primary_message}{ColumnNumber}+1; - $err->{num} = $mess->{primary_message}{Number}; - $err->{type} = $mess->{primary_message}{Severity}; - $err->{msg} = $mess->{primary_message}{Text}; - - $err->{msg} =~ s/"&"/"&"/gsi; - - # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, - # so we filter out a few errors for now - - my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File); - - if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { - # the error is about a missing xmlns: attribute definition" - return; # this is not an error, 'cause we said so - } - - if ($err->{num} eq '187') - # filtering out no "document type declaration; will parse without validation" - # if root element is not html and mode is xml... - { - # since parsing was done without validation, result can only be "well-formed" - if ($is_xml and lc($File->{Root}) ne 'html') { - $File->{XMLWF_ONLY} = TRUE; - W3C::Validator::MarkupValidator::add_warning('W09xml', {}); - return; # don't report this as an error, just proceed - } - # if mode is not XML, we do report the error. It should not happen in the case of <html> without doctype, - # in that case the error message will be #344 - } - - if (($err->{num} eq '113') and ($err->{msg} =~ /xml:space/)) { - # FIXME - # this is a problem with some of the "flattened" W3C DTDs, filtering them out to not confuse users. - # hoping to get the DTDs fixed, see http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html - return; # don't report this, just proceed - } - - if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) { - # we are in XML mode, we have a namespace, but no doctype. - # the validator will already have said "no doctype, falling back to default" above - # no need to report this. - return; # don't report this, just proceed - } - - if (($err->{num} eq '248') or ($err->{num} eq '247') or ($err->{num} eq '246')) { - # these two errors should be triggered by -wmin-tag to report shorttag used, - # but we're making them warnings, not errors - # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 - $err->{type} = "W"; - } - - # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors - # beyond EOL. If you see this warning in your web server logs, please - # let the validator developers know, see http://validator.w3.org/feedback.html - # As long as $err may be from somewhere else than the document (such as - # from a DTD) and we have no way of identifying these cases, this - # produces bogus results and error log spewage, so commented out for now. -# if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { -# warn("Warning: reported error column larger than line length " . -# "($err->{char} > $l) in $File->{URI} line $err->{line}, " . -# "OpenSP bug? Resetting to line length."); -# $err->{char} = $l; -# } - - # No or unknown FPI and a relative SI. - if ($err->{msg} =~ m(cannot (open|find))) { - $File->{'Error Flagged'} = TRUE; - &W3C::Validator::MarkupValidator::get_template($File, 'fatal-error.tmpl')->param( - fatal_parse_extid_error => TRUE, - fatal_parse_extid_msg => $err->{msg}, - ); - } + my $self = shift; + my $error = shift; + my $mess; + eval { $mess = $self->{_parser}->split_message($error); }; + if ($@) { + + # this is a message that S:P:O could not handle, we skip its croaking + return; + } + my $File = $self->{_file}; + + # TODO: this does not filter out errors in DTDs. + + my $err; + + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $mess->{primary_message}{LineNumber}; + $err->{char} = $mess->{primary_message}{ColumnNumber} + 1; + $err->{num} = $mess->{primary_message}{Number}; + $err->{type} = $mess->{primary_message}{Severity}; + $err->{msg} = $mess->{primary_message}{Text}; + + $err->{msg} =~ s/"&"/"&"/gsi; + + # our parser OpenSP is not quite XML-aware, or XML Namespaces Aware, + # so we filter out a few errors for now + + my $is_xml = &W3C::Validator::MarkupValidator::is_xml($File); + + if ($is_xml and $err->{num} eq '108' and $err->{msg} =~ m{ "xmlns:\S+"}) { + + # the error is about a missing xmlns: attribute definition" + return; # this is not an error, 'cause we said so + } + + if ($err->{num} eq '187') + + # filtering out no "document type declaration; will parse without validation" + # if root element is not html and mode is xml... + { + + # since parsing was done without validation, result can only be "well-formed" + if ($is_xml and lc($File->{Root}) ne 'html') { + $File->{XMLWF_ONLY} = TRUE; + W3C::Validator::MarkupValidator::add_warning('W09xml', {}); + return; # don't report this as an error, just proceed + } + + # if mode is not XML, we do report the error. It should not happen in the case of <html> without doctype, + # in that case the error message will be #344 + } + + if (($err->{num} eq '113') and ($err->{msg} =~ /xml:space/)) { + + # FIXME + # this is a problem with some of the "flattened" W3C DTDs, filtering them out to not confuse users. + # hoping to get the DTDs fixed, see http://lists.w3.org/Archives/Public/www-html-editor/2007AprJun/0010.html + return; # don't report this, just proceed + } - # No DOCTYPE found! We are falling back to vanilla DTD - if ($err->{msg} =~ m(prolog can\'t be omitted)) { - if (lc($File->{Root}) eq 'html') { - my $dtd = ($is_xml ? - $File->{"Default DOCTYPE"}->{"XHTML"} : $File->{"Default DOCTYPE"}->{"HTML"} ); - W3C::Validator::MarkupValidator::add_warning('W09', {W09_dtd => $dtd}); + if ($is_xml and $err->{num} eq '344' and $File->{Namespace}) { + + # we are in XML mode, we have a namespace, but no doctype. + # the validator will already have said "no doctype, falling back to default" above + # no need to report this. + return; # don't report this, just proceed } - else { # not html root element, we are not using fallback - unless ($is_xml) { - $File->{'Is Valid'} = FALSE; - W3C::Validator::MarkupValidator::add_warning('W09nohtml', {}); - } + + if (($err->{num} eq '248') or + ($err->{num} eq '247') or + ($err->{num} eq '246')) + { + + # these two errors should be triggered by -wmin-tag to report shorttag used, + # but we're making them warnings, not errors + # see http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.7 + $err->{type} = "W"; + } + + # Workaround for onsgmls as of 1.5 sometimes allegedly reporting errors + # beyond EOL. If you see this warning in your web server logs, please + # let the validator developers know, see http://validator.w3.org/feedback.html + # As long as $err may be from somewhere else than the document (such as + # from a DTD) and we have no way of identifying these cases, this + # produces bogus results and error log spewage, so commented out for now. + # if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { + # warn("Warning: reported error column larger than line length " . + # "($err->{char} > $l) in $File->{URI} line $err->{line}, " . + # "OpenSP bug? Resetting to line length."); + # $err->{char} = $l; + # } + + # No or unknown FPI and a relative SI. + if ($err->{msg} =~ m(cannot (open|find))) { + $File->{'Error Flagged'} = TRUE; + &W3C::Validator::MarkupValidator::get_template($File, + 'fatal-error.tmpl')->param( + fatal_parse_extid_error => TRUE, + fatal_parse_extid_msg => $err->{msg}, + ); + } + + # No DOCTYPE found! We are falling back to vanilla DTD + if ($err->{msg} =~ m(prolog can\'t be omitted)) { + if (lc($File->{Root}) eq 'html') { + my $dtd = + ($is_xml ? $File->{"Default DOCTYPE"}->{"XHTML"} : + $File->{"Default DOCTYPE"}->{"HTML"}); + W3C::Validator::MarkupValidator::add_warning('W09', + {W09_dtd => $dtd}); + } + else { # not html root element, we are not using fallback + unless ($is_xml) { + $File->{'Is Valid'} = FALSE; + W3C::Validator::MarkupValidator::add_warning('W09nohtml', {}); + } + } + + return; # Don't report this as a normal error. } - return; # Don't report this as a normal error. - } + # TODO: calling exit() here is probably a bad idea + W3C::Validator::MarkupValidator::abort_if_error_flagged($File, O_DOCTYPE); - # TODO: calling exit() here is probably a bad idea - W3C::Validator::MarkupValidator::abort_if_error_flagged($File, O_DOCTYPE); + push @{$File->{Errors}}, $err; + + # ... + $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; - push @{$File->{Errors}}, $err; - # ... - $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; + if (defined $mess->{aux_message}) { - if (defined $mess->{aux_message}) - { - # "duplicate id ... first defined here" style messages - push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, - char => $mess->{aux_message}{ColumnNumber}+1, - msg => $mess->{aux_message}{Text}, - type => 'I', - }; - } + # "duplicate id ... first defined here" style messages + push @{$File->{Errors}}, + { + line => $mess->{aux_message}{LineNumber}, + char => $mess->{aux_message}{ColumnNumber} + 1, + msg => $mess->{aux_message}{Text}, + type => 'I', + }; + } } package W3C::Validator::EventHandler::Outliner; + # # Define global constants use constant TRUE => 1; @@ -3116,152 +3468,161 @@ use constant FALSE => 0; # # Tentative Validation Severities. -use constant T_WARN => 4; # 0000 0100 -use constant T_ERROR => 8; # 0000 1000 +use constant T_WARN => 4; # 0000 0100 +use constant T_ERROR => 8; # 0000 1000 # # Output flags for error processing -use constant O_SOURCE => 1; # 0000 0001 -use constant O_CHARSET => 2; # 0000 0010 -use constant O_DOCTYPE => 4; # 0000 0100 -use constant O_NONE => 8; # 0000 1000 +use constant O_SOURCE => 1; # 0000 0001 +use constant O_CHARSET => 2; # 0000 0010 +use constant O_DOCTYPE => 4; # 0000 0100 +use constant O_NONE => 8; # 0000 1000 use base qw(W3C::Validator::EventHandler); sub new { - my $class = shift; - my $parser = shift; - my $File = shift; - my $CFG = shift; - my $self = $class->SUPER::new($parser, $File, $CFG); - $self->{am_in_heading} = 0; - $self->{heading_text} = []; - bless $self, $class; + my $class = shift; + my $parser = shift; + my $File = shift; + my $CFG = shift; + my $self = $class->SUPER::new($parser, $File, $CFG); + $self->{am_in_heading} = 0; + $self->{heading_text} = []; + bless $self, $class; } sub data { - my ($self, $chars) = @_; - push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading}; + my ($self, $chars) = @_; + push(@{$self->{heading_text}}, $chars->{Data}) if $self->{am_in_heading}; } sub start_element { - my ($self, $element) = @_; - if ($element->{Name} =~ /^h([1-6])$/i) { - $self->{_file}->{heading_outline} ||= ""; - $self->{_file}->{heading_outline} .= - " " x int($1) . "[$element->{Name}] "; - $self->{am_in_heading} = 1; - } + my ($self, $element) = @_; + if ($element->{Name} =~ /^h([1-6])$/i) { + $self->{_file}->{heading_outline} ||= ""; + $self->{_file}->{heading_outline} .= + " " x int($1) . "[$element->{Name}] "; + $self->{am_in_heading} = 1; + } - return $self->SUPER::start_element($element); + return $self->SUPER::start_element($element); } sub end_element { - my ($self, $element) = @_; - if ($element->{Name} =~ /^h[1-6]$/i) { - my $text = join("", @{$self->{heading_text}}); - $text =~ s/^\s+//g; - $text =~ s/\s+/ /g; - $text =~ s/\s+$//g; - $self->{_file}->{heading_outline} .= "$text\n"; - $self->{am_in_heading} = 0; - $self->{heading_text} = []; - } + my ($self, $element) = @_; + if ($element->{Name} =~ /^h[1-6]$/i) { + my $text = join("", @{$self->{heading_text}}); + $text =~ s/^\s+//g; + $text =~ s/\s+/ /g; + $text =~ s/\s+$//g; + $self->{_file}->{heading_outline} .= "$text\n"; + $self->{am_in_heading} = 0; + $self->{heading_text} = []; + } } - ##### package W3C::Validator::UserAgent; -use HTTP::Message qw(); -use LWP::UserAgent 2.032 qw(); # Need 2.032 for default_header() -use Net::hostent qw(gethostbyname); -use Net::IP qw(); -use Socket qw(inet_ntoa); +use HTTP::Message qw(); +use LWP::UserAgent 2.032 qw(); # Need 2.032 for default_header() +use Net::hostent qw(gethostbyname); +use Net::IP qw(); +use Socket qw(inet_ntoa); use base qw(LWP::UserAgent); -BEGIN -{ - # The 4k default line length in LWP <= 5.832 isn't enough for example to - # accommodate 4kB cookies (RFC 2985); bump it (#6678). - require LWP::Protocol::http; - push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8*1024); +BEGIN { + + # The 4k default line length in LWP <= 5.832 isn't enough for example to + # accommodate 4kB cookies (RFC 2985); bump it (#6678). + require LWP::Protocol::http; + push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024); } -sub new { - my ($proto, $CFG, $File, @rest) = @_; - my $class = ref($proto) || $proto; - my $self = $class->SUPER::new(@rest); +sub new +{ + my ($proto, $CFG, $File, @rest) = @_; + my $class = ref($proto) || $proto; + my $self = $class->SUPER::new(@rest); - $self->{'W3C::Validator::CFG'} = $CFG; - $self->{'W3C::Validator::File'} = $File; + $self->{'W3C::Validator::CFG'} = $CFG; + $self->{'W3C::Validator::File'} = $File; - $self->env_proxy(); - $self->agent($File->{Opt}->{'User Agent'}); - $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); + $self->env_proxy(); + $self->agent($File->{Opt}->{'User Agent'}); + $self->protocols_allowed($CFG->{Protocols}->{Allow} || ['http', 'https']); - # Don't parse the http-equiv stuff. - $self->parse_head(0); + # Don't parse the http-equiv stuff. + $self->parse_head(0); - # Tell caches in the middle we want a fresh copy (Bug 4998). - $self->default_header('Cache-Control' => 'max-age=0'); + # Tell caches in the middle we want a fresh copy (Bug 4998). + $self->default_header('Cache-Control' => 'max-age=0'); - # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle - $self->default_header('Accept-Encoding' => scalar HTTP::Message::decodable()) - if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable')); + # If not in debug mode, set Accept-Encoding to what LWP (>= 5.816) can handle + $self->default_header( + 'Accept-Encoding' => scalar HTTP::Message::decodable()) + if (!$File->{Opt}->{Debug} && HTTP::Message->can('decodable')); - return $self; + return $self; } -sub redirect_ok { - my ($self, $req, $res) = @_; - return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); +sub redirect_ok +{ + my ($self, $req, $res) = @_; + return $self->SUPER::redirect_ok($req, $res) && $self->uri_ok($req->uri()); } -sub uri_ok { - my ($self, $uri) = @_; - - return 1 if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or - !$uri->can('host')); - - my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5}; - if ($h5uri) { - my $clone = $uri->clone(); $clone->query(undef); $clone->fragment(undef); - $h5uri = URI->new($h5uri); $h5uri->query(undef); $h5uri->fragment(undef); - return 1 if $clone->eq($h5uri); - } - - my $addr = my $iptype = undef; - if (my $host = gethostbyname($uri->host())) { - $addr = inet_ntoa($host->addr()) if $host->addr(); - if ($addr && (my $ip = Net::IP->new($addr))) { - $iptype = $ip->iptype(); +sub uri_ok +{ + my ($self, $uri) = @_; + + return 1 + if ($self->{'W3C::Validator::CFG'}->{'Allow Private IPs'} or + !$uri->can('host')); + + my $h5uri = $self->{'W3C::Validator::CFG'}->{External}->{HTML5}; + if ($h5uri) { + my $clone = $uri->clone(); + $clone->query(undef); + $clone->fragment(undef); + $h5uri = URI->new($h5uri); + $h5uri->query(undef); + $h5uri->fragment(undef); + return 1 if $clone->eq($h5uri); + } + + my $addr = my $iptype = undef; + if (my $host = gethostbyname($uri->host())) { + $addr = inet_ntoa($host->addr()) if $host->addr(); + if ($addr && (my $ip = Net::IP->new($addr))) { + $iptype = $ip->iptype(); + } } - } - if ($iptype && $iptype ne 'PUBLIC') { - my $File = $self->{'W3C::Validator::File'}; - $File->{'Error Flagged'} = 1; - my $tmpl = &W3C::Validator::MarkupValidator::get_template($File, 'fatal-error.tmpl'); - $tmpl->param( - fatal_ip_error => 1, - fatal_ip_host => $uri->host() || 'undefined', - ); - $tmpl->param(fatal_ip_hostname => 1) if ($addr and $uri->host() ne $addr); - return 0; - } - return 1; + if ($iptype && $iptype ne 'PUBLIC') { + my $File = $self->{'W3C::Validator::File'}; + $File->{'Error Flagged'} = 1; + my $tmpl = &W3C::Validator::MarkupValidator::get_template($File, + 'fatal-error.tmpl'); + $tmpl->param( + fatal_ip_error => 1, + fatal_ip_host => $uri->host() || 'undefined', + ); + $tmpl->param(fatal_ip_hostname => 1) + if ($addr and $uri->host() ne $addr); + return 0; + } + return 1; } # Local Variables: # mode: perl # indent-tabs-mode: nil -# tab-width: 2 -# perl-indent-level: 2 +# perl-indent-level: 4 # End: -# ex: ts=2 sw=2 et +# ex: ts=4 sw=4 et diff --git a/httpd/cgi-bin/sendfeedback.pl b/httpd/cgi-bin/sendfeedback.pl index 2dd9032..292bdd3 100755 --- a/httpd/cgi-bin/sendfeedback.pl +++ b/httpd/cgi-bin/sendfeedback.pl @@ -1,21 +1,21 @@ #!/usr/bin/perl -T ## ## feedback generator for W3C Markup Validation Service -# # $Id: sendfeedback.pl,v 1.12 2009-06-29 14:37:08 ville Exp $ +# # $Id: sendfeedback.pl,v 1.13 2009-11-23 22:15:18 ville Exp $ ## Pragmas. use strict; use warnings; - ## Modules. See also the BEGIN block further down below. -use CGI qw(); +use CGI qw(); use File::Spec::Functions qw(catfile); -use HTML::Template 2.6 qw(); -use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 +use HTML::Template 2.6 qw(); +use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 use vars qw($DEBUG $CFG %RSRC $VERSION); + # Define global constants use constant TRUE => 1; use constant FALSE => 0; @@ -23,60 +23,63 @@ use constant FALSE => 0; # Things inside BEGIN don't happen on every request in persistent # environments, such as mod_perl. So let's do globals, eg. read config here. BEGIN { - # Launder data for -T; -AutoLaunder doesn't catch this one. - if (exists $ENV{W3C_VALIDATOR_HOME}) { - $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; - $ENV{W3C_VALIDATOR_HOME} = $1; - } - - # - # Read Config Files. - eval { - my %config_opts = ( - -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), - -MergeDuplicateOptions => TRUE, - -MergeDuplicateBlocks => TRUE, - -SplitPolicy => 'equalsign', - -UseApacheInclude => TRUE, - -IncludeRelative => TRUE, - -InterPolateVars => TRUE, - -AutoLaunder => TRUE, - -AutoTrue => TRUE, - -DefaultConfig => { - Paths => { - Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), - }, - }, - ); - my %cfg = Config::General->new(%config_opts)->getall(); - $CFG = \%cfg; - }; - if ($@) { - die <<".EOF."; + + # Launder data for -T; -AutoLaunder doesn't catch this one. + if (exists $ENV{W3C_VALIDATOR_HOME}) { + $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; + $ENV{W3C_VALIDATOR_HOME} = $1; + } + + # + # Read Config Files. + eval { + my %config_opts = ( + -ConfigFile => + ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), + -MergeDuplicateOptions => TRUE, + -MergeDuplicateBlocks => TRUE, + -SplitPolicy => 'equalsign', + -UseApacheInclude => TRUE, + -IncludeRelative => TRUE, + -InterPolateVars => TRUE, + -AutoLaunder => TRUE, + -AutoTrue => TRUE, + -DefaultConfig => { + Paths => { + Base => + ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), + }, + }, + ); + my %cfg = Config::General->new(%config_opts)->getall(); + $CFG = \%cfg; + }; + if ($@) { + die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. - } -} # end of BEGIN block. + } +} # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; -our $q = new CGI; -our $lang = 'en_US'; # @@@ TODO: conneg +our $q = new CGI; +our $lang = 'en_US'; # @@@ TODO: conneg # Read error message + explanations file %RSRC = Config::General->new( - -MergeDuplicateBlocks => 1, - -ConfigFile => catfile($CFG->{Paths}->{Templates}, $lang, - 'error_messages.cfg'), - )->getall(); + -MergeDuplicateBlocks => 1, + -ConfigFile => + catfile($CFG->{Paths}->{Templates}, $lang, 'error_messages.cfg'), +)->getall(); our $T = HTML::Template->new( - filename => catfile($CFG->{Paths}->{Templates}, $lang, 'feedback.tmpl'), - die_on_bad_params => FALSE, + filename => catfile($CFG->{Paths}->{Templates}, $lang, 'feedback.tmpl'), + die_on_bad_params => FALSE, ); our $errlist = ""; @@ -84,13 +87,15 @@ our $errmsg_text; our $validated_uri; our $errmsg_id; -sub process_query { +sub process_query +{ $validated_uri = $q->param('uri'); - $errmsg_id = $q->param('errmsg_id'); + $errmsg_id = $q->param('errmsg_id'); if ($errmsg_id) { $errmsg_text = "$RSRC{msg}->{$errmsg_id}->{original}"; $errmsg_text = de_template_explanation($errmsg_text); } + # Trigger "thanks for your message. If your query requires an answer,..." ack paragraph my $sent = $q->param('send'); if ($sent) { @@ -99,33 +104,42 @@ sub process_query { } } -sub send_message { -# sends message to www-validator list @@ TODO @@ +sub send_message +{ + + # sends message to www-validator list @@ TODO @@ } -sub error_choices { -# creates drop-down menu with all possible error messages to send feedback about - my @msgnumbers = keys( %{$RSRC{msg}} ); +sub error_choices +{ + + # creates drop-down menu with all possible error messages to send feedback about + my @msgnumbers = keys(%{$RSRC{msg}}); @msgnumbers = sort { $a <=> $b } @msgnumbers; my $errlabel; - for my $errnum ( @msgnumbers ) { + for my $errnum (@msgnumbers) { $errlabel = $RSRC{msg}->{$errnum}->{original}; $errlabel = de_template_explanation($errlabel); - if (length($errlabel) > 70) { $errlabel = substr($errlabel, 0, 67)."..." } - $errlist = $errlist.'<option value="'. $errnum.'"'; + if (length($errlabel) > 70) { + $errlabel = substr($errlabel, 0, 67) . "..."; + } + $errlist = $errlist . '<option value="' . $errnum . '"'; if ($errmsg_id) { - if ($errnum == $errmsg_id) { $errlist = $errlist.'selected="selected" '; } + if ($errnum == $errmsg_id) { + $errlist = $errlist . 'selected="selected" '; + } } - $errlist = $errlist."> $errnum $errlabel</option>\n"; + $errlist = $errlist . "> $errnum $errlabel</option>\n"; } } +sub de_template_explanation +{ -sub de_template_explanation { -# takes the error message template, and replace "template keywords" with real life keywords + # takes the error message template, and replace "template keywords" with real life keywords my $explanation = shift; - if ($explanation){ + if ($explanation) { $explanation =~ s/\%1/X/; $explanation =~ s/\%2/Y/; $explanation =~ s/\%3/Z/; @@ -136,24 +150,26 @@ sub de_template_explanation { return $explanation; } +sub prepare_error_message +{ -sub prepare_error_message { -# if the form sent contains errors (what kind exactly?) -# @@ TODO @@ + # if the form sent contains errors (what kind exactly?) + # @@ TODO @@ } -sub print_prefilled_form { +sub print_prefilled_form +{ $T->param(page_title => "Feedback"); - $T->param(uri => $validated_uri); - $T->param(errmsg_id => $errmsg_id); -# $T->param(errlist => $errlist); + $T->param(uri => $validated_uri); + $T->param(errmsg_id => $errmsg_id); + + # $T->param(errlist => $errlist); $T->param(explanation => $errmsg_text); print $T->output; } - - process_query; + #error_choices; print_prefilled_form; diff --git a/misc/bundle/Makefile.PL b/misc/bundle/Makefile.PL index 794a36c..5f6fdee 100644 --- a/misc/bundle/Makefile.PL +++ b/misc/bundle/Makefile.PL @@ -8,43 +8,45 @@ WriteMakefile( LICENSE => 'open_source', VERSION_FROM => 'lib/Bundle/W3C/Validator.pm', PREREQ_PM => { - # Hard dependencies: - CGI => 2.81, - CGI::Carp => 0, - Config::General => 2.32, - Encode => 0, - Encode::Alias => 0, - Encode::HanExtra => 0, - File::Spec::Functions => 0, - HTML::Encoding => 0.52, - HTML::Parser => 3.24, - HTML::Template => 2.6, - HTTP::Headers::Auth => 0, - HTTP::Headers::Util => 0, - HTTP::Message => 1.52, - HTTP::Negotiate => 0, - HTTP::Request => 0, - JSON => 2.00, - LWP::UserAgent => 2.032, - Net::hostent => 0, - Net::IP => 0, - SGML::Parser::OpenSP => 0.991, - Socket => 0, - URI => 0, - URI::Escape => 0, - XML::LibXML => 0, - # Optional: - Encode::JIS2K => 0, - HTML::Tidy => 0, - }, - depend => { distdir => 'ChangeLog' }, - dist => { TARFLAGS => '--owner=0 --group=0 -cvf' }, - clean => { FILES => 'ChangeLog.bak' }, + # Hard dependencies: + CGI => 2.81, + CGI::Carp => 0, + Config::General => 2.32, + Encode => 0, + Encode::Alias => 0, + Encode::HanExtra => 0, + File::Spec::Functions => 0, + HTML::Encoding => 0.52, + HTML::Parser => 3.24, + HTML::Template => 2.6, + HTTP::Headers::Auth => 0, + HTTP::Headers::Util => 0, + HTTP::Message => 1.52, + HTTP::Negotiate => 0, + HTTP::Request => 0, + JSON => 2.00, + LWP::UserAgent => 2.032, + Net::hostent => 0, + Net::IP => 0, + SGML::Parser::OpenSP => 0.991, + Socket => 0, + URI => 0, + URI::Escape => 0, + XML::LibXML => 0, + + # Optional: + Encode::JIS2K => 0, + HTML::Tidy => 0, + }, + depend => {distdir => 'ChangeLog'}, + dist => {TARFLAGS => '--owner=0 --group=0 -cvf'}, + clean => {FILES => 'ChangeLog.bak'}, ); -sub MY::postamble { - return <<'MAKE_FRAG'; +sub MY::postamble +{ + return <<'MAKE_FRAG'; ChangeLog: README lib/Bundle/W3C/Validator.pm t/00load.t cvs2cl --FSF --utc --prune \ --ignore ChangeLog --ignore cvsignore --ignore SIGNATURE \ diff --git a/misc/docs_errors.pl b/misc/docs_errors.pl index 28ae8e4..b6774c7 100755 --- a/misc/docs_errors.pl +++ b/misc/docs_errors.pl @@ -2,143 +2,148 @@ ## ## Generates HTML documentation of error messages and explanations ## for W3C Markup Validation Service -## $Id: docs_errors.pl,v 1.11 2009-06-29 14:37:08 ville Exp $ +## $Id: docs_errors.pl,v 1.12 2009-11-23 22:15:18 ville Exp $ ## Pragmas. use strict; use warnings; - ## Modules. See also the BEGIN block further down below. use File::Spec::Functions qw(catfile); -use HTML::Template 2.6 qw(); -use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 +use HTML::Template 2.6 qw(); +use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852 use vars qw($DEBUG $CFG $VERSION); + # Define global constants use constant TRUE => 1; use constant FALSE => 0; BEGIN { - # Launder data for -T; -AutoLaunder doesn't catch this one. - if (exists $ENV{W3C_VALIDATOR_HOME}) { - $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; - $ENV{W3C_VALIDATOR_HOME} = $1; - } - - # - # Read Config Files. - eval { - my %config_opts = ( - -ConfigFile => ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), - -MergeDuplicateOptions => TRUE, - -MergeDuplicateBlocks => TRUE, - -SplitPolicy => 'equalsign', - -UseApacheInclude => TRUE, - -IncludeRelative => TRUE, - -InterPolateVars => TRUE, - -AutoLaunder => TRUE, - -AutoTrue => TRUE, - -DefaultConfig => { - Paths => { - Base => ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), - }, - }, - ); - my %cfg = Config::General->new(%config_opts)->getall(); - $CFG = \%cfg; - }; - if ($@) { - die <<".EOF."; + + # Launder data for -T; -AutoLaunder doesn't catch this one. + if (exists $ENV{W3C_VALIDATOR_HOME}) { + $ENV{W3C_VALIDATOR_HOME} =~ /^(.*)$/; + $ENV{W3C_VALIDATOR_HOME} = $1; + } + + # + # Read Config Files. + eval { + my %config_opts = ( + -ConfigFile => + ($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'), + -MergeDuplicateOptions => TRUE, + -MergeDuplicateBlocks => TRUE, + -SplitPolicy => 'equalsign', + -UseApacheInclude => TRUE, + -IncludeRelative => TRUE, + -InterPolateVars => TRUE, + -AutoLaunder => TRUE, + -AutoTrue => TRUE, + -DefaultConfig => { + Paths => { + Base => + ($ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator'), + }, + }, + ); + my %cfg = Config::General->new(%config_opts)->getall(); + $CFG = \%cfg; + }; + if ($@) { + die <<".EOF."; Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable or copy conf/* to /etc/w3c/. Make sure that the configuration file and all included files are readable by the web server user. The error was:\n'$@' .EOF. - } -} # end of BEGIN block. + } +} # end of BEGIN block. # # Get rid of (possibly insecure) $PATH. delete $ENV{PATH}; -our $lang = 'en_US'; # @@@ TODO: conneg +our $lang = 'en_US'; # @@@ TODO: conneg # Read error message + explanations file -our $error_messages_file = catfile($CFG->{Paths}->{Templates}, $lang, 'error_messages.cfg'); -our %config_errs = ( -MergeDuplicateBlocks => 1, - -ConfigFile => $error_messages_file); +our $error_messages_file = + catfile($CFG->{Paths}->{Templates}, $lang, 'error_messages.cfg'); +our %config_errs = ( + -MergeDuplicateBlocks => 1, + -ConfigFile => $error_messages_file +); our %rsrc = Config::General->new(%config_errs)->getall(); - our $T = HTML::Template->new( - filename => catfile($CFG->{Paths}->{Templates}, $lang, 'docs_errors.tmpl'), - die_on_bad_params => FALSE, + filename => catfile($CFG->{Paths}->{Templates}, $lang, 'docs_errors.tmpl'), + die_on_bad_params => FALSE, ); $T->param(list_errors_hasverbose => &list_errors_hasverbose(\%rsrc)); -$T->param(list_errors_noverbose => &list_errors_noverbose(\%rsrc)); +$T->param(list_errors_noverbose => &list_errors_noverbose(\%rsrc)); print $T->output; -sub list_errors_hasverbose{ - my $rsrc = shift; +sub list_errors_hasverbose +{ + my $rsrc = shift; my $errors = []; my $error_id; - my $max_error_id=500; # where to stop - for ($error_id=0;$error_id<$max_error_id;$error_id++) - { - my %single_error; - if ($rsrc->{msg}->{$error_id}) - { - my $verbose = $rsrc->{msg}->{$error_id}->{verbose}; - if ($verbose) - { - my $original = $rsrc->{msg}->{$error_id}->{original}; - $original = &de_template_explanation($original); - $single_error{original} = $original; - $single_error{id} = $error_id; - $single_error{verbose} = $rsrc->{msg}->{$error_id}->{verbose}; - $single_error{verbose} =~ s/<!--CFG_HOME_PAGE-->/$CFG->{'Home Page'}/g; - - push @{$errors}, \%single_error; + my $max_error_id = 500; # where to stop + for ($error_id = 0; $error_id < $max_error_id; $error_id++) { + my %single_error; + if ($rsrc->{msg}->{$error_id}) { + my $verbose = $rsrc->{msg}->{$error_id}->{verbose}; + if ($verbose) { + my $original = $rsrc->{msg}->{$error_id}->{original}; + $original = &de_template_explanation($original); + $single_error{original} = $original; + $single_error{id} = $error_id; + $single_error{verbose} = $rsrc->{msg}->{$error_id}->{verbose}; + $single_error{verbose} =~ + s/<!--CFG_HOME_PAGE-->/$CFG->{'Home Page'}/g; + + push @{$errors}, \%single_error; + # Fix up relative paths (/check vs /docs/errors.html) s/href="docs\//href="/ for $single_error{original}, $single_error{verbose}; - } + } - } + } } - return $errors; + return $errors; } -sub list_errors_noverbose{ - my $rsrc = shift; +sub list_errors_noverbose +{ + my $rsrc = shift; my $errors = []; my $error_id; - my $max_error_id=500; # where to stop - for ($error_id=0;$error_id<$max_error_id;$error_id++) - { - my %single_error; - if ($rsrc->{msg}->{$error_id}) - { - my $verbose = $rsrc->{msg}->{$error_id}->{verbose}; - if (! $verbose) - { - my $original = $rsrc->{msg}->{$error_id}->{original}; - $original = &de_template_explanation($original); - $single_error{original} = $original; - $single_error{id} = $error_id; - $single_error{verbose} = $rsrc->{msg}->{$error_id}->{verbose}; - push @{$errors}, \%single_error; - } - - } + my $max_error_id = 500; # where to stop + for ($error_id = 0; $error_id < $max_error_id; $error_id++) { + my %single_error; + if ($rsrc->{msg}->{$error_id}) { + my $verbose = $rsrc->{msg}->{$error_id}->{verbose}; + if (!$verbose) { + my $original = $rsrc->{msg}->{$error_id}->{original}; + $original = &de_template_explanation($original); + $single_error{original} = $original; + $single_error{id} = $error_id; + $single_error{verbose} = $rsrc->{msg}->{$error_id}->{verbose}; + push @{$errors}, \%single_error; + } + + } } - return $errors; + return $errors; } -sub de_template_explanation { -# takes the error message template, and replace "template keywords" with real life keywords +sub de_template_explanation +{ + + # takes the error message template, and replace "template keywords" with real life keywords my $explanation = shift; if ($explanation) { $explanation =~ s/\%1/X/; diff --git a/misc/spmpp.pl b/misc/spmpp.pl index c6a153c..ff364fc 100755 --- a/misc/spmpp.pl +++ b/misc/spmpp.pl @@ -4,7 +4,7 @@ # for use in the Validator, from an OpenSP ParserMessages.rc. # (spmpp = "SP Message Pre-Processor") # -# $Id: spmpp.pl,v 1.2 2004-05-09 15:56:55 link Exp $ +# $Id: spmpp.pl,v 1.3 2009-11-23 22:15:18 ville Exp $ # # @@ -23,14 +23,14 @@ my @msg; # # Snarf OpenSP's ParserMessages.rc and populate @msg. my $msgfile = $ARGV[0] || "/usr/local/validator/htdocs/config/verbosemsg.rc"; -open FH, $msgfile - or die "Can't open OpenSP ParserMessages file '$msgfile': $!"; +open FH, $msgfile or + die "Can't open OpenSP ParserMessages file '$msgfile': $!"; while (<FH>) { - next if /^\s*$/; - my($id, $s) = split /, /, $_, 2; - $id += 0; # Force numerical (kill leading space)... - chomp $s; # Strip newline from end of message... - push @msg, [$id, $s]; + next if /^\s*$/; + my ($id, $s) = split /, /, $_, 2; + $id += 0; # Force numerical (kill leading space)... + chomp $s; # Strip newline from end of message... + push @msg, [$id, $s]; } close FH; @@ -53,7 +53,7 @@ print <<".EOF."; # the last digit of the "muid" is replaced at runtime). # for (@msg) { - print <<"_.EOF._"; + print <<"_.EOF._"; <msg $_->[0]> original = $_->[1] verbose <<.EOF. |