#!/usr/local/bin/perl -w # # W3C HTML Validation Service # A CGI script to retrieve and validate an HTML file # # Copyright 1995-2001 Gerald Oskoboiny # for additional contributors, see http://dev.w3.org/cvsweb/validator/ # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # $Id: check,v 1.185 2001-09-20 18:22:43 link Exp $ # # We need Perl 5.004. require 5.004; # # Load modules use strict; use LWP::UserAgent; use URI; use URI::Escape; use CGI::Carp; use CGI qw(:cgi -newstyle_urls -private_tempfiles); use Text::Wrap; use Text::Iconv; use HTML::Parser 3.25; # Need 3.25 for $p->ignore_elements. ############################################################################# # Constant definitions ############################################################################# # # Define global constants use constant TRUE => 1; use constant FALSE => 0; use constant UNDEF => undef; use constant DEBUG => 0; # # Tentative Validation Severeties. use constant T_DEBUG => 1; # 0000 0001 use constant T_INFO => 2; # 0000 0010 use constant T_WARN => 4; # 0000 0100 use constant T_ERROR => 8; # 0000 1000 use constant T_FATAL => 16; # 0001 0000 use constant T_CHARSET_KLUDGE => 128; # 1000 0000 # # Define global variables use vars qw($VERSION $DATE $MAINTAINER $NOTICE); # Strings. use vars qw($frag $pub_ids $element_uri $file_type $doctypes $charsets); # Cfg hashes. use vars qw($DEBUG); # Switch to turn debugging on and off. use vars qw($File); # Global var to hold all metadata for this validation. $DEBUG += 1 unless $ENV{SERVER_PORT} == 80; # # Paths and file locations my $base_path = $0; # get full name of program $base_path =~ s#^(/([^/]*/)*)[^/]*/[^/]*/[^/]*$#$1#; # move two levels up # backtracking is somewhat inefficient, but nothing to worry my $html_path = $base_path . 'htdocs/'; my $elem_db = $html_path . 'config/eref.cfg'; my $fpis_db = $html_path . 'config/fpis.cfg'; my $frag_db = $html_path . 'config/frag.cfg'; my $type_db = $html_path . 'config/type.cfg'; my $dtds_db = $html_path . 'config/doctypes.cfg'; my $chst_db = $html_path . 'config/charset.cfg'; my $sgmlstuff = $html_path . 'sgml-lib'; my $temp = "/tmp/validate.$$"; # @@ Use POSIX/IO::File tmpfiles instead! # # Executables and binaries my $sp = '/usr/local/bin/lq-nsgmls'; my $osp = '/usr/local/bin/onsgmls'; # # URIs and fragments my $abs_svc_uri = 'http://validator.w3.org/'; my $uri_def_uri = 'http://www.w3.org/Addressing/#terms'; my $faqloc = '/docs/'; my $faqerrloc = $faqloc . 'errors.html'; my $element_ref = 'http://www.htmlhelp.com/reference/html40/'; # # Strings $VERSION = q$Revision: 1.185 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; $DATE = q$Date: 2001-09-20 18:22:43 $; $MAINTAINER = 'gerald@w3.org'; $NOTICE = ''; # "

Note: This service will be ..."; # # DOCTYPEs my $html32_doctype = q(); my $html40s_doctype = q(); my $html40t_doctype = q(); my $html40f_doctype = q(); my $xhtmlt_doctype = q( plain text version string $pub_ids = &read_cfg($fpis_db); # Errors -> fragment identifier $element_uri = &read_cfg($elem_db); # Element -> URI fragment $file_type = &read_cfg($type_db); # Content -> File -type $doctypes = &read_cfg($dtds_db); # Name -> doctype $charsets = &read_cfg($chst_db); # charset -> iconv parameter # # Set up signal handlers. $SIG{TERM} = \&erase_stuff; $SIG{KILL} = \&erase_stuff; $SIG{PIPE} = 'IGNORE'; # # delete() the, possibly tainted, $PATH. delete $ENV{PATH}; ############################################################################# # Process CGI variables ############################################################################# # # Create a new CGI object. my $q = new CGI; # # Backwards compatibility; see # http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0197 # http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0212 if (scalar $q->param) { foreach my $param ($q->param) { $q->param($param, TRUE) unless $q->param($param); } } # # Futz the URI so "/referer" works. if ($q->path_info eq '/referer') { $q->param('uri', $q->referer); } # # USe HTTP Referer if uri=referer. if ($q->param('uri') =~ m(referer)i) { $q->param('uri', $q->referer); } # # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # # Supercede URI with an uploaded file. if ($q->param('uploaded_file')) { &redirect_to_home_page unless length($q->param('uploaded_file')); # Must have filename. $q->param('uri', 'upload://' . $q->param('uploaded_file')); } # # Supercede URI with an uploaded fragment. if ($q->param('fragment')) {$q->param('uri', 'upload://Form Submission')}; # # Send them to the homepage unless we can extract a URI from either of the # acceptable sources: uri, url or /referer. &redirect_to_home_page unless length($q->param('uri')) > 5; # # Munge the URI to include commonly omitted prefix. $q->param('uri', 'http://' . $q->param('uri')) if $q->param('uri') =~ m(^www)i; ############################################################################# # Output validation results ############################################################################# # # A string containing the HTML header for validation results. # We save it in a string instead of printing it in case we need to abort before # we have any meaningfull results to report. @@ May not be necessary! $File->{Results} = <<"EOF"; Content-Type: text/html; charset=utf-8 $html40t_doctype W3C HTML Validation Service Results

HTML Validation Service Results

$NOTICE EOF # # Punt if we don't recognize this URI scheme. # @@ LWP does a whole bunch more: transparently! unless ($q->param('uri') =~ m(^(http|upload)://)) { print $File->{Results}; print <<"EOF";

Sorry, this type of URI is not supported by this service.

URIs should be in the form:

$abs_svc_uri

(There are other types of URIs, too, but only http:// URIs are currently supported by this service.)

EOF &clean_up_and_exit; } # # Get the file and metadata. if ($q->param('uploaded_file')) {$File = &handle_file($q, $File)} elsif ($q->param('fragment')) {$File = &handle_frag($q, $File)} elsif ($q->param('uri')) {$File = &handle_uri( $q, $File)}; # # Abort if there was no document type mapping for this Content-Type, in which # case the document type will be equal to the content type (contains a "/"). if ($File->{'Type'} =~ m(/) and not $q->param('uploaded_file')) { print $File->{Results}; print <<"EOF";

Sorry, I am unable to validate this document because its returned content-type was $File->{Type}, which is not currently supported by this service.

EOF &clean_up_and_exit; } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element # 4. if there is an xmlns= attribute, check for XML well-formedness # 5. if there is no xmlns= attribute, and no DOCTYPE, punt. # # # Detect and remove a UTF-8 BOM. $File->{Content}[0] =~ s/^\xEF\xBB\xBF// and &add_warning(<<".EOF."); Note: UTF-8 'BOM' detected and removed. (This message is informational. See the explanation for details.) .EOF. # # Override DOCTYPE if user asked for it. if (defined $q->param('doctype') and not $q->param('doctype') =~ /(Inline|detect)/i) { $File->{Content} = &supress_doctype($File->{Content}); unshift @{$File->{Content}}, $doctypes->{$q->param('doctype')}; my $dtd = ent($q->param('doctype')); &add_warning(<<".EOF."); DOCTYPE Override in effect! Any DOCTYPE Declaration in the document has been supressed and the DOCTYPE for «$dtd» inserted instead. The document will not be Valid until you alter the source file to reflect this new DOCTYPE. .EOF. $File->{Tentative} |= T_ERROR; # Tag it as Invalid. } # # Try to extract a DOCTYPE or xmlns. $File = &preparse($File); # # Set document type to XHTML if the DOCTYPE was for XHTML. # Set document type to MathML if the DOCTYPE was for MathML. # This happens when the file is served as text/html $File->{Type} = 'xhtml' if $File->{DOCTYPE} =~ /xhtml/i; $File->{Type} = 'mathml' if $File->{DOCTYPE} =~ /mathml/i; # # Figure out which charset was detected. if ($File->{HTTP_Charset}) {$File->{Charset} = $File->{HTTP_Charset}} elsif ($File->{XML_Charset}) {$File->{Charset} = $File->{XML_Charset}} elsif ($File->{META_Charset}) {$File->{Charset} = $File->{META_Charset}} else { $File->{Charset} = 'unknown'} # If we have a charset field in the request, we use it if ($q->param('charset') and $q->param('charset') ne 'unknown' and $q->param('charset') ne '(detect automatically)') { $q->param('charset') =~ /^([^ ]*)/; $File->{Use_Charset} = lc $1; } else { $File->{Use_Charset} = $File->{Charset}; } # # Setup SP environment for the charset. if ($File->{Use_Charset} ne 'unknown') { $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'utf-8'; } # # Print header and jump links. print $File->{Results}, &build_jump_links; # # Print different things if we got redirected or had a file upload. if (URI::eq("$File->{URI}", $q->param('uri'))) { # @@ Need to stringify here? &add_table(qq(Address), '' . ' [Go to...]'); } elsif ($q->param('uploaded_file')) { &add_table("File", $File->{URI}); } else { &add_table(qq(URI), '' . ' [Go to URI]'); &add_warning('Note: The URI you gave me, «

' .
	       $q->param('uri') . '

», ' . 'returned a redirect to ' . '«' . $File->{URI} . '».'); } &add_table("Last Modified", $File->{Modified}) if $File->{Modified}; &add_table("Server", $File->{Server}) if $File->{Server}; &add_table("Content Length", $File->{Size}) if $File->{Size}; &add_table("Detected Character Encoding", "$File->{Charset}"); &add_table("Used Character Encoding", "$File->{Use_Charset}") unless $File->{Charset} eq $File->{Use_Charset}; unless ($q->param('uploaded_file')) { &add_table(qq(Select Character Encoding), $q->popup_menu(-name => 'charset', -id => 'charset', -values => [ '(detect automatically)', 'utf-8 (Unicode, worldwide)', 'iso-8859-1 (Western Europe)', 'iso-8859-2 (Central Europe)', 'iso-8859-3 (Maltese)', 'iso-8859-4 (Baltic Rim)', 'iso-8859-5 (Cyrillic)', 'iso-8859-6-i (Arabic)', 'iso-8859-7 (Greek)', 'iso-8859-8-i (Hebrew)', 'iso-8859-9 (Turkish)', 'iso-8859-10 (Latin 6)', 'iso-8859-13 (Latin 7)', 'iso-8859-14 (Celtic)', 'iso-8859-15 (Latin 9)', 'us-ascii (basic English)', 'euc-jp (Japanese, Unix)', 'shift_jis (Japanese, Win/Mac)', 'iso-2022-jp (Japanese, email)', 'euc-kr (Korean)', 'gb2312 (Chinese, simplified)', 'big5 (Chinese, traditional)', 'tis-620 (Thai)', 'koi8-r (Russian)', 'koi8-u (Ukrainian)', 'macintosh (MacRoman)', 'windows-1250 (Central Europe)', 'windows-1251 (Cyrillic)', 'windows-1252 (Western Europe)', 'windows-1253 (Greek)', 'windows-1254 (Turkish)', 'windows-1255 (Hebrew)', 'windows-1256 (Arabic)', 'windows-1257 (Baltic Rim)', ], ) ); } if ($File->{HTTP_Charset} ne $File->{META_Charset} and $File->{HTTP_Charset} ne '' and $File->{META_Charset} ne '' and $File->{Charset} ne 'unknown') { &add_warning(<<"EOHD"); Character Encoding mismatch! The character encoding specified in the HTTP header («$File->{HTTP_Charset}») is different from the one specified in the META element («$File->{META_Charset}»). I will use «$File->{Charset}» for this validation. EOHD } elsif ($File->{HTTP_Charset} ne $File->{XML_Charset} and $File->{HTTP_Charset} ne '' and $File->{XML_Charset} ne '' and $File->{Charset} ne 'unknown') { &add_warning(<<"EOHD"); Character Encoding mismatch! The character encoding specified in the HTTP header («$File->{HTTP_Charset}») is different from the one specified in the XML declaration («$File->{XML_Charset}»). I will use «$File->{Charset}» for this validation. EOHD } if ($File->{Use_Charset} ne $File->{Charset}) { &add_warning(<<"EOHD"); Character Encoding Override in effect! The detected character encoding («$File->{Charset}») has been supressed and the character encoding («$File->{Use_Charset}») used instead. EOHD $File->{Tentative} |= T_ERROR; } if ($File->{Use_Charset} eq 'unknown') { &add_warning(<<"EOHD"); No Character Encoding detected! To assure correct validation, processing, and display, it is important that the character encoding is properly labeled. Further explanations. EOHD $File->{Tentative} |= T_DEBUG; # WOuld be T_WARN, but the complaints... } { # block for character conversion and checking my @lines; unless ($File->{Use_Charset} eq 'utf-8' or $File->{Use_Charset} eq 'unknown') { my ($command, $result_charset) = split " ", $charsets->{$File->{Use_Charset}}, 2; if ($command eq 'I') { eval {my $c = Text::Iconv->new($result_charset, 'utf-8')}; $command = '' if ($@); } elsif ($command eq 'X') { $@ = "$File->{Use_Charset} undefined; replace by $result_charset"; } if ($command ne 'I') { &print_table; &print_warnings; &print_charset_error($@, $File->{Use_Charset}); &clean_up_and_exit; } my $c = Text::Iconv->new($result_charset, 'utf-8'); my $line = 0; for (@{$File->{Content}}) { my $in = $_; $line++; $_ = $c->convert($_); # $_ is local!! push @lines, $line if ($in ne "" and $_ eq ""); } } # check correctness of UTF-8 both for UTF-8 input and for conversion results unless ($File->{Use_Charset} eq 'unknown') { for (my $i = 0; $i < $#{$File->{Content}}; $i++) { # substitution needed for very long lines (>32K), # to avoid backtrack stack overflow local $_ = $File->{Content}->[$i]; s/ [\x00-\x7F] # ASCII | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3 | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15 | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 //xg; push @lines, $i if length; } } if(@lines) { my $lines = $#lines ? "lines " : "line "; $lines .= join ", ", @lines; &print_table; &print_warnings; print <<"EOF";

Sorry, I am unable to validate this document because on $lines it contained some byte(s) that I cannot interpret as $File->{Use_Charset}. Please check both the content of the file and the character encoding indication.

EOF &clean_up_and_exit; } } my $xmlflags = '-wnon-sgml-char-ref'; my $catalog = $sgmlstuff . '/catalog'; if ($File->{Type} eq 'xhtml') { $catalog = $sgmlstuff . '/xhtml.soc'; $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'UTF-8'; $xmlflags = '-wxml'; } elsif ($File->{Type} eq 'svg') { $catalog = $sgmlstuff . '/svg.soc'; $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'UTF-8'; $xmlflags = '-wxml'; } elsif ($File->{Type} eq 'smil') { $catalog = $sgmlstuff . '/smil.soc'; $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'UTF-8'; $xmlflags = '-wxml'; } elsif ($File->{Type} eq 'mathml') { $catalog = $sgmlstuff . '/mathml.soc'; $ENV{SP_CHARSET_FIXED} = 'NO'; $ENV{SP_ENCODING} = 'XML'; $xmlflags = '-wxml '; $sp = $osp; # *WARNING* *WARNING* Danger Will Robinson! :-) } elsif ($File->{Type} eq 'xml' or $File->{Namespace}) { # no doctype, with xmlns attr on 1st element $File->{Type} = 'xml'; # @@ probably a better way to do this $catalog = $sgmlstuff . '/sp-1.3/pubtext/xml.soc'; $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'XML'; $xmlflags = '-wxml'; $xmlflags .= ' -wno-valid' unless $File->{DOCTYPE}; } my $command = "$sp -f$temp -E0 $xmlflags -c $catalog"; &add_table("nsgmls command line", "$command") if $DEBUG; open CHECKER, "|$command - >$temp.esis" or &internal_error ("open(|$command - >$temp.esis) returned: $!"); for (@{$File->{Content}}) {print CHECKER $_, "\n"}; close CHECKER; $File = &parse_errors($File, $temp); # Parse error output. $File->{ESIS} = []; my $elements_found = 0; open ESIS, "$temp.esis" or &internal_error("open($temp.esis) returned: $!"); while () { push @{$File->{'DEBUG'}->{ESIS}}, $_; $elements_found++ if /^\(/; if ($File->{Type} eq 'xml' or $File->{Type} eq 'xhtml') { if (/^Axmlns() \w+ (.*)/ or /^Axmlns:([^ ]+) \w+ (.*)/) { if (not $File->{Namespace} and $elements_found == 0 and $1 eq "") { $File->{Namespace} = $2; } $File->{Namespaces}->{$2}++; } } next if / IMPLIED$/; next if /^ASDAFORM CDATA /; next if /^ASDAPREF CDATA /; chomp; # Removes trailing newlines push @{$File->{ESIS}}, $_; } close ESIS or warn "close($temp.esis) returned: $!"; my $fpi; my $version = 'unknown'; if ($File->{Type} eq 'xhtml' or $File->{Type} eq 'mathml' or $File->{Type} eq 'svg' or $File->{Type} eq 'smil') { $fpi = $File->{DOCTYPE}; } elsif ($File->{Type} eq 'xml') { $fpi = 'XML'; } else { for (@{$File->{ESIS}}) { next unless /^AVERSION CDATA (.*)/; $fpi = $1; last; } # Needed for HTML4 Strict, which has no version attribute on the HTML element if (length $File->{DOCTYPE} and not defined $fpi) {$fpi = $File->{DOCTYPE}}; } $version = $pub_ids->{$fpi} || 'unknown'; if ($File->{Type} eq 'xml' or $File->{Type} eq 'xhtml') { &add_table("Document Type", $version); if ($File->{Type} eq 'xhtml' and $File->{Namespace} ne 'http://www.w3.org/1999/xhtml') { &add_warning ("Unknown namespace («$File->{Namespace}») for text/html document!"); if ($File->{Namespace} ne '') { &add_table("Root Namespace", "$File->{Namespace}"); } } elsif ($File->{Type} eq 'svg' and $File->{Namespace} ne 'http://www.w3.org/2000/svg') { &add_warning ("Unknown namespace («$File->{Namespace}») for SVG document!"); if ($File->{Namespace} ne '') { &add_table("Root Namespace", "$File->{Namespace}"); } } else { if ($File->{Namespace} ne '') { &add_table("Root Namespace", "$File->{Namespace}"); } } if (scalar keys %{$File->{Namespaces}} > 1) { my $namespaces = "

"); } } else { &add_table("Current Doctype", $version); # is this current or detected??? } unless ($q->param('uploaded_file')) { &add_table(qq(Select Doctype), $q->popup_menu(-name => 'doctype', -id => 'doctype', -values => [ '(detect automatically)', 'XHTML 1.0 Strict', 'XHTML 1.0 Transitional', 'XHTML 1.0 Frameset', 'HTML 4.01 Strict', 'HTML 4.01 Transitional', 'HTML 4.01 Frameset', 'HTML 3.2', 'HTML 2.0', ], -default => $q->param('doctype'), ) ); } &print_table; &print_warnings; print "

Validation Results

\n"; if ($File->{Type} eq 'xml' or $File->{Type} eq 'xhtml' or $File->{Type} eq 'mathml' or $File->{Type} eq 'svg' or $File->{Type} eq 'smil') { my $xmlvalid = ($File->{DOCTYPE} ? ' and validity' : ''); print <<"EOHD";

Below are the results of checking this document for XML well-formedness$xmlvalid.

EOHD } else { print <<"EOHD";

Below are the results of attempting to parse this document with an SGML parser.

EOHD } if (defined $File->{Tentative}) { my $class = ''; $class .= ($File->{Tentative} & T_INFO ? ' info' :''); $class .= ($File->{Tentative} & T_WARN ? ' warning' :''); $class .= ($File->{Tentative} & T_ERROR ? ' error' :''); $class .= ($File->{Tentative} & T_FATAL ? ' fatal' :''); unless ($File->{Tentative} == T_DEBUG) { print <<".EOF.";

Please note that you have chosen one or more options that alter the content of the document before validation, or have not provided enough information to accurately validate the document. Even if no errors are reported below, the document will not be valid until you manually make the changes we have performed automatically. Specifically, if you used some of the options that override a property of the document (e.g. the DOCTYPE or Character Encoding), you must make the same change to the source document or the server setup before it can be valid. You will also need to insert an appropriate DOCTYPE Declaration or Character Encoding (the "charset" parameter for the Content-Type HTTP header) if any of those are missing.

.EOF. } } if (scalar @{$File->{Errors}}) { $q->param('ss', TRUE); &report_errors($File) } else { &report_valid($File) } &weblint() if $q->param('weblint'); &outline($File) if $q->param('outline'); &show_source($File) if $q->param('ss'); &parse_tree($File) if $q->param('sp'); &show_esis($File) if $q->param('esis'); &show_errors($File) if $q->param('errors'); &clean_up_and_exit; ############################################################################# # Subroutine definitions ############################################################################# # # Add info to the metadata table datastructure. sub add_table {push @{$File->{Table}}, { Head => $_[0], Tail => $_[1]}}; # # Print the table containing the metadata about the Document Entity. sub print_table { my $tableEntry; unless ($q->param('uploaded_file')) { add_table("Options", ' param('ss') ? 'checked="checked"' : '') . " />Show Source\n" . ' param('outline') ? 'checked="checked"' : '') . " />Outline\n" . ' param('sp') ? 'checked="checked"' : '') . " />Parse Tree\n" . ' param('noatt') ? 'checked="checked"' : '') . " />...no attributes\n" ); } print "

\n"; for $tableEntry (@{$File->{Table}}) { print " \n"; print ' ' x 6, "\n"; print ' ' x 6, "\n"; print " \n"; } print " \n"; print "

", $$tableEntry{Head}, ":	", $$tableEntry{Tail}, "

\n"; } # # Add a waring message to the output. sub add_warning {push @{$File->{Warnings}}, shift}; # # Print out a list of warnings. sub print_warnings { return unless defined @{$File->{Warnings}}; print qq(

Notes & Warnings

\n"; } # # Print HTML explaining why/how to use a DOCTYPE Declaration. sub output_doctype_spiel { print <<"EOF";

You should make the first line of your HTML document a DOCTYPE declaration, for example, for a typical HTML 4.01 document:

      <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
      <HTML>
	<HEAD>
	  <TITLE>Title</TITLE>
	</HEAD>

	<BODY>
	  <-- ... body of document ... -->
	</BODY>
      </HTML>

EOF } # # Spit out some closing HTML at the end of output. sub output_closing { print <<"EOF";

Gerald Oskoboiny
Last modified: $DATE

EOF } # # Leave a message and then die (use for internal errors only) sub internal_error { my ($dieMessage) = shift; print <<"EOF";

Internal server error ($dieMessage). Please contact maintainer. EOF &output_closing; &erase_stuff; die "$dieMessage\n"; } # # Delete temporary files. sub erase_stuff { unlink $temp or warn "unlink($temp) returned: $!\n"; unlink "$temp.esis" or warn "unlink($temp.esis) returned: $!\n"; unlink "$temp.weblint"; } # # Clean up and exit... :-) sub clean_up_and_exit { &output_closing; &erase_stuff; exit; } # # Redirect them to the home page. Simplistic error handling. sub redirect_to_home_page { print <<".EOF."; Status: 301 Moved Permanently Content-Type: text/html Location: $abs_svc_uri Moved!

Please see the home page.

.EOF. &clean_up_and_exit; } # # Generate HTML for the "Jump to:" links in results. sub build_jump_links { my $text = ''; my $count = 0; $count++ if $q->param('ss'); $count++ if $q->param('sp'); $count++ if $q->param('weblint'); $count++ if $q->param('outline'); if ( $count ) { $text .= "

\n Jump to: "; if ( $q->param('weblint') ) { $text .= "Weblint Results"; $count--; $text .= " or " if ( $count == 1 ); $text .= ", " if ( $count > 1 ); } if ( $q->param('outline') ) { $text .= "Outline"; $count--; $text .= " or " if ( $count == 1 ); $text .= ", " if ( $count > 1 ); } if ( $q->param('ss') ) { $text .= "Source Listing"; $count--; $text .= " or " if ( $count == 1 ); $text .= ", " if ( $count > 1 ); } if ( $q->param('sp') ) { $text .= "Parse Tree"; } $text .= ".\n

\n\n"; } return $text; } # # Proxy authentication requests. sub authenticate { my $resource = shift; my $authHeader = shift; my $realm = $resource; $realm =~ s([^\w\d.-]*){}g; $authHeader =~ s( realm=([\'\"])?([^\1]+)\1){ realm="$realm-$2"}; print <<"EOF"; Status: 401 Authorization Required WWW-Authenticate: $authHeader Connection: close Content-Type: text/html 401 Authorization Required

Authorization Required

Sorry, I am not authorized to access the specified URI.

The URI you specified,

$resource

returned a 401 "authorization required" response when I tried to download it.

You should have been prompted by your browser for a username/password pair; if you had supplied this information, I would have forwarded it to your server for authorization to access the resource. You can use your browser's "reload" function to try again, if you wish.

Of course, you may not want to trust me with this information, which is fine. I can tell you that I don't log it or do anything else nasty with it, and you can download the source for this service to see what it does, but you have no guarantee that this is actually the code I'm using; you basically have to decide whether to trust me or not. :-)

You should also be aware that the way we proxy this authentication information defeats the normal working of HTTP Authentication. If you authenticate to server A, your browser may keep sending the authentication information to us every time you validate a page, regardless of what server it's on, and we'll happily pass that on to the server thereby making it possible for a malicious server operator to capture your credentials.

Due to the way HTTP Authentication works there is no way we can avoid this. We are using some "tricks" to fool your client into not sending this information in the first place, but there is no guarantee this will work. If security is a conern to you, you may wish to avoid validating protected resources or take extra precautions to prevent your browser from sending authentication information when validating other servers.

Also note that you shouldn't use HTTP Basic Authentication for anything which really needs to be private, since the password goes across the network unencrypted.

EOF } # # Complain about unknown HTTP responses. sub print_unknown_http_error_message { my $uri = shift; my $code = shift; my $message = shift; print <<"EOF";

I got the following unexpected response when trying to retrieve $uri:

$code $message

Please make sure you have entered the URI correctly.

EOF } # # Complain about strange charsets. sub print_charset_error { my $error = shift; my $charset = shift; print <<".EOF.";

Sorry! A fatal error occurred when attempting to transcode the character encoding of the document. Either we do not support this character encoding yet, or you have specified a non-existent character encoding (often a misspelling).

The detected character encoding was "$charset".

The error was "$error".

If you believe the character encoding to be valid you can submit a request for that character encoding (see the feedback page for details) and we will look into supporting it in the future.

.EOF. } # # Print blurb advocating using the CSS Validator. sub output_css_validator_blurb { my $uri = shift; $uri = ent($uri); print <<"EOHD";

If you use CSS in your document, you should also check it for validity using the W3C CSS Validation Service.

EOHD } # # Read TAB-delimited configuration files. Returns a hash reference. sub read_cfg { my $file = shift; my %cfg; open CFG, $file or die "open($file) returned: $!\n"; while () { next if /^\s*$/; next if /^\s*#/; chomp; my($k, $v) = split /\t+/, $_; $cfg{$k} = $v; } close CFG; return \%cfg; } # # Fetch an URI and return the content and selected meta-info. sub handle_uri { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $uri = $q->param('uri'); # The URI to fetch. my $ua = new LWP::UserAgent; $ua->agent("W3C_Validator/$VERSION " . $ua->agent); $ua->parse_head(0); # Parse the http-equiv stuff ourselves. @@ Why? my $req = new HTTP::Request(GET => $uri); # If we got a Authorization header, the client is back at it after being # prompted for a password so we insert the header as is in the request. if($ENV{HTTP_AUTHORIZATION}){ $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } my $res = $ua->request($req); unless ($res->code == 200) { if ($res->code == 401) { &authenticate($res->request->url, $res->www_authenticate); } else { print $File->{Results}; &print_unknown_http_error_message($uri, $res->code, $res->message); } &clean_up_and_exit; } my($type, $charset) = &parse_content_type($res->header('Content-Type')); my $lastmod = undef; if ( $res->last_modified ) { $lastmod = scalar(gmtime($res->last_modified)); } $File->{Content} = &normalize_newlines($res->content); $File->{Type} = $type; $File->{HTTP_Charset} = $charset; $File->{Modified} = $lastmod; $File->{Server} = scalar($res->server); $File->{Size} = scalar($res->content_length); $File->{URI} = scalar($res->request->url); return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_file { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. my $f = $q->param('uploaded_file'); my $h = $q->uploadInfo($f); my $file; while (not eof $f) {$file .= <$f>}; my($type, $charset) = &parse_content_type($h->{'Content-Type'}); $File->{Content} = &normalize_newlines($file); $File->{Type} = $type; $File->{HTTP_Charset} = $charset; $File->{Modified} = $h->{'Last-Modified'}; $File->{Server} = $h->{'Server'}; $File->{Size} = $h->{'Content-Length'}; $File->{URI} = $q->param('uploaded_file'); return $File; } # # Handle uploaded file and return the content and selected meta-info. sub handle_frag { my $q = shift; # The CGI object. my $File = shift; # The master datastructure. $File->{Content} = &normalize_newlines(shift->param('fragment')); $File->{Type} = 'html'; $File->{HTTP_Charset} = ''; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; $File->{URI} = 'upload://Form Submission'; return $File; } # # Parse a Content-Type and parameters. Return document type and charset. sub parse_content_type { my $Content_Type = shift; my $charset = ''; my $type = ''; my($ct, @param) = split /\s*;\s*/, lc $Content_Type; $type = $file_type->{$ct} || $ct; foreach my $param (@param) { my($p, $v) = split /\s*=\s*/, $param; next unless $p =~ m(charset)i; if ($v =~ m/([\'\"]?)(\S+)\1/i) { $charset = lc $2; last; } } return $type, $charset; } # # Normalize newline forms (CRLF/CR/LF) to native newline. sub normalize_newlines { my $file = shift; $file =~ s(\015\012?|\012){\n}g; # Turn ASCII CRLF into native newline. return [split /\n/, $file]; } # # Return $_[0] encoded for HTML entities (cribbed from merlyn). sub ent { local $_ = shift; s(["<&>"]){'&#' . ord($&) . ';'}ge; return $_; } # # Truncate source lines for report. sub truncate_line { my $line = shift; my $col = shift; if (length $line > 70) { if ($col < 25) { # Truncate at 70 chars and right side only. $line = substr($line, 0, 70) . " ..."; } elsif ($col > 70) { # Keep rightmost 70 chars; left side only. my $diff = $col - 50; $line = "... " . substr($line, $diff, 70); if (length $line == 70 + 4) { $line .= " ..."; } if ($col > $diff) { $col -= $diff; } else { $col -= 70; } } else { # Truncate both sides; leave more on left, and 30 chars on right. if ($col < 35) { $line = "... " . substr($line, 0, 60); } else { $line = "... " . substr($line, $col - 35, 60); $col = 35; } if (length $line == 60 + 4) {$line .= " ..."}; } } return $line, $col; } # # Supress any existing DOCTYPE by commenting it out. sub supress_doctype { no strict 'vars'; my $file = shift; local $HTML = ''; HTML::Parser->new(default_h => [sub {$HTML .= shift}, 'text'], declaration_h => [sub {$HTML .= ''}, 'text'] )->parse(join "\n", @{$file}); return [split /\n/, $HTML]; } # # Parse errors reported by SP. sub parse_errors ($$) { my $File = shift; my $file = shift; $File->{Errors} = []; # Initialize to an (empty) anonymous array ref. open ERRORS, "<$file" or &internal_error("open($file) returned: $!"); for () { push @{$File->{'DEBUG'}->{Errors}}, $_; my($err, @errors); next if /^0:[0-9]+:[0-9]+:[^A-Z]/; next if /numbers exceeding 65535 not supported/; next if /URL Redirected to/; my(@_err) = split /:/; next unless $_err[1] eq '0'; if ($_err[1] =~ m(^)) { @errors = ($_err[0], join(':', $_err[1], $_err[2]), @_err[3..$#_err]); } else { @errors = @_err; } $err->{src} = $errors[1]; $err->{line} = $errors[2]; $err->{char} = $errors[3]; $err->{type} = $errors[4]; if ($err->{type} eq 'W' or $err->{type} eq 'E' or $err->{type} eq 'X') { $err->{msg} = $errors[5]; } else { $err->{type} = 'I'; $err->{msg} = $errors[4]; } push @{$File->{Errors}}, $err; } close ERRORS or &internal_error("close($file) returned: $!\n"); return $File; } # # Generate a HTML report of detected errors. sub report_errors ($) { print '

Fatal Error: $err->{msg}

I could not parse this document, because it uses a public identifier that is not in my catalog.

Fatal Error: $err->{msg}

I could not parse this document, because it makes reference to a system-specific file instead of using a well-known public identifier to specify the type of markup being used.
.EOF. &output_doctype_spiel; last; } my($line, $col) = &truncate_line($File->{Content}->[$err->{line}-1], $err->{char}); # Strip curlies from lq-nsgmls output. $err->{msg} =~ s/[{}]//g; # Find index into the %frag hash for the "explanation..." links. $err->{idx} = $err->{msg}; $err->{idx} =~ s/"[^\"]+"/FOO/g; $err->{idx} =~ s/[^A-Za-z ]//g; $line = &ent($line); # Entity encode. $line =~ s/\t/ /g; # Collapse TABs. print qq(
Line $err->{line}, column $col: ); print qq{$err->{msg}}; if (defined $frag->{$err->{idx}}) { print qq{ (explanation...)}; } print "\n
$line\n"; print " " x ($col + 2); # 2 is the number of spaces before above print " " x 4 if $col != $err->{char}; # only for truncated lines print "^
\n"; } print "

\n"; print "\n"; if ($version eq 'unknown') { print "Sorry, I can't validate this document."; } elsif ($File->{Type} eq 'xml') { print "Sorry, this document is not well-formed XML."; } else { print "Sorry, this document does not validate as $version."; &output_css_validator_blurb($q->param('uri')); } } # # Output "This page is Valid" report. sub report_valid { my $File = shift; my $gifborder = ' border="0"'; my $xhtmlendtag = ''; my($image_uri, $alttext, $gifhw); if ($File->{Type} eq 'xml' or $File->{Type} eq 'xhtml') { print "\n \n No errors found! "; print '*', "\n\n"; } else { print "\n \n No errors found!\n\n"; } unless ($version eq 'unknown' or defined $File->{Tentative}) { if ($version =~ /^HTML 2\.0$/) { $image_uri = "${abs_svc_uri}images/vh20"; $alttext = "Valid HTML 2.0!"; $gifborder = ""; } elsif ($version =~ /HTML 3\.2 Strict$/) { $image_uri = "http://www.w3.org/Icons/valid-html40"; $alttext = "Valid HTML 4.0!"; $gifborder = ""; $gifhw = ' height="31" width="88"'; } elsif ($version =~ /HTML 4\.0<\/a> /) { $image_uri = "http://www.w3.org/Icons/valid-html40"; $alttext = "Valid HTML 4.0!"; $gifhw = ' height="31" width="88"'; } elsif ($version =~ /HTML 4\.01<\/a> Strict$/) { $image_uri = "http://www.w3.org/Icons/valid-html401"; $alttext = "Valid HTML 4.01!"; $gifborder = ""; $gifhw = ' height="31" width="88"'; } elsif ($version =~ /HTML 4\.01<\/a> /) { $image_uri = "http://www.w3.org/Icons/valid-html401"; $alttext = "Valid HTML 4.01!"; $gifhw = ' height="31" width="88"'; } elsif ($version =~ /XHTML 1\.0<\/a> /) { $image_uri = "http://www.w3.org/Icons/valid-xhtml10"; $alttext = "Valid XHTML 1.0!"; $gifborder = ""; $gifhw = ' height="31" width="88"'; $xhtmlendtag = " /"; # } elsif ($version =~ /XHTML Basic 1.0/) { # $image_uri = "${abs_svc_uri}/images/vxhtml-basic10"; # $alttext = "Valid XHTML Basic 1.0!"; # $gifborder = ""; # $gifhw = ' height="31" width="88"'; # $xhtmlendtag = " /"; } elsif ($version =~ /XHTML 1.1/) { $image_uri = "http://www.w3.org/Icons/valid-xhtml11"; $alttext = "Valid XHTML 1.1!"; $gifborder = ""; $gifhw = ' height="31" width="88"'; $xhtmlendtag = " /"; } elsif ($version =~ /HTML 3\.0/) { $image_uri = "${abs_svc_uri}images/vh30"; $alttext = "Valid HTML 3.0!"; } elsif ($version =~ /Netscape/) { $image_uri = "${abs_svc_uri}images/vhns"; $alttext = "Valid Netscape-HTML!"; } elsif ($version =~ /Hotjava/) { $image_uri = "${abs_svc_uri}images/vhhj"; $alttext = "Valid Hotjava-HTML!"; } if (defined $image_uri) { print <<"EOHD"; Congratulations, this document validates as $version! To show your readers that you have taken the care to create an interoperable Web page, you may display this icon on any page that validates. Here is the HTML you could use to add this icon to your Web page: <p> <a href="${abs_svc_uri}check/referer"><img$gifborder src="$image_uri" alt="$alttext"$gifhw$xhtmlendtag></a> </p> If you like, you can download a copy of this image (in PNG or GIF format) to keep in your local web directory, and change the HTML fragment above to reference your local image rather than the one on this server. EOHD } } if ($File->{Type} eq 'xml' and not $File->{DOCTYPE}) { print " Congratulations, this document is well-formed XML.\n"; } elsif (defined $File->{Tentative}) { if ($File->{Tentative} == T_CHARSET_KLUDGE) { print " \n This document would validate as the document type specified if you updated it to specify the Character Encoding used.\n \n"; } else { print " \n This document would validate as the document type specified if you updated it to match the Options used.\n \n"; } } elsif ($version eq 'unknown' or not defined $image_uri) { print " \n Congratulations, this document validates as the document type specified! (I don't have an icon for this one yet, sorry.)\n \n"; } unless ($q->param('uploaded_file')) { my $thispage = $q->url(-query => 0); $thispage .= qq(?uri=$File->{URI}); $thispage .= ';ss=1' if $q->param('ss'); $thispage .= ';sp=1' if $q->param('sp'); $thispage .= ';noatt=1' if $q->param('noatt'); $thispage .= ';outline=1' if $q->param('outline'); &output_css_validator_blurb($q->param('uri')); print <<"EOHD"; If you would like to create a link to this page (i.e., this validation result) to make it easier to re-validate this page in the future or to allow others to validate your page, the URI is: $thispage (Or, you can just add the current page to your bookmarks or hotlist.) EOHD } if ($File->{Type} eq 'xml' or $File->{Type} eq 'xhtml') { print qq{ Caveat This validator is based on SP, which has some limitations in its support for XML. }; } } # # Legacy support. Print blurb to clarify status of weblint support. sub weblint { print <<"EOF"; Weblint Doesnt Live Here Any More Weblint is no longer actively maintained and has become badly out of date with the current state of the web, and so this feature has been deprecated. If you want to get the Weblint results you can use one of the web frontends referenced from the Weblint home page or install a copy of it locally. We are currently investigating alternative methods to replace the Weblint feature in the Validator. Tidy appears to be a likely candidate at this point. EOF } # # Produce an outline of the document based on Hn elements from the ESIS. sub outline { my $File = shift; print <<'EOF'; Outline Below is an outline for this document, automatically generated from the heading tags (<H1> through <H6>.) EOF my $prevlevel = 0; my $indent = 0; my $level = 0; for (1 .. $#{$File->{ESIS}}) { my $line = $File->{ESIS}->[$_]; next unless $line =~ /^$H([1-6])$/i; $prevlevel = $level; $level = $1; print " \n" x ($prevlevel - $level); # perl is so cool. if ($level - $prevlevel == 1) {print " \n"}; foreach my $i (($prevlevel + 1) .. ($level - 1)) { print qq( \n A level $i heading is missing!\n); } if ($level - $prevlevel > 1) {print " \n"}; $line = ''; my $heading = ''; until (substr($line, 0, 3) =~ /^$H$level/i) { $line = $File->{ESIS}->[$_++]; $line =~ s/\\011/ /g; $line =~ s/\\012/ /g; if ($line =~ /^-/) { my $headcont = $line; substr($headcont, 0, 1) = " "; $headcont =~ s/\\n/ /g; $heading .= $headcont; } elsif ($line =~ /^AALT CDATA( .+)/i) { my $headcont = $1; $headcont =~ s/\\n/ /g; $heading .= $headcont; } } $heading = substr($heading, 1); # chop the leading '-' or ' '. $heading = &ent($heading); print " $heading\n"; } print " \n" x $level; print <<'EOF'; If this does not look like a real outline, it is likely that the heading tags are not being used properly. (Headings should reflect the logical structure of the document; they should not be used simply to add emphasis, or to change the font size.) EOF } # # Create a HTML representation of the document. sub show_source { my $File = shift; my $line = 1; print <<'EOF'; Source Listing Below is the source input I used for this validation: EOF for (@{$File->{Content}}) { printf "%4d: %s\n", $line, $line, ent $_; $line++; } print " \n "; } # # Create a HTML Parse Tree of the document for validation report. sub parse_tree { my $File = shift; print <<'EOF'; Parse Tree EOF if ($q->param('noatt')) { print <<'EOF'; I am excluding the attributes, as you requested. EOF } else { print <<'EOF'; You can also view this parse tree without attributes by selecting the appropriate option on the form. EOF } my $indent = 0; my $prevdata = ''; print "\n"; foreach my $line (@{$File->{ESIS}}) { if ($q->param('noatt')) { # don't show attributes next if $line =~ /^A/; next if $line =~ /^$A$/; next if $line =~ /^$A$/; } $line =~ s/\\n/ /g; $line =~ s/\\011/ /g; $line =~ s/\\012/ /g; $line =~ s/\s+/ /g; next if $line =~ /^-\s*$/; if ($line =~ /^-/) { substr($line, 0, 1) = ' '; $prevdata .= $line; next; } elsif ($prevdata) { $prevdata = &ent($prevdata); $prevdata =~ s/\s+/ /go; print wrap(' ' x $indent, ' ' x $indent, $prevdata), "\n"; undef $prevdata; } $line = &ent($line); if ($line =~ /^\)/) { $indent -= 2; } my $printme; chomp($printme = $line); $printme =~ s{^([()])(.*)} # reformat and add links on HTML elements { my $close = ''; $close = "/" if $1 eq ")"; # ")" -> close-tag "<" . $close . "{lc($2)} . "\">$2<\/a>>" }egx; $printme =~ s,^A, A,; # indent attributes a bit print ' ' x $indent, $printme, "\n"; if ($line =~ /^\(/) { $indent += 2; } } print " \n"; print " \n"; } # # Do an initial parse of the Document Entity to extract charset and FPI. sub preparse { my $File = shift; my $dtd = sub { return if $File->{Root}; ($File->{Root}, $File->{DOCTYPE}) = shift =~ m()si; }; my $pi = sub { return if $File->{XML_Charset}; my $pi = shift; return unless $pi =~ m(<\?xml); $pi =~ m(<\?xml[^>]*\sencoding\s*=\s*([\"\'])([A-Za-z][-A-Za-z0-9._]*)\1)s; warn qq("$1" - "$2"\n); $File->{XML_Charset} = lc $2; }; my $start = sub { my $tag = shift; my $attr = shift; my %attr = map {lc($_) => $attr->{$_}} keys %{$attr}; if ($File->{Root}) { if (lc $tag eq 'meta') { if (lc $attr{'http-equiv'} eq 'content-type') { $attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si; $File->{META_Charset} = lc $1; } } return unless $tag eq $File->{Root}; } else { $File->{Root} = $tag; } if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; }; my $p = HTML::Parser->new(api_version => 3); $p->xml_mode(TRUE); $p->ignore_elements('BODY'); $p->ignore_elements('body'); $p->handler(declaration => $dtd, 'text'); $p->handler(process => $pi, 'text'); $p->handler(start => $start, 'tag,attr'); $p->parse(join "\n", @{$File->{Content}}); return $File; } # # Print out the raw ESIS output for debugging. sub show_esis ($) { print <<'EOF'; Raw ESIS Output EOF for (@{shift->{'DEBUG'}->{ESIS}}) { s/\\012//g; s/\\n/\n/g; print ent $_; } print " \n "; } # # Print out the raw error output for debugging. sub show_errors ($) { print <<'EOF'; Raw Error Output EOF for (@{shift->{'DEBUG'}->{Errors}}) {print ent $_}; print " \n "; }