#!/usr/local/bin/perl -w # # W3C HTML Validation Service # A CGI script to retrieve and validate an HTML file # # Copyright 1995-1999 Gerald Oskoboiny # # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # # $Id: check,v 1.91 2001-03-08 01:57:52 link Exp $ # # We need Perl 5.004. require 5.004; # # Load modules use strict; use LWP::UserAgent; use URI::Escape; use CGI::Carp; use CGI qw(:cgi -newstyle_urls -private_tempfiles); use Text::Wrap; use Text::Iconv; ############################################################################# # Constant definitions ############################################################################# # # Define global constants use constant TRUE => 1; use constant FALSE => 0; use constant UNDEF => undef; use constant DEBUG => 0; # # Define global variables use vars qw($VERSION $DATE $MAINTAINER $NOTICE); # Strings. use vars qw($frag $pub_ids $element_uri $file_type $doctypes); # Cfg hashes. # # Paths and file locations my $base_path = '/usr/local/validator/'; if ( $ENV{SERVER_PORT} eq "8000" ) { $base_path = '/home/gerald/validator/'; } my $html_path = $base_path . 'htdocs/'; my $elem_db = $html_path . 'config/eref.cfg'; my $fpis_db = $html_path . 'config/fpis.cfg'; my $frag_db = $html_path . 'config/frag.cfg'; my $type_db = $html_path . 'config/type.cfg'; my $dtds_db = $html_path . 'config/doctypes.cfg'; my $sgmlstuff = $html_path . 'sgml-lib'; my $sgmldecl = $sgmlstuff . '/REC-html401-19991224/HTML4.decl'; my $xhtmldecl = $sgmlstuff . '/REC-xhtml1-20000126/xhtml1.dcl'; my $xmldecl = $sgmlstuff . '/sp-1.3/pubtext/xml.dcl'; my $temp = "/tmp/validate.$$"; # @@ Use POSIX/IO::File tmpfiles instead! # # Executables and binaries my $sp = '/usr/local/bin/lq-nsgmls'; my $weblint = '/usr/bin/weblint'; # # URIs and fragments my $abs_svc_uri = 'http://validator.w3.org/'; my $uri_def_uri = 'http://www.w3.org/Addressing/#terms'; my $faqloc = '/docs/'; my $faqerrloc = $faqloc . 'errors.html'; my $element_ref = 'http://www.htmlhelp.com/reference/html40/'; # # Strings $VERSION = q$Revision: 1.91 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; $DATE = q$Date: 2001-03-08 01:57:52 $; $MAINTAINER = 'gerald@w3.org'; $NOTICE = ''; # "

Note: This service will be ..."; # # DOCTYPEs my $html32_doctype = q(); my $html40s_doctype = q(); my $html40t_doctype = q(); my $html40f_doctype = q(); my $xhtmlt_doctype = q( plain text version string $pub_ids = &read_cfg($fpis_db); # Errors -> fragment identifier $element_uri = &read_cfg($elem_db); # Element -> URI fragment $file_type = &read_cfg($type_db); # Content -> File -type $doctypes = &read_cfg($dtds_db); # Name -> doctype # # Set up signal handlers. $SIG{TERM} = \&erase_stuff; $SIG{KILL} = \&erase_stuff; $SIG{PIPE} = 'IGNORE'; # # delete() the, possibly tainted, $PATH. delete $ENV{PATH}; ############################################################################# # Process CGI variables ############################################################################# # # Create a new CGI object. my $q = new CGI; # # Backwards compatibility; see # http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0197 # http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0212 if (scalar $q->param) { foreach my $param ($q->param) { $q->param($param, TRUE) unless $q->param($param); } } # # Futz the URI so "/referer" works. if ($q->path_info eq '/referer') { $q->param('uri', $q->referer); } # # Use "url" unless a "uri" was also given. if ($q->param('url') and not $q->param('uri')) { $q->param('uri', $q->param('url')); } # # Supercede URI with an uploaded file. if ($q->param('uploaded_file')) { $q->param('uri', 'upload://' . $q->param('uploaded_file')); } # # Send them to the homepage unless we can extract a URI from either of the # acceptable sources: uri, url or /referer. &redirect_to_home_page unless length($q->param('uri')) > 5; # # Munge the URI to include commonly omitted prefixes/suffixes. $q->param('uri', $q->param('uri') . '/') unless $q->param('uri') =~ m(/); $q->param('uri', 'http://' . $q->param('uri')) if $q->param('uri') =~ m(^www)i; ############################################################################# # Output validation results ############################################################################# # # A string containing the HTML header for validation results. # We save it in a string instead of printing it in case we need to abort before # we have any meaningfull results to report. @@ May not be necessary! my $header = <<"EOF"; Content-Type: text/html; charset=utf-8 $html40t_doctype W3C HTML Validation Service Results

HTML Validation Service Results

$NOTICE EOF # # Punt if we don't recognize this URI scheme. # @@ LWP does a whole bunch more: transparently! unless ($q->param('uri') =~ m(^(http|upload)://)) { print $header; print <<"EOF";

Sorry, this type of URI is not supported by this service.

URIs should be in the form:

$abs_svc_uri

(There are other types of URIs, too, but only http:// URIs are currently supported by this service.)

EOF &clean_up_and_exit; } # # Get the file and metadata. my $File = ($q->param('uploaded_file') ? &handle_file($q) : &handle_uri($q)); # # Abort if there was no document type mapping for this Content-Type, in which # case the document type will be equal to the content type (contains a "/"). if ($File->{'Type'} =~ m(/) and not $q->param('uploaded_file')) { print $header; print <<"EOF";

Sorry, I am unable to validate this document because its returned content-type was $File->{Type}, which is not currently supported by this service.

EOF &clean_up_and_exit; } # # Overall parsing algorithm for documents returned as text/html: # # For documents that come to us as text/html, # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD # 3. if no doctype, check for an xmlns= attribute on the first element # 4. if there is an xmlns= attribute, check for XML well-formedness # 5. if there is no xmlns= attribute, validate as HTML using the doctype # inferred by the check_for_doctype function # # # Override DOCTYPE if user asked for it. if (defined $q->param('doctype') and not $q->param('doctype') =~ /Inline/i) { $File->{Content} = &supress_doctype($File->{Content}); unshift @{$File->{Content}}, $doctypes->{$q->param('doctype')}; } # # Try to extract or guess the DOCTYPE for HTML and XHTML files. my $doctype; my $guessed_doctype = 2; if ($File->{Type} eq 'html' or $File->{Type} eq 'xhtml' or $q->param('uploaded_file') ) { ($guessed_doctype, $doctype) = &check_for_doctype($File->{Content}); } # # Set document type to XHTML if the DOCTYPE was for XHTML. # This happens when a XHTML file is served as text/html if ($doctype =~ /xhtml/i && $guessed_doctype != 2) { $File->{Type} = 'xhtml'; } # # Set document type to MathML if the DOCTYPE was for MathML. if ($doctype =~ /mathml/i && $guessed_doctype != 2) { $File->{Type} = 'mathml'; } # # If we find a XML declaration with charset information, # we take it into account. foreach my $line (@{$File->{Content}}) { # @@ needs to handle declarations that span more than one line if ($line =~ /<\?xml\s/) { if ($line =~ /encoding\s*=[\s\"\']*([^\s;\"\'>]*)/) { $File->{XML_Charset} = lc $1; last; } else {last}; } } # # If we find a META element with charset information, we take it into account. foreach my $line (@{$File->{Content}}) { # @@ needs to handle meta elements that span more than one line if ($line =~ /]*)/i) { $File->{META_Charset} = lc $1; last; } elsif ($line =~ /{HTTP_Charset}) { $File->{Charset} = $File->{HTTP_Charset}; } elsif ($File->{XML_Charset}) { $File->{Charset} = $File->{XML_Charset}; } elsif ($File->{META_Charset}) { $File->{Charset} = $File->{META_Charset}; } else { $File->{Charset} = 'unknown'; } # # Setup SP environment for the charset. if ($File->{Charset} ne 'unknown' and $File->{Charset} ne 'us-ascii') { $ENV{SP_CHARSET_FIXED} = 'YES'; $ENV{SP_ENCODING} = 'utf-8'; } # # Print header and jump links. print $header, qq(\n

Document Checked

\n), &build_jump_links; # # Print the list of meta data. print "

URI: ); print '', ent($File->{URI}), qq(\n); } elsif ($q->param('uploaded_file')) { print ' ' x 4, '
File: ', $File->{URI}, "
URI: ); print '', $q->param('uri'), qq(\n), ' ' x 6, qq(), qq(I was redirected to <URI:), '', $File->{URI}, '>', qq(\n), ' ' x 4, qq(
Last modified: ), $File->{Modified}, qq(
Server: ), $File->{Server}, qq(
Content length: ), $File->{Size}, qq(
nsgmls command line: $command\n"; } open CHECKER, "|$command - >$temp.esis" or die "open(|$command - >$temp.esis) returned: $!\n"; print CHECKER $doctype, "\n" if $guessed_doctype == 1; if ( $File->{Charset} eq 'utf-8' or $File->{Charset} eq 'us-ascii' or $File->{Charset} eq 'unknown') { for (@{$File->{Content}}) {print CHECKER $_, "\n"}; } else { # workaround for windows-nnnn charsets missing from glibc<2.2 my $temp_charset = $File->{Charset}; $temp_charset =~ s/^windows-(\d+)$/CP$1/i; eval {my $c = Text::Iconv->new($temp_charset, 'utf-8')}; if (not $@) { my $c = Text::Iconv->new($temp_charset, 'utf-8'); for (@{$File->{Content}}) {print CHECKER $c->convert("$_\n")}; } else { &print_charset_error($@, $File->{Charset}); &clean_up_and_exit; } } close CHECKER; open ERRORS, "<$temp" or die "open($temp) returned: $!\n"; my @errors = ; close ERRORS or warn "close($temp) returned: $!\n"; my @esis; my $elements_found = 0; my $root_namespace; my %other_namespaces; open ESIS, "$temp.esis" or die "open($temp.esis) returned: $!\n"; while () { $elements_found++ if ( /^\(/ ); # look for xml namespaces if ( ( ($File->{Type} eq 'xml') || ($File->{Type} eq 'xhtml')) && ( (/^Axmlns() \w+ (.*)/) || (/^Axmlns:([^ ]+) \w+ (.*)/) ) ) { if ( ( ! defined $root_namespace ) && ( $elements_found == 0 ) && ( $1 eq "" ) ) { $root_namespace = $2; } else { $other_namespaces{$2}++; } } next if / IMPLIED$/; next if /^ASDAFORM CDATA /; next if /^ASDAPREF CDATA /; chomp; # Removes trailing newlines push @esis, $_; } close ESIS or warn "close($temp.esis) returned: $!"; my $fpi; my $version = 'unknown'; if (($File->{Type} eq 'xhtml') || ($File->{Type} eq 'mathml')) { $fpi = $doctype; } elsif ($File->{Type} eq 'xml') { $fpi = 'XML'; } else { for (@esis) { next unless /^AVERSION CDATA (.*)/; $fpi = $1; last; } # Needed for HTML4 Strict, which has no version attribute on the HTML element if (length $doctype and not defined $fpi) {$fpi = $doctype}; } $version = $pub_ids->{$fpi} || 'unknown'; if ($guessed_doctype == 1) { unshift @errors, "$sp:0:2:1:E: Missing DOCTYPE declaration at start of document (explanation...)\n"; } print ' ' x 4, q(
Character encoding: ), $File->{Charset}; if ($File->{HTTP_Charset} ne $File->{META_Charset} and $File->{HTTP_Charset} ne '' and $File->{META_Charset} ne '' and $File->{Charset} ne 'unknown') { print <<"EOHD"; The character encoding specified in the HTTP header ("$File->{HTTP_Charset}") is different from the one specified in the META element ("$File->{META_Charset}"). I will use "$File->{Charset}" for this validation. EOHD } elsif ($File->{HTTP_Charset} ne $File->{XML_Charset} and $File->{HTTP_Charset} ne '' and $File->{XML_Charset} ne '' and $File->{Charset} ne 'unknown') { print <<"EOHD"; The character encoding specified in the HTTP header ("$File->{HTTP_Charset}") is different from the one specified in the XML declaration ("$File->{XML_Charset}"). I will use "$File->{Charset}" for this validation. EOHD } print ' ' x 4, qq(
Document type: ), $version; if ($File->{Type} eq 'xhtml' and $root_namespace ne 'http://www.w3.org/1999/xhtml') { print "
warning: unknown namespace for text/html document!"; if ($root_namespace ne '') { print qq{, $root_namespace}; } print "\n"; } else { if ($root_namespace ne '') { print qq( with namespace $root_namespace); } } if (scalar keys %other_namespaces >= 1) { print "
Other namespaces in this document: "; for (keys %other_namespaces) { print qq($_, ), "\n"; } } print qq(
Document type: ), $version, qq(

\n\n); if (($File->{Type} eq 'xml') || ($File->{Type} eq 'xhtml')) { my $validity_blurb = ""; if ( $guessed_doctype == 0 ) { $validity_blurb = " and validity"; } print <<"EOHD";

Below are the results of checking this document for XML well-formedness${validity_blurb}.

EOHD } else { print <<"EOHD";

Below are the results of attempting to parse this document with an SGML parser.

EOHD } if ( $? || ($guessed_doctype == 1) ) { my($line, $col, $type, $msg); print "

0:[0-9]+:[0-9]+:[^A-Z]/; next if / numbers exceeding 65535 not supported$/; # @@ could maybe replace the following statement with an SGMLDECL # setting in the catalog, like mathml2? next if /:W: SGML declaration was not implied$/ && ($File->{Type} =~ /^x(ht)?ml$/); next if /^$sp:$invalid location$:W: URL Redirected to/; s/^$sp://g; if ( ! (($line, $col, $type, $msg)=(/^[^:]*:([0-9]+):([0-9]+):([A-Z]?):? (.*)/))) { print "Uh oh! I got the following unknown error:\n\n $_\n\n"; print "Please make sure you specified the DOCTYPE properly!\n\n"; &output_doctype_spiel; last; } if ( $msg =~ /^cannot generate system identifier for entity / ) { print "

Fatal error! $msg\n\n"; print "

I couldn't parse this document, because it " . "uses a public\n identifier that's not in my catalog!\n

\n"; &output_doctype_spiel; last; } if ( $msg =~ /unrecognized ({{)?DOCTYPE(}})?/i ) { print "

Fatal error! $msg\n\n"; print "

I couldn't parse this document, because it " . "uses a public\n identifier that's not in my catalog!\n

\n"; &output_doctype_spiel; last; } if ( $msg =~ /^cannot open / ) { print "

Fatal error! $msg\n\n"; print "

I couldn't parse this document, because it " . "makes reference to\n a system-specific file instead of " . "using a well-known public identifier\n to specify the " . "level of HTML being used.\n

\n"; &output_doctype_spiel; last; } if ( $msg =~ /^cannot find / ) { print "

Fatal error! $msg\n\n"; print "

I couldn't parse this document, because it " . "makes reference to\n a system-specific file instead of " . "using a well-known public identifier\n to specify the " . "level of HTML being used.\n

\n"; &output_doctype_spiel; last; } $line-- if ( $guessed_doctype == 1 ); my $newline = $File->{Content}->[$line - 1]; # make sure there are no ^P's or ^Q's in the file, since we need to use # them to represent '<' and '>' temporarily. We'll just change them to # literal P's and Q's for a lack of anything better to do with them. my $lt = "\020"; my $gt = "\021"; $newline =~ s/\020/P/go; $newline =~ s/\021/Q/g; my $orig_col = $col; ($newline, $col) = &truncate_line($newline, $col); # temporarily strip curlies from lq-nsgmls output. # @@ should link HTML elements using $elem_db instead. $msg =~ s/[{}]//g; # figure out the index into the %frag associative array for the # "explanation..." links to the KGV FAQ. my $msgindex = $msg; $msgindex =~ s/"[^\"]+"/FOO/g; $msgindex =~ s/[^A-Za-z ]//g; $newline =~ s/&/&/go; $newline =~ s//g; $newline =~ s/\t/ /g; $newline =~ s/ //g; print "

"; print qq{} if $q->param('ss'); print "Line $line"; print "" if $q->param('ss'); print ", column $orig_col:\n"; print "

  $newline\n";
	print " " x ($col+2);	# 2 is the number of spaces before  above
	print " " x 4 if $col != $orig_col;	# only for truncated lines
	print "^

\n";
	print "\n";

        print qq{Error: $msg};

	if (defined $frag->{$msgindex}) {
	  print qq{ (explanation...)};
	}

	print "\n";

    }
    print "

\n";
    print "\n";
    if ( $version eq "unknown" ) {
	print "\n  \n    Sorry, I can't validate this document.\n  \n";
    }
    elsif ( $File->{Type} eq 'xml' ) {
	print "\n  \n    Sorry, this document is not well-formed XML.\n  \n";
    }
    else {
	print "\n  \n    Sorry, this document does not validate as $version.\n  \n\n";
	&output_css_validator_blurb( $q->param('uri') );
    }
} else {
    my $gifborder   = ' border="0"';
    my $xhtmlendtag = '';
    my $image_uri;
    my $alttext;
    my $gifhw;

    if (($File->{Type} eq 'xml') || ($File->{Type} eq 'xhtml')) {
	print "\n  \n    No errors found! ";
	print "*\n\n";
    }
    else {
	print "\n  \n    No errors found!\n\n";
    }
    if ( $version ne "unknown" ) {
	if ( $version =~ /^HTML 2\.0$/ ) {
	    $image_uri = "http://validator.w3.org/images/vh20";
	    $alttext = "Valid HTML 2.0!";
            $gifborder = "";
	}
	elsif ( $version =~ /HTML 3\.2 Strict$/ ) {
	    $image_uri = "http://www.w3.org/Icons/valid-html40";
	    $alttext = "Valid HTML 4.0!";
            $gifborder = "";
            $gifhw   = ' height="31" width="88"';
	}
	elsif ( $version =~ /HTML 4\.0<\/a> / ) {
	    $image_uri = "http://www.w3.org/Icons/valid-html40";
	    $alttext = "Valid HTML 4.0!";
            $gifhw   = ' height="31" width="88"';
	}
	elsif ( $version =~ /HTML 4\.01<\/a> Strict$/ ) {
	    $image_uri = "http://www.w3.org/Icons/valid-html401";
	    $alttext = "Valid HTML 4.01!";
            $gifborder = "";
            $gifhw   = ' height="31" width="88"';
	}
	elsif ( $version =~ /HTML 4\.01<\/a> / ) {
	    $image_uri = "http://www.w3.org/Icons/valid-html401";
	    $alttext = "Valid HTML 4.01!";
            $gifhw   = ' height="31" width="88"';
	}
	elsif ( $version =~ /XHTML 1\.0<\/a> / ) {
	    $image_uri = "http://www.w3.org/Icons/valid-xhtml10";
	    $alttext = "Valid XHTML 1.0!";
            $gifborder = "";
            $gifhw   = ' height="31" width="88"';
	    $xhtmlendtag = " /";
	}
	elsif ( $version =~ /HTML 3\.0/ ) {
	    $image_uri = "http://validator.w3.org/images/vh30";
	    $alttext = "Valid HTML 3.0!";
	}
	elsif ( $version =~ /Netscape/ ) {
	    $image_uri = "http://validator.w3.org/images/vhns";
	    $alttext = "Valid Netscape-HTML!";
	}
	elsif ( $version =~ /Hotjava/ ) {
	    $image_uri = "http://validator.w3.org/images/vhhj";
	    $alttext = "Valid Hotjava-HTML!";
	}
	if ( defined $image_uri ) {
	    print <<"EOHD";
  
     Congratulations, this
    document validates as $version!
  

  
    To show your readers that you have taken the care to create an
    interoperable Web page, you may display this icon on any page
    that validates. Here is the HTML you could use to add this icon
    to your Web page:
  
    <p>
    <a href="${abs_svc_uri}check/referer"><img$gifborder
        src="$image_uri"
        alt="$alttext"$gifhw$xhtmlendtag></a>
  </p>
  
    If you like, you can download a copy of this image (in PNG or GIF
    format) to keep in your local web directory, and change the HTML fragment
    above to reference your local image rather than the one on this server.
  

EOHD
	}
    }
    if ( ( $version eq "unknown" ) || ( ! defined $image_uri ) ) {
      if ( $guessed_doctype != 0 ) {	# if no doctype, only checked wf-ness
	print "  \n    Congratulations, this document is well-formed XML.\n  \n";
      }
      else {	# checked validity
	print "  \n    Congratulations, this document validates as the document type specified! (I don't have an icon for this one yet, sorry.)\n  \n";
      }
    }

    unless ($q->param('uploaded_file')) {
      my $thispage = $q->self_url;

      &output_css_validator_blurb($q->param('uri'));

      print <<"EOHD";
  
    If you would like to create a link to this page (i.e., this
    validation result) to make it easier to re-validate this page in the
    future or to allow others to validate your page, the URI is:
  

  
    $thispage
  

  
    (Or, you can just add the current page to your bookmarks or hotlist.)
  
EOHD
    }
    if (($File->{Type} eq 'xml') || ($File->{Type} eq 'xhtml')) {
      print qq{    Caveat
      
	This validator is based on SP, which has some limitations
	in its support for XML.
      
      };
    }
}

if ( $q->param('weblint') ) {
  my $pedanticflags;
  my $pedantic_blurb;

    if ( $q->param('pw') ) {
	$pedanticflags  = '-pedantic -e mailto-link';
	$pedantic_blurb = ' (in "pedantic" mode)';
    }
    else {
	$pedanticflags = '';
    }

    print <<"EOF";
  
  Weblint Results

  
    Below are the results of running Weblint
    on this document$pedantic_blurb:
  

  
    Note:
    Weblint is a useful HTML syntax and style checker, but does
    not do true HTML validation.
    Also, the version of weblint used by this service has not
    been updated for some time, so some of the messages below may
    be misleading or inaccurate.
  
EOF

    open WEBLINT, "|$weblint -s $pedanticflags - 2>&1 >$temp.weblint"
      or die "open($weblint) returned: $!\n";
    for (@{$File->{Content}}) {print WEBLINT $_, "\n"};
    close WEBLINT;

    print "\n\n";
    if ( $? ) {
	print "  \n";

	    open( WEBLINTOUT, "$temp.weblint" )
		|| die "couldn't open weblint results in $temp: $!";

	    while () {
		s/ \(use "-x " to allow this\)\.$/./go;
		s/&/&/go;
		s//>/go;
		print "    $_";
	    }

	    close( WEBLINTOUT ) || die "couldn't close weblint results: $!";
	    print "  \n";
    }
    else {
	print "\n  \n    Looks good to me!\n  \n";
    }
    print "\n\n";
}

if ($q->param('outline')) {
  print <<'EOF';
  
    
    Outline
    
      Below is an outline for this document, automatically generated from the
      heading tags (<H1> through <H6>.)
    
EOF

  my $prevlevel = 0;
  my $indent    = 0;
  my $level     = 0;
  for (1 .. $#esis) {
    my $line = $esis[$_];
    next unless $line =~ /^\(H([1-6])$/i;
    $prevlevel = $level;
    $level     = $1;

    print "    \n" x ($prevlevel - $level); # perl is so cool.
    if ($level - $prevlevel == 1) {
      print "    \n";
    }
    foreach my $i (($prevlevel + 1) .. ($level - 1)) {
      print qq(  \n    A level $i heading is missing!\n);
    }
    if ($level - $prevlevel > 1) {
      print "    \n";
    }

    $line       = '';
    my $heading = '';
    until (substr($line, 0, 3) =~ /^\)H$level/i) {
      $line = $esis[$_++];
      $line =~ s/\\011/ /g;
      $line =~ s/\\012/ /g;
      if ($line =~ /^-/) {
	my $headcont = $line;
	substr($headcont, 0, 1) = " ";
	$headcont =~ s/\\n/ /g;
	$heading .= $headcont;
      } elsif ($line =~ /^AALT CDATA( .+)/i) {
	my $headcont = $1;
	$headcont =~ s/\\n/ /g;
	$heading .= $headcont;
      }
    }

    $heading = substr($heading, 1); # chop the leading '-' or ' '.
    $heading =~ s/&/&/go; $heading =~ s/$heading\n";
  }
  print "    \n" x $level;
  print <<'EOF';
    
      If this does not look like a real outline, it is likely that the
      heading tags are not being used properly. (Headings should reflect
      the logical structure of the document; they should not be used simply
      to add emphasis, or to change the font size.)
    
    
EOF
}

if ($q->param('ss')) {
  my $line = 1;

  print <<'EOF';
  
    
    Source Listing

    Below is the source input I used for this validation:
    EOF

  if ($guessed_doctype == 1) {
    my $gd = $doctype . "\n";
    $gd =~ s/&/&/go; $gd =~ s/{Content}}) {
    s/&/&/go; s/%4d: %s\n", $line, $line, $_;
    $line++;
  }
  print "    
\n  ";
}

if ($q->param('sp')) {
  print <<'EOF';
  
    
    Parse Tree
EOF
    if ($q->param('noatt')) {
      print <<'EOF';
    
      I am excluding the attributes, as you requested.
    
EOF
    } else {
      print <<'EOF';
    
      You can also view this parse tree without attributes by selecting the
      appropriate option on the form.
    
EOF
    }

  my $indent   = 0;
  my $prevdata = '';

  print "\n";
  foreach my $line (@esis) {
    if ($q->param('noatt')) {	# don't show attributes
      next if $line =~ /^A/;
      next if $line =~ /^\(A$/;
      next if $line =~ /^\)A$/;
    }

    $line =~ s/\\n/ /g;
    $line =~ s/\\011/ /g;
    $line =~ s/\\012/ /g;
    $line =~ s/\s+/ /g;
    next if $line =~ /^-\s*$/;

    if ($line =~ /^-/) {
      substr($line, 0, 1) = ' ';
      $prevdata .= $line;
      next;
    } elsif ($prevdata) {
      $prevdata =~ s/&/&/go;
      $prevdata =~ s/ close-tag
		   "<" . $close . "{lc($2)} .
		   "\">$2<\/a>>"
		 }egx;
    $printme =~ s,^A,  A,;	# indent attributes a bit
    print ' ' x $indent, $printme, "\n";
    if ($line =~ /^\(/) {
      $indent += 2;
    }
  }
  print "
\n";
  print "
\n";
}

&clean_up_and_exit;


#############################################################################
# Subroutine definitions
#############################################################################

sub output_doctype_spiel {

    print <<"EOF";

    
      You should make the first line of your HTML document a DOCTYPE
      declaration, for example, for a typical HTML 4.01 document:
    

          <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
      <HTML>
	<HEAD>
	  <TITLE>Title</TITLE>
	</HEAD>

	<BODY>
	  <-- ... body of document ... -->
	</BODY>
      </HTML>

EOF

}

sub output_closing {

    print <<"EOF";




  
  Gerald Oskoboiny

  Last modified: $DATE





EOF

}

sub erase_stuff {
  unlink $temp        or warn "unlink($temp) returned: $!\n";
  unlink "$temp.esis" or warn "unlink($temp.esis) returned: $!\n";
  unlink "$temp.weblint";
}

sub clean_up_and_exit {
  &output_closing;
  &erase_stuff;
  exit;
}

sub redirect_to_home_page {

    print "Status: 301 Moved Permanently\n";
    print "Content-Type: text/html\n";
    print "Location: http://validator.w3.org/\n\n";
    print "Moved!\n";
    print "\n";
    print "  Please see the validation service's home page.\n";
    print "\n";

    &clean_up_and_exit;

}

sub build_jump_links {

    my $text;
    my $count = 0;

    $count++ if $q->param('ss');
    $count++ if $q->param('sp');
    $count++ if $q->param('weblint');
    $count++ if $q->param('outline');

    if ( $count ) {
	$text .= "  \n    Jump to: ";
	if ( $q->param('weblint') ) {
	    $text .= "Weblint Results";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('outline') ) {
	    $text .= "Outline";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('ss') ) {
	    $text .= "Source Listing";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('sp') ) {
	    $text .= "Parse Tree";
	}
	$text .= ".\n  \n\n";
    }
    return $text;

}

#
# Check if the document has a doctype; if it doesn't, try to guess
# an appropriate one given the elements used. Returns 2 values.
# The first value is:
#   0 if there was a DOCTYPE,
#   1 if there was no doctype and no xmlns= attribute
#       on the first element in the document, or
#   2 if there was no doctype and there IS an xmlns= attribute
#       on the first element
# The Second value is the doctype or namespace, if any.
sub check_for_doctype {
  my $file = shift; # a reference to @file, for efficiency

  foreach my $count (0 .. scalar @{$file}) {
    my $line = $file->[$count];

    # does an HTML element precede the doctype on the same line?
    if ( $line =~ /<[a-z].*//go;
    # Strip comments, so the next line doesn't find commented-out markup etc.
    # (this doesn't handle multi-line comments, unfortunately)
    if ( $line =~ /<[a-z]/i ) {	# found an element
	if ( $line =~ /<[a-z]+ xmlns=['"]([^ "']*)/i ) {# look for an xmlns attr
	    return 2, $1;
	}
	last;
    }
  }

  for (@{$file}[0 .. 20]) {
    return 1, $html40f_doctype if /

401 Authorization Required

Authorization Required

  Sorry, I am not authorized to access the specified URI.



  The URI you specified,



  $resource



  returned a 401 "authorization required" response when I tried
  to download it.



  You should have been prompted by your browser for a
  username/password pair; if you had supplied this information, I
  would have forwarded it to your server for authorization to
  access the resource. You can use your browser's "reload" function
  to try again, if you wish.



  Of course, you may not want to trust me with this information,
  which is fine. I can tell you that I don't log it or do
  anything else nasty with it, and you can download the source for
  this service to see what it does, but you have no guarantee
  that this is actually the code I'm using; you basically have to
  decide whether to trust me or not. :-)



  Note that you shouldn't use HTTP Basic Authentication for
  anything which really needs to be private, since the password
  goes across the network unencrypted.

EOF
}

sub print_unknown_http_error_message {

    my $uri = shift;
    my $code = shift;
    my $message = shift;

    print <<"EOF";
  
    I got the following unexpected response when trying to
    retrieve $uri:
  

  
    $code $message
  

  
    Please make sure you have entered the URI correctly.
  

EOF

}


#
# Complain about strange charsets.
sub print_charset_error {
  my $error   = shift;
  my $charset = shift;

  print <<".EOF.";
  
    A fatal error occurred when attempting to transliterate the document
    charset. Either we do not support this character encoding yet, or you have
    specified a non-existent character set (typically a misspelling such as
    "iso8859-1" for "iso-8859-1").
  
  The detected charset was "$charset".
  The error was "$error".
  
    If you believe the charset to be valid you can submit a request for that
    character set (see the feedback page for
    details) and we will look into supporting it in the future.
  
.EOF.
}


sub output_css_validator_blurb {
  my $uri = shift;
  $uri = ent($uri);

  print <<"EOHD";
  
    If you use CSS in your document,
    you should also check it for
    validity using the W3C CSS Validation Service.
  
EOHD
}


#
# Read TAB-delimited configuration files. Returns a hash reference.
sub read_cfg {
  my $file = shift;
  my %cfg;

  open CFG, $file or die "open($file) returned: $!\n";
  while () {
    next if /^\s*$/;
    next if /^\s*#/;
    chomp;
    my($k, $v) = split /\t+/, $_;
    $cfg{$k} = $v;
  }
  close CFG;
  return \%cfg;
}

#
# Fetch an URI and return the content and selected meta-info.
sub handle_uri {
  my $q   = shift;
  my $uri = $q->param('uri'); # The URI to fetch.

  my $ua = new LWP::UserAgent;
  $ua->agent("W3C_Validator/$VERSION " . $ua->agent);
  $ua->parse_head(0);  # Parse the http-equiv stuff ourselves. @@ Why?
  my $req = new HTTP::Request(GET => $uri);

  # If we got a Authorization header, the client is back at it after being
  # prompted for a password so we insert the header as is in the request.
  if($ENV{HTTP_AUTHORIZATION}){
    $req->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
  }

  my $res = $ua->request($req);

  unless ($res->code == 200) {
    if ($res->code == 401) {
      &authenticate($res->request->url, $res->www_authenticate);
    } else {
      print $header;
      &print_unknown_http_error_message($uri, $res->code, $res->message);
    }
    &clean_up_and_exit;
  }
  
  my($type, $charset) = &parse_content_type($res->header('Content-Type'));

  my $lastmod = undef;
  if ( $res->last_modified ) {
    $lastmod = scalar(gmtime($res->last_modified));
  }

  return {Content       => &normalize_newlines($res->content),
	  Type          => $type,
	  HTTP_Charset  => $charset,
	  Modified      => $lastmod,
	  Server        => scalar($res->server),
	  Size          => scalar($res->content_length),
	  URI           => scalar($res->request->url)};
}

#
# Handle uploaded file and return the content and selected meta-info.
sub handle_file {
  my $q = shift; # The CGI object.
  my $f = $q->param('uploaded_file');
  my $h = $q->uploadInfo($f);
  my $file;

  while (not eof $f) {$file .= <$f>};
  my($type, $charset) = &parse_content_type($h->{'Content-Type'});

  return {Content      => &normalize_newlines($file),
	  Type         => $type,
	  HTTP_Charset => $charset,
	  Modified     => $h->{'Last-Modified'},
	  Server       => $h->{'Server'},
	  Size         => $h->{'Content-Length'},
	  URI          => $q->param('uploaded_file')};
}

#
# Parse a Content-Type and parameters. Return document type and charset.
sub parse_content_type {
  my $Content_Type = shift;
  my ($charset, $type);

  my($ct, @param) = split /\s*;\s*/, lc $Content_Type;

  $type = $file_type->{$ct} || $ct;

  foreach my $param (@param) {
    my($p, $v) = split /\s*=\s*/, $param;
    next unless $p =~ m(charset)i;
    if ($v =~ m/([\'\"]?)(\S+)\1/i) {
      $charset = lc $2;
      last;
    }
  }

  return $type, $charset;
}

#
# Normalize newlines of form CRLF or CR to LF.
sub normalize_newlines {
  my $file = shift;

  $file =~ s(\015\012){\n}g; # Turn ASCII CRLF into native newline.
  $file =~ s(\015)    {\n}g; # Turn ASCII CR   into native newline.

  return [split /\n/, $file];
}

#
# Return $_[0] encoded for HTML entities (cribbed from merlyn).
sub ent {
  local $_ = shift;
  s(["<&>"]){'&#' . ord($&) . ';'}ge;
  return $_;
}

#
# Truncate source lines for report.
sub truncate_line {
  my $line = shift;
  my $col  = shift;

  if (length $line > 70) {
    if ($col < 25) {      # Truncate at 70 chars and right side only.
      $line = substr($line, 0, 70) . " ...";
    } elsif ($col > 70) { # Keep rightmost 70 chars; left side only.
      my $diff = $col - 50;
      $line = "... " . substr($line, $diff, 70);
      if (length $line == 70 + 4) {
	$line .= " ...";
      }
      if ($col > $diff) {
	$col -= $diff;
      } else {
	$col -= 70;
      }
    } else { # Truncate both sides; leave more on left, and 30 chars on right.
      if ($col < 35) {
	$line = "... " . substr($line, 0, 60);
      } else {
	$line = "... " . substr($line, $col - 35, 60);
	$col = 35;
      }
      if (length $line == 60 + 4) {$line .= " ..."};
    }
  }

  return $line, $col;
}

#
# Supress any existing DOCTYPE by commenting it out.
sub supress_doctype {
  no strict 'vars';
  my $file = shift;
  local $HTML = '';

  HTML::Parser->new(default_h     => [sub {$HTML .= shift}, 'text'],
		    declaration_h => [sub {$HTML .= ''}, 'text']
		   )->parse(join "\n", @{$file});
  return [split /\n/, $HTML];
}