summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xhttpd/cgi-bin/check1280
1 files changed, 1280 insertions, 0 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index e69de29..9941887 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -0,0 +1,1280 @@
+#!/usr/local/bin/perl
+#
+# W3C HTML Validation Service
+# A CGI script to retrieve and validate an HTML file
+#
+# Copyright 1995-1998 Gerald Oskoboiny <gerald@w3.org>
+#
+# This source code is available under the license at:
+# http://www.w3.org/Consortium/Legal/copyright-software
+#
+# $Id: check,v 1.2 1998-07-24 22:13:05 gerald Exp $
+
+#############################################################################
+# Constant definitions
+#############################################################################
+
+$ENV{USER_AGENT} = 'W3C_Validator/1.0';
+
+$logfile = "/usr/local/lib/httpd/logs/val-svc";
+$reflogfile = "/usr/local/lib/httpd/logs/val-svc-referers";
+
+$faqloc = "http://www.cs.duke.edu/~dsb/kgv-faq/";
+$faqerrloc = "${faqloc}errors.html";
+$abs_svc_uri = "http://validator.w3.org/";
+$rel_img_uri = "/images/";
+$abs_img_uri = "${abs_svc_uri}images/";
+
+$grabber = "/usr/local/lib/libwww-perl/get";
+$sgmlstuff = "/usr/local/lib/sgml";
+$sp = "/usr/local/bin/nsgmls";
+$tidy = "/usr/local/bin/tidy";
+
+$sgmldecl = "$sgmlstuff/REC-html40-971218/HTML4.decl";
+
+# $testing = 1;
+
+# $notice = "<p><strong>Note: This service will be intermittently unavailable for the next few hours for an operating system upgrade.</strong>";
+
+# use the new stuff only for me for testing
+if ( ( $ENV{REMOTE_ADDR} =~ /18\.29\.0\.60/i ) && ( $testing ) ) {
+ $sgmldecl =
+ "/usr/local/lib/sgml/html40-970917/HTML4.decl";
+}
+
+umask( 022 );
+$weblint = "/usr/local/bin/weblint";
+$html2_doctype = qq{<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">};
+$html32_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">};
+$nice_html40_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN\n "http://www.w3.org/TR/REC-html40/strict.dtd">};
+$nice_html40t_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"\n "http://www.w3.org/TR/REC-html40/loose.dtd">};
+$nice_html40f_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"\n "http://www.w3.org/TR/REC-html40/frameset.dtd">};
+$html40_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN "http://www.w3.org/TR/REC-html40/strict.dtd">};
+$html40t_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">};
+$html40f_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">};
+$default_doctype = $html2_doctype;
+$temp = "/tmp/validate.$$";
+$lt = "\020";
+$gt = "\021";
+# $leftarrow = qq{${lt}tt${gt}${lt}img src="arrow_left.gif" alt="^"${gt}${lt}/tt${gt}};
+$leftarrow = "${lt}tt${gt}${lt}img src=\"/arrow_left.gif\" alt=\"^\"${gt}${lt}/tt${gt}";
+$rightarrow = "${lt}tt${gt}${lt}img src=\"/arrow_right.gif\" alt=\"^\"${gt}${lt}/tt${gt}";
+$contchars = "${lt}tt${gt}${lt}img src=\"/ellipsis.gif\" alt=\"[...]\"${gt}${lt}/tt${gt}";
+$gifborder = " BORDER=0";
+
+@options = ( 'weblint', 'pw', 'outline', 'ss', 'sp', 'noatt' );
+# this doesn't work for some reason
+# qw{
+# weblint pw outline ss sp noatt
+# };
+
+#############################################################################
+# Array of FPIs -> plain text version strings
+#############################################################################
+
+%pub_ids = (
+ '-//IETF//DTD HTML Level 0//EN//2.0', 'HTML 0.0',
+ '-//IETF//DTD HTML Strict Level 0//EN//2.0', 'Strict HTML 0.0',
+
+ '-//IETF//DTD HTML 2.0 Level 1//EN', 'HTML 1.0',
+ '-//IETF//DTD HTML 2.0 Strict Level 1//EN', 'Strict HTML 1.0',
+
+ '-//IETF//DTD HTML 2.0 Strict//EN', 'Strict HTML 2.0',
+ '-//IETF//DTD HTML 2.0//EN', 'HTML 2.0',
+ '-//IETF//DTD HTML 2.1E//EN', 'HTML 2.1E',
+
+ '-//AS//DTD HTML 3.0 asWedit + extensions//EN', 'HTML 3.0 (AdvaSoft version)',
+ '-//IETF//DTD HTML 3.0//EN', 'HTML 3.0 (Beta)',
+ '-//W3O//DTD W3 HTML Strict 3.0//EN//', 'Strict HTML 3.0 (Beta)',
+
+ '-//Sun Microsystems Corp.//DTD HotJava HTML//EN', 'Hotjava-HTML',
+'-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN', 'Strict Hotjava-HTML',
+ '-//WebTechs//DTD Mozilla HTML 2.0//EN', 'Netscape-HTML',
+ '-//Netscape Comm. Corp. Strict//DTD HTML//EN', 'Strict Netscape-HTML',
+ '-//Microsoft//DTD Internet Explorer 2.0 HTML//EN', 'MSIE-HTML',
+ '-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN', 'Strict MSIE-HTML',
+ '-//Microsoft//DTD Internet Explorer 3.0 HTML//EN', 'MSIE 3.0 HTML',
+ '-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN', 'Strict MSIE 3.0 HTML',
+ '-//OReilly and Associates//DTD HTML Extended 1.0//EN', 'O\'Reilly HTML Extended v1.0',
+ '-//OReilly and Associates//DTD HTML Extended Relaxed 1.0//EN', 'O\'Reilly HTML Extended Relaxed v1.0',
+
+ '-//IETF//DTD HTML V2.2//EN', 'HTML 2.2',
+ '-//W3C//DTD HTML 1996-01//EN', 'HTML 1996-01',
+ '-//W3C//DTD HTML 3.2 Final//EN', '<a href="http://www.w3.org/TR/REC-HTML32">HTML 3.2</a>',
+ '-//W3C//DTD HTML Experimental 970421//EN', '<a href="http://www.w3.org/TR/NOTE-html-970421.html">HTML 3.2 + Style</a>',
+ '+//Silmaril//DTD HTML Pro v0r11 19970101//EN', '<a href="http://www.ucc.ie/doc/www/html/dtds/htmlpro.html">HTML Pro</a>',
+ '-//Spyglass//DTD HTML 2.0 Extended//EN', 'Spyglass HTML 2.0 Extended',
+ 'http://www.w3.org/MarkUp/Cougar/Cougar.dtd', '<a href="http://www.w3.org/MarkUp/Cougar/">HTML Level "Cougar"</a>',
+ '-//W3C//DTD HTML 4.0//EN', '<a href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a>',
+ '-//W3C//DTD HTML 4.0 Transitional//EN', '<a href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a> Transitional',
+ '-//W3C//DTD HTML 4.0 Frameset//EN', '<a href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a> Frameset'
+
+);
+
+#############################################################################
+# Array of errors -> fragment identifiers for error explanation links
+#############################################################################
+
+%frag = (
+ 'entity end not allowed in comment', 'unterm-comment-1',
+ 'name start character invalid only s and comment allowed in comment declaration', 'unterm-comment-2',
+ 'name character invalid only s and comment allowed in comment declaration', 'unterm-comment-2',
+ 'unknown declaration type FOO', 'bad-comment',
+ 'character FOO not allowed in attribute specification list', 'attr-char',
+ 'an attribute value must be a literal unless it contains only name characters', 'attr-quoted',
+ 'syntax of attribute value does not conform to declared value', 'bad-attr-char',
+ 'length of attribute value must not exceed LITLEN less NORMSEP', 'name-length',
+ 'element FOO undefined', 'undef-tag',
+ 'element FOO not allowed here', 'not-allowed',
+ 'there is no attribute FOO', 'undef-attr',
+ 'FOO is not a member of the group specified in the declared value of this attribute', 'undef-attr-val',
+ 'FOO is not a member of a group specified for any attribute', 'bad-abbrev-attr',
+ 'end tag for FOO omitted but its declaration does not permit this', 'no-end-tag',
+ 'end tag for element FOO which is not open', 'floating-close',
+ 'end tag for FOO which is not finished', 'omitted-content',
+ 'start tag for FOO omitted but its declaration does not permit this', 'no-start-tag',
+ 'general entity FOO not defined and no default entity', 'bad-entity',
+ 'non SGML character number', 'bad-char',
+ 'cannot generate system identifier for entity FOO', 'bad-pub-id'
+
+# 'error', 'frag',
+# 'character data is not allowed here', 'frag',
+
+);
+
+#############################################################################
+# Set up some signal handlers in case we get killed (darned impatient people...)
+#############################################################################
+
+$SIG{'TERM'} = 'erase_stuff';
+$SIG{'KILL'} = 'erase_stuff';
+$SIG{'PIPE'} = 'IGNORE';
+# $SIG{'CHLD'} = 'erase_stuff';
+
+#############################################################################
+# Process CGI variables
+#############################################################################
+
+# this should be replaced with a 'referer' parameter.
+# if ( ( ( $ENV{HTTP_REFERER} =~ /tsd\.ml\.org/i ) ||
+# ( $ENV{HTTP_REFERER} =~ /hwg\.org/i ) ) &&
+# ( ! length( $ENV{QUERY_STRING} ) ) ) {
+# $ENV{QUERY_STRING} = "uri=" . $ENV{HTTP_REFERER};
+# }
+
+# accept either check/foo or check?foo
+$parameters = $ENV{PATH_INFO} || $ENV{QUERY_STRING};
+
+if ( ! $parameters ) {
+ if ( $ENV{REQUEST_URI} =~ /check/ ) {
+ &redirect_to_home_page;
+ }
+ else {
+ &output_intro_spiel;
+ &clean_up_and_exit;
+ }
+}
+
+foreach $pair (split(/[&;,]/, $parameters)) {
+
+ # this referer handling probably needs fixing to handle strange cases
+ # (possibly tied to the explanation given when connections fail;
+ # the referer could be on an intranet, etc.)
+
+ if ( $pair eq "/referer" && $ENV{HTTP_REFERER} =~ m,^http://, ) {
+ $FORM{uri} = $ENV{HTTP_REFERER};
+ next;
+ }
+
+ ($name, $value) = split(/=/, $pair);
+ $value =~ tr/+/ /;
+ $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
+
+ $FORM{$name} = $value || "true";
+}
+
+# accept "url=foo" for backwards compatibility (but uri=foo is preferred)
+$uri = $FORM{uri} || $FORM{url};
+
+if ( ( $uri eq "true" || length( $uri ) == 0 ) &&
+ ( $ENV{REQUEST_URI} =~ /check/ ) ) {
+ &redirect_to_home_page;
+}
+
+if ( $uri !~ /\// ) {
+ $uri .= "/";
+}
+
+if ( $uri =~ /^www/i ) {
+ $uri = "http://$uri";
+}
+
+#############################################################################
+# Output validation results
+#############################################################################
+
+print <<"EOF";
+Content-Type: text/html
+
+$html40t_doctype
+<html>
+
+ <head>
+ <title>W3C HTML Validation Service Results for $uri</title>
+ <link rev="made" href="mailto:gerald\@w3.org">
+ </head>
+
+ <body bgcolor="#FFFFFF" text="#000000" link="#0000ee" vlink="#551a8b">
+
+ <p>
+ <a href="http://www.w3.org/"><img
+ src="http://www.w3.org/Icons/WWW/w3c_home" height=48 border=0
+ alt="W3C"></a>
+ </p>
+
+ <h1>W3C HTML Validation Service Results</h1>
+
+$notice$debugmessage
+EOF
+
+print <<"EOF";
+ <p>
+ Here are the <a href="/">W3C HTML Validation Service</a> results
+ for the document at URI:
+ </p>
+
+ <blockquote>
+ <a href="$uri"><code>$uri</code></a>
+ </blockquote>
+
+EOF
+
+#############################################################################
+# Print the "Jump to: " line with fragment-ID links
+#############################################################################
+
+$count = 0;
+$count++ if $FORM{"ss"} eq "true";
+$count++ if $FORM{"sp"} eq "true";
+$count++ if $FORM{"weblint"} eq "true";
+$count++ if $FORM{"outline"} eq "true";
+if ( $count ) {
+ print " <p>\n Jump to: ";
+ if ( $FORM{"weblint"} eq "true" ) {
+ print "<a\n href=\"#weblint\">Weblint Results</a>";
+ $count--;
+ print " or " if ( $count == 1 );
+ print ", " if ( $count > 1 );
+ }
+ if ( $FORM{"outline"} eq "true" ) {
+ print "<a\n href=\"#outline\">Outline</a>";
+ $count--;
+ print " or " if ( $count == 1 );
+ print ", " if ( $count > 1 );
+ }
+ if ( $FORM{"ss"} eq "true" ) {
+ print "<a\n href=\"#source\">Source Listing</a>";
+ $count--;
+ print " or " if ( $count == 1 );
+ print ", " if ( $count > 1 );
+ }
+ if ( $FORM{"sp"} eq "true" ) {
+ print "<a\n href=\"#parse\">Parse Tree</a>";
+ }
+ print ".\n </p>\n\n";
+}
+
+if ( $uri !~ m#^http://# ) {
+ print <<"EOF";
+<p>
+ Sorry, this type of URI is not supported by this service.
+</p>
+
+<p>
+ URIs should be in the form:
+</p>
+
+<blockquote>
+ <code>$abs_svc_uri</code>
+</blockquote>
+
+<p>
+ (There are other types of URIs, too, but only <code>http://</code> URIs
+ are currently supported by this service.)
+</p>
+
+EOF
+
+ &clean_up_and_exit;
+}
+
+# @@ is this completely safe!?
+$uri_escaped = $uri;
+$uri_escaped =~ s/\$/\\\$/g;
+open( URI, "$grabber \"$uri_escaped\" |" ) || die "couldn't retrieve uri: $!";
+@file = <URI>;
+close( URI ) || die "couldn't close uri retrieval pipe: $!";
+
+# skip the request headers
+while ( $_ = shift( @file ) ) {
+ chop;
+ last if /^$/;
+}
+
+# check the response headers
+while ( $_ = shift( @file ) ) {
+ chop;
+ if (/^HTTP\/[0-9\.]+ [0-9][0-9][0-9] .*/ ) {
+ ( ( $httpversion, $response, $message ) =
+ ( /^(HTTP\/[0-9\.]+) ([0-9][0-9][0-9]) (.*)/ ) );
+ }
+ if (/^Location: / ) {
+ ( ( $redirect_uri ) = ( /^Location: (.*)/ ) );
+ $redirect_uri =~ s/\s*$//g; # it has a trailing space sometimes (?)
+ }
+ last if /^$/;
+}
+
+if ( $response != 200 ) {
+ print "<p>\n I got the following unexpected response when trying to ";
+ print "retrieve $uri:\n";
+ print "</p>\n\n<blockquote> <code>$response $message</code>\n</blockquote>\n";
+ $optionstring = &build_options;
+ if ( ( $response == "302" ) || ( $response == "301" ) ){
+ # this (server name grabbing) should be moved elsewhere, probably.
+ ($server_name) = ($uri =~ /^http:\/\/([^\/]*)/i);
+
+ if ( $redirect_uri !~ /:\/\// ) {
+ $whiney_location_message = qq{
+<p>
+ <strong>Note</strong>: the HTTP server at $server_name is returning broken
+ "Location:" headers. According to <a
+ href="http://www.w3.org/Protocols/">the HTTP specifications</a>,
+ the Location header should be an absolute URI; this server is returning
+ relative URIs instead. If you are the maintainer of this server,
+ please arrange for this bug to be fixed.
+</p>
+};
+ $redirect_uri = "http://$server_name$redirect_uri";
+ }
+
+ print <<"EOF";
+
+<p>
+ This indicates that the server has redirected the request to a different
+ URI.
+</p>
+$whiney_location_message
+<p>
+ The URI it was redirected to is:
+</p>
+
+<blockquote>
+ <a href="/check?uri=$redirect_uri$optionstring">$redirect_uri</a>
+</blockquote>
+
+EOF
+
+ }
+ elsif ( $response == "401" ) {
+ print <<"EOF";
+
+<p>
+ Sorry, I am not authorized to access the specified URI.
+</p>
+
+<p>
+ This service can only validate pages which are available somewhere
+ on the publicly-accessible Internet. In the future this service may
+ be enhanced to accept a username/password combination to be proxied
+ to your server for authentication purposes.
+</p>
+EOF
+
+ }
+ else {
+ print "<p>\n Please make sure you have entered the URI correctly.\n</p>";
+ }
+ &clean_up_and_exit;
+}
+
+print <<'EOF';
+ <hr>
+ <h2><a name="validation">HTML Validation Results</a></h2>
+
+ <p>
+ Below are the results of attempting to parse this document with
+ an SGML parser.
+ </p>
+EOF
+
+$no_doctype=1;
+foreach $line (@file) {
+ if ( $line =~ /<[a-z].*<!doctype/i ) {
+ $no_doctype = 1;
+ last;
+ }
+ if ( $line =~ /<!doctype/i ) {
+ $no_doctype = 0;
+ last;
+ }
+ if ( $line =~ /<[a-z]/i ) {
+ $no_doctype = 1;
+ last;
+ }
+}
+
+# do several loops of increasing lengths to avoid iterating over
+# the whole file if possible.
+#
+# these heuristics could be improved a lot.
+if ( $no_doctype ) {
+ foreach $line (@file[0..20]) {
+ if ( $line =~ /<frame/i ) {
+ $default_doctype = $html40f_doctype;
+ $guessed_doctype_already = 1;
+ last;
+ }
+ }
+}
+
+if ( $no_doctype && ! $guessed_doctype_already ) {
+ foreach $line (@file[0..20]) {
+ if ( $line =~ /<table/i ) {
+ $default_doctype = $html40t_doctype;
+ $guessed_doctype_already = 1;
+ last;
+ }
+ if ( $line =~ /<body /i ) {
+ $default_doctype = $html40t_doctype;
+ $guessed_doctype_already = 1;
+ last;
+ }
+ }
+}
+
+if ( $no_doctype && ! $guessed_doctype_already ) {
+ foreach $line (@file) {
+ if ( $line =~ /<table/i ) {
+ $default_doctype = $html40t_doctype;
+ $guessed_doctype_already = 1;
+ last;
+ }
+ if ( $line =~ /<body /i ) {
+ $default_doctype = $html40t_doctype;
+ $guessed_doctype_already = 1;
+ last;
+ }
+ }
+}
+
+if ( $no_doctype && ! $guessed_doctype_already ) {
+ foreach $line (@file) {
+ if ( $line =~ /<center>/i ) {
+ $default_doctype = $html32_doctype;
+ last;
+ }
+ if ( $line =~ /<[h0-9p]*\s*align\s*=\s*center>/i ) {
+ $default_doctype = $html32_doctype;
+ last;
+ }
+ }
+}
+
+if ( $FORM{tidy} eq "true" ) {
+ open( CHECKER,
+ "| $tidy | $sp -E0 -m $sgmlstuff/catalog $sgmldecl - >$temp.esis 2>$temp" )
+ || die "couldn't open checker: $!";
+}
+else {
+ open( CHECKER,
+ "| $sp -E0 -m $sgmlstuff/catalog $sgmldecl - >$temp.esis 2>$temp" )
+ || die "couldn't open checker: $!";
+}
+
+print CHECKER "$default_doctype\n" if $no_doctype;
+# this is a kludge for DOS users with their entire file on a single line
+# like http://bogo.w3.org/test/samuels.html
+if ( $#file == 0 ) {
+ @file = (split(/ /,$file[0]));
+ for (0..$#file) {
+ $file[$_] .= "\n";
+ }
+}
+# kludge for other DOS users with CRLFs
+for (@file) {
+ s/ +$//;
+ print CHECKER;
+}
+# print CHECKER @file;
+close( CHECKER ) || "couldn't close checker";
+
+open( ERRORS, "< $temp" ) || die "couldn't open checker results: $!";
+@errors = <ERRORS>;
+close( ERRORS ) || die "couldn't close checker results: $!";
+
+open( ESIS, "$temp.esis" ) || die "couldn't read parser output: $!";
+while (<ESIS>) {
+ next if / IMPLIED$/;
+ next if /^ASDAFORM CDATA /;
+ next if /^ASDAPREF CDATA /;
+ push(@esis,$_);
+}
+close( ESIS ) || die "couldn't close parser output: $!";
+
+for (@esis) {
+ # grab the doctype, etc.
+ next unless /^AVERSION CDATA (.*)/;
+ $fpi = $1;
+}
+$version = $pub_ids{$fpi} || "unknown";
+
+if ( $no_doctype ) {
+ push( @fake_errors, "nsgmls:<OSFD>0:2:1:E: Missing DOCTYPE declaration at start of document\n" );
+}
+
+if ( $no_doctype ) {
+
+ $escaped_doctype = $default_doctype;
+ $escaped_doctype =~ s/" "/"\n "/;
+ $escaped_doctype =~ s/&/\&amp;/g;
+ $escaped_doctype =~ s/</\&lt;/g;
+
+ print <<"EOF";
+ <p>
+ <strong>Note</strong>: This document didn't start with the required
+ DOCTYPE declaration, so I inserted the following doctype before
+ attempting to validate the page:
+ </p>
+
+ <pre>
+ $escaped_doctype
+ </pre>
+
+ <p>
+ This document needs a doctype before it will be valid.
+ </p>
+EOF
+}
+
+print "\n <p>\n Version of HTML selected: <b>$version</b>.\n </p>\n";
+
+if ( $? || $no_doctype ) {
+ print "<pre>\n";
+ for ((@fake_errors,@errors)) {
+ next if /^<OSFD>0:[0-9]+:[0-9]+:[^A-Z]/;
+ next if / numbers exceeding 65535 not supported$/;
+ s/.*<OSFD>//g;
+ if ( ! (($line, $col, $type, $msg)=(/^[^:]*:([0-9]+):([0-9]+):([A-Z]?):? (.*)/))) {
+ print "Uh oh! I got the following unknown error:\n\n $_\n\n";
+ print "Please make sure you specified the DOCTYPE properly!\n\n";
+ &output_doctype_spiel;
+ last;
+ }
+ if ( $msg =~ /^cannot generate system identifier for entity / ) {
+ print " <b>Fatal error</b>! $msg\n";
+ print "</pre>\n<p>I couldn't parse this document, because it " .
+ "uses a public\n identifier that's not in my <a\n " .
+ " href=\"lib/catalog\">catalog</a>!\n </p>\n";
+ &output_doctype_spiel;
+ print "<pre>"; # so the </pre> we print later gets re-started
+ last;
+ }
+ if ( $msg =~ /^cannot open / ) {
+ print " Fatal error! $msg\n";
+ print "</pre>\n<p>I couldn't parse this document, because it " .
+ "makes reference to\n a system-specific file instead of " .
+ "simply using a public identifier\n to specify the " .
+ "level of HTML being used.\n </p>\n";
+ &output_doctype_spiel;
+ print "<pre>"; # so the </pre> we print later gets re-started
+ last;
+ }
+ $extraspaces = ""; # in case we put "(truncated)" gif on LHS
+ $line-- if $no_doctype;
+ $newline = $file[$line-1];
+
+ # make sure there's no ^P or ^Q's in the file, since we need to use
+ # them to represent '<' and '>' temporarily.
+ $newline =~ s/${lt}/P/go; $newline =~ s/${gt}/Q/g;
+
+ if ( length( $newline ) > 70 ) {
+ if ( $col < 25 ) {
+ # truncate source line at 70 chars (truncate right side only)
+ $newline = substr( $newline, 0, 70 ) . "$contchars" . "\n";
+ }
+ elsif ( $col > 70 ) {
+ # keep rightmost 70 chars; adjust $col accordingly
+ # (truncate left side only)
+ $diff = $col - 50;
+ $newline = "$contchars" . substr( $newline, $diff, 70 );
+ if ( length( $newline ) == (70 + length( "$contchars" )) ) {
+ $newline .= "$contchars" . "\n";
+ }
+ if ( $col > $diff ) {
+ $col -= $diff;
+ }
+ else {
+ $col -= 70;
+ }
+ $extraspaces = " " x 8;
+ }
+ else {
+ # truncate source line on both sides; leave more source text
+ # on left, and about 30 chars on right side. Also, adjust $col.
+ if ( $col < 35 ) {
+ $newline = "$contchars" . substr( $newline, 0, 60 );
+ }
+ else {
+ $newline = "$contchars" . substr( $newline, $col - 35, 60 );
+ $col = 35;
+ }
+ if ( length( $newline ) == ( 60 + length( "$contchars" ))) {
+ $newline .= "$contchars" . "\n";
+ }
+ $extraspaces = " " x 8;
+ }
+ }
+
+ # figure out the index into the %frag associative array for the
+ # "explanation..." links to the KGV FAQ.
+ $msgindex = $msg;
+ $msgindex =~ s/"[^"]+"/FOO/g;
+ $msgindex =~ s/[^A-Za-z ]//;
+
+ $out = "${lt}hr${gt}\n\nError at line $line:\n $newline";
+ if ( length( $msg ) < $col ) { # does it fit in front?
+ $out .= "$extraspaces " . ' ' x ($col-length($msg)) .
+ "$msg $rightarrow";
+ }
+ else {
+ if ( ( length( $msg ) + $col ) > 60 ) {
+ if ( $msg =~ /,/ ) {
+ $msg =~ s/,/,\n /;
+ }
+ else {
+ if ( ( (length( $msg ) / 2) + $col ) > 60 ) {
+ $pos = index( $msg, ' ', length($msg)/4 );
+ $indent = " " x (65-length($msg)*3/4);
+ }
+ else {
+ $pos = index( $msg, ' ', length($msg)/2 );
+ $indent = " " x ($col + 4);
+ }
+ $msg = substr( $msg, 0, $pos ) .
+ "\n$indent" . substr( $msg, $pos );
+ }
+ }
+ $out .= "$extraspaces " . ' ' x ($col-1) . "$leftarrow $msg";
+ }
+
+ if ( defined $frag{$msgindex} ) {
+ $out .=
+ " (${lt}a href=\"$faqerrloc#$frag{$msgindex}\"${gt}explanation...${lt}/a${gt})";
+ }
+ else { # remember msgindexes without frags, to get the KGV FAQ updated.
+ $undef_frag{$msgindex} = 1;
+ }
+
+ $out .= "\n";
+ $out =~ s/&/&amp;/go; $out =~ s/</&lt;/go; $out =~ s/>/&gt;/go;
+# the following kludge is so the <img src>s don't get sgml-proofed above
+ $out =~ s/${lt}/</g; $out =~ s/${gt}/>/g;
+ print $out;
+ }
+ print "</pre>\n";
+ print "<hr>\n";
+ if ( $version eq "unknown" ) {
+ print "\n <p>\n Sorry, I can't validate this document.\n </p>\n";
+ }
+ else {
+ print "\n <p>\n Sorry, this document does not validate as $version.\n </p>\n";
+ }
+ $validity="invalid";
+}
+else {
+ print "\n <pre>\n No errors found!</pre>\n";
+ if ( $version ne "unknown" ) {
+ if ( $version =~ /^HTML 2\.0$/ ) {
+ $gifname = "vh20.gif";
+ $alttext = "Valid HTML 2.0!";
+ $gifborder = "";
+ }
+ elsif ( $version =~ /HTML 3\.2</ ) {
+ $gifname = "vh32.gif";
+ $alttext = "Valid HTML 3.2!";
+ $gifhw = " HEIGHT=31 WIDTH=88";
+ }
+ elsif ( $version =~ /HTML 4\.0<\/a>$/ ) {
+ $gifname = "vh40.gif";
+ $alttext = "Valid HTML 4.0!";
+ $gifborder = "";
+ $gifhw = " HEIGHT=31 WIDTH=88";
+ }
+ elsif ( $version =~ /HTML 4\.0<\/a> / ) {
+ $gifname = "vh40.gif";
+ $alttext = "Valid HTML 4.0!";
+ $gifhw = " HEIGHT=31 WIDTH=88";
+ }
+ elsif ( $version =~ /HTML 3\.0/ ) {
+ $gifname = "vh30.gif";
+ $alttext = "Valid HTML 3.0!";
+ }
+ elsif ( $version =~ /Netscape/ ) {
+ $gifname = "vhns.gif";
+ $alttext = "Valid Netscape-HTML!";
+ }
+ elsif ( $version =~ /Hotjava/ ) {
+ $gifname = "vhhj.gif";
+ $alttext = "Valid Hotjava-HTML!";
+ }
+ if ( defined $gifname ) {
+ $nicegifname = $gifname;
+ $nicegifname =~ s/</\&lt;/g; $nicegifname =~ s/&/\&amp;/g;
+ print <<"EOHD";
+ <p>
+ <img src="$rel_img_uri$gifname" alt="$alttext"> Congratulations, this
+ document validates as $version!
+ </p>
+
+ <p>
+ To show your readers that you've taken the care to create an
+ interoperable Web page, you may display this icon on any page
+ that validates. Here is the HTML you could use to add this icon
+ to your Web page:
+ </p>
+ <pre>
+ &lt;P&gt;&lt;A HREF="$abs_svc_uri"&gt;&lt;IMG$gifborder
+ SRC="$abs_img_uri$nicegifname"
+ ALT="$alttext"$gifhw&gt;&lt;/A&gt;</pre>
+ <p>
+ If you like, you can <a href="$rel_img_uri$gifname">download a copy of this
+ image</a> to keep in your local web directory, and change the HTML fragment
+ above to reference your local image rather than the one on this server.
+ </p>
+EOHD
+ }
+ }
+ if ( ( $version eq "unknown" ) || ( ! defined $gifname ) ) {
+ print " <p>\n Congratulations, this document validates as the document type specified! (I don't have an icon for this one yet, sorry.)\n </p>\n";
+ }
+
+ $escaped_uri = $uri;
+ $escaped_uri =~ s/=/%3D/g;
+ $escaped_uri =~ s/\&/%26/g;
+ $escaped_uri =~ s/;/%3B/g;
+ $escaped_uri =~ s/,/%2C/g;
+ # ugh
+
+ $thispage = "${abs_svc_uri}check?uri=$escaped_uri";
+ $thispage .= &build_options;
+ print <<"EOHD";
+
+ <p>
+ If you would like to create a link to <em>this</em> page (i.e., this
+ validation result) to make it easier to re-validate this page in the
+ future or to allow others to validate your page, the URI is:
+ </p>
+
+ <blockquote>
+ <code>$thispage</code>
+ </blockquote>
+
+ <p>
+ (Or, you can just add the current page to your bookmarks or hotlist.)
+ </p>
+
+EOHD
+
+ $validity="valid";
+}
+
+$validation_return_code = $?;
+
+if ( $FORM{"weblint"} eq "true" ) {
+
+ if ( $FORM{"pw"} eq "true" ) {
+ $pedanticflags = "-pedantic -e mailto-link";
+ $pedantic_blurb = " (in \"pedantic\" mode)";
+ }
+ else {
+ $pedanticflags = "";
+ }
+
+ print <<"EOF";
+ <hr>
+ <h2><a name="weblint">Weblint Results</a></h2>
+
+ <p>
+ Below are the results of running <a
+ href="http://www.cre.canon.co.uk/~neilb/weblint/">Weblint</a>
+ on this document$pedantic_blurb:
+ </p>
+EOF
+
+ open( WEBLINT,
+ "| $weblint -s $pedanticflags - 2>&1 >$temp.weblint" )
+ || die "couldn't open weblint: $!";
+ print WEBLINT @file;
+ close( WEBLINT ) || "couldn't close weblint: $!";
+
+ print "\n\n";
+ if ( $? ) {
+ print " <ul>\n";
+
+ open( WEBLINTOUT, "$temp.weblint" )
+ || die "couldn't open weblint results in $temp: $!";
+
+ while (<WEBLINTOUT>) {
+ s/ \(use "-x <extension>" to allow this\)\.$/./go;
+ s/&/&amp;/go;
+ s/</&lt;/go;
+ s/>/&gt;/go;
+ print " <li>$_";
+ }
+
+ close( WEBLINTOUT ) || die "couldn't close weblint results: $!";
+ print " </ul>\n";
+ }
+ else {
+ print "\n <blockquote>\n Looks good to me!\n </blockquote>\n";
+ }
+ print "\n\n";
+}
+
+if ( $FORM{"outline"} eq "true" ) {
+ print <<'EOF';
+ <hr>
+ <h2><a name="outline">Outline</a></h2>
+
+ <p>
+ Below is an outline for this document, automatically generated from the
+ heading tags (<code>&lt;H1&gt;</code> through <code>&lt;H6&gt;</code>.)
+ </p>
+EOF
+
+ $prevlevel = 0;
+ $indent = 0;
+ for (1..$#esis) {
+ $line = $esis[$_];
+ next if / IMPLIED$/;
+ next if /^ASDAFORM CDATA /;
+ next if /^ASDAPREF CDATA /;
+ next unless $line =~ /^\(H[1-6]$/;
+ $prevlevel = $level;
+ $level = substr( $line, 2, 1 );
+
+ print " </ul>\n" x ( $prevlevel - $level ); # perl is so cool.
+ if ( $level - $prevlevel == 1 ) {
+ print " <ul>\n";
+ }
+ foreach $i ( ($prevlevel+1) .. ($level-1)) {
+ print " <ul>\n <li><em>A level $i heading is missing!</em>\n";
+ }
+ if ( $level - $prevlevel > 1 ) {
+ print " <ul>\n";
+ }
+
+ $line = "foo"; $heading = "";
+ while ( substr( $line, 0, 3 ) ne ")H$level" ) {
+ $line = $esis[$_++];
+ if ( $line =~ /^-/ ) {
+ $headcont = $line;
+ substr( $headcont, 0, 1 ) = " ";
+ $headcont =~ s/\\n/ /g;
+ $heading .= $headcont;
+ }
+ }
+
+ $heading = substr( $heading, 1 ); # chop the leading '-'
+ $heading =~ s/&/&amp;/go; $heading =~ s/</&lt;/go;
+
+ print " <li>$heading";
+ }
+ print " </ul>\n" x ( $level );
+ print <<'EOF';
+ <p>
+ If this doesn't look like a real outline, it is likely that the
+ heading tags are not being used properly. (Headings should reflect
+ the logical structure of the document; they should not be used simply
+ to add emphasis, or to change the font size.)
+ </p>
+EOF
+}
+
+if ( $FORM{"ss"} eq "true" ) {
+ print <<'EOF';
+ <hr>
+ <h2><a name="source">Source Listing</a></h2>
+
+ <p>
+ Below is the source input I used for this validation:
+ </p>
+EOF
+
+ print "<pre>\n";
+ if ( $no_doctype ) {
+ $dd = "$default_doctype\n";
+ $dd =~ s/&/&amp;/go; $dd =~ s/</&lt;/go; # $dd =~ s/>/&gt;/go;
+ printf "%4d: %s", 0, $dd;
+ }
+ $line = 1;
+ for (@file) {
+ s/&/&amp;/go; s/</&lt;/go; # s/>/&gt;/go;
+ printf "%4d: %s", $line, $_;
+ $line++;
+ }
+ print "</pre>\n";
+}
+
+if ( $FORM{"sp"} eq "true" ) {
+ print <<'EOF';
+ <hr>
+ <h2><a name="parse">Parse Tree</a></h2>
+
+ <p>
+ Below is the parse tree for this document. <em>Note: this feature is
+ still under construction! I'm trying to make this easier to read
+ somehow, with little success.
+EOF
+
+ if ( $FORM{"noatt"} ne "true" ) {
+ print <<'EOF';
+ It helps a bit if you select the
+ "don't show attributes" option on the <a href="./#byURI">form</a>.
+EOF
+ }
+ print "</em>\n </p>\n\n";
+
+ if ( $FORM{"noatt"} eq "true" ) {
+ print <<'EOF';
+ <p>
+ I'm excluding the attributes, as you requested.
+ </p>
+EOF
+ }
+
+ $indent = 0;
+ print "<pre>\n";
+ for (@esis) {
+ if ( $FORM{"noatt"} eq "true" ) {
+ next if /^A/;
+ next if /^\(A$/;
+ next if /^\)A$/;
+ }
+
+# experimental: skip data if it's only newlines and space.
+# next if /^-(\\n|\s+)*$/;
+
+# another way to do the same thing:
+# experimental: skip data if it's only newlines and space.
+s/\\n/ /g; s/\\011/ /g; s/\s+/ /g;
+next if /^-\s*$/;
+
+if ( /^-/ ) {
+ chop;
+ substr( $_, 0, 1 ) = " ";
+# $_ = substr( $_, 1 ); # experimental. previous line works OK.
+ $prevdata .= $_;
+ next;
+}
+else {
+ $prevdata =~ s/&/&amp;/go;
+ $prevdata =~ s/</&lt;/go;
+ $prevdata =~ s/\s+/ /go;
+# should wrap this to 80 columns or whatever. check tchrist's wrapping hack from c.l.p.m.
+ print " " x $indent . "$prevdata\n";
+ undef $prevdata;
+}
+# end of experimental stuff
+
+
+# next unless /^AHREF CDATA /; # this is interesting when uncommented
+ s/&/&amp;/go;
+ s/</&lt;/go;
+ if ( /^\)/ ) {
+ $indent -= 3;
+ }
+
+ chop( $printme = $_ );
+ $printme =~ s/^\((.*)/&lt;$1&gt;\n/;
+ $printme =~ s/^\)(.*)/&lt;\/$1&gt;/;
+ print " " x $indent . "$printme";
+# print " " x $indent . "$_";
+
+ if ( /^\(/ ) {
+ $indent += 3;
+ }
+ }
+ print "</pre>\n";
+}
+
+&clean_up_and_exit;
+
+sub output_doctype_spiel {
+
+ print <<"EOF";
+
+ <p>
+ You should make the first line of your HTML document a DOCTYPE
+ declaration, like this:
+ </p>
+
+ <pre>
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"&gt;
+ &lt;HTML&gt;
+ &lt;HEAD&gt;
+ &lt;TITLE&gt;Title&lt;/TITLE&gt;
+ &lt;/HEAD&gt;
+
+ &lt;BODY&gt;
+ &lt;-- ... body of document ... --&gt;
+ &lt;/BODY&gt;
+ &lt;/HTML&gt;</pre>
+
+ <p>
+ Or, if you are using features from <a
+ href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a>,
+ one of these:
+ </p>
+
+ <pre>
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN"&gt;
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"&gt;
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"&gt;</pre>
+
+EOF
+
+}
+
+sub output_intro_spiel {
+
+ print <<"EOF";
+Content-Type: text/html
+
+$html40t_doctype
+<html>
+
+<head>
+ <title>W3C HTML Validation Service</title>
+ <link rev="made" href="mailto:gerald\@w3.org">
+ <meta name="keywords" content="HTML, Hypertext Markup Language, Validation,
+ W3C HTML Validation Service">
+ <meta name="description" content="W3C's easy-to-use
+ HTML validation service, based on an SGML parser.">
+</head>
+
+<body bgcolor="#FFFFFF" text="#000000" link="#0000ee" vlink="#551a8b">
+
+ <p>
+ <a href="http://www.w3.org/"><img
+ src="http://www.w3.org/Icons/WWW/w3c_home" height=48 border=0
+ alt="W3C"></a>
+ </p>
+
+ <h1>W3C HTML Validation Service</h1>
+
+ <p>
+ Welcome to the W3C HTML Validation Service!
+ See <a href="whatsnew.html"><strong>what's new</strong></a>. (updated
+ June 19, 1998)
+ </p>
+$notice
+ <p>
+ This is an easy-to-use HTML validation service based on an SGML
+ parser. It checks HTML documents for compliance with W3C HTML
+ Recommendations and other HTML standards.
+ </p>
+
+ <p>
+ To use the <a
+ href="http://www.w3.org/TR/REC-html40/">W3C HTML 4.0 Recommendation</a>,
+ start your document with one of the following doctype declarations:
+ </p>
+
+ <pre>
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN"
+ "http://www.w3.org/TR/REC-html40/strict.dtd"&gt;
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+ "http://www.w3.org/TR/REC-html40/loose.dtd"&gt;
+ &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"
+ "http://www.w3.org/TR/REC-html40/frameset.dtd"&gt;
+ </pre>
+
+ <hr>
+ <h2><a name="byURI">Validate Documents by URI</a></h2>
+
+ <p>
+ Enter the URI of a document you would like validated:
+ </p>
+
+ <form method=GET action="/check">
+ URI: <input name="uri" size=50><br>
+ <input name="weblint" type=checkbox value="">Include <a
+ href="http://www.cre.canon.co.uk/~neilb/weblint/">Weblint</a>
+ results
+ <input name="pw" type=checkbox value="">run Weblint
+ in "pedantic" mode<br>
+ <input name="ss" type=checkbox value="">Show source input
+ <input name="outline" type=checkbox value="">Show an outline of this
+ document<br>
+ <input name="sp" type=checkbox value="">Show parse tree
+ <input name="noatt" type=checkbox value="">don't show
+ attributes in the parse tree<br>
+ <input type="submit" value="Submit this URI for validation">
+ <input type="reset" value="Reset this form">
+ </form>
+
+ <hr>
+ <h2><a name="otherInfo">Other sources of information</a></h2>
+
+ <ul>
+ <li><a href="whatsnew.html">Recent changes to this service</a></li>
+ <li><a href="news:comp.infosystems.www.authoring.html">The HTML
+ authoring newsgroup</a> (for help with HTML)
+ <li><a href="lib/catalog">DTDs (document types) supported by this
+ service</a> (the SGML catalog)</li>
+ <li>The <a href="http://www.w3.org/TR/REC-html40/">W3C HTML 4.0
+ Recommendation</a>
+ <li>The <a href="http://www.w3.org/TR/REC-html32">W3C HTML 3.2
+ Recommendation</a>
+
+ </ul>
+
+ <h2><a name="comingSoon">Coming soon</a></h2>
+
+ <ul>
+
+ <li>A comprehensive user manual.
+
+ <li>A feature to allow the DOCTYPE to be temporarily overriden
+ by a user-selectable DTD.
+
+ <li>Support for <a
+ href="http://www.ics.uci.edu/pub/ietf/html/rfc1867.txt">Form-based
+ File upload</a>.</li>
+
+ <li>A text area in which to enter HTML code to be tested.</li>
+
+ </ul>
+
+ <h2><a name="credits">Credits</a></h2>
+
+ <p>
+ This service uses:
+ </p>
+
+ <ul>
+
+ <li>
+ <a href="about.html">Hardware/network connectivity</a> provided by <a
+ href="http://www.compaq.com/">Compaq Computer Corporation</a>.
+ </li>
+
+ <li>
+ <a href="http://www.jclark.com/">James Clark</a>'s excellent <a
+ href="http://www.jclark.com/sp/">SGML parser</a>,
+ </li>
+
+ <li>
+ <a href="http://www.ics.uci.edu/~fielding/">Roy T. Fielding</a>'s <a
+ href="http://www.ics.uci.edu/pub/websoft/libwww-perl/">libwww-perl</a>
+ to retrieve documents,
+ </li>
+
+ <li>
+ <a href="http://www.cre.canon.co.uk/~neilb/">Neil Bowers</a>' HTML
+ style checker, <a
+ href="http://www.cre.canon.co.uk/~neilb/weblint/">Weblint</a>.
+ </li>
+
+ <li>
+ SGML <a href="http://www.cs.duke.edu/~dsb/kgv-faq/errors.html">error
+ explanations</a> maintained by <a
+ href="http://www.cs.duke.edu/~dsb/index.html">Scott Bigham</a>.
+
+ </ul>
+
+EOF
+
+}
+
+sub output_closing {
+
+ print <<"EOF";
+
+<hr>
+
+<address>
+ <a href="http://validator.w3.org/check/referer"><img
+ src="http://validator.w3.org/images/vh40.gif" height=31 width=88
+ align=right border=0 alt="Valid HTML 4.0!"></a>
+ <a href="http://www.w3.org/People/Gerald/">Gerald Oskoboiny</A><br><A
+ href="http://www.w3.org/Help/Webmaster">Webmaster</A><br>
+ \$Date: 1998-07-24 22:13:05 $ \
+</address>
+
+</body>
+
+</html>
+EOF
+
+}
+
+sub build_options {
+
+ $optionstring = "";
+ foreach $option (@options) {
+ $optionstring .= ";$option" if $FORM{$option} eq "true";
+ }
+ $optionstring;
+
+}
+
+sub erase_stuff {
+
+ unlink $temp;
+ unlink "$temp.esis";
+ unlink "$temp.weblint";
+
+}
+
+sub make_log_entry {
+
+ open(LOG,">>$logfile") || die "couldn't append to log: $!";
+ print LOG "$ENV{REMOTE_HOST}\t$validity $version\t$uri\n";
+ foreach $msgindex (keys %undef_frag) {
+ print LOG "frag not defined for msgindex: $msgindex\n";
+ }
+ close( LOG ) || die "couldn't close log: $!";
+
+ if ( length( $ENV{HTTP_REFERER} ) ) {
+ open(RLOG,">>$reflogfile") || die "couldn't append to rlog: $!";
+ print RLOG "$ENV{HTTP_REFERER} -> $ENV{REQUEST_URI}\n";
+ close( RLOG ) || die "couldn't close rlog: $!";
+ }
+
+}
+
+sub clean_up_and_exit {
+
+ &output_closing;
+ &erase_stuff;
+ &make_log_entry;
+ exit;
+
+}
+
+sub redirect_to_home_page {
+
+ print "Status: 302 Moved Permanently\n";
+ print "Content-Type: text/html\n";
+ print "Location: http://validator.w3.org/\n\n";
+ print "<title>Moved!</title>\n";
+ print "<p>\n";
+ print " Please see <a href=\"http://validator.w3.org/\">the validation service's home page.</a>\n";
+ print "</p>\n";
+
+ &clean_up_and_exit;
+
+}