diff options
author | gerald <gerald@localhost> | 1999-02-25 04:21:32 +0000 |
---|---|---|
committer | gerald <gerald@localhost> | 1999-02-25 04:21:32 +0000 |
commit | c38845092e50104ccf4e9910dd5b619c7a309fc4 (patch) | |
tree | 33d9727eea45c9e6b19c73670e6e3cb9e754fb5c | |
parent | 3677222d4fb79ddb699cec80694fa00e686ca666 (diff) | |
download | markup-validator-c38845092e50104ccf4e9910dd5b619c7a309fc4.zip markup-validator-c38845092e50104ccf4e9910dd5b619c7a309fc4.tar.gz markup-validator-c38845092e50104ccf4e9910dd5b619c7a309fc4.tar.bz2 |
finished the addition of Japanese charset-handling, restructured the code
a bit (moved doctype-guessing code to a separate function), misc other
tweaks/fixes in anticipation of handling multiple URIs on one pass
-rwxr-xr-x | httpd/cgi-bin/check | 366 |
1 files changed, 183 insertions, 183 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index 502e9c8..3abcbc3 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -8,18 +8,19 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.10 1999-01-21 08:43:28 gerald Exp $ +# $Id: check,v 1.11 1999-02-25 04:21:32 gerald Exp $ ############################################################################# # Constant definitions ############################################################################# -$cvsrevision = '$Revision: 1.10 $'; -$cvsdate = '$Date: 1999-01-21 08:43:28 $'; +$cvsrevision = '$Revision: 1.11 $'; +$cvsdate = '$Date: 1999-02-25 04:21:32 $'; $logfile = "/var/log/httpd/val-svc"; $reflogfile = "/var/log/httpd/val-svc-referers"; +$uri_def_uri = "http://www.w3.org/Addressing/#terms"; $faqloc = "http://www.cs.duke.edu/~dsb/kgv-faq/"; $faqerrloc = "${faqloc}errors.html"; $abs_svc_uri = "http://validator.w3.org/"; @@ -52,7 +53,6 @@ $nice_html40f_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset/ $html40_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN "http://www.w3.org/TR/REC-html40/strict.dtd">}; $html40t_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">}; $html40f_doctype = qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">}; -$default_doctype = $html2_doctype; $temp = "/tmp/validate.$$"; $lt = "\020"; $gt = "\021"; @@ -213,7 +213,7 @@ $html40t_doctype <html> <head> - <title>W3C HTML Validation Service Results for $uri</title> + <title>W3C HTML Validation Service Results</title> <link rev="made" href="mailto:gerald\@w3.org"> </head> @@ -225,58 +225,11 @@ $html40t_doctype alt="W3C"></a> </p> - <h1>W3C HTML Validation Service Results</h1> + <h1><a href="/">W3C HTML Validation Service</a> Results</h1> $notice$debugmessage EOF -print <<"EOF"; - <p> - Here are the <a href="/">W3C HTML Validation Service</a> results - for the document at URI: - </p> - - <blockquote> - <a href="$uri"><code>$uri</code></a> - </blockquote> - -EOF - -############################################################################# -# Print the "Jump to: " line with fragment-ID links -############################################################################# - -$count = 0; -$count++ if $FORM{"ss"} eq "true"; -$count++ if $FORM{"sp"} eq "true"; -$count++ if $FORM{"weblint"} eq "true"; -$count++ if $FORM{"outline"} eq "true"; -if ( $count ) { - print " <p>\n Jump to: "; - if ( $FORM{"weblint"} eq "true" ) { - print "<a\n href=\"#weblint\">Weblint Results</a>"; - $count--; - print " or " if ( $count == 1 ); - print ", " if ( $count > 1 ); - } - if ( $FORM{"outline"} eq "true" ) { - print "<a\n href=\"#outline\">Outline</a>"; - $count--; - print " or " if ( $count == 1 ); - print ", " if ( $count > 1 ); - } - if ( $FORM{"ss"} eq "true" ) { - print "<a\n href=\"#source\">Source Listing</a>"; - $count--; - print " or " if ( $count == 1 ); - print ", " if ( $count > 1 ); - } - if ( $FORM{"sp"} eq "true" ) { - print "<a\n href=\"#parse\">Parse Tree</a>"; - } - print ".\n </p>\n\n"; -} - if ( $uri !~ m#^http://# ) { print <<"EOF"; <p> @@ -322,17 +275,26 @@ while ( $_ = shift( @file ) ) { ( ( $httpversion, $response, $message ) = ( /^(HTTP\/[0-9\.]+) ([0-9][0-9][0-9]) (.*)/ ) ); } - if (/^Location: / ) { - ( ( $redirect_uri ) = ( /^Location: (.*)/ ) ); + if (/^Last-Modified: /i ) { + ( $lastmod ) = ( /^Last-Modified: (.*)/i ); + } + if (/^Content-Length: /i ) { + ( $length ) = ( /^Content-Length: (.*)/i ); + } + if (/^Server: /i ) { + ( $server ) = ( /^Server: (.*)/i ); + } + if (/^Location: /i ) { + ( $redirect_uri ) = ( /^Location: (.*)/i ); $redirect_uri =~ s/\s*$//g; # it has a trailing space sometimes (?) } if (/^Content-Type: /i) { - ( ( $content_type ) = ( /^Content-Type: (.*)/i ) ); - ( ( $charset ) = ( $content_type =~ /;\s*charset=(.*)/i ) ); + ( $content_type ) = ( /^Content-Type: (.*)/i ); + ( $http_charset ) = ( $content_type =~ /;\s*charset=(.*)/i ); $content_type =~ s/;.*$//; $content_type =~ s/\s*$//g; - $charset =~ s/;.*//; - $charset =~ s/\s*//g; + $http_charset =~ s/;.*//; + $http_charset =~ s/\s*//g; } last if /^$/; @@ -414,92 +376,20 @@ EOF &clean_up_and_exit; } -print <<'EOF'; - <hr> - <h2><a name="validation">HTML Validation Results</a></h2> - - <p> - Below are the results of attempting to parse this document with - an SGML parser. - </p> -EOF - -$no_doctype=1; -foreach $line (@file) { - if ( $line =~ /<[a-z].*<!doctype/i ) { - $no_doctype = 1; - last; - } - if ( $line =~ /<!doctype/i ) { - $no_doctype = 0; - last; - } - if ( $line =~ /<[a-z]/i ) { - $no_doctype = 1; - last; - } -} - -# do several loops of increasing lengths to avoid iterating over -# the whole file if possible. -# -# these heuristics could be improved a lot. -if ( $no_doctype ) { - foreach $line (@file[0..20]) { - if ( $line =~ /<frame/i ) { - $default_doctype = $html40f_doctype; - $guessed_doctype_already = 1; - last; - } - } -} + $jump_links = &build_jump_links; + $count = 1; # @@ should loop over many uris instead -if ( $no_doctype && ! $guessed_doctype_already ) { - foreach $line (@file[0..20]) { - if ( $line =~ /<table/i ) { - $default_doctype = $html40t_doctype; - $guessed_doctype_already = 1; - last; - } - if ( $line =~ /<body /i ) { - $default_doctype = $html40t_doctype; - $guessed_doctype_already = 1; - last; - } - } -} +print <<"EOF"; + <h2><a name="doc$count">Document Checked</a></h2> -if ( $no_doctype && ! $guessed_doctype_already ) { - foreach $line (@file) { - if ( $line =~ /<table/i ) { - $default_doctype = $html40t_doctype; - $guessed_doctype_already = 1; - last; - } - if ( $line =~ /<body /i ) { - $default_doctype = $html40t_doctype; - $guessed_doctype_already = 1; - last; - } - } -} +$jump_links +EOF -if ( $no_doctype && ! $guessed_doctype_already ) { - foreach $line (@file) { - if ( $line =~ /<center>/i ) { - $default_doctype = $html32_doctype; - last; - } - if ( $line =~ /<[h0-9p]*\s*align\s*=\s*center>/i ) { - $default_doctype = $html32_doctype; - last; - } - } -} +$guessed_doctype = &check_for_doctype( \@file ); foreach $line (@file) { # @@ needs to handle meta elements that span more than one line - if ( $line =~ /<meta/i ) { + if ( $line =~ /<meta/i ) { if ( $line =~ /charset\s*=[\s"]*([^\s;">]*)/i ) { $meta_charset = $1; last; @@ -507,38 +397,54 @@ foreach $line (@file) { } } -print "<p>\n debug: [$charset],[$meta_charset]\n</p>"; - -if ($charset ne '') { - if ($meta_charset ne '' && $charset !~ /$meta_charset/i) { - print "<p>\n <strong>Warning:</strong> Charset in HTTP header and in META tag are different.\n</p>"; - print "<p>\n $charset != $meta_charset\n</p>"; +if ( $http_charset ne '' ) { + $effective_charset = $http_charset; + if ( $meta_charset ne '' && $http_charset !~ /$meta_charset/i ) { + # @@ the above needs work + $charsets_differ = 1; + } +} +else { + if ( $meta_charset ne '' ) { + $effective_charset = $meta_charset; } -} else { - if ($meta_charset ne '') { - $charset = $meta_charset; - } else { - print "<p>\n <strong>Note:</strong> Charset is unknown.\n</p>"; + else { + $effective_charset = "unknown"; } } -if ( $charset =~ /iso-2022-jp/i ) { +if ( $effective_charset =~ /iso-2022-jp/i ) { $codeconv = "$nkf -Jex | "; } -elsif ( $charset =~ /Shift_JIS/i ) { +elsif ( $effective_charset =~ /Shift_JIS/i ) { $codeconv = "$nkf -Sex | "; } else { $codeconv = ""; } -print "<p>\n debug: [$charset],[$codeconv]\n</p>"; +print qq{<ul>\n <li><a href="$uri_def_uri">URI</a>: <a href="$uri">$uri</a>\n}; + +if ( defined $lastmod ) { + print qq{ <li>Last modified: $lastmod\n}; +} + +# @@ add a "verbose" option or something on the advanced form, then +# display a bunch of stuff like this if it's selected +# +# if ( defined $server ) { +# print qq{<li>Server: $server\n}; +# } +# +# if ( defined $length ) { +# print qq{<li>Content length: $length\n}; +# } open( CHECKER, "| $codeconv $sp -E0 -m $sgmlstuff/catalog - >$temp.esis 2>$temp" ) || die "couldn't open checker: $!"; -print CHECKER "$default_doctype\n" if $no_doctype; +print CHECKER "$guessed_doctype\n" if $guessed_doctype; # this is a kludge for DOS users with their entire file on a single line # like http://bogo.w3.org/test/samuels.html if ( $#file == 0 ) { @@ -575,37 +481,44 @@ for (@esis) { } $version = $pub_ids{$fpi} || "unknown"; -if ( $no_doctype ) { - push( @fake_errors, "nsgmls:<OSFD>0:2:1:E: Missing DOCTYPE declaration at start of document\n" ); +if ( $guessed_doctype ) { + push( @fake_errors, "nsgmls:<OSFD>0:2:1:E: Missing DOCTYPE declaration at start of document (${lt}a href=\"/docs/doctypes.html\"${gt}explanation...${lt}/a${gt})\n" ); } -if ( $no_doctype ) { +if ( $guessed_doctype ) { + $escaped_doctype = $guessed_doctype; + $escaped_doctype =~ s/" "/"\n "/; + $escaped_doctype =~ s/&/\&/g; + $escaped_doctype =~ s/</\</g; +} - $escaped_doctype = $default_doctype; - $escaped_doctype =~ s/" "/"\n "/; - $escaped_doctype =~ s/&/\&/g; - $escaped_doctype =~ s/</\</g; +print qq{ <li>Character encoding: $http_charset\n}; - print <<"EOF"; - <p> - <strong>Note</strong>: This document didn't start with the required - DOCTYPE declaration, so I inserted the following doctype before - attempting to validate the page: - </p> +if ( $charsets_differ ) { + print <<"EOHD"; +<br> + <strong>Warning:</strong> the character encoding specified in the HTTP header + (<code>$http_charset</code>) is different from the one specified in the META + element (<code>$meta_charset</code>). + I will use <code>$effective_charset</code> for this validation. - <pre> - $escaped_doctype - </pre> +EOHD - <p> - This document needs a doctype before it will be valid. - </p> -EOF } -print "\n <p>\n Version of HTML selected: <b>$version</b>.\n </p>\n"; +print " <li>Level of HTML: <b>$version</b>.\n"; + +print "</ul>\n\n"; -if ( $? || $no_doctype ) { +print <<"EOHD"; + <p> + Below are the results of attempting to parse this document with + an SGML parser. + </p> + +EOHD + +if ( $? || $guessed_doctype ) { print "<pre>\n"; for ((@fake_errors,@errors)) { next if /^<OSFD>0:[0-9]+:[0-9]+:[^A-Z]/; @@ -637,7 +550,7 @@ if ( $? || $no_doctype ) { last; } $extraspaces = ""; # in case we put "(truncated)" gif on LHS - $line-- if $no_doctype; + $line-- if $guessed_doctype; $newline = $file[$line-1]; # make sure there's no ^P or ^Q's in the file, since we need to use @@ -739,7 +652,7 @@ if ( $? || $no_doctype ) { $validity="invalid"; } else { - print "\n <pre>\n No errors found!</pre>\n"; + print "\n <pre>\n No errors found!</pre>\n\n"; if ( $version ne "unknown" ) { if ( $version =~ /^HTML 2\.0$/ ) { $gifname = "vh20.gif"; @@ -959,10 +872,10 @@ if ( $FORM{"ss"} eq "true" ) { EOF print "<pre>\n"; - if ( $no_doctype ) { - $dd = "$default_doctype\n"; - $dd =~ s/&/&/go; $dd =~ s/</</go; # $dd =~ s/>/>/go; - printf "%4d: %s", 0, $dd; + if ( $guessed_doctype ) { + $gd = "$guessed_doctype\n"; + $gd =~ s/&/&/go; $gd =~ s/</</go; # $gd =~ s/>/>/go; + printf "%4d: %s", 0, $gd; } $line = 1; for (@file) { @@ -1293,7 +1206,7 @@ sub clean_up_and_exit { &output_closing; &erase_stuff; - &make_log_entry; +# &make_log_entry; exit; } @@ -1311,3 +1224,90 @@ sub redirect_to_home_page { &clean_up_and_exit; } + +sub build_jump_links { + + my $text; + my $count = 0; + + $count++ if $FORM{ss} eq "true"; + $count++ if $FORM{sp} eq "true"; + $count++ if $FORM{weblint} eq "true"; + $count++ if $FORM{outline} eq "true"; + + if ( $count ) { + $text .= " <p>\n Jump to: "; + if ( $FORM{"weblint"} eq "true" ) { + $text .= "<a\n href=\"#weblint\">Weblint Results</a>"; + $count--; + $text .= " or " if ( $count == 1 ); + $text .= ", " if ( $count > 1 ); + } + if ( $FORM{"outline"} eq "true" ) { + $text .= "<a\n href=\"#outline\">Outline</a>"; + $count--; + $text .= " or " if ( $count == 1 ); + $text .= ", " if ( $count > 1 ); + } + if ( $FORM{"ss"} eq "true" ) { + $text .= "<a\n href=\"#source\">Source Listing</a>"; + $count--; + $text .= " or " if ( $count == 1 ); + $text .= ", " if ( $count > 1 ); + } + if ( $FORM{"sp"} eq "true" ) { + $text .= "<a\n href=\"#parse\">Parse Tree</a>"; + } + $text .= ".\n </p>\n\n"; + } + return $text; + +} + +sub check_for_doctype { + # check if the document has a doctype; if it doesn't, try to + # guess an appropriate one given the elements used + # + # if the document has a doctype, it returns 0; if not, it + # returns a text string with the inferred doctype + + my $fileref = shift; # a reference to @file, for efficiency + my @file = @$fileref; # dereference $fileref + + foreach $line (@file) { + # does an HTML element precede the doctype on the same line? + last if $line =~ /<[a-z].*<!doctype/i; + return 0 if $line =~ /<!doctype/i; # found a doctype + last if ( $line =~ /<[a-z]/i ); # found an element + # @@ this needs to be fixed to handle commented-out markup + # which appears before the doctype + } + + # do several loops of increasing lengths to avoid iterating over + # the whole file if possible. + # + # these heuristics could be improved a lot. + + foreach $line (@file[0..20]) { + return $html40f_doctype if $line =~ /<frame/i; + } + + foreach $line (@file[0..20]) { + return $html40t_doctype if $line =~ /<(table|body )/i; + } + + # go through the whole file + foreach $line (@file) { + return $html40t_doctype if $line =~ /<(table|body )/i; + } + + foreach $line (@file) { + return $html32_doctype if $line =~ /<center>/i; + return $html32_doctype if $line =~ /<[h0-9p]*\s*align\s*=\s*center>/i; + } + + # no luck earlier; guess HTML 4.0 transitional + return $html40t_doctype; + +} + |