- new sub conflict: easy test for conflicts between charsets

- new sub find_xml_encoding (extracting code from main) - implemented strict 'early lowercasing' policy for charsets - eliminating "<p>The command was "$command".</p>" because of no use to end user - Solved utf-16 vs. utf-16be/le problem: except for $File->{Charset}->{Auto} and $result_charset in &transcode_and_check, all instances are labels, and have to match exactly. - Strict test for 'utf-8' (never know who might create utf-88) - renamed validate_encoding to transcode_and_check - renamed validate_charset to charset_conflicts and rewrote it quite a bit (removed some actual functionality, but added more error checks). Removed Norvegian quotes where there were () anyway. Changed 'META element' to '<meta> element' (we don't want to use uc because of xhtml). Needs more error checks to be added. - Added UTF-16 to overrides (leaving be/le out until requested) - moved 'charset override' code, added $File->{Charset}->{Override} - always trying to get <meta> - removed utf-8 bom -> utf-8 default (only okay for xml!) - removed decoding pseudocode, not accurate anyway - fixed copyright -2002 - commented out unused subs parse_xml_decl and transcode - some comment tweaking
author: duerst <duerst@localhost> 2002-08-27 14:52:46 +0000
committer: duerst <duerst@localhost> 2002-08-27 14:52:46 +0000
commit: a16d4dde1b0c2e9101a1c8393e1a1d098085976b (patch)
tree: 8d7acda0373a8fde7dfe22123304f4ed58696823
parent: 84ad203a0798aec9780eb3a3836f14d100aca109 (diff)
download: markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.zip
markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.tar.gz
markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.tar.bz2
1 files changed, 216 insertions, 223 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index 51efd97..165bdb1 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -3,13 +3,13 @@
 # W3C HTML Validation Service
 # A CGI script to retrieve and validate an HTML file
 #
-# Copyright 1995-2001 Gerald Oskoboiny <gerald@w3.org>
+# Copyright 1995-2002 Gerald Oskoboiny <gerald@w3.org>
 # for additional contributors, see http://dev.w3.org/cvsweb/validator/
 #
 # This source code is available under the license at:
 #     http://www.w3.org/Consortium/Legal/copyright-software
 #
-# $Id: check,v 1.215 2002-08-27 03:04:43 duerst Exp $
+# $Id: check,v 1.216 2002-08-27 14:52:46 duerst Exp $
 
 #
 # Disable buffering on STDOUT!
@@ -97,7 +97,7 @@ BEGIN {
 
   #
   # Strings
-  $VERSION    =  q$Revision: 1.215 $;
+  $VERSION    =  q$Revision: 1.216 $;
   $VERSION    =~ s/Revision: ([\d\.]+) /$1/;
 
 
@@ -153,7 +153,7 @@ $File->{'Header'} = &prepSSI({
 			     });
 $File->{'Footer'} = &prepSSI({
 			      File => $CFG->{'Footer'},
-			      Date => q$Date: 2002-08-27 03:04:43 $,
+			      Date => q$Date: 2002-08-27 14:52:46 $,
 			     });
 
 #
@@ -177,11 +177,13 @@ $File->{Env}->{'Self URI'} = $q->url(-query => 0);
 
 #
 # Initialize parameters we'll need (and override) later.
+# (casing policy: lowercase early)
 $File->{Charset}->{Use}  = ''; # The charset used for validation.
-$File->{Charset}->{Auto} = ''; # Autodetected using XML rules.
+$File->{Charset}->{Auto} = ''; # Autodetion using XML rules (Appendix F).
 $File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter.
 $File->{Charset}->{META} = ''; # From HTML's <meta http-equiv>.
 $File->{Charset}->{XML}  = ''; # From the XML Declaration.
+$File->{Charset}->{Override}=''# From override.
 
 #
 # Array (ref) used to store character offsets for the XML report.
@@ -198,18 +200,18 @@ $q = &prepCGI($File, $q);
 
 #
 # Set session switches.
-$File->{Opt}->{'Outline'}        = $q->param('outline') ? TRUE                 :  FALSE;
-$File->{Opt}->{'Show Source'}    = $q->param('ss')      ? TRUE                 :  FALSE;
-$File->{Opt}->{'Show Parsetree'} = $q->param('sp')      ? TRUE                 :  FALSE;
-$File->{Opt}->{'No Attributes'}  = $q->param('noatt')   ? TRUE                 :  FALSE;
-$File->{Opt}->{'Show ESIS'}      = $q->param('esis')    ? TRUE                 :  FALSE;
-$File->{Opt}->{'Show Errors'}    = $q->param('errors')  ? TRUE                 :  FALSE;
-$File->{Opt}->{'Verbose'}        = $q->param('verbose') ? TRUE                 :  FALSE;
-$File->{Opt}->{'Debug'}          = $q->param('debug')   ? TRUE                 :  FALSE;
-$File->{Opt}->{'Charset'}        = $q->param('charset') ? $q->param('charset') :     '';
-$File->{Opt}->{'DOCTYPE'}        = $q->param('doctype') ? $q->param('doctype') :     '';
-$File->{Opt}->{'URI'}            = $q->param('uri')     ? $q->param('uri')     :     '';
-$File->{Opt}->{'Output'}         = $q->param('output')  ? $q->param('output')  : 'html';
+$File->{Opt}->{'Outline'}        = $q->param('outline') ? TRUE                   :  FALSE;
+$File->{Opt}->{'Show Source'}    = $q->param('ss')      ? TRUE                   :  FALSE;
+$File->{Opt}->{'Show Parsetree'} = $q->param('sp')      ? TRUE                   :  FALSE;
+$File->{Opt}->{'No Attributes'}  = $q->param('noatt')   ? TRUE                   :  FALSE;
+$File->{Opt}->{'Show ESIS'}      = $q->param('esis')    ? TRUE                   :  FALSE;
+$File->{Opt}->{'Show Errors'}    = $q->param('errors')  ? TRUE                   :  FALSE;
+$File->{Opt}->{'Verbose'}        = $q->param('verbose') ? TRUE                   :  FALSE;
+$File->{Opt}->{'Debug'}          = $q->param('debug')   ? TRUE                   :  FALSE;
+$File->{Opt}->{'Charset'}        = $q->param('charset') ? lc $q->param('charset'):     '';
+$File->{Opt}->{'DOCTYPE'}        = $q->param('doctype') ? $q->param('doctype')   :     '';
+$File->{Opt}->{'URI'}            = $q->param('uri')     ? $q->param('uri')       :     '';
+$File->{Opt}->{'Output'}         = $q->param('output')  ? $q->param('output')    : 'html';
 
 $DEBUG = $File->{Opt}->{Debug};
 
@@ -246,125 +248,55 @@ untie *STDIN;
 ###############################################################################
 
 
-
-# if (<http charset given>) {
-#   # Use it, transcode, validate...
-# } elsif (<text/...+xml>) { # The one special case.
-#   # Act as if $http_charset was 'us-ascii'.
-# } else {
-#   if (<XML Rec Autodetect>) {
-#     # Trust it, transcode, validate...
-#   } else { # Autodetect failed... Assume ASCII/UTF-8 compatible.
-#     if (<XML encoding given>) {
-#       # Use it, transcode, validate...
-#     } elsif (<meta charset given...>) {
-#       # Use it, transcode, validate...
-#     } else {
-#       # No charset in HTTP, XML, or META.
-#       # Refuse to validate!
-#     }
-#   }
-# }
-
-
-#
-# Find encoding in document according to XML rules
-# Only meaningfull if file contains a BOM, or for well-formed XML!
-{
-  my ($preSize, $postSize);
-
-  ($File->{Charset}->{Auto}, $File->{BOM}, $preSize, $postSize)
-    = &find_base_encoding($File->{Bytes});
-  my $charSize = $preSize + $postSize + 1;
-  my $initSize = $charSize * 100;  # 100 arbitrary, but enough in any case
-  my $someBytes = substr $File->{Bytes}, $File->{BOM}, $initSize;
-  my $someText = '';
-
-  # translate from guessed encoding to ascii-compatible
-  if ($File->{Charset}->{Auto} eq 'EBCDIC') {
-    # special treatment for EBCDIC, maybe use tr///
-    # work on this later
-  }
-  elsif (($preSize + $postSize) == 0) {
-    $someText = $someBytes; # efficiency shortcut
-  }
-  else { # generic code for UTF-16/UCS-4
-    my ($i, $j);
-   LABEL:
-    for ($i=0; $i<=$initSize-$charSize; ) {
-      for ($j=0; $j<$preSize; $i++, $j++) {
-        if ((substr $someBytes, $i, 1) != '\x00') {
-          last LABEL;
-        }
-      }
-      $someText .= substr $someBytes, $i++, 1;
-      for ($j=0; $j<$postSize; $i++, $j++) {
-        if ((substr $someBytes, $i, 1) != '\x00') {
-          chop $someText; # remove last character
-          last LABEL;
-        }
-      }
-    }
-  }
-  # try to find encoding pseudo-attribute
-  $someText =~ m("^<\?xml[ \t\n\r]+version[ \t\n\r]?=[ \t\n\r]?([\'\"])[-._:a-zA-Z0-9]+\1[
-\t\n\r]+encoding[ \t\n\r]?=[ \t\n\r]?([\'\"])([A-Za-z][-._A-Za-z0-9]*)\2);
-
-  $File->{Charset}->{XML} = $3;
-}
-
-#
-# Abort if an error was flagged by charset autodetect.
-# &abort_if_error_flagged($File, 0); #### no errors up to here ?!
+$File = &find_xml_encoding ($File);
 
 #
 # Decide on a charset to use (first part)
 #
-if ($File->{Opt}->{'Charset'} && $File->{Opt}->{'Charset'} ne '(detect automatically)') {
-  ($File->{Charset}->{Use}) = split $File->{Opt}->{'Charset'}, ' ', 1;
-  ## message about 'charset override in effect
-}
-elsif ($File->{Charset}->{HTTP}) {
+if ($File->{Charset}->{HTTP}) {
   $File->{Charset}->{Use} = $File->{Charset}->{HTTP}; # HTTP, if given, is authorative.
 } elsif ($File->{Type} =~ m(^text/\w+\+xml$)) {
   $File->{Charset}->{Use} = 'us-ascii'; # Act as if $http_charset was 'us-ascii'. (MIME rules)
-      &add_warning($File, <<"      .EOF.");
+  &add_warning($File, <<"      .EOF.");
       <em>Note:</em>
       The HTTP Content-Type field did not contain a "charset" attribute,
       but the Content-Type was one of the XML text/* sub-types. The relevant
-      standards specify a strong default of "us-ascii" for such documents so
+      specification (RFC 3023) specifies a strong default of "us-ascii" for such documents so
       we will use this value regardless of any encoding you may have indicated
       elsewhere. If you would like to use a different encoding, you should
       arrange to have your server send this new encoding information.
       .EOF.
 } elsif ($File->{Charset}->{XML}) {
   $File->{Charset}->{Use} = $File->{Charset}->{XML};
-} elsif ($File->{Charset}->{Auto} =~ "^UTF-16" && $File->{BOM} == 2) {
-  $File->{Charset}->{Use} = $File->{Charset}->{Auto};
-} elsif ($File->{Charset}->{Auto} eq "UTF-8"   && $File->{BOM} == 3) {
-  $File->{Charset}->{Use} = "UTF-8";
+} elsif ($File->{Charset}->{Auto} =~ "^utf-16[bl]e$" && $File->{BOM} == 2) {
+  $File->{Charset}->{Use} = 'utf-16';
 } elsif ($File->{Type} =~ m(^application/\w+\+xml$)) {
-  $File->{Charset}->{Use} = "UTF-8";
+  $File->{Charset}->{Use} = "utf-8";
 } 
 
 $File->{Content} = &normalize_newlines($File->{Bytes}, $File->{Charset}->{Use});
 $File->{Content}->[0] = substr $File->{Content}->[0], $File->{BOM}; # remove BOM
-
+#### add warning about BOM in UTF-8
 
 #
-# Decide on a charset to use (continue)
-#
-if (!$File->{Charset}->{Use}) {
-  #
-  # Try to extract META charset.
-  $File = &preparse($File);
-  if ($File->{Charset}->{META}) {
-    $File->{Charset}->{Use} = $File->{Charset}->{META};
-  } else {
+# Try to extract META charset
+# (works only if ascii-based and reasonably clean before <meta>)
+$File = &preparse($File);
+unless ($File->{Charset}->{Use}) {
+  $File->{Charset}->{Use} = $File->{Charset}->{META};
+}
+
+if (&conflict($File->{Opt}->{'Charset'}, '(detect automatically)')) {
+  $File->{Charset}->{Override} = split $File->{Opt}->{'Charset'}, ' ', 1;
+  $File->{Charset}->{Use} = $File->{Charset}->{Override} = lc $File->{Charset}->{Override};
+  # message about 'charset override' in effect comes later
+}
+
+unless ($File->{Charset}->{Use}) {
     $File->{'Error Flagged'} = TRUE;
     $File->{'Error Message'} = <<".EOF.";
     <p>
-      I was not able to extract a character encoding labelling from either of
+      I was not able to extract a character encoding labelling from any of
       the valid sources for such information. Without encoding information it
       is impossible to validate the document. The sources I tried are:
     </p>
@@ -401,10 +333,10 @@ if (!$File->{Charset}->{Use}) {
 
 #
 # Check the detected Encoding and transcode.
-$File = &validate_encoding($File);
+$File = &transcode_and_check($File);
 
 #
-# Abort if an error was flagged during Encoding Validation.
+# Abort if an error was flagged during transcoding
 &abort_if_error_flagged($File, O_SOURCE);
 
 
@@ -452,7 +384,7 @@ $File->{Type} = 'mathml+xml' if $File->{DOCTYPE} =~ /mathml/i;
 
 #
 # Sanity check Charset information and add any warnings necessary.
-$File = &validate_charset($File);
+$File = &charset_conflicts($File);
 
 
 #
@@ -497,6 +429,7 @@ if ($File->{'Is Upload'}) {
 			      -values => [
 					  '(detect automatically)',
 					  'utf-8 (Unicode, worldwide)',
+					  'utf-16 (Unicode, worldwide)',
 					  'iso-8859-1 (Western Europe)',
 					  'iso-8859-2 (Central Europe)',
 					  'iso-8859-3 (Maltese)',
@@ -577,7 +510,7 @@ $ENV{SGML_SEARCH_PATH} = $CFG->{'SGML Library'};
 
 ##
 ## MathML and XHTML. Must be here because they're usually served as text/html
-## to deal with braindead browsers. IOW, these override the check for &is_html.
+## to deal with braindead browsers. In other words, these override the check for &is_html.
 #$catalog = File::Spec->catfile($CFG->{'SGML Library'}, 'xhtml.soc')
 #  if &is_xhtml($File->{Type});
 #$catalog = File::Spec->catfile($CFG->{'SGML Library'}, 'mathml.soc')
@@ -1204,7 +1137,7 @@ sub handle_uri {
 
   $File->{Bytes}           = $res->content;
   $File->{Type}            = $type;
-  $File->{Charset}->{HTTP} = $charset;
+  $File->{Charset}->{HTTP} = lc $charset;
   $File->{Modified}        = $lastmod;
   $File->{Server}          = scalar $res->server;
   $File->{Size}            = scalar $res->content_length;
@@ -1232,7 +1165,7 @@ sub handle_file {
 
   $File->{Bytes}           = $file;
   $File->{Type}            = $type;
-  $File->{Charset}->{HTTP} = $charset;
+  $File->{Charset}->{HTTP} = lc $charset;
   $File->{Modified}        = $h->{'Last-Modified'};
   $File->{Server}          = $h->{'Server'};
   $File->{Size}            = $h->{'Content-Length'};
@@ -1304,11 +1237,11 @@ sub normalize_newlines {
   my $pattern = '';
 
   # patterns are not allowed to have parentheses
-  $pattern = '\x00\015\x00\012|\x00\015|\x00\012' if /^UTF-16BE$/i;
-  $pattern = '\015\x00\012\x00|\015\x00|\012\x00' if /^UTF-16LE$/i;
-  $pattern = '\x00\x00\x00\015\x00\x00\x00\012|\x00\x00\x00\015|\x00\x00\x00\012' if /^UCS-4BE$/i;
-  $pattern = '\015\x00\x00\x00\012\x00\x00\x00|\015\x00\x00\x00|\012\x00\x00\x00' if /^UCS-4LE$/i;
-  # insert other special cases here
+  $pattern = '\x00\015\x00\012|\x00\015|\x00\012' if /^utf-16be$/;
+  $pattern = '\015\x00\012\x00|\015\x00|\012\x00' if /^utf-16le$/;
+  $pattern = '\x00\x00\x00\015\x00\x00\x00\012|\x00\x00\x00\015|\x00\x00\x00\012' if /^UCS-4be$/;
+  $pattern = '\015\x00\x00\x00\012\x00\x00\x00|\015\x00\x00\x00|\012\x00\x00\x00' if /^UCS-4le$/;
+  # insert other special cases here, such as EBCDIC
   $pattern = '\015\012|\015|\012' if !$pattern;    # all other cases
 
   return [split /$pattern/, $file];
@@ -1319,7 +1252,7 @@ sub normalize_newlines {
 # Return $_[0] encoded for HTML entities (cribbed from merlyn).
 sub ent {
   local $_ = shift;
-  s(["<&>"]){'&#' . ord($&) . ';'}ge;
+  s(["<&>"]){'&#' . ord($&) . ';'}ge;  # should switch to hex sooner or later
   return $_;
 }
 
@@ -1417,10 +1350,6 @@ sub parse_errors ($$) {
     }
 
 
-
-
-
-
     # An unknown FPI and no SI.
     if ($err->{msg} =~ m(cannot generate system identifier for entity)
 	or $err->{msg} =~ m(unrecognized ({{)?DOCTYPE(}})?)i
@@ -2081,15 +2010,15 @@ sub is_mathml {shift =~ m(^mathml)};
 
 
 #
-# Sanity check charset info and add any warnings necessary.
-sub validate_charset {
+# Check charset conflicts and add any warnings necessary.
+sub charset_conflicts {
   my $File = shift;
   #
   # Handle the case where there was no charset to be found.
   # (This is a different issue from whether an override was given!)
   unless ($File->{Charset}->{Use}) {
     if (&is_xml($File->{Type})) {
-      $File->{Charset}->{Use} = 'UTF-8'; # @@@TODO@@@: Should detect UTF-8 vs. UTF-16!
+      $File->{Charset}->{Use} = 'UTF-8';
       &add_warning($File, <<"      .EOF.");
       <strong>No Character Encoding detected!</strong>
       To ensure correct validation, processing, and display,
@@ -2114,53 +2043,55 @@ sub validate_charset {
   }
 
   #
-  # If we have a charset field in the request, we use it
-  if ($File->{Opt}->{Charset}) {
-    if ($File->{Opt}->{Charset} =~ m(detect automatically)i) {
-      $File->{Opt}->{Charset} = '';
-    } else {
-      $File->{Opt}->{Charset} =~ /^(\w+)/;
-      $File->{Opt}->{Charset} = lc $1;
-
-      &add_warning($File, <<"      .EOF.");
+  # warn about charset override
+  if (&conflict($File->{Charset}->{Override}, $File->{Charset}->{Use})) {
+    &add_warning($File, <<"      .EOF.");
       <strong>Character Encoding Override in effect!</strong>
       The detected character encoding,
       &#171;<code>$File->{Charset}->{Use}</code>&#187;, has been supressed and the
-      character encoding &#171;<code>$File->{Opt}->{Charset}</code>&#187;
+      character encoding &#171;<code>$File->{Opt}->{Charset}</code>&#187; is
       used instead.
       .EOF.
-      $File->{Tentative} |= T_ERROR;
-      $File->{Charset}->{Use} = $File->{Opt}->{Charset};
-    }
+    $File->{Tentative} |= T_ERROR;
   }
 
   #
-  # Add a warning if there was charset info in the HTTP header, but it was
-  # later overridden by a META element or XML Declaration.
-  if ($File->{Charset}->{HTTP}) {
-    if ($File->{Charset}->{META}) {
-      unless ($File->{Charset}->{META} eq $File->{Charset}->{HTTP}) {
-	&add_warning($File, <<"        .EOF.");
+  # Add a warning if there was charset info conflict (HTTP header,
+  # XML declaration, or <meta> element).
+  if (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) {
+    &add_warning($File, <<"        .EOF.");
       <strong>Character Encoding mismatch!</strong>
-      The character encoding specified in the HTTP header,
-      &#171;<code>$File->{Charset}->{HTTP}</code>&#187;, is different from the
-      value &#171;<code>$File->{Charset}->{META}</code>&#187; in the META element.
-      I will use &#171;<code>$File->{Charset}->{Use}</code>&#187; for this validation.
+      The character encoding specified in the
+        HTTP header (<code>$File->{Charset}->{HTTP}</code>)
+      is different from the value in the
+        XML declaration (<code>$File->{Charset}->{XML}</code>).
+      I will use the value from the
+        HTTP header (<code>$File->{Charset}->{Use}</code>)
+      for this validation.
         .EOF.
-      }
-    } elsif ($File->{Charset}->{XML}) {
-      unless ($File->{Charset}->{XML} eq $File->{Charset}->{HTTP}) {
-	&add_warning($File, <<"        .EOF.");
+  } elsif (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{META})) {
+    &add_warning($File, <<"        .EOF.");
       <strong>Character Encoding mismatch!</strong>
-      The character encoding specified in the HTTP header
-      (&#171;<code>$File->{Charset}->{HTTP}</code>&#187;) is different from the
-      value in the XML declaration (&#171;<code>$File->{Charset}->{XML}</code>&#187;).
-      I will use the value from the XML Declaration
-      (&#171;<code>$File->{Charset}->{Use}</code>&#187;)
+      The character encoding specified in the
+        HTTP header (<code>$File->{Charset}->{HTTP}</code>)
+      is different from the value in the
+        <code>&lt;meta&gt;</code> element (<code>$File->{Charset}->{META}</code>).
+      I will use the value from the
+        HTTP header (<code>$File->{Charset}->{Use}</code>)
+      for this validation.
+        .EOF.
+  }
+  elsif (&conflict($File->{Charset}->{XML}, $File->{Charset}->{META})) {
+    &add_warning($File, <<"        .EOF.");
+      <strong>Character Encoding mismatch!</strong>
+      The character encoding specified in the
+        XML declaration (<code>$File->{Charset}->{XML}</code>)
+      is different from the value in the
+        <code>&lt;meta&gt;</code> element (<code>$File->{Charset}->{META}</code>).
+      I will use the value from the
+        XML declaration (<code>$File->{Charset}->{XML}</code>)
       for this validation.
         .EOF.
-      }
-    }
   }
 
   return $File;
@@ -2169,15 +2100,19 @@ sub validate_charset {
 
 #
 # Check Encoding and Transcode.
-sub validate_encoding {
+sub transcode_and_check {
   my $File = shift;
 
   my @lines;
 
-  unless ($File->{Charset}->{Use} =~ m(utf-8)i) {
+  unless ($File->{Charset}->{Use} eq 'utf-8') {
     my ($command, $result_charset) = split " ", $CFG->{Charsets}->{$File->{Charset}->{Use}}, 2;
 
+    if ($result_charset eq 'utf-16' && $File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
+      $result_charset = $File->{Charset}->{Auto}; # for per-line conversion, need to be exact
+    }
     if ($command eq 'I') {
+      # test if given charset is available
       eval {my $c = Text::Iconv->new($result_charset, 'utf-8')};
       $command = '' if $@;
     } elsif ($command eq 'X') {
@@ -2194,7 +2129,6 @@ sub validate_encoding {
         </p>
         <p>The detected character encoding was "$File->{Charset}->{Use}".</p>
         <p>The error was "$@".</p>
-        <p>The command was "$command".</p>
         <p>
           If you believe the character encoding to be valid you can submit a request for
           that character encoding (see the <a href="feedback.html">feedback page</a>
@@ -2218,7 +2152,7 @@ sub validate_encoding {
   }
 
   # check correctness of UTF-8 both for UTF-8 input and for conversion results
-  if ($File->{Charset}->{Use}) {
+  if ($File->{Charset}->{Use}) {  #### this check seems to be unnecessary
     for (my $i = 0; $i < $#{$File->{Content}}; $i++) {
       # substitution needed for very long lines (>32K), to avoid backtrack
       # stack overflow. Handily, this also happens to count characters.
@@ -2236,6 +2170,7 @@ sub validate_encoding {
       push @lines, ($i+1) if length;
       $count += 0; # Force numeric.
       $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
+      #### replace invalid line with some dummy text
     }
   }
 
@@ -2259,31 +2194,31 @@ sub validate_encoding {
   return $File;
 }
 
-
+#### not used
 #
 # Transcode into UTF-8.
-sub transcode {
-  my $File = shift;
-  my $from = shift;
-
-  my @Result = ();
-  my @lines  = ();
-
-  eval {my $c = Text::Iconv->new($from, 'utf-8')};
-  if ($@) {
-    return FALSE;
-  } else {
-    my $c = Text::Iconv->new($from, 'utf-8');
-    my $line = 0;
-    foreach my $in (@{$File->{Content}}) {
-      $line++;
-      my $out = $c->convert($in);
-      push @lines, $line if ($in and not $out);
-      push @Result, $out;
-    }
-  }
-  return {Data => \@Result, Lines => \@lines};
-}
+#sub transcode {
+#  my $File = shift;
+#  my $from = shift;
+#
+#  my @Result = ();
+#  my @lines  = ();
+#
+#  eval {my $c = Text::Iconv->new($from, 'utf-8')};
+#  if ($@) {
+#    return FALSE;
+#  } else {
+#    my $c = Text::Iconv->new($from, 'utf-8');
+#    my $line = 0;
+#    foreach my $in (@{$File->{Content}}) {
+#      $line++;
+#      my $out = $c->convert($in);
+#      push @lines, $line if ($in and not $out);
+#      push @Result, $out;
+#    }
+#  }
+#  return {Data => \@Result, Lines => \@lines};
+#}
 
 
 #
@@ -2513,9 +2448,7 @@ X-W3C-Validator-Errors: $errs
 
 
 #
-# Find the encoding in use in a document instance.
-#
-# Implements autodetection as in Appendix F of the XML 1.0 Recommendation.
+# Autodetection as in Appendix F of the XML 1.0 Recommendation.
 # <URL:http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing>
 #
 # return values are: (base_encoding, BOMSize, preSize, postSize)
@@ -2523,58 +2456,109 @@ sub find_base_encoding {
   local $_ = shift;
 
   # With a Byte Order Mark:
-  return ('UCS-4BE',  4, 3, 0)
+  return ('ucs-4be',  4, 3, 0)
     if /^\x00\x00\xFE\xFF/; # UCS-4, big-endian machine (1234)
-  return ('UCS-4LE',  4, 0, 3)
+  return ('ucs-4le',  4, 0, 3)
     if /^\xFF\xFE\x00\x00/; # UCS-4, little-endian machine (4321)
-  return ('UTF-16BE', 2, 1, 0)
+  return ('utf-16be', 2, 1, 0)
     if /^\xFE\xFF/;         # UTF-16, big-endian.
-  return ('UTF-16LE', 2, 0, 1)
+  return ('utf-16le', 2, 0, 1)
     if /^\xFF\xFE/;         # UTF-16, little-endian.
-  return ('UTF-8',    3, 0, 0)
+  return ('utf-8',    3, 0, 0)
     if /^\xEF\xBB\xBF/; # UTF-8.
 
   # Without a Byte Order Mark:
-  return ('UCS-4BE',  0, 3, 0)
+  return ('ucs-4be',  0, 3, 0)
     if /^\x00\x00\x00\x3C/; # UCS-4 or 32bit; big-endian machine (1234 order).
-  return ('UCS-4LE',  0, 0, 3)
+  return ('ucs-4le',  0, 0, 3)
     if /^\x3C\x00\x00\x00/; # UCS-4 or 32bit; little-endian machine (4321 order).
-  return ('UTF-16BE', 0, 1, 0)
+  return ('utf-16be', 0, 1, 0)
     if /^\x00\x3C\x00\x3F/; # UCS-2, UTF-16, or 16bit; big-endian.
-  return ('UTF-16LE', 0, 0, 1)
+  return ('utf-16le', 0, 0, 1)
     if /^\x3C\x00\x3F\x00/; # UCS-2, UTF-16, or 16bit; little-endian.
-  return ('UTF-8',    0, 0, 0)
+  return ('utf-8',    0, 0, 0)
     if /^\x3C\x3F\x78\x6D/; # UTF-8, ISO-646, ASCII, ISO-8859-*, Shift-JIS, EUC, etc.
-  return ('EBCDIC',   0, 0, 0)
+  return ('ebcdic',   0, 0, 0)
     if /^\x4C\x6F\xA7\x94/; # EBCDIC
   return ('',         0, 0, 0);
+                            # nothing in particular
 }
 
 
-sub parse_xml_decl {
-  my $f = shift;
-  my $e = '';
-  my $p =  HTML::Parser->new(api_version => 3);
+#
+# Find encoding in document according to XML rules
+# Only meaningfull if file contains a BOM, or for well-formed XML!
+sub find_xml_encoding {
+  my $File = shift;
+  my ($preSize, $postSize);
 
-  my $pi = sub {
-    return if $e;
-    my $pi = shift;
-    return unless $pi =~ m(<\?xml);
-    $pi =~ m(<\?xml[^>]*\sencoding\s*=\s*([\"\'])([A-Za-z][-A-Za-z0-9._]*)\1)s;
-    warn qq(\$1 == '$1' and \$2 == '$2'\n) if $DEBUG;
-    $e = lc $2 if $2;
-    warn "\$e inside == $e\n";
-    $p->eof()  if $e;
-  };
+  ($File->{Charset}->{Auto}, $File->{BOM}, $preSize, $postSize)
+    = &find_base_encoding($File->{Bytes});
+  my $charSize = $preSize + $postSize + 1;
+  my $initSize = $charSize * 100;  # 100 arbitrary, but enough in any case
+  my $someBytes = substr $File->{Bytes}, $File->{BOM}, $initSize;
+  my $someText = '';
 
-  $p->xml_mode(TRUE);
-  $p->handler(process => $pi, 'text');
-  $p->parse($f->{Content});
-  warn "$e\n" if $DEBUG;
-  return $e;
+  # translate from guessed encoding to ascii-compatible
+  if ($File->{Charset}->{Auto} eq 'ebcdic') {
+    # special treatment for EBCDIC, maybe use tr///
+    # work on this later
+  }
+  elsif ($charSize == 1) {
+    $someText = $someBytes; # efficiency shortcut
+  }
+  else { # generic code for UTF-16/UCS-4
+    my ($i, $j);
+   LABEL:
+    for ($i=0; $i<=$initSize-$charSize; ) {
+      for ($j=0; $j<$preSize; $i++, $j++) {
+        if ((substr $someBytes, $i, 1) != '\x00') {
+          last LABEL;
+        }
+      }
+      $someText .= substr $someBytes, $i++, 1;
+      for ($j=0; $j<$postSize; $i++, $j++) {
+        if ((substr $someBytes, $i, 1) != '\x00') {
+          chop $someText; # remove last character
+          last LABEL;
+        }
+      }
+    }
+  }
+  # try to find encoding pseudo-attribute
+  $someText =~ m("^<\?xml[ \t\n\r]+version[ \t\n\r]?=[ \t\n\r]?([\'\"])[-._:a-zA-Z0-9]+\1[
+\t\n\r]+encoding[ \t\n\r]?=[ \t\n\r]?([\'\"])([A-Za-z][-._A-Za-z0-9]*)\2);
+
+  $File->{Charset}->{XML} = lc $3;
+  return $File;
 }
 
 
+#### not used anymore
+#sub parse_xml_decl {
+#  my $f = shift;
+#  my $e = '';
+#  my $p =  HTML::Parser->new(api_version => 3);
+#
+#  my $pi = sub {
+#    return if $e;
+#    my $pi = shift;
+#    return unless $pi =~ m(<\?xml);
+#    $pi =~ m(<\?xml[^>]*\sencoding\s*=\s*([\"\'])([A-Za-z][-A-Za-z0-9._]*)\1)s;
+#    warn qq(\$1 == '$1' and \$2 == '$2'\n) if $DEBUG;
+#    $e = lc $2 if $2;
+#    warn "\$e inside == $e\n";
+#    $p->eof()  if $e;
+#  };
+#
+#  $p->xml_mode(TRUE);
+#  $p->handler(process => $pi, 'text');
+#  $p->parse($f->{Content});
+#  warn "$e\n" if $DEBUG;
+#  return $e;
+#}
+
+
 #
 # Abort with a message if an error was flagged at point.
 sub abort_if_error_flagged {
@@ -2590,3 +2574,12 @@ sub abort_if_error_flagged {
   }
 }
 
+#
+# conflicting encodings
+sub conflict {
+  my $encodingA = shift;
+  my $encodingB = shift;
+  return $encodingA && $encodingB && $encodingA ne $encodingB;
+}
+
+
author	duerst <duerst@localhost>	2002-08-27 14:52:46 +0000
committer	duerst <duerst@localhost>	2002-08-27 14:52:46 +0000
commit	a16d4dde1b0c2e9101a1c8393e1a1d098085976b (patch)
tree	8d7acda0373a8fde7dfe22123304f4ed58696823
parent	84ad203a0798aec9780eb3a3836f14d100aca109 (diff)
download	markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.zip markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.tar.gz markup-validator-a16d4dde1b0c2e9101a1c8393e1a1d098085976b.tar.bz2