replacing several loops with two pattern matches

author: duerst <duerst@localhost> 2002-08-31 04:04:02 +0000
committer: duerst <duerst@localhost> 2002-08-31 04:04:02 +0000
commit: b2144d99fdc00965cc60a07a000c736a286dc659 (patch)
tree: 65c7e0b29a5d7550b1c4b2fdd82206b73f491695
parent: dedf10d1fa32ae4227f2943d5185e6385a26ff83 (diff)
download: markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.zip
markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.tar.gz
markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.tar.bz2
1 files changed, 25 insertions, 39 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index 11df333..9818f9e 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -9,7 +9,7 @@
 # This source code is available under the license at:
 #     http://www.w3.org/Consortium/Legal/copyright-software
 #
-# $Id: check,v 1.225 2002-08-31 03:38:21 duerst Exp $
+# $Id: check,v 1.226 2002-08-31 04:04:02 duerst Exp $
 
 #
 # Disable buffering on STDOUT!
@@ -95,7 +95,7 @@ BEGIN {
 
   #
   # Strings
-  $VERSION    =  q$Revision: 1.225 $;
+  $VERSION    =  q$Revision: 1.226 $;
   $VERSION    =~ s/Revision: ([\d\.]+) /$1/;
 
 
@@ -151,7 +151,7 @@ $File->{'Header'} = &prepSSI({
 			     });
 $File->{'Footer'} = &prepSSI({
 			      File => $CFG->{'Footer'},
-			      Date => q$Date: 2002-08-31 03:38:21 $,
+			      Date => q$Date: 2002-08-31 04:04:02 $,
 			     });
 
 #
@@ -2417,36 +2417,36 @@ X-W3C-Validator-Errors: $errs
 # Autodetection as in Appendix F of the XML 1.0 Recommendation.
 # <URL:http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing>
 #
-# return values are: (base_encoding, BOMSize, preSize, postSize)
+# return values are: (base_encoding, BOMSize, Size, Pattern)
 sub find_base_encoding {
   local $_ = shift;
 
   # With a Byte Order Mark:
-  return ('ucs-4be',  4, 3, 0)
+  return ('ucs-4be',  4, 4, '\x00\x00\x00(.)')
     if /^\x00\x00\xFE\xFF/; # UCS-4, big-endian machine (1234)
-  return ('ucs-4le',  4, 0, 3)
+  return ('ucs-4le',  4, 4, '(.)\x00\x00\x00')
     if /^\xFF\xFE\x00\x00/; # UCS-4, little-endian machine (4321)
-  return ('utf-16be', 2, 1, 0)
+  return ('utf-16be', 2, 2, '\x00(.)')
     if /^\xFE\xFF/;         # UTF-16, big-endian.
-  return ('utf-16le', 2, 0, 1)
+  return ('utf-16le', 2, 2, '(.)\x00')
     if /^\xFF\xFE/;         # UTF-16, little-endian.
-  return ('utf-8',    3, 0, 0)
+  return ('utf-8',    3, 1, '')
     if /^\xEF\xBB\xBF/; # UTF-8.
 
   # Without a Byte Order Mark:
-  return ('ucs-4be',  0, 3, 0)
+  return ('ucs-4be',  0, 4, '\x00\x00\x00(.)')
     if /^\x00\x00\x00\x3C/; # UCS-4 or 32bit; big-endian machine (1234 order).
-  return ('ucs-4le',  0, 0, 3)
+  return ('ucs-4le',  0, 4, '(.)\x00\x00\x00')
     if /^\x3C\x00\x00\x00/; # UCS-4 or 32bit; little-endian machine (4321 order).
-  return ('utf-16be', 0, 1, 0)
+  return ('utf-16be', 0, 2, '\x00(.)')
     if /^\x00\x3C\x00\x3F/; # UCS-2, UTF-16, or 16bit; big-endian.
-  return ('utf-16le', 0, 0, 1)
+  return ('utf-16le', 0, 2, '(.)\x00')
     if /^\x3C\x00\x3F\x00/; # UCS-2, UTF-16, or 16bit; little-endian.
-  return ('utf-8',    0, 0, 0)
+  return ('utf-8',    0, 1, '')
     if /^\x3C\x3F\x78\x6D/; # UTF-8, ISO-646, ASCII, ISO-8859-*, Shift-JIS, EUC, etc.
-  return ('ebcdic',   0, 0, 0)
+  return ('ebcdic',   0, 1, '')
     if /^\x4C\x6F\xA7\x94/; # EBCDIC
-  return ('',         0, 0, 0);
+  return ('',         0, 1, '');
                             # nothing in particular
 }
 
@@ -2456,41 +2456,27 @@ sub find_base_encoding {
 # Only meaningful if file contains a BOM, or for well-formed XML!
 sub find_xml_encoding {
   my $File = shift;
-  my ($preSize, $postSize);
+  my ($CodeUnitSize, $Pattern);
 
-  ($File->{Charset}->{Auto}, $File->{BOM}, $preSize, $postSize)
+  ($File->{Charset}->{Auto}, $File->{BOM}, $CodeUnitSize, $Pattern)
     = &find_base_encoding($File->{Bytes});
-  my $charSize = $preSize + $postSize + 1;
-  my $initSize = $charSize * 100;  # 100 arbitrary, but enough in any case
-  my $someBytes = substr $File->{Bytes}, $File->{BOM}, $initSize;
-  my $someText = '';
+  my $someBytes = substr $File->{Bytes}, $File->{BOM}, ($CodeUnitSize * 100);
+  my $someText = '';                  # 100 arbitrary, but enough in any case
 
   # translate from guessed encoding to ascii-compatible
   if ($File->{Charset}->{Auto} eq 'ebcdic') {
     # special treatment for EBCDIC, maybe use tr///
     # work on this later
   }
-  elsif ($charSize == 1) {
+  elsif (!$Pattern) {
     $someText = $someBytes; # efficiency shortcut
   }
   else { # generic code for UTF-16/UCS-4
-    my ($i, $j);
-   LABEL:
-    for ($i=0; $i<=$initSize-$charSize; ) {
-      for ($j=0; $j<$preSize; $i++, $j++) {
-        if ((substr $someBytes, $i, 1) != '\x00') {
-          last LABEL;
-        }
-      }
-      $someText .= substr $someBytes, $i++, 1;
-      for ($j=0; $j<$postSize; $i++, $j++) {
-        if ((substr $someBytes, $i, 1) != '\x00') {
-          chop $someText; # remove last character
-          last LABEL;
-        }
-      }
-    }
+    $someBytes =~ /^(($Pattern)*)/;
+    $someText = $1;       # get initial piece without chars >255
+    $someText = s/$Pattern/$1/g;    # select the relevant bytes
   }
+
   # try to find encoding pseudo-attribute
   $someText =~ m("^<\?xml[ \t\n\r]+version[ \t\n\r]?=[ \t\n\r]?([\'\"])[-._:a-zA-Z0-9]+\1[
 \t\n\r]+encoding[ \t\n\r]?=[ \t\n\r]?([\'\"])([A-Za-z][-._A-Za-z0-9]*)\2);
author	duerst <duerst@localhost>	2002-08-31 04:04:02 +0000
committer	duerst <duerst@localhost>	2002-08-31 04:04:02 +0000
commit	b2144d99fdc00965cc60a07a000c736a286dc659 (patch)
tree	65c7e0b29a5d7550b1c4b2fdd82206b73f491695
parent	dedf10d1fa32ae4227f2943d5185e6385a26ff83 (diff)
download	markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.zip markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.tar.gz markup-validator-b2144d99fdc00965cc60a07a000c736a286dc659.tar.bz2