summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xhttpd/cgi-bin/check183
1 files changed, 88 insertions, 95 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index 575f1da..22fb600 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -9,7 +9,7 @@
# This source code is available under the license at:
# http://www.w3.org/Consortium/Legal/copyright-software
#
-# $Id: check,v 1.222 2002-08-30 07:39:39 duerst Exp $
+# $Id: check,v 1.223 2002-08-30 08:17:45 duerst Exp $
#
# Disable buffering on STDOUT!
@@ -95,7 +95,7 @@ BEGIN {
#
# Strings
- $VERSION = q$Revision: 1.222 $;
+ $VERSION = q$Revision: 1.223 $;
$VERSION =~ s/Revision: ([\d\.]+) /$1/;
@@ -151,7 +151,7 @@ $File->{'Header'} = &prepSSI({
});
$File->{'Footer'} = &prepSSI({
File => $CFG->{'Footer'},
- Date => q$Date: 2002-08-30 07:39:39 $,
+ Date => q$Date: 2002-08-30 08:17:45 $,
});
#
@@ -187,6 +187,10 @@ $File->{Charset}->{Override}=''# From override.
# Array (ref) used to store character offsets for the XML report.
$File->{Offsets}->[0] = [0, 0]; # The first item isn't used...
+#
+# List to hold line numbers for encoding errors
+@{$File->{Lines}} = ();
+
#########################################
# Populate $File->{Opt} -- CGI Options. #
@@ -325,7 +329,13 @@ unless ($File->{Charset}->{Use}) {
#
# Check the detected Encoding and transcode.
-$File = &transcode_and_check($File);
+unless (&conflict($File->{Charset}->{Use}, 'utf-8')) {
+ $File = &transcode($File);
+ &abort_if_error_flagged($File, 0);
+}
+
+$File = &check_utf8($File);
+$File = &byte_error($File);
#
# Abort if an error was flagged during transcoding
@@ -2067,81 +2077,91 @@ sub charset_conflicts {
#
-# Check Encoding and Transcode.
-sub transcode_and_check {
+# Transcode to UTF-8
+sub transcode {
my $File = shift;
- my @lines;
-
- unless ($File->{Charset}->{Use} eq 'utf-8') {
- my ($command, $result_charset) = split " ", $CFG->{Charsets}->{$File->{Charset}->{Use}}, 2;
+ my ($command, $result_charset) = split " ", $CFG->{Charsets}->{$File->{Charset}->{Use}}, 2;
- if ($result_charset eq 'utf-16' && $File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
- $result_charset = $File->{Charset}->{Auto}; # for per-line conversion, need to be exact
- }
- if ($command eq 'I') {
- # test if given charset is available
- eval {my $c = Text::Iconv->new($result_charset, 'utf-8')};
- $command = '' if $@;
- } elsif ($command eq 'X') {
- $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset";
- }
+ if ($result_charset eq 'utf-16' && $File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
+ $result_charset = $File->{Charset}->{Auto}; # for per-line conversion, need to be exact
+ }
+ if ($command eq 'I') {
+ # test if given charset is available
+ eval {my $c = Text::Iconv->new($result_charset, 'utf-8')};
+ $command = '' if $@;
+ } elsif ($command eq 'X') {
+ $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset";
+ }
- if ($command ne 'I') {
- $File->{'Error Flagged'} = TRUE;
- $File->{'Error Message'} = <<" .EOF.";
- <p>Sorry!
- A fatal error occurred when attempting to transcode the character encoding
- of the document. Either we do not support this character encoding yet, or you
- have specified a non-existent character encoding (often a misspelling).
- </p>
- <p>The detected character encoding was "$File->{Charset}->{Use}".</p>
- <p>The error was "$@".</p>
- <p>
- If you believe the character encoding to be valid you can submit a request for
- that character encoding (see the <a href="feedback.html">feedback page</a>
- for details) and we will look into supporting it in the future.
- </p>
- .EOF.
- return $File;
- }
+ if ($command ne 'I') {
+ $File->{'Error Flagged'} = TRUE;
+ $File->{'Error Message'} = <<" .EOF.";
+ <p>Sorry!
+ A fatal error occurred when attempting to transcode the character encoding
+ of the document. Either we do not support this character encoding yet, or you
+ have specified a non-existent character encoding (often a misspelling).
+ </p>
+ <p>The detected character encoding was "$File->{Charset}->{Use}".</p>
+ <p>The error was "$@".</p>
+ <p>
+ If you believe the character encoding to be valid you can submit a request for
+ that character encoding (see the <a href="feedback.html">feedback page</a>
+ for details) and we will look into supporting it in the future.
+ </p>
+ .EOF.
+ return $File;
+ }
- my $c = Text::Iconv->new($result_charset, 'utf-8');
- my $line = 0;
- for (@{$File->{Content}}) {
- my $in = $_;
- $line++;
- $_ = $c->convert($_); # $_ is local!!
- if ($in ne "" and $_ eq "") {
- push @lines, $line;
- $_ = "#### encoding problem on this line, not shown ####";
- }
+ my $c = Text::Iconv->new($result_charset, 'utf-8');
+ my $line = 0;
+ for (@{$File->{Content}}) {
+ my $in = $_;
+ $line++;
+ $_ = $c->convert($_); # $_ is local!!
+ if ($in ne "" and $_ eq "") {
+ push @{$File->{Lines}}, $line;
+ $_ = "#### encoding problem on this line, not shown ####";
}
}
+ return $File;
+}
- # check correctness of UTF-8 both for UTF-8 input and for conversion results
- if ($File->{Charset}->{Use}) { #### this check seems to be unnecessary
- for (my $i = 0; $i < $#{$File->{Content}}; $i++) {
- # substitution needed for very long lines (>32K), to avoid backtrack
- # stack overflow. Handily, this also happens to count characters.
- local $_ = $File->{Content}->[$i];
- my $count =
- s/ [\x00-\x7F] # ASCII
- | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences
- | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs
- | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences
- | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates
- | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3
- | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
- //xg;
- push @lines, ($i+1) if length;
- $count += 0; # Force numeric.
- $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
- #### replace invalid line with some dummy text
+#
+# Check correctness of UTF-8 both for UTF-8 input and for conversion results.
+#
+# Takes the $File hash reference and returns it. For every line in
+# $File->{Content}:
+#  - all well-formed UTF-8 sequences are stripped from a copy of the line;
+#    anything left over means the line contains invalid bytes,
+#  - the (1-based) numbers of invalid lines are appended to $File->{Lines}
+#    and the line itself is replaced by a placeholder text,
+#  - $File->{Offsets}->[line number] is set to
+#    [characters in this line, running total of characters so far].
+sub check_utf8 {
+  my $File = shift;
+
+  # $#{...} is the index of the LAST element, so the comparison has to be
+  # '<=' -- with '<' the final line of the document was never checked and
+  # never got an Offsets entry.
+  for (my $i = 0; $i <= $#{$File->{Content}}; $i++) {
+    # substitution needed for very long lines (>32K), to avoid backtrack
+    # stack overflow. Handily, this also happens to count characters.
+    local $_ = $File->{Content}->[$i];
+    # s///g returns the number of substitutions made, i.e. the number of
+    # well-formed sequences (characters) removed from the copy in $_.
+    my $count =
+      s/ [\x00-\x7F] # ASCII
+      | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences
+      | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs
+      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences
+      | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates
+      | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3
+      | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15
+      | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
+      //xg;
+    # Whatever is still in $_ did not match any well-formed sequence.
+    if (length) {
+      push @{$File->{Lines}}, ($i+1);
+      $File->{Content}->[$i] = "#### encoding problem on this line, not shown ####";
+      $count = 50; # length of above text
+    }
+    # s/// yields an empty string (not 0) when nothing matched.
+    $count += 0; # Force numeric.
+    $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
   }
+  return $File;
+}
+#
+# Flag an error if any line numbers were recorded in $File->{Lines}
+sub byte_error {
+ my $File = shift;
+ my @lines = @{$File->{Lines}};
if (scalar @lines) {
$File->{'Error Flagged'} = TRUE;
my $s = $#lines ? 's' : '';
@@ -2157,37 +2177,10 @@ sub transcode_and_check {
encoding indication.
</p>
.EOF.
- return $File;
}
return $File;
}
-#### not used
-#
-# Transcode into UTF-8.
-#sub transcode {
-# my $File = shift;
-# my $from = shift;
-#
-# my @Result = ();
-# my @lines = ();
-#
-# eval {my $c = Text::Iconv->new($from, 'utf-8')};
-# if ($@) {
-# return FALSE;
-# } else {
-# my $c = Text::Iconv->new($from, 'utf-8');
-# my $line = 0;
-# foreach my $in (@{$File->{Content}}) {
-# $line++;
-# my $out = $c->convert($in);
-# push @lines, $line if ($in and not $out);
-# push @Result, $out;
-# }
-# }
-# return {Data => \@Result, Lines => \@lines};
-#}
-
#
# Return an XML report for the page.