diff options
-rwxr-xr-x | httpd/cgi-bin/check | 183 |
1 files changed, 88 insertions, 95 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index 575f1da..22fb600 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -9,7 +9,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.222 2002-08-30 07:39:39 duerst Exp $ +# $Id: check,v 1.223 2002-08-30 08:17:45 duerst Exp $ # # Disable buffering on STDOUT! @@ -95,7 +95,7 @@ BEGIN { # # Strings - $VERSION = q$Revision: 1.222 $; + $VERSION = q$Revision: 1.223 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; @@ -151,7 +151,7 @@ $File->{'Header'} = &prepSSI({ }); $File->{'Footer'} = &prepSSI({ File => $CFG->{'Footer'}, - Date => q$Date: 2002-08-30 07:39:39 $, + Date => q$Date: 2002-08-30 08:17:45 $, }); # @@ -187,6 +187,10 @@ $File->{Charset}->{Override}=''# From override. # Array (ref) used to store character offsets for the XML report. $File->{Offsets}->[0] = [0, 0]; # The first item isn't used... +# +# List to hold line numbers for encoding errors +@{$File->{Lines}} = (); + ######################################### # Populate $File->{Opt} -- CGI Options. # @@ -325,7 +329,13 @@ unless ($File->{Charset}->{Use}) { # # Check the detected Encoding and transcode. -$File = &transcode_and_check($File); +unless (&conflict($File->{Charset}->{Use}, 'utf-8')) { + $File = &transcode($File); + &abort_if_error_flagged($File, 0); +} + +$File = &check_utf8($File); +$File = &byte_error($File); # # Abort if an error was flagged during transcoding @@ -2067,81 +2077,91 @@ sub charset_conflicts { # -# Check Encoding and Transcode. -sub transcode_and_check { +# Transcode to UTF-8 +sub transcode { my $File = shift; - my @lines; - - unless ($File->{Charset}->{Use} eq 'utf-8') { - my ($command, $result_charset) = split " ", $CFG->{Charsets}->{$File->{Charset}->{Use}}, 2; + my ($command, $result_charset) = split " ", $CFG->{Charsets}->{$File->{Charset}->{Use}}, 2; - if ($result_charset eq 'utf-16' && $File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { - $result_charset = $File->{Charset}->{Auto}; # for per-line conversion, need to be exact - } - if ($command eq 'I') { - # test if given charset is available - eval {my $c = Text::Iconv->new($result_charset, 'utf-8')}; - $command = '' if $@; - } elsif ($command eq 'X') { - $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset"; - } + if ($result_charset eq 'utf-16' && $File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) { + $result_charset = $File->{Charset}->{Auto}; # for per-line conversion, need to be exact + } + if ($command eq 'I') { + # test if given charset is available + eval {my $c = Text::Iconv->new($result_charset, 'utf-8')}; + $command = '' if $@; + } elsif ($command eq 'X') { + $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset"; + } - if ($command ne 'I') { - $File->{'Error Flagged'} = TRUE; - $File->{'Error Message'} = <<" .EOF."; - <p>Sorry! - A fatal error occurred when attempting to transcode the character encoding - of the document. Either we do not support this character encoding yet, or you - have specified a non-existent character encoding (often a misspelling). - </p> - <p>The detected character encoding was "$File->{Charset}->{Use}".</p> - <p>The error was "$@".</p> - <p> - If you believe the character encoding to be valid you can submit a request for - that character encoding (see the <a href="feedback.html">feedback page</a> - for details) and we will look into supporting it in the future. - </p> - .EOF. - return $File; - } + if ($command ne 'I') { + $File->{'Error Flagged'} = TRUE; + $File->{'Error Message'} = <<" .EOF."; + <p>Sorry! + A fatal error occurred when attempting to transcode the character encoding + of the document. Either we do not support this character encoding yet, or you + have specified a non-existent character encoding (often a misspelling). + </p> + <p>The detected character encoding was "$File->{Charset}->{Use}".</p> + <p>The error was "$@".</p> + <p> + If you believe the character encoding to be valid you can submit a request for + that character encoding (see the <a href="feedback.html">feedback page</a> + for details) and we will look into supporting it in the future. + </p> + .EOF. + return $File; + } - my $c = Text::Iconv->new($result_charset, 'utf-8'); - my $line = 0; - for (@{$File->{Content}}) { - my $in = $_; - $line++; - $_ = $c->convert($_); # $_ is local!! - if ($in ne "" and $_ eq "") { - push @lines, $line; - $_ = "#### encoding problem on this line, not shown ####"; - } + my $c = Text::Iconv->new($result_charset, 'utf-8'); + my $line = 0; + for (@{$File->{Content}}) { + my $in = $_; + $line++; + $_ = $c->convert($_); # $_ is local!! + if ($in ne "" and $_ eq "") { + push @{$File->{Lines}}, $line; + $_ = "#### encoding problem on this line, not shown ####"; } } + return $File; +} - # check correctness of UTF-8 both for UTF-8 input and for conversion results - if ($File->{Charset}->{Use}) { #### this check seems to be unnecessary - for (my $i = 0; $i < $#{$File->{Content}}; $i++) { - # substitution needed for very long lines (>32K), to avoid backtrack - # stack overflow. Handily, this also happens to count characters. - local $_ = $File->{Content}->[$i]; - my $count = - s/ [\x00-\x7F] # ASCII - | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences - | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs - | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences - | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates - | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3 - | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15 - | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 - //xg; - push @lines, ($i+1) if length; - $count += 0; # Force numeric. - $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count]; - #### replace invalid line with some dummy text +# +# Check correctness of UTF-8 both for UTF-8 input and for conversion results +sub check_utf8 { + my $File = shift; + + for (my $i = 0; $i < $#{$File->{Content}}; $i++) { + # substitution needed for very long lines (>32K), to avoid backtrack + # stack overflow. Handily, this also happens to count characters. + local $_ = $File->{Content}->[$i]; + my $count = + s/ [\x00-\x7F] # ASCII + | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences + | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences + | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + //xg; + if (length) { + push @{$File->{Lines}}, ($i+1); + $File->{Content}->[$i] = "#### encoding problem on this line, not shown ####"; + $count = 50; # length of above text } + $count += 0; # Force numeric. + $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count]; } + return $File; +} +# +# byte error analysis +sub byte_error { + my $File = shift; + my @lines = @{$File->{Lines}}; if (scalar @lines) { $File->{'Error Flagged'} = TRUE; my $s = $#lines ? 's' : ''; @@ -2157,37 +2177,10 @@ sub transcode_and_check { encoding indication. </p> .EOF. - return $File; } return $File; } -#### not used -# -# Transcode into UTF-8. -#sub transcode { -# my $File = shift; -# my $from = shift; -# -# my @Result = (); -# my @lines = (); -# -# eval {my $c = Text::Iconv->new($from, 'utf-8')}; -# if ($@) { -# return FALSE; -# } else { -# my $c = Text::Iconv->new($from, 'utf-8'); -# my $line = 0; -# foreach my $in (@{$File->{Content}}) { -# $line++; -# my $out = $c->convert($in); -# push @lines, $line if ($in and not $out); -# push @Result, $out; -# } -# } -# return {Data => \@Result, Lines => \@lines}; -#} - # # Return an XML report for the page. |