diff options
author | ot <ot@localhost> | 2008-04-28 03:42:44 +0000 |
---|---|---|
committer | ot <ot@localhost> | 2008-04-28 03:42:44 +0000 |
commit | 7a12d0ee363673b0a9f0ad8d5e199f496800fedb (patch) | |
tree | e42984b3992564110b2223bbe34c664b94c57af2 | |
parent | a9efd99de95d26cdd1c229bf58e4e2cecd985f92 (diff) | |
download | markup-validator-7a12d0ee363673b0a9f0ad8d5e199f496800fedb.zip markup-validator-7a12d0ee363673b0a9f0ad8d5e199f496800fedb.tar.gz markup-validator-7a12d0ee363673b0a9f0ad8d5e199f496800fedb.tar.bz2 |
Trying to solve the headache of the default fallback character encoding:
different specs suggest different encodings, while we try to promote UTF-8
as the best practice. This patch works around the issue by trying, in sequence,
a fallback of UTF-8, then windows-1252, then iso-8859-1.
-rwxr-xr-x | httpd/cgi-bin/check | 38 | ||||
-rw-r--r-- | share/templates/en_US/warnings.tmpl | 5 |
2 files changed, 32 insertions, 11 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index ed3e8ad..e447932 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -14,7 +14,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.584 2008-04-23 04:23:31 ot Exp $ +# $Id: check,v 1.585 2008-04-28 03:42:43 ot Exp $ # # Disable buffering on STDOUT! @@ -186,7 +186,7 @@ Directory not readable (permission denied): @_r # # Strings - $VERSION = q$Revision: 1.584 $; + $VERSION = q$Revision: 1.585 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # @@ -519,8 +519,7 @@ if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative. $File->{Charset}->{Use} = "utf-8"; } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) { $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.) -} - +} unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } @@ -563,12 +562,6 @@ if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) { } } -unless ($File->{Charset}->{Use}) { # No charset given... - &add_warning('W04', {W04_charset => 'UTF-8'}); - $File->{Tentative} |= T_ERROR; # Can never be valid. - $File->{Charset}->{Use} = 'utf-8'; -} - # # Abort if an error was flagged while finding the encoding. &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE); @@ -598,9 +591,32 @@ Encode::Alias::define_alias('ksc_5601', 'KS_C_5601-1987'); # gb18030 requires Encode::HanExtra but no additional alias -# +$File->{Charset}->{Default} = FALSE; +unless ($File->{Charset}->{Use}) { # No charset given... + $File->{Charset}->{Use} = 'utf-8'; + $File->{Charset}->{Default} = TRUE; + $File->{Tentative} |= T_ERROR; # Can never be valid. 
+ &add_warning('W04', {W04_charset => "UTF-8"}); +} + + # Always transcode, even if the content claims to be UTF-8 $File = transcode($File); +if (($File->{ContentType} == "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) { + $File->{'Error Flagged'} = FALSE; # reset + # we try again, this time with win-1252 + $File->{Charset}->{Use} = 'windows-1252'; + &add_warning('W04', {W04_charset => "windows-1252", W04_also_tried=> "UTF-8"}); + $File = transcode($File); +} +if (($File->{ContentType} == "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) { + $File->{'Error Flagged'} = FALSE; # reset + # we try again, this time with latin1... + $File->{Charset}->{Use} = 'iso-8859-1'; + &add_warning('W04', {W04_charset => "iso-8859-1", W04_also_tried => "UTF-8, windows-1252"}); + $File = transcode($File); +} +# if it still does not work, we abandon hope here &abort_if_error_flagged($File, O_CHARSET); # diff --git a/share/templates/en_US/warnings.tmpl b/share/templates/en_US/warnings.tmpl index ecce33a..c799222 100644 --- a/share/templates/en_US/warnings.tmpl +++ b/share/templates/en_US/warnings.tmpl @@ -72,6 +72,11 @@ to check these potential issues, and, if necessary, fix them and re-validate the encoding was used to read the content and attempt to perform the validation, but this is likely to fail for all non-trivial documents. </p> + <TMPL_IF NAME="W04_also_tried"> + <p>Before defaulting to <code><TMPL_IF NAME="W04_charset"><TMPL_VAR NAME="W04_charset" ESCAPE="HTML"><TMPL_ELSE>UTF-8</TMPL_IF></code> + the validator also tried to read the content with the following encoding(s), without success: + <code><TMPL_VAR NAME="W04_also_tried" ESCAPE="HTML"></code>.</p> + </TMPL_IF> </TMPL_IF> <TMPL_IF NAME="opt_verbose"> <p>The sources used to find encoding information include:</p> |