summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorot <ot@localhost>2007-07-19 08:18:31 +0000
committerot <ot@localhost>2007-07-19 08:18:31 +0000
commitf11f7e9050d68a18a70b2035f56c82a9baf720fd (patch)
tree1e65468018e693e9c02614ffbcadae02733f2c09
parentdf7f4b4de4c1770008fd15cef497d7642877d43f (diff)
downloadmarkup-validator-f11f7e9050d68a18a70b2035f56c82a9baf720fd.zip
markup-validator-f11f7e9050d68a18a70b2035f56c82a9baf720fd.tar.gz
markup-validator-f11f7e9050d68a18a70b2035f56c82a9baf720fd.tar.bz2
re-populating the list of charsets (from older revisions in both branch 0.7 and 0.8)
to be used thus: * no fatal error if the charset is supported by encode * a warning with the suggestion for a better alias if we know one * a warning that the encoding may be "odd" if not in the list but encode says it's OK
-rw-r--r--htdocs/config/charset.cfg89
-rwxr-xr-xhttpd/cgi-bin/check20
-rw-r--r--share/templates/en_US/soap_warnings.tmpl3
-rw-r--r--share/templates/en_US/ucn_warnings.tmpl3
-rw-r--r--share/templates/en_US/warnings.tmpl17
5 files changed, 110 insertions, 22 deletions
diff --git a/htdocs/config/charset.cfg b/htdocs/config/charset.cfg
index aed81a6..f007cc4 100644
--- a/htdocs/config/charset.cfg
+++ b/htdocs/config/charset.cfg
@@ -1,23 +1,78 @@
#
-# List of encodings aliases and forbidden encodings
+# list of accepted/preferred character encodings
#
-# $Id: charset.cfg,v 1.14 2007-07-19 03:59:23 ot Exp $
-
-# This list indicates character encoding aliases that are
-# not recommended, along with a recommended equivalent, e.g:
-# encoding-obscure = encoding-well-known
-
-# It also lists encoding names that the validator will refuse to treat:
-# bogus_encoding = Encoding Forbidden (Reason why)
+# $Id: charset.cfg,v 1.15 2007-07-19 08:18:30 ot Exp $
+#
+# Syntax:
+#
+# charset/encoding = ? result
+#
+# Note: charsets and results are lowercase, actions are uppercase
+#
+# ? indicates the action to take:
+# 1: OK, character encoding supported
+# X: frequent error, e.g. starting with x-; ask user to replace with result
+# ERR: a charset we refuse, per some policy. Reason stated after ERR
-# The list is independent of what
-# is supported on a specific system but subject to the Validator
-# policy for acceptable encodings.
+# e.g.:
+# utf-8 = 1
+# odd-alias = X good-alias
+# bad_charset = ERR explain reason
+utf-8 = 1
+utf-16 = 1
+utf-16be = 1
+utf-16le = 1
+iso-8859-1 = 1
+iso-8859-2 = 1
+iso-8859-3 = 1
+iso-8859-4 = 1
+iso-8859-5 = 1
+iso-8859-6 = 1
+# implicit bidi, but character encoding is the same
+iso-8859-6-i = 1
+iso-8859-7 = 1
+iso-8859-8 = 1
+# implicit bidi, but character encoding is the same
+iso-8859-8-i = 1
+iso-8859-9 = 1
+iso-8859-10 = 1
+iso-8859-11 = 1
+# iso-8859-12 doesn't exist (yet?)
+iso-8859-13 = 1
+iso-8859-14 = 1
+iso-8859-15 = 1
+iso-8859-16 = 1
+us-ascii = 1
+iso-2022-jp = 1
+shift_jis = 1
+euc-jp = 1
+gb2312 = 1
+big5 = 1
+iso-2022-kr = 1
+euc-kr = 1
+gb18030 = 1
+tis-620 = 1
+koi8-r = 1
+koi8-u = 1
+iso-ir-111 = 1
+windows-1250 = 1
+windows-1251 = 1
+windows-1252 = 1
+windows-1253 = 1
+windows-1254 = 1
+windows-1255 = 1
+windows-1256 = 1
+windows-1257 = 1
+# windows-1258 = 1
+macintosh = 1
+ks_c_5601-1987 = 1
+ksc_5601 = 1
-x-mac-roman = macintosh
-x-sjis = shift_jis
-iso8859-1 = iso-8859-1
-ascii = us-ascii
+x-mac-roman = X macintosh
+x-sjis = X shift_jis
+iso8859-1 = X iso-8859-1
+ascii = X us-ascii
+8859_1 = X iso-8859-1
# this one is in IANA, but better use only windows-1252
-iso-8859-1-Windows-3.1-Latin-1 = windows-1252
+iso-8859-1-Windows-3.1-Latin-1 = X windows-1252
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index 5a6387c..ad5f1e2 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -14,7 +14,7 @@
# This source code is available under the license at:
# http://www.w3.org/Consortium/Legal/copyright-software
#
-# $Id: check,v 1.542 2007-07-19 06:21:03 ot Exp $
+# $Id: check,v 1.543 2007-07-19 08:18:30 ot Exp $
#
# Disable buffering on STDOUT!
@@ -183,7 +183,7 @@ Directory not readable (permission denied): @_r
#
# Strings
- $VERSION = q$Revision: 1.542 $;
+ $VERSION = q$Revision: 1.543 $;
$VERSION =~ s/Revision: ([\d\.]+) /$1/;
#
@@ -2090,7 +2090,7 @@ sub transcode {
my $cs = $exact_charset;
if ($CFG->{Charsets}->{$cs}) {
- if ($CFG->{Charsets}->{$cs} =~ /Encoding Forbidden/) {
+ if ($CFG->{Charsets}->{$cs} =~ /ERR /) {
# The encoding is not supported due to policy
$File->{'Error Flagged'} = TRUE;
@@ -2102,10 +2102,13 @@ sub transcode {
"This encoding is not supported by the validator.");
return $File;
}
- else {
+ elsif ($CFG->{Charsets}->{$cs} =~ /X /) {
+ # possibly problematic, we recommend another alias
+ my $recommended_charset = $CFG->{Charsets}->{$cs};
+ $recommended_charset =~ s/X //;
&add_warning('W22', {
W22_declared => $cs,
- W22_suggested => $CFG->{Charsets}->{$cs},
+ W22_suggested => $recommended_charset,
});
}
}
@@ -2127,6 +2130,13 @@ sub transcode {
return $File;
}
+ elsif (!$CFG->{Charsets}->{$cs}) {
+ # not in the list, but technically OK -> we warn
+ &add_warning('W23', {
+ W23_declared => $cs,
+ });
+
+ }
my $output;
my $input = $File->{Bytes};
diff --git a/share/templates/en_US/soap_warnings.tmpl b/share/templates/en_US/soap_warnings.tmpl
index 463480d..e8040e0 100644
--- a/share/templates/en_US/soap_warnings.tmpl
+++ b/share/templates/en_US/soap_warnings.tmpl
@@ -77,6 +77,9 @@
<m:warning><m:message>Character Encoding suggestion: use
<TMPL_VAR NAME="W22_suggested" ESCAPE="HTML"> instead of <TMPL_VAR NAME="W22_declared" ESCAPE="HTML"></m:message></m:warning>
</TMPL_IF>
+<TMPL_IF NAME="W23">
+ <m:warning><m:message>Rare or unregistered Character Encoding detected</m:message></m:warning>
+</TMPL_IF>
<TMPL_IF NAME="W@@">
<m:warning><m:message></m:message></m:warning>
diff --git a/share/templates/en_US/ucn_warnings.tmpl b/share/templates/en_US/ucn_warnings.tmpl
index 5222cba..6a31f1b 100644
--- a/share/templates/en_US/ucn_warnings.tmpl
+++ b/share/templates/en_US/ucn_warnings.tmpl
@@ -77,6 +77,9 @@
<warning><message>Character Encoding suggestion: use
<TMPL_VAR NAME="W22_suggested" ESCAPE="HTML"> instead of <TMPL_VAR NAME="W22_declared" ESCAPE="HTML"></message></warning>
</TMPL_IF>
+<TMPL_IF NAME="W23">
+ <warning><message>Rare or unregistered Character Encoding detected</message></warning>
+</TMPL_IF>
<TMPL_IF NAME="W@@">
<warning><message></message></warning>
diff --git a/share/templates/en_US/warnings.tmpl b/share/templates/en_US/warnings.tmpl
index d99dcec..fe64041 100644
--- a/share/templates/en_US/warnings.tmpl
+++ b/share/templates/en_US/warnings.tmpl
@@ -443,6 +443,23 @@
</p>
</li>
</TMPL_IF>
+<TMPL_IF NAME="W23">
+ <li class="msg_warn" id="W23"><span class="err_type"><img src="images/info_icons/warning.png" alt="Warning" title="Warning" /></span> <span class="msg">Rare or unregistered character encoding detected</span>
+
+ <p>
+ The character encoding declared for this document
+ (<code><TMPL_VAR NAME="W23_declared" ESCAPE="HTML"></code>)
+ is supported by the validator, but may not be widely supported across platforms.
+ For the sake of interoperability, it is best to use a Unicode character encoding
+ such as <code>UTF-8</code>, or one of the
+ <a href="http://www.iana.org/assignments/character-sets">registered character
+ encodings</a>.
+ </p>
+ <p>
+ More information on <a href="http://www.w3.org/International/O-charset.html">declaring a character encoding on your Web server or in your document</a> can be found on the W3C Internationalization site.
+ </p>
+ </li>
+</TMPL_IF>