summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorville <ville@localhost>2010-05-07 17:22:58 +0000
committerville <ville@localhost>2010-05-07 17:22:58 +0000
commit4572b2b38e9d7c90791307333741cb55884fb95c (patch)
treefc30161abc6d75207362eea8541d4aaf3223e5ce
parent4713fca26317885ee6976d7361335e4f504bb974 (diff)
downloadmarkup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.zip
markup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.tar.gz
markup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.tar.bz2
When passing transcoded content to external validators (and the internal
XML one), fix up charset declarations in content to reflect transcoding. http://www.w3.org/mid/Pine.LNX.4.64.1004291710580.13453%40zen.rrzn.uni-hannover.de
-rwxr-xr-xhtdocs/whatsnew.html20
-rwxr-xr-xhttpd/cgi-bin/check74
2 files changed, 73 insertions, 21 deletions
diff --git a/htdocs/whatsnew.html b/htdocs/whatsnew.html
index 6e661df..0b937ca 100755
--- a/htdocs/whatsnew.html
+++ b/htdocs/whatsnew.html
@@ -1,5 +1,5 @@
-<!--#set var="revision" value="\$Id: whatsnew.html,v 1.96 2010-03-02 16:44:19 ville Exp $"
---><!--#set var="date" value="\$Date: 2010-03-02 16:44:19 $"
+<!--#set var="revision" value="\$Id: whatsnew.html,v 1.97 2010-05-07 17:22:58 ville Exp $"
+--><!--#set var="date" value="\$Date: 2010-05-07 17:22:58 $"
--><!--#set var="title" value="What's New at The W3C Markup Validation Service"
--><!--#set var="relroot" value="./"
--><!--#set var="feeds" value="1"
@@ -28,6 +28,22 @@
</div>
<dl id="news">
+ <dt id="vxyz">201Y-MM-DD &mdash; X.Y.Z release:</dt>
+ <dd>
+ <p>
+ The X.Y.Z release of the markup validator is a TBD release.
+ Changes include:
+ </p>
+ <ul>
+ <li>
+ Enhancement: when passing its internally transcoded document to
+ external validators, the validator now reflects the transcoding
+ by updating the character encoding declarations included in the
+ passed document.
+ </li>
+ </ul>
+ </dd>
+
<dt id="v086">2010-03-01 &mdash; 0.8.6 release:</dt>
<dd>
<p>
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index f3eeb7a..87920cd 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -14,7 +14,7 @@
# This source code is available under the license at:
# http://www.w3.org/Consortium/Legal/copyright-software
#
-# $Id: check,v 1.774 2010-05-07 09:18:53 ville Exp $
+# $Id: check,v 1.775 2010-05-07 17:22:58 ville Exp $
#
# We need Perl 5.8.0+.
@@ -191,7 +191,7 @@ EOF
#
# Strings
- $VERSION = q$Revision: 1.774 $;
+ $VERSION = q$Revision: 1.775 $;
$VERSION =~ s/Revision: ([\d\.]+) /$1/;
# Read friendly error message file
@@ -613,21 +613,13 @@ if (&is_xml($File)) {
# [NOT] loading the XML catalog for entities resolution as it seems to
# cause a lot of unnecessary DTD/entities fetching
#$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'));
+
+ &override_charset($File, "UTF-8");
+
my $xml_string = join "\n", @{$File->{Content}};
my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
- # the XML parser will check the value of encoding attribute in XML
- # declaration so we have to amend it to reflect transcoding.
- # see Bug 4867
- $xml_string =~ s/
- (^<\?xml\b[^>]*${xmlws})
- (encoding${xmlws}*=${xmlws}*
- (?:(["'])[A-Za-z][a-zA-Z0-9_-]+\3)
- )
- (.*?\?>)
- /$1encoding="UTF-8"$4/sox;
-
# Is the document standalone? Need to check with a regex because
# the parser may fail to return a document we could use for this.
my $standalone = (
@@ -1073,11 +1065,9 @@ sub html5_validate (\$)
if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
# Doctype or charset overridden, need to use $File->{Content} in UTF-8
- # because $File->{Bytes} is not affected by the overrides. This will
- # most likely be a source of errors about internal/actual charset
- # differences as long as our transcoding process does not "fix" the
- # charset info in XML declaration, meta http-equiv/charset and/or BOM
- # (any others?).
+ # because $File->{Bytes} is not affected by the overrides.
+
+ &override_charset($File, "UTF-8");
$ct = $File->{ContentType} unless $File->{'Direct Input'};
my @ct = ($ct => undef, charset => "UTF-8");
@@ -2030,7 +2020,7 @@ sub override_doctype
$seen_root = TRUE;
if ($seen_doctype) {
- # doctype addition aldready done, we move on
+ # doctype addition already done, we move on
$HTML .= $_[0];
}
else {
@@ -2082,6 +2072,52 @@ sub override_doctype
}
#
+# Override inline charset declarations, for use e.g. when passing
+# transcoded results to external parsers that use them.
+#
+# Arguments:
+#   $File    - hash reference; $File->{Content} must be an array reference
+#              holding the document, one line per element (no newlines).
+#   $charset - name of the charset the declarations should announce.
+#
+# The first XML declaration encoding, <meta charset> and
+# <meta http-equiv="content-type"> declaration found is rewritten to
+# $charset, and the original declaration is kept right after it inside
+# a comment.  Declarations that already name $charset (compared case
+# insensitively) are left untouched.  $File->{Content} is replaced with
+# the resulting lines; the number of lines is preserved.
+sub override_charset ($$)
+{
+    my ($File, $charset) = @_;
+
+    my $ws = qr/[\x20\x09\x0D\x0A]/;
+    my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/;
+
+    my $content = join("\n", @{$File->{Content}});
+
+    # Build the replacement for one matched declaration.  Arguments are
+    # the whole match, the text before the charset name, the declared
+    # charset name and the text after it.
+    my $rewrite = sub {
+        my ($whole, $before, $declared, $after) = @_;
+        return $whole if lc($declared) eq lc($charset);
+
+        # "--" is not allowed inside a comment, so break up any hyphen
+        # runs in the preserved copy of the original declaration.
+        (my $kept = $whole) =~ s/-(?=-)/- /g;
+        return "$before$charset$after<!-- $kept -->";
+    };
+
+    # <?xml encoding="charset"?>
+    $content =~ s/(
+        (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
+        (${cs})
+        (\3.*?\?>)
+    )/$rewrite->($1, $2, $4, $5)/esx;
+
+    # <meta charset="charset">
+    $content =~ s/(
+        (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
+        (${cs})
+        (.*?>)
+    )/$rewrite->($1, $2, $3, $4)/esix;
+
+    # <meta http-equiv="content-type" content="some/type; charset=charset">
+    $content =~ s/(
+        (<meta\b[^>]*${ws}
+            http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
+            content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+        (${cs})
+        (.*?>)
+    )/$rewrite->($1, $2, $3, $4)/esix;
+
+    # <meta content="some/type; charset=charset" http-equiv="content-type">
+    $content =~ s/(
+        (<meta\b[^>]*${ws}
+            content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+        (${cs})
+        ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
+    )/$rewrite->($1, $2, $3, $4)/esix;
+
+    # A negative LIMIT keeps trailing empty fields, so trailing blank
+    # lines of the document survive the join/split round trip.
+    $File->{Content} = [split /\n/, $content, -1];
+}
+
+#
# Generate a HTML report of detected errors.
sub report_errors ($)
{