diff options
author | ville <ville@localhost> | 2010-05-07 17:22:58 +0000 |
---|---|---|
committer | ville <ville@localhost> | 2010-05-07 17:22:58 +0000 |
commit | 4572b2b38e9d7c90791307333741cb55884fb95c (patch) | |
tree | fc30161abc6d75207362eea8541d4aaf3223e5ce | |
parent | 4713fca26317885ee6976d7361335e4f504bb974 (diff) | |
download | markup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.zip markup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.tar.gz markup-validator-4572b2b38e9d7c90791307333741cb55884fb95c.tar.bz2 |
When passing transcoded content to external validators (and the internal
XML one), fix up charset declarations in content to reflect transcoding.
http://www.w3.org/mid/Pine.LNX.4.64.1004291710580.13453%40zen.rrzn.uni-hannover.de
-rwxr-xr-x | htdocs/whatsnew.html | 20 | ||||
-rwxr-xr-x | httpd/cgi-bin/check | 74 |
2 files changed, 73 insertions, 21 deletions
diff --git a/htdocs/whatsnew.html b/htdocs/whatsnew.html index 6e661df..0b937ca 100755 --- a/htdocs/whatsnew.html +++ b/htdocs/whatsnew.html @@ -1,5 +1,5 @@ -<!--#set var="revision" value="\$Id: whatsnew.html,v 1.96 2010-03-02 16:44:19 ville Exp $" ---><!--#set var="date" value="\$Date: 2010-03-02 16:44:19 $" +<!--#set var="revision" value="\$Id: whatsnew.html,v 1.97 2010-05-07 17:22:58 ville Exp $" +--><!--#set var="date" value="\$Date: 2010-05-07 17:22:58 $" --><!--#set var="title" value="What's New at The W3C Markup Validation Service" --><!--#set var="relroot" value="./" --><!--#set var="feeds" value="1" @@ -28,6 +28,22 @@ </div> <dl id="news"> + <dt id="vxyz">201Y-MM-DD — X.Y.Z release:</dt> + <dd> + <p> + The X.Y.Z release of the markup validator is a TBD release. + Changes include: + </p> + <ul> + <li> + Enhancement: when passing its internal transcoded document to + external validators, validator now reflects the transcoding + by modifying the character information included in the passed + document. + </li> + </ul> + </dd> + <dt id="v086">2010-03-01 — 0.8.6 release:</dt> <dd> <p> diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index f3eeb7a..87920cd 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -14,7 +14,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.774 2010-05-07 09:18:53 ville Exp $ +# $Id: check,v 1.775 2010-05-07 17:22:58 ville Exp $ # # We need Perl 5.8.0+. @@ -191,7 +191,7 @@ EOF # # Strings - $VERSION = q$Revision: 1.774 $; + $VERSION = q$Revision: 1.775 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # Read friendly error message file @@ -613,21 +613,13 @@ if (&is_xml($File)) { # [NOT] loading the XML catalog for entities resolution as it seems to # cause a lot of unnecessary DTD/entities fetching #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc')); + + &override_charset($File, "UTF-8"); + my $xml_string = join "\n", @{$File->{Content}}; my $xmlws = qr/[\x20\x09\x0D\x0A]/o; - # the XML parser will check the value of encoding attribute in XML - # declaration so we have to amend it to reflect transcoding. - # see Bug 4867 - $xml_string =~ s/ - (^<\?xml\b[^>]*${xmlws}) - (encoding${xmlws}*=${xmlws}* - (?:(["'])[A-Za-z][a-zA-Z0-9_-]+\3) - ) - (.*?\?>) - /$1encoding="UTF-8"$4/sox; - # Is the document standalone? Need to check with a regex because # the parser may fail to return a document we could use for this. my $standalone = ( @@ -1073,11 +1065,9 @@ sub html5_validate (\$) if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) { # Doctype or charset overridden, need to use $File->{Content} in UTF-8 - # because $File->{Bytes} is not affected by the overrides. This will - # most likely be a source of errors about internal/actual charset - # differences as long as our transcoding process does not "fix" the - # charset info in XML declaration, meta http-equiv/charset and/or BOM - # (any others?). + # because $File->{Bytes} is not affected by the overrides. + + &override_charset($File, "UTF-8"); $ct = $File->{ContentType} unless $File->{'Direct Input'}; my @ct = ($ct => undef, charset => "UTF-8"); @@ -2030,7 +2020,7 @@ sub override_doctype $seen_root = TRUE; if ($seen_doctype) { - # doctype addition aldready done, we move on + # doctype addition already done, we move on $HTML .= $_[0]; } else { @@ -2082,6 +2072,52 @@ sub override_doctype } # +# Override inline charset declarations, for use e.g. when passing +# transcoded results to external parsers that use them. +sub override_charset ($$) +{ + my ($File, $charset) = @_; + + my $ws = qr/[\x20\x09\x0D\x0A]/o; + my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o; + + my $content = join("\n", @{$File->{Content}}); + + # <?xml encoding="charset"?> + $content =~ s/( + (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["'])) + (${cs}) + (\3.*?\?>) + )/lc($4) eq lc($charset) ? "$1" : "$2$charset$5<!-- $1 -->"/esx; + + # <meta charset="charset"> + $content =~ s/( + (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*) + (${cs}) + (.*?>) + )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix; + + # <meta http-equiv="content-type" content="some/type; charset=charset"> + $content =~ s/( + (<meta\b[^>]*${ws} + http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws} + content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) + (${cs}) + (.*?>) + )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix; + + # <meta content="some/type; charset=charset" http-equiv="content-type"> + $content =~ s/( + (<meta\b[^>]*${ws} + content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*) + (${cs}) + ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>) + )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix; + + $File->{Content} = [split /\n/, $content]; +} + +# # Generate a HTML report of detected errors. sub report_errors ($) { |