diff options
author | ot <ot@localhost> | 2007-03-19 00:58:44 +0000 |
---|---|---|
committer | ot <ot@localhost> | 2007-03-19 00:58:44 +0000 |
commit | 4e69ab374f947c0844b554fe50dd3c969395d0b9 (patch) | |
tree | 1ea3853d88ac7b6f419a76f8b7e37c3e87749712 | |
parent | c17fbd795d3e8c6cf573a93c911d95e3c8ceed4d (diff) | |
download | markup-validator-4e69ab374f947c0844b554fe50dd3c969395d0b9.zip markup-validator-4e69ab374f947c0844b554fe50dd3c969395d0b9.tar.gz markup-validator-4e69ab374f947c0844b554fe50dd3c969395d0b9.tar.bz2 |
Adding XML well-formedness parsing for XML docs
with ideas from Jacques Distler http://golem.ph.utexas.edu/~distler/blog/archives/001054.html
We could be using only the XML parser and avoid parsing twice, but
at the moment I am tempted to keep using opensp, for as long
as we have the best library of error messages and explanations for it.
Code is not really pretty, and should be amended at some point to
use a real SAX ErrorHandler. That said, it very much tolls the bell for the
"Validator XML support has some limitations." message.
-rwxr-xr-x | httpd/cgi-bin/check | 85 |
1 files changed, 78 insertions, 7 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index b4cdc97..b8a0668 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -14,7 +14,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.483 2007-03-16 12:42:36 ot Exp $ +# $Id: check,v 1.484 2007-03-19 00:58:44 ot Exp $ # # Disable buffering on STDOUT! @@ -180,7 +180,7 @@ Directory not readable (permission denied): @_r # # Strings - $VERSION = q$Revision: 1.483 $; + $VERSION = q$Revision: 1.484 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # @@ -604,6 +604,79 @@ set_parse_mode($File, $CFG) if $File->{DOCTYPE}; # Sanity check Charset information and add any warnings necessary. $File = &charset_conflicts($File); + + +# before we start the parsing, clean slate +$File->{'Is Valid'} = TRUE; +$File->{Errors} = []; + +# preparse with XML parser if necessary +# we should really be using a SAX ErrorHandler, but I can't find +# a way to make it work with XML::LibXML::SAX::Parser... 
** FIXME ** +# ditto, we should try using W3C::Validator::ErrorHandler, +# but it's badly linked to opensp at the moment +if (&is_xml($File)) { + + use XML::LibXML; + my $xmlparser = XML::LibXML->new(); + $xmlparser->line_numbers(1); + eval { + $xmlparser->parse_string(join"\n",@{$File->{Content}}); + }; + my $xml_parse_errors_line = undef; + my @xmlwf_error_list; + if ($@) { + + my $xmlwf_errors = $@; + my $xmlwf_error_line = undef; + my $xmlwf_error_col = undef; + my $xmlwf_error_msg = undef; + my $num_xmlwf_error = 0; + my $last_err_msg = undef; + my $err; + foreach my $msg_line (split "\n", $xmlwf_errors){ + $msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g; + $msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{}; + if ($msg_line =~ /(:\d+:)(.*)/ ){ + $xmlwf_error_line = $1; + $xmlwf_error_msg = $2; + $xmlwf_error_line =~ s/:(\d+):/$1/; + $xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /; + } + if ($msg_line =~ /(.+)\^/){ + $xmlwf_error_col = length($1); + } + + if ((defined $xmlwf_error_line) and (defined $xmlwf_error_col) and (defined $xmlwf_error_msg)){ + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $xmlwf_error_line; + $err->{char} = $xmlwf_error_col; + $err->{num} = 0; + $err->{type} = "E"; + $err->{msg} = $xmlwf_error_msg; + + # ... + $last_err_msg = $err; + push (@xmlwf_error_list, $err); + $err = undef; + $xmlwf_error_line = undef; + $xmlwf_error_col = undef; + $xmlwf_error_msg = undef; + $num_xmlwf_error++; + + } + } + foreach my $errmsg (@xmlwf_error_list){ + $File->{'Is Valid'} = FALSE; + push @{$File->{Errors}}, $errmsg; + } + + } +} + + + + # # Abandon all hope ye who enter here... $File = &parse($File); @@ -631,6 +704,7 @@ sub parse (\$) { # FIXME when fixed s:p:o gets released } + # # Parser configuration $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); @@ -680,11 +754,12 @@ sub parse (\$) { # # Set Version to be the FPI initially. 
$File->{Version} = $File->{DOCTYPE}; - return $File; } + + # # Force "XML" if type is an XML type and an FPI was not found. # Otherwise set the type to be the FPI. @@ -2052,10 +2127,6 @@ sub W3C::Validator::ErrorHandler::new my $self = { _file => $File, _parser => $parser }; - # ... - $File->{'Is Valid'} = TRUE; - $File->{Errors} = []; - bless $self, $class; } |