diff options
author | duerst <duerst@localhost> | 2004-09-22 00:40:40 +0000 |
---|---|---|
committer | duerst <duerst@localhost> | 2004-09-22 00:40:40 +0000 |
commit | 68517fe73488b1ac61847cd1994f4f21ae46c6d1 (patch) | |
tree | 21e1caee80b46d732b6ce7f0e2e54db94167addd | |
parent | f6ccd5538f6d23a7c0c66c1063bcd94bd37a451b (diff) | |
download | markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.zip markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.gz markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.bz2 |
splitting up &preparse into two subs
-rwxr-xr-x | httpd/cgi-bin/check | 66 |
1 files changed, 60 insertions, 6 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check index 23eb5ce..04e6fc6 100755 --- a/httpd/cgi-bin/check +++ b/httpd/cgi-bin/check @@ -9,7 +9,7 @@ # This source code is available under the license at: # http://www.w3.org/Consortium/Legal/copyright-software # -# $Id: check,v 1.342 2004-09-17 14:36:23 bjoern Exp $ +# $Id: check,v 1.343 2004-09-22 00:40:40 duerst Exp $ # # Disable buffering on STDOUT! @@ -225,7 +225,7 @@ Directory not readable (permission denied): @_r # # Strings - $VERSION = q$Revision: 1.342 $; + $VERSION = q$Revision: 1.343 $; $VERSION =~ s/Revision: ([\d\.]+) /$1/; # @@ -440,7 +440,7 @@ $File->{Content} = &normalize_newlines($File->{Bytes}, # # Try to extract META charset # (works only if ascii-based and reasonably clean before <meta>) -$File = &preparse($File); +$File = &preparse_meta($File); unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } @@ -607,7 +607,7 @@ if ($File->{Opt}->{DOCTYPE} # # Try to extract a DOCTYPE or xmlns. -$File = &preparse($File); +$File = &preparse_doctype($File); # @@ -1886,8 +1886,62 @@ sub parsetree { # -# Do an initial parse of the Document Entity to extract charset and FPI. -sub preparse { +# Do an initial parse of the Document Entity to extract FPI. +# (still also extracts charset) +sub preparse_doctype { + my $File = shift; + + # + # Reset DOCTYPE, Root, and Charset (for second invocation). 
+ $File->{Charset}->{META} = ''; + $File->{DOCTYPE} = ''; + $File->{Root} = ''; + + my $dtd = sub { + return if $File->{Root}; + ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si; + }; + + my $start = sub { + my $tag = shift; + my $attr = shift; + my %attr = map {lc($_) => $attr->{$_}} keys %{$attr}; + + if ($File->{Root}) { + if (lc $tag eq 'meta') { + if (lc $attr{'http-equiv'} eq 'content-type') { + if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) { + $File->{Charset}->{META} = lc $1; + } + } + } + return unless $tag eq $File->{Root}; + } else { + $File->{Root} = $tag; + } + if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; + }; + + my $p = HTML::Parser->new(api_version => 3); + $p->xml_mode(TRUE); + $p->ignore_elements('BODY'); + $p->ignore_elements('body'); + $p->handler(declaration => $dtd, 'text'); + $p->handler(start => $start, 'tag,attr'); + $p->parse(join "\n", @{$File->{Content}}); + + $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; + $File->{DOCTYPE} =~ s(^\s+){ }g; + $File->{DOCTYPE} =~ s(\s+$){ }g; + $File->{DOCTYPE} =~ s(\s+) { }g; + + return $File; +} + +# +# Do an initial parse of the Document Entity to extract charset from HTML <meta>. +# (still also extracts FPI) +sub preparse_meta { my $File = shift; # |