Splitting up &preparse into two subs

author: duerst <duerst@localhost> 2004-09-22 00:40:40 +0000
committer: duerst <duerst@localhost> 2004-09-22 00:40:40 +0000
commit: 68517fe73488b1ac61847cd1994f4f21ae46c6d1 (patch)
tree: 21e1caee80b46d732b6ce7f0e2e54db94167addd
parent: f6ccd5538f6d23a7c0c66c1063bcd94bd37a451b (diff)
download: markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.zip
markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.gz
markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.bz2
1 files changed, 60 insertions, 6 deletions
diff --git a/httpd/cgi-bin/check b/httpd/cgi-bin/check
index 23eb5ce..04e6fc6 100755
--- a/httpd/cgi-bin/check
+++ b/httpd/cgi-bin/check
@@ -9,7 +9,7 @@
 # This source code is available under the license at:
 #     http://www.w3.org/Consortium/Legal/copyright-software
 #
-# $Id: check,v 1.342 2004-09-17 14:36:23 bjoern Exp $
+# $Id: check,v 1.343 2004-09-22 00:40:40 duerst Exp $
 
 #
 # Disable buffering on STDOUT!
@@ -225,7 +225,7 @@ Directory not readable (permission denied): @_r
 
   #
   # Strings
-  $VERSION    =  q$Revision: 1.342 $;
+  $VERSION    =  q$Revision: 1.343 $;
   $VERSION    =~ s/Revision: ([\d\.]+) /$1/;
 
   #
@@ -440,7 +440,7 @@ $File->{Content} = &normalize_newlines($File->{Bytes},
 #
 # Try to extract META charset
 # (works only if ascii-based and reasonably clean before <meta>)
-$File = &preparse($File);
+$File = &preparse_meta($File);
 unless ($File->{Charset}->{Use}) {
   $File->{Charset}->{Use} = $File->{Charset}->{META};
 }
@@ -607,7 +607,7 @@ if ($File->{Opt}->{DOCTYPE}
 
 #
 # Try to extract a DOCTYPE or xmlns.
-$File = &preparse($File);
+$File = &preparse_doctype($File);
 
 
 #
@@ -1886,8 +1886,62 @@ sub parsetree {
 
 
 #
-# Do an initial parse of the Document Entity to extract charset and FPI.
-sub preparse {
+# Do an initial parse of the Document Entity to extract FPI.
+# (still also extracts charset)
+sub preparse_doctype {  # pre-scan $File->{Content}; fills Root, DOCTYPE, Charset->{META}, Namespace
+  my $File = shift;  # hash ref describing the document; modified in place and returned
+
+  # Reset DOCTYPE, Root, and the META charset so that a second invocation
+  # starts clean (Namespace is not reset here).
+  $File->{Charset}->{META} = '';
+  $File->{DOCTYPE}         = '';
+  $File->{Root}            = '';
+
+  my $dtd = sub {  # handler for declarations; receives the declaration text
+    return if $File->{Root};  # Root already known (earlier DOCTYPE or start tag): ignore
+    ($File->{Root}, $File->{DOCTYPE}) = shift =~  m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si;  # $1 = declared root name, $2 = quoted public identifier; both become undef when there is no match
+  };
+
+  my $start = sub {  # handler for start tags; receives tag name and attribute hash ref
+    my $tag  = shift;
+    my $attr = shift;
+    my %attr = map {lc($_) => $attr->{$_}} keys %{$attr};  # copy with lower-cased attribute names
+
+    if ($File->{Root}) {
+      if (lc $tag eq 'meta') {
+        if (lc $attr{'http-equiv'} eq 'content-type') {  # NOTE(review): 'http-equiv' may be absent (undef) here
+          if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) {  # value following "charset="
+            $File->{Charset}->{META} = lc $1;  # stored lower-cased
+          }
+        }
+      }
+      return unless $tag eq $File->{Root};  # case-sensitive: only the root element goes on to the xmlns check
+    } else {
+      $File->{Root} = $tag;  # no Root yet: the first start tag seen becomes the root
+    }
+    if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}};  # reads the original-case key, not %attr
+  };
+
+  my $p = HTML::Parser->new(api_version => 3);
+  $p->xml_mode(TRUE);  # TRUE is not defined in this block
+  $p->ignore_elements('BODY');  # both spellings are listed; presumably names are case-sensitive in xml_mode - TODO confirm
+  $p->ignore_elements('body');
+  $p->handler(declaration => $dtd, 'text');  # $dtd is called with the raw declaration text
+  $p->handler(start => $start, 'tag,attr');  # $start is called with (tag name, attribute hash ref)
+  $p->parse(join "\n", @{$File->{Content}});  # Content is an array ref; elements are joined with newlines
+
+  $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE};  # undef after a failed match in $dtd
+  $File->{DOCTYPE} =~ s(^\s+){ }g;  # NOTE(review): replaces leading whitespace with one space, does not remove it
+  $File->{DOCTYPE} =~ s(\s+$){ }g;  # NOTE(review): same for trailing whitespace
+  $File->{DOCTYPE} =~ s(\s+) { }g;  # collapse every whitespace run to a single space
+
+  return $File;  # the same reference that was passed in
+}
+
+#
+# Do an initial parse of the Document Entity to extract charset from HTML <meta>.
+# (still also extracts FPI)
+sub preparse_meta {
   my $File = shift;
 
   #
author	duerst <duerst@localhost>	2004-09-22 00:40:40 +0000
committer	duerst <duerst@localhost>	2004-09-22 00:40:40 +0000
commit	68517fe73488b1ac61847cd1994f4f21ae46c6d1 (patch)
tree	21e1caee80b46d732b6ce7f0e2e54db94167addd
parent	f6ccd5538f6d23a7c0c66c1063bcd94bd37a451b (diff)
download	markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.zip markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.gz markup-validator-68517fe73488b1ac61847cd1994f4f21ae46c6d1.tar.bz2