diff options
author | hugo <hugo@localhost> | 1999-11-24 22:04:35 +0000 |
---|---|---|
committer | hugo <hugo@localhost> | 1999-11-24 22:04:35 +0000 |
commit | aa1a1791a2fa6e7168a0ae62f6e6004ff267fa49 (patch) | |
tree | 60055fd8f962cecf4c081f026a5c12b639d9d13d | |
parent | 92780613afb083bcfe9e544f00ca419b3789e5bd (diff) | |
download | markup-validator-aa1a1791a2fa6e7168a0ae62f6e6004ff267fa49.zip markup-validator-aa1a1791a2fa6e7168a0ae62f6e6004ff267fa49.tar.gz markup-validator-aa1a1791a2fa6e7168a0ae62f6e6004ff267fa49.tar.bz2 |
First commit.
This is a beta version: line count isn't reliable, the CGI part is untested,
the output is not clear, etc.
-rwxr-xr-x | httpd/cgi-bin/LinkChecker.pl | 786 |
1 files changed, 786 insertions, 0 deletions
diff --git a/httpd/cgi-bin/LinkChecker.pl b/httpd/cgi-bin/LinkChecker.pl new file mode 100755 index 0000000..c46295d --- /dev/null +++ b/httpd/cgi-bin/LinkChecker.pl @@ -0,0 +1,786 @@ +#! /usr/bin/perl -w +# +# W3C Link Checker +# by Hugo Haas +# (c) 1999 World Wide Web Consortium +# based on Renaud Bruyeron's checklin.pl +# +# $Id: LinkChecker.pl,v 1.1 1999-11-24 22:04:35 hugo Exp $ +# +# This program is licensed under the W3C License. + +package W3C::LinkChecker; +require HTML::Parser; +@W3C::LinkChecker::ISA = qw(HTML::Parser); +package W3C::UserAgent; +require LWP::UserAgent; +@W3C::UserAgent::ISA = qw(LWP::UserAgent); + +# Autoflush +$| = 1; + +# Version info +my $PROGRAM = 'W3C LinkChecker'; +my $VERSION = '$Revision: 1.1 $ (c) 1999 W3C'; + +# State of the program +my $_cl; +my $_quiet = 0; +my $_summary = 0; +my $_verbose = 0; +my $_progress = 0; +my $_html = 0; +my $_timeout = 60; +my $_redirects = 1; +my $_user; +my $_password; +my $query; + +if ($#ARGV >= 0) { + $_cl = 1; +# Parse command line + my @uris = &parse_arguments(); + if ($_user && (! $_password)) { + &ask_password(); + } + foreach $uri (@uris) { + if (! $_summary) { + printf("%s %s\n", $PROGRAM ,$VERSION); + } else { + $_verbose = 0; + $_progress = 0; + } + $uri = urize($uri); + &check_uri($uri); + } +} else { + use CGI; + $query = new CGI; + $_cl = 0; + $_progress = 0; + my $uri; + if ($query->param('uri')) { + $uri = $query->param('uri'); + } else { + &print_form($query); + } + if ($uri =~ m/^file:/) { + &file_uri($uri); + } elsif (!($uri =~ m/:/)) { + $uri = 'http://'.$uri; + } + &check_uri($uri); +} + +############################################################################### + +################################ +# Command line and usage stuff # +################################ + +sub parse_arguments() { + my @uris; + my $uris = 0; + while (@ARGV) { + $_ = shift(@ARGV); + if ($uris) { + push(@uris, $_); + } elsif (m/^--$/) { + $uris = 1; + } elsif (m/^-[^-up]/) { + if (m/q/) { + $_quiet = 1; + } + if (m/s/) { + $_summary = 1; + } + if (m/b/) { + $_redirects = 0; + } + if (m/v/) { + $_verbose = 1; + } + if (m/i/) { + $_progress = 1; + } + if (m/h/) { + $_html = 1; + } + } elsif (m/^--help$/) { + &usage(); + } elsif (m/^--quiet$/) { + $_quiet = 1; + } elsif (m/^--summary$/) { + $_summary = 1; + } elsif (m/^--broken$/) { + $_redirects = 0; + } elsif (m/^--verbose$/) { + $_verbose = 1; + } elsif (m/^--indicator$/) { + $_progress = 1; + } elsif (m/^--html$/) { + $_html = 1; + } elsif (m/^-u|--user$/) { + $_user = shift(@ARGV); + } elsif (m/^-p|--password$/) { + $_password = shift(@ARGV); + } elsif (m/^-t|--timeout$/) { + $_timeout = shift(@ARGV); + } else { + push(@uris, $_); + } + } + return(@uris); +} + +sub usage() { + print STDERR "$PROGRAM $VERSION +Usage: LinkCheck.pl <options> <uris> +Options: + -s/--summary Result summary only. + -b/--broken Show only the broken links, not the redirects. + -q/--quiet No output if no errors are found. + -v/--verbose Verbose mode. + -i/--indicator Show progress while parsing. + -u/--user username Specify a username for authentication. + -p/--password password Specify a password. + -t/--timeout value Timeout for the HTTP requests. + -h/--html HTML output. + --help Show this message. +"; + exit(0); +} + +sub ask_password() { + print(STDERR 'Enter your password: '); + system('stty -echo'); + chomp($_password = <STDIN>); + system('stty echo'); + print(STDERR "ok.\n"); +} + +############################################################################### + +########################################### +# Transform foo into file://localhost/foo # +########################################### + +sub urize() { + use URI; + $_ = $_[0]; + my $base; + my $res = $_; + if (m/:/) { + $base = URI->new($_); + } elsif (m/^\//) { + $base = URI->new('file://localhost'); + } else { + my $pwd; + chop($pwd = `pwd`); + $base = URI->new('file://localhost'.$pwd.'/'); + } + $u = URI->new($res); + $result = $u->abs($base); + return($result->as_string()); +} + +####################### +# Do the job on a URI # +####################### + +sub check_uri() { + my $uri = $_[0]; + my $start; + if (! $_summary) { + $start = &get_timestamp(); + printf("\nProcessing\t%s\n", $uri); + } + # Get the document + my $response = &get_uri('GET', $uri); + if (! $response->is_success()) { + printf("Error: %d %s\n", $response->code(), $response->message()); + if ($response->code() == 401) { + &authentication($response); + } + exit(-1); + } + my %redirects; + &record_redirects(\%redirects, $response->{Redirects}); + # Parse the document + if (! ($response->header('Content-type') =~ m/text\/html/)) { + printf("Can't check link: Content-type is '%s'.\n", + $response->header('Content-type')); + return(-1); + } + my $base_uri = URI->new($response->base()); + my $uri_uri = URI->new($uri); + my $absolute_base = $base_uri->abs($uri_uri); + my $p = &parse_document($absolute_base->as_string(), $response->content(), 1); + my $base = URI->new($p->{base}); + # Check anchors + if (! $_summary) { + print("Checking anchors:\n"); + } + my %errors; + foreach $anchor (keys %{$p->{Anchors}}) { + my @lines = keys %{$p->{Anchors}{$anchor}}; + my $times = $#lines + 1; + if ($times > 1) { + $errors{$anchor} = 1; + } + } + if (! $_summary) { + print(" done.\n"); + } + # Check links + my %links; + foreach $link (keys %{$p->{Links}}) { + my $link_uri = URI->new($link); + my $abs_link_uri = URI->new_abs($link_uri, $base); + foreach $lines (keys %{$p->{Links}{$link}}) { + my $canonical = URI->new($abs_link_uri->canonical()); + my $url = $canonical->scheme().':'.$canonical->opaque(); + my $fragment = $canonical->fragment() ? $canonical->fragment() : $url; + $links{$url}{$fragment}{$lines} = 1; + } + } + for $url (keys %links) { + if (!defined($links{$url}{$url})) { + $links{$url}{$url}{-1} = 1; + } + } + my %results; + my %broken; + foreach $u (keys %links) { + # Don't check mailto: URI's + next if ($u =~ m/^mailto:/); + if (! $_summary) { + printf("Checking link %s\n", $u); + } + &check_validity($uri, $u, \%links, \%results, \%redirects, $p->{Anchors}); + if ($_verbose) { + printf("\tReturn code: %s\n", $results{$u}{$u}{code}); + } + if ($results{$u}{$u}{success}) { + foreach $fragment (keys %{$links{$u}}) { + next if ($fragment eq $u); + if ($_versbose) { + printf("\t\t%s %s - Lines: %s\n", + $fragment, + ($results{$u}{$fragment}?'OK':'Not found'), + join(',', keys %{$links{$u}{$fragment}})); + } + if ($results{$u}{$fragment} == 0) { + $broken{$u}{$fragment} = 1; + } + } + } else { + $broken{$u}{$u} = 1; + foreach $fragment (keys %{$links{$u}}) { + $broken{$u}{$fragment} = 1; + } + } + } + if (! $_summary) { + my $stop = &get_timestamp(); + printf("Processed in %ss.\n", &time_diff($start, $stop)); + } + # Display results + if ($_html) { + &html_header($uri); + } else { + print "\n"; + } + &anchors_summary($p->{Anchors}, \%errors); + &links_summary(\%links, \%results, \%broken, \%redirects); + if ($_html) { + &html_footer(); + } +} + +############################ +# Get the content of a URI # +############################ + +sub W3C::UserAgent::redirect_ok { + my ($self, $request) = @_; + + if (! $_summary) { + printf("\n%s %s ", $request->method(), $request->uri()); + } + + $self->{Redirects}{$self->{fetching}} = $request->uri(); + $self->{fetching} = $request->uri(); + + return 0 if $request->method() eq "POST"; + return 1; +} + +sub get_uri() { + my ($method, $uri) = @_; + my $start = &get_timestamp(); + my $ua = new W3C::UserAgent; + $ua->timeout($_timeout); + $ua->agent($PROGRAM.' '.$VERSION); + $ua->{uri} = $uri; + $ua->{fetching} = $uri; + my $count = 0; + my $response; + if (! $_summary) { + printf("%s %s ", $method, $uri); + } + my $request = new HTTP::Request($method, $uri); + $response = $ua->request($request); + $response->{Redirects} = $ua->{Redirects}; + my $stop = &get_timestamp(); + if (! $_summary) { + printf(" fetched in %ss\n", &time_diff($start,$stop)); + } + return $response; +} + +#################### +# Parse a document # +#################### + +sub parse_document() { + my ($uri, $document, $links) = @_; + + my $start; + my $p = W3C::LinkChecker->new(); + if (! $_summary) { + $start = &get_timestamp(); + print("Parsing...\n"); + } + $p->{base} = $uri; + $p->{Line} = 1; + if (!$_summary || $_progress) { + $p->{Total} = ($document =~ tr/\n//); + } + $p->{extract_links} = $links; + $p->parse($document); + if (! $_summary) { + my $stop = &get_timestamp(); + if ($_progress) { + print "\r"; + } + printf(" done (%d lines in %ss).\n", + $p->{Total}, + &time_diff($start, $stop)); + } + return $p; +} + +####################################### +# Count the number of lines in a file # +####################################### + +sub W3C::LinkChecker::new_line() { + my ($self, $string) = @_; + my $count = ($string =~ tr/\n//); + $self->{Line} = $self->{Line} + $count; + if ($_progress) { + printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)); + } +} + +####################################### +# start function used by HTML::Parser # +####################################### + +sub W3C::LinkChecker::start() { + my ($self, $tag, $attr, $text) = @_; + my $anchor; + # Links + if ($self->{extract_links}) { + my $link; + if ($tag eq 'a') { + $link = $attr->{href}; + $anchor = $attr->{name}; + } elsif ($tag eq 'img') { + $link = $attr->{src}; + } elsif (($tag eq 'frame') || ($tag eq 'link')) { + $link = $attr->{href}; + } + if (defined($link)) { + $self->{Links}{$link}{$self->{Line}}++; + } + # Just anchors + } else { + if ($tag eq 'a') { + $anchor = $attr->{name}; + } + } + if (defined($anchor)) { + $self->{Anchors}{$anchor}{$self->{Line}}++; + } + # Line counting + if ($text =~ m/\n/) { + $self->new_line($text); + } +} + +##################################################### +# text and end functions for line counting purposes # +##################################################### + +sub W3C::LinkChecker::text() { + my ($self, $text) = @_; + if (!$_progress) { + return unless $self->{extract_links}; + } + if ($text =~ /\n/) { + $self->new_line($text); + } +} + +sub W3C::LinkChecker::end() { + my $self = shift; + return unless $self->{extract_links}; + shift; + my $text = shift; + $self->text($self, $text); +} + +################################ +# Check the validity of a link # +################################ + +sub check_validity($, $, \%, \%, \%, \%) { + use HTTP::Status; + my ($testing, $uri, $links, $results, $redirects, $anchors) = @_; + # Checking file: URI's is not allowed with a CGI + if ($testing ne $uri) { + if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) { + $results->{$uri}{$uri}{code} = $RC_BAD_REQUEST; + $results->{$uri}{$uri}{success} = 0; + $results->{$uri}{$uri}{message} = 'Error: \'file:\' URI not allowed'; + if ($_verbose) { + printf("Error: %d %s\n", + $results->{$uri}{$uri}{code}, + $results->{$uri}{$uri}{message}); + } + return; + } + } + # Get the document with the appropriate method + my $method; + my @fragments = keys %{$links->{$uri}}; + if ($testing eq $uri) { + $method = 'Already have'; + } elsif ($#fragments == 0) { + $method = 'HEAD'; + } else { + $method = 'GET'; + } + my $response; + if ($testing eq $uri) { + # Mimic an HTTP::Response object + $results->{$uri}{$uri}{code} = 600; + $results->{$uri}{$uri}{success} = 1; + } else { + $response = &get_uri($method, $uri); + # Record the redirects + &record_redirects($redirects, $response->{Redirects}); + # Parse the document if necessary and possible + $results->{$uri}{$uri}{code} = $response->code(); + $results->{$uri}{$uri}{success} = $response->is_success(); + if (! $results->{$uri}{$uri}{success}) { + $results->{$uri}{$uri}{message} = $response->message(); + if ($_verbose) { + printf("Error: %d %s\n", + $results->{$uri}{$uri}{code}, + $results->{$uri}{$uri}{message}); + } + return; + } + } + if ($#fragments == 0) { + return; + } + my $p; + if ($testing ne $uri) { + if (!(($results->{$uri}{$uri}{type} = $response->header('Content-type')) =~ m/text\/html/i)) { + if ($_verbose) { + printf("Can't check content: Content-type is '%s'.\n", + $response->header('Content-type')); + } + return; + } + $p = &parse_document($response->base(), $response->as_string(), 0); + } else { + $p->{Anchors} = $anchors; + } + # Check that the fragments exist + foreach $fragment (keys %{$links->{$uri}}) { + next if ($fragment eq $uri); + if (defined($p->{Anchors}{$fragment}) + || &escape_match($fragment, $p->{Anchors})) { + $results->{$uri}{$fragment} = 1; + } else { + $results->{$uri}{$fragment} = 0; + } + } +} + +sub escape_match($, \%) { + use URI::Escape; + my ($a, $hash) = (uri_unescape($_[0]), $_[1]); + foreach $b (keys %$hash) { + if ($a eq uri_unescape($b)) { + return(1); + } + } + return(0); +} + +########################## +# Ask for authentication # +########################## + +sub authentication() { + my $r = $_[0]; + $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; + my $realm = $1; + if ($_cl) { + printf(STDERR "Authentication is required for %s.\n", $r->request->url); + printf(STDERR "The realm is %s.\n", $realm); + print(STDERR "Use the -u and -p options to specify a username and password.\n"); + } else { + } +} + +################## +# Get statistics # +################## + +sub get_timestamp() { + require 'sys/syscall.ph'; + my $timestamp = pack('LL', ()); + syscall(&SYS_gettimeofday, $timestamp, 0) != -1 or $timestamp = 0; + return($timestamp); +} + +sub time_diff() { + my @start = unpack('LL', $_[0]); + my @stop = unpack('LL', $_[1]); + for ($start[1], $stop[1]) { + $_ /= 1_000_000; + } + return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1]))); +} + +######################## +# Handle the redirects # +######################## + +sub record_redirects(\%, \%) { + my ($redirects, $sub) = @_; + foreach $r (keys %$sub) { + $redirects->{$r} = $sub->{$r}; + } +} + +sub is_redirected($, %) { + my ($uri, %redirects) = @_; + return(defined($redirects{$uri})); +} + +sub get_redirects($, %) { + my ($uri, %redirects) = @_; + my @history = ($uri); + my $origin = $uri; + my %seen; + while ($redirects{$uri}) { + $uri = $redirects{$uri}; + push(@history, $uri); + last if ($uri eq $origin); + } + return(@history); +} + +##################### +# Print the results # +##################### + +sub anchors_summary(\%, \%) { + my ($anchors, $errors) = @_; + # Number of anchors found. + if (! $_quiet) { + if ($_html) { + print('<p>'); + } + my @anchors = keys %{$anchors}; + printf("Found %d anchors.", $#anchors+1); + if ($_html) { + print('</p>'); + } + print("\n"); + } + # List of the duplicates, if any. + @errors = keys %{$errors}; + return if ($#errors < 0); + if ($_html) { + print('<p>'); + } + print('List of duplicate anchors:'); + if ($_html) { + print("</p>\n<table border=\"1\">\n<tr><td><b>Anchors</b></td><td><b>Lines</b></td></tr>"); + } + print("\n"); + foreach $anchor (@errors) { + my $format; + if ($_html) { + $format = "<tr><td>%s</td><td>%s</td></tr>\n"; + } else { + $format = "\t%s\tLines: %s\n"; + } + printf($format, $anchor, join(', ', keys %{$anchors->{$anchor}})); + } + if ($_html) { + print("</table>\n"); + } +} + +sub links_summary(\%,\%,\%) { + my ($links, $results, $broken, $redirects) = @_; + if (! $_quiet) { + if ($_html) { + print("\n<hr>\n\n<p>"); + } + my @links = keys %$links; + my $n_fragments = 0; + my $n_total = 0; + foreach $u (@links) { + my @fragments = keys %{$links->{$u}}; + $n_fragments += $#fragments + 1; + foreach $f (@fragments) { + my @lines = keys %{$links->{$u}{$f}}; + foreach $l (@lines) { + $n_total += $links->{$u}{$f}{$l}; + } + } + } + printf("Found %d locations for %d URI's (%d total).", + $#links+1, + $n_fragments, + $n_total); + if ($_html) { + print('</p>'); + } + print("\n"); + } + # List of the broken links + @urls = keys %{$broken}; + return if ($#urls < 0); + if ($_html) { + print('<p>'); + } + printf('List of broken %slinks:', $_redirects ? 'and redirected ' : ''); + if ($_html) { + print("</p>\n<table border=\"1\">\n<tr><td><b>Location</b></td><td><b>Code</b></td><td><b>Fragment</b></td><td><b>Lines</b></td></tr>"); + } + print("\n"); + if ($_redirects) { + for $l (keys %$redirects) { + next unless (defined($results->{$l}) + && !defined($broken->{$l})); + push(@urls, $l); + } + } + foreach $u (@urls) { + my @fragments = keys %{$broken->{$u}}; + my $n_fragments = $#fragments+1; + my $redirected = &is_redirected($u, %$redirects); + my $lines_list = defined($links->{$u}{$u}{-1}) ? + '' : join(', ', keys %{$links->{$u}{$u}}); + if ($_html) { + printf("<tr><th rowspan=\"%d\">%s</th><th rowspan=\"%d\">%d</th><td>%s</td><td>%s</td></tr>\n", + $n_fragments, + $redirected ? join('<br>-> ', + &get_redirects($u, %$redirects)) : $u, + $n_fragments, + $results->{$u}{$u}{code}, + '', + $lines_list); + } else { + printf("%s\tLines: %s\tCode: %d\n", + $redirected ? join("\n-> ", + &get_redirects($u, %$redirects)) : $u, + $lines_list, + $results->{$u}{$u}{code}); + } + foreach $f (@fragments) { + next if ($f eq $u); + my $format; + if ($_html) { + $format = "<tr><td>%s</td><td>%s</td></tr>\n"; + } else { + $format = "\t%s\tLines: %s\n"; + } + printf($format, + $f, + join(', ', keys %{$links->{$u}{$f}})); + } + } + if ($_html) { + print("</table>\n"); + } +} + +############################################################################### + +################## +# HTML interface # +################## + +sub html_header() { + my $uri = HTML::Entities::encode($_[0]); + print "Content-type: text/html + +<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"> +<html> +<head> +<title>W3C Link Ckecker: $uri</title> +<style type=\"text/css\"> +</style> +</head> +<body> +<h1>W3C Link Checker: $uri</h1> +\n"; +} + +sub html_footer() { + print " +<hr> +<address> +$PROGRAM $VERSION<br> +Report bugs to <a href=\"hugo\@w3.org\">Hugo Haas</a> +</address> +</body> +</html> +"; +} + +sub file_uri() { + my $uri = $_[0]; + &html_header($uri); + print "<h2>Forbidden</h2> +<p>You cannot check such a URI (<code>$uri</code>).</p> +"; + &html_footer(); + exit; +} + +sub print_form() { + my ($q) = @_; + &html_header($VERSION); + print "<form action=\"".$q->self_url()."\" method=\"get\"> +<p>Enter the URI that you want to check:</p> +<input type=\"text\" name=\"uri\"> +<input type=\"submit\" name=\"submit\" value=\"Check\"> +</form> +"; + &html_footer(); + exit; +} |