summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xhttpd/cgi-bin/checklink.pl233
1 files changed, 168 insertions, 65 deletions
diff --git a/httpd/cgi-bin/checklink.pl b/httpd/cgi-bin/checklink.pl
index 80ab14e..72bf2e0 100755
--- a/httpd/cgi-bin/checklink.pl
+++ b/httpd/cgi-bin/checklink.pl
@@ -1,11 +1,11 @@
#! /usr/bin/perl -w
#
# W3C Link Checker
-# by Hugo Haas
-# (c) 1999-2000 World Wide Web Consortium
+# by Hugo Haas <hugo@w3.org>
+# (c) 1999-2001 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
-# $Id: checklink.pl,v 2.67 2000-09-25 17:38:35 hugo Exp $
+# $Id: checklink.pl,v 2.68 2001-01-18 21:58:04 hugo Exp $
#
# This program is licensed under the W3C(r) License:
# http://www.w3.org/Consortium/Legal/copyright-software
@@ -34,7 +34,7 @@ $| = 1;
# Version info
my $PROGRAM = 'W3C checklink';
-my $VERSION = q$Revision: 2.67 $ . '(c) 1999-2000 W3C';
+my $VERSION = q$Revision: 2.68 $ . '(c) 1999-2001 W3C';
my $REVISION; ($REVISION = $VERSION) =~ s/Revision: (\d+\.\d+) .*/$1/;
# Different options specified by the user
@@ -56,6 +56,9 @@ my $_accept_language = 1;
my $_languages = '*';
my $_base_location = '.';
my $_contact_address = 'hugo@w3.org';
+my $_masquerade = 0;
+my $_local_dir = my $_remote_masqueraded_uri = '';
+my $_hide_same_realm = 0;
# Restrictions for the online version
my $_sleep_time = 3;
@@ -90,6 +93,7 @@ if ($#ARGV >= 0) {
$_verbose = 0;
$_progress = 0;
}
+ # Transform the parameter into a URI
$uri = urize($uri);
&check_uri($uri);
}
@@ -100,6 +104,7 @@ if ($#ARGV >= 0) {
use CGI;
use CGI::Carp qw(fatalsToBrowser);
$query = new CGI;
+ # Set a few parameters in CGI mode
$_cl = 0;
$_verbose = 0;
$_progress = 0;
@@ -130,11 +135,17 @@ if ($#ARGV >= 0) {
}
$uri =~ s/^\s+//g;
if ($uri =~ m/^file:/) {
+ # Only the http scheme is allowed
&file_uri($uri);
} elsif (!($uri =~ m/:/)) {
- $uri = 'http://'.$uri;
+ if ($uri =~ m|^//|) {
+ $uri = 'http:'.$uri;
+ } else {
+ $uri = 'http://'.$uri;
+ }
}
&check_uri($uri, 1);
+ &html_footer();
}
###############################################################################
@@ -215,6 +226,12 @@ sub parse_arguments() {
$_trusted = shift(@ARGV);
} elsif (m/^-y|--proxy$/) {
$_http_proxy = shift(@ARGV);
+ } elsif (m/^--masquerade$/) {
+ $_masquerade = 1;
+ $_local_dir = shift(@ARGV);
+ $_remote_masqueraded_uri = shift(@ARGV);
+ } elsif (m/^--hide-same-realm$/) {
+ $_hide_same_realm = 1;
} else {
push(@uris, $_);
}
@@ -244,10 +261,15 @@ Options:
-i/--indicator Show progress while parsing.
-u/--user username Specify a username for authentication.
-p/--password password Specify a password.
+ --hide-same-real Hide 401's that are in the same realm as the
+ document checked.
-t/--timeout value Timeout for the HTTP requests.
-d/--domain domain Regular expression describing the domain to
which the authetication information will be
sent (default: '$_trusted').
+ --masquerade local remote Masquerade local dir as a remote URI (e.g.
+ /home/hugo/MathML2/ is in fact
+ http://www.w3.org/TR/MathML2/).
-y/--proxy proxy Specify an HTTP proxy server.
-h/--html HTML output.
--help Show this message.
@@ -256,7 +278,7 @@ Options:
}
sub ask_password() {
- print(STDERR 'Enter your password: ');
+ print(STDERR 'Enter the password for user '.$_user.': ');
# Will only work on Unix...
system('stty -echo');
chomp($_password = <STDIN>);
@@ -277,7 +299,7 @@ sub urize() {
my $res = $_;
if (m/:/) {
$base = URI->new($_);
- } elsif (m/^\//) {
+ } elsif (m|^/|) {
$base = URI->new('file://localhost');
} else {
my $pwd;
@@ -289,21 +311,24 @@ sub urize() {
return($result->as_string());
}
-#######################
-# Do the job on a URI #
-#######################
+########################################
+# Check for broken links in a resource #
+########################################
sub check_uri() {
my ($uri, $html_header) = @_;
+ # If $html_header equals 1, we need to generate a HTML header (first
+ # instance called in HTML mode).
my $start;
if (! $_quiet) {
$start = &get_timestamp();
}
- # Get the document
+ # Get and parse the document
my $response = &get_document('GET', $uri, $doc_count, \%redirects);
+ # Can we check the resource? If not, we exit here...
if (defined($response->{Stop})) {
return(-1);
}
@@ -334,12 +359,15 @@ sub check_uri() {
}
}
+ # Record that we have processed this resource
$processed{$absolute_uri} = 1;
+ # Parse the document
my $p = &parse_document($uri, $absolute_uri,
$response->content(), 1);
my $base = URI->new($p->{base});
# Check anchors
+ ###############
if (! $_summary) {
print("Checking anchors:\n");
@@ -352,9 +380,11 @@ sub check_uri() {
foreach $l (keys %{$p->{Anchors}{$anchor}}) {
$times += $p->{Anchors}{$anchor}{$l};
}
+ # They should appear only once
if ($times > 1) {
$errors{$anchor} = 1;
}
+ # Empty IDREF's are not allowed
if ($anchor eq '') {
$errors{$anchor} = 1;
}
@@ -364,21 +394,35 @@ sub check_uri() {
}
# Check links
+ #############
my %links;
- # Record all the links
+ # Record all the links found
my $link;
foreach $link (keys %{$p->{Links}}) {
my $link_uri = URI->new($link);
my $abs_link_uri = URI->new_abs($link_uri, $base);
+ if ($_masquerade) {
+ if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) {
+ printf("processing %s in base $_local_dir %s\n",
+ $abs_link_uri);
+ my $nlink;
+ $nlink = $abs_link_uri;
+ $nlink =~
+ s|^$_remote_masqueraded_uri|file://localhost$_local_dir|;
+ $abs_link_uri = URI->new($nlink);
+ };
+ }
my $lines;
foreach $lines (keys %{$p->{Links}{$link}}) {
my $canonical = URI->new($abs_link_uri->canonical());
my $url = $canonical->scheme().':'.$canonical->opaque();
my $fragment = $canonical->fragment();
if (! $fragment) {
+ # Document without fragment
$links{$url}{location}{$lines} = 1;
} else {
+ # Resource with a fragment
$links{$url}{fragments}{$fragment}{$lines} = 1;
}
}
@@ -393,15 +437,19 @@ sub check_uri() {
if (! $_summary) {
&hprintf("Checking link %s\n", $u);
}
+ # Check that a link is valid
&check_validity($uri, $u, \%links, \%redirects);
if ($_verbose) {
&hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
}
if ($results{$u}{location}{success}) {
my $fragment;
+ # Even though it was not broken, we might want to display it
+ # on the results page (e.g. because it required authentication)
if ($results{$u}{location}{display} >= 400) {
$broken{$u}{location} = 1;
}
+ # List the broken fragments
foreach $fragment (keys %{$links{$u}{fragments}}) {
if ($_verbose) {
&hprintf("\t\t%s %s - Lines: %s\n",
@@ -420,6 +468,7 @@ sub check_uri() {
} else {
# Couldn't find the document
$broken{$u}{location} = 1;
+ # All the fragments associated are hence broken
my $fragment;
foreach $fragment (keys %{$links{$u}{fragments}}) {
$broken{$u}{fragments}{$fragment}++;
@@ -450,14 +499,17 @@ sub check_uri() {
# Get the name of the original directory
# e.g. http://www.w3.org/TR/html4/Overview.html
# should return http://www.w3.org/TR/html4/
- $results{$uri}{parsing}{base} =~ m/^(.*\/)[^\/]*/;
+ $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
$_base_location = $1;
}
foreach $u (keys %links) {
next if (! (# Check if it's in our scope for recursion
- ($u =~ m/^$_base_location/) &&
+ ($u =~ m|^$_base_location|) &&
# and the link is not broken
- $results{$u}{location}{success})
+ $results{$u}{location}{success} &&
+ # And it is a text/html resource
+ ($results{$u}{location}{type} =~ m|text/html|)
+ )
);
# Check if we have already processed the URI
next if (&already_processed($u) != 0);
@@ -490,18 +542,31 @@ sub check_uri() {
}
}
-#############################
-# Get a document to process #
-#############################
+#######################################
+# Get and parse a resource to process #
+#######################################
sub get_document() {
my ($method, $uri, $in_recursion, $redirects) = @_;
+ # $method contains the HTTP method the use (GET or HEAD)
+ # $uri contains the identifier of the resource
+ # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least
+ # the second resource checked)
+ # $redirects is a pointer to the hash containing the map of the redirects
- # Get the document
- my $response = &get_uri($method, $uri);
- &record_results($uri, $method, $response);
+ # Get the resource
+ my $response;
+ if (defined($results{$uri}{response})
+ && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+ $response = $results{$uri}{response};
+ } else {
+ $response = &get_uri($method, $uri);
+ &record_results($uri, $method, $response);
+ &record_redirects($redirects, $response->{Redirects});
+ }
if (! $response->is_success()) {
if (! $in_recursion) {
+ # Is it too late to request authentication?
if ($response->code() == 401) {
&authentication($response);
} else {
@@ -510,25 +575,20 @@ sub get_document() {
}
&hprintf("\nError: %d %s\n",
$response->code(), $response->message());
- if ($_html) {
- &html_footer();
- }
}
}
$response->{Stop} = 1;
return($response);
}
- &record_redirects($redirects, $response->{Redirects});
-
- # What URI are we processing by the way?
+ # What is the URI of the resource that we are processing by the way?
my $base_uri = URI->new($response->base());
my $request_uri = URI->new($response->request->url);
$response->{absolute_uri} = $base_uri->abs($request_uri);
- # Parse the document
+ # Can we parse the document?
my $failed_reason;
- if (! ($response->header('Content-Type') =~ m/text\/html/)) {
+ if (! ($response->header('Content-Type') =~ m|text/html|)) {
$failed_reason = "Content-Type is '".
$response->header('Content-Type')."'";
} elsif ($response->header('Content-Encoding')
@@ -537,6 +597,7 @@ sub get_document() {
$response->header('Content-encoding')."'";
}
if ($failed_reason) {
+ # No, there is a problem...
if (! $in_recursion) {
if ($_html) {
&html_header($uri);
@@ -557,15 +618,18 @@ sub get_document() {
sub already_processed($) {
my ($uri, %redirects) = @_;
- # Don't be verbose for that part
+ # Don't be verbose for that part...
my $summary_value = $_summary;
$_summary = 1;
- # Do a HEAD
- my $response = &get_document('HEAD', $uri, 1);
+ # Do a GET: if it fails, we stop, if not, the results are cached
+ my $response = &get_document('GET', $uri, 1);
+ # ... but just for that part
$_summary = $summary_value;
- # Have we already processed it?
+ # Can we process the resource?
return(-1) if (defined($response->{Stop}));
+ # Have we already processed it?
return(1) if (defined($processed{$response->{absolute_uri}->as_string()}));
+ # It's not processed yet and it is processable: return 0
return(0);
}
@@ -590,6 +654,7 @@ sub W3C::UserAgent::redirect_ok {
&hprintf("\n%s %s ", $request->method(), $request->uri());
}
+ # Build a map of redirects
$self->{Redirects}{$self->{fetching}} = $request->uri();
$self->{fetching} = $request->uri();
@@ -600,7 +665,19 @@ sub W3C::UserAgent::redirect_ok {
sub get_uri() {
# Here we have a lot of extra parameters in order not to lose information
# if the function is called several times (401's)
- my ($method, $uri, $start, $redirects, $code, $realm, $message, $tested) = @_;
+ my ($method, $uri, $start, $redirects, $code, $realm, $message,
+ $auth) = @_;
+ # $method contains the method used
+ # $uri contains the target of the request
+ # $start is a timestamp (not defined the first time the function is
+ # called)
+ # $redirects is a map of redirects
+ # $code is the first HTTP return code
+ # $realm is the realm of the request
+ # $message is the HTTP message received
+ # $auth equals 1 if we want to send out authentication information
+
+ # For timing purposes
if (! defined($start)) {
$start = &get_timestamp();
}
@@ -611,13 +688,14 @@ sub get_uri() {
if ($_http_proxy) {
$ua->proxy('http', 'http://'.$_http_proxy);
}
- $ua->{uri} = $uri;
- $ua->{fetching} = $uri;
+ # $ua->{fetching} contains the URI we originally wanted
+ # $ua->{uri} is modified in the case of a redirect; this is used to
+ # build $ua->{Redirects}
+ $ua->{uri} = $ua->{fetching} = $uri;
if (defined($redirects)) {
$ua->{Redirects} = $redirects;
}
- my $count = 0;
- my $response;
+
if (! ($_summary || (!$doc_count && $_html))) {
&hprintf("%s %s ", $method, $uri);
}
@@ -626,7 +704,7 @@ sub get_uri() {
$request->header('Accept-Language' => 'en');
}
# Are we providing authentication info?
- if (defined($tested)
+ if (defined($auth)
&& ($request->url->netloc =~ /$_trusted$/)) {
if (defined($ENV{HTTP_AUTHORIZATION})) {
$request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
@@ -635,19 +713,18 @@ sub get_uri() {
}
}
# Do the query
- $response = $ua->request($request);
+ my $response = $ua->request($request);
# Get the results
+ # Record the very first response
if (! defined($code)) {
$code = $ua->{FirstResponse};
- }
- if (! defined($message)) {
$message = $ua->{FirstMessage};
}
# Authentication requested?
if (($response->code() == 401)
&& (defined($ENV{HTTP_AUTHORIZATION})
|| (defined($_user) && defined($_password)))
- && !defined ($tested)) {
+ && !defined ($auth)) {
# Deal with authentication and avoid loops
if (! defined ($realm)) {
$response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
@@ -680,6 +757,7 @@ sub get_uri() {
sub record_results() {
my ($uri, $method, $response) = @_;
+ $results{$uri}{response} = $response;
$results{$uri}{method} = $method;
$results{$uri}{location}{code} = $response->code();
$results{$uri}{location}{type} = $response->header('Content-type');
@@ -694,8 +772,12 @@ sub record_results() {
# Stores the authentication information
if (defined($response->{Realm})) {
$results{$uri}{location}{realm} = $response->{Realm};
- $results{$uri}{location}{display} = 401;
+ if (! $_hide_same_realm) {
+ $results{$uri}{location}{display} = 401;
+ }
}
+ # What type of broken link is it? (stored in {record} - the {display}
+ # information is just for visual use only)
if (($results{$uri}{location}{display} == 401)
&& ($results{$uri}{location}{code} == 404)) {
$results{$uri}{location}{record} = 404;
@@ -773,9 +855,9 @@ sub parse_document() {
return($p);
}
-####################################
+###################################
# Constructor for W3C::CheckLink #
-####################################
+###################################
sub W3C::CheckLink::new() {
my $p = HTML::Parser::new(@_, api_version => 3);
@@ -820,13 +902,13 @@ sub W3C::CheckLink::doctype() {
# Check for the id tag
if (
# HTML 2.0 & 3.0
- m/^-\/\/IETF\/\/DTD HTML [23]\.0\/\// ||
+ m%^-//IETF//DTD HTML [23]\.0//% ||
# HTML 3.2
- m/^-\/\/W3C\/\/DTD HTML 3\.2\/\//) {
+ m%^-//W3C//DTD HTML 3\.2//%) {
$self->{check_id} = 0;
}
# Enable XML extensions
- if (m/^-\/\/W3C\/\/DTD XHTML /) {
+ if (m%^-//W3C//DTD XHTML %) {
$self->xml_mode(1);
}
}
@@ -856,11 +938,12 @@ sub W3C::CheckLink::get_anchor() {
$anchor = $attr->{id};
}
if ($self->{check_name} && ($tag eq 'a')) {
+ # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
+ # Force an error if it's not the case (or if id's and name's values
+ # are different)
# If id is defined, name if defined must have the same value
if (!$anchor) {
$anchor = $attr->{name};
- } else {
- # @@@ Issue a warning
}
}
@@ -951,6 +1034,10 @@ sub W3C::CheckLink::declaration() {
sub check_validity() {
use HTTP::Status;
my ($testing, $uri, $links, $redirects) = @_;
+ # $testing is the URI of the document checked
+ # $uri is the URI of the target that we are verifying
+ # $links is a hash of the links in the documents checked
+ # $redirects is a map of the redirects encountered
# Checking file: URI's is not allowed with a CGI
if ($testing ne $uri) {
@@ -971,6 +1058,8 @@ sub check_validity() {
# Get the document with the appropriate method
my $method;
my @fragments = keys %{$links->{$uri}{fragments}};
+ # Only use GET if there are fragments. HEAD is enough if it's not the
+ # case.
if ($#fragments == -1) {
$method = 'HEAD';
} else {
@@ -983,12 +1072,13 @@ sub check_validity() {
|| (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
$being_processed = 1;
$response = &get_uri($method, $uri);
- # Record the redirects
- &record_redirects($redirects, $response->{Redirects});
# Get the information back from get_uri()
&record_results($uri, $method, $response);
+ # Record the redirects
+ &record_redirects($redirects, $response->{Redirects});
}
+ # We got the response of the HTTP request. Stop here if it was a HEAD.
if ($#fragments == -1) {
return;
}
@@ -999,7 +1089,7 @@ sub check_validity() {
if (!defined($results{$uri}{location}{type})) {
return;
}
- if (! ($results{$uri}{location}{type} =~ m/text\/html/i)) {
+ if (! ($results{$uri}{location}{type} =~ m|text/html|i)) {
if ($_verbose) {
&hprintf("Can't check content: Content-type is '%s'.\n",
$response->header('Content-type'));
@@ -1219,7 +1309,17 @@ sub show_link_report {
my $whattodo;
my $redirect_too;
if ($todo) {
- $whattodo = $todo->{$c};
+ if ($c == 500) {
+ # 500's could be a real 500 or a DNS lookup problem
+ if ($results->{$u}{location}{message} =~
+ m/Bad hostname '[^\']*'/) {
+ $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
+ } else {
+ $whattodo = 'This is a server-side problem. Check the URI.';
+ }
+ } else {
+ $whattodo = $todo->{$c};
+ }
if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) {
$redirect_too = 'The original resquest has been redirected.';
$whattodo .= ' '.$redirect_too if (! $_html);
@@ -1232,7 +1332,7 @@ sub show_link_report {
# Style stuff
my $idref = '';
if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
- $idref = ' id="code_'.$c.'"';
+ $idref = ' id="d'.$doc_count.'code_'.$c.'"';
$previous_c = $c;
}
# Main info
@@ -1367,20 +1467,22 @@ sub links_summary {
my %todo = ( 200 => 'There are broken fragments which must be fixed.',
300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!',
- 301 => 'Usually nothing. You may want to update the link though.',
+ 301 => 'You should update the link.',
302 => 'Usually nothing.',
400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.',
- 401 => 'The link is not public. You had better specify it.',
- 403 => 'The link is forbidden! This needs fixing. Usual suspect: a missing Overview.html or index.html, or bad access control.',
+ 401 => "The link is not public. You'd better specify it.",
+ 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
404 => 'The link is broken. Fix it NOW!',
- 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why.',
+ 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.',
407 => 'The link is a proxy, but requires Authentication.',
- 408 => 'The request timed out',
+ 408 => 'The request timed out.',
+ 410 => 'The resource is gone. You should remove this link.',
415 => 'The media type is not supported.',
- 500 => 'Either the hostname is incorrect or it is a server side problem.',
+ 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
501 => 'Could not check this link: method not implemented or scheme not supported.',
503 => 'The server cannot service the request, for some unknown reason.');
- my %priority = ( 404 => 1,
+ my %priority = ( 410 => 1,
+ 404 => 2,
403 => 5,
200 => 10,
300 => 15,
@@ -1465,7 +1567,8 @@ sub links_summary {
my $code;
foreach $code (sort(keys(%code_summary))) {
printf('<tr%s>', &bgcolor($code));
- printf('<td><a href="#code_%s">%s</a></td>', $code, $code);
+ printf('<td><a href="#d%scode_%s">%s</a></td>',
+ $doc_count, $code, $code);
printf('<td>%s</td>', $code_summary{$code});
printf('<td>%s</td>', $todo{$code});
print "</tr>\n";
@@ -1516,7 +1619,7 @@ sub html_header() {
print "Cache-Control: no-cache\nPragma: no-cache\n";
}
if (! $_cl) {
- print 'Content-type: text/html';
+ print "Content-Type: text/html\nContent-Language: en";
}
print "