diff options
-rwxr-xr-x | httpd/cgi-bin/checklink.pl | 233 |
1 files changed, 168 insertions, 65 deletions
diff --git a/httpd/cgi-bin/checklink.pl b/httpd/cgi-bin/checklink.pl index 80ab14e..72bf2e0 100755 --- a/httpd/cgi-bin/checklink.pl +++ b/httpd/cgi-bin/checklink.pl @@ -1,11 +1,11 @@ #! /usr/bin/perl -w # # W3C Link Checker -# by Hugo Haas -# (c) 1999-2000 World Wide Web Consortium +# by Hugo Haas <hugo@w3.org> +# (c) 1999-2001 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # -# $Id: checklink.pl,v 2.67 2000-09-25 17:38:35 hugo Exp $ +# $Id: checklink.pl,v 2.68 2001-01-18 21:58:04 hugo Exp $ # # This program is licensed under the W3C(r) License: # http://www.w3.org/Consortium/Legal/copyright-software @@ -34,7 +34,7 @@ $| = 1; # Version info my $PROGRAM = 'W3C checklink'; -my $VERSION = q$Revision: 2.67 $ . '(c) 1999-2000 W3C'; +my $VERSION = q$Revision: 2.68 $ . '(c) 1999-2001 W3C'; my $REVISION; ($REVISION = $VERSION) =~ s/Revision: (\d+\.\d+) .*/$1/; # Different options specified by the user @@ -56,6 +56,9 @@ my $_accept_language = 1; my $_languages = '*'; my $_base_location = '.'; my $_contact_address = 'hugo@w3.org'; +my $_masquerade = 0; +my $_local_dir = my $_remote_masqueraded_uri = ''; +my $_hide_same_realm = 0; # Restrictions for the online version my $_sleep_time = 3; @@ -90,6 +93,7 @@ if ($#ARGV >= 0) { $_verbose = 0; $_progress = 0; } + # Transform the parameter into a URI $uri = urize($uri); &check_uri($uri); } @@ -100,6 +104,7 @@ if ($#ARGV >= 0) { use CGI; use CGI::Carp qw(fatalsToBrowser); $query = new CGI; + # Set a few parameters in CGI mode $_cl = 0; $_verbose = 0; $_progress = 0; @@ -130,11 +135,17 @@ if ($#ARGV >= 0) { } $uri =~ s/^\s+//g; if ($uri =~ m/^file:/) { + # Only the http scheme is allowed &file_uri($uri); } elsif (!($uri =~ m/:/)) { - $uri = 'http://'.$uri; + if ($uri =~ m|^//|) { + $uri = 'http:'.$uri; + } else { + $uri = 'http://'.$uri; + } } &check_uri($uri, 1); + &html_footer(); } ############################################################################### @@ -215,6 +226,12 @@ sub parse_arguments() { $_trusted = shift(@ARGV); } elsif (m/^-y|--proxy$/) { $_http_proxy = shift(@ARGV); + } elsif (m/^--masquerade$/) { + $_masquerade = 1; + $_local_dir = shift(@ARGV); + $_remote_masqueraded_uri = shift(@ARGV); + } elsif (m/^--hide-same-realm$/) { + $_hide_same_realm = 1; } else { push(@uris, $_); } @@ -244,10 +261,15 @@ Options: -i/--indicator Show progress while parsing. -u/--user username Specify a username for authentication. -p/--password password Specify a password. + --hide-same-real Hide 401's that are in the same realm as the + document checked. -t/--timeout value Timeout for the HTTP requests. -d/--domain domain Regular expression describing the domain to which the authetication information will be sent (default: '$_trusted'). + --masquerade local remote Masquerade local dir as a remote URI (e.g. + /home/hugo/MathML2/ is in fact + http://www.w3.org/TR/MathML2/). -y/--proxy proxy Specify an HTTP proxy server. -h/--html HTML output. --help Show this message. @@ -256,7 +278,7 @@ Options: } sub ask_password() { - print(STDERR 'Enter your password: '); + print(STDERR 'Enter the password for user '.$_user.': '); # Will only work on Unix... system('stty -echo'); chomp($_password = <STDIN>); @@ -277,7 +299,7 @@ sub urize() { my $res = $_; if (m/:/) { $base = URI->new($_); - } elsif (m/^\//) { + } elsif (m|^/|) { $base = URI->new('file://localhost'); } else { my $pwd; @@ -289,21 +311,24 @@ sub urize() { return($result->as_string()); } -####################### -# Do the job on a URI # -####################### +######################################## +# Check for broken links in a resource # +######################################## sub check_uri() { my ($uri, $html_header) = @_; + # If $html_header equals 1, we need to generate a HTML header (first + # instance called in HTML mode). my $start; if (! $_quiet) { $start = &get_timestamp(); } - # Get the document + # Get and parse the document my $response = &get_document('GET', $uri, $doc_count, \%redirects); + # Can we check the resource? If not, we exit here... if (defined($response->{Stop})) { return(-1); } @@ -334,12 +359,15 @@ sub check_uri() { } } + # Record that we have processed this resource $processed{$absolute_uri} = 1; + # Parse the document my $p = &parse_document($uri, $absolute_uri, $response->content(), 1); my $base = URI->new($p->{base}); # Check anchors + ############### if (! $_summary) { print("Checking anchors:\n"); @@ -352,9 +380,11 @@ sub check_uri() { foreach $l (keys %{$p->{Anchors}{$anchor}}) { $times += $p->{Anchors}{$anchor}{$l}; } + # They should appear only once if ($times > 1) { $errors{$anchor} = 1; } + # Empty IDREF's are not allowed if ($anchor eq '') { $errors{$anchor} = 1; } @@ -364,21 +394,35 @@ sub check_uri() { } # Check links + ############# my %links; - # Record all the links + # Record all the links found my $link; foreach $link (keys %{$p->{Links}}) { my $link_uri = URI->new($link); my $abs_link_uri = URI->new_abs($link_uri, $base); + if ($_masquerade) { + if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) { + printf("processing %s in base $_local_dir %s\n", + $abs_link_uri); + my $nlink; + $nlink = $abs_link_uri; + $nlink =~ + s|^$_remote_masqueraded_uri|file://localhost$_local_dir|; + $abs_link_uri = URI->new($nlink); + }; + } my $lines; foreach $lines (keys %{$p->{Links}{$link}}) { my $canonical = URI->new($abs_link_uri->canonical()); my $url = $canonical->scheme().':'.$canonical->opaque(); my $fragment = $canonical->fragment(); if (! $fragment) { + # Document without fragment $links{$url}{location}{$lines} = 1; } else { + # Resource with a fragment $links{$url}{fragments}{$fragment}{$lines} = 1; } } @@ -393,15 +437,19 @@ sub check_uri() { if (! $_summary) { &hprintf("Checking link %s\n", $u); } + # Check that a link is valid &check_validity($uri, $u, \%links, \%redirects); if ($_verbose) { &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}); } if ($results{$u}{location}{success}) { my $fragment; + # Even though it was not broken, we might want to display it + # on the results page (e.g. because it required authentication) if ($results{$u}{location}{display} >= 400) { $broken{$u}{location} = 1; } + # List the broken fragments foreach $fragment (keys %{$links{$u}{fragments}}) { if ($_verbose) { &hprintf("\t\t%s %s - Lines: %s\n", @@ -420,6 +468,7 @@ sub check_uri() { } else { # Couldn't find the document $broken{$u}{location} = 1; + # All the fragments associated are hence broken my $fragment; foreach $fragment (keys %{$links{$u}{fragments}}) { $broken{$u}{fragments}{$fragment}++; @@ -450,14 +499,17 @@ sub check_uri() { # Get the name of the original directory # e.g. http://www.w3.org/TR/html4/Overview.html # should return http://www.w3.org/TR/html4/ - $results{$uri}{parsing}{base} =~ m/^(.*\/)[^\/]*/; + $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|; $_base_location = $1; } foreach $u (keys %links) { next if (! (# Check if it's in our scope for recursion - ($u =~ m/^$_base_location/) && + ($u =~ m|^$_base_location|) && # and the link is not broken - $results{$u}{location}{success}) + $results{$u}{location}{success} && + # And it is a text/html resource + ($results{$u}{location}{type} =~ m|text/html|) + ) ); # Check if we have already processed the URI next if (&already_processed($u) != 0); @@ -490,18 +542,31 @@ sub check_uri() { } } -############################# -# Get a document to process # -############################# +####################################### +# Get and parse a resource to process # +####################################### sub get_document() { my ($method, $uri, $in_recursion, $redirects) = @_; + # $method contains the HTTP method the use (GET or HEAD) + # $uri contains the identifier of the resource + # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least + # the second resource checked) + # $redirects is a pointer to the hash containing the map of the redirects - # Get the document - my $response = &get_uri($method, $uri); - &record_results($uri, $method, $response); + # Get the resource + my $response; + if (defined($results{$uri}{response}) + && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { + $response = $results{$uri}{response}; + } else { + $response = &get_uri($method, $uri); + &record_results($uri, $method, $response); + &record_redirects($redirects, $response->{Redirects}); + } if (! $response->is_success()) { if (! $in_recursion) { + # Is it too late to request authentication? if ($response->code() == 401) { &authentication($response); } else { @@ -510,25 +575,20 @@ sub get_document() { } &hprintf("\nError: %d %s\n", $response->code(), $response->message()); - if ($_html) { - &html_footer(); - } } } $response->{Stop} = 1; return($response); } - &record_redirects($redirects, $response->{Redirects}); - - # What URI are we processing by the way? + # What is the URI of the resource that we are processing by the way? my $base_uri = URI->new($response->base()); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $base_uri->abs($request_uri); - # Parse the document + # Can we parse the document? my $failed_reason; - if (! ($response->header('Content-Type') =~ m/text\/html/)) { + if (! ($response->header('Content-Type') =~ m|text/html|)) { $failed_reason = "Content-Type is '". $response->header('Content-Type')."'"; } elsif ($response->header('Content-Encoding') @@ -537,6 +597,7 @@ sub get_document() { $response->header('Content-encoding')."'"; } if ($failed_reason) { + # No, there is a problem... if (! $in_recursion) { if ($_html) { &html_header($uri); @@ -557,15 +618,18 @@ sub get_document() { sub already_processed($) { my ($uri, %redirects) = @_; - # Don't be verbose for that part + # Don't be verbose for that part... my $summary_value = $_summary; $_summary = 1; - # Do a HEAD - my $response = &get_document('HEAD', $uri, 1); + # Do a GET: if it fails, we stop, if not, the results are cached + my $response = &get_document('GET', $uri, 1); + # ... but just for that part $_summary = $summary_value; - # Have we already processed it? + # Can we process the resource? return(-1) if (defined($response->{Stop})); + # Have we already processed it? return(1) if (defined($processed{$response->{absolute_uri}->as_string()})); + # It's not processed yet and it is processable: return 0 return(0); } @@ -590,6 +654,7 @@ sub W3C::UserAgent::redirect_ok { &hprintf("\n%s %s ", $request->method(), $request->uri()); } + # Build a map of redirects $self->{Redirects}{$self->{fetching}} = $request->uri(); $self->{fetching} = $request->uri(); @@ -600,7 +665,19 @@ sub W3C::UserAgent::redirect_ok { sub get_uri() { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) - my ($method, $uri, $start, $redirects, $code, $realm, $message, $tested) = @_; + my ($method, $uri, $start, $redirects, $code, $realm, $message, + $auth) = @_; + # $method contains the method used + # $uri contains the target of the request + # $start is a timestamp (not defined the first time the function is + # called) + # $redirects is a map of redirects + # $code is the first HTTP return code + # $realm is the realm of the request + # $message is the HTTP message received + # $auth equals 1 if we want to send out authentication information + + # For timing purposes if (! defined($start)) { $start = &get_timestamp(); } @@ -611,13 +688,14 @@ sub get_uri() { if ($_http_proxy) { $ua->proxy('http', 'http://'.$_http_proxy); } - $ua->{uri} = $uri; - $ua->{fetching} = $uri; + # $ua->{fetching} contains the URI we originally wanted + # $ua->{uri} is modified in the case of a redirect; this is used to + # build $ua->{Redirects} + $ua->{uri} = $ua->{fetching} = $uri; if (defined($redirects)) { $ua->{Redirects} = $redirects; } - my $count = 0; - my $response; + if (! ($_summary || (!$doc_count && $_html))) { &hprintf("%s %s ", $method, $uri); } @@ -626,7 +704,7 @@ sub get_uri() { $request->header('Accept-Language' => 'en'); } # Are we providing authentication info? - if (defined($tested) + if (defined($auth) && ($request->url->netloc =~ /$_trusted$/)) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); @@ -635,19 +713,18 @@ sub get_uri() { } } # Do the query - $response = $ua->request($request); + my $response = $ua->request($request); # Get the results + # Record the very first response if (! defined($code)) { $code = $ua->{FirstResponse}; - } - if (! defined($message)) { $message = $ua->{FirstMessage}; } # Authentication requested? if (($response->code() == 401) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($_user) && defined($_password))) - && !defined ($tested)) { + && !defined ($auth)) { # Deal with authentication and avoid loops if (! defined ($realm)) { $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; @@ -680,6 +757,7 @@ sub get_uri() { sub record_results() { my ($uri, $method, $response) = @_; + $results{$uri}{response} = $response; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{type} = $response->header('Content-type'); @@ -694,8 +772,12 @@ sub record_results() { # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; - $results{$uri}{location}{display} = 401; + if (! $_hide_same_realm) { + $results{$uri}{location}{display} = 401; + } } + # What type of broken link is it? (stored in {record} - the {display} + # information is just for visual use only) if (($results{$uri}{location}{display} == 401) && ($results{$uri}{location}{code} == 404)) { $results{$uri}{location}{record} = 404; @@ -773,9 +855,9 @@ sub parse_document() { return($p); } -#################################### +################################### # Constructor for W3C::CheckLink # -#################################### +################################### sub W3C::CheckLink::new() { my $p = HTML::Parser::new(@_, api_version => 3); @@ -820,13 +902,13 @@ sub W3C::CheckLink::doctype() { # Check for the id tag if ( # HTML 2.0 & 3.0 - m/^-\/\/IETF\/\/DTD HTML [23]\.0\/\// || + m%^-//IETF//DTD HTML [23]\.0//% || # HTML 3.2 - m/^-\/\/W3C\/\/DTD HTML 3\.2\/\//) { + m%^-//W3C//DTD HTML 3\.2//%) { $self->{check_id} = 0; } # Enable XML extensions - if (m/^-\/\/W3C\/\/DTD XHTML /) { + if (m%^-//W3C//DTD XHTML %) { $self->xml_mode(1); } } @@ -856,11 +938,12 @@ sub W3C::CheckLink::get_anchor() { $anchor = $attr->{id}; } if ($self->{check_name} && ($tag eq 'a')) { + # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory + # Force an error if it's not the case (or if id's and name's values + # are different) # If id is defined, name if defined must have the same value if (!$anchor) { $anchor = $attr->{name}; - } else { - # @@@ Issue a warning } } @@ -951,6 +1034,10 @@ sub W3C::CheckLink::declaration() { sub check_validity() { use HTTP::Status; my ($testing, $uri, $links, $redirects) = @_; + # $testing is the URI of the document checked + # $uri is the URI of the target that we are verifying + # $links is a hash of the links in the documents checked + # $redirects is a map of the redirects encountered # Checking file: URI's is not allowed with a CGI if ($testing ne $uri) { @@ -971,6 +1058,8 @@ sub check_validity() { # Get the document with the appropriate method my $method; my @fragments = keys %{$links->{$uri}{fragments}}; + # Only use GET if there are fragments. HEAD is enough if it's not the + # case. if ($#fragments == -1) { $method = 'HEAD'; } else { @@ -983,12 +1072,13 @@ sub check_validity() { || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; $response = &get_uri($method, $uri); - # Record the redirects - &record_redirects($redirects, $response->{Redirects}); # Get the information back from get_uri() &record_results($uri, $method, $response); + # Record the redirects + &record_redirects($redirects, $response->{Redirects}); } + # We got the response of the HTTP request. Stop here if it was a HEAD. if ($#fragments == -1) { return; } @@ -999,7 +1089,7 @@ sub check_validity() { if (!defined($results{$uri}{location}{type})) { return; } - if (! ($results{$uri}{location}{type} =~ m/text\/html/i)) { + if (! ($results{$uri}{location}{type} =~ m|text/html|i)) { if ($_verbose) { &hprintf("Can't check content: Content-type is '%s'.\n", $response->header('Content-type')); @@ -1219,7 +1309,17 @@ sub show_link_report { my $whattodo; my $redirect_too; if ($todo) { - $whattodo = $todo->{$c}; + if ($c == 500) { + # 500's could be a real 500 or a DNS lookup problem + if ($results->{$u}{location}{message} =~ + m/Bad hostname '[^\']*'/) { + $whattodo = 'The hostname could not be resolved. This link needs to be fixed.'; + } else { + $whattodo = 'This is a server-side problem. Check the URI.'; + } + } else { + $whattodo = $todo->{$c}; + } if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) { $redirect_too = 'The original resquest has been redirected.'; $whattodo .= ' '.$redirect_too if (! $_html); @@ -1232,7 +1332,7 @@ sub show_link_report { # Style stuff my $idref = ''; if ($codes && (!defined($previous_c) || ($c != $previous_c))) { - $idref = ' id="code_'.$c.'"'; + $idref = ' id="d'.$doc_count.'code_'.$c.'"'; $previous_c = $c; } # Main info @@ -1367,20 +1467,22 @@ sub links_summary { my %todo = ( 200 => 'There are broken fragments which must be fixed.', 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!', - 301 => 'Usually nothing. You may want to update the link though.', + 301 => 'You should update the link.', 302 => 'Usually nothing.', 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.', - 401 => 'The link is not public. You had better specify it.', - 403 => 'The link is forbidden! This needs fixing. Usual suspect: a missing Overview.html or index.html, or bad access control.', + 401 => "The link is not public. You'd better specify it.", + 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 404 => 'The link is broken. Fix it NOW!', - 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why.', + 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.', 407 => 'The link is a proxy, but requires Authentication.', - 408 => 'The request timed out', + 408 => 'The request timed out.', + 410 => 'The resource is gone. You should remove this link.', 415 => 'The media type is not supported.', - 500 => 'Either the hostname is incorrect or it is a server side problem.', + 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.', 501 => 'Could not check this link: method not implemented or scheme not supported.', 503 => 'The server cannot service the request, for some unknown reason.'); - my %priority = ( 404 => 1, + my %priority = ( 410 => 1, + 404 => 2, 403 => 5, 200 => 10, 300 => 15, @@ -1465,7 +1567,8 @@ sub links_summary { my $code; foreach $code (sort(keys(%code_summary))) { printf('<tr%s>', &bgcolor($code)); - printf('<td><a href="#code_%s">%s</a></td>', $code, $code); + printf('<td><a href="#d%scode_%s">%s</a></td>', + $doc_count, $code, $code); printf('<td>%s</td>', $code_summary{$code}); printf('<td>%s</td>', $todo{$code}); print "</tr>\n"; @@ -1516,7 +1619,7 @@ sub html_header() { print "Cache-Control: no-cache\nPragma: no-cache\n"; } if (! $_cl) { - print 'Content-type: text/html'; + print "Content-Type: text/html\nContent-Language: en"; } print " |