#! /usr/bin/perl -w # # W3C Link Checker # by Hugo Haas # (c) 1999-2000 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # # $Id: LinkChecker.pl,v 1.33 2000-02-24 22:22:05 hugo Exp $ # # This program is licensed under the W3C(r) License: # http://www.w3.org/Consortium/Legal/copyright-software # # See the CVSweb interface at: # http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/LinkChecker.pl # # An online version is available at: # http://validator.w3.org/checklink use strict; package W3C::UserAgent; require LWP::UserAgent; @W3C::UserAgent::ISA = qw(LWP::UserAgent); package W3C::LinkChecker; require HTML::Parser; @W3C::LinkChecker::ISA = qw(HTML::Parser); # Autoflush $| = 1; # Version info my $PROGRAM = 'W3C LinkChecker'; my $VERSION = q$Revision: 1.33 $ . '(c) 1999-2000 W3C'; my $REVISION; ($REVISION = $VERSION) =~ s/Revision: (\d+\.\d+) .*/$1/; # Different options specified by the user my $_cl; my $_quiet = 0; my $_summary = 0; my $_verbose = 0; my $_progress = 0; my $_html = 0; my $_timeout = 60; my $_chunksize = 1024; my $_redirects = 1; my $_dir_redirects = 1; my $_user; my $_password; my $_trusted = '\.w3\.org'; my $_http_proxy; my $_recursive = 0; my $_base_location = '.'; # Global variables # Used for the output my $first = 2; # What is our query? my $query; # What URI's did we process? (used for $_recursive == 1) my %processed; # Result of the HTTP query my %results; # List of redirects my %redirects; if ($#ARGV >= 0) { $_cl = 1; # Parse command line my @uris = &parse_arguments(); if ($_user && (! $_password)) { &ask_password(); } my $uri; foreach $uri (@uris) { if (! $_summary) { printf("%s %s\n", $PROGRAM ,$VERSION); } else { $_verbose = 0; $_progress = 0; } $uri = urize($uri); &check_uri($uri); } } else { use CGI; use CGI::Carp qw(fatalsToBrowser); $query = new CGI; $_cl = 0; $_verbose = 0; $_progress = 0; if ($query->param('summary')) { $_summary = 1; } else { } if ($query->param('hide_redirects')) { $_redirects = 0; } if ($query->param('hide_dir_redirects')) { $_dir_redirects = 0; } if ($query->param('recursive')) { $_recursive = 1; } $_html = 1; my $uri; if ($query->param('uri')) { $uri = $query->param('uri'); } elsif ($query->param('url')) { $uri = $query->param('url'); } else { &print_form($query); } $uri =~ s/^\s+//g; if ($uri =~ m/^file:/) { &file_uri($uri); } elsif (!($uri =~ m/:/)) { $uri = 'http://'.$uri; } &check_uri($uri, 1); } ############################################################################### ################################ # Command line and usage stuff # ################################ sub parse_arguments() { my @uris; my $uris = 0; while (@ARGV) { $_ = shift(@ARGV); if ($uris) { push(@uris, $_); } elsif (m/^--$/) { $uris = 1; } elsif (m/^-[^-upytdcl]/) { if (m/q/) { $_quiet = 1; $_summary = 1; } if (m/s/) { $_summary = 1; } if (m/b/) { $_redirects = 0; } if (m/e/) { $_dir_redirects = 0; } if (m/v/) { $_verbose = 1; } if (m/i/) { $_progress = 1; } if (m/h/) { $_html = 1; } if (m/r/) { $_recursive = 1; } } elsif (m/^--help$/) { &usage(); } elsif (m/^--quiet$/) { $_quiet = 1; } elsif (m/^--summary$/) { $_summary = 1; } elsif (m/^--broken$/) { $_redirects = 0; } elsif (m/^--dir-redirects$/) { $_dir_redirects = 0; } elsif (m/^--verbose$/) { $_verbose = 1; } elsif (m/^--indicator$/) { $_progress = 1; } elsif (m/^--html$/) { $_html = 1; } elsif (m/^--recursive$/) { $_recursive = 1; } elsif (m/^-l|--location$/) { $_base_location = shift(@ARGV); } elsif (m/^-u|--user$/) { $_user = shift(@ARGV); } elsif (m/^-p|--password$/) { $_password = shift(@ARGV); } elsif (m/^-t|--timeout$/) { $_timeout = shift(@ARGV); } elsif (m/^-d|--domain$/) { $_trusted = shift(@ARGV); } elsif (m/^-y|--proxy$/) { $_http_proxy = shift(@ARGV); } elsif (m/^-c|--chunksize$/) { $_chunksize = shift(@ARGV); } else { push(@uris, $_); } } return(@uris); } sub usage() { print STDERR "$PROGRAM $VERSION Usage: LinkCheck.pl Options: -s/--summary Result summary only. -b/--broken Show only the broken links, not the redirects. -e/--directory Hide directory redirects - e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ -r/--recursive Check the documents linked from the first one. -l/--location uri Scope of the documents checked. By default, for http://www.w3.org/TR/html4/Overview.html for example, it would be: http://www.w3.org/TR/html4/ -q/--quiet No output if no errors are found. -v/--verbose Verbose mode. -i/--indicator Show progress while parsing. -u/--user username Specify a username for authentication. -p/--password password Specify a password. -t/--timeout value Timeout for the HTTP requests. -d/--domain domain Regular expression describing the domain to which the authetication information will be sent (default: $_trusted). -y/--proxy proxy Specify an HTTP proxy server. -c/--chunk-size size Size of the blocks parsed (default: $_chunksize). -h/--html HTML output. --help Show this message. "; exit(0); } sub ask_password() { print(STDERR 'Enter your password: '); system('stty -echo'); chomp($_password = ); system('stty echo'); print(STDERR "ok.\n"); } ############################################################################### ########################################### # Transform foo into file://localhost/foo # ########################################### sub urize() { use URI; $_ = URI::Escape::uri_unescape($_[0]); my $base; my $res = $_; if (m/:/) { $base = URI->new($_); } elsif (m/^\//) { $base = URI->new('file://localhost'); } else { my $pwd; chop($pwd = `pwd`); $base = URI->new('file://localhost'.$pwd.'/'); } my $u = URI->new($res); my $result = $u->abs($base); return($result->as_string()); } ####################### # Do the job on a URI # ####################### sub check_uri() { my ($uri, $html_stuff) = @_; if ($_html && $html_stuff) { $first = 1; } else { $first = 0; } my $start; if (! $_quiet) { $start = &get_timestamp(); } # Get the document my $response = &get_document('GET', $uri, 1, \%redirects); if (defined($response->{Stop})) { if ($html_stuff) { &html_header($uri); } &hprintf("\nError: %d %s\n", $response->code(), $response->message()); if ($html_stuff) { &html_footer(); } return(-1); } if ($_html) { if ($html_stuff) { &html_header($uri); } print('

'); } my $absolute_uri = $response->{absolute_uri}->as_string(); printf("\nProcessing\t%s\n\n", $absolute_uri); if ($_html) { print("

\n"); if (! $_summary) { print "
\n";
        }
    }

    $processed{$absolute_uri} = 1;
    my $p = &parse_document($uri, $absolute_uri,
                            $response->content(), 1);
    my $base = URI->new($p->{base});

    # Check anchors
    ###############

    if (! $_summary) {
        print("Checking anchors:\n");
    }
    my %errors;
    my $anchor;
    foreach $anchor (keys %{$p->{Anchors}}) {
        my @lines = keys %{$p->{Anchors}{$anchor}};
        my $times = $#lines + 1;
        if ($times > 1) {
            $errors{$anchor} = 1;
        }
        if ($anchor eq '') {
            $errors{$anchor} = 1;
        }
    }
    if (! $_summary) {
        print(" done.\n");
    }

    # Check links
    #############

    my %links;
    # Record all the links
    my $link;
    foreach $link (keys %{$p->{Links}}) {
        my $link_uri = URI->new($link);
        my $abs_link_uri = URI->new_abs($link_uri, $base);
        my $lines;
        foreach $lines (keys %{$p->{Links}{$link}}) {
            my $canonical = URI->new($abs_link_uri->canonical());
            my $url = $canonical->scheme().':'.$canonical->opaque();
            my $fragment = $canonical->fragment();
            if (! $fragment) {
                $links{$url}{location}{$lines} = 1;
            } else {
                $links{$url}{fragments}{$fragment}{$lines} = 1;
            }
        }
    }

    # Build the list of broken URI's
    my %broken;
    my $u;
    foreach $u (keys %links) {
        # Don't check mailto: URI's
        next if ($u =~ m/^mailto:/);
        if (! $_summary) {
            &hprintf("Checking link %s\n", $u);
        }
        &check_validity($uri, $u, \%links, \%redirects);
        if ($_verbose) {
            &hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
        }
        if ($results{$u}{location}{success}) {
            my $fragment;
            if ($results{$u}{location}{display} >= 400) {
                $broken{$u}{location} = 1;
            }
            foreach $fragment (keys %{$links{$u}{fragments}}) {
                if ($_verbose) {
                    &hprintf("\t\t%s %s - Lines: %s\n",
                             $fragment,
                             ($results{$u}{fragments}{$fragment}
                              ? 'OK' : 'Not found'),
                             join(',',
                                  keys %{$links{$u}{fragments}{$fragment}})
                             );
                }
                # A broken fragment?
                if ($results{$u}{fragments}{$fragment} == 0) {
                    $broken{$u}{fragments}{$fragment} += 2;
                }
            }
        } else {
            # Couldn't find the document
            $broken{$u}{location} = 1;
            my $fragment;
            foreach $fragment (keys %{$links{$u}{fragments}}) {
                $broken{$u}{fragments}{$fragment}++;
            }
        }
    }
    if (! $_summary) {
        my $stop = &get_timestamp();
        &hprintf("Processed in %ss.\n", &time_diff($start, $stop));
    }

    # Display results
    if ($_html) {
        if (! $_summary) {
            print "
\n"; } } if (! $_quiet) { print "\n"; } &anchors_summary($p->{Anchors}, \%errors); &links_summary(\%links, \%results, \%broken, \%redirects); # Do we want to process other documents? if ($_recursive) { if ($_base_location eq '.') { # Get the name of the original directory # e.g. http://www.w3.org/TR/html4/Overview.html # should return http://www.w3.org/TR/html4/ $results{$uri}{parsing}{base} =~ m/^(.*\/)[^\/]*/; $_base_location = $1; } foreach $u (keys %links) { next if (! (# Check if it's in our scope for recursion ($u =~ m/^$_base_location/) && # and the link is not broken $results{$u}{location}{success}) ); # Check if we have already processed the URI next if (&already_processed($u) != 0); # Do the job print "\n"; if (! $_html) { my $i = 40; while ($i--) { print('-'); } } else { print('
'); # For the online version, wait for a while to avoid abuses sleep(3); } print "\n"; &check_uri($u, 0); } } if ($_html && $html_stuff) { &html_footer(); } } ############################# # Get a document to process # ############################# sub get_document() { my ($method, $uri, $in_recursion, $redirects) = @_; # Get the document my $response = &get_uri($method, $uri); &record_results($uri, $method, $response); $first = 0; if (! $response->is_success()) { if (! $in_recursion) { if ($response->code() == 401) { &authentication($response); } else { if ($_html) { &html_header($uri); } &hprintf("Error: %d %s\n", $response->code(), $response->message()); if ($_html) { &html_footer(); } } } $response->{Stop} = 1; return($response); } &record_redirects($redirects, $response->{Redirects}); # What URI are we processing by the way? my $base_uri = URI->new($response->base()); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $base_uri->abs($request_uri); # Parse the document if (! ($response->header('Content-type') =~ m/text\/html/)) { if (! $in_recursion) { &hprintf("Can't check link: Content-type is '%s'.\n", $response->header('Content-type')); } $response->{Stop} = 1; } # Ok, return the information return($response); } ################################################## # Check whether a URI has already been processed # ################################################## sub already_processed($) { my ($uri, %redirects) = @_; # Don't be verbose for that part my $summary_value = $_summary; $_summary = 1; # Do a HEAD my $response = &get_document('HEAD', $uri, 1); $_summary = $summary_value; # Have we already processed it? return(-1) if (defined($response->{Stop})); return(1) if (defined($processed{$response->{absolute_uri}->as_string()})); return(0); } ############################ # Get the content of a URI # ############################ sub W3C::UserAgent::simple_request() { my $self = shift; my $response = $self->W3C::UserAgent::SUPER::simple_request(@_); if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message(); } return $response; } sub W3C::UserAgent::redirect_ok { my ($self, $request) = @_; if (! ($_summary || $first)) { &hprintf("\n%s %s ", $request->method(), $request->uri()); } $self->{Redirects}{$self->{fetching}} = $request->uri(); $self->{fetching} = $request->uri(); return 0 if $request->method() eq "POST"; return 1; } sub get_uri() { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) my ($method, $uri, $start, $redirects, $code, $realm, $message, $tested) = @_; if (! defined($start)) { $start = &get_timestamp(); } # Prepare the query my $ua = new W3C::UserAgent; $ua->timeout($_timeout); $ua->agent('W3CLinkChecker/'.$REVISION.' '.$ua->agent()); if ($_http_proxy) { $ua->proxy('http', 'http://'.$_http_proxy); } $ua->{uri} = $uri; $ua->{fetching} = $uri; if (defined($redirects)) { $ua->{Redirects} = $redirects; } my $count = 0; my $response; if (! ($_summary || $first)) { &hprintf("%s %s ", $method, $uri); } my $request = new HTTP::Request($method, $uri); # Are we providing authentication info? if (defined($tested) && ($request->url->netloc =~ /$_trusted$/)) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } elsif (defined($_user) && defined($_password)) { use MIME::Base64; my $authorization = encode_base64($_user.':'.$_password); $request->headers->header(Authorization => 'Basic '.$authorization); } } # Do the query $response = $ua->request($request); # Get the results if (! defined($code)) { $code = $ua->{FirstResponse}; } if (! defined($message)) { $message = $ua->{FirstMessage}; } # Authentication requested? if (($response->code() == 401) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($_user) && defined($_password))) && !defined ($tested)) { # Deal with authentication and avoid loops if (! defined ($realm)) { $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; $realm = $1; } if (! ($_summary || $first)) { print "\n"; } return &get_uri($method, $response->request->url, $start, $ua->{Redirects}, $code, $realm, $message, 1); } # Record the redirects $response->{Redirects} = $ua->{Redirects}; my $stop = &get_timestamp(); if (! ($_summary || $first)) { &hprintf(" fetched in %ss\n", &time_diff($start,$stop)); } $response->{OriginalCode} = $code; $response->{OriginalMessage} = $message; if (defined($realm)) { $response->{Realm} = $realm; } return $response; } ######################################### # Record the results of an HTTP request # ######################################### sub record_results() { my ($uri, $method, $response) = @_; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{type} = $response->header('Content-type'); $results{$uri}{location}{display} = $results{$uri}{location}{code}; $results{$uri}{location}{orig} = $response->{OriginalCode}; # Did we get a redirect? if ($response->{OriginalCode} != $response->code()) { $results{$uri}{location}{orig_message} = $response->{OriginalMessage}; $results{$uri}{location}{redirected} = 1; } $results{$uri}{location}{success} = $response->is_success(); # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; $results{$uri}{location}{display} = 401; } if (($results{$uri}{location}{display} == 401) && ($results{$uri}{location}{code} == 404)) { $results{$uri}{location}{record} = 404; } else { $results{$uri}{location}{record} = $results{$uri}{location}{display}; } # Did it fail? if (! $results{$uri}{location}{success}) { $results{$uri}{location}{message} = $response->message(); if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } #################### # Parse a document # #################### sub parse_document() { my ($uri, $location, $document, $links) = @_; my $p; if (defined($results{$uri}{parsing})) { # We have already done the job. Woohoo! $p->{base} = $results{$uri}{parsing}{base}; $p->{Anchors} = $results{$uri}{parsing}{Anchors}; $p->{Links} = $results{$uri}{parsing}{Links}; return($p); } my $start; $p = W3C::LinkChecker->new(); # Loose interpretation of the HTML comments since browsers will do the same $p->strict_comment(0); $p->{base} = $location; if (! $_summary) { $start = &get_timestamp(); print("Parsing...\n"); } if (!$_summary || $_progress) { $p->{Total} = ($document =~ tr/\n//); } # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $_recursive); # Parse small chunks: much faster my @chunks = unpack("a$_chunksize"x(length($document)/$_chunksize).'a*', $document); for (@chunks) { $p->parse($_); } if (! $_summary) { my $stop = &get_timestamp(); if ($_progress) { print "\r"; } &hprintf(" done (%d lines in %ss).\n", $p->{Total}, &time_diff($start, $stop)); } # Save the results before exiting $results{$uri}{parsing}{base} = $p->{base}; $results{$uri}{parsing}{Anchors} = $p->{Anchors}; $results{$uri}{parsing}{Links} = $p->{Links}; return($p); } #################################### # Constructor for W3C::Linkchecker # #################################### sub W3C::LinkChecker::new() { my $p = HTML::Parser::new(@_); # Line count $p->{Line} = 1; # Attribute for ids in element a # Up to XHTML 1.0, it is 'name'. After that it is 'id'. $p->{check_name} = 1; return $p; } ################################################# # Record or return the doctype of the document # ################################################# sub W3C::LinkChecker::doctype() { my ($self, $dc) = @_; if (! $dc) { return $self->{doctype}; } $self->{doctype} = $dc; # Check if we should check or not # The good way to do that is to get the DTD and parse it but it is # much more complex if ($dc eq '-//W3C//DTD XHTML Basic 1.0//EN') { $self->{check_name} = 0; } } ####################################### # Count the number of lines in a file # ####################################### sub W3C::LinkChecker::new_line() { my ($self, $string) = @_; my $count = ($string =~ tr/\n//); $self->{Line} = $self->{Line} + $count; if ($_progress) { printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)); } } ####################################### # start function used by HTML::Parser # ####################################### sub W3C::LinkChecker::start() { my ($self, $tag, $attr, $attrseq, $text) = @_; my $anchor; # Links if (!$self->{only_anchors}) { my $link; $anchor = $attr->{id}; if (($tag eq 'a')) { $link = $attr->{href}; if ($self->{check_name}) { $anchor = $attr->{name}; } } elsif ($tag eq 'img') { $link = $attr->{src}; } elsif (($tag eq 'frame') || ($tag eq 'link')) { $link = $attr->{href}; } if (defined($link)) { $self->{Links}{$link}{$self->{Line}}++; } # Just anchors } else { $anchor = $attr->{id}; if (($tag eq 'a') && $self->{check_name}) { $anchor = $attr->{name}; } } if (defined($anchor)) { $self->{Anchors}{$anchor}{$self->{Line}}++; } # Line counting if ($text =~ m/\n/) { $self->new_line($text); } } #################################################### # Overloading functions for line counting purposes # #################################################### # W3C::LinkChecker::text() is called by end(), declaration() and comment() sub W3C::LinkChecker::text() { my ($self, $text) = @_; if (!$_progress) { # If we are just extracting information about anchors, # parsing this part is only cosmetic (progress indicator) return unless !$self->{only_anchors}; } if ($text =~ /\n/) { $self->new_line($text); } } sub W3C::LinkChecker::end() { my $self = shift; return unless !$self->{only_anchors}; shift; my $text = shift; $self->text($text); } sub W3C::LinkChecker::declaration() { my ($self, $text) = @_; # Extract the doctype my @declaration = split(/\s+/, $text, 4); if (($#declaration >= 3) && ($declaration[0] eq 'DOCTYPE') && (lc($declaration[1]) eq 'html')) { # Parse the doctype declaration $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i; # Store the doctype if ($1) { $self->doctype($1); } # If there is a link to the DTD, record it if (!$self->{only_anchors} && $3) { $self->{Links}{$3}{$self->{Line}}++; } } return unless !$self->{only_anchors}; $self->text($text); } sub W3C::LinkChecker::comment() { my $self = shift; return unless !$self->{only_anchors}; my $text = shift; $self->text($text); } ################################ # Check the validity of a link # ################################ sub check_validity() { use HTTP::Status; my ($testing, $uri, $links, $redirects) = @_; # Checking file: URI's is not allowed with a CGI if ($testing ne $uri) { if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) { # Can't test? Return 400 Bad request. $results{$uri}{location}{code} = 400; $results{$uri}{location}{success} = 0; $results{$uri}{location}{message} = 'Error: \'file:\' URI not allowed'; if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } # Get the document with the appropriate method my $method; my @fragments = keys %{$links->{$uri}{fragments}}; if ($#fragments == -1) { $method = 'HEAD'; } else { $method = 'GET'; } my $response; my $being_processed = 0; if ((! defined($results{$uri})) || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; $response = &get_uri($method, $uri); # Record the redirects &record_redirects($redirects, $response->{Redirects}); # Get the information back from get_uri() &record_results($uri, $method, $response); } if ($#fragments == -1) { return; } # There are fragments. Parse the document. my $p; if ($being_processed) { # Can we really parse the document? if (! ($results{$uri}{location}{type} =~ m/text\/html/i)) { if ($_verbose) { &hprintf("Can't check content: Content-type is '%s'.\n", $response->header('Content-type')); } return; } # Do it then $p = &parse_document($uri, $response->base(), $response->as_string(), 0); } else { # We already had the information $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } # Check that the fragments exist my $fragment; foreach $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) || &escape_match($fragment, $p->{Anchors})) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; } } } sub escape_match($, \%) { use URI::Escape; my ($a, $hash) = (uri_unescape($_[0]), $_[1]); foreach $b (keys %$hash) { if ($a eq uri_unescape($b)) { return(1); } } return(0); } ########################## # Ask for authentication # ########################## sub authentication() { my $r = $_[0]; $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; my $realm = $1; my $authHeader = $r->headers->www_authenticate; if ($_cl) { printf(STDERR "\nAuthentication is required for %s.\n", $r->request->url); printf(STDERR "The realm is %s.\n", $realm); print(STDERR "Use the -u and -p options to specify a username and password.\n"); } else { printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Type: text/html\n\n", $r->headers->www_authenticate); printf(" 401 Authorization Required

Authorization Required

You need %s access to %s to perform Link Checking.

", $realm, $r->request->url); } } ################## # Get statistics # ################## sub get_timestamp() { require 'sys/syscall.ph'; my $timestamp = pack('LL', ()); syscall(&SYS_gettimeofday, $timestamp, 0) != -1 or $timestamp = 0; return($timestamp); } sub time_diff() { my @start = unpack('LL', $_[0]); my @stop = unpack('LL', $_[1]); for ($start[1], $stop[1]) { $_ /= 1_000_000; } return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1]))); } ######################## # Handle the redirects # ######################## # Record the redirects in a hash sub record_redirects(\%, \%) { my ($redirects, $sub) = @_; my $r; foreach $r (keys %$sub) { $redirects->{$r} = $sub->{$r}; } } # Determine if a request is redirected sub is_redirected($, %) { my ($uri, %redirects) = @_; return(defined($redirects{$uri})); } # Get a list of redirects for a URI sub get_redirects($, %) { my ($uri, %redirects) = @_; my @history = ($uri); my $origin = $uri; my %seen; while ($redirects{$uri}) { $uri = $redirects{$uri}; push(@history, $uri); last if ($uri eq $origin); } return(@history); } #################################################### # Tool for sorting the unique elements of an array # #################################################### sub sort_unique() { my %saw; @saw{@_} = (); return (sort { $a <=> $b } keys %saw); } ##################### # Print the results # ##################### sub anchors_summary(\%, \%) { my ($anchors, $errors) = @_; # Number of anchors found. if (! $_quiet) { if ($_html) { print('

'); } my @anchors = keys %{$anchors}; &hprintf("Found %d anchors.", $#anchors+1); if ($_html) { print('

'); } print("\n"); } # List of the duplicates, if any. my @errors = keys %{$errors}; if ($#errors < 0) { if (! $_quiet && $_html) { print "

Valid anchors!

\n"; } return; } if ($_html) { print('

'); } print('List of duplicate and empty anchors:'); if ($_html) { print("

\n\n"); } print("\n"); my $anchor; foreach $anchor (@errors) { my $format; if ($_html) { $format = "\n"; } else { $format = "\t%s\tLines: %s\n"; } printf($format, &encode($anchor eq '' ? 'Empty anchor' : $ anchor), join(', ', &sort_unique(keys %{$anchors->{$anchor}}))); } if ($_html) { print("
AnchorsLines
%s%s
\n"); } } sub show_link_report { my ($links, $results, $broken, $redirects, $urls) = @_; # Head of the table if ($_html) { print("\n\n"); } print("\n"); # Process each URL my $u; foreach $u (@{$urls}) { my @fragments = keys %{$broken->{$u}{fragments}}; # Count the rows displayed my $n_fragments = $#fragments+1; if (!defined($broken->{$u}{location})) { $n_fragments++; } # Did we get a redirect? my $redirected = &is_redirected($u, %$redirects); # List of lines my @total_lines; my ($f, $l); foreach $l (keys %{$links->{$u}{location}}) { push (@total_lines, $l); } foreach $f (keys %{$links->{$u}{fragments}}) { if ($f eq $u) { next if (defined($links->{$u}{$u}{-1})); } my $l; foreach $l (keys %{$links->{$u}{fragments}{$f}}) { push (@total_lines, $l); } } my $lines_list = join(', ', &sort_unique(@total_lines)); if ($_html) { # Main info my @redirects_urls = &get_redirects($u, %$redirects); for (@redirects_urls) { $_ = &show_url($_); } printf("%s\n", # Number of fragments (for rowspan) $n_fragments, # Color &bgcolor($results->{$u}{location}{record}), # List of redirects $redirected ? join('
-> ', @redirects_urls) : &show_url($u), # Number of fragments (for rowspan) $n_fragments, # Color &bgcolor($results->{$u}{location}{orig}), # Original HTTP reply $results->{$u}{location}{orig}, # Final HTTP reply ($results->{$u}{location}{code} != $results->{$u}{location}{orig}) ? '-> '.&encode($results->{$u}{location}{code}) : '', # Realm defined($results->{$u}{location}{realm}) ? '
Realm: '.&encode($results->{$u}{location}{realm}) : '', # HTTP original message defined($results->{$u}{location}{orig_message}) ? '
'.&encode($results->{$u}{location}{orig_message}).' ->' : '', # HTTP final message $results->{$u}{location}{message} ? '
'.&encode($results->{$u}{location}{message}) : '', '', # Color again &bgcolor($results->{$u}{location}{code}), # List of lines $lines_list); } else { printf("\n%s\t%s\n Code: %d%s %s\n", # List of redirects $redirected ? join("\n-> ", &get_redirects($u, %$redirects)) : $u, # List of lines $lines_list ? 'Lines: '.$lines_list : '' , # Original HTTP reply $results->{$u}{location}{orig}, # Final HTTP reply ($results->{$u}{location}{code} != $results->{$u}{location}{orig}) ? ' -> '.$results->{$u}{location}{code} : '', # HTTP message $results->{$u}{location}{message} ? $results->{$u}{location}{message} : ''); } # Fragments foreach $f (@fragments) { next if ($f eq $u); if ($_html) { my $color = ($broken->{$u}{fragments}{$f} > 1) ? &bgcolor(404) : &bgcolor($results->{$u}{location}{code}); printf("%s%s\n", # Color $color, # Broken fragment &show_url($u, $f), # Color $color, # List of lines join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}}))); } else { printf("\t%-30s\tLines: %s\n", # Fragment $f, # List of lines join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}}))); } } } # End of the table if ($_html) { print("
LocationCodeFragmentLines
%s%d%s%s%s%s%s
\n"); } } sub links_summary { # Advices to fix the problems my %todo = ( 200 => 'nothing !', 300 => 'it usually means that there is a typo in a link that triggers mod_speling action - this should be fixed', 301 => 'usually nothing, unless the end point of the redirect is broken (in which case, the Code column is RED)', 302 => 'usually nothing, unless the end point of the redirect is broken (in which case, the Code column is RED)', 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server', 401 => 'The link is not public. You had better specify it.', 403 => 'The link is forbidden ! This needs fixing. Usual suspect: a missing Overview.html or index.html, or bad access control', 404 => 'The link is broken. Fix it NOW', 405 => 'The server does not allow HEAD requests. How liberal. Go ask the guys who run this server why.', 407 => 'The link is a proxy, but requires Authentication.', 408 => 'The request timed out', 415 => 'The media type is not supported.', 500 => 'The server failed. It is a server side problem.', 501 => 'HEAD or GET is not implemented on this server... What kind of server is that?', 503 => 'The server cannot service the request, for some unknown reason'); my ($links, $results, $broken, $redirects) = @_; # Count the links. Useless but interesting. if (! $_quiet) { if ($_html) { print("\n\n

"); } my @links = keys %$links; my $n_fragments = 0; my $n_total = 0; my $u; # Give a few stats foreach $u (@links) { my @fragments = keys %{$links->{$u}{fragments}}; $n_fragments += $#fragments + 1; if (defined($links->{$u}{location})) { $n_fragments++; } my ($f, $l); foreach $l (keys %{$links->{$u}{location}}) { $n_total += $links->{$u}{location}{$l}; } foreach $f (@fragments) { my @lines = keys %{$links->{$u}{fragments}{$f}}; foreach $l (@lines) { $n_total += $links->{$u}{fragments}{$f}{$l}; } } } &hprintf("Found %d locations for %d unique URI's (%d total).", $#links+1, $n_fragments, $n_total); if ($_html) { print('

'); } print("\n"); } # Print a summary if ($_html) { my %code_summary; my $u; foreach $u (keys %$links) { if (defined($results->{$u}{location}{record})) { my $c; if ($results->{$u}{location}{record} == 200) { $c = $results->{$u}{location}{orig}; } else { $c = $results->{$u}{location}{record}; } $code_summary{$c}++; } } print "\n\n"; my $code; foreach $code (sort(keys(%code_summary))) { printf("", &bgcolor($code)); printf("", $code); printf("", $code_summary{$code}); printf("", $todo{$code}); print "\n"; } print "
CodeOccurencesWhat to do
%s%s%s
\n"; } # List of the broken links my @urls = keys %{$broken}; my @dir_redirect_urls = (); if ($_redirects) { # Add the redirected URI's to the report my $l; for $l (keys %$redirects) { next unless (defined($results->{$l}) && defined($links->{$l}) && !defined($broken->{$l})); # Check whether we have a "directory redirect" # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ my @redirects = &get_redirects($l, %$redirects); if (($#redirects == 1) && (($redirects[0].'/') eq $redirects[1])) { push(@dir_redirect_urls, $l); next; } push(@urls, $l); } } # Broken links and redirects if ($#urls < 0) { if (! $_quiet && $_html) { print "

Valid links!

\n"; } } else { if ($_html) { print('

'); } print("\nList of broken links"); if ($_redirects) { print(' and redirects'); } print(':'); if ($_html) { print('
Broken fragments appear in red

'); } &show_link_report($links, $results, $broken, $redirects, \@urls); } # Show directory redirects if ($_redirects && $_dir_redirects && ($#dir_redirect_urls > -1)) { if ($_html) { print('

'); } print("\nList of directory redirects:"); &show_link_report($links, $results, $broken, $redirects, \@dir_redirect_urls); } } ############################################################################### ################## # HTML interface # ################## sub html_header() { my $uri = &encode($_[0]); # Cache control? if (defined($_[1])) { print "Cache-Control: no-cache\nPragma: no-cache\n"; } print "Content-type: text/html W3C Link Ckecker: $uri \"W3C\"

W3C® Link Checker: $uri

\n"; } sub bgcolor() { my ($code) = @_; my $color; my $r = HTTP::Response->new($code); if ($r->is_success()) { return ''; } if ($code == 300) { return ' bgcolor="magenta"'; } if ($code == 401) { return ' bgcolor="aqua"'; } if ($r->is_redirect()) { return ' bgcolor="yellow"'; } if ($r->is_error()) { return ' bgcolor="red"'; } return ' bgcolor="grey"'; } sub show_url() { my ($url, $fragment) = @_; if (defined($fragment)) { $url .= '#'.$fragment; } return(''.&encode(defined($fragment) ? $fragment : $url).''); } sub html_footer() { print "
$PROGRAM $VERSION
Report bugs to Hugo Haas. Check out the source code.
"; } sub file_uri() { my $uri = $_[0]; &html_header($uri); print "

Forbidden

You cannot check such a URI ($uri).

"; &html_footer(); exit; } sub print_form() { my ($q) = @_; &html_header($VERSION, 1); print "
self_url()."\" method=\"get\">

Enter the URI that you want to check:

Options:

Summary only
Hide redirects
Hide directory redirects
Check linked documents recursively

"; &html_footer(); exit; } sub encode() { if (! $_html) { return @_; } else { return HTML::Entities::encode(@_); } } sub hprintf() { if (! $_html) { printf(@_); } else { print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1])); } }