#!/usr/bin/perl -wT # # W3C Link Checker # by Hugo Haas # (c) 1999-2002 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # # $Id: checklink.pl,v 3.6 2002-11-23 21:37:09 ville Exp $ # # This program is licensed under the W3C(r) License: # http://www.w3.org/Consortium/Legal/copyright-software # # The documentation is at: # http://www.w3.org/2000/07/checklink # # See the CVSweb interface at: # http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl # # An online version is available at: # http://validator.w3.org/checklink # # Comments and suggestions should be sent to the www-validator mailing list: # www-validator@w3.org (with 'checklink' in the subject) # http://lists.w3.org/Archives/Public/www-validator/ (archives) use strict; # ----------------------------------------------------------------------------- package W3C::UserAgent; use LWP::UserAgent qw(); # @@@ Needs also W3C::CheckLink but can't use() it here. @W3C::UserAgent::ISA = qw(LWP::UserAgent); sub simple_request { my $self = shift; my $response = $self->W3C::UserAgent::SUPER::simple_request(@_); if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message(); } return $response; } sub redirect_ok { my ($self, $request) = @_; if ($self->{Checklink_verbose_progress}) { &W3C::CheckLink::hprintf("\n%s %s ", $request->method(), $request->uri()); } # Build a map of redirects $self->{Redirects}{$self->{fetching}} = $request->uri(); $self->{fetching} = $request->uri(); return ($request->method() eq 'POST') ? 0 : 1; } # ----------------------------------------------------------------------------- package W3C::CheckLink; use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION $Have_ReadKey $DocType); use HTML::Entities qw(); use HTML::Parser 3.00 qw(); use HTTP::Request qw(); use HTTP::Response qw(); use Time::HiRes qw(); use URI qw(); use URI::Escape qw(); use URI::file qw(); # @@@ Needs also W3C::UserAgent but can't use() it here. @W3C::CheckLink::ISA = qw(HTML::Parser); BEGIN { # Version info $PROGRAM = 'W3C checklink'; ($AGENT = $PROGRAM) =~ s/\s+/-/g; ($CVS_VERSION) = q$Revision: 3.6 $ =~ /(\d+[\d\.]*\.\d+)/; $VERSION = sprintf('%d.%02d', $CVS_VERSION =~ /(\d+)\.(\d+)/); $REVISION = sprintf('version %s (c) 1999-2002 W3C', $VERSION); eval "use Term::ReadKey 2.00 qw(ReadMode)"; $Have_ReadKey = !$@; $DocType = ''; } # Autoflush $| = 1; # Am I being run as a CGI or from the command line? my $_cl = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); # Different options specified by the user my $_quiet = 0; my $_summary = 0; my $_verbose = 0; my $_progress = 0; my $_html = 0; my $_timeout = 60; my $_redirects = 1; my $_dir_redirects = 1; my $_user; my $_password; my $_trusted = '\.w3\.org'; my $_http_proxy; my $_accept_language = 1; my $_languages = '*'; my $_base_location = '.'; my $_masquerade = 0; my $_local_dir = my $_remote_masqueraded_uri = ''; my $_hide_same_realm = 0; my $_depth = 0; # -1 means unlimited recursion # Restrictions for the online version my $_sleep_time = 3; my $_max_documents = 150; # Global variables # What is our query? my $query; # What URI's did we process? (used for recursive mode) my %processed; # Result of the HTTP query my %results; # List of redirects my %redirects; # Count of the number of documents checked my $doc_count = 0; # Time stamp my $timestamp = &get_timestamp; if ($_cl) { # Parse command line &parse_arguments(); if ($_user && (! $_password)) { &ask_password(); } my $first = 1; foreach my $uri (@ARGV) { if (!$_summary) { printf("%s %s\n", $PROGRAM, $REVISION) if (! $_html); } else { $_verbose = 0; $_progress = 0; } # Transform the parameter into a URI $uri = &urize($uri); &check_uri($uri, ($_html && $first), $_depth); $first &&= 0; } undef $first; if ($_html) { &html_footer(); } elsif (($doc_count > 0) && !$_summary) { printf("\n%s\n", &global_stats()); } } else { use CGI (); use CGI::Carp qw(fatalsToBrowser); $query = new CGI; # Set a few parameters in CGI mode $_verbose = 0; $_progress = 0; $_html = 1; # Backwards compatibility if ($query->param('hide_dir_redirects')) { $query->param('hide_redirects', 'on'); $query->param('hide_type', 'dir'); $query->delete('hide_dir_redirects'); } if (my $uri = $query->param('url')) { $query->param('uri', $uri) unless $query->param('uri'); $query->delete('url'); } # Override undefined values from the cookie, if we got one. if (my %cookie = $query->cookie($AGENT)) { while (my ($key, $value) = each %cookie) { $query->param($key, $value) unless defined($query->param($key)); } } if ($query->param('summary')) { $_summary = 1; } else { } if ($query->param('hide_redirects')) { $_dir_redirects = 0; if (my $type = $query->param('hide_type')) { $_redirects = 0 if ($type ne 'dir'); } else { $_redirects = 0; } } if ($query->param('no_accept_language')) { $_accept_language = 0; } if ($query->param('recursive')) { if ($_depth == 0) { $_depth = -1; } } if ($query->param('depth') && ($query->param('depth') != 0)) { $_depth = $query->param('depth'); } # Save, clear or leave cookie as is. my $cookie = ''; if (my $action = $query->param('cookie')) { my %cookie = $query->cookie($AGENT); if ($action eq 'clear') { # Clear the cookie. $cookie{-value} = ''; $cookie{-expires} = '-1M'; } else { # Always refresh the expiration time. $cookie{-expires} = '+1M'; if ($action eq 'set') { # Set the options. my %options = $query->Vars(); delete($options{$_}) for ('url', 'uri', 'check'); # Non-persistent. $cookie{-value} = \%options; $cookie{-expires} = '+1M'; } } $cookie = $query->cookie(%cookie); } my $uri = $query->param('uri'); if (! $uri) { &html_header('', 1); # Set cookie only from results page. &print_form($query); &html_footer(); exit; } undef $query; # Not needed any more. $uri =~ s/^\s+//g; if ($uri =~ m/^file:/) { # Only the http scheme is allowed &file_uri($uri); } elsif ($uri !~ m/:/) { if ($uri =~ m|^//|) { $uri = 'http:'.$uri; } else { $uri = 'http://'.$uri; } } &check_uri($uri, 1, $_depth, $cookie); &html_footer(); } ############################################################################### ################################ # Command line and usage stuff # ################################ sub parse_arguments () { require Getopt::Long; Getopt::Long->require_version(2.17); Getopt::Long->import('GetOptions'); Getopt::Long::Configure('no_ignore_case'); my @masq = (); GetOptions('help' => \&usage, 'q|quiet' => sub { $_quiet = 1; $_summary = 1; }, 's|summary' => \$_summary, 'b|broken' => sub { $_redirects = $_dir_redirects = 0; }, 'e|dir-redirects' => sub { $_dir_redirects = 0; }, 'v|verbose' => \$_verbose, 'i|indicator' => \$_progress, 'h|html' => \$_html, 'n|noacclanguage' => sub { $_accept_language = 0; }, 'r|recursive' => sub { $_depth = -1 if $_depth == 0; }, 'l|location=s' => \$_base_location, 'u|user=s' => \$_user, 'p|password=s' => \$_password, 't|timeout=i' => \$_timeout, 'L|languages=s' => \$_languages, 'D|depth=i' => sub { $_depth = $_[1] unless $_[1] == 0; }, 'd|domain=s' => \$_trusted, 'y|proxy=s' => \$_http_proxy, 'masquerade' => \@masq, 'hide-same-realm' => \$_hide_same_realm, 'V|version' => \&version, ); if (@masq) { $_masquerade = 1; $_local_dir = shift(@masq); $_remote_masqueraded_uri = shift(@masq); } } sub version () { print STDERR "$PROGRAM $REVISION\n"; exit(0); } sub usage () { print STDERR "$PROGRAM $REVISION Usage: checklink Options: -s/--summary Result summary only. -b/--broken Show only the broken links, not the redirects. -e/--directory Hide directory redirects - e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ -r/--recursive Check the documents linked from the first one. -D/--depth n Check the documents linked from the first one to depth n. -l/--location uri Scope of the documents checked. By default, for http://www.w3.org/TR/html4/Overview.html for example, it would be: http://www.w3.org/TR/html4/ -n/--noacclanguage Do not send an Accept-Language header. -L/--languages Languages accepted (default: '$_languages'). -q/--quiet No output if no errors are found. -v/--verbose Verbose mode. -i/--indicator Show progress while parsing. -u/--user username Specify a username for authentication. -p/--password password Specify a password. --hide-same-realm Hide 401's that are in the same realm as the document checked. -t/--timeout value Timeout for the HTTP requests. -d/--domain domain Regular expression describing the domain to which the authetication information will be sent (default: '$_trusted'). --masquerade local remote Masquerade local dir as a remote URI (e.g. /home/hugo/MathML2/ is in fact http://www.w3.org/TR/MathML2/). -y/--proxy proxy Specify an HTTP proxy server. -h/--html HTML output. --help Show this message. -V/--version Output version information. Documentation at: http://www.w3.org/2000/07/checklink Please send bug reports and comments to the www-validator mailing list: www-validator\@w3.org (with 'checklink' in the subject) Archives are at: http://lists.w3.org/Archives/Public/www-validator/ "; exit(0); } sub ask_password () { printf(STDERR 'Enter the password for user %s: ', $_user); $Have_ReadKey ? ReadMode('noecho', *STDIN) : system('stty -echo'); chomp($_password = ); $Have_ReadKey ? ReadMode('restore', *STDIN) : system('stty echo'); print(STDERR "ok.\n"); } ############################################################################### ########################################### # Transform foo into file://localhost/foo # ########################################### sub urize ($) { my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd()); return $u->as_string(); } ######################################## # Check for broken links in a resource # ######################################## sub check_uri ($$$) { my ($uri, $html_header, $depth, $cookie) = @_; # If $html_header equals 1, we need to generate a HTML header (first # instance called in HTML mode). my $start; if (! $_quiet) { $start = &get_timestamp(); } # Get and parse the document my $response = &get_document('GET', $uri, $doc_count, \%redirects); # Can we check the resource? If not, we exit here... if (defined($response->{Stop})) { return(-1); } # We are checking a new document $doc_count++; if ($_html) { &html_header($uri, 0, $cookie) if $html_header; print('

'); } my $absolute_uri = $response->{absolute_uri}->as_string(); my $result_anchor = 'results'.$doc_count; printf("\nProcessing\t%s\n\n", $_html ? &show_url(&encode($absolute_uri)) : $absolute_uri); if ($_html) { print("

\n"); if (! $_summary) { printf("

Go to the results.

\n", $result_anchor); printf("

Check also: HTML Validity & CSS Validity

Back to the link checker.

\n", map{&encode(URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."))}(1..2)); print("
\n");
    }
  }

  # Record that we have processed this resource
  $processed{$absolute_uri} = 1;
  # Parse the document
  my $p = &parse_document($uri, $absolute_uri,
                          $response->content(), 1,
                          $depth != 0);
  my $base = URI->new($p->{base});

  # Check anchors
  ###############

  if (! $_summary) {
    print("Checking anchors...\n");
  }
  my %errors;
  foreach my $anchor (keys %{$p->{Anchors}}) {
    my $times = 0;
    foreach my $l (keys %{$p->{Anchors}{$anchor}}) {
      $times += $p->{Anchors}{$anchor}{$l};
    }
    # They should appear only once
    if ($times > 1) {
      $errors{$anchor} = 1;
    }
    # Empty IDREF's are not allowed
    if ($anchor eq '') {
      $errors{$anchor} = 1;
    }
  }
  if (! $_summary) {
    print(" done.\n");
  }

  # Check links
  #############

  my %links;
  # Record all the links found
  foreach my $link (keys %{$p->{Links}}) {
    my $link_uri = URI->new($link);
    my $abs_link_uri = URI->new_abs($link_uri, $base);
    if ($_masquerade) {
      if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) {
        printf("processing %s in base %s\n", $abs_link_uri, $_local_dir);
        my $nlink = $abs_link_uri;
        $nlink =~
          s|^$_remote_masqueraded_uri|file://localhost$_local_dir|;
        $abs_link_uri = URI->new($nlink);
      };
    }
    foreach my $lines (keys %{$p->{Links}{$link}}) {
      my $canonical = URI->new($abs_link_uri->canonical());
      my $url = $canonical->scheme().':'.$canonical->opaque();
      my $fragment = $canonical->fragment();
      if (! $fragment) {
        # Document without fragment
        $links{$url}{location}{$lines} = 1;
      } else {
        # Resource with a fragment
        $links{$url}{fragments}{$fragment}{$lines} = 1;
      }
    }
  }

  # Build the list of broken URI's
  my %broken;
  foreach my $u (keys %links) {
    # Don't check mailto: URI's
    next if ($u =~ m/^mailto:/);
    if (! $_summary) {
      &hprintf("Checking link %s\n", $u);
    }
    # Check that a link is valid
    &check_validity($uri, $u, \%links, \%redirects);
    if ($_verbose) {
      &hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
    }
    if ($results{$u}{location}{success}) {
      # Even though it was not broken, we might want to display it
      # on the results page (e.g. because it required authentication)
      if ($results{$u}{location}{display} >= 400) {
        $broken{$u}{location} = 1;
      }
      # List the broken fragments
      foreach my $fragment (keys %{$links{$u}{fragments}}) {
        if ($_verbose) {
          my @frags = sort keys %{$links{$u}{fragments}{$fragment}};
          &hprintf("\t\t%s %s - Line%s: %s\n",
                   $fragment,
                   ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
                   (scalar(@frags) > 1) ? 's' : '',
                   join(', ', @frags)
                  );
        }
        # A broken fragment?
        if ($results{$u}{fragments}{$fragment} == 0) {
          $broken{$u}{fragments}{$fragment} += 2;
        }
      }
    } else {
      # Couldn't find the document
      $broken{$u}{location} = 1;
      # All the fragments associated are hence broken
      foreach my $fragment (keys %{$links{$u}{fragments}}) {
        $broken{$u}{fragments}{$fragment}++;
      }
    }
  }
  if (! $_summary) {
    &hprintf("Processed in %ss.\n", &time_diff($start, &get_timestamp()));
  }

  # Display results
  if ($_html) {
    if (! $_summary) {
      print("
\n"); printf("

Results

\n", $result_anchor); } } if (! $_quiet) { print "\n"; } &anchors_summary($p->{Anchors}, \%errors); &links_summary(\%links, \%results, \%broken, \%redirects); # Do we want to process other documents? if ($depth != 0) { if ($_base_location eq '.') { # Get the name of the original directory # e.g. http://www.w3.org/TR/html4/Overview.html # should return http://www.w3.org/TR/html4/ $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|; $_base_location = $1; } foreach my $u (keys %links) { next if (! (# Check if it's in our scope for recursion ($u =~ m|^$_base_location|) && # and the link is not broken $results{$u}{location}{success} && # And it is a text/html or application/xhtml+xml # resource (($results{$u}{location}{type} =~ m|text/html|) || ($results{$u}{location}{type} =~ m|application/xhtml\+xml|)) ) ); # Check if we have already processed the URI next if (&already_processed($u) != 0); # Do the job print "\n"; if (! $_html) { print '-' x 40; } else { # For the online version, wait for a while to avoid abuses if (!$_cl) { if ($doc_count == $_max_documents) { print("
\n

Maximum number of documents reached!

\n"); } if ($doc_count >= $_max_documents) { $doc_count++; print("

Not checking $u

\n"); $processed{$u} = 1; next; } } print('
'); sleep($_sleep_time); } print "\n"; if ($depth < 0) { &check_uri($u, 0, -1); } else { &check_uri($u, 0, $depth-1); } } } } ####################################### # Get and parse a resource to process # ####################################### sub get_document ($$$;\%) { my ($method, $uri, $in_recursion, $redirects) = @_; # $method contains the HTTP method the use (GET or HEAD) # $uri contains the identifier of the resource # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least # the second resource checked) # $redirects is a pointer to the hash containing the map of the redirects # Get the resource my $response; if (defined($results{$uri}{response}) && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $response = $results{$uri}{response}; } else { $response = &get_uri($method, $uri); &record_results($uri, $method, $response); &record_redirects($redirects, $response->{Redirects}); } if (! $response->is_success()) { if (! $in_recursion) { # Is it too late to request authentication? if ($response->code() == 401) { &authentication($response); } else { &html_header($uri) if $_html; &hprintf("\nError: %d %s\n", $response->code(), $response->message()); } } $response->{Stop} = 1; return($response); } # What is the URI of the resource that we are processing by the way? my $base_uri = URI->new($response->base()); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $base_uri->abs($request_uri); # Can we parse the document? my $failed_reason; if (! (($response->header('Content-Type') =~ m|text/html|) || ($response->header('Content-Type') =~ m|application/xhtml\+xml|)) ) { $failed_reason = "Content-Type is '". $response->header('Content-Type')."'"; } elsif ($response->header('Content-Encoding') && ($response->header('Content-Encoding') ne 'identity')) { $failed_reason = "Content-Encoding is '". $response->header('Content-encoding')."'"; } if ($failed_reason) { # No, there is a problem... if (! $in_recursion) { &html_header($uri) if $_html; &hprintf("Can't check links: %s.\n", $failed_reason); } $response->{Stop} = 1; } # Ok, return the information return($response); } ################################################## # Check whether a URI has already been processed # ################################################## sub already_processed ($) { my ($uri, %redirects) = @_; # Don't be verbose for that part... my $summary_value = $_summary; $_summary = 1; # Do a GET: if it fails, we stop, if not, the results are cached my $response = &get_document('GET', $uri, 1); # ... but just for that part $_summary = $summary_value; # Can we process the resource? return(-1) if (defined($response->{Stop})); # Have we already processed it? return(1) if (defined($processed{$response->{absolute_uri}->as_string()})); # It's not processed yet and it is processable: return 0 return(0); } ############################ # Get the content of a URI # ############################ sub get_uri ($$;$\%$$$$) { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_; # $method contains the method used # $uri contains the target of the request # $start is a timestamp (not defined the first time the function is # called) # $redirects is a map of redirects # $code is the first HTTP return code # $realm is the realm of the request # $message is the HTTP message received # $auth equals 1 if we want to send out authentication information # For timing purposes if (! defined($start)) { $start = &get_timestamp(); } # Prepare the query my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : (); my $ua = W3C::UserAgent->new(%lwpargs); $ua->timeout($_timeout); $ua->agent(sprintf('%s/%s %s', $AGENT, $VERSION, $ua->agent())); $ua->env_proxy(); $ua->proxy('http', 'http://' . $_http_proxy) if ($_http_proxy); # $ua->{fetching} contains the URI we originally wanted # $ua->{uri} is modified in the case of a redirect; this is used to # build $ua->{Redirects} $ua->{uri} = $ua->{fetching} = $uri; if (defined($redirects)) { $ua->{Redirects} = $redirects; } # Do we want printouts of progress? my $verbose_progress = ! ($_summary || (!$doc_count && $_html)); &hprintf("%s %s ", $method, $uri) if $verbose_progress; my $request = new HTTP::Request($method, $uri); if ($_accept_language) { $request->header('Accept-Language' => 'en'); } # Are we providing authentication info? if (defined($auth) && ($request->url->host =~ /$_trusted$/)) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } elsif (defined($_user) && defined($_password)) { $request->authorization_basic($_user, $_password); } } # Tell the user agent if we want progress reports (in redirects) or not. $ua->{Checklink_verbose_progress} = $verbose_progress; # Do the query my $response = $ua->request($request); # Get the results # Record the very first response if (! defined($code)) { $code = $ua->{FirstResponse}; $message = $ua->{FirstMessage}; } # Authentication requested? if (($response->code() == 401) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($_user) && defined($_password))) && !defined ($auth)) { # Deal with authentication and avoid loops if (! defined ($realm)) { $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; $realm = $1; } print "\n" if $verbose_progress; return &get_uri($method, $response->request->url, $start, $ua->{Redirects}, $code, $realm, $message, 1); } # Record the redirects $response->{Redirects} = $ua->{Redirects}; &hprintf(" fetched in %ss\n", &time_diff($start, &get_timestamp())) if $verbose_progress; $response->{OriginalCode} = $code; $response->{OriginalMessage} = $message; $response->{Realm} = $realm if defined($realm); return $response; } ######################################### # Record the results of an HTTP request # ######################################### sub record_results ($$$) { my ($uri, $method, $response) = @_; $results{$uri}{response} = $response; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{type} = $response->header('Content-type'); $results{$uri}{location}{display} = $results{$uri}{location}{code}; $results{$uri}{location}{orig} = $response->{OriginalCode}; # Did we get a redirect? if ($response->{OriginalCode} != $response->code()) { $results{$uri}{location}{orig_message} = $response->{OriginalMessage}; $results{$uri}{location}{redirected} = 1; } $results{$uri}{location}{success} = $response->is_success(); # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; if (! $_hide_same_realm) { $results{$uri}{location}{display} = 401; } } # What type of broken link is it? (stored in {record} - the {display} # information is just for visual use only) if (($results{$uri}{location}{display} == 401) && ($results{$uri}{location}{code} == 404)) { $results{$uri}{location}{record} = 404; } else { $results{$uri}{location}{record} = $results{$uri}{location}{display}; } # Did it fail? $results{$uri}{location}{message} = $response->message(); if (! $results{$uri}{location}{success}) { if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } #################### # Parse a document # #################### sub parse_document ($$$$;$) { my ($uri, $location, $document, $links, $rec_needs_links) = @_; my $p; if (defined($results{$uri}{parsing})) { # We have already done the job. Woohoo! $p->{base} = $results{$uri}{parsing}{base}; $p->{Anchors} = $results{$uri}{parsing}{Anchors}; $p->{Links} = $results{$uri}{parsing}{Links}; return($p); } my $start; $p = W3C::CheckLink->new(); $p->{base} = $location; if (! $_summary) { $start = &get_timestamp(); print("Parsing...\n"); } if (!$_summary || $_progress) { $p->{Total} = ($document =~ tr/\n//); } # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $rec_needs_links); # Transform into for parsing # Processing instructions are not parsed by process, but in this case # it should be. It's expensive, it's horrible, but it's the easiest way # for right now. if (!$p->{only_anchors}) { $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/; } $p->parse($document); if (! $_summary) { my $stop = &get_timestamp(); if ($_progress) { print "\r"; } &hprintf(" done (%d lines in %ss).\n", $p->{Total}, &time_diff($start, $stop)); } # Save the results before exiting $results{$uri}{parsing}{base} = $p->{base}; $results{$uri}{parsing}{Anchors} = $p->{Anchors}; $results{$uri}{parsing}{Links} = $p->{Links}; return($p); } ################################### # Constructor for W3C::CheckLink # ################################### sub W3C::CheckLink::new { my $p = HTML::Parser::new(@_, api_version => 3); # Start tags $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens'); # Declarations $p->handler(declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, 'self, text'); # Other stuff $p->handler(default => 'text', 'self, text'); # Line count $p->{Line} = 1; # Check ? $p->{check_name} = 1; # Check <[..] id="..">? $p->{check_id} = 1; # Don't interpret comment loosely $p->strict_comment(1); return $p; } ################################################# # Record or return the doctype of the document # ################################################# sub W3C::CheckLink::doctype { my ($self, $dc) = @_; if (! $dc) { return $self->{doctype}; } $_ = $self->{doctype} = $dc; # What to look for depending on the doctype if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN') { $self->{check_name} = 0; } # Check for the id tag if ( # HTML 2.0 & 3.0 m%^-//IETF//DTD HTML [23]\.0//% || # HTML 3.2 m%^-//W3C//DTD HTML 3\.2//%) { $self->{check_id} = 0; } # Enable XML extensions if (m%^-//W3C//DTD XHTML %) { $self->xml_mode(1); } } ####################################### # Count the number of lines in a file # ####################################### sub W3C::CheckLink::new_line { my ($self, $string) = @_; my $count = ($string =~ tr/\n//); $self->{Line} = $self->{Line} + $count; if ($_progress) { printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)); } } ############################# # Extraction of the anchors # ############################# sub W3C::CheckLink::get_anchor { my ($self, $tag, $attr) = @_; my $anchor; if ($self->{check_id}) { $anchor = $attr->{id}; } if ($self->{check_name} && ($tag eq 'a')) { # @@@@ In XHTML, is mandatory # Force an error if it's not the case (or if id's and name's values # are different) # If id is defined, name if defined must have the same value if (!$anchor) { $anchor = $attr->{name}; } } return($anchor); } ########################### # W3C::CheckLink handlers # ########################### sub add_link { my ($self, $uri) = @_; if (defined($uri)) { $self->{Links}{$uri}{$self->{Line}}++; } } sub start { my ($self, $tag, $attr, $text) = @_; # Anchors my $anchor = $self->get_anchor($tag, $attr); if (defined($anchor)) { $self->{Anchors}{$anchor}{$self->{Line}}++; } # Links if (!$self->{only_anchors}) { # Here, we are checking too many things # The right thing to do is to parse the DTD... if ($tag eq 'base') { if (defined($attr->{href})) { $self->{base} = $attr->{href}; } } else { $self->add_link($attr->{href}); } $self->add_link($attr->{src}); if ($tag eq 'object') { $self->add_link($attr->{data}); } if ($tag eq 'blockquote') { $self->add_link($attr->{cite}); } } # Line counting if ($text =~ m/\n/) { $self->new_line($text); } } sub text { my ($self, $text) = @_; if (!$_progress) { # If we are just extracting information about anchors, # parsing this part is only cosmetic (progress indicator) return unless !$self->{only_anchors}; } if ($text =~ /\n/) { $self->new_line($text); } } sub W3C::CheckLink::declaration { my ($self, $text) = @_; # Extract the doctype my @declaration = split(/\s+/, $text, 4); if (($#declaration >= 3) && ($declaration[0] eq 'DOCTYPE') && (lc($declaration[1]) eq 'html')) { # Parse the doctype declaration $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i; # Store the doctype if ($1) { $self->doctype($1); } # If there is a link to the DTD, record it if (!$self->{only_anchors} && $3) { $self->{Links}{$3}{$self->{Line}}++; } } return unless !$self->{only_anchors}; $self->text($text); } ################################ # Check the validity of a link # ################################ sub check_validity ($$\%\%) { my ($testing, $uri, $links, $redirects) = @_; # $testing is the URI of the document checked # $uri is the URI of the target that we are verifying # $links is a hash of the links in the documents checked # $redirects is a map of the redirects encountered # Checking file: URI's is not allowed with a CGI if ($testing ne $uri) { if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) { # Can't test? Return 400 Bad request. $results{$uri}{location}{code} = 400; $results{$uri}{location}{success} = 0; $results{$uri}{location}{message} = 'Error: \'file:\' URI not allowed'; if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } # Get the document with the appropriate method # Only use GET if there are fragments. HEAD is enough if it's not the # case. my @fragments = keys %{$links->{$uri}{fragments}}; my $method = scalar(@fragments) ? 'GET' : 'HEAD'; my $response; my $being_processed = 0; if ((! defined($results{$uri})) || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; $response = &get_uri($method, $uri); # Get the information back from get_uri() &record_results($uri, $method, $response); # Record the redirects &record_redirects($redirects, $response->{Redirects}); } # We got the response of the HTTP request. Stop here if it was a HEAD. return if ($method eq 'HEAD'); # There are fragments. Parse the document. my $p; if ($being_processed) { # Can we really parse the document? if (!defined($results{$uri}{location}{type})) { return; } if (! (($results{$uri}{location}{type} =~ m|text/html|i) || ($results{$uri}{location}{type} =~ m|application/xhtml\+xml|i)) ) { if ($_verbose) { &hprintf("Can't check content: Content-type is '%s'.\n", $response->header('Content-type')); } return; } # Do it then $p = &parse_document($uri, $response->base(), $response->as_string(), 0); } else { # We already had the information $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } # Check that the fragments exist foreach my $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) || &escape_match($fragment, $p->{Anchors})) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; } } } sub escape_match ($\%) { my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); foreach my $b (keys %$hash) { if ($a eq URI::Escape::uri_unescape($b)) { return(1); } } return(0); } ########################## # Ask for authentication # ########################## sub authentication ($) { my $r = $_[0]; $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; my $realm = $1; my $authHeader = $r->headers->www_authenticate; if ($_cl) { printf(STDERR "\nAuthentication is required for %s.\n", $r->request->url); printf(STDERR "The realm is %s.\n", $realm); print(STDERR "Use the -u and -p options to specify a username and password.\n"); } else { printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html\n\n", $r->headers->www_authenticate); printf("%s 401 Authorization Required

Authorization Required

You need %s access to %s to perform Link Checking.

", $DocType, &encode($realm), $r->request->url); } } ################## # Get statistics # ################## sub get_timestamp () { return pack('LL', Time::HiRes::gettimeofday()); } sub time_diff ($$) { my @start = unpack('LL', $_[0]); my @stop = unpack('LL', $_[1]); for ($start[1], $stop[1]) { $_ /= 1_000_000; } return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1]))); } ######################## # Handle the redirects # ######################## # Record the redirects in a hash sub record_redirects (\%\%) { my ($redirects, $sub) = @_; foreach my $r (keys %$sub) { $redirects->{$r} = $sub->{$r}; } } # Determine if a request is redirected sub is_redirected ($%) { my ($uri, %redirects) = @_; return(defined($redirects{$uri})); } # Get a list of redirects for a URI sub get_redirects ($%) { my ($uri, %redirects) = @_; my @history = ($uri); my $origin = $uri; while ($redirects{$uri}) { $uri = $redirects{$uri}; push(@history, $uri); last if ($uri eq $origin); } return(@history); } #################################################### # Tool for sorting the unique elements of an array # #################################################### sub sort_unique (@) { my %saw; @saw{@_} = (); return (sort { $a <=> $b } keys %saw); } ##################### # Print the results # ##################### sub anchors_summary (\%\%) { my ($anchors, $errors) = @_; # Number of anchors found. my $n = scalar(keys(%$anchors)); if (! $_quiet) { if ($_html) { print("

Anchors

\n

"); } else { print("Anchors\n\n"); } &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's'); print('

') if $_html; print("\n"); } # List of the duplicates, if any. my @errors = keys %{$errors}; if (! scalar(@errors)) { print("

Valid anchors!

\n") if (! $_quiet && $_html && $n); return; } undef $n; print('

') if $_html; print('List of duplicate and empty anchors'); print("

\n\n") if $_html; print("\n"); foreach my $anchor (@errors) { my $format; my @unique = &sort_unique(keys %{$anchors->{$anchor}}); if ($_html) { $format = "\n"; } else { my $s = (scalar(@unique) > 1) ? 's' : ''; $format = "\t%s\tLine$s: %s\n"; } printf($format, &encode($anchor eq '' ? 'Empty anchor' : $anchor), join(', ', @unique)); } print("
AnchorsLines
%s%s
\n") if $_html; } sub show_link_report (\%\%\%\%\@;$\%) { my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; print("\n
") if $_html; print("\n"); # Process each URL my ($c, $previous_c); foreach my $u (@{$urls}) { my @fragments = keys %{$broken->{$u}{fragments}}; # Did we get a redirect? my $redirected = &is_redirected($u, %$redirects); # List of lines my @total_lines; foreach my $l (keys %{$links->{$u}{location}}) { push (@total_lines, $l); } foreach my $f (keys %{$links->{$u}{fragments}}) { if ($f eq $u) { next if (defined($links->{$u}{$u}{-1})); } foreach my $l (keys %{$links->{$u}{fragments}{$f}}) { push (@total_lines, $l); } } # Error type $c = &code_shown($u, $results); # What to do my $whattodo; my $redirect_too; if ($todo) { if ($u =~ m/^javascript:/) { if ($_html) { $whattodo = 'You must change this link: people using a browser without Javascript support will not be able to follow this link. See the Web Content Accessibility Guidelines on the use of scripting on the Web and the techniques on how to solve this.'; } else { $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.'; } } elsif ($c == 500) { # 500's could be a real 500 or a DNS lookup problem if ($results->{$u}{location}{message} =~ m/Bad hostname '[^\']*'/) { $whattodo = 'The hostname could not be resolved. This link needs to be fixed.'; } else { $whattodo = 'This is a server-side problem. Check the URI.'; } } else { $whattodo = $todo->{$c}; } if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) { $redirect_too = 'The original request has been redirected.'; $whattodo .= ' '.$redirect_too if (! $_html); } } else { # Directory redirects $whattodo = 'Add a trailing slash to the URL.'; } my @unique = &sort_unique(@total_lines); my $lines_list = join(', ', @unique); my $s = (scalar(@unique) > 1) ? 's' : ''; undef @unique; if ($_html) { # Style stuff my $idref = ''; if ($codes && (!defined($previous_c) || ($c != $previous_c))) { $idref = ' id="d'.$doc_count.'code_'.$c.'"'; $previous_c = $c; } # Main info my @redirects_urls = &get_redirects($u, %$redirects); for (@redirects_urls) { $_ = &show_url($_); } # HTTP message my $http_message; if ($results->{$u}{location}{message}) { $http_message = &encode($results->{$u}{location}{message}); if (($c == 404) || ($c == 500)) { $http_message = ''. $http_message.''; } } printf(" %s
What to do: %s%s
HTTP Code returned: %d%s
HTTP Message: %s%s%s
Line%s: %s
\n", # Anchor for return codes $idref, # List of redirects $redirected ? join(' redirected to
', @redirects_urls) : &show_url($u), # Color &bgcolor($c), # What to do $whattodo, # Redirect too? $redirect_too ? ' '.$redirect_too.'' : '', # Original HTTP reply $results->{$u}{location}{orig}, # Final HTTP reply ($results->{$u}{location}{code} != $results->{$u}{location}{orig}) ? ' -> '. &encode($results->{$u}{location}{code}) : '', # Realm (defined($results->{$u}{location}{realm}) ? 'Realm: '.&encode($results->{$u}{location}{realm}).'
' : ''), # HTTP original message defined($results->{$u}{location}{orig_message}) ? &encode($results->{$u}{location}{orig_message}). ' -> ' : '', # HTTP final message $http_message, $s, # List of lines $lines_list); if ($#fragments >= 0) { my $fragment_direction = ''; if ($results->{$u}{location}{code} == 200) { $fragment_direction = ' They need to be fixed!'; } printf("
Broken fragments and their line numbers: %s
\n", $fragment_direction); } } else { printf("\n%s\t%s\n Code: %d%s %s\nTo do: %s\n", # List of redirects $redirected ? join("\n-> ", &get_redirects($u, %$redirects)) : $u, # List of lines $lines_list ? "Line$s: $lines_list" : '', # Original HTTP reply $results->{$u}{location}{orig}, # Final HTTP reply ($results->{$u}{location}{code} != $results->{$u}{location}{orig}) ? ' -> '.$results->{$u}{location}{code} : '', # HTTP message $results->{$u}{location}{message} ? $results->{$u}{location}{message} : '', # What to do $whattodo); if ($#fragments >= 0) { if ($results->{$u}{location}{code} == 200) { print("The following fragments need to be fixed:\n"); } else { print("Fragments:\n"); } } } # Fragments foreach my $f (@fragments) { if ($_html) { printf("
%s: %s
\n", # Broken fragment &show_url($u, $f), # List of lines join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}}))); } else { my @unq = &sort_unique(keys %{$links->{$u}{fragments}{$f}}); printf("\t%-30s\tLine%s: %s\n", # Fragment $f, # Multiple? (scalar(@unq) > 1) ? 's' : '', # List of lines join(', ', @unq)); } } print("
\n") if ($_html && scalar(@fragments)); } # End of the table print("
\n") if $_html; } sub code_shown ($$) { my ($u, $results) = @_; if ($results->{$u}{location}{record} == 200) { return $results->{$u}{location}{orig}; } else { return $results->{$u}{location}{record}; } } sub links_summary (\%\%\%\%) { # Advices to fix the problems my %todo = ( 200 => 'There are broken fragments which must be fixed.', 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!', 301 => 'You should update the link.', 302 => 'Usually nothing.', 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.', 401 => "The link is not public. You'd better specify it.", 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 404 => 'The link is broken. Fix it NOW!', 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.', 407 => 'The link is a proxy, but requires Authentication.', 408 => 'The request timed out.', 410 => 'The resource is gone. You should remove this link.', 415 => 'The media type is not supported.', 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.', 501 => 'Could not check this link: method not implemented or scheme not supported.', 503 => 'The server cannot service the request, for some unknown reason.'); my %priority = ( 410 => 1, 404 => 2, 403 => 5, 200 => 10, 300 => 15, 401 => 20 ); my ($links, $results, $broken, $redirects) = @_; # List of the broken links my @urls = keys %{$broken}; my @dir_redirect_urls = (); if ($_redirects) { # Add the redirected URI's to the report for my $l (keys %$redirects) { next unless (defined($results->{$l}) && defined($links->{$l}) && !defined($broken->{$l})); # Check whether we have a "directory redirect" # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ my @redirects = &get_redirects($l, %$redirects); if (($#redirects == 1) && (($redirects[0].'/') eq $redirects[1])) { push(@dir_redirect_urls, $l); next; } push(@urls, $l); } } # Broken links and redirects if ($#urls < 0) { if (! $_quiet) { if ($_html) { print "

Links

\n"; print "

Valid links!

"; } else { print "\nValid links."; } print "\n"; } } else { print('

') if $_html; print("\nList of broken links"); print(' and redirects') if $_redirects; # Sort the URI's by HTTP Code my %code_summary; my @idx; foreach my $u (@urls) { if (defined($results->{$u}{location}{record})) { my $c = &code_shown($u, $results); $code_summary{$c}++; push(@idx, $c); } } my @sorted = @urls[ sort { defined($priority{$idx[$a]}) ? defined($priority{$idx[$b]}) ? $priority{$idx[$a]} <=> $priority{$idx[$b]} : -1 : defined($priority{$idx[$b]}) ? 1 : $idx[$a] <=> $idx[$b] } 0 .. $#idx ]; @urls = @sorted; undef(@sorted); undef(@idx); if ($_html) { print('

Fragments listed are broken. See the table below to know what action to take.

'); # Print a summary print "\n\n"; foreach my $code (sort(keys(%code_summary))) { printf('', &bgcolor($code)); printf('', $doc_count, $code, $code); printf('', $code_summary{$code}); printf('', $todo{$code}); print "\n"; } print "
CodeOccurrencesWhat to do
%s%s%s
\n"; } else { print(':'); } &show_link_report($links, $results, $broken, $redirects, \@urls, 1, \%todo); } # Show directory redirects if ($_dir_redirects && ($#dir_redirect_urls > -1)) { print('

') if $_html; print("\nList of directory redirects"); print("

\n

The links below are not broken, but the document does not use the exact URL.

") if $_html; &show_link_report($links, $results, $broken, $redirects, \@dir_redirect_urls); } } ############################################################################### ################ # Global stats # ################ sub global_stats () { my $stop = &get_timestamp(); my $n_docs = ($doc_count <= $_max_documents) ? $doc_count : $_max_documents; return sprintf('Checked %d document%s in %s seconds.', $n_docs, ($n_docs == 1) ? '' : 's', &time_diff($timestamp, $stop)); } ################## # HTML interface # ################## sub html_header ($;$$) { my ($uri, $doform, $cookie) = @_; $uri = &encode($uri); my $title = ' Link Checker' . ($uri eq '' ? '' : ': ' . $uri); # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same # print() statement as the last header... my $headers = ''; if (! $_cl) { $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform; $headers .= "Content-Type: text/html; charset=iso-8859-1\n"; $headers .= "Content-Script-Type: text/javascript\n"; $headers .= "Set-Cookie: $cookie\n" if $cookie; $headers .= "Content-Language: en\n\n"; } my $script = my $onload = ''; if ($doform) { $script = " "; $onload = ' onload="document.forms[0].uri.focus()"'; } print $headers, $DocType, " W3C", $title, " ", $script, "

\"W3C\" ", $title, "

\n\n"; } sub bgcolor ($) { my ($code) = @_; my $class; my $r = HTTP::Response->new($code); if ($r->is_success()) { return ''; } elsif ($code == 300) { $class = 'multiple'; } elsif ($code == 401) { $class = 'unauthorized'; } elsif ($r->is_redirect()) { $class = 'redirect'; } elsif ($r->is_error()) { $class = 'broken'; } else { $class = 'broken'; } return(' class="'.$class.'"'); } sub show_url ($;$) { my ($url, $fragment) = @_; if (defined($fragment)) { $url .= '#'.$fragment; } return sprintf('%s', $url, &encode(defined($fragment) ? $fragment : $url)); } sub html_footer () { if (($doc_count > 0) && !$_quiet) { printf("

%s

\n", &global_stats()); } print "
$PROGRAM $REVISION, by Hugo Haas and others.
Please send bug reports, suggestions and comments to the www-validator mailing list (archives).
Check out the documentation. Download the source code from CVS.
"; } sub file_uri ($) { my ($uri) = @_; &html_header($uri); print "

Forbidden

You cannot check such a URI ($uri).

"; &html_footer(); exit; } sub print_form ($) { my ($q) = @_; my $chk = ' checked="checked"'; $q->param('hide_type', 'all') unless $q->param('hide_type'); my $sum = $q->param('summary') ? $chk : ''; my $red = $q->param('hide_redirects') ? $chk : ''; my $all = ($q->param('hide_type') ne 'dir') ? $chk : ''; my $dir = $all ? '' : $chk; my $acc = $q->param('no_accept_language') ? $chk : ''; my $rec = $q->param('recursive') ? $chk : ''; my $dep = &encode($q->param('depth') || ''); my $cookie_options = ''; if ($q->cookie()) { $cookie_options = " "; } else { $cookie_options = " "; } print "
self_url(), "\" method=\"get\" onsubmit=\"return uriOk()\">

Options




,

", $cookie_options, "

"; } sub encode (@) { if (! $_html) { return @_; } else { return HTML::Entities::encode(@_); } } sub hprintf (@) { if (! $_html) { printf(@_); } else { print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1])); } }