#!/usr/bin/perl -wT # # W3C Link Checker # by Hugo Haas # (c) 1999-2002 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # # $Id: checklink.pl,v 3.6 2002-11-23 21:37:09 ville Exp $ # # This program is licensed under the W3C(r) License: # http://www.w3.org/Consortium/Legal/copyright-software # # The documentation is at: # http://www.w3.org/2000/07/checklink # # See the CVSweb interface at: # http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl # # An online version is available at: # http://validator.w3.org/checklink # # Comments and suggestions should be sent to the www-validator mailing list: # www-validator@w3.org (with 'checklink' in the subject) # http://lists.w3.org/Archives/Public/www-validator/ (archives) use strict; # ----------------------------------------------------------------------------- package W3C::UserAgent; use LWP::UserAgent qw(); # @@@ Needs also W3C::CheckLink but can't use() it here. @W3C::UserAgent::ISA = qw(LWP::UserAgent); sub simple_request { my $self = shift; my $response = $self->W3C::UserAgent::SUPER::simple_request(@_); if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message(); } return $response; } sub redirect_ok { my ($self, $request) = @_; if ($self->{Checklink_verbose_progress}) { &W3C::CheckLink::hprintf("\n%s %s ", $request->method(), $request->uri()); } # Build a map of redirects $self->{Redirects}{$self->{fetching}} = $request->uri(); $self->{fetching} = $request->uri(); return ($request->method() eq 'POST') ? 0 : 1; } # ----------------------------------------------------------------------------- package W3C::CheckLink; use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION $Have_ReadKey $DocType); use HTML::Entities qw(); use HTML::Parser 3.00 qw(); use HTTP::Request qw(); use HTTP::Response qw(); use Time::HiRes qw(); use URI qw(); use URI::Escape qw(); use URI::file qw(); # @@@ Needs also W3C::UserAgent but can't use() it here. @W3C::CheckLink::ISA = qw(HTML::Parser); BEGIN { # Version info $PROGRAM = 'W3C checklink'; ($AGENT = $PROGRAM) =~ s/\s+/-/g; ($CVS_VERSION) = q$Revision: 3.6 $ =~ /(\d+[\d\.]*\.\d+)/; $VERSION = sprintf('%d.%02d', $CVS_VERSION =~ /(\d+)\.(\d+)/); $REVISION = sprintf('version %s (c) 1999-2002 W3C', $VERSION); eval "use Term::ReadKey 2.00 qw(ReadMode)"; $Have_ReadKey = !$@; $DocType = ''; } # Autoflush $| = 1; # Am I being run as a CGI or from the command line? my $_cl = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); # Different options specified by the user my $_quiet = 0; my $_summary = 0; my $_verbose = 0; my $_progress = 0; my $_html = 0; my $_timeout = 60; my $_redirects = 1; my $_dir_redirects = 1; my $_user; my $_password; my $_trusted = '\.w3\.org'; my $_http_proxy; my $_accept_language = 1; my $_languages = '*'; my $_base_location = '.'; my $_masquerade = 0; my $_local_dir = my $_remote_masqueraded_uri = ''; my $_hide_same_realm = 0; my $_depth = 0; # -1 means unlimited recursion # Restrictions for the online version my $_sleep_time = 3; my $_max_documents = 150; # Global variables # What is our query? my $query; # What URI's did we process? (used for recursive mode) my %processed; # Result of the HTTP query my %results; # List of redirects my %redirects; # Count of the number of documents checked my $doc_count = 0; # Time stamp my $timestamp = &get_timestamp; if ($_cl) { # Parse command line &parse_arguments(); if ($_user && (! $_password)) { &ask_password(); } my $first = 1; foreach my $uri (@ARGV) { if (!$_summary) { printf("%s %s\n", $PROGRAM, $REVISION) if (! $_html); } else { $_verbose = 0; $_progress = 0; } # Transform the parameter into a URI $uri = &urize($uri); &check_uri($uri, ($_html && $first), $_depth); $first &&= 0; } undef $first; if ($_html) { &html_footer(); } elsif (($doc_count > 0) && !$_summary) { printf("\n%s\n", &global_stats()); } } else { use CGI (); use CGI::Carp qw(fatalsToBrowser); $query = new CGI; # Set a few parameters in CGI mode $_verbose = 0; $_progress = 0; $_html = 1; # Backwards compatibility if ($query->param('hide_dir_redirects')) { $query->param('hide_redirects', 'on'); $query->param('hide_type', 'dir'); $query->delete('hide_dir_redirects'); } if (my $uri = $query->param('url')) { $query->param('uri', $uri) unless $query->param('uri'); $query->delete('url'); } # Override undefined values from the cookie, if we got one. if (my %cookie = $query->cookie($AGENT)) { while (my ($key, $value) = each %cookie) { $query->param($key, $value) unless defined($query->param($key)); } } if ($query->param('summary')) { $_summary = 1; } else { } if ($query->param('hide_redirects')) { $_dir_redirects = 0; if (my $type = $query->param('hide_type')) { $_redirects = 0 if ($type ne 'dir'); } else { $_redirects = 0; } } if ($query->param('no_accept_language')) { $_accept_language = 0; } if ($query->param('recursive')) { if ($_depth == 0) { $_depth = -1; } } if ($query->param('depth') && ($query->param('depth') != 0)) { $_depth = $query->param('depth'); } # Save, clear or leave cookie as is. my $cookie = ''; if (my $action = $query->param('cookie')) { my %cookie = $query->cookie($AGENT); if ($action eq 'clear') { # Clear the cookie. $cookie{-value} = ''; $cookie{-expires} = '-1M'; } else { # Always refresh the expiration time. $cookie{-expires} = '+1M'; if ($action eq 'set') { # Set the options. my %options = $query->Vars(); delete($options{$_}) for ('url', 'uri', 'check'); # Non-persistent. $cookie{-value} = \%options; $cookie{-expires} = '+1M'; } } $cookie = $query->cookie(%cookie); } my $uri = $query->param('uri'); if (! $uri) { &html_header('', 1); # Set cookie only from results page. &print_form($query); &html_footer(); exit; } undef $query; # Not needed any more. $uri =~ s/^\s+//g; if ($uri =~ m/^file:/) { # Only the http scheme is allowed &file_uri($uri); } elsif ($uri !~ m/:/) { if ($uri =~ m|^//|) { $uri = 'http:'.$uri; } else { $uri = 'http://'.$uri; } } &check_uri($uri, 1, $_depth, $cookie); &html_footer(); } ############################################################################### ################################ # Command line and usage stuff # ################################ sub parse_arguments () { require Getopt::Long; Getopt::Long->require_version(2.17); Getopt::Long->import('GetOptions'); Getopt::Long::Configure('no_ignore_case'); my @masq = (); GetOptions('help' => \&usage, 'q|quiet' => sub { $_quiet = 1; $_summary = 1; }, 's|summary' => \$_summary, 'b|broken' => sub { $_redirects = $_dir_redirects = 0; }, 'e|dir-redirects' => sub { $_dir_redirects = 0; }, 'v|verbose' => \$_verbose, 'i|indicator' => \$_progress, 'h|html' => \$_html, 'n|noacclanguage' => sub { $_accept_language = 0; }, 'r|recursive' => sub { $_depth = -1 if $_depth == 0; }, 'l|location=s' => \$_base_location, 'u|user=s' => \$_user, 'p|password=s' => \$_password, 't|timeout=i' => \$_timeout, 'L|languages=s' => \$_languages, 'D|depth=i' => sub { $_depth = $_[1] unless $_[1] == 0; }, 'd|domain=s' => \$_trusted, 'y|proxy=s' => \$_http_proxy, 'masquerade' => \@masq, 'hide-same-realm' => \$_hide_same_realm, 'V|version' => \&version, ); if (@masq) { $_masquerade = 1; $_local_dir = shift(@masq); $_remote_masqueraded_uri = shift(@masq); } } sub version () { print STDERR "$PROGRAM $REVISION\n"; exit(0); } sub usage () { print STDERR "$PROGRAM $REVISION Usage: checklink Options: -s/--summary Result summary only. -b/--broken Show only the broken links, not the redirects. -e/--directory Hide directory redirects - e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ -r/--recursive Check the documents linked from the first one. -D/--depth n Check the documents linked from the first one to depth n. -l/--location uri Scope of the documents checked. By default, for http://www.w3.org/TR/html4/Overview.html for example, it would be: http://www.w3.org/TR/html4/ -n/--noacclanguage Do not send an Accept-Language header. -L/--languages Languages accepted (default: '$_languages'). -q/--quiet No output if no errors are found. -v/--verbose Verbose mode. -i/--indicator Show progress while parsing. -u/--user username Specify a username for authentication. -p/--password password Specify a password. --hide-same-realm Hide 401's that are in the same realm as the document checked. -t/--timeout value Timeout for the HTTP requests. -d/--domain domain Regular expression describing the domain to which the authetication information will be sent (default: '$_trusted'). --masquerade local remote Masquerade local dir as a remote URI (e.g. /home/hugo/MathML2/ is in fact http://www.w3.org/TR/MathML2/). -y/--proxy proxy Specify an HTTP proxy server. -h/--html HTML output. --help Show this message. -V/--version Output version information. Documentation at: http://www.w3.org/2000/07/checklink Please send bug reports and comments to the www-validator mailing list: www-validator\@w3.org (with 'checklink' in the subject) Archives are at: http://lists.w3.org/Archives/Public/www-validator/ "; exit(0); } sub ask_password () { printf(STDERR 'Enter the password for user %s: ', $_user); $Have_ReadKey ? ReadMode('noecho', *STDIN) : system('stty -echo'); chomp($_password = ); $Have_ReadKey ? ReadMode('restore', *STDIN) : system('stty echo'); print(STDERR "ok.\n"); } ############################################################################### ########################################### # Transform foo into file://localhost/foo # ########################################### sub urize ($) { my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd()); return $u->as_string(); } ######################################## # Check for broken links in a resource # ######################################## sub check_uri ($$$) { my ($uri, $html_header, $depth, $cookie) = @_; # If $html_header equals 1, we need to generate a HTML header (first # instance called in HTML mode). my $start; if (! $_quiet) { $start = &get_timestamp(); } # Get and parse the document my $response = &get_document('GET', $uri, $doc_count, \%redirects); # Can we check the resource? If not, we exit here... if (defined($response->{Stop})) { return(-1); } # We are checking a new document $doc_count++; if ($_html) { &html_header($uri, 0, $cookie) if $html_header; print('

'); } my $absolute_uri = $response->{absolute_uri}->as_string(); my $result_anchor = 'results'.$doc_count; printf("\nProcessing\t%s\n\n", $_html ? &show_url(&encode($absolute_uri)) : $absolute_uri); if ($_html) { print("

\n"); if (! $_summary) { printf("

Go to the results.

\n", $result_anchor); printf("

Check also: HTML Validity & CSS Validity

Back to the link checker.

\n", map{&encode(URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."))}(1..2)); print("

\n");
    }
  }

  # Record that we have processed this resource
  $processed{$absolute_uri} = 1;
  # Parse the document
  my $p = &parse_document($uri, $absolute_uri,
                          $response->content(), 1,
                          $depth != 0);
  my $base = URI->new($p->{base});

  # Check anchors
  ###############

  if (! $_summary) {
    print("Checking anchors...\n");
  }
  my %errors;
  foreach my $anchor (keys %{$p->{Anchors}}) {
    my $times = 0;
    foreach my $l (keys %{$p->{Anchors}{$anchor}}) {
      $times += $p->{Anchors}{$anchor}{$l};
    }
    # They should appear only once
    if ($times > 1) {
      $errors{$anchor} = 1;
    }
    # Empty IDREF's are not allowed
    if ($anchor eq '') {
      $errors{$anchor} = 1;
    }
  }
  if (! $_summary) {
    print(" done.\n");
  }

  # Check links
  #############

  my %links;
  # Record all the links found
  foreach my $link (keys %{$p->{Links}}) {
    my $link_uri = URI->new($link);
    my $abs_link_uri = URI->new_abs($link_uri, $base);
    if ($_masquerade) {
      if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) {
        printf("processing %s in base %s\n", $abs_link_uri, $_local_dir);
        my $nlink = $abs_link_uri;
        $nlink =~
          s|^$_remote_masqueraded_uri|file://localhost$_local_dir|;
        $abs_link_uri = URI->new($nlink);
      };
    }
    foreach my $lines (keys %{$p->{Links}{$link}}) {
      my $canonical = URI->new($abs_link_uri->canonical());
      my $url = $canonical->scheme().':'.$canonical->opaque();
      my $fragment = $canonical->fragment();
      if (! $fragment) {
        # Document without fragment
        $links{$url}{location}{$lines} = 1;
      } else {
        # Resource with a fragment
        $links{$url}{fragments}{$fragment}{$lines} = 1;
      }
    }
  }

  # Build the list of broken URI's
  my %broken;
  foreach my $u (keys %links) {
    # Don't check mailto: URI's
    next if ($u =~ m/^mailto:/);
    if (! $_summary) {
      &hprintf("Checking link %s\n", $u);
    }
    # Check that a link is valid
    &check_validity($uri, $u, \%links, \%redirects);
    if ($_verbose) {
      &hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
    }
    if ($results{$u}{location}{success}) {
      # Even though it was not broken, we might want to display it
      # on the results page (e.g. because it required authentication)
      if ($results{$u}{location}{display} >= 400) {
        $broken{$u}{location} = 1;
      }
      # List the broken fragments
      foreach my $fragment (keys %{$links{$u}{fragments}}) {
        if ($_verbose) {
          my @frags = sort keys %{$links{$u}{fragments}{$fragment}};
          &hprintf("\t\t%s %s - Line%s: %s\n",
                   $fragment,
                   ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
                   (scalar(@frags) > 1) ? 's' : '',
                   join(', ', @frags)
                  );
        }
        # A broken fragment?
        if ($results{$u}{fragments}{$fragment} == 0) {
          $broken{$u}{fragments}{$fragment} += 2;
        }
      }
    } else {
      # Couldn't find the document
      $broken{$u}{location} = 1;
      # All the fragments associated are hence broken
      foreach my $fragment (keys %{$links{$u}{fragments}}) {
        $broken{$u}{fragments}{$fragment}++;
      }
    }
  }
  if (! $_summary) {
    &hprintf("Processed in %ss.\n", &time_diff($start, &get_timestamp()));
  }

  # Display results
  if ($_html) {
    if (! $_summary) {
      print("

\n"); printf("

Results

\n", $result_anchor); } } if (! $_quiet) { print "\n"; } &anchors_summary($p->{Anchors}, \%errors); &links_summary(\%links, \%results, \%broken, \%redirects); # Do we want to process other documents? if ($depth != 0) { if ($_base_location eq '.') { # Get the name of the original directory # e.g. http://www.w3.org/TR/html4/Overview.html # should return http://www.w3.org/TR/html4/ $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|; $_base_location = $1; } foreach my $u (keys %links) { next if (! (# Check if it's in our scope for recursion ($u =~ m|^$_base_location|) && # and the link is not broken $results{$u}{location}{success} && # And it is a text/html or application/xhtml+xml # resource (($results{$u}{location}{type} =~ m|text/html|) || ($results{$u}{location}{type} =~ m|application/xhtml\+xml|)) ) ); # Check if we have already processed the URI next if (&already_processed($u) != 0); # Do the job print "\n"; if (! $_html) { print '-' x 40; } else { # For the online version, wait for a while to avoid abuses if (!$_cl) { if ($doc_count == $_max_documents) { print("

Maximum number of documents reached!

\n"); } if ($doc_count >= $_max_documents) { $doc_count++; print("

Not checking $u

\n"); $processed{$u} = 1; next; } } print('

'); sleep($_sleep_time); } print "\n"; if ($depth < 0) { &check_uri($u, 0, -1); } else { &check_uri($u, 0, $depth-1); } } } } ####################################### # Get and parse a resource to process # ####################################### sub get_document ($$$;\%) { my ($method, $uri, $in_recursion, $redirects) = @_; # $method contains the HTTP method the use (GET or HEAD) # $uri contains the identifier of the resource # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least # the second resource checked) # $redirects is a pointer to the hash containing the map of the redirects # Get the resource my $response; if (defined($results{$uri}{response}) && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $response = $results{$uri}{response}; } else { $response = &get_uri($method, $uri); &record_results($uri, $method, $response); &record_redirects($redirects, $response->{Redirects}); } if (! $response->is_success()) { if (! $in_recursion) { # Is it too late to request authentication? if ($response->code() == 401) { &authentication($response); } else { &html_header($uri) if $_html; &hprintf("\nError: %d %s\n", $response->code(), $response->message()); } } $response->{Stop} = 1; return($response); } # What is the URI of the resource that we are processing by the way? my $base_uri = URI->new($response->base()); my $request_uri = URI->new($response->request->url); $response->{absolute_uri} = $base_uri->abs($request_uri); # Can we parse the document? my $failed_reason; if (! (($response->header('Content-Type') =~ m|text/html|) || ($response->header('Content-Type') =~ m|application/xhtml\+xml|)) ) { $failed_reason = "Content-Type is '". $response->header('Content-Type')."'"; } elsif ($response->header('Content-Encoding') && ($response->header('Content-Encoding') ne 'identity')) { $failed_reason = "Content-Encoding is '". $response->header('Content-encoding')."'"; } if ($failed_reason) { # No, there is a problem... if (! $in_recursion) { &html_header($uri) if $_html; &hprintf("Can't check links: %s.\n", $failed_reason); } $response->{Stop} = 1; } # Ok, return the information return($response); } ################################################## # Check whether a URI has already been processed # ################################################## sub already_processed ($) { my ($uri, %redirects) = @_; # Don't be verbose for that part... my $summary_value = $_summary; $_summary = 1; # Do a GET: if it fails, we stop, if not, the results are cached my $response = &get_document('GET', $uri, 1); # ... but just for that part $_summary = $summary_value; # Can we process the resource? return(-1) if (defined($response->{Stop})); # Have we already processed it? return(1) if (defined($processed{$response->{absolute_uri}->as_string()})); # It's not processed yet and it is processable: return 0 return(0); } ############################ # Get the content of a URI # ############################ sub get_uri ($$;$\%$$$$) { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_; # $method contains the method used # $uri contains the target of the request # $start is a timestamp (not defined the first time the function is # called) # $redirects is a map of redirects # $code is the first HTTP return code # $realm is the realm of the request # $message is the HTTP message received # $auth equals 1 if we want to send out authentication information # For timing purposes if (! defined($start)) { $start = &get_timestamp(); } # Prepare the query my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : (); my $ua = W3C::UserAgent->new(%lwpargs); $ua->timeout($_timeout); $ua->agent(sprintf('%s/%s %s', $AGENT, $VERSION, $ua->agent())); $ua->env_proxy(); $ua->proxy('http', 'http://' . $_http_proxy) if ($_http_proxy); # $ua->{fetching} contains the URI we originally wanted # $ua->{uri} is modified in the case of a redirect; this is used to # build $ua->{Redirects} $ua->{uri} = $ua->{fetching} = $uri; if (defined($redirects)) { $ua->{Redirects} = $redirects; } # Do we want printouts of progress? my $verbose_progress = ! ($_summary || (!$doc_count && $_html)); &hprintf("%s %s ", $method, $uri) if $verbose_progress; my $request = new HTTP::Request($method, $uri); if ($_accept_language) { $request->header('Accept-Language' => 'en'); } # Are we providing authentication info? if (defined($auth) && ($request->url->host =~ /$_trusted$/)) { if (defined($ENV{HTTP_AUTHORIZATION})) { $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); } elsif (defined($_user) && defined($_password)) { $request->authorization_basic($_user, $_password); } } # Tell the user agent if we want progress reports (in redirects) or not. $ua->{Checklink_verbose_progress} = $verbose_progress; # Do the query my $response = $ua->request($request); # Get the results # Record the very first response if (! defined($code)) { $code = $ua->{FirstResponse}; $message = $ua->{FirstMessage}; } # Authentication requested? if (($response->code() == 401) && (defined($ENV{HTTP_AUTHORIZATION}) || (defined($_user) && defined($_password))) && !defined ($auth)) { # Deal with authentication and avoid loops if (! defined ($realm)) { $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; $realm = $1; } print "\n" if $verbose_progress; return &get_uri($method, $response->request->url, $start, $ua->{Redirects}, $code, $realm, $message, 1); } # Record the redirects $response->{Redirects} = $ua->{Redirects}; &hprintf(" fetched in %ss\n", &time_diff($start, &get_timestamp())) if $verbose_progress; $response->{OriginalCode} = $code; $response->{OriginalMessage} = $message; $response->{Realm} = $realm if defined($realm); return $response; } ######################################### # Record the results of an HTTP request # ######################################### sub record_results ($$$) { my ($uri, $method, $response) = @_; $results{$uri}{response} = $response; $results{$uri}{method} = $method; $results{$uri}{location}{code} = $response->code(); $results{$uri}{location}{type} = $response->header('Content-type'); $results{$uri}{location}{display} = $results{$uri}{location}{code}; $results{$uri}{location}{orig} = $response->{OriginalCode}; # Did we get a redirect? if ($response->{OriginalCode} != $response->code()) { $results{$uri}{location}{orig_message} = $response->{OriginalMessage}; $results{$uri}{location}{redirected} = 1; } $results{$uri}{location}{success} = $response->is_success(); # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; if (! $_hide_same_realm) { $results{$uri}{location}{display} = 401; } } # What type of broken link is it? (stored in {record} - the {display} # information is just for visual use only) if (($results{$uri}{location}{display} == 401) && ($results{$uri}{location}{code} == 404)) { $results{$uri}{location}{record} = 404; } else { $results{$uri}{location}{record} = $results{$uri}{location}{display}; } # Did it fail? $results{$uri}{location}{message} = $response->message(); if (! $results{$uri}{location}{success}) { if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } #################### # Parse a document # #################### sub parse_document ($$$$;$) { my ($uri, $location, $document, $links, $rec_needs_links) = @_; my $p; if (defined($results{$uri}{parsing})) { # We have already done the job. Woohoo! $p->{base} = $results{$uri}{parsing}{base}; $p->{Anchors} = $results{$uri}{parsing}{Anchors}; $p->{Links} = $results{$uri}{parsing}{Links}; return($p); } my $start; $p = W3C::CheckLink->new(); $p->{base} = $location; if (! $_summary) { $start = &get_timestamp(); print("Parsing...\n"); } if (!$_summary || $_progress) { $p->{Total} = ($document =~ tr/\n//); } # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $rec_needs_links); # Transform into for parsing # Processing instructions are not parsed by process, but in this case # it should be. It's expensive, it's horrible, but it's the easiest way # for right now. if (!$p->{only_anchors}) { $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/; } $p->parse($document); if (! $_summary) { my $stop = &get_timestamp(); if ($_progress) { print "\r"; } &hprintf(" done (%d lines in %ss).\n", $p->{Total}, &time_diff($start, $stop)); } # Save the results before exiting $results{$uri}{parsing}{base} = $p->{base}; $results{$uri}{parsing}{Anchors} = $p->{Anchors}; $results{$uri}{parsing}{Links} = $p->{Links}; return($p); } ################################### # Constructor for W3C::CheckLink # ################################### sub W3C::CheckLink::new { my $p = HTML::Parser::new(@_, api_version => 3); # Start tags $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens'); # Declarations $p->handler(declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1)); }, 'self, text'); # Other stuff $p->handler(default => 'text', 'self, text'); # Line count $p->{Line} = 1; # Check ? $p->{check_name} = 1; # Check <[..] id="..">? $p->{check_id} = 1; # Don't interpret comment loosely $p->strict_comment(1); return $p; } ################################################# # Record or return the doctype of the document # ################################################# sub W3C::CheckLink::doctype { my ($self, $dc) = @_; if (! $dc) { return $self->{doctype}; } $_ = $self->{doctype} = $dc; # What to look for depending on the doctype if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN') { $self->{check_name} = 0; } # Check for the id tag if ( # HTML 2.0 & 3.0 m%^-//IETF//DTD HTML [23]\.0//% || # HTML 3.2 m%^-//W3C//DTD HTML 3\.2//%) { $self->{check_id} = 0; } # Enable XML extensions if (m%^-//W3C//DTD XHTML %) { $self->xml_mode(1); } } ####################################### # Count the number of lines in a file # ####################################### sub W3C::CheckLink::new_line { my ($self, $string) = @_; my $count = ($string =~ tr/\n//); $self->{Line} = $self->{Line} + $count; if ($_progress) { printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)); } } ############################# # Extraction of the anchors # ############################# sub W3C::CheckLink::get_anchor { my ($self, $tag, $attr) = @_; my $anchor; if ($self->{check_id}) { $anchor = $attr->{id}; } if ($self->{check_name} && ($tag eq 'a')) { # @@@@ In XHTML, is mandatory # Force an error if it's not the case (or if id's and name's values # are different) # If id is defined, name if defined must have the same value if (!$anchor) { $anchor = $attr->{name}; } } return($anchor); } ########################### # W3C::CheckLink handlers # ########################### sub add_link { my ($self, $uri) = @_; if (defined($uri)) { $self->{Links}{$uri}{$self->{Line}}++; } } sub start { my ($self, $tag, $attr, $text) = @_; # Anchors my $anchor = $self->get_anchor($tag, $attr); if (defined($anchor)) { $self->{Anchors}{$anchor}{$self->{Line}}++; } # Links if (!$self->{only_anchors}) { # Here, we are checking too many things # The right thing to do is to parse the DTD... if ($tag eq 'base') { if (defined($attr->{href})) { $self->{base} = $attr->{href}; } } else { $self->add_link($attr->{href}); } $self->add_link($attr->{src}); if ($tag eq 'object') { $self->add_link($attr->{data}); } if ($tag eq 'blockquote') { $self->add_link($attr->{cite}); } } # Line counting if ($text =~ m/\n/) { $self->new_line($text); } } sub text { my ($self, $text) = @_; if (!$_progress) { # If we are just extracting information about anchors, # parsing this part is only cosmetic (progress indicator) return unless !$self->{only_anchors}; } if ($text =~ /\n/) { $self->new_line($text); } } sub W3C::CheckLink::declaration { my ($self, $text) = @_; # Extract the doctype my @declaration = split(/\s+/, $text, 4); if (($#declaration >= 3) && ($declaration[0] eq 'DOCTYPE') && (lc($declaration[1]) eq 'html')) { # Parse the doctype declaration $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i; # Store the doctype if ($1) { $self->doctype($1); } # If there is a link to the DTD, record it if (!$self->{only_anchors} && $3) { $self->{Links}{$3}{$self->{Line}}++; } } return unless !$self->{only_anchors}; $self->text($text); } ################################ # Check the validity of a link # ################################ sub check_validity ($$\%\%) { my ($testing, $uri, $links, $redirects) = @_; # $testing is the URI of the document checked # $uri is the URI of the target that we are verifying # $links is a hash of the links in the documents checked # $redirects is a map of the redirects encountered # Checking file: URI's is not allowed with a CGI if ($testing ne $uri) { if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) { # Can't test? Return 400 Bad request. $results{$uri}{location}{code} = 400; $results{$uri}{location}{success} = 0; $results{$uri}{location}{message} = 'Error: \'file:\' URI not allowed'; if ($_verbose) { &hprintf("Error: %d %s\n", $results{$uri}{location}{code}, $results{$uri}{location}{message}); } return; } } # Get the document with the appropriate method # Only use GET if there are fragments. HEAD is enough if it's not the # case. my @fragments = keys %{$links->{$uri}{fragments}}; my $method = scalar(@fragments) ? 'GET' : 'HEAD'; my $response; my $being_processed = 0; if ((! defined($results{$uri})) || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; $response = &get_uri($method, $uri); # Get the information back from get_uri() &record_results($uri, $method, $response); # Record the redirects &record_redirects($redirects, $response->{Redirects}); } # We got the response of the HTTP request. Stop here if it was a HEAD. return if ($method eq 'HEAD'); # There are fragments. Parse the document. my $p; if ($being_processed) { # Can we really parse the document? if (!defined($results{$uri}{location}{type})) { return; } if (! (($results{$uri}{location}{type} =~ m|text/html|i) || ($results{$uri}{location}{type} =~ m|application/xhtml\+xml|i)) ) { if ($_verbose) { &hprintf("Can't check content: Content-type is '%s'.\n", $response->header('Content-type')); } return; } # Do it then $p = &parse_document($uri, $response->base(), $response->as_string(), 0); } else { # We already had the information $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } # Check that the fragments exist foreach my $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) || &escape_match($fragment, $p->{Anchors})) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; } } } sub escape_match ($\%) { my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); foreach my $b (keys %$hash) { if ($a eq URI::Escape::uri_unescape($b)) { return(1); } } return(0); } ########################## # Ask for authentication # ########################## sub authentication ($) { my $r = $_[0]; $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; my $realm = $1; my $authHeader = $r->headers->www_authenticate; if ($_cl) { printf(STDERR "\nAuthentication is required for %s.\n", $r->request->url); printf(STDERR "The realm is %s.\n", $realm); print(STDERR "Use the -u and -p options to specify a username and password.\n"); } else { printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html\n\n", $r->headers->www_authenticate); printf("%s 401 Authorization Required

Authorization Required

You need %s access to %s to perform Link Checking.

Anchors

"); } else { print("Anchors\n\n"); } &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's'); print('

') if $_html; print("\n"); } # List of the duplicates, if any. my @errors = keys %{$errors}; if (! scalar(@errors)) { print("

Valid anchors!

\n") if (! $_quiet && $_html && $n); return; } undef $n; print('

') if $_html; print('List of duplicate and empty anchors'); print("

\n\n") if $_html; print("\n"); foreach my $anchor (@errors) { my $format; my @unique = &sort_unique(keys %{$anchors->{$anchor}}); if ($_html) { $format = "\n"; } else { my $s = (scalar(@unique) > 1) ? 's' : ''; $format = "\t%s\tLine$s: %s\n"; } printf($format, &encode($anchor eq '' ? 'Empty anchor' : $anchor), join(', ', @unique)); } print("

Anchors	Lines
%s	%s

\n") if $_html; } sub show_link_report (\%\%\%\%\@;$\%) { my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; print("\n

What to do: %s%s; HTTP Code returned: %d%s
HTTP Message: %s%s%s; Line%s: %s; Broken fragments and their line numbers: %s
\n", $fragment_direction); } } else { printf("\n%s\t%s\n Code: %d%s %s\nTo do: %s\n", # List of redirects $redirected ? join("\n-> ", &get_redirects($u, %$redirects)) : $u, # List of lines $lines_list ? "Line$s: $lines_list" : '', # Original HTTP reply $results->{$u}{location}{orig}, # Final HTTP reply ($results->{$u}{location}{code} != $results->{$u}{location}{orig}) ? ' -> '.$results->{$u}{location}{code} : '', # HTTP message $results->{$u}{location}{message} ? $results->{$u}{location}{message} : '', # What to do $whattodo); if ($#fragments >= 0) { if ($results->{$u}{location}{code} == 200) { print("The following fragments need to be fixed:\n"); } else { print("Fragments:\n"); } } } # Fragments foreach my $f (@fragments) { if ($_html) { printf("
%s: %s
\n", # Broken fragment &show_url($u, $f), # List of lines join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}}))); } else { my @unq = &sort_unique(keys %{$links->{$u}{fragments}{$f}}); printf("\t%-30s\tLine%s: %s\n", # Fragment $f, # Multiple? (scalar(@unq) > 1) ? 's' : '', # List of lines join(', ', @unq)); } } print("

\n") if $_html; } sub code_shown ($$) { my ($u, $results) = @_; if ($results->{$u}{location}{record} == 200) { return $results->{$u}{location}{orig}; } else { return $results->{$u}{location}{record}; } } sub links_summary (\%\%\%\%) { # Advices to fix the problems my %todo = ( 200 => 'There are broken fragments which must be fixed.', 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!', 301 => 'You should update the link.', 302 => 'Usually nothing.', 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.', 401 => "The link is not public. You'd better specify it.", 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', 404 => 'The link is broken. Fix it NOW!', 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.', 407 => 'The link is a proxy, but requires Authentication.', 408 => 'The request timed out.', 410 => 'The resource is gone. You should remove this link.', 415 => 'The media type is not supported.', 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.', 501 => 'Could not check this link: method not implemented or scheme not supported.', 503 => 'The server cannot service the request, for some unknown reason.'); my %priority = ( 410 => 1, 404 => 2, 403 => 5, 200 => 10, 300 => 15, 401 => 20 ); my ($links, $results, $broken, $redirects) = @_; # List of the broken links my @urls = keys %{$broken}; my @dir_redirect_urls = (); if ($_redirects) { # Add the redirected URI's to the report for my $l (keys %$redirects) { next unless (defined($results->{$l}) && defined($links->{$l}) && !defined($broken->{$l})); # Check whether we have a "directory redirect" # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ my @redirects = &get_redirects($l, %$redirects); if (($#redirects == 1) && (($redirects[0].'/') eq $redirects[1])) { push(@dir_redirect_urls, $l); next; } push(@urls, $l); } } # Broken links and redirects if ($#urls < 0) { if (! $_quiet) { if ($_html) { print "

Links

\n"; print "

Valid links!

"; } else { print "\nValid links."; } print "\n"; } } else { print('

') if $_html; print("\nList of broken links"); print(' and redirects') if $_redirects; # Sort the URI's by HTTP Code my %code_summary; my @idx; foreach my $u (@urls) { if (defined($results->{$u}{location}{record})) { my $c = &code_shown($u, $results); $code_summary{$c}++; push(@idx, $c); } } my @sorted = @urls[ sort { defined($priority{$idx[$a]}) ? defined($priority{$idx[$b]}) ? $priority{$idx[$a]} <=> $priority{$idx[$b]} : -1 : defined($priority{$idx[$b]}) ? 1 : $idx[$a] <=> $idx[$b] } 0 .. $#idx ]; @urls = @sorted; undef(@sorted); undef(@idx); if ($_html) { print('

Fragments listed are broken. See the table below to know what action to take.

'); # Print a summary print "\n\n"; foreach my $code (sort(keys(%code_summary))) { printf('', &bgcolor($code)); printf('', $doc_count, $code, $code); printf('', $code_summary{$code}); printf('', $todo{$code}); print "\n"; } print "

Code	Occurrences	What to do
%s	%s	%s

\n"; } else { print(':'); } &show_link_report($links, $results, $broken, $redirects, \@urls, 1, \%todo); } # Show directory redirects if ($_dir_redirects && ($#dir_redirect_urls > -1)) { print('

') if $_html; print("\nList of directory redirects"); print("

The links below are not broken, but the document does not use the exact URL.

") if $_html; &show_link_report($links, $results, $broken, $redirects, \@dir_redirect_urls); } } ############################################################################### ################ # Global stats # ################ sub global_stats () { my $stop = &get_timestamp(); my $n_docs = ($doc_count <= $_max_documents) ? $doc_count : $_max_documents; return sprintf('Checked %d document%s in %s seconds.', $n_docs, ($n_docs == 1) ? '' : 's', &time_diff($timestamp, $stop)); } ################## # HTML interface # ################## sub html_header ($;$$) { my ($uri, $doform, $cookie) = @_; $uri = &encode($uri); my $title = ' Link Checker' . ($uri eq '' ? '' : ': ' . $uri); # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same # print() statement as the last header... my $headers = ''; if (! $_cl) { $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform; $headers .= "Content-Type: text/html; charset=iso-8859-1\n"; $headers .= "Content-Script-Type: text/javascript\n"; $headers .= "Set-Cookie: $cookie\n" if $cookie; $headers .= "Content-Language: en\n\n"; } my $script = my $onload = ''; if ($doform) { $script = " "; $onload = ' onload="document.forms[0].uri.focus()"'; } print $headers, $DocType, " W3C", $title, " ", $script, "

$\"W3C\"$ ", $title, "

\n\n"; } sub bgcolor ($) { my ($code) = @_; my $class; my $r = HTTP::Response->new($code); if ($r->is_success()) { return ''; } elsif ($code == 300) { $class = 'multiple'; } elsif ($code == 401) { $class = 'unauthorized'; } elsif ($r->is_redirect()) { $class = 'redirect'; } elsif ($r->is_error()) { $class = 'broken'; } else { $class = 'broken'; } return(' class="'.$class.'"'); } sub show_url ($;$) { my ($url, $fragment) = @_; if (defined($fragment)) { $url .= '#'.$fragment; } return sprintf('%s', $url, &encode(defined($fragment) ? $fragment : $url)); } sub html_footer () { if (($doc_count > 0) && !$_quiet) { printf("

\n", &global_stats()); } print "

$PROGRAM $REVISION, by Hugo Haas and others.
Please send bug reports, suggestions and comments to the www-validator mailing list (archives).
Check out the documentation. Download the source code from CVS.

"; } sub file_uri ($) { my ($uri) = @_; &html_header($uri); print "

Forbidden

You cannot check such a URI ($uri).

"; &html_footer(); exit; } sub print_form ($) { my ($q) = @_; my $chk = ' checked="checked"'; $q->param('hide_type', 'all') unless $q->param('hide_type'); my $sum = $q->param('summary') ? $chk : ''; my $red = $q->param('hide_redirects') ? $chk : ''; my $all = ($q->param('hide_type') ne 'dir') ? $chk : ''; my $dir = $all ? '' : $chk; my $acc = $q->param('no_accept_language') ? $chk : ''; my $rec = $q->param('recursive') ? $chk : ''; my $dep = &encode($q->param('depth') || ''); my $cookie_options = ''; if ($q->cookie()) { $cookie_options = " Don't modify saved options Save these options Clear saved options"; } else { $cookie_options = " Save options in a cookie"; } print " "; } sub encode (@) { if (! $_html) { return @_; } else { return HTML::Entities::encode(@_); } } sub hprintf (@) { if (! $_html) { printf(@_); } else { print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1])); } }