summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xhttpd/cgi-bin/LinkChecker.pl786
1 files changed, 786 insertions, 0 deletions
diff --git a/httpd/cgi-bin/LinkChecker.pl b/httpd/cgi-bin/LinkChecker.pl
new file mode 100755
index 0000000..c46295d
--- /dev/null
+++ b/httpd/cgi-bin/LinkChecker.pl
@@ -0,0 +1,786 @@
+#! /usr/bin/perl -w
+#
+# W3C Link Checker
+# by Hugo Haas
+# (c) 1999 World Wide Web Consortium
+# based on Renaud Bruyeron's checklin.pl
+#
+# $Id: LinkChecker.pl,v 1.1 1999-11-24 22:04:35 hugo Exp $
+#
+# This program is licensed under the W3C License.
+
+package W3C::LinkChecker;
+require HTML::Parser;
+@W3C::LinkChecker::ISA = qw(HTML::Parser);
+package W3C::UserAgent;
+require LWP::UserAgent;
+@W3C::UserAgent::ISA = qw(LWP::UserAgent);
+
+# Autoflush
+$| = 1;
+
+# Version info
+my $PROGRAM = 'W3C LinkChecker';
+my $VERSION = '$Revision: 1.1 $ (c) 1999 W3C';
+
+# State of the program
+my $_cl;
+my $_quiet = 0;
+my $_summary = 0;
+my $_verbose = 0;
+my $_progress = 0;
+my $_html = 0;
+my $_timeout = 60;
+my $_redirects = 1;
+my $_user;
+my $_password;
+my $query;
+
+if ($#ARGV >= 0) {
+ $_cl = 1;
+# Parse command line
+ my @uris = &parse_arguments();
+ if ($_user && (! $_password)) {
+ &ask_password();
+ }
+ foreach $uri (@uris) {
+ if (! $_summary) {
+ printf("%s %s\n", $PROGRAM ,$VERSION);
+ } else {
+ $_verbose = 0;
+ $_progress = 0;
+ }
+ $uri = urize($uri);
+ &check_uri($uri);
+ }
+} else {
+ use CGI;
+ $query = new CGI;
+ $_cl = 0;
+ $_progress = 0;
+ my $uri;
+ if ($query->param('uri')) {
+ $uri = $query->param('uri');
+ } else {
+ &print_form($query);
+ }
+ if ($uri =~ m/^file:/) {
+ &file_uri($uri);
+ } elsif (!($uri =~ m/:/)) {
+ $uri = 'http://'.$uri;
+ }
+ &check_uri($uri);
+}
+
+###############################################################################
+
+################################
+# Command line and usage stuff #
+################################
+
+sub parse_arguments() {
+ my @uris;
+ my $uris = 0;
+ while (@ARGV) {
+ $_ = shift(@ARGV);
+ if ($uris) {
+ push(@uris, $_);
+ } elsif (m/^--$/) {
+ $uris = 1;
+ } elsif (m/^-[^-up]/) {
+ if (m/q/) {
+ $_quiet = 1;
+ }
+ if (m/s/) {
+ $_summary = 1;
+ }
+ if (m/b/) {
+ $_redirects = 0;
+ }
+ if (m/v/) {
+ $_verbose = 1;
+ }
+ if (m/i/) {
+ $_progress = 1;
+ }
+ if (m/h/) {
+ $_html = 1;
+ }
+ } elsif (m/^--help$/) {
+ &usage();
+ } elsif (m/^--quiet$/) {
+ $_quiet = 1;
+ } elsif (m/^--summary$/) {
+ $_summary = 1;
+ } elsif (m/^--broken$/) {
+ $_redirects = 0;
+ } elsif (m/^--verbose$/) {
+ $_verbose = 1;
+ } elsif (m/^--indicator$/) {
+ $_progress = 1;
+ } elsif (m/^--html$/) {
+ $_html = 1;
+ } elsif (m/^-u|--user$/) {
+ $_user = shift(@ARGV);
+ } elsif (m/^-p|--password$/) {
+ $_password = shift(@ARGV);
+ } elsif (m/^-t|--timeout$/) {
+ $_timeout = shift(@ARGV);
+ } else {
+ push(@uris, $_);
+ }
+ }
+ return(@uris);
+}
+
+sub usage() {
+ print STDERR "$PROGRAM $VERSION
+Usage: LinkCheck.pl <options> <uris>
+Options:
+ -s/--summary Result summary only.
+ -b/--broken Show only the broken links, not the redirects.
+ -q/--quiet No output if no errors are found.
+ -v/--verbose Verbose mode.
+ -i/--indicator Show progress while parsing.
+ -u/--user username Specify a username for authentication.
+ -p/--password password Specify a password.
+ -t/--timeout value Timeout for the HTTP requests.
+ -h/--html HTML output.
+ --help Show this message.
+";
+ exit(0);
+}
+
+sub ask_password() {
+ print(STDERR 'Enter your password: ');
+ system('stty -echo');
+ chomp($_password = <STDIN>);
+ system('stty echo');
+ print(STDERR "ok.\n");
+}
+
+###############################################################################
+
+###########################################
+# Transform foo into file://localhost/foo #
+###########################################
+
+sub urize() {
+ use URI;
+ $_ = $_[0];
+ my $base;
+ my $res = $_;
+ if (m/:/) {
+ $base = URI->new($_);
+ } elsif (m/^\//) {
+ $base = URI->new('file://localhost');
+ } else {
+ my $pwd;
+ chop($pwd = `pwd`);
+ $base = URI->new('file://localhost'.$pwd.'/');
+ }
+ $u = URI->new($res);
+ $result = $u->abs($base);
+ return($result->as_string());
+}
+
+#######################
+# Do the job on a URI #
+#######################
+
+sub check_uri() {
+ my $uri = $_[0];
+ my $start;
+ if (! $_summary) {
+ $start = &get_timestamp();
+ printf("\nProcessing\t%s\n", $uri);
+ }
+ # Get the document
+ my $response = &get_uri('GET', $uri);
+ if (! $response->is_success()) {
+ printf("Error: %d %s\n", $response->code(), $response->message());
+ if ($response->code() == 401) {
+ &authentication($response);
+ }
+ exit(-1);
+ }
+ my %redirects;
+ &record_redirects(\%redirects, $response->{Redirects});
+ # Parse the document
+ if (! ($response->header('Content-type') =~ m/text\/html/)) {
+ printf("Can't check link: Content-type is '%s'.\n",
+ $response->header('Content-type'));
+ return(-1);
+ }
+ my $base_uri = URI->new($response->base());
+ my $uri_uri = URI->new($uri);
+ my $absolute_base = $base_uri->abs($uri_uri);
+ my $p = &parse_document($absolute_base->as_string(), $response->content(), 1);
+ my $base = URI->new($p->{base});
+ # Check anchors
+ if (! $_summary) {
+ print("Checking anchors:\n");
+ }
+ my %errors;
+ foreach $anchor (keys %{$p->{Anchors}}) {
+ my @lines = keys %{$p->{Anchors}{$anchor}};
+ my $times = $#lines + 1;
+ if ($times > 1) {
+ $errors{$anchor} = 1;
+ }
+ }
+ if (! $_summary) {
+ print(" done.\n");
+ }
+ # Check links
+ my %links;
+ foreach $link (keys %{$p->{Links}}) {
+ my $link_uri = URI->new($link);
+ my $abs_link_uri = URI->new_abs($link_uri, $base);
+ foreach $lines (keys %{$p->{Links}{$link}}) {
+ my $canonical = URI->new($abs_link_uri->canonical());
+ my $url = $canonical->scheme().':'.$canonical->opaque();
+ my $fragment = $canonical->fragment() ? $canonical->fragment() : $url;
+ $links{$url}{$fragment}{$lines} = 1;
+ }
+ }
+ for $url (keys %links) {
+ if (!defined($links{$url}{$url})) {
+ $links{$url}{$url}{-1} = 1;
+ }
+ }
+ my %results;
+ my %broken;
+ foreach $u (keys %links) {
+ # Don't check mailto: URI's
+ next if ($u =~ m/^mailto:/);
+ if (! $_summary) {
+ printf("Checking link %s\n", $u);
+ }
+ &check_validity($uri, $u, \%links, \%results, \%redirects, $p->{Anchors});
+ if ($_verbose) {
+ printf("\tReturn code: %s\n", $results{$u}{$u}{code});
+ }
+ if ($results{$u}{$u}{success}) {
+ foreach $fragment (keys %{$links{$u}}) {
+ next if ($fragment eq $u);
+ if ($_versbose) {
+ printf("\t\t%s %s - Lines: %s\n",
+ $fragment,
+ ($results{$u}{$fragment}?'OK':'Not found'),
+ join(',', keys %{$links{$u}{$fragment}}));
+ }
+ if ($results{$u}{$fragment} == 0) {
+ $broken{$u}{$fragment} = 1;
+ }
+ }
+ } else {
+ $broken{$u}{$u} = 1;
+ foreach $fragment (keys %{$links{$u}}) {
+ $broken{$u}{$fragment} = 1;
+ }
+ }
+ }
+ if (! $_summary) {
+ my $stop = &get_timestamp();
+ printf("Processed in %ss.\n", &time_diff($start, $stop));
+ }
+ # Display results
+ if ($_html) {
+ &html_header($uri);
+ } else {
+ print "\n";
+ }
+ &anchors_summary($p->{Anchors}, \%errors);
+ &links_summary(\%links, \%results, \%broken, \%redirects);
+ if ($_html) {
+ &html_footer();
+ }
+}
+
+############################
+# Get the content of a URI #
+############################
+
+sub W3C::UserAgent::redirect_ok {
+ my ($self, $request) = @_;
+
+ if (! $_summary) {
+ printf("\n%s %s ", $request->method(), $request->uri());
+ }
+
+ $self->{Redirects}{$self->{fetching}} = $request->uri();
+ $self->{fetching} = $request->uri();
+
+ return 0 if $request->method() eq "POST";
+ return 1;
+}
+
+sub get_uri() {
+ my ($method, $uri) = @_;
+ my $start = &get_timestamp();
+ my $ua = new W3C::UserAgent;
+ $ua->timeout($_timeout);
+ $ua->agent($PROGRAM.' '.$VERSION);
+ $ua->{uri} = $uri;
+ $ua->{fetching} = $uri;
+ my $count = 0;
+ my $response;
+ if (! $_summary) {
+ printf("%s %s ", $method, $uri);
+ }
+ my $request = new HTTP::Request($method, $uri);
+ $response = $ua->request($request);
+ $response->{Redirects} = $ua->{Redirects};
+ my $stop = &get_timestamp();
+ if (! $_summary) {
+ printf(" fetched in %ss\n", &time_diff($start,$stop));
+ }
+ return $response;
+}
+
+####################
+# Parse a document #
+####################
+
+sub parse_document() {
+ my ($uri, $document, $links) = @_;
+
+ my $start;
+ my $p = W3C::LinkChecker->new();
+ if (! $_summary) {
+ $start = &get_timestamp();
+ print("Parsing...\n");
+ }
+ $p->{base} = $uri;
+ $p->{Line} = 1;
+ if (!$_summary || $_progress) {
+ $p->{Total} = ($document =~ tr/\n//);
+ }
+ $p->{extract_links} = $links;
+ $p->parse($document);
+ if (! $_summary) {
+ my $stop = &get_timestamp();
+ if ($_progress) {
+ print "\r";
+ }
+ printf(" done (%d lines in %ss).\n",
+ $p->{Total},
+ &time_diff($start, $stop));
+ }
+ return $p;
+}
+
+#######################################
+# Count the number of lines in a file #
+#######################################
+
+sub W3C::LinkChecker::new_line() {
+ my ($self, $string) = @_;
+ my $count = ($string =~ tr/\n//);
+ $self->{Line} = $self->{Line} + $count;
+ if ($_progress) {
+ printf("\r%4d%%", int($self->{Line}/$self->{Total}*100));
+ }
+}
+
+#######################################
+# start function used by HTML::Parser #
+#######################################
+
+sub W3C::LinkChecker::start() {
+ my ($self, $tag, $attr, $text) = @_;
+ my $anchor;
+ # Links
+ if ($self->{extract_links}) {
+ my $link;
+ if ($tag eq 'a') {
+ $link = $attr->{href};
+ $anchor = $attr->{name};
+ } elsif ($tag eq 'img') {
+ $link = $attr->{src};
+ } elsif (($tag eq 'frame') || ($tag eq 'link')) {
+ $link = $attr->{href};
+ }
+ if (defined($link)) {
+ $self->{Links}{$link}{$self->{Line}}++;
+ }
+ # Just anchors
+ } else {
+ if ($tag eq 'a') {
+ $anchor = $attr->{name};
+ }
+ }
+ if (defined($anchor)) {
+ $self->{Anchors}{$anchor}{$self->{Line}}++;
+ }
+ # Line counting
+ if ($text =~ m/\n/) {
+ $self->new_line($text);
+ }
+}
+
+#####################################################
+# text and end functions for line counting purposes #
+#####################################################
+
+sub W3C::LinkChecker::text() {
+ my ($self, $text) = @_;
+ if (!$_progress) {
+ return unless $self->{extract_links};
+ }
+ if ($text =~ /\n/) {
+ $self->new_line($text);
+ }
+}
+
+sub W3C::LinkChecker::end() {
+ my $self = shift;
+ return unless $self->{extract_links};
+ shift;
+ my $text = shift;
+ $self->text($self, $text);
+}
+
+################################
+# Check the validity of a link #
+################################
+
+sub check_validity($, $, \%, \%, \%, \%) {
+ use HTTP::Status;
+ my ($testing, $uri, $links, $results, $redirects, $anchors) = @_;
+ # Checking file: URI's is not allowed with a CGI
+ if ($testing ne $uri) {
+ if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) {
+ $results->{$uri}{$uri}{code} = $RC_BAD_REQUEST;
+ $results->{$uri}{$uri}{success} = 0;
+ $results->{$uri}{$uri}{message} = 'Error: \'file:\' URI not allowed';
+ if ($_verbose) {
+ printf("Error: %d %s\n",
+ $results->{$uri}{$uri}{code},
+ $results->{$uri}{$uri}{message});
+ }
+ return;
+ }
+ }
+ # Get the document with the appropriate method
+ my $method;
+ my @fragments = keys %{$links->{$uri}};
+ if ($testing eq $uri) {
+ $method = 'Already have';
+ } elsif ($#fragments == 0) {
+ $method = 'HEAD';
+ } else {
+ $method = 'GET';
+ }
+ my $response;
+ if ($testing eq $uri) {
+ # Mimic an HTTP::Response object
+ $results->{$uri}{$uri}{code} = 600;
+ $results->{$uri}{$uri}{success} = 1;
+ } else {
+ $response = &get_uri($method, $uri);
+ # Record the redirects
+ &record_redirects($redirects, $response->{Redirects});
+ # Parse the document if necessary and possible
+ $results->{$uri}{$uri}{code} = $response->code();
+ $results->{$uri}{$uri}{success} = $response->is_success();
+ if (! $results->{$uri}{$uri}{success}) {
+ $results->{$uri}{$uri}{message} = $response->message();
+ if ($_verbose) {
+ printf("Error: %d %s\n",
+ $results->{$uri}{$uri}{code},
+ $results->{$uri}{$uri}{message});
+ }
+ return;
+ }
+ }
+ if ($#fragments == 0) {
+ return;
+ }
+ my $p;
+ if ($testing ne $uri) {
+ if (!(($results->{$uri}{$uri}{type} = $response->header('Content-type')) =~ m/text\/html/i)) {
+ if ($_verbose) {
+ printf("Can't check content: Content-type is '%s'.\n",
+ $response->header('Content-type'));
+ }
+ return;
+ }
+ $p = &parse_document($response->base(), $response->as_string(), 0);
+ } else {
+ $p->{Anchors} = $anchors;
+ }
+ # Check that the fragments exist
+ foreach $fragment (keys %{$links->{$uri}}) {
+ next if ($fragment eq $uri);
+ if (defined($p->{Anchors}{$fragment})
+ || &escape_match($fragment, $p->{Anchors})) {
+ $results->{$uri}{$fragment} = 1;
+ } else {
+ $results->{$uri}{$fragment} = 0;
+ }
+ }
+}
+
+sub escape_match($, \%) {
+ use URI::Escape;
+ my ($a, $hash) = (uri_unescape($_[0]), $_[1]);
+ foreach $b (keys %$hash) {
+ if ($a eq uri_unescape($b)) {
+ return(1);
+ }
+ }
+ return(0);
+}
+
+##########################
+# Ask for authentication #
+##########################
+
+sub authentication() {
+ my $r = $_[0];
+ $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ my $realm = $1;
+ if ($_cl) {
+ printf(STDERR "Authentication is required for %s.\n", $r->request->url);
+ printf(STDERR "The realm is %s.\n", $realm);
+ print(STDERR "Use the -u and -p options to specify a username and password.\n");
+ } else {
+ }
+}
+
+##################
+# Get statistics #
+##################
+
+sub get_timestamp() {
+ require 'sys/syscall.ph';
+ my $timestamp = pack('LL', ());
+ syscall(&SYS_gettimeofday, $timestamp, 0) != -1 or $timestamp = 0;
+ return($timestamp);
+}
+
+sub time_diff() {
+ my @start = unpack('LL', $_[0]);
+ my @stop = unpack('LL', $_[1]);
+ for ($start[1], $stop[1]) {
+ $_ /= 1_000_000;
+ }
+ return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
+}
+
+########################
+# Handle the redirects #
+########################
+
+sub record_redirects(\%, \%) {
+ my ($redirects, $sub) = @_;
+ foreach $r (keys %$sub) {
+ $redirects->{$r} = $sub->{$r};
+ }
+}
+
+sub is_redirected($, %) {
+ my ($uri, %redirects) = @_;
+ return(defined($redirects{$uri}));
+}
+
+sub get_redirects($, %) {
+ my ($uri, %redirects) = @_;
+ my @history = ($uri);
+ my $origin = $uri;
+ my %seen;
+ while ($redirects{$uri}) {
+ $uri = $redirects{$uri};
+ push(@history, $uri);
+ last if ($uri eq $origin);
+ }
+ return(@history);
+}
+
+#####################
+# Print the results #
+#####################
+
+sub anchors_summary(\%, \%) {
+ my ($anchors, $errors) = @_;
+ # Number of anchors found.
+ if (! $_quiet) {
+ if ($_html) {
+ print('<p>');
+ }
+ my @anchors = keys %{$anchors};
+ printf("Found %d anchors.", $#anchors+1);
+ if ($_html) {
+ print('</p>');
+ }
+ print("\n");
+ }
+ # List of the duplicates, if any.
+ @errors = keys %{$errors};
+ return if ($#errors < 0);
+ if ($_html) {
+ print('<p>');
+ }
+ print('List of duplicate anchors:');
+ if ($_html) {
+ print("</p>\n<table border=\"1\">\n<tr><td><b>Anchors</b></td><td><b>Lines</b></td></tr>");
+ }
+ print("\n");
+ foreach $anchor (@errors) {
+ my $format;
+ if ($_html) {
+ $format = "<tr><td>%s</td><td>%s</td></tr>\n";
+ } else {
+ $format = "\t%s\tLines: %s\n";
+ }
+ printf($format, $anchor, join(', ', keys %{$anchors->{$anchor}}));
+ }
+ if ($_html) {
+ print("</table>\n");
+ }
+}
+
+sub links_summary(\%,\%,\%) {
+ my ($links, $results, $broken, $redirects) = @_;
+ if (! $_quiet) {
+ if ($_html) {
+ print("\n<hr>\n\n<p>");
+ }
+ my @links = keys %$links;
+ my $n_fragments = 0;
+ my $n_total = 0;
+ foreach $u (@links) {
+ my @fragments = keys %{$links->{$u}};
+ $n_fragments += $#fragments + 1;
+ foreach $f (@fragments) {
+ my @lines = keys %{$links->{$u}{$f}};
+ foreach $l (@lines) {
+ $n_total += $links->{$u}{$f}{$l};
+ }
+ }
+ }
+ printf("Found %d locations for %d URI's (%d total).",
+ $#links+1,
+ $n_fragments,
+ $n_total);
+ if ($_html) {
+ print('</p>');
+ }
+ print("\n");
+ }
+ # List of the broken links
+ @urls = keys %{$broken};
+ return if ($#urls < 0);
+ if ($_html) {
+ print('<p>');
+ }
+ printf('List of broken %slinks:', $_redirects ? 'and redirected ' : '');
+ if ($_html) {
+ print("</p>\n<table border=\"1\">\n<tr><td><b>Location</b></td><td><b>Code</b></td><td><b>Fragment</b></td><td><b>Lines</b></td></tr>");
+ }
+ print("\n");
+ if ($_redirects) {
+ for $l (keys %$redirects) {
+ next unless (defined($results->{$l})
+ && !defined($broken->{$l}));
+ push(@urls, $l);
+ }
+ }
+ foreach $u (@urls) {
+ my @fragments = keys %{$broken->{$u}};
+ my $n_fragments = $#fragments+1;
+ my $redirected = &is_redirected($u, %$redirects);
+ my $lines_list = defined($links->{$u}{$u}{-1}) ?
+ '' : join(', ', keys %{$links->{$u}{$u}});
+ if ($_html) {
+ printf("<tr><th rowspan=\"%d\">%s</th><th rowspan=\"%d\">%d</th><td>%s</td><td>%s</td></tr>\n",
+ $n_fragments,
+ $redirected ? join('<br>-&gt; ',
+ &get_redirects($u, %$redirects)) : $u,
+ $n_fragments,
+ $results->{$u}{$u}{code},
+ '',
+ $lines_list);
+ } else {
+ printf("%s\tLines: %s\tCode: %d\n",
+ $redirected ? join("\n-> ",
+ &get_redirects($u, %$redirects)) : $u,
+ $lines_list,
+ $results->{$u}{$u}{code});
+ }
+ foreach $f (@fragments) {
+ next if ($f eq $u);
+ my $format;
+ if ($_html) {
+ $format = "<tr><td>%s</td><td>%s</td></tr>\n";
+ } else {
+ $format = "\t%s\tLines: %s\n";
+ }
+ printf($format,
+ $f,
+ join(', ', keys %{$links->{$u}{$f}}));
+ }
+ }
+ if ($_html) {
+ print("</table>\n");
+ }
+}
+
+###############################################################################
+
+##################
+# HTML interface #
+##################
+
+sub html_header() {
+ my $uri = HTML::Entities::encode($_[0]);
+ print "Content-type: text/html
+
+<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">
+<html>
+<head>
+<title>W3C Link Ckecker: $uri</title>
+<style type=\"text/css\">
+</style>
+</head>
+<body>
+<h1>W3C Link Checker: $uri</h1>
+\n";
+}
+
+sub html_footer() {
+ print "
+<hr>
+<address>
+$PROGRAM $VERSION<br>
+Report bugs to <a href=\"hugo\@w3.org\">Hugo Haas</a>
+</address>
+</body>
+</html>
+";
+}
+
+sub file_uri() {
+ my $uri = $_[0];
+ &html_header($uri);
+ print "<h2>Forbidden</h2>
+<p>You cannot check such a URI (<code>$uri</code>).</p>
+";
+ &html_footer();
+ exit;
+}
+
+sub print_form() {
+ my ($q) = @_;
+ &html_header($VERSION);
+ print "<form action=\"".$q->self_url()."\" method=\"get\">
+<p>Enter the URI that you want to check:</p>
+<input type=\"text\" name=\"uri\">
+<input type=\"submit\" name=\"submit\" value=\"Check\">
+</form>
+";
+ &html_footer();
+ exit;
+}