summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorville <ville@localhost>2002-10-26 19:04:35 +0000
committerville <ville@localhost>2002-10-26 19:04:35 +0000
commit2ad3d1511810092f633d481f9510bfcbe11d9bd7 (patch)
tree01af9273d823419eb32b19adab4b4d7518998996
parent63f6853c2452be73c1776e41a2ed10d8241c289f (diff)
downloadmarkup-validator-2ad3d1511810092f633d481f9510bfcbe11d9bd7.zip
markup-validator-2ad3d1511810092f633d481f9510bfcbe11d9bd7.tar.gz
markup-validator-2ad3d1511810092f633d481f9510bfcbe11d9bd7.tar.bz2
Whitespace only (reindentation).
-rwxr-xr-xhttpd/cgi-bin/checklink.pl2548
1 files changed, 1272 insertions, 1276 deletions
diff --git a/httpd/cgi-bin/checklink.pl b/httpd/cgi-bin/checklink.pl
index 06cf39f..c5ad600 100755
--- a/httpd/cgi-bin/checklink.pl
+++ b/httpd/cgi-bin/checklink.pl
@@ -5,7 +5,7 @@
# (c) 1999-2002 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
-# $Id: checklink.pl,v 2.96 2002-10-26 18:29:06 ville Exp $
+# $Id: checklink.pl,v 2.97 2002-10-26 19:04:35 ville Exp $
#
# This program is licensed under the W3C(r) License:
# http://www.w3.org/Consortium/Legal/copyright-software
@@ -46,7 +46,7 @@ $| = 1;
# Version info
my $PROGRAM = 'W3C checklink';
-my $VERSION = q$Revision: 2.96 $ . '(c) 1999-2002 W3C';
+my $VERSION = q$Revision: 2.97 $ . '(c) 1999-2002 W3C';
my $REVISION; ($REVISION = $VERSION) =~ s/Revision: (\d+\.\d+) .*/$1/;
# Different options specified by the user
@@ -90,100 +90,100 @@ my $doc_count = 0;
my $timestamp = &get_timestamp;
if ($#ARGV >= 0 && !(@ARGV == 1 && $ARGV[0] eq 'DEBUG')) {
- $_cl = 1;
-# Parse command line
- &parse_arguments();
- if ($_user && (! $_password)) {
- &ask_password();
- }
- my $uri;
- foreach $uri (@ARGV) {
- if (!$_summary) {
- printf("%s %s\n", $PROGRAM, $VERSION) if (! $_html);
- } else {
- $_verbose = 0;
- $_progress = 0;
- }
- # Transform the parameter into a URI
- $uri = &urize($uri);
- &check_uri($uri, 0, $_depth);
- }
- if (($doc_count > 0) && !$_summary) {
- printf("\n%s\n", &global_stats());
- }
+ $_cl = 1;
+ # Parse command line
+ &parse_arguments();
+ if ($_user && (! $_password)) {
+ &ask_password();
+ }
+ my $uri;
+ foreach $uri (@ARGV) {
+ if (!$_summary) {
+ printf("%s %s\n", $PROGRAM, $VERSION) if (! $_html);
+ } else {
+ $_verbose = 0;
+ $_progress = 0;
+ }
+ # Transform the parameter into a URI
+ $uri = &urize($uri);
+ &check_uri($uri, 0, $_depth);
+ }
+ if (($doc_count > 0) && !$_summary) {
+ printf("\n%s\n", &global_stats());
+ }
} else {
- if (0) { # Uncomment if you want to replay HTTP requests in the debugger.
- if ($ARGV[0] eq 'DEBUG') {
- open (OUT, "/tmp/checklink.pl.log");
- foreach my $e (<OUT>) {
- chomp $e;
- my ($key, $value) = split(': ', $e, 2);
- $ENV{$key} = $value;
- print "$key: $value\n";
- }
- close OUT;
- } else {
- open (OUT, ">/tmp/checklink.pl.log");
- use POSIX qw(strftime);
- foreach my $e (keys %ENV) {
- print OUT "$e: $ENV{$e}\n";
- }
- my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime;
- print OUT "Run-Date: $now_string\n";
- close OUT;
- }
- }
-
- use CGI ();
- use CGI::Carp qw(fatalsToBrowser);
- $query = new CGI;
- # Set a few parameters in CGI mode
- $_cl = 0;
- $_verbose = 0;
- $_progress = 0;
- if ($query->param('summary')) {
- $_summary = 1;
+ if (0) { # Uncomment if you want to replay HTTP requests in the debugger.
+ if ($ARGV[0] eq 'DEBUG') {
+ open (OUT, "/tmp/checklink.pl.log");
+ foreach my $e (<OUT>) {
+ chomp $e;
+ my ($key, $value) = split(': ', $e, 2);
+ $ENV{$key} = $value;
+ print "$key: $value\n";
+ }
+ close OUT;
} else {
- }
- if ($query->param('hide_redirects')) {
- $_redirects = 0;
- }
- if ($query->param('no_accept_language')) {
- $_accept_language = 0;
- }
- if ($query->param('hide_dir_redirects')) {
- $_dir_redirects = 0;
- }
- if ($query->param('recursive')) {
- if ($_depth == 0) {
- $_depth = -1;
- }
- }
- if ($query->param('depth') && ($query->param('depth') != 0)) {
- $_depth = $query->param('depth');
- }
- $_html = 1;
- my $uri;
- if ($query->param('uri')) {
- $uri = $query->param('uri');
- } elsif ($query->param('url')) {
- $uri = $query->param('url');
+ open (OUT, ">/tmp/checklink.pl.log");
+ use POSIX qw(strftime);
+ foreach my $e (keys %ENV) {
+ print OUT "$e: $ENV{$e}\n";
+ }
+ my $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime;
+ print OUT "Run-Date: $now_string\n";
+ close OUT;
+ }
+ }
+
+ use CGI ();
+ use CGI::Carp qw(fatalsToBrowser);
+ $query = new CGI;
+ # Set a few parameters in CGI mode
+ $_cl = 0;
+ $_verbose = 0;
+ $_progress = 0;
+ if ($query->param('summary')) {
+ $_summary = 1;
+ } else {
+ }
+ if ($query->param('hide_redirects')) {
+ $_redirects = 0;
+ }
+ if ($query->param('no_accept_language')) {
+ $_accept_language = 0;
+ }
+ if ($query->param('hide_dir_redirects')) {
+ $_dir_redirects = 0;
+ }
+ if ($query->param('recursive')) {
+ if ($_depth == 0) {
+ $_depth = -1;
+ }
+ }
+ if ($query->param('depth') && ($query->param('depth') != 0)) {
+ $_depth = $query->param('depth');
+ }
+ $_html = 1;
+ my $uri;
+ if ($query->param('uri')) {
+ $uri = $query->param('uri');
+ } elsif ($query->param('url')) {
+ $uri = $query->param('url');
+ } else {
+ &print_form($query);
+ }
+ $uri =~ s/^\s+//g;
+ if ($uri =~ m/^file:/) {
+ # Only the http scheme is allowed
+ &file_uri($uri);
+ } elsif (!($uri =~ m/:/)) {
+ if ($uri =~ m|^//|) {
+ $uri = 'http:'.$uri;
} else {
- &print_form($query);
- }
- $uri =~ s/^\s+//g;
- if ($uri =~ m/^file:/) {
- # Only the http scheme is allowed
- &file_uri($uri);
- } elsif (!($uri =~ m/:/)) {
- if ($uri =~ m|^//|) {
- $uri = 'http:'.$uri;
- } else {
- $uri = 'http://'.$uri;
- }
+ $uri = 'http://'.$uri;
}
- &check_uri($uri, 1, $_depth);
- &html_footer();
+ }
+ &check_uri($uri, 1, $_depth);
+ &html_footer();
}
###############################################################################
@@ -195,49 +195,49 @@ if ($#ARGV >= 0 && !(@ARGV == 1 && $ARGV[0] eq 'DEBUG')) {
sub parse_arguments ()
{
- use Getopt::Long 2.17 qw(GetOptions);
- Getopt::Long::Configure('no_ignore_case');
- my @masq = ();
-
- GetOptions('help' => \&usage,
- 'q|quiet' => sub { $_quiet = 1; $_summary = 1; },
- 's|summary' => \$_summary,
- 'b|broken' => sub { $_redirects = 0; },
- 'e|dir-redirects' => sub { $_dir_redirects = 0; },
- 'v|verbose' => \$_verbose,
- 'i|indicator' => \$_progress,
- 'h|html' => \$_html,
- 'n|noacclanguage' => sub { $_accept_language = 0; },
- 'r|recursive' => sub { $_depth = -1 if $_depth == 0; },
- 'l|location=s' => \$_base_location,
- 'u|user=s' => \$_user,
- 'p|password=s' => \$_password,
- 't|timeout=i' => \$_timeout,
- 'L|languages=s' => \$_languages,
- 'D|depth=i' => sub { $_depth = $_[1] unless $_[1] == 0; },
- 'd|domain=s' => \$_trusted,
- 'y|proxy=s' => \$_http_proxy,
- 'masquerade' => \@masq,
- 'hide-same-realm' => \$_hide_same_realm,
- 'V|version' => \&version,
- );
-
- if (@masq) {
- $_masquerade = 1;
- $_local_dir = shift(@masq);
- $_remote_masqueraded_uri = shift(@masq);
- }
+ use Getopt::Long 2.17 qw(GetOptions);
+ Getopt::Long::Configure('no_ignore_case');
+ my @masq = ();
+
+ GetOptions('help' => \&usage,
+ 'q|quiet' => sub { $_quiet = 1; $_summary = 1; },
+ 's|summary' => \$_summary,
+ 'b|broken' => sub { $_redirects = 0; },
+ 'e|dir-redirects' => sub { $_dir_redirects = 0; },
+ 'v|verbose' => \$_verbose,
+ 'i|indicator' => \$_progress,
+ 'h|html' => \$_html,
+ 'n|noacclanguage' => sub { $_accept_language = 0; },
+ 'r|recursive' => sub { $_depth = -1 if $_depth == 0; },
+ 'l|location=s' => \$_base_location,
+ 'u|user=s' => \$_user,
+ 'p|password=s' => \$_password,
+ 't|timeout=i' => \$_timeout,
+ 'L|languages=s' => \$_languages,
+ 'D|depth=i' => sub { $_depth = $_[1] unless $_[1] == 0; },
+ 'd|domain=s' => \$_trusted,
+ 'y|proxy=s' => \$_http_proxy,
+ 'masquerade' => \@masq,
+ 'hide-same-realm' => \$_hide_same_realm,
+ 'V|version' => \&version,
+ );
+
+ if (@masq) {
+ $_masquerade = 1;
+ $_local_dir = shift(@masq);
+ $_remote_masqueraded_uri = shift(@masq);
+ }
}
sub version ()
{
- print STDERR "$PROGRAM $VERSION\n";
- exit(0);
+ print STDERR "$PROGRAM $VERSION\n";
+ exit(0);
}
sub usage ()
{
- print STDERR "$PROGRAM $VERSION
+ print STDERR "$PROGRAM $VERSION
Usage: checklink <options> <uris>
Options:
@@ -279,16 +279,16 @@ Please send bug reports and comments to the www-validator mailing list:
www-validator\@w3.org (with 'checklink' in the subject)
Archives are at: http://lists.w3.org/Archives/Public/www-validator/
";
- exit(0);
+ exit(0);
}
sub ask_password ()
{
- printf(STDERR 'Enter the password for user %s: ', $_user);
- $Have_ReadKey ? ReadMode('noecho', *STDIN) : system('stty -echo');
- chomp($_password = <STDIN>);
- $Have_ReadKey ? ReadMode('restore', *STDIN) : system('stty echo');
- print(STDERR "ok.\n");
+ printf(STDERR 'Enter the password for user %s: ', $_user);
+ $Have_ReadKey ? ReadMode('noecho', *STDIN) : system('stty -echo');
+ chomp($_password = <STDIN>);
+ $Have_ReadKey ? ReadMode('restore', *STDIN) : system('stty echo');
+ print(STDERR "ok.\n");
}
###############################################################################
@@ -299,12 +299,12 @@ sub ask_password ()
sub urize ($)
{
- use URI ();
- use URI::Escape qw(uri_unescape);
- use URI::file ();
+ use URI ();
+ use URI::Escape qw(uri_unescape);
+ use URI::file ();
- my $u = URI->new_abs(uri_unescape($_[0]), URI::file->cwd());
- return $u->as_string();
+ my $u = URI->new_abs(uri_unescape($_[0]), URI::file->cwd());
+ return $u->as_string();
}
########################################
@@ -313,246 +313,246 @@ sub urize ($)
sub check_uri ($$$)
{
- my ($uri, $html_header, $depth) = @_;
- # If $html_header equals 1, we need to generate a HTML header (first
- # instance called in HTML mode).
+ my ($uri, $html_header, $depth) = @_;
+ # If $html_header equals 1, we need to generate a HTML header (first
+ # instance called in HTML mode).
- my $start;
- if (! $_quiet) {
- $start = &get_timestamp();
- }
+ my $start;
+ if (! $_quiet) {
+ $start = &get_timestamp();
+ }
- # Get and parse the document
- my $response = &get_document('GET', $uri, $doc_count, \%redirects);
+ # Get and parse the document
+ my $response = &get_document('GET', $uri, $doc_count, \%redirects);
- # Can we check the resource? If not, we exit here...
- if (defined($response->{Stop})) {
- return(-1);
- }
+ # Can we check the resource? If not, we exit here...
+ if (defined($response->{Stop})) {
+ return(-1);
+ }
- # We are checking a new document
- $doc_count++;
+ # We are checking a new document
+ $doc_count++;
- if ($_html) {
- if ($html_header) {
- &html_header($uri);
- }
- print('<h2>');
+ if ($_html) {
+ if ($html_header) {
+ &html_header($uri);
}
+ print('<h2>');
+ }
- my $absolute_uri = $response->{absolute_uri}->as_string();
+ my $absolute_uri = $response->{absolute_uri}->as_string();
- my $result_anchor = 'results'.$doc_count;
+ my $result_anchor = 'results'.$doc_count;
- printf("\nProcessing\t%s\n\n", $_html ? &show_url(&encode($absolute_uri))
- : $absolute_uri);
+ printf("\nProcessing\t%s\n\n", $_html ? &show_url(&encode($absolute_uri))
+ : $absolute_uri);
- if ($_html) {
- print("</h2>\n");
- if (! $_summary) {
- use URI::Escape ();
- printf("<p>Go to <a href='#%s'>the results</a>.</p>\n",
- $result_anchor);
- printf("<p>Check also:
+ if ($_html) {
+ print("</h2>\n");
+ if (! $_summary) {
+ use URI::Escape ();
+ printf("<p>Go to <a href='#%s'>the results</a>.</p>\n",
+ $result_anchor);
+ printf("<p>Check also:
<a href=\"check?uri=%s\">HTML Validity</a> &amp;
<a href=\"http://jigsaw.w3.org/css-validator/validator?uri=%s\">CSS
Validity</a></p>
<p>Back to the <a href=\"checklink\">link checker</a>.</p>\n",
- map{&encode(URI::Escape::uri_escape($absolute_uri,
- "^A-Za-z0-9."))}(1..2));
- print("<pre>\n");
- }
- }
-
- # Record that we have processed this resource
- $processed{$absolute_uri} = 1;
- # Parse the document
- my $p = &parse_document($uri, $absolute_uri,
- $response->content(), 1,
- $depth != 0);
- my $base = URI->new($p->{base});
-
- # Check anchors
- ###############
-
+ map{&encode(URI::Escape::uri_escape($absolute_uri,
+ "^A-Za-z0-9."))}(1..2));
+ print("<pre>\n");
+ }
+ }
+
+ # Record that we have processed this resource
+ $processed{$absolute_uri} = 1;
+ # Parse the document
+ my $p = &parse_document($uri, $absolute_uri,
+ $response->content(), 1,
+ $depth != 0);
+ my $base = URI->new($p->{base});
+
+ # Check anchors
+ ###############
+
+ if (! $_summary) {
+ print("Checking anchors:\n");
+ }
+ my %errors;
+ my $anchor;
+ foreach $anchor (keys %{$p->{Anchors}}) {
+ my $times;
+ my $l;
+ foreach $l (keys %{$p->{Anchors}{$anchor}}) {
+ $times += $p->{Anchors}{$anchor}{$l};
+ }
+ # They should appear only once
+ if ($times > 1) {
+ $errors{$anchor} = 1;
+ }
+ # Empty IDREF's are not allowed
+ if ($anchor eq '') {
+ $errors{$anchor} = 1;
+ }
+ }
+ if (! $_summary) {
+ print(" done.\n");
+ }
+
+ # Check links
+ #############
+
+ my %links;
+ # Record all the links found
+ my $link;
+ foreach $link (keys %{$p->{Links}}) {
+ my $link_uri = URI->new($link);
+ my $abs_link_uri = URI->new_abs($link_uri, $base);
+ if ($_masquerade) {
+ if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) {
+ printf("processing %s in base $_local_dir %s\n",
+ $abs_link_uri);
+ my $nlink;
+ $nlink = $abs_link_uri;
+ $nlink =~
+ s|^$_remote_masqueraded_uri|file://localhost$_local_dir|;
+ $abs_link_uri = URI->new($nlink);
+ };
+ }
+ my $lines;
+ foreach $lines (keys %{$p->{Links}{$link}}) {
+ my $canonical = URI->new($abs_link_uri->canonical());
+ my $url = $canonical->scheme().':'.$canonical->opaque();
+ my $fragment = $canonical->fragment();
+ if (! $fragment) {
+ # Document without fragment
+ $links{$url}{location}{$lines} = 1;
+ } else {
+ # Resource with a fragment
+ $links{$url}{fragments}{$fragment}{$lines} = 1;
+ }
+ }
+ }
+
+ # Build the list of broken URI's
+ my %broken;
+ my $u;
+ foreach $u (keys %links) {
+ # Don't check mailto: URI's
+ next if ($u =~ m/^mailto:/);
if (! $_summary) {
- print("Checking anchors:\n");
- }
- my %errors;
- my $anchor;
- foreach $anchor (keys %{$p->{Anchors}}) {
- my $times;
- my $l;
- foreach $l (keys %{$p->{Anchors}{$anchor}}) {
- $times += $p->{Anchors}{$anchor}{$l};
- }
- # They should appear only once
- if ($times > 1) {
- $errors{$anchor} = 1;
- }
- # Empty IDREF's are not allowed
- if ($anchor eq '') {
- $errors{$anchor} = 1;
- }
- }
- if (! $_summary) {
- print(" done.\n");
- }
-
- # Check links
- #############
-
- my %links;
- # Record all the links found
- my $link;
- foreach $link (keys %{$p->{Links}}) {
- my $link_uri = URI->new($link);
- my $abs_link_uri = URI->new_abs($link_uri, $base);
- if ($_masquerade) {
- if ($abs_link_uri =~ m|^$_remote_masqueraded_uri|) {
- printf("processing %s in base $_local_dir %s\n",
- $abs_link_uri);
- my $nlink;
- $nlink = $abs_link_uri;
- $nlink =~
- s|^$_remote_masqueraded_uri|file://localhost$_local_dir|;
- $abs_link_uri = URI->new($nlink);
- };
- }
- my $lines;
- foreach $lines (keys %{$p->{Links}{$link}}) {
- my $canonical = URI->new($abs_link_uri->canonical());
- my $url = $canonical->scheme().':'.$canonical->opaque();
- my $fragment = $canonical->fragment();
- if (! $fragment) {
- # Document without fragment
- $links{$url}{location}{$lines} = 1;
- } else {
- # Resource with a fragment
- $links{$url}{fragments}{$fragment}{$lines} = 1;
- }
- }
- }
-
- # Build the list of broken URI's
- my %broken;
- my $u;
- foreach $u (keys %links) {
- # Don't check mailto: URI's
- next if ($u =~ m/^mailto:/);
- if (! $_summary) {
- &hprintf("Checking link %s\n", $u);
- }
- # Check that a link is valid
- &check_validity($uri, $u, \%links, \%redirects);
+ &hprintf("Checking link %s\n", $u);
+ }
+ # Check that a link is valid
+ &check_validity($uri, $u, \%links, \%redirects);
+ if ($_verbose) {
+ &hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
+ }
+ if ($results{$u}{location}{success}) {
+ my $fragment;
+ # Even though it was not broken, we might want to display it
+ # on the results page (e.g. because it required authentication)
+ if ($results{$u}{location}{display} >= 400) {
+ $broken{$u}{location} = 1;
+ }
+ # List the broken fragments
+ foreach $fragment (keys %{$links{$u}{fragments}}) {
if ($_verbose) {
- &hprintf("\tReturn code: %s\n", $results{$u}{location}{code});
+ &hprintf("\t\t%s %s - Lines: %s\n",
+ $fragment,
+ ($results{$u}{fragments}{$fragment}
+ ? 'OK' : 'Not found'),
+ join(',',
+ keys %{$links{$u}{fragments}{$fragment}})
+ );
}
- if ($results{$u}{location}{success}) {
- my $fragment;
- # Even though it was not broken, we might want to display it
- # on the results page (e.g. because it required authentication)
- if ($results{$u}{location}{display} >= 400) {
- $broken{$u}{location} = 1;
- }
- # List the broken fragments
- foreach $fragment (keys %{$links{$u}{fragments}}) {
- if ($_verbose) {
- &hprintf("\t\t%s %s - Lines: %s\n",
- $fragment,
- ($results{$u}{fragments}{$fragment}
- ? 'OK' : 'Not found'),
- join(',',
- keys %{$links{$u}{fragments}{$fragment}})
- );
- }
- # A broken fragment?
- if ($results{$u}{fragments}{$fragment} == 0) {
- $broken{$u}{fragments}{$fragment} += 2;
- }
- }
- } else {
- # Couldn't find the document
- $broken{$u}{location} = 1;
- # All the fragments associated are hence broken
- my $fragment;
- foreach $fragment (keys %{$links{$u}{fragments}}) {
- $broken{$u}{fragments}{$fragment}++;
- }
+ # A broken fragment?
+ if ($results{$u}{fragments}{$fragment} == 0) {
+ $broken{$u}{fragments}{$fragment} += 2;
}
- }
- if (! $_summary) {
- my $stop = &get_timestamp();
- &hprintf("Processed in %ss.\n", &time_diff($start, $stop));
- }
+ }
+ } else {
+ # Couldn't find the document
+ $broken{$u}{location} = 1;
+ # All the fragments associated are hence broken
+ my $fragment;
+ foreach $fragment (keys %{$links{$u}{fragments}}) {
+ $broken{$u}{fragments}{$fragment}++;
+ }
+ }
+ }
+ if (! $_summary) {
+ my $stop = &get_timestamp();
+ &hprintf("Processed in %ss.\n", &time_diff($start, $stop));
+ }
- # Display results
- if ($_html) {
- if (! $_summary) {
- print("</pre>\n");
- printf("<h2><a name='%s'>Results</a></h2>\n", $result_anchor);
- }
- }
- if (! $_quiet) {
- print "\n";
+ # Display results
+ if ($_html) {
+ if (! $_summary) {
+ print("</pre>\n");
+ printf("<h2><a name='%s'>Results</a></h2>\n", $result_anchor);
+ }
+ }
+ if (! $_quiet) {
+ print "\n";
+ }
+ &anchors_summary($p->{Anchors}, \%errors);
+ &links_summary(\%links, \%results, \%broken, \%redirects);
+
+ # Do we want to process other documents?
+ if ($depth != 0) {
+ if ($_base_location eq '.') {
+ # Get the name of the original directory
+ # e.g. http://www.w3.org/TR/html4/Overview.html
+ # should return http://www.w3.org/TR/html4/
+ $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
+ $_base_location = $1;
}
- &anchors_summary($p->{Anchors}, \%errors);
- &links_summary(\%links, \%results, \%broken, \%redirects);
-
- # Do we want to process other documents?
- if ($depth != 0) {
- if ($_base_location eq '.') {
- # Get the name of the original directory
- # e.g. http://www.w3.org/TR/html4/Overview.html
- # should return http://www.w3.org/TR/html4/
- $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
- $_base_location = $1;
+ foreach $u (keys %links) {
+ next if (! (# Check if it's in our scope for recursion
+ ($u =~ m|^$_base_location|) &&
+ # and the link is not broken
+ $results{$u}{location}{success} &&
+ # And it is a text/html or application/xhtml+xml
+ # resource
+ (($results{$u}{location}{type} =~ m|text/html|) ||
+ ($results{$u}{location}{type}
+ =~ m|application/xhtml\+xml|))
+ )
+ );
+ # Check if we have already processed the URI
+ next if (&already_processed($u) != 0);
+ # Do the job
+ print "\n";
+ if (! $_html) {
+ my $i = 40;
+ while ($i--) {
+ print('-');
}
- foreach $u (keys %links) {
- next if (! (# Check if it's in our scope for recursion
- ($u =~ m|^$_base_location|) &&
- # and the link is not broken
- $results{$u}{location}{success} &&
- # And it is a text/html or application/xhtml+xml
- # resource
- (($results{$u}{location}{type} =~ m|text/html|) ||
- ($results{$u}{location}{type}
- =~ m|application/xhtml\+xml|))
- )
- );
- # Check if we have already processed the URI
- next if (&already_processed($u) != 0);
- # Do the job
- print "\n";
- if (! $_html) {
- my $i = 40;
- while ($i--) {
- print('-');
- }
- } else {
- # For the online version, wait for a while to avoid abuses
- if (!$_cl) {
- if ($doc_count == $_max_documents) {
- print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
- }
- if ($doc_count >= $_max_documents) {
- $doc_count++;
- print("<p>Not checking <strong>$u</strong></p>\n");
- $processed{$u} = 1;
- next;
- }
- }
- print('<hr>');
- sleep($_sleep_time);
- }
- print "\n";
- if ($depth < 0) {
- &check_uri($u, 0, -1);
- } else {
- &check_uri($u, 0, $depth-1);
- }
+ } else {
+ # For the online version, wait for a while to avoid abuses
+ if (!$_cl) {
+ if ($doc_count == $_max_documents) {
+ print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
+ }
+ if ($doc_count >= $_max_documents) {
+ $doc_count++;
+ print("<p>Not checking <strong>$u</strong></p>\n");
+ $processed{$u} = 1;
+ next;
+ }
}
- }
+ print('<hr>');
+ sleep($_sleep_time);
+ }
+ print "\n";
+ if ($depth < 0) {
+ &check_uri($u, 0, -1);
+ } else {
+ &check_uri($u, 0, $depth-1);
+ }
+ }
+ }
}
#######################################
@@ -561,71 +561,71 @@ Validity</a></p>
sub get_document ($$$;\%)
{
- my ($method, $uri, $in_recursion, $redirects) = @_;
- # $method contains the HTTP method the use (GET or HEAD)
- # $uri contains the identifier of the resource
- # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least
- # the second resource checked)
- # $redirects is a pointer to the hash containing the map of the redirects
-
- # Get the resource
- my $response;
- if (defined($results{$uri}{response})
- && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
- $response = $results{$uri}{response};
- } else {
- $response = &get_uri($method, $uri);
- &record_results($uri, $method, $response);
- &record_redirects($redirects, $response->{Redirects});
- }
- if (! $response->is_success()) {
- if (! $in_recursion) {
- # Is it too late to request authentication?
- if ($response->code() == 401) {
- &authentication($response);
- } else {
- if ($_html) {
- &html_header($uri);
- }
- &hprintf("\nError: %d %s\n",
- $response->code(), $response->message());
- }
- }
- $response->{Stop} = 1;
- return($response);
- }
-
- # What is the URI of the resource that we are processing by the way?
- my $base_uri = URI->new($response->base());
- my $request_uri = URI->new($response->request->url);
- $response->{absolute_uri} = $base_uri->abs($request_uri);
-
- # Can we parse the document?
- my $failed_reason;
- if (! (($response->header('Content-Type') =~ m|text/html|) ||
- ($response->header('Content-Type') =~ m|application/xhtml\+xml|))
- ) {
- $failed_reason = "Content-Type is '".
- $response->header('Content-Type')."'";
- } elsif ($response->header('Content-Encoding')
- && ($response->header('Content-Encoding') ne 'identity')) {
- $failed_reason = "Content-Encoding is '".
- $response->header('Content-encoding')."'";
- }
- if ($failed_reason) {
- # No, there is a problem...
- if (! $in_recursion) {
- if ($_html) {
- &html_header($uri);
- }
- &hprintf("Can't check links: %s.\n",
- $failed_reason);
+ my ($method, $uri, $in_recursion, $redirects) = @_;
+ # $method contains the HTTP method the use (GET or HEAD)
+ # $uri contains the identifier of the resource
+ # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least
+ # the second resource checked)
+ # $redirects is a pointer to the hash containing the map of the redirects
+
+ # Get the resource
+ my $response;
+ if (defined($results{$uri}{response})
+ && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+ $response = $results{$uri}{response};
+ } else {
+ $response = &get_uri($method, $uri);
+ &record_results($uri, $method, $response);
+ &record_redirects($redirects, $response->{Redirects});
+ }
+ if (! $response->is_success()) {
+ if (! $in_recursion) {
+ # Is it too late to request authentication?
+ if ($response->code() == 401) {
+ &authentication($response);
+ } else {
+ if ($_html) {
+ &html_header($uri);
}
- $response->{Stop} = 1;
+ &hprintf("\nError: %d %s\n",
+ $response->code(), $response->message());
+ }
}
-
- # Ok, return the information
+ $response->{Stop} = 1;
return($response);
+ }
+
+ # What is the URI of the resource that we are processing by the way?
+ my $base_uri = URI->new($response->base());
+ my $request_uri = URI->new($response->request->url);
+ $response->{absolute_uri} = $base_uri->abs($request_uri);
+
+ # Can we parse the document?
+ my $failed_reason;
+ if (! (($response->header('Content-Type') =~ m|text/html|) ||
+ ($response->header('Content-Type') =~ m|application/xhtml\+xml|))
+ ) {
+ $failed_reason = "Content-Type is '".
+ $response->header('Content-Type')."'";
+ } elsif ($response->header('Content-Encoding')
+ && ($response->header('Content-Encoding') ne 'identity')) {
+ $failed_reason = "Content-Encoding is '".
+ $response->header('Content-encoding')."'";
+ }
+ if ($failed_reason) {
+ # No, there is a problem...
+ if (! $in_recursion) {
+ if ($_html) {
+ &html_header($uri);
+ }
+ &hprintf("Can't check links: %s.\n",
+ $failed_reason);
+ }
+ $response->{Stop} = 1;
+ }
+
+ # Ok, return the information
+ return($response);
}
##################################################
@@ -634,20 +634,20 @@ sub get_document ($$$;\%)
sub already_processed ($)
{
- my ($uri, %redirects) = @_;
- # Don't be verbose for that part...
- my $summary_value = $_summary;
- $_summary = 1;
- # Do a GET: if it fails, we stop, if not, the results are cached
- my $response = &get_document('GET', $uri, 1);
- # ... but just for that part
- $_summary = $summary_value;
- # Can we process the resource?
- return(-1) if (defined($response->{Stop}));
- # Have we already processed it?
- return(1) if (defined($processed{$response->{absolute_uri}->as_string()}));
- # It's not processed yet and it is processable: return 0
- return(0);
+ my ($uri, %redirects) = @_;
+ # Don't be verbose for that part...
+ my $summary_value = $_summary;
+ $_summary = 1;
+ # Do a GET: if it fails, we stop, if not, the results are cached
+ my $response = &get_document('GET', $uri, 1);
+ # ... but just for that part
+ $_summary = $summary_value;
+ # Can we process the resource?
+ return(-1) if (defined($response->{Stop}));
+ # Have we already processed it?
+ return(1) if (defined($processed{$response->{absolute_uri}->as_string()}));
+ # It's not processed yet and it is processable: return 0
+ return(0);
}
############################
@@ -656,120 +656,120 @@ sub already_processed ($)
sub W3C::UserAgent::simple_request
{
- my $self = shift;
- my $response = $self->W3C::UserAgent::SUPER::simple_request(@_);
- if (! defined($self->{FirstResponse})) {
- $self->{FirstResponse} = $response->code();
- $self->{FirstMessage} = $response->message();
- }
- return $response;
+ my $self = shift;
+ my $response = $self->W3C::UserAgent::SUPER::simple_request(@_);
+ if (! defined($self->{FirstResponse})) {
+ $self->{FirstResponse} = $response->code();
+ $self->{FirstMessage} = $response->message();
+ }
+ return $response;
}
sub W3C::UserAgent::redirect_ok
{
- my ($self, $request) = @_;
+ my ($self, $request) = @_;
- if (! ($_summary || (!$doc_count && $_html))) {
- &hprintf("\n%s %s ", $request->method(), $request->uri());
- }
+ if (! ($_summary || (!$doc_count && $_html))) {
+ &hprintf("\n%s %s ", $request->method(), $request->uri());
+ }
- # Build a map of redirects
- $self->{Redirects}{$self->{fetching}} = $request->uri();
- $self->{fetching} = $request->uri();
+ # Build a map of redirects
+ $self->{Redirects}{$self->{fetching}} = $request->uri();
+ $self->{fetching} = $request->uri();
- return 0 if $request->method() eq "POST";
- return 1;
+ return 0 if $request->method() eq "POST";
+ return 1;
}
sub get_uri ($$;$\%$$$$)
{
- # Here we have a lot of extra parameters in order not to lose information
- # if the function is called several times (401's)
- my ($method, $uri, $start, $redirects, $code, $realm, $message,
- $auth) = @_;
- # $method contains the method used
- # $uri contains the target of the request
- # $start is a timestamp (not defined the first time the function is
- # called)
- # $redirects is a map of redirects
- # $code is the first HTTP return code
- # $realm is the realm of the request
- # $message is the HTTP message received
- # $auth equals 1 if we want to send out authentication information
-
- # For timing purposes
- if (! defined($start)) {
- $start = &get_timestamp();
+ # Here we have a lot of extra parameters in order not to lose information
+ # if the function is called several times (401's)
+ my ($method, $uri, $start, $redirects, $code, $realm, $message,
+ $auth) = @_;
+ # $method contains the method used
+ # $uri contains the target of the request
+ # $start is a timestamp (not defined the first time the function is
+ # called)
+ # $redirects is a map of redirects
+ # $code is the first HTTP return code
+ # $realm is the realm of the request
+ # $message is the HTTP message received
+ # $auth equals 1 if we want to send out authentication information
+
+ # For timing purposes
+ if (! defined($start)) {
+ $start = &get_timestamp();
+ }
+ # Prepare the query
+ my $ua = new W3C::UserAgent;
+ $ua->timeout($_timeout);
+ $ua->agent('W3C-checklink/'.$REVISION.' '.$ua->agent());
+ $ua->env_proxy();
+ if ($_http_proxy) {
+ $ua->proxy('http', 'http://'.$_http_proxy);
+ }
+ # $ua->{fetching} contains the URI we originally wanted
+ # $ua->{uri} is modified in the case of a redirect; this is used to
+ # build $ua->{Redirects}
+ $ua->{uri} = $ua->{fetching} = $uri;
+ if (defined($redirects)) {
+ $ua->{Redirects} = $redirects;
+ }
+
+ if (! ($_summary || (!$doc_count && $_html))) {
+ &hprintf("%s %s ", $method, $uri);
+ }
+ my $request = new HTTP::Request($method, $uri);
+ if ($_accept_language) {
+ $request->header('Accept-Language' => 'en');
+ }
+ # Are we providing authentication info?
+ if (defined($auth)
+ && ($request->url->host =~ /$_trusted$/)) {
+ if (defined($ENV{HTTP_AUTHORIZATION})) {
+ $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
+ } elsif (defined($_user) && defined($_password)) {
+ $request->authorization_basic($_user, $_password);
+ }
+ }
+ # Do the query
+ my $response = $ua->request($request);
+ # Get the results
+ # Record the very first response
+ if (! defined($code)) {
+ $code = $ua->{FirstResponse};
+ $message = $ua->{FirstMessage};
+ }
+ # Authentication requested?
+ if (($response->code() == 401)
+ && (defined($ENV{HTTP_AUTHORIZATION})
+ || (defined($_user) && defined($_password)))
+ && !defined ($auth)) {
+ # Deal with authentication and avoid loops
+ if (! defined ($realm)) {
+ $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ $realm = $1;
}
- # Prepare the query
- my $ua = new W3C::UserAgent;
- $ua->timeout($_timeout);
- $ua->agent('W3C-checklink/'.$REVISION.' '.$ua->agent());
- $ua->env_proxy();
- if ($_http_proxy) {
- $ua->proxy('http', 'http://'.$_http_proxy);
- }
- # $ua->{fetching} contains the URI we originally wanted
- # $ua->{uri} is modified in the case of a redirect; this is used to
- # build $ua->{Redirects}
- $ua->{uri} = $ua->{fetching} = $uri;
- if (defined($redirects)) {
- $ua->{Redirects} = $redirects;
- }
-
if (! ($_summary || (!$doc_count && $_html))) {
- &hprintf("%s %s ", $method, $uri);
- }
- my $request = new HTTP::Request($method, $uri);
- if ($_accept_language) {
- $request->header('Accept-Language' => 'en');
- }
- # Are we providing authentication info?
- if (defined($auth)
- && ($request->url->host =~ /$_trusted$/)) {
- if (defined($ENV{HTTP_AUTHORIZATION})) {
- $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
- } elsif (defined($_user) && defined($_password)) {
- $request->authorization_basic($_user, $_password);
- }
- }
- # Do the query
- my $response = $ua->request($request);
- # Get the results
- # Record the very first response
- if (! defined($code)) {
- $code = $ua->{FirstResponse};
- $message = $ua->{FirstMessage};
- }
- # Authentication requested?
- if (($response->code() == 401)
- && (defined($ENV{HTTP_AUTHORIZATION})
- || (defined($_user) && defined($_password)))
- && !defined ($auth)) {
- # Deal with authentication and avoid loops
- if (! defined ($realm)) {
- $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
- $realm = $1;
- }
- if (! ($_summary || (!$doc_count && $_html))) {
- print "\n";
- }
- return &get_uri($method, $response->request->url,
- $start, $ua->{Redirects},
- $code, $realm, $message, 1);
- }
- # Record the redirects
- $response->{Redirects} = $ua->{Redirects};
- my $stop = &get_timestamp();
- if (! ($_summary || (!$doc_count && $_html))) {
- &hprintf(" fetched in %ss\n", &time_diff($start, $stop));
- }
- $response->{OriginalCode} = $code;
- $response->{OriginalMessage} = $message;
- if (defined($realm)) {
- $response->{Realm} = $realm;
- }
- return $response;
+ print "\n";
+ }
+ return &get_uri($method, $response->request->url,
+ $start, $ua->{Redirects},
+ $code, $realm, $message, 1);
+ }
+ # Record the redirects
+ $response->{Redirects} = $ua->{Redirects};
+ my $stop = &get_timestamp();
+ if (! ($_summary || (!$doc_count && $_html))) {
+ &hprintf(" fetched in %ss\n", &time_diff($start, $stop));
+ }
+ $response->{OriginalCode} = $code;
+ $response->{OriginalMessage} = $message;
+ if (defined($realm)) {
+ $response->{Realm} = $realm;
+ }
+ return $response;
}
#########################################
@@ -778,44 +778,44 @@ sub get_uri ($$;$\%$$$$)
sub record_results ($$$)
{
- my ($uri, $method, $response) = @_;
- $results{$uri}{response} = $response;
- $results{$uri}{method} = $method;
- $results{$uri}{location}{code} = $response->code();
- $results{$uri}{location}{type} = $response->header('Content-type');
- $results{$uri}{location}{display} = $results{$uri}{location}{code};
- $results{$uri}{location}{orig} = $response->{OriginalCode};
- # Did we get a redirect?
- if ($response->{OriginalCode} != $response->code()) {
- $results{$uri}{location}{orig_message} = $response->{OriginalMessage};
- $results{$uri}{location}{redirected} = 1;
- }
- $results{$uri}{location}{success} = $response->is_success();
- # Stores the authentication information
- if (defined($response->{Realm})) {
- $results{$uri}{location}{realm} = $response->{Realm};
- if (! $_hide_same_realm) {
- $results{$uri}{location}{display} = 401;
- }
- }
- # What type of broken link is it? (stored in {record} - the {display}
- # information is just for visual use only)
- if (($results{$uri}{location}{display} == 401)
- && ($results{$uri}{location}{code} == 404)) {
- $results{$uri}{location}{record} = 404;
- } else {
- $results{$uri}{location}{record} = $results{$uri}{location}{display};
- }
- # Did it fail?
- $results{$uri}{location}{message} = $response->message();
- if (! $results{$uri}{location}{success}) {
- if ($_verbose) {
- &hprintf("Error: %d %s\n",
- $results{$uri}{location}{code},
- $results{$uri}{location}{message});
- }
- return;
- }
+ my ($uri, $method, $response) = @_;
+ $results{$uri}{response} = $response;
+ $results{$uri}{method} = $method;
+ $results{$uri}{location}{code} = $response->code();
+ $results{$uri}{location}{type} = $response->header('Content-type');
+ $results{$uri}{location}{display} = $results{$uri}{location}{code};
+ $results{$uri}{location}{orig} = $response->{OriginalCode};
+ # Did we get a redirect?
+ if ($response->{OriginalCode} != $response->code()) {
+ $results{$uri}{location}{orig_message} = $response->{OriginalMessage};
+ $results{$uri}{location}{redirected} = 1;
+ }
+ $results{$uri}{location}{success} = $response->is_success();
+ # Stores the authentication information
+ if (defined($response->{Realm})) {
+ $results{$uri}{location}{realm} = $response->{Realm};
+ if (! $_hide_same_realm) {
+ $results{$uri}{location}{display} = 401;
+ }
+ }
+ # What type of broken link is it? (stored in {record} - the {display}
+ # information is just for visual use only)
+ if (($results{$uri}{location}{display} == 401)
+ && ($results{$uri}{location}{code} == 404)) {
+ $results{$uri}{location}{record} = 404;
+ } else {
+ $results{$uri}{location}{record} = $results{$uri}{location}{display};
+ }
+ # Did it fail?
+ $results{$uri}{location}{message} = $response->message();
+ if (! $results{$uri}{location}{success}) {
+ if ($_verbose) {
+ &hprintf("Error: %d %s\n",
+ $results{$uri}{location}{code},
+ $results{$uri}{location}{message});
+ }
+ return;
+ }
}
####################
@@ -824,58 +824,58 @@ sub record_results ($$$)
sub parse_document ($$$$;$)
{
- my ($uri, $location, $document, $links, $rec_needs_links) = @_;
-
- my $p;
-
- if (defined($results{$uri}{parsing})) {
- # We have already done the job. Woohoo!
- $p->{base} = $results{$uri}{parsing}{base};
- $p->{Anchors} = $results{$uri}{parsing}{Anchors};
- $p->{Links} = $results{$uri}{parsing}{Links};
- return($p);
- }
-
- my $start;
- $p = W3C::CheckLink->new();
- $p->{base} = $location;
- if (! $_summary) {
- $start = &get_timestamp();
- print("Parsing...\n");
- }
- if (!$_summary || $_progress) {
- $p->{Total} = ($document =~ tr/\n//);
- }
- # We only look for anchors if we are not interested in the links
- # obviously, or if we are running a recursive checking because we
- # might need this information later
- $p->{only_anchors} = !($links || $rec_needs_links);
-
- # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
- # Processing instructions are not parsed by process, but in this case
- # it should be. It's expensive, it's horrible, but it's the easiest way
- # for right now.
- if (!$p->{only_anchors}) {
- $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/;
- }
+ my ($uri, $location, $document, $links, $rec_needs_links) = @_;
- $p->parse($document);
+ my $p;
- if (! $_summary) {
- my $stop = &get_timestamp();
- if ($_progress) {
- print "\r";
- }
- &hprintf(" done (%d lines in %ss).\n",
- $p->{Total}, &time_diff($start, $stop));
+ if (defined($results{$uri}{parsing})) {
+ # We have already done the job. Woohoo!
+ $p->{base} = $results{$uri}{parsing}{base};
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+ $p->{Links} = $results{$uri}{parsing}{Links};
+ return($p);
+ }
+
+ my $start;
+ $p = W3C::CheckLink->new();
+ $p->{base} = $location;
+ if (! $_summary) {
+ $start = &get_timestamp();
+ print("Parsing...\n");
+ }
+ if (!$_summary || $_progress) {
+ $p->{Total} = ($document =~ tr/\n//);
+ }
+ # We only look for anchors if we are not interested in the links
+ # obviously, or if we are running a recursive checking because we
+ # might need this information later
+ $p->{only_anchors} = !($links || $rec_needs_links);
+
+ # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+ # Processing instructions are not parsed by process, but in this case
+ # it should be. It's expensive, it's horrible, but it's the easiest way
+ # for right now.
+ if (!$p->{only_anchors}) {
+ $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/;
+ }
+
+ $p->parse($document);
+
+ if (! $_summary) {
+ my $stop = &get_timestamp();
+ if ($_progress) {
+ print "\r";
}
+ &hprintf(" done (%d lines in %ss).\n",
+ $p->{Total}, &time_diff($start, $stop));
+ }
- # Save the results before exiting
- $results{$uri}{parsing}{base} = $p->{base};
- $results{$uri}{parsing}{Anchors} = $p->{Anchors};
- $results{$uri}{parsing}{Links} = $p->{Links};
+ # Save the results before exiting
+ $results{$uri}{parsing}{base} = $p->{base};
+ $results{$uri}{parsing}{Anchors} = $p->{Anchors};
+ $results{$uri}{parsing}{Links} = $p->{Links};
- return($p);
+ return($p);
}
###################################
@@ -884,28 +884,28 @@ sub parse_document ($$$$;$)
sub W3C::CheckLink::new
{
- my $p = HTML::Parser::new(@_, api_version => 3);
-
- # Start tags
- $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens');
- # Declarations
- $p->handler(declaration =>
- sub {
- my $self = shift;
- $self->declaration(substr($_[0], 2, -1));
- }, 'self, text');
- # Other stuff
- $p->handler(default => 'text', 'self, text');
- # Line count
- $p->{Line} = 1;
- # Check <a [..] name="...">?
- $p->{check_name} = 1;
- # Check <[..] id="..">?
- $p->{check_id} = 1;
- # Don't interpret comment loosely
- $p->strict_comment(1);
-
- return $p;
+ my $p = HTML::Parser::new(@_, api_version => 3);
+
+ # Start tags
+ $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens');
+ # Declarations
+ $p->handler(declaration =>
+ sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));
+ }, 'self, text');
+ # Other stuff
+ $p->handler(default => 'text', 'self, text');
+ # Line count
+ $p->{Line} = 1;
+ # Check <a [..] name="...">?
+ $p->{check_name} = 1;
+ # Check <[..] id="..">?
+ $p->{check_id} = 1;
+ # Don't interpret comment loosely
+ $p->strict_comment(1);
+
+ return $p;
}
#################################################
@@ -914,28 +914,28 @@ sub W3C::CheckLink::new
sub W3C::CheckLink::doctype
{
- my ($self, $dc) = @_;
- if (! $dc) {
- return $self->{doctype};
- }
- $_ = $self->{doctype} = $dc;
-
- # What to look for depending on the doctype
- if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN') {
- $self->{check_name} = 0;
- }
- # Check for the id tag
- if (
- # HTML 2.0 & 3.0
- m%^-//IETF//DTD HTML [23]\.0//% ||
- # HTML 3.2
- m%^-//W3C//DTD HTML 3\.2//%) {
- $self->{check_id} = 0;
- }
- # Enable XML extensions
- if (m%^-//W3C//DTD XHTML %) {
- $self->xml_mode(1);
- }
+ my ($self, $dc) = @_;
+ if (! $dc) {
+ return $self->{doctype};
+ }
+ $_ = $self->{doctype} = $dc;
+
+ # What to look for depending on the doctype
+ if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN') {
+ $self->{check_name} = 0;
+ }
+ # Check for the id tag
+ if (
+ # HTML 2.0 & 3.0
+ m%^-//IETF//DTD HTML [23]\.0//% ||
+ # HTML 3.2
+ m%^-//W3C//DTD HTML 3\.2//%) {
+ $self->{check_id} = 0;
+ }
+ # Enable XML extensions
+ if (m%^-//W3C//DTD XHTML %) {
+ $self->xml_mode(1);
+ }
}
#######################################
@@ -944,12 +944,12 @@ sub W3C::CheckLink::doctype
sub W3C::CheckLink::new_line
{
- my ($self, $string) = @_;
- my $count = ($string =~ tr/\n//);
- $self->{Line} = $self->{Line} + $count;
- if ($_progress) {
- printf("\r%4d%%", int($self->{Line}/$self->{Total}*100));
- }
+ my ($self, $string) = @_;
+ my $count = ($string =~ tr/\n//);
+ $self->{Line} = $self->{Line} + $count;
+ if ($_progress) {
+ printf("\r%4d%%", int($self->{Line}/$self->{Total}*100));
+ }
}
#############################
@@ -958,23 +958,23 @@ sub W3C::CheckLink::new_line
sub W3C::CheckLink::get_anchor
{
- my ($self, $tag, $attr) = @_;
- my $anchor;
-
- if ($self->{check_id}) {
- $anchor = $attr->{id};
- }
- if ($self->{check_name} && ($tag eq 'a')) {
- # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
- # Force an error if it's not the case (or if id's and name's values
- # are different)
- # If id is defined, name if defined must have the same value
- if (!$anchor) {
- $anchor = $attr->{name};
- }
- }
-
- return($anchor);
+ my ($self, $tag, $attr) = @_;
+ my $anchor;
+
+ if ($self->{check_id}) {
+ $anchor = $attr->{id};
+ }
+ if ($self->{check_name} && ($tag eq 'a')) {
+ # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
+ # Force an error if it's not the case (or if id's and name's values
+ # are different)
+ # If id is defined, name if defined must have the same value
+ if (!$anchor) {
+ $anchor = $attr->{name};
+ }
+ }
+
+ return($anchor);
}
###########################
@@ -983,83 +983,83 @@ sub W3C::CheckLink::get_anchor
sub W3C::CheckLink::add_link
{
- my ($self, $uri) = @_;
+ my ($self, $uri) = @_;
- if (defined($uri)) {
- $self->{Links}{$uri}{$self->{Line}}++;
- }
+ if (defined($uri)) {
+ $self->{Links}{$uri}{$self->{Line}}++;
+ }
}
sub W3C::CheckLink::start
{
- my ($self, $tag, $attr, $text) = @_;
-
- # Anchors
- my $anchor = $self->get_anchor($tag, $attr);
- if (defined($anchor)) {
- $self->{Anchors}{$anchor}{$self->{Line}}++;
+ my ($self, $tag, $attr, $text) = @_;
+
+ # Anchors
+ my $anchor = $self->get_anchor($tag, $attr);
+ if (defined($anchor)) {
+ $self->{Anchors}{$anchor}{$self->{Line}}++;
+ }
+
+ # Links
+ if (!$self->{only_anchors}) {
+ # Here, we are checking too many things
+ # The right thing to do is to parse the DTD...
+ if ($tag eq 'base') {
+ if (defined($attr->{href})) {
+ $self->{base} = $attr->{href};
+ }
+ } else {
+ $self->add_link($attr->{href});
}
-
- # Links
- if (!$self->{only_anchors}) {
- # Here, we are checking too many things
- # The right thing to do is to parse the DTD...
- if ($tag eq 'base') {
- if (defined($attr->{href})) {
- $self->{base} = $attr->{href};
- }
- } else {
- $self->add_link($attr->{href});
- }
- $self->add_link($attr->{src});
- if ($tag eq 'object') {
- $self->add_link($attr->{data});
- }
- if ($tag eq 'blockquote') {
- $self->add_link($attr->{cite});
- }
+ $self->add_link($attr->{src});
+ if ($tag eq 'object') {
+ $self->add_link($attr->{data});
}
-
- # Line counting
- if ($text =~ m/\n/) {
- $self->new_line($text);
+ if ($tag eq 'blockquote') {
+ $self->add_link($attr->{cite});
}
+ }
+
+ # Line counting
+ if ($text =~ m/\n/) {
+ $self->new_line($text);
+ }
}
sub W3C::CheckLink::text
{
- my ($self, $text) = @_;
- if (!$_progress) {
- # If we are just extracting information about anchors,
- # parsing this part is only cosmetic (progress indicator)
- return unless !$self->{only_anchors};
- }
- if ($text =~ /\n/) {
- $self->new_line($text);
- }
+ my ($self, $text) = @_;
+ if (!$_progress) {
+ # If we are just extracting information about anchors,
+ # parsing this part is only cosmetic (progress indicator)
+ return unless !$self->{only_anchors};
+ }
+ if ($text =~ /\n/) {
+ $self->new_line($text);
+ }
}
sub W3C::CheckLink::declaration
{
- my ($self, $text) = @_;
- # Extract the doctype
- my @declaration = split(/\s+/, $text, 4);
- if (($#declaration >= 3) &&
- ($declaration[0] eq 'DOCTYPE') &&
- (lc($declaration[1]) eq 'html')) {
- # Parse the doctype declaration
- $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i;
- # Store the doctype
- if ($1) {
- $self->doctype($1);
- }
- # If there is a link to the DTD, record it
- if (!$self->{only_anchors} && $3) {
- $self->{Links}{$3}{$self->{Line}}++;
- }
- }
- return unless !$self->{only_anchors};
- $self->text($text);
+ my ($self, $text) = @_;
+ # Extract the doctype
+ my @declaration = split(/\s+/, $text, 4);
+ if (($#declaration >= 3) &&
+ ($declaration[0] eq 'DOCTYPE') &&
+ (lc($declaration[1]) eq 'html')) {
+ # Parse the doctype declaration
+ $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i;
+ # Store the doctype
+ if ($1) {
+ $self->doctype($1);
+ }
+ # If there is a link to the DTD, record it
+ if (!$self->{only_anchors} && $3) {
+ $self->{Links}{$3}{$self->{Line}}++;
+ }
+ }
+ return unless !$self->{only_anchors};
+ $self->text($text);
}
################################
@@ -1068,100 +1068,100 @@ sub W3C::CheckLink::declaration
sub check_validity ($$\%\%)
{
- my ($testing, $uri, $links, $redirects) = @_;
- # $testing is the URI of the document checked
- # $uri is the URI of the target that we are verifying
- # $links is a hash of the links in the documents checked
- # $redirects is a map of the redirects encountered
-
- # Checking file: URI's is not allowed with a CGI
- if ($testing ne $uri) {
- if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) {
- # Can't test? Return 400 Bad request.
- $results{$uri}{location}{code} = 400;
- $results{$uri}{location}{success} = 0;
- $results{$uri}{location}{message} = 'Error: \'file:\' URI not allowed';
- if ($_verbose) {
- &hprintf("Error: %d %s\n",
- $results{$uri}{location}{code},
- $results{$uri}{location}{message});
- }
- return;
- }
- }
-
- # Get the document with the appropriate method
- my $method;
- my @fragments = keys %{$links->{$uri}{fragments}};
- # Only use GET if there are fragments. HEAD is enough if it's not the
- # case.
- if ($#fragments == -1) {
- $method = 'HEAD';
- } else {
- $method = 'GET';
- }
-
- my $response;
- my $being_processed = 0;
- if ((! defined($results{$uri}))
- || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
- $being_processed = 1;
- $response = &get_uri($method, $uri);
- # Get the information back from get_uri()
- &record_results($uri, $method, $response);
- # Record the redirects
- &record_redirects($redirects, $response->{Redirects});
- }
-
- # We got the response of the HTTP request. Stop here if it was a HEAD.
- if ($#fragments == -1) {
- return;
- }
- # There are fragments. Parse the document.
- my $p;
- if ($being_processed) {
- # Can we really parse the document?
- if (!defined($results{$uri}{location}{type})) {
- return;
- }
- if (! (($results{$uri}{location}{type} =~ m|text/html|i) ||
- ($results{$uri}{location}{type} =~ m|application/xhtml\+xml|i))
- ) {
- if ($_verbose) {
- &hprintf("Can't check content: Content-type is '%s'.\n",
- $response->header('Content-type'));
- }
- return;
- }
- # Do it then
- $p = &parse_document($uri, $response->base(),
- $response->as_string(), 0);
+ my ($testing, $uri, $links, $redirects) = @_;
+ # $testing is the URI of the document checked
+ # $uri is the URI of the target that we are verifying
+ # $links is a hash of the links in the documents checked
+ # $redirects is a map of the redirects encountered
+
+ # Checking file: URI's is not allowed with a CGI
+ if ($testing ne $uri) {
+ if ((! $_cl) && (!($testing =~ m/^file:/)) && ($uri =~ m/^file:/)) {
+ # Can't test? Return 400 Bad request.
+ $results{$uri}{location}{code} = 400;
+ $results{$uri}{location}{success} = 0;
+ $results{$uri}{location}{message} = 'Error: \'file:\' URI not allowed';
+ if ($_verbose) {
+ &hprintf("Error: %d %s\n",
+ $results{$uri}{location}{code},
+ $results{$uri}{location}{message});
+ }
+ return;
+ }
+ }
+
+ # Get the document with the appropriate method
+ my $method;
+ my @fragments = keys %{$links->{$uri}{fragments}};
+ # Only use GET if there are fragments. HEAD is enough if it's not the
+ # case.
+ if ($#fragments == -1) {
+ $method = 'HEAD';
+ } else {
+ $method = 'GET';
+ }
+
+ my $response;
+ my $being_processed = 0;
+ if ((! defined($results{$uri}))
+ || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+ $being_processed = 1;
+ $response = &get_uri($method, $uri);
+ # Get the information back from get_uri()
+ &record_results($uri, $method, $response);
+ # Record the redirects
+ &record_redirects($redirects, $response->{Redirects});
+ }
+
+ # We got the response of the HTTP request. Stop here if it was a HEAD.
+ if ($#fragments == -1) {
+ return;
+ }
+ # There are fragments. Parse the document.
+ my $p;
+ if ($being_processed) {
+ # Can we really parse the document?
+ if (!defined($results{$uri}{location}{type})) {
+ return;
+ }
+ if (! (($results{$uri}{location}{type} =~ m|text/html|i) ||
+ ($results{$uri}{location}{type} =~ m|application/xhtml\+xml|i))
+ ) {
+ if ($_verbose) {
+ &hprintf("Can't check content: Content-type is '%s'.\n",
+ $response->header('Content-type'));
+ }
+ return;
+ }
+ # Do it then
+ $p = &parse_document($uri, $response->base(),
+ $response->as_string(), 0);
+ } else {
+ # We already had the information
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+ }
+ # Check that the fragments exist
+ my $fragment;
+ foreach $fragment (keys %{$links->{$uri}{fragments}}) {
+ if (defined($p->{Anchors}{$fragment})
+ || &escape_match($fragment, $p->{Anchors})) {
+ $results{$uri}{fragments}{$fragment} = 1;
} else {
- # We already had the information
- $p->{Anchors} = $results{$uri}{parsing}{Anchors};
- }
- # Check that the fragments exist
- my $fragment;
- foreach $fragment (keys %{$links->{$uri}{fragments}}) {
- if (defined($p->{Anchors}{$fragment})
- || &escape_match($fragment, $p->{Anchors})) {
- $results{$uri}{fragments}{$fragment} = 1;
- } else {
- $results{$uri}{fragments}{$fragment} = 0;
- }
+ $results{$uri}{fragments}{$fragment} = 0;
}
+ }
}
sub escape_match ($\%)
{
- use URI::Escape ();
- my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
- foreach $b (keys %$hash) {
- if ($a eq URI::Escape::uri_unescape($b)) {
- return(1);
- }
- }
- return(0);
+ use URI::Escape ();
+ my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
+ foreach $b (keys %$hash) {
+ if ($a eq URI::Escape::uri_unescape($b)) {
+ return(1);
+ }
+ }
+ return(0);
}
##########################
@@ -1170,17 +1170,17 @@ sub escape_match ($\%)
sub authentication ($)
{
- my $r = $_[0];
- $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
- my $realm = $1;
- my $authHeader = $r->headers->www_authenticate;
- if ($_cl) {
- printf(STDERR "\nAuthentication is required for %s.\n", $r->request->url);
- printf(STDERR "The realm is %s.\n", $realm);
- print(STDERR "Use the -u and -p options to specify a username and password.\n");
- } else {
- printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html\n\n", $r->headers->www_authenticate);
- printf("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
+ my $r = $_[0];
+ $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ my $realm = $1;
+ my $authHeader = $r->headers->www_authenticate;
+ if ($_cl) {
+ printf(STDERR "\nAuthentication is required for %s.\n", $r->request->url);
+ printf(STDERR "The realm is %s.\n", $realm);
+ print(STDERR "Use the -u and -p options to specify a username and password.\n");
+ } else {
+ printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html\n\n", $r->headers->www_authenticate);
+ printf("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
<html lang=\"en\">
<head>
<title>401 Authorization Required</title>
@@ -1191,7 +1191,7 @@ sub authentication ($)
</body>
</html>
", &encode($realm), $r->request->url);
- }
+ }
}
##################
@@ -1200,18 +1200,18 @@ sub authentication ($)
sub get_timestamp ()
{
- use Time::HiRes ();
- return pack('LL', Time::HiRes::gettimeofday());
+ use Time::HiRes ();
+ return pack('LL', Time::HiRes::gettimeofday());
}
sub time_diff ($$)
{
- my @start = unpack('LL', $_[0]);
- my @stop = unpack('LL', $_[1]);
- for ($start[1], $stop[1]) {
- $_ /= 1_000_000;
- }
- return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
+ my @start = unpack('LL', $_[0]);
+ my @stop = unpack('LL', $_[1]);
+ for ($start[1], $stop[1]) {
+ $_ /= 1_000_000;
+ }
+ return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
}
########################
@@ -1221,33 +1221,33 @@ sub time_diff ($$)
# Record the redirects in a hash
sub record_redirects (\%\%)
{
- my ($redirects, $sub) = @_;
- my $r;
- foreach $r (keys %$sub) {
- $redirects->{$r} = $sub->{$r};
- }
+ my ($redirects, $sub) = @_;
+ my $r;
+ foreach $r (keys %$sub) {
+ $redirects->{$r} = $sub->{$r};
+ }
}
# Determine if a request is redirected
sub is_redirected ($%)
{
- my ($uri, %redirects) = @_;
- return(defined($redirects{$uri}));
+ my ($uri, %redirects) = @_;
+ return(defined($redirects{$uri}));
}
# Get a list of redirects for a URI
sub get_redirects ($%)
{
- my ($uri, %redirects) = @_;
- my @history = ($uri);
- my $origin = $uri;
- my %seen;
- while ($redirects{$uri}) {
- $uri = $redirects{$uri};
- push(@history, $uri);
- last if ($uri eq $origin);
- }
- return(@history);
+ my ($uri, %redirects) = @_;
+ my @history = ($uri);
+ my $origin = $uri;
+ my %seen;
+ while ($redirects{$uri}) {
+ $uri = $redirects{$uri};
+ push(@history, $uri);
+ last if ($uri eq $origin);
+ }
+ return(@history);
}
####################################################
@@ -1256,9 +1256,9 @@ sub get_redirects ($%)
sub sort_unique (@)
{
- my %saw;
- @saw{@_} = ();
- return (sort { $a <=> $b } keys %saw);
+ my %saw;
+ @saw{@_} = ();
+ return (sort { $a <=> $b } keys %saw);
}
#####################
@@ -1267,96 +1267,95 @@ sub sort_unique (@)
sub anchors_summary (\%\%)
{
- my ($anchors, $errors) = @_;
- # Number of anchors found.
- if (! $_quiet) {
- if ($_html) {
- print("<h3>Anchors</h3>\n<p>");
- } else {
- print("Anchors\n\n");
- }
- my @anchors = keys %{$anchors};
- &hprintf("Found %d anchor(s).", $#anchors+1);
- if ($_html) {
- print('</p>');
- }
- print("\n");
- }
- # List of the duplicates, if any.
- my @errors = keys %{$errors};
- if ($#errors < 0) {
- if (! $_quiet && $_html) {
- print "<p>Valid anchors!</p>\n";
- }
- return;
- }
+ my ($anchors, $errors) = @_;
+ # Number of anchors found.
+ if (! $_quiet) {
if ($_html) {
- print('<p>');
+ print("<h3>Anchors</h3>\n<p>");
+ } else {
+ print("Anchors\n\n");
}
- print('List of duplicate and empty anchors:');
+ my @anchors = keys %{$anchors};
+ &hprintf("Found %d anchor(s).", $#anchors+1);
if ($_html) {
- print("</p>\n<table border=\"1\">\n<tr><td><b>Anchors</b></td><td><b>Lines</b></td></tr>");
+ print('</p>');
}
print("\n");
- my $anchor;
- foreach $anchor (@errors) {
- my $format;
- if ($_html) {
- $format = "<tr class=\"broken\"><td>%s</td><td>%s</td></tr>\n";
- } else {
- $format = "\t%s\tLines: %s\n";
- }
- printf($format,
- &encode($anchor eq '' ? 'Empty anchor' : $ anchor),
- join(', ', &sort_unique(keys %{$anchors->{$anchor}})));
- }
+ }
+ # List of the duplicates, if any.
+ my @errors = keys %{$errors};
+ if ($#errors < 0) {
+ if (! $_quiet && $_html) {
+ print "<p>Valid anchors!</p>\n";
+ }
+ return;
+ }
+ if ($_html) {
+ print('<p>');
+ }
+ print('List of duplicate and empty anchors:');
+ if ($_html) {
+ print("</p>\n<table border=\"1\">\n<tr><td><b>Anchors</b></td><td><b>Lines</b></td></tr>");
+ }
+ print("\n");
+ my $anchor;
+ foreach $anchor (@errors) {
+ my $format;
if ($_html) {
- print("</table>\n");
- }
+ $format = "<tr class=\"broken\"><td>%s</td><td>%s</td></tr>\n";
+ } else {
+ $format = "\t%s\tLines: %s\n";
+ }
+ printf($format,
+ &encode($anchor eq '' ? 'Empty anchor' : $ anchor),
+ join(', ', &sort_unique(keys %{$anchors->{$anchor}})));
+ }
+ if ($_html) {
+ print("</table>\n");
+ }
}
sub show_link_report (\%\%\%\%\@;$\%)
{
- my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
-
- if ($_html) {
- print("\n<dl class=\"report\">");
- }
- print("\n");
-
- # Process each URL
- my $u;
- my ($c, $previous_c);
- foreach $u (@{$urls}) {
- my @fragments = keys %{$broken->{$u}{fragments}};
- # Did we get a redirect?
- my $redirected = &is_redirected($u, %$redirects);
- # List of lines
- my @total_lines;
- my ($f, $l);
- foreach $l (keys %{$links->{$u}{location}}) {
- push (@total_lines, $l);
- }
- foreach $f (keys %{$links->{$u}{fragments}}) {
- if ($f eq $u) {
- next if (defined($links->{$u}{$u}{-1}));
- }
- my $l;
- foreach $l (keys %{$links->{$u}{fragments}{$f}}) {
- push (@total_lines, $l);
- }
- }
- my $lines_list = join(', ',
- &sort_unique(@total_lines));
- # Error type
- $c = &code_shown($u, $results);
- # What to do
- my $whattodo;
- my $redirect_too;
- if ($todo) {
- if ($u =~ m/^javascript:/) {
- if ($_html) {
- $whattodo =
+ my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
+
+ if ($_html) {
+ print("\n<dl class=\"report\">");
+ }
+ print("\n");
+
+ # Process each URL
+ my $u;
+ my ($c, $previous_c);
+ foreach $u (@{$urls}) {
+ my @fragments = keys %{$broken->{$u}{fragments}};
+ # Did we get a redirect?
+ my $redirected = &is_redirected($u, %$redirects);
+ # List of lines
+ my @total_lines;
+ my ($f, $l);
+ foreach $l (keys %{$links->{$u}{location}}) {
+ push (@total_lines, $l);
+ }
+ foreach $f (keys %{$links->{$u}{fragments}}) {
+ if ($f eq $u) {
+ next if (defined($links->{$u}{$u}{-1}));
+ }
+ my $l;
+ foreach $l (keys %{$links->{$u}{fragments}{$f}}) {
+ push (@total_lines, $l);
+ }
+ }
+ my $lines_list = join(', ', &sort_unique(@total_lines));
+ # Error type
+ $c = &code_shown($u, $results);
+ # What to do
+ my $whattodo;
+ my $redirect_too;
+ if ($todo) {
+ if ($u =~ m/^javascript:/) {
+ if ($_html) {
+ $whattodo =
'You must change this link: people using a browser without Javascript support
will <em>not</em> be able to follow this link. See the
<a href="http://www.w3.org/TR/1999/WAI-WEBCONTENT-19990505/#tech-scripts">Web
@@ -1364,295 +1363,293 @@ Content Accessibility Guidelines on the use of scripting on the Web</a> and
the
<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
on how to solve this</a>.';
- } else {
- $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.';
- }
- } elsif ($c == 500) {
- # 500's could be a real 500 or a DNS lookup problem
- if ($results->{$u}{location}{message} =~
- m/Bad hostname '[^\']*'/) {
- $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
- } else {
- $whattodo = 'This is a server-side problem. Check the URI.';
- }
- } else {
- $whattodo = $todo->{$c};
- }
- if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) {
- $redirect_too = 'The original request has been redirected.';
- $whattodo .= ' '.$redirect_too if (! $_html);
- }
} else {
- # Directory redirects
- $whattodo = 'Add a trailing slash to the URL.';
+ $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.';
}
- if ($_html) {
- # Style stuff
- my $idref = '';
- if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
- $idref = ' id="d'.$doc_count.'code_'.$c.'"';
- $previous_c = $c;
- }
- # Main info
- my @redirects_urls = &get_redirects($u, %$redirects);
- for (@redirects_urls) {
- $_ = &show_url($_);
- }
- # HTTP message
- my $http_message;
- if ($results->{$u}{location}{message}) {
- $http_message = &encode($results->{$u}{location}{message});
- if (($c == 404) || ($c == 500)) {
- $http_message = '<span class="broken">'.
- $http_message.'</span>';
- }
- }
- printf("
+ } elsif ($c == 500) {
+ # 500's could be a real 500 or a DNS lookup problem
+ if ($results->{$u}{location}{message} =~
+ m/Bad hostname '[^\']*'/) {
+ $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
+ } else {
+ $whattodo = 'This is a server-side problem. Check the URI.';
+ }
+ } else {
+ $whattodo = $todo->{$c};
+ }
+ if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) {
+ $redirect_too = 'The original request has been redirected.';
+ $whattodo .= ' '.$redirect_too if (! $_html);
+ }
+ } else {
+ # Directory redirects
+ $whattodo = 'Add a trailing slash to the URL.';
+ }
+ if ($_html) {
+ # Style stuff
+ my $idref = '';
+ if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
+ $idref = ' id="d'.$doc_count.'code_'.$c.'"';
+ $previous_c = $c;
+ }
+ # Main info
+ my @redirects_urls = &get_redirects($u, %$redirects);
+ for (@redirects_urls) {
+ $_ = &show_url($_);
+ }
+ # HTTP message
+ my $http_message;
+ if ($results->{$u}{location}{message}) {
+ $http_message = &encode($results->{$u}{location}{message});
+ if (($c == 404) || ($c == 500)) {
+ $http_message = '<span class="broken">'.
+ $http_message.'</span>';
+ }
+ }
+ printf("
<dt%s>%s</dt>
<dd>What to do: <strong%s>%s</strong>%s<br></dd>
<dd>HTTP Code returned: %d%s<br>
HTTP Message: %s%s%s</dd>
<dd>Lines: %s</dd>\n",
- # Anchor for return codes
- $idref,
- # List of redirects
- $redirected ? join(' redirected to<br>',
- @redirects_urls) : &show_url($u),
- # Color
- &bgcolor($c),
- # What to do
- $whattodo,
- # Redirect too?
- $redirect_too ? ' <span '.&bgcolor(301).'>'.$redirect_too.'</span>' : '',
- # Original HTTP reply
- $results->{$u}{location}{orig},
- # Final HTTP reply
- ($results->{$u}{location}{code} !=
- $results->{$u}{location}{orig})
- ? ' <span title="redirected to">-&gt;</span> '.
- &encode($results->{$u}{location}{code})
- : '',
- # Realm
- (defined($results->{$u}{location}{realm})
- ? 'Realm: '.&encode($results->{$u}{location}{realm}).'<br>'
- : ''),
- # HTTP original message
- defined($results->{$u}{location}{orig_message})
- ? &encode($results->{$u}{location}{orig_message}).
- ' <span title="redirected to">-&gt;</span> '
- : '',
- # HTTP final message
- $http_message,
- # List of lines
- $lines_list);
- if ($#fragments >= 0) {
- my $fragment_direction;
- if ($results->{$u}{location}{code} == 200) {
- $fragment_direction =
- ' <strong class="broken">They need to be fixed!</strong>';
- }
- printf("<dd><dl><dt>Broken fragments and their line numbers: %s</dt>\n",
- $fragment_direction);
- }
- } else {
- printf("\n%s\t%s\n Code: %d%s %s\nTo do: %s\n",
- # List of redirects
- $redirected ? join("\n-> ",
- &get_redirects($u, %$redirects)) : $u,
- # List of lines
- $lines_list ? 'Lines: '.$lines_list : '' ,
- # Original HTTP reply
- $results->{$u}{location}{orig},
- # Final HTTP reply
- ($results->{$u}{location}{code} != $results->{$u}{location}{orig})
- ? ' -> '.$results->{$u}{location}{code}
- : '',
- # HTTP message
- $results->{$u}{location}{message} ?
- $results->{$u}{location}{message} : '',
- # What to do
- $whattodo);
- if ($#fragments >= 0) {
- if ($results->{$u}{location}{code} == 200) {
- print("The following fragments need to be fixed:\n");
- } else {
- print("Fragments:\n");
- }
- }
+ # Anchor for return codes
+ $idref,
+ # List of redirects
+ $redirected ? join(' redirected to<br>',
+ @redirects_urls) : &show_url($u),
+ # Color
+ &bgcolor($c),
+ # What to do
+ $whattodo,
+ # Redirect too?
+ $redirect_too ? ' <span '.&bgcolor(301).'>'.$redirect_too.'</span>' : '',
+ # Original HTTP reply
+ $results->{$u}{location}{orig},
+ # Final HTTP reply
+ ($results->{$u}{location}{code} !=
+ $results->{$u}{location}{orig})
+ ? ' <span title="redirected to">-&gt;</span> '.
+ &encode($results->{$u}{location}{code})
+ : '',
+ # Realm
+ (defined($results->{$u}{location}{realm})
+ ? 'Realm: '.&encode($results->{$u}{location}{realm}).'<br>'
+ : ''),
+ # HTTP original message
+ defined($results->{$u}{location}{orig_message})
+ ? &encode($results->{$u}{location}{orig_message}).
+ ' <span title="redirected to">-&gt;</span> '
+ : '',
+ # HTTP final message
+ $http_message,
+ # List of lines
+ $lines_list);
+ if ($#fragments >= 0) {
+ my $fragment_direction;
+ if ($results->{$u}{location}{code} == 200) {
+ $fragment_direction =
+ ' <strong class="broken">They need to be fixed!</strong>';
}
- # Fragments
- foreach $f (@fragments) {
- if ($_html) {
- printf("<dd>%s: %s</dd>\n",
- # Broken fragment
- &show_url($u, $f),
- # List of lines
- join(', ',
- &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
- } else {
- printf("\t%-30s\tLines: %s\n",
- # Fragment
- $f,
- # List of lines
- join(', ',
- &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
- }
- }
- if ($_html) {
- if ($#fragments >= 0) {
- print("</dl></dd>\n");
- }
+ printf("<dd><dl><dt>Broken fragments and their line numbers: %s</dt>\n",
+ $fragment_direction);
+ }
+ } else {
+ printf("\n%s\t%s\n Code: %d%s %s\nTo do: %s\n",
+ # List of redirects
+ $redirected ? join("\n-> ",
+ &get_redirects($u, %$redirects)) : $u,
+ # List of lines
+ $lines_list ? 'Lines: '.$lines_list : '' ,
+ # Original HTTP reply
+ $results->{$u}{location}{orig},
+ # Final HTTP reply
+ ($results->{$u}{location}{code} != $results->{$u}{location}{orig})
+ ? ' -> '.$results->{$u}{location}{code}
+ : '',
+ # HTTP message
+ $results->{$u}{location}{message} ?
+ $results->{$u}{location}{message} : '',
+ # What to do
+ $whattodo);
+ if ($#fragments >= 0) {
+ if ($results->{$u}{location}{code} == 200) {
+ print("The following fragments need to be fixed:\n");
+ } else {
+ print("Fragments:\n");
}
+ }
+ }
+ # Fragments
+ foreach $f (@fragments) {
+ if ($_html) {
+ printf("<dd>%s: %s</dd>\n",
+ # Broken fragment
+ &show_url($u, $f),
+ # List of lines
+ join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
+ } else {
+ printf("\t%-30s\tLines: %s\n",
+ # Fragment
+ $f,
+ # List of lines
+ join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
+ }
}
- # End of the table
if ($_html) {
- print("</dl>\n");
- }
+ if ($#fragments >= 0) {
+ print("</dl></dd>\n");
+ }
+ }
+ }
+ # End of the table
+ if ($_html) {
+ print("</dl>\n");
+ }
}
sub code_shown ($$)
{
- my ($u, $results) = @_;
+ my ($u, $results) = @_;
- if ($results->{$u}{location}{record} == 200) {
- return $results->{$u}{location}{orig};
- } else {
- return $results->{$u}{location}{record};
- }
+ if ($results->{$u}{location}{record} == 200) {
+ return $results->{$u}{location}{orig};
+ } else {
+ return $results->{$u}{location}{record};
+ }
}
sub links_summary (\%\%\%\%)
{
- # Advices to fix the problems
-
- my %todo = ( 200 => 'There are broken fragments which must be fixed.',
- 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!',
- 301 => 'You should update the link.',
- 302 => 'Usually nothing.',
- 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.',
- 401 => "The link is not public. You'd better specify it.",
- 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
- 404 => 'The link is broken. Fix it NOW!',
- 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.',
- 407 => 'The link is a proxy, but requires Authentication.',
- 408 => 'The request timed out.',
- 410 => 'The resource is gone. You should remove this link.',
- 415 => 'The media type is not supported.',
- 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
- 501 => 'Could not check this link: method not implemented or scheme not supported.',
- 503 => 'The server cannot service the request, for some unknown reason.');
- my %priority = ( 410 => 1,
- 404 => 2,
- 403 => 5,
- 200 => 10,
- 300 => 15,
- 401 => 20
- );
-
- my ($links, $results, $broken, $redirects) = @_;
-
- # List of the broken links
- my @urls = keys %{$broken};
- my @dir_redirect_urls = ();
+ # Advices to fix the problems
+
+ my %todo = ( 200 => 'There are broken fragments which must be fixed.',
+ 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!',
+ 301 => 'You should update the link.',
+ 302 => 'Usually nothing.',
+ 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.',
+ 401 => "The link is not public. You'd better specify it.",
+ 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
+ 404 => 'The link is broken. Fix it NOW!',
+ 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.',
+ 407 => 'The link is a proxy, but requires Authentication.',
+ 408 => 'The request timed out.',
+ 410 => 'The resource is gone. You should remove this link.',
+ 415 => 'The media type is not supported.',
+ 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
+ 501 => 'Could not check this link: method not implemented or scheme not supported.',
+ 503 => 'The server cannot service the request, for some unknown reason.');
+ my %priority = ( 410 => 1,
+ 404 => 2,
+ 403 => 5,
+ 200 => 10,
+ 300 => 15,
+ 401 => 20
+ );
+
+ my ($links, $results, $broken, $redirects) = @_;
+
+ # List of the broken links
+ my @urls = keys %{$broken};
+ my @dir_redirect_urls = ();
+ if ($_redirects) {
+ # Add the redirected URI's to the report
+ my $l;
+ for $l (keys %$redirects) {
+ next unless (defined($results->{$l})
+ && defined($links->{$l})
+ && !defined($broken->{$l}));
+ # Check whether we have a "directory redirect"
+ # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
+ my @redirects = &get_redirects($l, %$redirects);
+ if (($#redirects == 1)
+ && (($redirects[0].'/') eq $redirects[1])) {
+ push(@dir_redirect_urls, $l);
+ next;
+ }
+ push(@urls, $l);
+ }
+ }
+
+ # Broken links and redirects
+ if ($#urls < 0) {
+ if (! $_quiet) {
+ if ($_html) {
+ print "<h3>Links</h3>\n";
+ print "<p>Valid links!</p>";
+ } else {
+ print "\nValid links.";
+ }
+ print "\n";
+ }
+ } else {
+ if ($_html) {
+ print('<h3>');
+ }
+ print("\nList of broken links");
if ($_redirects) {
- # Add the redirected URI's to the report
- my $l;
- for $l (keys %$redirects) {
- next unless (defined($results->{$l})
- && defined($links->{$l})
- && !defined($broken->{$l}));
- # Check whether we have a "directory redirect"
- # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
- my @redirects = &get_redirects($l, %$redirects);
- if (($#redirects == 1)
- && (($redirects[0].'/') eq $redirects[1])) {
- push(@dir_redirect_urls, $l);
- next;
- }
- push(@urls, $l);
- }
+ print(' and redirects');
}
- # Broken links and redirects
- if ($#urls < 0) {
- if (! $_quiet) {
- if ($_html) {
- print "<h3>Links</h3>\n";
- print "<p>Valid links!</p>";
- } else {
- print "\nValid links.";
- }
- print "\n";
- }
- } else {
- if ($_html) {
- print('<h3>');
- }
- print("\nList of broken links");
- if ($_redirects) {
- print(' and redirects');
- }
-
- # Sort the URI's by HTTP Code
- my %code_summary;
- my $u;
- my @idx;
- foreach $u (@urls) {
- if (defined($results->{$u}{location}{record})) {
- my $c = &code_shown($u, $results);
- $code_summary{$c}++;
- push(@idx, $c);
- }
- }
- my @sorted = @urls[
- sort {
- defined($priority{$idx[$a]}) ?
+ # Sort the URI's by HTTP Code
+ my %code_summary;
+ my $u;
+ my @idx;
+ foreach $u (@urls) {
+ if (defined($results->{$u}{location}{record})) {
+ my $c = &code_shown($u, $results);
+ $code_summary{$c}++;
+ push(@idx, $c);
+ }
+ }
+ my @sorted = @urls[
+ sort {
+ defined($priority{$idx[$a]}) ?
+ defined($priority{$idx[$b]}) ?
+ $priority{$idx[$a]}
+ <=> $priority{$idx[$b]} :
+ -1 :
defined($priority{$idx[$b]}) ?
- $priority{$idx[$a]}
- <=> $priority{$idx[$b]} :
- -1 :
- defined($priority{$idx[$b]}) ?
- 1 :
- $idx[$a] <=> $idx[$b]
- } 0 .. $#idx
- ];
- @urls = @sorted;
- undef(@sorted); undef(@idx);
+ 1 :
+ $idx[$a] <=> $idx[$b]
+ } 0 .. $#idx
+ ];
+ @urls = @sorted;
+ undef(@sorted); undef(@idx);
- if ($_html) {
- print('</h3><p><i>Fragments listed are broken. See the table below to know what action to take.</i></p>');
-
- # Print a summary
- print "<table border=\"1\">\n<tr><td><b>Code</b></td><td><b>Occurrences</b></td><td><b>What to do</b></td></tr>\n";
- my $code;
- foreach $code (sort(keys(%code_summary))) {
- printf('<tr%s>', &bgcolor($code));
- printf('<td><a href="#d%scode_%s">%s</a></td>',
- $doc_count, $code, $code);
- printf('<td>%s</td>', $code_summary{$code});
- printf('<td>%s</td>', $todo{$code});
- print "</tr>\n";
- }
- print "</table>\n";
- } else {
- print(':');
- }
- &show_link_report($links, $results, $broken, $redirects,
- \@urls, 1, \%todo);
+ if ($_html) {
+ print('</h3><p><i>Fragments listed are broken. See the table below to know what action to take.</i></p>');
+
+ # Print a summary
+ print "<table border=\"1\">\n<tr><td><b>Code</b></td><td><b>Occurrences</b></td><td><b>What to do</b></td></tr>\n";
+ my $code;
+ foreach $code (sort(keys(%code_summary))) {
+ printf('<tr%s>', &bgcolor($code));
+ printf('<td><a href="#d%scode_%s">%s</a></td>',
+ $doc_count, $code, $code);
+ printf('<td>%s</td>', $code_summary{$code});
+ printf('<td>%s</td>', $todo{$code});
+ print "</tr>\n";
+ }
+ print "</table>\n";
+ } else {
+ print(':');
}
+ &show_link_report($links, $results, $broken, $redirects,
+ \@urls, 1, \%todo);
+ }
- # Show directory redirects
- if ($_redirects && $_dir_redirects && ($#dir_redirect_urls > -1)) {
- if ($_html) {
- print('<h3>');
- }
- print("\nList of directory redirects:");
- if ($_html) {
- print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL.</p>");
- }
- &show_link_report($links, $results, $broken, $redirects,
- \@dir_redirect_urls);
+ # Show directory redirects
+ if ($_redirects && $_dir_redirects && ($#dir_redirect_urls > -1)) {
+ if ($_html) {
+ print('<h3>');
+ }
+ print("\nList of directory redirects:");
+ if ($_html) {
+ print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL.</p>");
}
+ &show_link_report($links, $results, $broken, $redirects,
+ \@dir_redirect_urls);
+ }
}
###############################################################################
@@ -1663,10 +1660,10 @@ sub links_summary (\%\%\%\%)
sub global_stats ()
{
- my $stop = &get_timestamp();
- return sprintf("Checked %d document(s) in %ss.",
- ($doc_count<=$_max_documents? $doc_count : $_max_documents),
- &time_diff($timestamp, $stop));
+ my $stop = &get_timestamp();
+ return sprintf("Checked %d document(s) in %ss.",
+ ($doc_count<=$_max_documents? $doc_count : $_max_documents),
+ &time_diff($timestamp, $stop));
}
##################
@@ -1675,16 +1672,16 @@ sub global_stats ()
sub html_header ($;$)
{
- my $uri = &encode($_[0]);
- # Cache control?
- if (defined($_[1])) {
- print "Cache-Control: no-cache\nPragma: no-cache\n";
- }
- if (! $_cl) {
- print "Content-Type: text/html; charset=iso-8859-1\nContent-Language: en";
- }
- my $title = ' Link Checker'.($uri eq '' ? '' : ': '.$uri);
- print "
+ my $uri = &encode($_[0]);
+ # Cache control?
+ if (defined($_[1])) {
+ print "Cache-Control: no-cache\nPragma: no-cache\n";
+ }
+ if (! $_cl) {
+ print "Content-Type: text/html; charset=iso-8859-1\nContent-Language: en";
+ }
+ my $title = ' Link Checker'.($uri eq '' ? '' : ': '.$uri);
+ print "
<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
<html lang=\"en\">
@@ -1741,42 +1738,41 @@ dt.report {
sub bgcolor ($)
{
- my ($code) = @_;
- my $class;
- my $r = HTTP::Response->new($code);
- if ($r->is_success()) {
- return '';
- } elsif ($code == 300) {
- $class = 'multiple';
- } elsif ($code == 401) {
- $class = 'unauthorized';
- } elsif ($r->is_redirect()) {
- $class = 'redirect';
- } elsif ($r->is_error()) {
- $class = 'broken';
- } else {
- $class = 'broken';
- }
- return(' class="'.$class.'"');
+ my ($code) = @_;
+ my $class;
+ my $r = HTTP::Response->new($code);
+ if ($r->is_success()) {
+ return '';
+ } elsif ($code == 300) {
+ $class = 'multiple';
+ } elsif ($code == 401) {
+ $class = 'unauthorized';
+ } elsif ($r->is_redirect()) {
+ $class = 'redirect';
+ } elsif ($r->is_error()) {
+ $class = 'broken';
+ } else {
+ $class = 'broken';
+ }
+ return(' class="'.$class.'"');
}
sub show_url ($;$)
{
- my ($url, $fragment) = @_;
- if (defined($fragment)) {
- $url .= '#'.$fragment;
- }
- return('<a href="'.$url.'">'.&encode(defined($fragment) ? $fragment : $url).'</a>');
+ my ($url, $fragment) = @_;
+ if (defined($fragment)) {
+ $url .= '#'.$fragment;
+ }
+ return('<a href="'.$url.'">'.&encode(defined($fragment) ? $fragment : $url).'</a>');
}
sub html_footer ()
{
+ if (($doc_count > 0) && !$_quiet) {
+ printf("<p>%s</p>\n", &global_stats());
+ }
- if (($doc_count > 0) && !$_quiet) {
- printf("<p>%s</p>\n", &global_stats());
- }
-
- print "
+ print "
<hr>
<address>
$PROGRAM $VERSION<br>
@@ -1800,20 +1796,20 @@ code</a> from
sub file_uri ($)
{
- my $uri = $_[0];
- &html_header($uri);
- print "<h2>Forbidden</h2>
+ my $uri = $_[0];
+ &html_header($uri);
+ print "<h2>Forbidden</h2>
<p>You cannot check such a URI (<code>$uri</code>).</p>
";
- &html_footer();
- exit;
+ &html_footer();
+ exit;
}
sub print_form ($)
{
- my ($q) = @_;
- &html_header('', 1);
- print "<form action=\"".$q->self_url()."\" method=\"get\">
+ my ($q) = @_;
+ &html_header('', 1);
+ print "<form action=\"".$q->self_url()."\" method=\"get\">
<p><label for=\"uri\">Enter the address (<a href='http://www.w3.org/Addressing/#terms'>URL</a>)
of a document that you would like to check:</label></p>
<p><input type=\"text\" size=\"50\" id=\"uri\" name=\"uri\"></p>
@@ -1834,24 +1830,24 @@ of a document that you would like to check:</label></p>
<p><input type=\"submit\" name=\"submit\" value=\"Check\"></p>
</form>
";
- &html_footer();
- exit;
+ &html_footer();
+ exit;
}
sub encode (@)
{
- if (! $_html) {
- return @_;
- } else {
- return HTML::Entities::encode(@_);
- }
+ if (! $_html) {
+ return @_;
+ } else {
+ return HTML::Entities::encode(@_);
+ }
}
sub hprintf (@)
{
- if (! $_html) {
- printf(@_);
- } else {
- print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
- }
+ if (! $_html) {
+ printf(@_);
+ } else {
+ print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
+ }
}