summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrenaudb <renaudb@localhost>1998-09-11 21:34:04 +0000
committerrenaudb <renaudb@localhost>1998-09-11 21:34:04 +0000
commit198eab5db5c34e7813f78e2e3c151ea26877e885 (patch)
tree313d2ee598aa59563678cf9408686ebb809835f9
parentac7c600f8be40829b2bb4254eeeef61358294833 (diff)
downloadmarkup-validator-198eab5db5c34e7813f78e2e3c151ea26877e885.zip
markup-validator-198eab5db5c34e7813f78e2e3c151ea26877e885.tar.gz
markup-validator-198eab5db5c34e7813f78e2e3c151ea26877e885.tar.bz2
finished handling of 401s. fixed double responses too
-rwxr-xr-xhttpd/cgi-bin/checklink.pl102
1 files changed, 74 insertions, 28 deletions
diff --git a/httpd/cgi-bin/checklink.pl b/httpd/cgi-bin/checklink.pl
index 21ec0e7..782d01d 100755
--- a/httpd/cgi-bin/checklink.pl
+++ b/httpd/cgi-bin/checklink.pl
@@ -16,15 +16,17 @@ sub new_line {
}
# called by HTML::Parser::parse
# overriden to count lines
+# I looked at getting text from links for the output
+# but I don't see why it would be of interest
+# plus, need to remove markup within <a></a>
+# probably not worth it
sub start {
my $self=shift;
my ($tag,$attr)=@_;
my $link;
- $self->{STARTLINK} = 0;
$link=$attr->{href} if $tag eq "a";
$link=$attr->{src} if $tag eq "img";
if (defined $link){
- $self->{STARTLINK} = 1;
$self->{Links}{$link}{$self->{Line}+1}++;
}
}
@@ -35,16 +37,32 @@ use vars qw(@ISA);
@ISA = qw(LWP::Parallel::UserAgent);
# function overload to handle 301s and build
# redirect chain
-# next is to handle 401s to make sure a 404
-# is not hidden behind it.
+# it now handles 401s too, but not like I wanted it to
+# it uses a team password to HEAD non-public documents
+# I make sure I don't air this password out of w3.org
+# proper way would be: fork a HTTP server, and then proxy 401
+# between server and client
+# caching credentials in HTTP Daemon may be difficult
+# I'll look at this later
+# it is the way to go if we ever want to integrate this into validator
sub on_return {
my $self = shift;
my ($request,$response,$content) = @_;
- if($response->code == 301 || $response->code == 302){
- $self->{URL}{Redirect}{$request->url} = $response->headers->header('Location');
- unless(defined $self->{URL}{Registered}{$response->headers->header('Location')}){
- $self->register(HTTP::Request->new(HEAD => $response->headers->header('Location')),\&callback_check,undef,0);
- $self->{URL}{Registered}{$response->headers->header('Location')}="True";
+ if($self->{DoAuth}){
+ if($response->code == 301 || $response->code == 302){
+ $self->{URL}{Redirect}{$request->url} = $response->headers->header('Location');
+ unless(defined $self->{URL}{Registered}{$response->headers->header('Location')}){
+ $self->register(HTTP::Request->new(HEAD => $response->headers->header('Location')),\&callback_check,undef,0);
+ $self->{URL}{Registered}{$response->headers->header('Location')}="True";
+ }
+ }
+ if($response->code == 401 && !defined($self->{Auth}{$response->request->url})){
+ if($response->request->url->netloc =~ /\.w3\.org$/){
+ my $newreq = HTTP::Request->new(HEAD => $response->request->url);
+ $newreq->headers->authorization_basic("XXXXX","XXXXX");
+ $self->{Auth}{$response->request->url} = 1;
+ $self->register($newreq,\&callback_check,undef,0);
+ }
}
}
}
@@ -55,11 +73,23 @@ use CGI qw(:standard);
###############
# Global Variables
-my $VERSION= '$Id: checklink.pl,v 1.14 1998-09-10 00:33:57 renaudb Exp $ ';
+my $VERSION= '$Id: checklink.pl,v 1.15 1998-09-11 21:34:04 renaudb Exp $ ';
my %ALLOWED_SCHEMES = ( "http" => 1 );
my %SCHEMES = (); # for report
my %URL = ();
-my %COLORS = ( 301 => ' BGCOLOR="yellow"', 302 => ' BGCOLOR="yellow"', 404 => ' BGCOLOR="red"' , 401 => ' BGCOLOR="aqua"' );
+my %COLORS = ( 200 => '', 301 => ' BGCOLOR="yellow"', 302 => ' BGCOLOR="yellow"', 404 => ' BGCOLOR="red"' , 401 => ' BGCOLOR="aqua"' );
+my %HTTP_CODES = ( 200 => 'ok' ,
+ 201 => '201',
+ 301 => 'redirect' ,
+ 302 => 'redirect' ,
+ 401 => 'unauthorized' ,
+ 404 => 'not found' ,
+ 405 => '405',
+ 408 => '408',
+ 415 => '415',
+ 500 => '500',
+ 501 => '501',
+ 503 => '503');
my $VERBOSE = 0;
my $CGI = 0;
my $p = ParseLink->new(); # we want the parser to be global, so we can call it from callback_parse
@@ -134,7 +164,6 @@ sub callback_check {
sub checklinks {
my $url=URI::URL->new(shift);
my $q = shift if($CGI);
- #my $ua = new LWP::Parallel::UserAgent;
my $ua = new UserAgent;
my $request = HTTP::Request->new(GET => $url);
@@ -155,6 +184,7 @@ sub checklinks {
my $res = $ua->wait(10);
$ua->initialize;
$ua->{URL}= \%URL;
+ $ua->{DoAuth} = 1;
foreach my $r (map {$res->{$_}->response} keys %$res){
if ($r->is_success && $r->content_type =~ /text\/html/i){
@@ -177,21 +207,24 @@ sub checklinks {
&print_result($url,$response,$q);
} else {
# error handling if error on fetching document to be checklink-ed
+ # I need to add a command-line mode here
if($r->code == 401){
- print $q->h3($r->request->url);
- $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
- my $realm = $1;
if($CGI){
- print $q->h2('Authentication Required To Fetch '.$q->a({href=>$url},$url));
- print $q->startform('POST',$q->url);
- print $q->textfield(-name=>'url',-size=>'50',-value=>$url),$q->br;
- print $q->textfield(-name=>'username',-size=>'10'),"Username for Realm ",$realm,$q->br;
- print $q->password_field(-name=>'password',-size=>'10'),"Password",$q->br;
- print $q->submit('Proceed');
- print $q->endform;
+ print $q->h3($r->request->url);
+ $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ my $realm = $1;
+ if($CGI){
+ print $q->h2('Authentication Required To Fetch '.$q->a({href=>$url},$url));
+ print $q->startform('POST',$q->url);
+ print $q->textfield(-name=>'url',-size=>'50',-value=>$url),$q->br;
+ print $q->textfield(-name=>'username',-size=>'10'),"Username for Realm ",$realm,$q->br;
+ print $q->password_field(-name=>'password',-size=>'10'),"Password",$q->br;
+ print $q->submit('Proceed');
+ print $q->endform;
+ }
}
} else {
- print $q->h2('Error '.$r->code) if($CGI);
+ print $q->h2('Error '.$r->code." ".$r->content_type) if($CGI);
}
}
}
@@ -209,7 +242,17 @@ sub print_result{
print $q->h2('Result for '.$q->a({href=>$url},$url)) if($CGI);
print "<table border=\"1\"><TR ALIGN=\"center\"><TD>Lines</TD><TD>URI</TD><TD>Code</TD><TD>Extra</TD></TR>\n" if($CGI);
+ # this is to handle 401s and see what's behind them
+ # I need to watch this carefully, I might be wrong about these tests
foreach my $resp (map {$response->{$_}->response} keys %$response){
+ $URL{Responses}{$resp->request->url} = $resp unless($URL{Responses}{$resp->request->url} && $resp->code == 200);
+ $URL{Codes}{$resp->request->url} = $resp->code unless($URL{Codes}{$resp->request->url} && $resp->code == 401);
+ }
+ # loop that selects through all the responses to the HEADs to the links
+ # sorting can happen here
+ foreach my $resp (map {$URL{Responses}{$_}} sort keys %{$URL{Responses}}){
+ # we pass links for which we don't have a line number
+ # (which means they are responses from 301/302 or 401 handling)
if(keys %{$URL{$resp->request->url}}){
unless($resp->code eq "200"){
print "<TR><TD ALIGN=\"center\">" if ($CGI);
@@ -217,10 +260,13 @@ sub print_result{
if ($CGI){
print "</TD><TD".$COLORS{$resp->code}.">";
print "<B>".$q->a({href=>$resp->request->url},$resp->request->url)."</B>";
- print $q->br.&recurse_redirect($resp->request->url,$q) if($resp->code == 301 || $resp->code == 302);
- print "</TD><TD ALIGN=\"center\">".$resp->code."</TD><TD>";
+ if($URL{Codes}{$resp->request->url} == 301 || $URL{Codes}{$resp->request->url} == 302){
+ print $q->br.&recurse_redirect($resp->request->url,$q);
+ } else {
+ print "</TD><TD ALIGN=\"center\"".$COLORS{$URL{Codes}{$resp->request->url}}.">".$HTTP_CODES{$URL{Codes}{$resp->request->url}}."</TD><TD>";
+ }
}
- print " ",$resp->request->url, ": ",$resp->code," " if($VERBOSE);
+ print " ",$resp->request->url, ": ",$URL{Codes}{$resp->request->url}," " if($VERBOSE);
print $resp->headers->www_authenticate if($resp->code == 401);
print "</TD></TR>\n" if($CGI);
print "\n" if($VERBOSE);
@@ -233,7 +279,7 @@ sub print_result{
}
}
}
- print "</table>\n";# if($CGI);
+ print "</table>\n" if($CGI);
}
# I'll add links to source later
@@ -250,6 +296,6 @@ sub html_header {
sub recurse_redirect {
my $url = shift;
my $q = shift;
- my $rec = $URL{Redirect}{$URL{Redirect}{$url}}? "<BR>".&recurse_redirect($URL{Redirect}{$url},$q):"";
+ my $rec = $URL{Redirect}{$URL{Redirect}{$url}}? "<BR>".&recurse_redirect($URL{Redirect}{$url},$q):"</TD><TD ALIGN=\"center\">".$HTTP_CODES{$URL{Codes}{$URL{Redirect}{$url}}}."</TD><TD>";
return "-&gt; ".$q->a({href=>$URL{Redirect}{$url}},$URL{Redirect}{$url}).$rec;
}