Find bad links, dead links or 404s.
The script displays a “.” for each good link and displays the error code and message for each bad link.
churls_web.pl
#!/usr/bin/perl -wT
use lib '/home/alexpbco/perl/lib/perl5/site_perl/5.8.7';
use HTML::LinkExtor;
use LWP::UserAgent;
use CGI; # qw(fatalsToBrowser);
use strict;
use Encode;
my $cgi = new CGI;
my $base_url = lc($cgi->param('q'));
my $ua = LWP::UserAgent->new;
$ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7");
$ua->protocols_allowed( [ 'http', 'https' ] );
$ua->cookie_jar({ file => "cookies.txt" });
$ua->max_size( 200 * 1024 );
$ua->timeout( 120 );
my $parser = HTML::LinkExtor->new(undef, $base_url);
my $title = "Dead Link Finder";
($title .= " – ".$base\_url) if ($base\_url);
print $cgi->header( "text/html" ),
$cgi->start_html(-title => $title),
$cgi->h1( $title ),
$cgi->start\_form( { -action => "churls\_web.pl",
-enctype => "application/x-www-form-urlencoded",
-method => "post" } ),
$cgi->input( { -name => "q",
-size => "30",
-value => "$base_url",
-type => "text"} ),
$cgi->input( { -name => "search",
-value => "search",
-type => "submit"} ),
$cgi->div(" e.g.: http://www.myurl.com/"),
$cgi->end_form;
if ($base_url) {
my $response = $ua->get($base_url);
if ($response->is_success) {
$parser->parse(decode_utf8($response->content));
}
else {
print $cgi->p($response->status_line);
print $cgi->end_html();
return;
}
my @links = $parser->links;
print $cgi->h2("Results:");
my ($badlinksnum, $alllinks) = 0;
foreach my $linkarray (@links) {
my @element = @$linkarray;
my $elt_type = shift @element;
while (@element) {
my ($attr\_name , $attr\_value) = splice(@element, 0, 2);
if ($attr_value->scheme =~ /\b(https?)\b/) {
sleep 1; #1 second pause between get requests
my $response\_temp =$ua->get($attr\_value);
unless ($response\_temp->is\_success) {
print $cgi->p($attr_value." – **".$response\_temp->status\_line."**");
$badlinksnum++;
}
else {
print $cgi->span("."); # print "." for each good link (to prevent timeouts and show progress)
$alllinks++;
}
$|=1; # send HTML to browser right away to prevent timeouts
return if ($alllinks >= 500);
}
}
}
print $cgi->h3("Checked $alllinks link(s) and found **$badlinksnum** bad link(s)");
}
print $cgi->end_html();