06 Oct 2007

Free Link Checker

Find bad links, dead links or 404s.

The script displays a “.” for each good link and displays the error code and message for each bad link.

churls_web.pl

#!/usr/bin/perl -wT  
use lib '/home/alexpbco/perl/lib/perl5/site_perl/5.8.7';  
use HTML::LinkExtor;  
use LWP::UserAgent;  
use CGI; # qw(fatalsToBrowser);  
use strict;  
use Encode;

my $cgi = new CGI;  
my $base_url = lc($cgi->param('q'));  
my $ua = LWP::UserAgent->new;

$ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7");  
$ua->protocols_allowed( [ 'http', 'https' ] );  
$ua->cookie_jar({ file => "cookies.txt" });  
$ua->max_size( 200 * 1024 );  
$ua->timeout( 120 );
  
my $parser = HTML::LinkExtor->new(undef, $base_url);  
my $title  = "Dead Link Finder";
  
($title .= " – ".$base\_url) if ($base\_url);  
print $cgi->header( "text/html" ),  
$cgi->start_html(-title => $title),  
$cgi->h1( $title ),  
$cgi->start\_form( { -action => "churls\_web.pl",  
                     -enctype => "application/x-www-form-urlencoded",  
                     -method => "post" } ),  
$cgi->input( {  -name => "q",  
                -size => "30",  
                -value => "$base_url",  
                -type => "text"} ),  
$cgi->input( {  -name => "search",  
                -value => "search",  
                -type => "submit"} ),  
$cgi->div(" e.g.: http://www.myurl.com/"),  
$cgi->end_form;  
if ($base_url) {  
  my $response = $ua->get($base_url);  
  if ($response->is_success) {  
    $parser->parse(decode_utf8($response->content));  
  }  
  else {  
    print $cgi->p($response->status_line);  
    print $cgi->end_html();  
    return;  
  }

  my @links = $parser->links;  
  print $cgi->h2("Results:");  
  my ($badlinksnum, $alllinks) = 0;  
  foreach my $linkarray (@links) {  
    my @element = @$linkarray;  
    my $elt_type = shift @element;  
    while (@element) {  
      my ($attr\_name , $attr\_value) = splice(@element, 0, 2);  
      if ($attr_value->scheme =~ /\b(https?)\b/) {  
        sleep 1; #1 second pause between get requests  
        my $response\_temp =$ua->get($attr\_value);  
        unless ($response\_temp->is\_success) {  
          print $cgi->p($attr_value." – **".$response\_temp->status\_line."**");  
          $badlinksnum++;  
        }  
        else {  
          print $cgi->span("."); # print "." for each good link (to prevent timeouts and show progress)  
          $alllinks++;  
        }  
        $|=1; # send HTML to browser right away to prevent timeouts  
        return if ($alllinks >= 500);  
      }  
    }  
  }  
  print $cgi->h3("Checked $alllinks link(s) and found **$badlinksnum** bad link(s)");  
}  
print $cgi->end_html();