#!/usr/bin/perl

# Example code from Chapter 12 of /Perl and LWP/ by Sean M. Burke
# http://www.oreilly.com/catalog/perllwp/
# sburke@cpan.org

require 5;
use strict;
use warnings;

use URI;
use LWP;

# Switch processing:
my %option;
use Getopt::Std;
getopts('m:n:t:l:e:u:t:d:hv', \%option) || usage_quit(1);
usage_quit(0) if $option{'h'} or not @ARGV;

sub usage_quit {
  # Emit usage message, then exit with given error code.
  print <<"END_OF_MESSAGE"; exit($_[0] || 0);
Usage:
$0  [switches]  [urls]
This will spider for bad links, starting at the given URLs.
  
Switches:
 -h        display this help message
 -v        be verbose in messages to STDOUT  (default off)
 -m 123    run for at most 123 minutes.  (default 20)
 -n 456    cause at most 456 network hits.  (default 500)
 -d 7      delay for 7 seconds between hits.  (default 10)
 -l x.log  log to text file x.log. (default is to not log)
 -e y\@a.b  set bot admin address to y\@a.b  (no default!)
 -u Xyz    set bot name to Xyz.  (default: Verifactrola)
 -t 34     set request timeout to 34 seconds.  (default 15)

END_OF_MESSAGE
}

my $expiration = ($option{'m'} ||  20) * 60 + time();
my $hit_limit  =  $option{'h'} || 500;
my $log        =  $option{'l'};
my $verbose    =  $option{'v'};
my $bot_name   =  $option{'u'} || 'Verifactrola/1.0';
my $bot_email  =  $option{'e'} || '';
my $timeout    =  $option{'t'} || 15;
my $delay      =  $option{'d'} || 10;
die "Specify your email address with -e\n"
 unless $bot_email and $bot_email =~ m/\@/;

my $hit_count = 0;
my $robot;  # the user-agent itself

# Then the top-level code we've already seen:
initialize();
process_starting_urls(@ARGV);
main_loop();
report() if $hit_count;
say("Quitting.\n");
exit;

###########################################################################
#
#  GLORIOUS GLORIOUS MAIN LOOP
#
###########################################################################

my $QUIT_NOW;
 # a flag we can set to indicate that we stop now!

sub main_loop {
  while(
    schedule_count()
    and $hit_count < $hit_limit
    and time() < $expiration
    and ! $QUIT_NOW
  ) {
    process_url( next_scheduled_url() );
  }
  return;
}

###########################################################################
# Just init routines

sub initialize {
  init_logging();
  init_robot();
  init_signals();
  return;
}

sub init_logging {
  my $selected = select(STDERR);
  $| = 1; # Make STDERR unbuffered.
  if($log) {
    open LOG, ">>$log" or die "Can't append-open $log: $!";
    select(LOG);
    $| = 1; # Make LOG unbuffered
  }
  select($selected);
  print "Logging to $log\n" if $log;
  return;
}

sub init_robot {
  use LWP::RobotUA;
  $robot = LWP::RobotUA->new($bot_name, $bot_email);
  $robot->delay($delay/60); # "/60" to do seconds->minutes
  $robot->timeout($timeout);
  $robot->requests_redirectable([]);
    # don't follow any sort of redirects
  $robot->protocols_allowed(['http']);  # disabling all others
  say("$bot_name ($bot_email) starting at ",
    scalar(localtime), "\n");
  say("Using LWP v$LWP::VERSION under Perl v$] $^O\n");
  return;
}

sub init_signals {  # catch control-C's
  $SIG{'INT'} = sub { $QUIT_NOW = 1; return;};
   # That might not be emulated right under MSWin.
}

###########################################################################
# Logging routines

my $last_time_anything_said;
sub say {
  # Add timestamps as needed:
  unless(time() == ($last_time_anything_said || 0)) {
    $last_time_anything_said = time();
    unshift @_, "[T$last_time_anything_said = " .
      localtime($last_time_anything_said) . "]\n";
  }
  print LOG @_ if $log;
  print @_;
}

my $last_time_anything_muttered;
sub mutter {
  # Add timestamps as needed:
  unless(time() == ($last_time_anything_muttered || 0)) {
    $last_time_anything_muttered = time();
    unshift @_, "[T$last_time_anything_muttered = " .
      localtime($last_time_anything_muttered) . "]\n";
  }
  print LOG @_ if $log;
  print @_ if $verbose;
}

###########################################################################

sub process_url {
  my $url = $_[0];
  if( near_url($url) )   { process_near_url($url) }
  else                   { process_far_url($url) }
  return;
}

sub process_far_url {
  my $url = $_[0];
  say("HEADing $url\n");
  ++$hit_count;
  my $response = $robot->head($url, refer($url));
  mutter("  That was hit #$hit_count\n");
  consider_response($response);  # that's all we do!
  return;
}

sub process_near_url { 
  my $url = $_[0]; 
  mutter("HEADing $url\n"); 
  ++$hit_count;
  my $response = $robot->head($url, refer($url)); 
  mutter("  That was hit #$hit_count\n"); 
  return unless consider_response($response); 
   
  if($response->content_type ne 'text/html') { 
    mutter("  HEAD-response says it's not HTML!  Skipping", 
        $response->content_type, "\n"); 
    return; 
  }
  if(length ${ $response->content_ref }) { 
    mutter("  Hm, that had content!  Using it...\n" ); 
    say("Using head-gotten $url\n"); 
  } else { 
    mutter("It's HTML!\n"); 
    say("Getting $url\n"); 
    ++$hit_count; 
    $response = $robot->get($url, refer($url)); 
    mutter("  That was hit #$hit_count\n"); 
    return unless consider_response($response); 
  }
  if($response->content_type eq 'text/html') { 
    mutter("  Scanning the gotten HTML...\n"); 
    extract_links_from_response($response); 
  } else { 
    mutter("  Skipping the gotten non-HTML (", 
      $response->content_type, ") content.\n"); 
  }
  return; 
}


sub consider_response {
  # Return 1 if it's successful, otherwise return 0
  my $response = $_[0];
  mutter("  ", $response->status_line, "\n");
  return 1 if $response->is_success;

  if($response->is_redirect) {
    my $to_url = $response->header('Location');
    if(defined $to_url and length $to_url and 
      $to_url !~ m/\s/
    ) {
      my $from_url = $response->request->uri;
      $to_url = URI->new_abs($to_url, $from_url);
      mutter("Noting redirection\n  from $from_url\n",
        "    to $to_url\n");
      note_link_to( $from_url => $to_url );
    }
  } else {
    note_error_response($response);
  }

  return 0;
}




my %notable_url_error;  # URL => error message

sub note_error_response {
  my $response = $_[0];
  return unless $response->is_error;

  my $code = $response->code;
  my $url = URI->new( $response->request->uri )->canonical;

  if(  $code == 404 or $code == 410 or $code == 500  ) {
    mutter(sprintf "Noting {%s} error at %s\n",
           $response->status_line, $url );
    $notable_url_error{$url} = $response->status_line;
  } else {
    mutter(sprintf "Not really noting {%s} error at %s\n",
           $response->status_line, $url );
  }
  return;
}




use HTML::TokeParser; 
use URI; 
 
sub extract_links_from_response { 
  my $response = $_[0]; 
   
  my $base = URI->new( $response->base )->canonical; 
    # "canonical" returns it in the one "official" tidy form 
   
  my $stream = HTML::TokeParser->new( $response->content_ref ); 
  my $page_url = URI->new( $response->request->uri ); 
   
  mutter( "Extracting links from $page_url\n" ); 
   
  my($tag, $link_url); 
  while( $tag = $stream->get_tag('a') ) { 
    next unless defined($link_url = $tag->[1]{'href'}); 
    next if $link_url =~ m/\s/; # If it's got whitespace, it's a bad URL. 
    next unless length $link_url; # sanity check! 
     
    $link_url = URI->new_abs($link_url, $base)->canonical; 
    next unless $link_url->scheme eq 'http'; # sanity 
     
    $link_url->fragment(undef); # chop off any "#foo" part 
    note_link_to($page_url => $link_url) 
      unless $link_url->eq($page_url); # Don't note links to itself! 
  }
  return; 
}


###########################################################################

my %points_to;

sub report {  # This that gets run at the end.
  say(
    "\n\nEnding at ", scalar(localtime),
    " after ", time() - $^T,
    "s of runtime and $hit_count hits.\n\n",
  );
  unless(keys %notable_url_error) {
    say( "\nNo bad links seen!\n" );
    return;
  }
  
  say( "BAD LINKS SEEN:\n" );
  foreach my $url (sort keys %notable_url_error) {
    say( "\n$url\n  Error: $notable_url_error{$url}\n" );
    foreach my $linker (sort keys %{ $points_to{$url} } ) {
      say( "  < $linker\n" );
    }
  }
  return;
}

sub refer {
  # Generate a good Referer header for requesting this URL.
  my $url = $_[0];
  my $links_to_it = $points_to{$url};
   # the set (hash) of all things that link to $url
  return() unless $links_to_it and keys %$links_to_it;
  
  my @urls = keys %$links_to_it; # in no special order!
  mutter "  For $url, Referer => $urls[0]\n";
  return "Referer" => $urls[0];
}

sub note_link_to { 
  my($from_url => $to_url) = @_; 
  $points_to{ $to_url }{ $from_url } = 1; 
  mutter("Noting link\n  from $from_url\n    to $to_url\n"); 
  schedule($to_url); 
  return; 
}

###########################################################################

sub regularize_hostname { 
  my $host = lc $_[0]; 
  $host =~ s/\.+/\./g; # foo..com => foo.com 
  $host =~ s/^\.//;    # .foo.com => foo.com 
  $host =~ s/\.$//;    # foo.com. => foo.com 
  return 'localhost' if $host =~ m/^0*127\.0+\.0+\.0*1$/; 
  return $host; 
}

sub url_path_count { 
  # Return 4 for "http://foo.int/fee/fie/foe/fum" 
  #                               1   2   3   4 
  my $url = $_[0]; 
  my @parts = $url->path_segments; 
  shift @parts if @parts and $parts[ 0] eq ''; 
  pop   @parts if @parts and $parts[-1] eq ''; 
  return scalar @parts; 
}

###########################################################################

my @starting_urls;

sub near_url {   # Is the given URL "near"?
  my $url = $_[0];
  foreach my $starting_url (@starting_urls) {
    if( substr($url, 0, length($starting_url))
     eq $starting_url
     # We assume that all URLs are in canonical form!
    ) {
      mutter("  So $url is near\n");
      return 1;
    }
  }
  mutter("  So $url is far\n");
  return 0;
}

sub process_starting_urls {
  foreach my $url (@_) {
    my $u = URI->new($url)->canonical;
    schedule($u);
    push @starting_urls, $u;
  }
  return;
}

###########################################################################

my @schedule; 

sub schedule_count     { return scalar @schedule }

sub next_scheduled_url { 
  my $url = splice @schedule, rand(@schedule), 1;

  mutter("\nPulling from schedule: ", $url || "[nil]", 
    "\n  with ", scalar(@schedule), 
    " items left in schedule.\n"); 
  return $url; 
}

my %seen_url_before;
sub schedule { 
  # Add these URLs to the schedule 
  foreach my $url (@_) { 
    my $u = ref($url) ? $url : URI->new($url); 
    $u = $u->canonical;  # force canonical form
   
    next unless 'http' eq ($u->scheme || ''); 
    next if defined $u->query; 
    next if defined $u->userinfo;

    $u->host( regularize_hostname( $u->host() ) ); 
    return unless $u->host() =~ m/\./; 
 
    next if url_path_count($u) > 6; 
    next if $u->path =~ m<//> or $u->path =~ m</\.+(/|$)>;

    $u->fragment(undef);

    if( $seen_url_before{ $u->as_string }++ ) { 
      mutter("  Skipping the already-seen $u\n"); 
    } else { 
      mutter("  Scheduling $u\n"); 
      push @schedule, $u; 
    } 
  } 
  return; 
}

###########################################################################
__END__

