#!/usr/bin/perl

# Example code from Chapter 7 of /Perl and LWP/ by Sean M. Burke
# http://www.oreilly.com/catalog/perllwp/
# sburke@cpan.org

require 5;
use strict;
use warnings;

use HTML::TokeParser;
use URI;

#scan_bbc_stream(
#  HTML::TokeParser->new('bbc.html') || die($!),
#  'http://news.bbc.co.uk/' # base URL
#);

url_scan(\&scan_bbc_stream, 'http://news.bbc.co.uk/');


my $browser;
BEGIN {
  use LWP::UserAgent;
  $browser = LWP::UserAgent->new;
  # and any other $browser initialization code here
}

sub url_scan {
  my($scanner, $url) = @_;
  die "What scanner function?" unless $scanner and ref($scanner) eq 
'CODE';
  die "What URL?" unless $url;
  my $resp = $browser->get( $url );
  die "Error getting $url: ", $resp->status_line
    unless $resp->is_success;
  die "It's not HTML, it's ", $resp->content_type
    unless $resp->content_type eq 'text/html';

  my $stream = HTML::TokeParser->new( $resp->content_ref )
    || die "Couldn't make a stream from $url\'s content!?";
  # new() on a string wants a reference, and so that's what
  #  we give it!  HTTP::Response objects just happen to
  #  offer a method that returns a reference to the content.
  $scanner->($stream, $resp->base);
}


sub scan_bbc_stream {
  my($stream, $docbase) = @_;

 Token:
  while(my $token = $stream->get_token) {

    if($token->[0] eq 'S'  and  $token->[1] eq 'b'  and  
       ($token->[2]{'class'} || '') eq 'h3') {
      # The href we want is in the NEXT token... probably.
      # Like: <B CLASS="h3"><A href="magic_url_here">

      my(@next) = ($stream->get_token);

      if($next[0] and $next[0][0] eq 'S'  and  $next[0][1] eq 'a'  and
          defined $next[0][2]{'href'} ) {
         # We found <a href="...">!  This rule matches!
         print URI->new_abs($next[0][2]{'href'}, $docbase), "\n";
         next Token;
      }
      # We get here only if we've given up on this rule:
      $stream->unget_token(@next);
    }

    if($token->[0] eq 'S'  and  $token->[1] eq 'a'  and
       defined $token->[2]{'href'} ) {
      # Like: <A href="magic_url_here"> <B class="h2">

      my(@next) = ($stream->get_token);
      if( $next[0] and $next[0][0] eq 'T' and $next[0][1] =~ m/^\s+/s ) {
        # We found whitespace.
        push @next, $stream->get_token;
        if($next[1] and $next[1][0] eq 'S'  and  $next[1][1] eq 'b'  and
           ($next[1][2]{'class'} || '') =~ m/^h[12]$/s ) {
          # We found <b class="h2">!  This rule matches!
          print URI->new_abs( $token->[2]{'href'}, $docbase ), "\n";
          next Token;
        }
      }
      # We get here only if we've given up on this rule:
      $stream->unget_token(@next);
    }
  }
  return;
}
__END__

