#!/usr/bin/perl

# Example code from Chapter 9 of /Perl and LWP/ by Sean M. Burke
# http://www.oreilly.com/catalog/perllwp/
# sburke@cpan.org

require 5;
use warnings;
use strict; 

use URI;
use HTML::TreeBuilder 3; 
 
# Main script: download the BBC News front page, parse it into an element
# tree, and hand that tree to scan_bbc_tree() to print the headlines.
use LWP::Simple;
my $base_url = 'http://news.bbc.co.uk/';
my $html = get($base_url);
die "Can't get $base_url" unless $html;

my $tree = HTML::TreeBuilder->new();
# Alternative to fetching: parse a saved copy of the page instead.
#$tree->parse_file('bbc.html') || die $!;  # the saved source from BBC News 
$tree->parse($html);
$tree->eof;   # no more input follows

# The page's own URL doubles as the base for resolving relative links.
scan_bbc_tree( $tree, $base_url );
$tree->delete();   # done with the tree
 

# scan_bbc_tree( $root, $docbase )
#
# Looks at every 'b' element under $root and, for each one that matches one
# of the two headline layouts below, prints the element's text, a newline,
# two spaces, the headline's link made absolute against $docbase, and a
# final newline.
#
#   * class "h3":          the 'b' has exactly one child, an 'a' element;
#                          that child's href is the link.
#   * class "h1" or "h2":  the 'b' has an 'a' element as its parent;
#                          the parent's href is the link.
#
# 'b' elements with no (or a false) class, with any other class, with a
# different layout, or whose 'a' element has no (or a false) href are
# skipped silently.
#
# Parameters:
#   $root    - element tree to search; must support find_by_tag_name().
#   $docbase - base URL used to resolve relative href values.
#
# Returns nothing (empty list / undef); output goes to the currently
# selected filehandle via print.
sub scan_bbc_tree {
  my($root, $docbase) = @_;

  # The iterator is deliberately not named $b: a lexical $b would mask the
  # special package variable that sort() comparison blocks depend on.
  foreach my $bold ($root->find_by_tag_name('b')) {
    my $class = $bold->attr('class') || next;

    # Find the 'a' element that carries this headline's link, if any.
    my $link;
    if($class eq 'h3') {
      # Expect one 'a' element as the only child.  The ref() test keeps us
      # from calling ->tag on a plain text segment.
      my @children = $bold->content_list;
      $link = $children[0]
        if @children == 1 and ref $children[0] and $children[0]->tag eq 'a';
    } elsif($class eq 'h1' or $class eq 'h2') {
      # Expect an 'a' element as the parent.
      my $parent = $bold->parent;
      $link = $parent if $parent and $parent->tag eq 'a';
    }
    next unless $link;

    # Skip headlines whose link has no usable href.
    my $href = $link->attr('href') || next;

    print $bold->as_text, "\n  ", URI->new_abs($href, $docbase), "\n";
  }
  return;
}

__END__

Example output:

No respite for nervous markets
  http://news.bbc.co.uk/hi/english/business/newsid_2130000/2130923.stm
Vaccine call in foot-and-mouth report
  http://news.bbc.co.uk/hi/english/sci/tech/newsid_2130000/2130789.stm
Selby pit complex to close
  http://news.bbc.co.uk/hi/english/uk/england/newsid_2129000/2129050.stm
Bush plan to plug security gaps
  http://news.bbc.co.uk/hi/english/world/americas/newsid_2131000/2131067.stm
Rio on the brink
  http://news.bbc.co.uk/sport/hi/english/football/eng_prem/newsid_2128000/2128763.stm
ICC accepts burn-out fears
  http://news.bbc.co.uk/sport/hi/english/cricket/newsid_2128000/2128942.stm
Blair grilled by senior MPs
  http://news.bbc.co.uk/hi/english/uk_politics/newsid_2130000/2130875.stm
UK inflation hits record low
  http://news.bbc.co.uk/hi/english/business/newsid_2131000/2131161.stm
We can afford 61bn - Brown
  http://news.bbc.co.uk/hi/english/uk_politics/newsid_2131000/2131081.stm
Record loss for BNFL
  http://news.bbc.co.uk/hi/english/business/newsid_2130000/2130972.stm
'Extinction' claim over North Sea cod
  http://news.bbc.co.uk/hi/english/sci/tech/newsid_2130000/2130024.stm
GPs back NHS contract talks
  http://news.bbc.co.uk/hi/english/health/newsid_2131000/2131129.stm
School 'cash for reform' plans due
  http://news.bbc.co.uk/hi/english/education/newsid_2130000/2130447.stm
Winona Ryder to stand trial
  http://news.bbc.co.uk/hi/english/entertainment/showbiz/newsid_2131000/2131038.stm

