#!/usr/bin/perl

# Example code from Chapter 11 of /Perl and LWP/ by Sean M. Burke
# http://www.oreilly.com/catalog/perllwp/
# sburke@cpan.org

require 5;
use strict;
use warnings;

use LWP;
# The user-agent string is defined once and shared by the browser object and
# the optional header list below, so both always present the same identity.
my $netscape_agent = 'Mozilla/4.76 [en] (Win98; U)';

my $browser = LWP::UserAgent->new;
$browser->agent($netscape_agent);

# Header name/value pairs imitating an older Netscape.  They are only sent if
# this list is passed as extra arguments to $browser->get(...).
my @netscape_like_headers = (
  'User-Agent'      => $netscape_agent,
  'Accept-Language' => 'en-US',
  'Accept-Charset'  => 'iso-8859-1,*,utf-8',
  'Accept-Encoding' => 'gzip',
  'Accept'          => join(', ', qw(
    image/gif image/x-xbitmap image/jpeg image/pjpeg image/png */*
  )),
);

my $url = 'http://www.expreszo.nl/home.php';

# Fetch the page.  By default only the spoofed user-agent set on $browser is
# sent.  If that is not enough, uncomment the @netscape_like_headers argument
# to send very Netscape-like headers as well.
# (But don't be surprised if things then come back in utf-8 and/or gzipped!)
my $response = $browser->get(
  $url,
  # @netscape_like_headers
);

# Give up with the HTTP status line if the request did not succeed.
$response->is_success
  or die "Can't get $url: ", $response->status_line;


# Extract headlines:
#
# Scan the fetched page for links to headlines.php and print each distinct
# absolute URL followed by its link text.  If nothing matches, print the page
# source instead so the markup can be inspected.

# Undo any Content-Encoding (e.g. gzip, which the Netscape-like headers ask
# for) before scanning; charset => 'none' leaves the character encoding
# untouched, so the text is still the server's original bytes.
# decoded_content returns undef if decoding fails, so fall back to the raw
# body in that case.
my $html = $response->decoded_content(charset => 'none');
$html = $response->content unless defined $html;

my %seen;
# The dot in "headlines.php" is escaped so that it matches only a literal dot.
# /s lets the link text span several lines; /g steps through every link.
while( $html =~ m{href="(headlines\.php[^"]+)">(.*?)</A>}sg ) {
  # Copy the captures immediately, before any other code gets to run.
  my($href, $title) = ($1, $2);
  my $this = URI->new_abs($href, $response->base);
  # Report each headline URL only the first time it is seen.
  print "$this\n  $title\n" unless $seen{$this}++;
}
print "NO HEADLINES?!  Source:\n", $html unless keys %seen;


__END__


Snapshots of headers sent by various browsers:



...an older Netscape...

User-Agent: Mozilla/4.76 [en] (Win98; U)
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*
Accept-Charset: iso-8859-1,*,utf-8
Accept-Encoding: gzip
Accept-Language: en-US

(Plus probably a "Connection: keep-alive" header.)



...another Netscape...

User-Agent: Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-US;
   rv:0.9.4) Gecko/20011126 Netscape6/6.2.1
Accept: text/xml, application/xml, application/xhtml+xml, text/html;q=0.9,
   image/png, image/jpeg, image/gif;q=0.2, text/plain;q=0.8,
   text/css, */*;q=0.1
Accept-Charset: ISO-8859-1, utf-8;q=0.66, *;q=0.66
Accept-Encoding: gzip, deflate, compress;q=0.9
Accept-Language: en-us



...Opera 5.12...

User-Agent: Opera/5.12 (Windows 98; U)  [en]
Accept: text/html, image/png, image/jpeg, image/gif, image/x-xbitmap, */*
Accept-Language: en
Accept-Encoding: deflate, gzip, x-gzip, identity, *;q=0



...Internet Explorer 5.12...

Accept: */*
Accept-Language: en
Extension: Security/Remote-Passphrase
UA-CPU: PPC
UA-OS: MacOS
User-Agent: Mozilla/4.0 (compatible; MSIE 5.12; Mac_PowerPC)



...Lynx...
(Given my system's /etc/mailcap file)

Accept: text/html, text/plain, audio/mod, image/*, video/*, video/mpeg,
  application/pgp, application/pgp, application/pdf, message/partial,
  message/external-body, application/postscript, x-be2,
  application/andrew-inset, text/richtext, text/enriched
Accept: x-sun-attachment, audio-file, postscript-file, default,
  mail-file, sun-deskset-message, application/x-metamail-patch,
  text/sgml, */*;q=0.01
Accept-Encoding: gzip, compress
Accept-Language: en, es
User-Agent: Lynx/2.8.3dev.18 libwww-FM/2.14



[White hats only, folks!  --SMB]

