View Single Post
Old 09-10-2004, 01:10 PM   #6
aranpura
Member
aranpura began at the beginning.
 
aranpura's Avatar
 
Posts: 12
Karma: 34
Join Date: Sep 2004
Location: San Francisco, California
Device: Tungsten T3
Perl code for parseNation.pl

#!/usr/pkg/bin/perl -w

#----------------------------------------------------------------
#--- parseNation.pl
#---
#--- CGI script: downloads the front page of The Nation, strips the
#--- header/navigation, search fields, advertisement and footer with
#--- a series of regex substitutions, rewrites article links to the
#--- print version, and prints the trimmed HTML to STDOUT.
#---
#--- The patterns below are tied to the markup of the fetched page;
#--- a substitution that finds no match leaves the page untouched.
#---
#--- this script depends on Perl modules installed at Freeshell
#--- (LWP::Simple)
#----------------------------------------------------------------

use strict;
use warnings;

use LWP::Simple;

my $URL = "http://www.thenation.com/";

#--- fetch the page; get() returns undef when the request fails
my $webPage = get($URL);
unless (defined $webPage) {
    #--- send a complete CGI response first so the browser shows a
    #--- meaningful message, then die so the failure is still logged
    print "Status: 502 Bad Gateway\n";
    print "Content-type:text/html\n\n";
    print "<p>could not get $URL</p>\n";
    die "could not get $URL\n";
}

#--- cut the header information and all the left/top nav
#--- (keeps the <div class="tnhphed"> that ends the cut region)
$webPage =~ s{<head>.*?(<div class="tnhphed">)}{$1}sg;

#--- cut the middle search fields: keep the "little ones" table up to
#--- its closing </table>, drop what follows until the next tns2 div
$webPage =~ s{(<!-- little ones -->.*?)(</table>).*?(<div class="tns2")}{<br>$1$2<br><br>$3}s;

#--- cut the middle advertisement
$webPage =~ s{<div align="center".*?(<div class="tns2")}{$1}s;

#--- cut everything at the bottom (greedy on purpose: removes from the
#--- first matching <td> through to the last </body>)
$webPage =~ s{<td width="1" bgcolor="#cccccc".*(</body>)}{$1}s;

#--- fix any partial URLs
$webPage =~ s{"doc\.mhtml}{"http://www.thenation.com/doc.mhtml}g;

#--- link all articles to the print version instead of the online version
$webPage =~ s{doc\.mhtml}{docprint.mhtml}g;

#--- cut all links to full issues, keeping only the link text
#--- (the dot in "issue.mhtml" is escaped so it matches a literal dot)
$webPage =~ s{<a href="/issue\.mhtml.*?>(.*?)</a>}{$1}g;

#--- return the result to a browser CGI query
print "Content-type:text/html\n\n";
print "<font color=\"#CC0000\" face=\"serif\" size=\"5\"><strong>THE NATION</strong></font><br><br>";
print $webPage;
aranpura is offline