perl code for parseNation.pl
#!/usr/pkg/bin/perl -w
#----------------------------------------------------------------
#--- this script depends on Perl modules installed at Freeshell
#----------------------------------------------------------------
use strict;
use warnings;
use LWP::Simple;

#--- front page of The Nation; also used as the base when repairing
#--- relative article links, so the site address lives in one place only
my $site_url = "http://www.thenation.com/";

#--- fetch the page; the script cannot do anything useful without it, so
#--- give up when get() does not hand back a defined value
my $web_page = get($site_url);
die "could not get $site_url\n" unless defined $web_page;

#--- return the result to a browser CGI query
print "Content-type:text/html\n\n";
print "<font color=\"#CC0000\" face=\"serif\" size=\"5\"><strong>THE NATION</strong></font><br><br>";
print rewrite_front_page($web_page, $site_url);

#----------------------------------------------------------------
#--- rewrite_front_page($html, $base_url)
#---   $html     : the fetched front page markup
#---   $base_url : site root (with trailing slash) prepended to relative
#---               article links
#---   returns   : a copy of the markup with navigation, search fields,
#---               advertisement and footer removed, article links made
#---               absolute and pointed at the print version, and links
#---               to full issues replaced by their plain link text
#--- The patterns are tied to the page layout this script was written
#--- against; a substitution whose pattern does not match leaves the
#--- markup unchanged.
#----------------------------------------------------------------
sub rewrite_front_page {
    my ($html, $base_url) = @_;

    #--- cut the header information and all the left/top nav
    #--- (/s lets "." run across line breaks)
    $html =~ s{<head>.*?(<div class="tnhphed">)}{$1}sg;

    #--- cut the middle search fields: keep the "little ones" table,
    #--- drop everything between its </table> and the next tns2 div
    $html =~ s{(<!-- little ones -->.*?)(</table>).*?(<div class="tns2")}{<br>$1$2<br><br>$3}s;

    #--- cut the middle advertisement
    $html =~ s{<div align="center".*?(<div class="tns2")}{$1}s;

    #--- cut everything at the bottom; the greedy ".*" deliberately runs
    #--- to the LAST </body> in the page
    $html =~ s{<td width="1" bgcolor="#cccccc".*(</body>)}{$1}s;

    #--- fix any partial URLs
    $html =~ s{"doc\.mhtml}{"${base_url}doc.mhtml}g;

    #--- link all articles to the print version instead of the online version
    $html =~ s{doc\.mhtml}{docprint.mhtml}g;

    #--- cut all links to full issues, keeping only the link text.
    #--- "[^>]*" confines the attribute match to the opening tag and /s
    #--- allows the tag or its text to wrap across lines, so multi-line
    #--- anchors are unwrapped as well
    $html =~ s{<a href="/issue\.mhtml[^>]*>(.*?)</a>}{$1}gs;

    return $html;
}
|