MobileRead Forums - View Single Post

aranpura · 09-10-2004, 02:10 PM

#!/usr/pkg/bin/perl -w

#----------------------------------------------------------------
#--- parseNation.pl
#---
#--- CGI script: fetches the front page of The Nation, cuts the
#--- navigation, search fields and advertisement out of the HTML,
#--- points article links at the print version, and prints the
#--- trimmed page back to the requesting browser.
#---
#--- this script depends on Perl modules installed at Freeshell
#--- (LWP::Simple is not part of the core Perl distribution)
#----------------------------------------------------------------

use strict;
use warnings;

use LWP::Simple;

#--- page to fetch; also used as the prefix when fixing partial URLs
my $URL = "http://www.thenation.com/";

#--- LWP::Simple::get returns undef when the page cannot be fetched
my $webPage = get($URL);
unless (defined $webPage) {
    #--- send a complete CGI response before dying, so the browser
    #--- gets an explanation; the die message still goes to STDERR
    print "Status: 502 Bad Gateway\n";
    print "Content-type:text/html\n\n";
    print "could not get $URL<br>";
    die "could not get $URL\n";
}

#--- cut the header information and all the left/top nav
#--- (/s lets "." cross newlines; the captured <div> is kept)
$webPage =~ s{<head>.*?(<div class="tnhphed">)}{$1}sg;

#--- cut the middle search fields
#--- the match must start at the "little ones" marker comment; without
#--- that anchor the first group would begin at the top of the page.
#--- everything between the closing </table> and the next "tns2" div
#--- is dropped, and <br> spacing is put around the part that is kept
$webPage =~ s{(<!-- little ones -->.*?)(</table>).*?(<div class="tns2")}{<br>$1$2<br><br>$3}s;

#--- cut the middle advertisement
$webPage =~ s{<div align="center".*?(<div class="tns2")}{$1}s;

#--- cut everything at the bottom
#--- (greedy on purpose: runs up to the last </body> in the page)
$webPage =~ s{<td width="1" bgcolor="#cccccc".*(</body>)}{$1}s;

#--- fix any partial URLs
#--- (must run before the print-version rewrite below, which then
#--- also applies to the URLs made absolute here)
$webPage =~ s{"doc\.mhtml}{"${URL}doc.mhtml}g;

#--- link all articles to the print version instead of the online version
$webPage =~ s{doc\.mhtml}{docprint.mhtml}g;

#--- cut all links to full issues (keep the link text, drop the <a> tag)
$webPage =~ s{<a href="/issue\.mhtml.*?>(.*?)</a>}{$1}g;

#--- return the result to a browser CGI query
print "Content-type:text/html\n\n";
print "<font color=\"#CC0000\" face=\"serif\" size=\"5\"><strong>THE NATION</strong></font><br><br>";
print $webPage;

09-10-2004, 02:10 PM	#6
aranpura Member Posts: 12 Karma: 34 Join Date: Sep 2004 Location: San Francisco, California Device: Tungsten T3	perl code for parseNation.pl #!/usr/pkg/bin/perl -w #---------------------------------------------------------------- #--- this script depends on Perl modules installed at Freeshell #---------------------------------------------------------------- $URL = "http://www.thenation.com/"; use LWP::Simple; unless (defined ($webPage = get $URL)) { die "could not get $URL\n"; } #--- cut the header information and all the left/top nav $webPage =~ s/<head>.*?(<div class="tnhphed">)/$1/sg; #--- cut the middle search fields $webPage =~ s/(<!-- little ones -->.*?)(<\/table>).*?(<div class="tns2")/<br>$1$2<br><br>$3/s; #--- cut the middle advertisement $webPage =~ s/<div align="center".*?(<div class="tns2")/$1/s; #--- cut everything at the bottom $webPage =~ s/<td width="1" bgcolor="#cccccc".*(<\/body>)/$1/s; #--- fix any partial URLs $webPage =~ s/"doc\.mhtml/"http:\/\/www\.thenation\.com\/doc\.mhtml/g; #--- link all articles to the print version instead of the online version $webPage =~ s/doc\.mhtml/docprint\.mhtml/g; #--- cut all links to full issues $webPage =~ s/<a href="\/issue.mhtml.*?>(.*?)<\/a>/$1/g; #--- return the result to a browser CGI query print "Content-type:text/html\n\n"; print "<font color=\"#CC0000\" face=\"serif\" size=\"5\"><strong>THE NATION</strong></font><br><br>"; print $webPage;