#!/usr/bin/perl

# AvantSlash v3.1
# Copyright (c) Richard Lawrence 2000-2005
############################################################################
#
# This program is free software; you can redistribute it and/or modify   
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or 
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,  
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License      
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
#
############################################################################
# For instructions please read the file called README.
# Settings should be changed within the file "avantify.config".
#
# Don't forget to change the path below to the location of this script:

chdir("/home/richard/www/lawrence/playground/slashdot");

#
# You should not need to change anything below this line.
############################################################################

use strict;
use LWP;
use CGI qw(:param);
use CGI::Carp qw(fatalsToBrowser);
use HTML::Parser;
use URI::Escape;

use constant ONE_MINUTE => 60;

my $version = "3.1";
my $self = $ENV{'SCRIPT_NAME'};
my $clean_html = "";
my $clean_done = 0;
my $url = param("url");
my $page_date;
my $wap = 0;

# Set up defaults

my $mp_show_posted_by = 1;
my $mp_show_department = 1;
my $mp_show_blurb = 1;
my $mp_show_read_more = 1;

my $ap_show_posted_by = 1;
my $ap_show_department = 1;
my $ap_show_story = 1;

my $threshold = 5;
my $max_comments = 100;
my $internalize_links = 0;
my $no_italics = 0;

my $cache_timeout = 30;
my $debugging = 0;
my $enable_local_cache = 1;

# Load in config file

open(CONFIG, "avantify.config") || die "Unable to open configuration file: $!";
while(<CONFIG>)
{
  chomp;
  my $configline = $_;

  next if (($configline =~ /^#/) || ($configline =~ /^$/));
  my ($configkeyword, $configopts) = split (/\s/,$configline,2);
  next if (($configkeyword =~ /^$/) || ($configopts =~ /^$/));

  if    ($configkeyword eq "MP_SHOW_POSTED_BY") { $mp_show_posted_by = $configopts; }
  elsif ($configkeyword eq "MP_SHOW_DEPARTMENT") { $mp_show_department = $configopts; }
  elsif ($configkeyword eq "MP_SHOW_BLURB") { $mp_show_blurb = $configopts; }
  elsif ($configkeyword eq "MP_SHOW_READ_MORE") { $mp_show_read_more = $configopts; }

  elsif ($configkeyword eq "AP_SHOW_POSTED_BY") { $ap_show_posted_by = $configopts; }
  elsif ($configkeyword eq "AP_SHOW_DEPARTMENT") { $ap_show_department = $configopts; }
  elsif ($configkeyword eq "AP_SHOW_STORY") { $ap_show_story = $configopts; }

  elsif ($configkeyword eq "THRESHOLD") { $threshold = $configopts; }
  elsif ($configkeyword eq "MAX_COMMENTS") { $max_comments = $configopts; }
  elsif ($configkeyword eq "INTERNALIZE_LINKS") { $internalize_links = $configopts; }
  elsif ($configkeyword eq "NO_ITALICS") { $no_italics = $configopts; }

  elsif ($configkeyword eq "CACHE_TIMEOUT") { $cache_timeout = $configopts; }
  elsif ($configkeyword eq "DEBUGGING") { $debugging = $configopts; }
  elsif ($configkeyword eq "ENABLE_LOCAL_CACHE") { $enable_local_cache = $configopts; }
  else
  {
    die "Unknow configuration option '$configkeyword'";
  }
}
close CONFIG;

# Process request from user

my $external = param("external");
$external =~ s/[^01]//g;  # only can have 0 or 1

# If the User Agent is "Google WAP Proxy" then they're viewing this
# on a WAP phone.

if ($ENV{'HTTP_USER_AGENT'} =~ /Google WAP Proxy/i)
{
  $wap = 1;

  # Reset some of the options to be more WAP-friendly

  $mp_show_posted_by = 1;
  $mp_show_department = 0;
  $mp_show_blurb = 0;
  $mp_show_read_more = 0;
  $ap_show_posted_by = 0;
  $ap_show_department = 0;
  $ap_show_story = 1;
  $no_italics = 1;
}
elsif ($ENV{'HTTP_ACCEPT'} =~ /vnd\.wap/ && !($ENV{'HTTP_ACCEPT'} =~ /text\/html/))
{
  # They're using a browser which can cope with WML and not HTML - so we
  # assume this is a WAP phone and therefore redirect them to Google HTML
  # to WML proxy service
  my $go = $ENV{'SERVER_NAME'} . $self;
  print "Location: http://wmlproxy.google.com/wmltrans/h=en/g=\@26amp\@3bwmlmode=url?u=" . uri_escape($go) . "\n\n";
  exit;
}

if ($url)
{
  # stop silly buggers by stripping out invalid characters
  $url =~ s/[<>\|\0]//gis;  
}

if ($url eq "credits")
{
  show_credits();
  exit;
}

# get the html, strip out their bizzare formatting and apply our own

my $page = getpage($url);

if ($external)
{
  print "Content-type: text/html\n\n";

  # This stops the prat at 63.169.220.2 (firebox.mbe-eng.com) who used my
  # script to clean up the Newsweek Site Index on MSNBC (and other links
  # not used by slashdot) and ended up twatting the server good and hard.

  if (!$internalize_links)
  {
    print "<html><head><title>Forbidden</title><body><h1>Forbidden</h1><p>You are forbidden from processing pages in this way.</body></html>";
    exit;
  }
  
  print $page;
  exit;
}

# Advert removal
$page =~ s#<!\-\- advertisement code\. \-\->.+?<!\-\- end ad code \-\->##gis;
$page =~ s#<P><TABLE.+?ad\.doubleclick\.net.+?</NOSCRIPT></TD></TR></TABLE>##gis;

# More generic removal of scripts and iframes
$page =~ s#<script .+?</script>##gis;
$page =~ s#<noscript.+?</noscript>##gis;
$page =~ s#<iframe.+?</iframe>##gis;


$page =~ s#[\n\t]# #g;
$page =~ s# +# #g;
$page =~ s#(<br.*?>)#$1\n#gis;
$page =~ s#(</tr.*?>)#$1\n#gis;
$page =~ s#(<table.*?>)#\n$1#gis;
$page =~ s#(</table.*?>)#$1\n#gis;
$page =~ s#(<p.*?>)#\n$1#gis;
$page =~ s#(</p.*?>)#$1\n#gis;
$page =~ s#<(/?)em>#<$1i>#gis;

if ($url)
{
  parse_comments();
  exit;
}

# parse the main page

show_header("Slashdot: News for nerds, stuff that matters");

my ($title, $blurb, $posted, $dept, $more);

my @bits = split(/\n/, $page);
foreach (@bits)
{
  # title

  if (m!<IMG SRC=".+?" WIDTH="13" HEIGHT="16" ALT="" ALIGN="TOP"><FONT FACE="arial,helvetica" SIZE="4" COLOR="#?FFFFFF"><B>(.+?)</B></FONT></TD> </TR>!i)
  {
    $title = $1;
    $title =~ s!<A HREF=".*?//.+?\.slashdot.org/"><FONT COLOR="#?FFFFFF">(.+?)</FONT></A>!$1!gi;
    print "title = $title<br>" if ($debugging);
  }

  # department
  # Note that we don't have a space between ".*?" and "dept". This is
  # because someone once forgot the department and the script broke.
  elsif (m!<FONT SIZE="2"><B>(from the .*?dept\.)</B></FONT><BR>!i)
  {
    $dept = $1;
    print "blurb = $blurb<br>" if ($debugging);
  }

  # posted

  elsif (m!<B>(Posted by .+? on .+?[AP]M)</B><BR>!i)
  {
    $posted = $1;
    $posted =~ s!<a.+?>(.+?)</a>!$1!i;
    print "posted = $posted<br>" if ($debugging);
  }

  # link to more

  elsif (m!<A HREF="//.*?slashdot.org.*?/(\d\d/\d\d/\d\d/\d+?).shtml\?tid=.+?"><b>Read More...</b></A>!)
  {
    $more = $1;
    print "more = $more<br>" if ($debugging);
  }

  # the blurb

  elsif($title && $dept && !$more && $posted)
  {
    $blurb .= $_;
    print "blurb = $blurb<br>" if ($debugging);
  }

  # display if everything is in order

  if ($title && $dept && $more && $posted && $blurb)
  {
    print "Got everything...<br>" if ($debugging);

    # remove the no ad tag
    $blurb =~ s/<P> <!-- no ad \d+ -->//gi;

    # point external links to the stripurls code
    $blurb = repoint_urls($blurb);

    # remove trailing <P> in blurb that suddenly started appearing one day
    $blurb =~ s/\s<P>\s*$//i;
    print "<p><hr></p>\n" if (!$wap);
    print "<p><font size=\"+1\"><b><a href=\"$self?url=$more\">$title</a></b></font>\n";
    print "<br>$posted" if ($mp_show_posted_by);
    print "<br><font size=\"-1\">$dept</font>\n" if ($mp_show_department);
    print "<br>$blurb" if ($mp_show_blurb);
    print "<br><a href=\"$self?url=$more\">(read more)</a>" if ($mp_show_read_more);
    print "</p>";

    $title = $dept = $more = $posted = $blurb = "";
  }
}
print "<p><hr></p>\n";
show_footer();
exit;

# parse comments
# this is going to be horrible :o(

sub parse_comments
{
  my ($title, $story, $posted, $dept, $end, $header);
  my ($i, $elements);

  my @bits = split(/\n/, $page);
  $elements = scalar @bits;

  for ($i = 0; $i < $elements; $i++)
  {
    $_ = $bits[$i];

    # title

    if (m!<FONT FACE="arial,helvetica" SIZE="4" COLOR="#?FFFFFF"><B>(.+?)</B></FONT></TD> </TR>! && !$title)
    {
      $title = $1;
      print "title = $title<br>" if ($debugging);
      if (!$header)
      {
        show_header($title);
        $header = 1;
      }
    }

    elsif (m!<B>(Posted by .+? on .+?[AP]M)</B><BR>!)
    {
      $posted = $1;
      $posted =~ s!<a.+?>(.+?)</a>!$1!i;
      print "posted = $posted<br>" if ($debugging);
    }

    elsif (m!<FONT SIZE="2"><B>(from the .+? dept.)</B></FONT><BR>!)
    {
      $dept = $1;
      print "dept = $dept<br>" if ($debugging);
    }

    elsif (m!<TD ALIGN="RIGHT" VALIGN="TOP" WIDTH="230" HEIGHT="1">!)
    {
      print "Got everything...<br>" if ($debugging);

      # remove the advert
      $story =~ s/<P><IFRAME.+?>//gis;

      # repoint external links
      $story = repoint_urls($story);

      print "<p><font size=\"+1\">$title</font>";
      print "<br>$posted\n" if ($ap_show_posted_by);
      print "<br><font size=\"-1\">$dept</font>\n" if ($ap_show_department);
      print "<br>$story\n" if ($ap_show_story);
      print "</p><p><hr></p>\n";

      # bail out to the comments part
      last;
    }

    elsif ($title && $posted)
    {
      $story .= $_;
    }
  }

  # and now the comments

  my ($author, $score, $message, $status);
  my $j;
  $j=0;
  for (; $i < $elements; $i++)
  {
    $_ = $bits[$i];

    if (m!<TR><TD BGCOLOR="#?[0-9A-F]{6}"> <FONT SIZE="3" COLOR="#?000000"> <A NAME="\d+?"><B>(.+)</B></A> \(Score:(.+?)\) </FONT> <BR>!i)
    {
      $j++;
      $title = $1;
      $score = $2;

      print "title = $title<br>score = $score<br>" if ($debugging);
    }

    elsif (m!(by .+? on .+?, @\d\d:\d\d[PA]M)!)
    {
      $author = $1;
      $author =~ s/<.+?>//gi;
      $author =~ s/ \(.+?\@.+?\)//gi;

      $status = 1;

      print "author = $author<br>" if ($debugging);

      # Yes, I'm being sad - sue me :o)
      $author =~ s/Mr_Silver \(213637\)/<a href="$self?url=credits">Mr_Silver<\/a> \(213637\)/i;
    }

    elsif (m! <TR><TD> <FONT SIZE="2"> \[ <A HREF=".+?">Reply to This</A>!i)
    {
      # We have everything (i hope) so clean up message

      print "Got everything...<br>" if ($debugging);

      # Remove the journal and homepage
      $message =~ s!^<FONT SIZE="-1">\(.+?</FONT>!!i;

      # Remove the "Read the rest of this comment" line and replace
      # with generic message
      $message =~ s!<B><A HREF="//slashdot.org.+?">Read the rest of this comment...</A> </B>!<b><font size="-1">[Rest of comment truncated]</font></b>!i;

      # Remove table tags
      $message =~ s!</{0,1}T[RD]>!!gi;

      # Repoint URL's
      $message = repoint_urls($message);
      if ($j <= $max_comments)
      {
      print <<EOF3;
<p><b>$title</b> (score: $score)<br>
<font size="-1">$author</font></p>
<p>$message</p>
<p><hr></p>
EOF3
      }
      $author = $score = $author = $message = $status = "";
    }

    elsif ($status == 1)
    {
      $message .= $_;
    }
  }
  show_footer();
}  


# download the page

sub getpage
{
  my $stuff;  

  $stuff = get_from_cache($_[0]) if ($enable_local_cache);

  if (!$stuff)
  {
    $stuff = download_page($_[0]);
  }

  $stuff =~ s#</?i>##gis if ($no_italics);

  return $stuff;
}

sub download_page
{
  my $ua = new LWP::UserAgent;
  $ua->agent("Mozilla/4.0 (compatible; MSIE 5.01; Windows 98; DigExt)");
  my $req;

  if (!$external)
  {
    if ($_[0])
    {
      $req = new HTTP::Request GET => "http://slashdot.org/article.pl?sid=$_[0]&mode=flat&threshold=$threshold";
    }
    else
    {
      $req = new HTTP::Request GET => "http://slashdot.org/";
    }
  }
  else  # external site
  {
    if ($_[0])
    {
      $req = new HTTP::Request GET => "$_[0]";
    }
    else
    {
      return "";
    }
  }
    
  my $res = $ua->request($req);

  if ($external)
  {
    return(strip_links($res->content));
  }
  else
  {
    $page_date = time();
    return($res->content);
  }
}

sub show_footer
{
  my $when = localtime($page_date);

  print <<EOF4;
<p><font size="-1">Avantslash v$version - <a href="$self?url=credits">More info...</a><br>Page collected: $when</font><p>
</body></html>
EOF4
}

sub show_header
{
  my $timeout = $cache_timeout * ONE_MINUTE;
  my $title = shift;

  # Headers
  print "Content-type: text/html\nCache-control: max-age=$timeout\n\n";

  # if WAP, print something more basic
  if ($wap)
  {
    print "<html><body><p><b>Slashdot</b><br>News for Nerds. Stuff that matters.</p>\n";
    return;
  }  

  # Print one for the web

  print <<EOF5;
<html>
<head><title>$title</title>
</head>
<body bgcolor="#ffffff" text="#000000">
<p align="left"><a href="$self"><img src="logo.gif" border="0" alt="Slashdot"></a><br><font size="-2">News for Nerds. Stuff that matters.</font></p>
EOF5
}

sub get_from_cache
{
  my $url = shift;
  my $age;
  my $filename;

  if (!$external)
  {  
    if (!$url)
    {
      $filename = "index.html"
    }
    else
    {
      $filename = "$url.html";
      $filename =~ s/\//-/g;
    }
  }
  else
  {
    $filename = $url;
    $filename =~ s/[^a-z]//gi;
    $filename .= ".html";
  }

  if (-e "cache/$filename")
  {
    $age = (stat("cache/$filename"))[9];
    if (time - $age <= ($cache_timeout * ONE_MINUTE))
    {
      # page exists and is fresh
      my $ret;
      open(FILE, "cache/$filename") || die "Can't open cache file $filename: $!";
      while(<FILE>)
      {
        $ret .= $_;
      }
      close FILE;

      $page_date = $age;

      # do a quick clean up, any files over $cache_timeout old, delete.

      my @files = glob("cache/*.html");
      foreach my $thisfile (@files)
      {
        $age = (stat("$thisfile"))[9]; 
        unlink("$thisfile") if (time - $age > ($cache_timeout * ONE_MINUTE));
      }

      return $ret;
    }
    # old page, drop through and overwrite
  }

  open(FILE, ">cache/$filename") || die "Can't write out to cache/$filename: $!";
  my $html = download_page($url);  
  print FILE $html;
  close FILE;
  chmod 0777, "cache/$filename";
  return $html;
}

sub repoint_urls
{
  my $txt = shift;
  if ($internalize_links) {
    $txt =~ s#"(http://.+?)"#make_url($1)#egis;
  }
  return $txt;
}

sub make_url
{
  my $url = uri_escape(shift);
  return "\"$self?external=1&url=$url\"";
}

sub show_credits
{
  show_header("About Avantslash");

  print <<EOF6;

<html><head>
<p><hr></p>
<p>
 <b>Avantslash version $version, Copyright &copy; 2000-2005 Richard Lawrence</b><br>

This program is free software; you can redistribute it and/or modify   
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or 
(at your option) any later version.
<p>
This program is distributed in the hope that it will be useful,  
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
<p>
All trademarks and copyrights on the Slashdot pages are owned by their respective owners. Comments are owned by the Poster. The Rest  1997-2005 OSTG
<p>
For more information, please visit the Avantslash website at <b>http://www.fourteenminutes.com/code/avantslash/</b></a><p>
<p><hr></p>

<p><a href="$self">Back to main page</a></p>

</body>
</html>
EOF6
}

#
# This code takes an external site and removes all links and graphics
# so allowing you to crawl them without worrying about trawling off
# into the depths of the internet.
#
# Please note that I know very little about HTML::Parser. If you can
# rewrite this to be cleaner - then please do so.
#

sub strip_links
{
  my $html = shift;

  my $parser = HTML::Parser->new(api_version => 3,
                                 default_h => [ sub { $clean_html .= shift }, "text" ],
                                 start_h => [ \&start_parse, "self,tagname,attr,text" ],
                                 end_document_h => [ sub { $clean_done = 1 } ]);
  $parser->parse($html);
  $parser->eof;

  while($clean_done == 0) {};

  return $clean_html;
}

sub start_parse
{
  my ($self, $tag, $attr, $text) = @_;

  if ($tag eq 'a' && exists $attr->{href})
  {
    $self->handler(end => \&end_a, "self, tagname, text");
    return;
  }
  elsif ($tag eq 'img' && exists $attr->{src})
  {
    $self->handler(end => \&end_img, "self, tagname, text");
    return;
  }
  elsif ($tag eq 'form')
  {
    $self->handler(end => \&end_form, "self, tagname, text");
    return;
  }
  elsif ($tag eq 'script')
  {
    $self->handler(end => \&end_script, "self, tagname, text");
    return;
  }
  elsif ($tag eq 'iframe')
  {
    $self->handler(end => \&end_iframe, "self, tagname, text");
    return;
  }
  
  $clean_html .= $text;
}

sub end_a
{
  my ($self, $tag, $text) = @_;

  if ($tag eq 'a')
  {
    $self->handler(end => undef);
  }
  else
  {
    $clean_html .= $text;
  }
}

sub end_img
{
  my ($self, $tag, $text) = @_;

  if ($tag eq 'img')
  {
    $self->handler(end => undef);
  }
  else
  {
    $clean_html .= $text;
  }
}

sub end_form
{
  my ($self, $tag, $text) = @_;

  if ($tag eq 'form')
  {
    $self->handler(end => undef);
  }
  else
  {
    $clean_html .= $text;
  }
}

sub end_script
{
  my ($self, $tag, $text) = @_;

  if ($tag eq 'script')
  {
    $self->handler(end => undef);
  }
  else
  {
    $clean_html .= $text;
  }
}

sub end_iframe
{
  my ($self, $tag, $text) = @_;

  if ($tag eq 'iframe')
  {
    $self->handler(end => undef);
  }
  else
  {
    $clean_html .= $text;
  }
}  


