#!/bin/sh
#
#       License: GPLv2
#       Author: review (http://www.mobileread.com/forums/member.php?u=88891)
#       Author: fortwienix (http://www.mobileread.com/forums/member.php?u=109732)

# some settings

# Name of the HTML file that collects the downloaded feed articles. The
# current date (YYYY-MM-DD) is part of the name.
TARGET_FILE="Der_Standard-$(date +%Y-%m-%d).html"
# Directory the finished HTML file is moved to.
TARGET_DIR='/mnt/ext1/news'
# Location of progressb.app, which is shown while the download is running.
PROGRESSBAR_APP='/mnt/ext1/applications/tools/progressb.app'

# RSS feed URLs, one per line. To download the articles of a feed, remove the
# leading # from its line; to skip a feed, put a # in front of its URL.
# Do not remove the two lines holding the double quotes: they enclose the
# feed list.
# Note: the # entries below are part of the string, not shell comments. The
# script itself skips every entry that starts with # when it walks the list.

RSS_LINKS="
#http://derStandard.at/?page=rss&ressort=Seite1
#http://derstandard.at/?page=rss&ressort=InnenPolitik
http://derstandard.at/?page=rss&ressort=InternationalPolitik
http://derStandard.at/?page=rss&ressort=Wirtschaft
#http://derStandard.at/?page=rss&ressort=Web
#http://derStandard.at/?page=rss&ressort=Sport
#http://derStandard.at/?page=rss&ressort=Panorama
#http://derStandard.at/?page=rss&ressort=Etat
#http://derStandard.at/?page=rss&ressort=Kultur
#http://derStandard.at/?page=rss&ressort=Wissenschaft
#http://derStandard.at/?page=rss&ressort=Gesundheit
#http://derStandard.at/?page=rss&ressort=Bildung
#http://derStandard.at/?page=rss&ressort=Meinung
#http://derStandard.at/?page=rss&ressort=Lifestyle
#http://derStandard.at/?page=rss&ressort=Reisen
#http://derStandard.at/?page=rss&ressort=Karriere
#http://derstandard.at/?page=rss&ressort=Immobilien
#http://dieStandard.at/?page=rss&ressort=diestandard
#http://daStandard.at/?page=rss&ressort=dastandard
"

# The script itself starts here. Changes below this line are usually not
# necessary, but bug fixes and improvements can be made from this point on.

# Prepare the working area first and connect afterwards, so that a failure
# during the local setup never leaves the connection open.

# All temporary files live in a scratch directory below /mnt/ext1. Abort if
# that location cannot be entered instead of creating (and later deleting)
# files in whatever directory the script was started from.
cd /mnt/ext1 || exit 1

# Always start from an empty temporary directory; rm -rf does nothing when
# the directory does not exist yet.
rm -rf myrss-tmp
mkdir myrss-tmp || exit 1
cd myrss-tmp || exit 1

# Create the directory the finished HTML file is stored in; mkdir -p succeeds
# silently when it already exists.
mkdir -p "$TARGET_DIR"

# Header of the HTML file the articles are appended to.
printf '%s\n' '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' > "$TARGET_FILE"

# Initial progress value; further percentages are appended to this file
# while the articles are fetched.
echo "0" > percent

# Connect via the device's netagent before the downloads start.
/ebrmain/bin/netagent connect

# Prints the article URLs found in the RSS feed read from stdin, one per line.
# The feed is split so that every tag starts a new line, the lines holding a
# </link> are kept and stripped of their <link>/</link> tags, carriage
# returns are removed, and only URLs with a digit right after "at/" are
# printed (other <link> values, e.g. the site's start page, are dropped).
extract_article_links() {
  sed 's|><|>\n<|g' |
    grep -F '</link>' |
    sed 's|</\?link>||g' |
    tr -d '\r' |
    grep -E 'at\/[[:digit:]]'
}

# Start with an empty article list so that the file exists even when no feed
# is enabled or no feed yields a link.
: > feed1

# RSS_LINKS is split into words on purpose (one URL per word). Globbing is
# switched off meanwhile because every URL contains the wildcard "?".
set -f
for link in $RSS_LINKS; do
  # entries starting with # are disabled feeds
  case "$link" in
    \#*) continue ;;
  esac
  # The URL is quoted because it contains "?" and "&". A feed is only parsed
  # when its download succeeded, so a stale or partial "feed" file is never
  # processed.
  if wget -O feed "$link"; then
    extract_article_links < feed >> feed1
  fi
done
set +f

# Show the progress bar if the executable exists: tail -f feeds every value
# appended to the percent file into the application, which runs in the
# background.
# The path is quoted so that an empty or unset PROGRESSBAR_APP makes the test
# fail. Unquoted, the test collapses to "[ -e ]", which is true, and a stray
# background "tail -f" would be started.
# NOTE(review): nothing in this script stops the background pipeline; confirm
# that progressb.app (and with it tail) terminates on its own.
if [ -e "${PROGRESSBAR_APP}" ]; then
  tail -f percent | "${PROGRESSBAR_APP}" &
fi

# Prints the article body contained in the downloaded page $1 to stdout.
# Returns non-zero (and prints nothing) when the page has no
# <div id="content" or no <div class="empty" after it.
# Writes the reformatted page to the file raw-formatted in the current
# directory.
extract_article() {
  # Drop carriage returns, start a new line before every "<div" and after
  # every "</div>" so the block boundaries can be addressed by line number.
  tr -d '\r' < "$1" |
    sed 's|<div|\n<div|g;s|</div>|</div>\n|g' > raw-formatted

  # the article begins at the last line containing <div id="content"
  begin=$(grep -n '<div id="content"' raw-formatted | tail -1 | cut -d: -f1)
  [ -n "$begin" ] || return 1

  # ... and ends at the first <div class="empty" at or after that line
  end=$(sed -n "${begin},\$p" raw-formatted |
    grep -n '<div class="empty"' | head -1 | cut -d: -f1)
  [ -n "$end" ] || return 1
  # grep counted relative to $begin; convert to a line number of the file
  end=$((begin + end - 1))

  # Print the block without the "Postings anzeigen" line and the
  # <div class="empty..."> lines; headings h2-h5 become bold text.
  sed -n "${begin},${end}p" raw-formatted |
    grep -v '<b>Postings anzeigen</b>' |
    grep -v '<div class="empty' |
    sed 's|<\(/\?\)h[2-5]>|<\1b>|g'
}

# A missing article list means "nothing to fetch", not an error.
[ -e feed1 ] || : > feed1

# count the articles that will be fetched (needed to show the progress when
# downloading)
total=$(grep -c ^ feed1)
current=0
while read -r link; do
  current=$((current + 1))
  # progress in percent, rounded to a whole number
  awk -v c="$current" -v t="$total" \
    'BEGIN { printf "%.0f\n", c * 100 / t }' >> percent

  # the article is downloaded from the text.* variant of the host
  rm -f article-raw
  textlink=$(printf '%s\n' "$link" | sed 's|http://|http://text.|')
  wget -O article-raw "$textlink"

  # Append the article; pages without the expected markers are skipped
  # (extract_article prints nothing in that case).
  extract_article article-raw >> "$TARGET_FILE" || continue
done < feed1

# Write the final progress value, close the HTML document and move it to the
# target directory. The variables are quoted so that names containing spaces
# or wildcard characters are passed on unchanged.
echo "100" >> percent
echo '</body></html>' >> "$TARGET_FILE"
mv "$TARGET_FILE" "$TARGET_DIR"

# Leave and remove the temporary directory, then disconnect via netagent.
cd ..
rm -rf myrss-tmp
/ebrmain/bin/netagent disconnect
