#!/bin/sh
#
#       License: GPLv2
#       Author: review (http://www.mobileread.com/forums/member.php?u=88891)
#       Author: fortwienix (http://www.mobileread.com/forums/member.php?u=109732)

# some settings

# name of the html file that collects the downloaded feed articles;
# the current date is part of the name
TARGET_FILE="Tagesanzeiger-$(date +%Y-%m-%d).html"
# directory that receives the finished html file
TARGET_DIR='/mnt/ext1/news'
# directory for the images that the html file links to,
# given relative to TARGET_DIR
IMAGE_DIR='imgsfeeds'
# download the images as well: 1 = yes, any other value = no
GET_IMAGES='1'
# path of the progressb.app that is shown while the download is running
PROGRESSBAR_APP='/mnt/ext1/applications/tools/progressb.app'

# These are the RSS feed URLs. Remove the # in front of every feed that you want
# to download articles from. Put a # directly in front of the URL (no blank in
# between) of every feed that you do not want to read.
# The # lines below are part of the quoted string, they are not shell comments;
# the script itself skips the entries that start with #.
# Do not remove the two lines with the quotes that enclose the feed list.

RSS_LINKS="
#http://www.tagesanzeiger.ch/rss.html
#http://www.tagesanzeiger.ch/rss_ticker.html
#http://www.tagesanzeiger.ch/zuerich/rss.html
http://www.tagesanzeiger.ch/schweiz/rss.html
http://www.tagesanzeiger.ch/ausland/rss.html
http://www.tagesanzeiger.ch/wirtschaft/rss.html
#http://www.tagesanzeiger.ch/sport/rss.html
#http://www.tagesanzeiger.ch/kultur/rss.html
#http://www.tagesanzeiger.ch/panorama/rss.html
#http://www.tagesanzeiger.ch/leben/rss.html
#http://www.tagesanzeiger.ch/auto/rss.html
#http://www.tagesanzeiger.ch/digital/rss.html
#http://www.tagesanzeiger.ch/wissen/rss.html
#http://www.tagesanzeiger.ch/dienste/RSS/story/rss.html
"

# here starts the script - usually changes are not necessary below but bugfixing
# and improvements can be done beyond this point

# ask the netagent of the device to connect (presumably brings the network up)
/ebrmain/bin/netagent connect

# All temporary files live below /mnt/ext1. Stop right away when that directory
# cannot be entered: the rm -rf below must never run in some other directory.
cd /mnt/ext1 || {
  /ebrmain/bin/netagent disconnect
  exit 1
}

# always start with a fresh, empty temporary directory and work inside of it
rm -rf myrss-tmp
if ! mkdir myrss-tmp || ! cd myrss-tmp; then
  /ebrmain/bin/netagent disconnect
  exit 1
fi

# create the target directory for the html files and, below it, the directory
# for the images; mkdir -p does nothing when a directory exists already
mkdir -p "${TARGET_DIR}"
mkdir -p "${TARGET_DIR}/${IMAGE_DIR}"


# Print the enabled feed URLs of the list text in $1, one per line.
# Lines whose first non-blank character is # are dropped as a whole, so
# "# http://..." disables a feed just like "#http://..." does. Single words
# that start with # are dropped as well.
enabled_feeds() {
  # deliberately unquoted: the list is split into words at blanks and newlines
  for feed_word in $(printf '%s\n' "$1" | grep -v '^[[:space:]]*#'); do
    case "$feed_word" in
      '#'*) ;;
      *) printf '%s\n' "$feed_word" ;;
    esac
  done
}

# Read an RSS document from stdin and print the article links it contains,
# one per line. Every tag is put on a line of its own first; only lines where
# digits directly precede </link> are kept, then the link tags and carriage
# returns are removed.
extract_article_links() {
  sed 's|><|>\n<|g' |
    grep -E "[[:digit:]]+</link>" |
    sed 's|</\?link>||g' |
    tr -d '\r'
}

# header of the html file that collects the articles
echo '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' > "$TARGET_FILE"

# initial value for the progress bar
echo "0" > percent

# download every enabled feed and append its article links to feed1
for url in $(enabled_feeds "$RSS_LINKS"); do
  wget -O feed "$url"
  extract_article_links < feed >> feed1
  if [ -z "$domain" ]; then
    # remember scheme and host of the first feed that is fetched
    domain=$(printf '%s\n' "$url" | sed -n 's|\(http://[^/]*\)/.*|\1|p')
  fi
done

# Show the progress bar if the executable exists. It is fed with every line
# that gets appended to the file "percent". The path is quoted so that an
# empty setting does not start tail -f without a reader and a path with
# blanks still works.
# NOTE(review): nothing in this script stops the background pipeline - confirm
# that progressb.app and tail terminate on their own after 100 was written.
if [ -e "${PROGRESSBAR_APP}" ]; then
  tail -f percent | "${PROGRESSBAR_APP}" &
fi

# Print the local file name for the image URL in $1: the http:// or https://
# prefix is removed and every / becomes _ so that the result is a flat name.
image_local_name() {
  printf '%s\n' "$1" | sed -e 's|https\?://||g' -e 's|/|_|g'
}

# Replace every src="$2" in the file $1 by src="$3".
# The URL is compared as literal text (awk index/substr), so characters like
# . & | ? in the URL or in the new name have no special meaning. The values
# are handed over in the environment because awk -v would interpret
# backslash sequences.
rewrite_image_src() {
  IMG_OLD_SRC="src=\"$2\"" IMG_NEW_SRC="src=\"$3\"" awk '
    BEGIN { old = ENVIRON["IMG_OLD_SRC"]; repl = ENVIRON["IMG_NEW_SRC"] }
    {
      rest = $0
      out = ""
      while ((pos = index(rest, old)) > 0) {
        out = out substr(rest, 1, pos - 1) repl
        rest = substr(rest, pos + length(old))
      }
      print out rest
    }
  ' "$1" > "$1.new" && mv "$1.new" "$1"
}

# make sure the link list exists even when no feed could be fetched
touch feed1

# count the articles that will be fetched (needed to show the progress when downloading)
total=$(grep -c ^ feed1)
current=0
while read -r link; do
  # append the rounded percentage of processed articles for the progress bar
  current=$((current + 1))
  echo "$current $total" | awk '{printf "%.0f\n",$1*100/$2}' >> percent

  # fetch the print view of the article and put every div on a line of its own
  rm -f article-raw
  wget -O article-raw "${link}/print.html"
  tr -d '\r' < article-raw |
    sed 's|<div|\n<div|g;s|</div>|</div>\n|g' > raw-formatted

  # line number of the last <div id="singleLeft">, the begin of the article;
  # skip the article when the marker is missing
  begin=$(grep -n '<div id="singleLeft">' raw-formatted | tail -1 | cut -d: -f1)
  if [ -z "$begin" ]; then
    continue
  fi
  # first publishedDate paragraph after the begin marks the end of the article
  # (line number relative to begin); skip the article when it is missing
  end=$(sed -n "${begin},\$p" raw-formatted |
    grep -n '<p class="publishedDate">' | head -1 | cut -d: -f1)
  if [ -z "$end" ]; then
    continue
  fi
  end=$((begin + end - 1))

  # extract the article content: h2 becomes h1, h3-h5 become bold text and
  # the contentbox and metaLine blocks are hidden
  sed -n "${begin},${end}p" raw-formatted |
    sed -e 's|<\(/\?\)h[2]>|<\1h1>|g' \
        -e 's|<\(/\?\)h[3-5]>|<\1b>|g' \
        -e 's|<div id="contentbox">|<div style="display:none;">|' \
        -e 's|<div id="metaLine">|<div style="display:none;">|' > article.html

  # check if we also want the images of the article
  if [ "$GET_IMAGES" = "1" ]; then
    # start each img tag at a new line
    sed 's|<img|\n<img|g' article.html > article.new
    # find all image urls
    sed -n 's|src="\([^"]*\)"|\n\1\n|pg' article.new |
      grep http | grep -Ei '(png|jpeg|jpg|gif)$' > images
    mv article.new article.html
    # read the list of extracted image links
    while read -r imgurl; do
      imgname=$(image_local_name "$imgurl")
      imgpath="${TARGET_DIR}/${IMAGE_DIR}/${imgname}"
      # Download the image unless a non-empty local copy exists. A failed
      # download is removed again, otherwise the empty file that wget leaves
      # behind would be mistaken for a cached image on the next run.
      if [ ! -s "$imgpath" ]; then
        wget -O "$imgpath" "$imgurl" || rm -f "$imgpath"
      fi
      # replace the old image url in the article with the new image url
      rewrite_image_src article.html "$imgurl" "${IMAGE_DIR}/${imgname}"
    done < images
  fi
  cat article.html >> "$TARGET_FILE"
done < feed1

# report completion to the progress bar and close the html document
echo "100" >> percent
echo '</body></html>' >> "$TARGET_FILE"
# move the finished file to its final location
mv "$TARGET_FILE" "$TARGET_DIR"
# Leave the temporary directory and delete it. The parent is entered by its
# absolute path and the rm only runs when that worked, so it can never hit a
# myrss-tmp directory somewhere else.
cd /mnt/ext1 && rm -rf myrss-tmp
/ebrmain/bin/netagent disconnect
