My own recipe for downloading news from iHNed.cz (web version of Hospodářské Noviny, one of Czech newspapers)
It is downloading the articles not from RSS, but from homepage. You can change if it downloads all of them, or only the ones from today, by changing stahnout_vsechny to either True or False.
It is my first bigger python script, so it may not be completely efficient

and it is based on the example NYTimes script from
here.
Code:
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class IHNed(BasicNewsRecipe):
download_all = False
#True = stahuje vsechny z homepage
#False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
title = 'iHNed'
__author__ = 'Karel Bílek'
description = 'Zprávy z iHNed.cz'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = False
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
dict(style=['text-align: center;']),
dict(id=['r-bfull']),
dict(name=['script', 'noscript', 'style'])]
encoding = 'windows-1250'
no_stylesheets = True
remove_tags_before = dict(attrs={'class':'d-nadtit'})
remove_tags_after = dict(attrs={'class':'like'})
conversion_options = {
'linearize_tables' : True,
}
def preprocess_html(self, soup):
def makeurl(wat):
return "http://ihned.cz"+wat;
for h1 in soup.findAll('h1'):
a = h1.find('a')
if a:
string = a.string
if string:
soup.a.replaceWith(string)
for a in soup.findAll('a', href=True) :
cil = str(a['href'])
if cil.startswith("/") or cil.startswith("index"):
a['href'] = makeurl(cil)
return soup
def parse_index(self):
def makeurl(wat):
if wat.startswith("/") or wat.startswith("index"):
return "http://ihned.cz"+wat;
else:
return wat
articles = {} #vysledek, asi
key = None #soucasna sekce
ans = [] #vsechny sekce
articles["Hlavní"] = []
ans.append("Hlavní")
was = {}
def parse_subpage(url, name):
articles[name] = []
ans.append(name)
print "Tusom"
soup = self.index_to_soup(url)
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
print "Tusom2"
#the code is copypasted here because I don't know python. simple as that.
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
if otv234:
for ow in otv234.findAll(True, attrs={'class':['ow']}):
a = ow.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
prx = ow.find(True, attrs={'class':['prx']});
if prx:
description = str(prx.string)
nfo = ow.find(True, attrs={'class':['nfo']});
pubdate = ''
if nfo:
dtime = time.localtime();
day = dtime[2]
month = dtime[1]
pubdate = strftime('%d. %m.')
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
if self.download_all or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
if not title in was:
articles[name].append(
dict(title=title, url=makeurl(a['href']), date=pubdate,
description=description,
content=''))
soup = self.index_to_soup('http://ihned.cz/')
otvirak = soup.find(True, attrs={'class':['otv']})
if otvirak:
a = otvirak.find('a', href=True)
title = self.tag_to_string(a, use_alt=True).strip()
txt = otvirak.find(True, attrs={'class':['txt']})
description = ''
if txt:
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
if match:
description = match.group(1)
pubdate = strftime('%d. %m.')
feed = "Hlavní"
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
if otvirak2345:
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
title = self.tag_to_string(a, use_alt=True).strip()
description=''
span = otv2.find('span');
if span:
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
if match:
description = match.group(1)
feed = "Hlavní"
pubdate = strftime('%d. %m.')
articles[feed].append(
dict(title=title, url=(a['href']), date=pubdate,
description=description,
content=''))
was[title]=1
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
parse_subpage("http://domaci.ihned.cz", "Domácí")
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
parse_subpage("http://finweb.ihned.cz/", "Finance");
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
parse_subpage("http://kultura.ihned.cz/", "Kultura")
parse_subpage("http://sport.ihned.cz/", "Sport");
#seradi kategorie
ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
#vrati, ale pouze, kdyz je v kategoriich...
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
return ans
edit: I made a big syntax error. This is the correct version. Sorry about that...