#1
Member
Posts: 14
Karma: 560
Join Date: Jan 2011
Device: Kindle
iHNed.cz (Czech news source)
My own recipe for downloading news from iHNed.cz (the web version of Hospodářské Noviny, one of the Czech newspapers).
It downloads the articles from the homepage rather than from RSS. You can choose whether it downloads everything or only today's articles by setting download_all to either True or False. It is my first bigger Python script, so it may not be completely efficient.
Code:
import re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe

class IHNed(BasicNewsRecipe):

    # True  = download every article linked from the homepage/section pages
    # False = download only today's articles (from the day the script is run)
    download_all = False

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [
        dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
        dict(style=['text-align: center;']),
        dict(id=['r-bfull']),
        dict(name=['script', 'noscript', 'style']),
    ]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class':'d-nadtit'})
    remove_tags_after = dict(attrs={'class':'like'})
    conversion_options = {
        'linearize_tables': True,
    }

    def preprocess_html(self, soup):
        def makeurl(wat):
            return "http://ihned.cz" + wat

        # Replace the linked headline with its plain text
        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                text = a.string
                if text:
                    a.replaceWith(text)

        # Turn relative links into absolute ones
        for a in soup.findAll('a', href=True):
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)

        return soup

    def parse_index(self):
        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz" + wat
            return wat

        articles = {}   # result: section name -> list of articles
        ans = []        # all sections, in order

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}        # titles already added, to avoid duplicates

        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)

            # The big "opener" article at the top of the section page
            otvirak = soup.find(True, attrs={'class':['otv']})
            if otvirak:
                # the code is copypasted here because I don't know python. simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class':['txt']})
                description = ''
                if txt:
                    match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if not title in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description, content=''))

            # The smaller articles below the opener
            otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class':['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description = ''
                    prx = ow.find(True, attrs={'class':['prx']})
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class':['nfo']})
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime()
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        # The 'nfo' block contains the article date as "d.m."
                        match = re.search(r'([0-9]+)\.([0-9]+)\.', str(nfo))

                        if self.download_all or (match and
                                int(day) == int(match.group(1)) and
                                int(month) == int(match.group(2))):
                            if not title in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description, content=''))

        # Homepage: the main opener goes into the "Hlavní" (main) section
        soup = self.index_to_soup('http://ihned.cz/')

        otvirak = soup.find(True, attrs={'class':['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class':['txt']})
            description = ''
            if txt:
                match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=a['href'], date=pubdate,
                     description=description, content=''))
            was[title] = 1

        # The secondary openers on the homepage
        otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
                a = otv2.find('a', attrs={'class':['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                span = otv2.find('span')
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span))
                    if match:
                        description = match.group(1)

                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=a['href'], date=pubdate,
                         description=description, content=''))
                was[title] = 1

        # Section pages
        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
        parse_subpage("http://domaci.ihned.cz", "Domácí")
        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
        parse_subpage("http://finweb.ihned.cz/", "Finance")
        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
        parse_subpage("http://kultura.ihned.cz/", "Kultura")
        parse_subpage("http://sport.ihned.cz/", "Sport")

        # Sort the sections
        ans = self.sort_index_by(ans, {'Hlavní':1, 'Domácí':2, 'Zahraničí':3,
            'Komentáře':4, 'Ekonomika':5, 'Finance':6, 'DigiWeb':7,
            'Kultura':8, 'Sport':9})

        # Return only the sections that actually have articles
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
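To illustrate the today-only behaviour described above, here is a small stand-alone sketch (not part of the recipe, and simplified from it) of the date check that runs when download_all is False: an article from a section page is kept only if the "day.month." date scraped from its nfo block matches the current day and month.
Code:
import re, time

def is_from_today(nfo_text):
    # Look for a "day.month." pattern, e.g. "18.1." in the article's nfo block
    match = re.search(r'([0-9]+)\.([0-9]+)\.', nfo_text)
    if not match:
        return False
    today = time.localtime()
    # today[2] is the day of the month, today[1] is the month
    return int(match.group(1)) == today[2] and int(match.group(2)) == today[1]

print(is_from_today('Publikovano 18.1. 09:30'))  # True only when run on 18 January
If you want to try the whole recipe from the command line before adding it as a custom news source in the GUI, something like ebook-convert ihned.recipe ihned.epub should work (the file names here are just examples).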
#2
creator of calibre
Posts: 45,345
Karma: 27182818
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
Thanks, added