Hi SZ Recipe Community!
First of all, thanks for the great SZ recipe; it is a welcome alternative to Amazon's expensive SZ subscription!
Since the next day's paper can already be downloaded from the SZ e-paper website after about 7 pm, I was wondering whether this is also possible with the Calibre recipe. I tried to work out the ID pattern in the download URL, e.g.
http://epaper.sueddeutsche.de/app/ep...lt/1422486000/.
Apparently the IDs from 26 Jan to 29 Jan all have the fixed form 1422wxyz00, where wxyz increases each day:
26 Jan: wxyz = 2268
28 Jan: wxyz = 3996
29 Jan: wxyz = 4860
The ID difference between two consecutive days is 4860 - 3996 = 864.
My code is below. So far I have only tested it by downloading tomorrow's paper (an hour ago), which worked well. I expect the code will work until 30 Jan. It will be exciting to see what the URL ID actually looks like in February :-) Most probably the ID calculation will have to be adjusted...
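As a side note, once the trailing 00 is included the daily delta becomes 86400, which is exactly the number of seconds in a day, and 1422486000 is the Unix timestamp for 29 Jan 2015, 00:00 German local time. So the ID may simply be a Unix timestamp for midnight of the paper's date, which would survive the month change without any adjustment. A minimal sketch of that idea, assuming the interpretation is right and the machine runs on German local time (untested against the live site):
Code:
import time
from datetime import datetime, timedelta

# Assumption: the URL ID is the Unix timestamp of midnight (local time)
# on the paper's publication date.
tomorrow = datetime.now() + timedelta(days=1)
midnight = tomorrow.replace(hour=0, minute=0, second=0, microsecond=0)
id = str(int(time.mktime(midnight.timetuple())))
# for the paper of 29 Jan 2015 this yields '1422486000'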
I also haven't adjusted the date that is entered into the Calibre database, as I don't currently know how to advance it by one day...
Code:
,'date' :strftime(self.timefmt)
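In case it helps, the date could probably be advanced by one day with a timedelta. A minimal sketch, assuming calibre's strftime accepts a time tuple as its second argument (untested):
Code:
from datetime import datetime, timedelta

# untested sketch: format tomorrow's date instead of today's
,'date' : strftime(self.timefmt, (datetime.now() + timedelta(days=1)).timetuple())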
I hope this code is useful for all the evening readers out there who, like me, have to leave the house very early in the morning :-)
My additional code, in short:
Code:
from datetime import datetime

d = 864                    # ID delta between two consecutive days
d29 = 4860                 # start ID (wxyz) on day of year 29 (29 Jan)
now = datetime.now()
dy = int(now.strftime('%j'))   # day of the year today
dyt = dy + 1               # day of the year tomorrow
dg = dyt - 29              # days after 29 Jan
id_d = d * dg              # ID offset relative to 29 Jan
d_d = d29 + id_d
id = "1422" + str(d_d) + "00"  # "1422" = fixed prefix in Jan 2015
feeds = [
(u'Politik' , INDEX + 'Politik/{}/'.format(id) )
...
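One caveat with the string concatenation: as soon as d_d reaches five digits (around 4 Feb), the ID gets one digit too many. If the ID is really a timestamp, as speculated above, building it arithmetically instead should roll the prefix over correctly; a hedged alternative for the last line:
Code:
# assumption: id = 1422000000 + 100 * d_d, so the '1422' prefix becomes
# '1423' by itself in February
id = str(1422000000 + 100 * d_d)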
The whole adapted SZ recipe:
Code:
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.sueddeutsche.de/sz/
'''
# History
# 2014.10.02 Fixed URL problem by lala-rob (web@lala-rob.de)
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
from datetime import datetime
class SueddeutcheZeitung(BasicNewsRecipe):
    title                = u'Süddeutsche Zeitung'
    __author__           = 'Darko Miletic'
    description          = 'News from Germany. Access to paid content.'
    publisher            = u'Süddeutsche Zeitung'
    category             = 'news, politics, Germany'
    no_stylesheets       = True
    oldest_article       = 2
    encoding             = 'iso-8859-1'
    needs_subscription   = True
    remove_empty_feeds   = True
    delay                = 1
    cover_source         = 'http://www.sueddeutsche.de/verlag'
    PREFIX               = 'http://epaper.sueddeutsche.de'
    INDEX                = PREFIX + '/app/epaper/textversion/'
    use_embedded_content = False
    masthead_url         = 'http://pix.sueddeutsche.de/img/layout/header/SZ_solo288x31.gif'
    language             = 'de'
    publication_type     = 'newspaper'
    extra_css            = ' body{font-family: Arial,Helvetica,sans-serif} '

    conversion_options = {
        'comment'            : description
        , 'tags'             : category
        , 'publisher'        : publisher
        , 'language'         : language
        , 'linearize_tables' : True
    }

    remove_attributes = ['height','width','style']
    def get_browser(self):
        browser = BasicNewsRecipe.get_browser(self)
        # Login via fetching of Streiflicht -> fill out login request
        # url = self.root_url + 'show.php?id=streif'
        url = 'https://id.sueddeutsche.de/login'
        browser.open(url)
        browser.select_form(nr=0)  # select the first form
        browser['login'] = self.username
        browser['password'] = self.password
        browser.submit()
        return browser
    remove_tags = [
        dict(attrs={'class':'hidePrint'})
        , dict(name=['link','object','embed','base','iframe','br'])
    ]
    keep_only_tags = [dict(attrs={'class':'artikelBox'})]
    remove_tags_before = dict(attrs={'class':'artikelTitel'})
    remove_tags_after = dict(attrs={'class':'author'})
    # P.S. 28.01.15
    # BEG
    d = 864                    # ID delta between two consecutive days
    d29 = 4860                 # start ID (wxyz) on day of year 29 (29 Jan)
    now = datetime.now()
    dy = int(now.strftime('%j'))   # day of the year today
    dyt = dy + 1               # day of the year tomorrow
    dg = dyt - 29              # days after 29 Jan
    id_d = d * dg              # ID offset relative to 29 Jan
    d_d = d29 + id_d
    id = "1422" + str(d_d) + "00"  # "1422" = fixed prefix in Jan 2015
    # END
    feeds = [
        (u'Politik'             , INDEX + 'Politik/{}/'.format(id))
        , (u'Seite drei'        , INDEX + 'Seite+drei/{}/'.format(id))
        , (u'Thema des Tages'   , INDEX + 'Thema+des+Tages/{}/'.format(id))
        , (u'Meinungsseite'     , INDEX + 'Meinungsseite/{}/'.format(id))
        , (u'Wissen'            , INDEX + 'Wissen/{}/'.format(id))
        , (u'Panorama'          , INDEX + 'Panorama/{}/'.format(id))
        , (u'Feuilleton'        , INDEX + 'Feuilleton/{}/'.format(id))
        , (u'Medien'            , INDEX + 'Medien/{}/'.format(id))
        , (u'Wirtschaft'        , INDEX + 'Wirtschaft/{}/'.format(id))
        , (u'Sport'             , INDEX + 'Sport/{}/'.format(id))
        , (u'Bayern'            , INDEX + 'Bayern/{}/'.format(id))
        , (u'Muenchen'          , INDEX + 'M%FCnchen/{}/'.format(id))
        , (u'Muenchen City'     , INDEX + 'M%FCnchen+City/{}/'.format(id))
        , (u'Jetzt.de'          , INDEX + 'Jetzt.de/{}/'.format(id))
        , (u'Reise'             , INDEX + 'Reise/{}/'.format(id))
        , (u'SZ Extra'          , INDEX + 'SZ+Extra/{}/'.format(id))
        , (u'Wochenende'        , INDEX + 'SZ+am+Wochenende/{}/'.format(id))
        , (u'Stellen-Markt'     , INDEX + 'Stellen-Markt/{}/'.format(id))
        , (u'Motormarkt'        , INDEX + 'Motormarkt/{}/'.format(id))
        , (u'Immobilien-Markt'  , INDEX + 'Immobilien-Markt/{}/'.format(id))
        , (u'Thema'             , INDEX + 'Thema/{}/'.format(id))
        , (u'Forum'             , INDEX + 'Forum/{}/'.format(id))
        , (u'Leute'             , INDEX + 'Leute/{}/'.format(id))
        , (u'Jugend'            , INDEX + 'Jugend/{}/'.format(id))
        , (u'Beilage'           , INDEX + 'Beilage/{}/'.format(id))
    ]
    def get_cover_url(self):
        cover_source_soup = self.index_to_soup(self.cover_source)
        preview_image_div = cover_source_soup.find(attrs={'class':'preview-image'})
        return preview_image_div.div.img['src']
    def parse_index(self):
        # The section ID is already baked into every feed URL above, so the
        # old ID discovery on the index page is no longer needed.
        totalfeeds = []
        lfeeds = self.get_feeds()
        for feedobj in lfeeds:
            feedtitle, feedurl = feedobj
            self.report_progress(0, _('Fetching feed') + ' %s...' % (feedtitle if feedtitle else feedurl))
            articles = []
            soup = self.index_to_soup(feedurl)
            tbl = soup.find(attrs={'class':'szprintd'})
            for item in tbl.findAll(name='td', attrs={'class':'topthema'}):
                atag = item.find(attrs={'class':'Titel'}).a
                ptag = item.find('p')
                stag = ptag.find('script')
                if stag:
                    stag.extract()
                url = self.PREFIX + atag['href']
                title = self.tag_to_string(atag)
                description = self.tag_to_string(ptag)
                articles.append({
                    'title'         : title
                    , 'date'        : strftime(self.timefmt)
                    , 'url'         : url
                    , 'description' : description
                })
            totalfeeds.append((feedtitle, articles))
        return totalfeeds