would be cleaner over the next weeks, because the regional channels have a different site setup. I will figure it out.
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class AdvancedUserRecipe(BasicNewsRecipe):
    """Calibre news recipe for Tagesschau.de plus several ARD feeds.

    The feeds point at differently laid-out sites (tagesschau.de, ard.de,
    boerse.ard.de; the removal lists also carry rbb-online entries), so
    ``remove_tags`` is the union of the per-site clutter lists, grouped by
    site below.
    """

    title = u'Tagesschau.de'
    __author__ = 'schuster'
    # Caution: the source publishes new items at a high frequency, so only
    # very recent articles are kept.
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'de'
    remove_javascript = True
    cover_url = 'http://www.einsfestival.de/bilder_pool/sendungsdetailseiten/tagesschau/multiteaser01.jpg'
    masthead_url = 'http://www.einsfestival.de/bilder_pool/sendungsdetailseiten/tagesschau/multiteaser01.jpg'
    # The CSS lines stay flush-left so the string content matches the
    # original, apart from the corrected 'margin-right:auto' in the img rule
    # (it previously read 'autom', which is not a valid CSS value).
    extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large; margin-left:auto; margin-right:auto; margin-bottom:1.5em;}
h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {margin-left:auto; margin-right:auto; margin-bottom:5em;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
b{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small; margin-left:auto; margin-right:auto; margin-bottom:0.5em;}
.standDatum{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small; margin-left:auto; margin-right:auto; margin-bottom:2.5em;}
.imgSubline{font-family:Arial,Helvetica,sans-serif;font-size:small; margin-bottom:1.5em; margin-top:0.3em;}
.articleImg{font-family:Arial,Helvetica,sans-serif;font-size:small; margin-bottom:1.5em; margin-top:0.3em;}
'''

    def preprocess_html(self, soup):
        """Replace every text-only hyperlink with its plain text.

        Anchors whose ``.string`` is ``None`` are left untouched.

        :param soup: parsed article page
        :return: the same ``soup`` object, modified in place
        """
        for anchor in soup.findAll('a'):
            if anchor.string is not None:
                anchor.replaceWith(anchor.string)
        return soup

    # May need extending for further sections, because the regional report
    # pages use their own layouts (see the per-site groups below).
    remove_tags = [
        dict(attrs={'class': [
            # --- tagesschau ---
            'zitatBox breit flashaudio', 'leftNavText', 'zitatBox img',
            'listContainer', 'NaviContainer', 'marke', 'rightColPadding',
            'directLinks', 'directLinks weltatlas', 'teaserBox metaBlock',
            'boxMoreLinks', 'inv', 'leftNavL1 secL1 leftNavText',
            'leftColPadding', 'singleCo', 'videoSubline',
            'videoDownloadLi aktiv', 'leftNavL1 leftNavText',
            # --- boerse.ard.de (own page layout); 'NaviContainer' is also
            # needed here and is already listed in the tagesschau group ---
            'rs', 'span-3', 'footerServices', 'span-3 last', 'impretc',
            'chart', 'toplink', 'span-4 bMeldungList', 'boxhead',
            'span-4 bMeldung', 'rsbox boxbody',
            # --- rbb-online (regional reporting) ---
            'headerRBB', 'containerService', 'containerMarginalA',
            'containerMarginalB', 'iconList', 'footer',
        ]}),
        # Element ids from the tagesschau list; the boerse.ard.de ids
        # ('ardFuss', 'logos', 'ardFussText', 'footerard') are a subset.
        dict(id=[
            'content', 'leftcols', 'rightcol', 'seitenanfang',
            'footerwraptop', 'ardFussText', 'footer', 'sendungenLeft',
            'leftcol', 'ardFuss', 'logos', 'footerard',
        ]),
        dict(name=['hjtrs', 'kud']),
    ]

    feeds = [
        (u'Tagesschau', u'http://www.tagesschau.de/xml/rss2'),
        (u'ARD - Ratgeber', u'http://www.ard.de/export/rss20/ratgeber/-/id=1874/format=rss20/6jw58y/index.xml'),
        (u'Kultur', u'http://www.ard.de/export/rss20/kultur/-/id=467344/format=rss20/1xblu0z/index.xml'),
        (u'Wissen', u'http://www.ard.de/export/rss20/wissen/-/id=918004/format=rss20/ks3dcs/index.xml'),
        (u'Börse', u'http://boerse.ard.de/rss_news.jsp'),
    ]