Thread: FAZ-Net Update
01-26-2016, 04:59 PM   #11
Divingduck
Wizard
Posts: 1,166
Karma: 1410083
Join Date: Nov 2010
Location: Germany
Device: Sony PRS-650
A new update that skips some unwanted advertising (the 'berndsbox' and 'dertagbox' boxes are now removed as well).

Spoiler:
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Kovid Goyal <kovid at kovidgoyal.net>, Darko Miletic <darko at gmail.com>'
'''
Profile to download FAZ.NET
'''

from calibre.web.feeds.news import BasicNewsRecipe


class FazNet(BasicNewsRecipe):
    title = 'FAZ.NET'
    __author__ = 'Kovid Goyal, Darko Miletic, Armin Geller'  # AGe upd. V7 2016-01-26
    description = 'Frankfurter Allgemeine Zeitung'
    publisher = 'Frankfurter Allgemeine Zeitung GmbH'
    category = 'news, politics, Germany'
    use_embedded_content = False
    language = 'de'

    max_articles_per_feed = 30
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True

    keep_only_tags = [
        {'class': ['FAZArtikelEinleitung']},
        dict(name='div', attrs={'class': 'FAZSlimHeader'}),
        {'id': 'ArtikelTabContent_0'}
    ]

    remove_tags_after = [dict(name='div', attrs={'class': ['ArtikelFooter']})]
    remove_tags = [
        dict(name='div', attrs={'class': ['ArtikelFooter', 'clear']}),
        dict(name='div', attrs={'id': ['berndsbox', 'dertagbox']}),  # advertising boxes, AGe 2016-01-26
        dict(name='a', attrs={'title': ['Vergrößern']}),  # AGe 2014-10-22
        dict(name='img', attrs={'class': ['VideoCtrlIcon']}),  # AGe 2014-10-22
        dict(name='span', attrs={'class': ['shareAutor']})  # AGe 2014-10-22
    ]

    feeds = [
        ('FAZ.NET Aktuell', 'http://www.faz.net/aktuell/?rssview=1'),
        ('Politik', 'http://www.faz.net/aktuell/politik/?rssview=1'),
        ('Wirtschaft', 'http://www.faz.net/aktuell/wirtschaft/?rssview=1'),
        ('Feuilleton', 'http://www.faz.net/aktuell/feuilleton/?rssview=1'),
        ('Sport', 'http://www.faz.net/aktuell/sport/?rssview=1'),
        ('Lebensstil', 'http://www.faz.net/aktuell/lebensstil/?rssview=1'),
        ('Gesellschaft', 'http://www.faz.net/aktuell/gesellschaft/?rssview=1'),
        ('Finanzen', 'http://www.faz.net/aktuell/finanzen/?rssview=1'),
        ('Technik & Motor', 'http://www.faz.net/aktuell/technik-motor/?rssview=1'),
        ('Wissen', 'http://www.faz.net/aktuell/wissen/?rssview=1'),
        ('Reise', 'http://www.faz.net/aktuell/reise/?rssview=1'),
        ('Beruf & Chance', 'http://www.faz.net/aktuell/beruf-chance/?rssview=1'),
        ('Rhein-Main', 'http://www.faz.net/aktuell/rhein-main/?rssview=1')
    ]

    # AGe 2014-01-10: for multi-page articles, follow the 'Nächste Seite'
    # pager link and append each further page to the first one.
    INDEX = ''

    def append_page(self, soup, appendtag, position):
        pager = soup.find('a', attrs={'title': 'Nächste Seite'})
        if pager:
            nexturl = self.INDEX + pager['href']
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'class': 'FAZArtikelContent'})
            # Strip footer, comment, ad and recommendation blocks from the
            # fetched page before appending it.  # AGe 2014-10-22
            for cls in (
                    'ArtikelFooter', 'ArtikelAbbinder',
                    'ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content',
                    'Anzeige GoogleAdsBuehne', 'ThemenLinks', 'rechtehinweis',
                    'stageModule Ressortmodul Rubrikenkopf clearfix', 'VideoCtrlIcon',
                    'ArtikelAbbinder clearfix',
                    'stageModule clearfix GETS;tk;artikel.empfehlungen.weitere-artikel;tp;content'):
                div = texttag.find(attrs={'class': cls})
                if div is not None:
                    div.extract()
            for box_id in ('berndsbox', 'dertagbox'):  # AGe 2016-01-26
                div = texttag.find(attrs={'id': box_id})
                if div is not None:
                    div.extract()
            div = texttag.find(attrs={'title': 'Vergrößern'})  # AGe 2014-10-22
            if div is not None:
                div.extract()
            newpos = len(texttag.contents)
            self.append_page(soup2, texttag, newpos)  # recurse into further pages
            texttag.extract()
            pager.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        # The real image URL is delivered in data-src; copy it into src
        # so the images are actually downloaded.
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        for div in soup.findAll(id='ArticlePagerBottom'):
            div.extract()
        for div in soup.findAll('div', attrs={'class': 'clear'}):  # AGe add 2014-10-24
            div.extract()
        return soup
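
By the way, the data-src handling in preprocess_html is there because FAZ.NET appears to lazy-load its article images: the real URL sits in the data-src attribute while src only carries a placeholder. Here is a small standalone sketch of the same idea, using plain BeautifulSoup rather than calibre's bundled parser, just for illustration (the HTML snippet is made up):

from bs4 import BeautifulSoup  # illustration only; the recipe uses calibre's own soup

html = '<p><img src="placeholder.gif" data-src="http://www.faz.net/bild.jpg"/></p>'
soup = BeautifulSoup(html, 'html.parser')
for img in soup.find_all('img', attrs={'data-src': True}):
    img['src'] = img['data-src']  # promote the real URL into src
print(soup)  # src now points at http://www.faz.net/bild.jpg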


As always, let me know if there are any issues with this version.
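
If you want to try the recipe from the command line before adding it to calibre's GUI, ebook-convert can run a recipe file directly; something along these lines should do (the file name is just wherever you saved the source, and --test limits the fetch to a couple of articles per feed):

ebook-convert faznet.recipe .epub --test -vv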

DivingDuck
Attached Files
File Type: zip faznet_AGe_V7.zip (1.6 KB, 262 views)