Okay, I think this should work now. There is still room for improvement (images, line breaks, additional blogs).
The issue regarding HTML comments is still unclear to me.
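If the goal is simply to drop all HTML comments generically rather than matching on 'google_ad', something like this might do (untested sketch; it assumes calibre's bundled BeautifulSoup exports the Comment node class, as plain BeautifulSoup 3 does):
Code:
from calibre.ebooks.BeautifulSoup import Comment

def remove_html_comments(soup):
    # HTML comments are parsed into Comment nodes, so matching on the node
    # type catches every comment, not just the google_ad ones
    for node in soup.findAll(text=lambda text: isinstance(text, Comment)):
        node.extract()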
In addition, I understand that remove_tags is applied after preprocess_html. Is there a smart way to re-implement remove_tags? Is there a way to process the subsequent pages the same way as any other downloaded page?
But for now, have fun with this and let me know if it works for you as well.
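One way to re-implement it for the manually fetched pages might be a small helper like the one below (untested sketch; apply_remove_tags is a hypothetical name, and it assumes every remove_tags entry is a dict of findAll keyword arguments, which holds for the recipe below). It could be called on each next_article inside preprocess_html:
Code:
def apply_remove_tags(self, soup):
    # hypothetical helper: re-apply the recipe's remove_tags specs
    # to a page that was fetched outside the normal pipeline
    for spec in self.remove_tags:
        for tag in soup.findAll(**spec):
            tag.extract()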
Spoiler:
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Nikolas Mangold <nmangold at gmail.com>'
'''
sz-magazin.de
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
import re
class SueddeutscheZeitungMagazin(BasicNewsRecipe):
    title = u'Süddeutsche Zeitung Magazin'
    __author__ = 'Nikolas Mangold'
    description = u'Süddeutsche Zeitung Magazin'
    publisher = u'Magazin Verlagsgesellschaft Süddeutsche Zeitung mbH'
    category = 'Germany'
    no_stylesheets = True
    encoding = 'cp1252'
    remove_empty_feeds = True
    delay = 1
    PREFIX = 'http://sz-magazin.sueddeutsche.de'
    INDEX = PREFIX + '/hefte'
    use_embedded_content = False
    masthead_url = 'http://sz-magazin.sueddeutsche.de/img/general/logo.gif'
    language = 'de'
    publication_type = 'magazine'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif} '
    timefmt = '%W %Y'

    conversion_options = {
        'comment'          : description,
        'tags'             : category,
        'publisher'        : publisher,
        'language'         : language,
        'linearize_tables' : True,
    }

    remove_tags_before = dict(attrs={'class':'vorspann'})
    remove_tags_after = dict(attrs={'id':'commentsContainer'})
    remove_tags = [
        dict(name='ul', attrs={'class':'textoptions'}),
        dict(name='div', attrs={'class':'BannerBug'}),
        dict(name='div', attrs={'id':'commentsContainer'}),
        dict(name='div', attrs={'class':'plugin-linkbox'}), # not working
        dict(name='div', attrs={'id':'galleryInfo0'}),
        dict(name='div', attrs={'class':'control'}),
    ]
    def parse_index(self):
        feeds = []
        # determine the current issue from the archive page
        index = self.index_to_soup(self.INDEX)
        year_index = index.find('ul', attrs={'class':'hefte-jahre'})
        week_index = index.find('ul', attrs={'class':'heftindex'})
        year = self.tag_to_string(year_index.find('li')).strip()
        tmp = week_index.find('li').a
        week = self.tag_to_string(tmp)
        aktuelles_heft = self.PREFIX + tmp['href']
        # set the cover; the thumbnail file name is built from year and week
        self.cover_url = '{0}/img/hefte/thumbs_l/{1}{2}.jpg'.format(self.PREFIX, year, week)
        # find the articles of the issue and add them to a single feed
        soup = self.index_to_soup(aktuelles_heft)
        content = soup.find('div', {'id':'maincontent'})
        mainfeed = 'SZ Magazin {0}/{1}'.format(week, year)
        articles = []
        for article in content.findAll('li'):
            txt = article.find('div', {'class':'text-holder'})
            if txt is None:
                continue
            link = txt.find('a')
            desc = txt.find('p')
            title = self.tag_to_string(link).strip()
            self.log('Found article ', title)
            url = self.PREFIX + link['href']
            # 'description' is the key calibre expects, and it needs a
            # string, not the <p> tag itself
            articles.append({'title': title, 'date': strftime(self.timefmt),
                             'url': url,
                             'description': self.tag_to_string(desc) if desc is not None else ''})
        feeds.append((mainfeed, articles))
        return feeds
    def preprocess_html(self, soup):
        # determine if this is a multipage article; if not, bail out
        multipage = soup.find('ul', attrs={'class':'blaettern'})
        if multipage is None:
            return soup
        # collect the URLs of all subsequent pages, then drop the pager
        next_pages = []
        for page in multipage.findAll('li'):
            if page.a is None:
                continue
            next_pages.append(self.PREFIX + page.a['href'])
        multipage.extract()
        # move the first page's article below its header, then append the
        # article part of every subsequent page at the end
        firstpage = soup.find('body')
        firstpage_header = firstpage.find('div', attrs={'class':'vorspann'})
        firstpage_article = firstpage.find('div', attrs={'id':'artikel'})
        firstpage_header.insert(len(firstpage_header.contents), firstpage_article)
        for url in next_pages:
            next_soup = self.index_to_soup(url)
            next_article = next_soup.find('div', attrs={'id':'artikel'})
            # remove the banner ad
            banner = next_article.find('div', attrs={'class':'BannerBug'})
            if banner:
                banner.extract()
            # remove the remaining google_ad HTML comments
            for comment in next_article.findAll(text=re.compile('google_ad')):
                comment.extract()
            firstpage_header.insert(len(firstpage_header.contents), next_article)
        # return only the assembled header-plus-article fragment
        return firstpage_header
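To try it without a full download, calibre can run the recipe straight from the command line; with --test it only fetches a couple of articles per feed (sz_magazin.recipe is just a placeholder, adjust it to wherever you saved the recipe):
Code:
ebook-convert sz_magazin.recipe .epub --test -vv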