Hello,
Besides the Sueddeutsche Zeitung (the newspaper), there is also a magazine, the SZ Magazin.
Its articles sometimes span multiple pages; a print version is available for some articles, but only very inconsistently.
I looked at the "Adventure Gamers" multi-page example and adapted its code. I had the advantage that all subsequent pages are linked from the first page, so I skipped the recursion and implemented it the simpler, iterative way.
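To illustrate the difference: the Adventure Gamers recipe uses a recursive append_page helper that follows the single "next" link on every page, whereas here all pages are already linked from the first one, so a plain loop over the collected links is enough. Roughly like this (a sketch only; the selectors are made up, not the real ones from either site):
Code:
# Recursive pattern (Adventure Gamers style, paraphrased):
def append_page(self, soup, appendtag, position):
    next_link = soup.find('a', attrs={'class': 'next'})  # hypothetical selector
    if next_link is None:
        return
    next_soup = self.index_to_soup(next_link['href'])
    pagetext = next_soup.find('div', attrs={'id': 'article'})  # hypothetical selector
    appendtag.insert(position, pagetext)
    self.append_page(next_soup, appendtag, position + 1)

# Iterative pattern (what my recipe below does): collect all page
# URLs from the first page, then fetch and append them in order.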
Here is my current code.
Spoiler:
Code:
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2011, Nikolas Mangold <nmangold at gmail.com>'
'''
sz-magazin.de
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
from calibre.ebooks.BeautifulSoup import Comment

class SueddeutscheZeitungMagazin(BasicNewsRecipe):
    title = 'Sueddeutsche Zeitung Magazin'
    __author__ = 'Nikolas Mangold'
    description = 'Sueddeutsche Zeitung Magazin'
    category = 'Germany'
    no_stylesheets = True
    encoding = 'cp1252'
    remove_empty_feeds = True
    delay = 1
    PREFIX = 'http://sz-magazin.sueddeutsche.de'
    INDEX = PREFIX + '/hefte'
    use_embedded_content = False
    masthead_url = 'http://sz-magazin.sueddeutsche.de/img/general/logo.gif'
    language = 'de'
    publication_type = 'magazine'
    extra_css = ' body{font-family: Arial,Helvetica,sans-serif} '
    timefmt = '%W %Y'
    remove_tags_before = dict(attrs={'class':'vorspann'})
    remove_tags_after = dict(attrs={'id':'commentsContainer'})
    remove_tags = [
        dict(name='ul', attrs={'class':'textoptions'}),
        dict(name='div', attrs={'class':'BannerBug'}),
        dict(name='div', attrs={'id':'commentsContainer'}),
        dict(name='div', attrs={'class':'plugin-linkbox'}),
    ]

    def parse_index(self):
        # determine the current issue (year and week number)
        index = self.index_to_soup(self.INDEX)
        year_index = index.find('ul', attrs={'class':'hefte-jahre'})
        week_index = index.find('ul', attrs={'class':'heftindex'})
        year = self.tag_to_string(year_index.find('li')).strip()
        tmp = week_index.find('li').a
        week = self.tag_to_string(tmp)
        aktuelles_heft = self.PREFIX + tmp['href']
        # set the cover
        self.cover_url = 'http://sz-magazin.sueddeutsche.de/img/hefte/thumbs_l/{0}{1}.jpg'.format(year, week)
        # find the articles and put them into a single feed
        soup = self.index_to_soup(aktuelles_heft)
        content = soup.find('div', {'id':'maincontent'})
        feed = 'SZ Magazin {0}/{1}'.format(week, year)
        feeds = []
        articles = []
        for article in content.findAll('li'):
            txt = article.find('div', {'class':'text-holder'})
            if txt is None:
                continue
            link = txt.find('a')
            desc = txt.find('p')
            title = self.tag_to_string(link).strip()
            self.log('Found article ', title)
            url = self.PREFIX + link['href']
            articles.append({
                'title': title,
                'date': strftime(self.timefmt),
                'url': url,
                'description': self.tag_to_string(desc).strip() if desc else '',
            })
        feeds.append((feed, articles))
        return feeds

    def preprocess_html(self, soup):
        # determine whether this is a multi-page article; if not, bail out
        multipage = soup.find('ul', attrs={'class':'blaettern'})
        if multipage is None:
            return soup
        # collect the links to all subsequent pages, then delete the pager
        next_pages = []
        for page in multipage.findAll('li'):
            if page.a is None:
                continue
            nexturl = page.a['href']
            nexttitle = self.tag_to_string(page).strip()
            next_pages.append((self.PREFIX + nexturl, nexttitle))
        multipage.extract()
        # pull the article body out of each subsequent page and
        # append it at the end of the first page's article
        firstpage_article = soup.find('div', attrs={'id':'artikel'})
        position = len(firstpage_article.contents)  # TODO
        offset = 0
        for url, title in next_pages:
            next_soup = self.index_to_soup(url)
            next_article = next_soup.find('div', attrs={'id':'artikel'})
            banner = next_article.find('div', attrs={'class':'BannerBug'})
            if banner:
                banner.extract()
            firstpage_article.insert(position + offset, next_article)
            offset += 1  # each inserted tag takes up one slot in contents
        return firstpage_article
Things look pretty good right now, except for one issue with HTML comments in preprocess_html. Comments such as <!-- ad tag --> are duplicated as
Code:
<!--<!-- ad tag -->-->
I tried extracting the comments with
Code:
comments = next_article.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
but I think that by that point the text is no longer recognized as a Comment.
Why are the HTML comments getting wrapped in comment markers a second time?
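As a possible workaround I may strip the comments from the raw HTML before parsing, so that BeautifulSoup never creates Comment nodes in the first place. A rough sketch (assuming index_to_soup accepts raw=True to return the page source instead of a soup; untested):
Code:
import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup

COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)

def soup_without_comments(self, url):
    # fetch the raw page source and drop all HTML comments before
    # handing it to BeautifulSoup; raw=True is an assumption here
    raw = self.index_to_soup(url, raw=True)
    return BeautifulSoup(COMMENT_RE.sub('', raw))
In preprocess_html I would then call self.soup_without_comments(url) instead of self.index_to_soup(url) for the subsequent pages.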