Hi,
first of all: Happy New Year to all of you!
The current brand eins recipe has a flaw: it only handles issues of the latest year. With the addition of the (current, and thus still incomplete) January 2011 issue to the archive, it is no longer possible to download issues from 2010, in particular the December issue, which is now the latest complete one.
So I fixed the recipe; it is now possible to download issues from all years.
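For those curious, the heart of the change: every issue link on the archive page carries an onmouseover handler like "switch_magazine(2010, 12)", and the recipe now turns that into a sortable "YYYYMM" key so issues order correctly across all years instead of only the current one. A standalone sketch of the idea (the sample values are made up):
Code:
import re

def issue_key(onmouseover):
    # "switch_magazine(2010, 12)" -> "201012"
    match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", onmouseover)
    return "%04i%02i" % (int(match.group(1)), int(match.group(2)))

keys = [issue_key(s) for s in ("switch_magazine(2011, 1)", "switch_magazine(2010, 12)")]
keys.sort(reverse=True)
# keys[0] is now the newest issue ("201101"), keys[1] the latest complete one ("201012")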
Ciao,
Steffen
Code:
#!/usr/bin/env python
# -*- coding: utf-8; mode: python -*-
__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__ = '0.97'
''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe
class BrandEins(BasicNewsRecipe):

    title = u'brand eins'
    __author__ = 'Constantin Hofstetter'
    description = u'Wirtschaftsmagazin'
    publisher = 'brandeins.de'
    category = 'politics, business, wirtschaft, Germany'
    use_embedded_content = False
    lang = 'de-DE'
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de'
    publication_type = 'magazine'
    needs_subscription = 'optional'
    # Prevent the conversion date from being appended to the title
    timefmt = ''
    # 2 is the latest full magazine (default)
    # 1 is the newest (but not full)
    # 3 is the one before 2, etc.
    # This value can be set via the username field.
    default_issue = 2

    keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

    '''
    brandeins.de
    '''
    def postprocess_html(self, soup, first):
        # Move the image from the sidebar right below the h3
        first_h3 = soup.find(name='div', attrs={'id':'theContent'}).find('h3')
        for imgdiv in soup.findAll(name='div', attrs={'class':'single_image'}):
            if len(first_h3.findNextSiblings('div', {'class':'intro'})) >= 1:
                # An intro text exists, so place the image right after it
                first_h3.findNextSiblings('div', {'class':'intro'})[0].parent.insert(4, imgdiv)
            else:
                first_h3.parent.insert(2, imgdiv)
        # Now remove the sidebar
        soup.find(name='div', attrs={'id':'sidebar'}).extract()
        # Remove the rating image (stars) from the h3
        for img in first_h3.findAll(name='img'):
            img.extract()
        # Mark the intro texts as italic
        for div in soup.findAll(name='div', attrs={'class':'intro'}):
            for p in div.findAll('p'):
                content = self.tag_to_string(p)
                new_p = "<p><i>" + content + "</i></p>"
                p.replaceWith(new_p)
        # Change <h3> to <h1>
        header = soup.find("h3")
        if header:
            tag = Tag(soup, "h1")
            tag.insert(0, header.contents[0])
            header.replaceWith(tag)
        return soup
    def get_cover(self, soup):
        cover_url = None
        cover_item = soup.find('div', attrs={'class': 'cover_image'})
        if cover_item:
            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
        return cover_url
    def parse_index(self):
        feeds = []
        issue_map = {}
        archive = "http://www.brandeins.de/archiv.html"

        issue = self.default_issue
        if self.username:
            try:
                issue = int(self.username)
            except ValueError:
                pass

        soup = self.index_to_soup(archive)
        issue_list = soup.findAll('div', attrs={'class': 'tx-brandeinsmagazine-pi1'})[0].findAll('a')
        issue_list = [i for i in issue_list if i.get('onmouseover', False)]
        for i in issue_list:
            issue_number_string = i.get('onmouseover', False)
            if issue_number_string:
                # Build a sortable "YYYYMM" key from the onmouseover handler,
                # e.g. "switch_magazine(2010, 12)" becomes "201012"
                match = re.match(r"^switch_magazine\(([0-9]+), ([0-9]+)\)$", issue_number_string)
                issue_number = "%04i%02i" % (int(match.group(1)), int(match.group(2)))
                issue_map[issue_number] = i
        keys = issue_map.keys()
        keys.sort(reverse=True)
        selected_issue = issue_map[keys[issue - 1]]
        url = selected_issue.get('href', False)
        # Build the title for the magazine from the title of the cover image: take the issue and the year
        self.title = "brand eins " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", selected_issue.find('img').get('title', False)).group('date')
        url = 'http://brandeins.de/' + url
        # url = "http://www.brandeins.de/archiv/magazin/tierisch.html"

        titles_and_articles = self.brand_eins_parse_issue(url)
        if titles_and_articles:
            for title, articles in titles_and_articles:
                feeds.append((title, articles))
        return feeds
    def brand_eins_parse_issue(self, url):
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [soup.find('div', attrs={'class':'subColumnLeft articleList'}), soup.find('div', attrs={'class':'subColumnRight articleList'})]

        titles_and_articles = []
        current_articles = []
        chapter_title = "Editorial"
        self.log('Found Chapter:', chapter_title)

        # Remove the last list of links (that's just the impressum and the 'gewinnspiel')
        article_lists[1].findAll('ul')[-1].extract()

        for article_list in article_lists:
            for chapter in article_list.findAll('ul'):
                if len(chapter.findPreviousSiblings('h3')) >= 1:
                    new_chapter_title = string.capwords(self.tag_to_string(chapter.findPreviousSiblings('h3')[0]))
                    if new_chapter_title != chapter_title:
                        titles_and_articles.append([chapter_title, current_articles])
                        current_articles = []
                        self.log('Found Chapter:', new_chapter_title)
                    chapter_title = new_chapter_title
                for li in chapter.findAll('li'):
                    a = li.find('a', href=True)
                    if a is None:
                        continue
                    title = self.tag_to_string(a)
                    url = a.get('href', False)
                    if not url or not title:
                        continue
                    url = 'http://brandeins.de/' + url
                    if len(a.parent.findNextSiblings('p')) >= 1:
                        # The next <p> sibling holds the article teaser
                        description = self.tag_to_string(a.parent.findNextSiblings('p')[0])
                    else:
                        description = ''
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    self.log('\t\t\t', description)
                    current_articles.append({'title': title, 'url': url, 'description': description, 'date': ''})
        titles_and_articles.append([chapter_title, current_articles])
        return titles_and_articles
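One note on usage: by default the recipe fetches the latest complete issue. If you want a different one, enter a number in the username field of the recipe settings: 1 is the newest (but incomplete) issue, 2 the latest complete one, 3 the one before that, and so on.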