Hi,
I took the liberty of enhancing the existing brand eins recipe.
Here is my changelog:
NEW: The issue to download can be selected via the username field.
NEW: Add cover image.
NEW: Prevent the conversion date from being appended to the title.
NEW: Remove the "This article was downloaded by calibre from..." section from the bottom of each page.
FIXED: "brand eins" is written in lowercase.
And here is the recipe:
Code:
#!/usr/bin/env python
# -*- coding: utf-8 mode: python -*-
__license__ = 'GPL v3'
__copyright__ = '2010, Constantin Hofstetter <consti at consti.de>, Steffen Siebert <calibre at steffensiebert.de>'
__version__ = '0.96'
''' http://brandeins.de - Wirtschaftsmagazin '''
import re
import string
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds.templates import Template, CLASS
from lxml.html.builder import HTML, HEAD, TITLE, STYLE, DIV, BODY, BR, A, HR, UL
class MyNavBarTemplate(Template):
    """
    Same as calibre.web.feeds.templates.NavBarTemplate but without the
    'This article was downloaded by calibre from...'
    text at the bottom.
    """
    def _generate(self, bottom, feed, art, number_of_articles_in_feed,
            two_levels, url, __appname__, prefix='', center=True,
            extra_css=None, style=None):
        # Build the navbar document head, attaching any caller-supplied CSS.
        head = HEAD(TITLE('navbar'))
        if style:
            head.append(STYLE(style, type='text/css'))
        if extra_css:
            head.append(STYLE(extra_css, type='text/css'))
        # Normalise the link prefix so the hrefs below can simply concatenate.
        if prefix and not prefix.endswith('/'):
            prefix += '/'
        align = 'center' if center else 'left'
        navbar = DIV(CLASS('calibre_navbar', 'calibre_rescale_70',
            style='text-align:'+align))
        if bottom:
            if not url.startswith('file://'):
                # Online article: emit only a separator. This is the spot
                # where the stock NavBarTemplate inserts the 'This article
                # was downloaded by calibre from...' paragraph we omit.
                navbar.append(HR())
            else:
                # Local file: build the 'Next | Section Menu | Main Menu'
                # link row. 'next' targets the following article, or the
                # next feed when this is the last article of the feed.
                next = 'feed_%d'%(feed+1) if art == number_of_articles_in_feed - 1 \
                        else 'article_%d'%(art+1)
                up = '../..' if art == number_of_articles_in_feed - 1 else '..'
                href = '%s%s/%s/index.html'%(prefix, up, next)
                navbar.text = '| '
                navbar.append(A('Next', href=href))
                href = '%s../index.html#article_%d'%(prefix, art)
                # Set the tail of the most recently appended child, i.e.
                # put ' | ' between consecutive links. (.next() on the
                # iterator is Python 2 style.)
                navbar.iterchildren(reversed=True).next().tail = ' | '
                navbar.append(A('Section Menu', href=href))
                href = '%s../../index.html#feed_%d'%(prefix, feed)
                navbar.iterchildren(reversed=True).next().tail = ' | '
                navbar.append(A('Main Menu', href=href))
        if art > 0 and not bottom:
            # Top navbar of any article after the first: link back to the
            # previous article.
            href = '%s../article_%d/index.html'%(prefix, art-1)
            navbar.iterchildren(reversed=True).next().tail = ' | '
            navbar.append(A('Previous', href=href))
        # Trailing ' | ' after the last appended link.
        # NOTE(review): this raises StopIteration when navbar has no
        # children (e.g. top navbar of the first article) -- confirm
        # against calibre's stock NavBarTemplate; the original paste lost
        # its indentation, so the intended nesting of these lines is a
        # reconstruction.
        navbar.iterchildren(reversed=True).next().tail = ' | '
        if not bottom:
            navbar.append(HR())
        self.root = HTML(head, BODY(navbar))
class BrandEins(BasicNewsRecipe):
    """Calibre recipe for http://brandeins.de - brand eins Wirtschaftsmagazin.

    Downloads one issue of the magazine. Which issue is fetched can be
    selected through the subscription username field (see parse_index).
    """

    title = u'brand eins'
    __author__ = 'Constantin Hofstetter'
    description = u'Wirtschaftsmagazin'
    publisher = 'brandeins.de'
    category = 'politics, business, wirtschaft, Germany'
    use_embedded_content = False
    lang = 'de-DE'
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de'
    publication_type = 'magazine'
    needs_subscription = True
    # Empty timefmt prevents the conversion date from being appended to
    # the e-book title.
    timefmt = ''
    # Which issue to download, counted from the newest:
    #   1 is the newest (but not yet complete) issue,
    #   2 is the last complete magazine (default),
    #   3 is the one before 2, etc.
    # The value can be overridden via the username field.
    default_issue = 2
    keep_only_tags = [dict(name='div', attrs={'id':'theContent'}), dict(name='div', attrs={'id':'sidebar'}), dict(name='div', attrs={'class':'intro'}), dict(name='p', attrs={'class':'bodytext'}), dict(name='div', attrs={'class':'single_image'})]

    def __init__(self, options, log, progress_reporter):
        """Constructor: installs the navbar template that omits the
        'This article was downloaded by calibre from...' footer."""
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
        self.navbar = MyNavBarTemplate()

    def postprocess_html(self, soup, first):
        """Clean up a downloaded article page.

        Moves the sidebar image below the article heading, removes the
        sidebar and the rating stars, and italicises the intro text.
        """
        # Move the image of the sidebar right below the h3.
        first_h3 = soup.find(name='div', attrs={'id': 'theContent'}).find('h3')
        for imgdiv in soup.findAll(name='div', attrs={'class': 'single_image'}):
            # Re-query on every pass: inserting imgdiv mutates the tree.
            intros = first_h3.findNextSiblings('div', {'class': 'intro'})
            if intros:
                # Place the image after the intro paragraph.
                intros[0].parent.insert(4, imgdiv)
            else:
                first_h3.parent.insert(2, imgdiv)
        # Now, remove the sidebar.
        soup.find(name='div', attrs={'id': 'sidebar'}).extract()
        # Remove the rating-image (stars) from the h3.
        for img in first_h3.findAll(name='img'):
            img.extract()
        # Mark the intro texts as italic.
        for div in soup.findAll(name='div', attrs={'class': 'intro'}):
            for p in div.findAll('p'):
                content = self.tag_to_string(p)
                p.replaceWith("<p><i>" + content + "</i></p>")
        return soup

    def get_cover(self, soup):
        """Return the cover image URL of the issue, or None when the
        table-of-contents page carries no cover."""
        cover_url = None
        cover_item = soup.find('div', attrs={'class': 'cover_image'})
        if cover_item:
            cover_url = 'http://www.brandeins.de/' + cover_item.img['src']
        return cover_url

    def parse_index(self):
        """Locate the requested issue in the archive and return its feeds.

        Returns a list of (section_title, articles) tuples as expected by
        BasicNewsRecipe.
        """
        feeds = []
        archive = "http://www.brandeins.de/archiv.html"
        # The issue to download can be selected via the username field;
        # a non-numeric username is the real subscription name, so keep
        # the default in that case.
        issue = self.default_issue
        if self.username:
            try:
                issue = int(self.username)
            except ValueError:
                pass
        soup = self.index_to_soup(archive)
        latest_jahrgang = soup.findAll('div', attrs={'class': re.compile(r'\bjahrgang-latest\b')})[0].findAll('ul')[0]
        # Count from the end of the link list: issue 1 is the newest.
        issue_links = latest_jahrgang.findAll('a')
        pre_latest_issue = issue_links[len(issue_links) - issue]
        url = pre_latest_issue.get('href', False)
        # Build the title out of the title of the cover image - take the
        # issue and year (e.g. 'brand eins 07/2010').
        self.title = "brand eins " + re.search(r"(?P<date>\d\d\/\d\d\d\d)", pre_latest_issue.find('img').get('title', False)).group('date')
        url = 'http://brandeins.de/' + url
        titles_and_articles = self.brand_eins_parse_latest_issue(url)
        if titles_and_articles:
            for title, articles in titles_and_articles:
                feeds.append((title, articles))
        return feeds

    def brand_eins_parse_latest_issue(self, url):
        """Parse one issue's table-of-contents page.

        Returns a list of [chapter_title, articles] pairs, where articles
        is a list of calibre article dicts (title/url/description/date).
        Also sets self.cover_url as a side effect.
        """
        soup = self.index_to_soup(url)
        self.cover_url = self.get_cover(soup)
        article_lists = [soup.find('div', attrs={'class': 'subColumnLeft articleList'}), soup.find('div', attrs={'class': 'subColumnRight articleList'})]

        titles_and_articles = []
        current_articles = []
        chapter_title = "Editorial"
        self.log('Found Chapter:', chapter_title)

        # Remove the last list of links (that's just the impressum and the 'gewinnspiel').
        right_column_uls = article_lists[1].findAll('ul')
        right_column_uls[-1].extract()

        for article_list in article_lists:
            for chapter in article_list.findAll('ul'):
                headings = chapter.findPreviousSiblings('h3')
                if headings:
                    new_chapter_title = string.capwords(self.tag_to_string(headings[0]))
                    # A new heading closes the chapter collected so far.
                    if new_chapter_title != chapter_title:
                        titles_and_articles.append([chapter_title, current_articles])
                        current_articles = []
                        self.log('Found Chapter:', new_chapter_title)
                        chapter_title = new_chapter_title
                for li in chapter.findAll('li'):
                    a = li.find('a', href=True)
                    if a is None:
                        continue
                    title = self.tag_to_string(a)
                    url = a.get('href', False)
                    if not url or not title:
                        continue
                    url = 'http://brandeins.de/' + url
                    # The teaser paragraph (if any) follows the link's parent.
                    teasers = a.parent.findNextSiblings('p')
                    description = self.tag_to_string(teasers[0]) if teasers else ''
                    self.log('\t\tFound article:', title)
                    self.log('\t\t\t', url)
                    self.log('\t\t\t', description)
                    current_articles.append({'title': title, 'url': url, 'description': description, 'date': ''})
        # Flush the final chapter.
        titles_and_articles.append([chapter_title, current_articles])
        return titles_and_articles
Ciao,
Steffen