Someone notified me that the original Brandeins recipe is no longer working.
I saw that someone updated my original one in the meantime. Unfortunately, the site changed its layout once again, so I basically had to rewrite the recipe. Also, bad news: some of the sections, divs, and classes now have generated identifiers, so I had to identify some elements by their text.
Anyway, here we go. I'd appreciate it if some of you here could test it as well, before I recommend replacing the existing recipe included with Calibre with this one.
This version downloads the 4th issue (counting from the newest); same as before, it is the one with all articles available online.
Spoiler:
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals
__license__ = 'GPL v3'
__version__ = '0.3'
'''
brand eins.de
'''
from calibre.web.feeds.news import BasicNewsRecipe
class BrandEins(BasicNewsRecipe):
    """Recipe for the German business magazine brand eins (brandeins.de).

    Downloads the issue at index ``last_full_issue`` (counting from the
    newest) from the magazine overview page -- the 4th-newest issue is the
    most recent one with all of its articles available online.
    """

    title = u'brand eins'
    __author__ = 'Nikolas Mangold-Takao'
    language = 'de'
    description = u'brand eins beschreibt den momentanen Wandel in Wirtschaft und Gesellschaft.'
    publisher = u'brand eins Media'
    category = 'economics, business, Germany'

    PREFIX = 'http://www.brandeins.de'
    HTTPS_PREFIX = 'https://www.brandeins.de'  # cover download requires https
    INDEX = PREFIX + '/magazine/brand-eins-wirtschaftsmagazin'
    # Index (from newest) of the issue to download; see class docstring.
    last_full_issue = 3
    # Title of the last section on the issue page; it links to the online
    # shop and carries no article content, so it is dropped.
    section_name_shop = "Zum Online-Shop"
    # Heading text of the footer-ad block inside articles (matched verbatim,
    # see preprocess_html).
    footer_ads = "Lust auf*mehr?"

    use_embedded_content = False
    resolve_internal_links = True
    no_stylesheets = True
    needs_subscription = False
    delay = 1
    summary_length = 200
    simultaneous_downloads = 5
    remove_javascript = True

    remove_tags_before = dict(name='div', attrs={'class': 'article-intro clearfix'})
    remove_tags_after = dict(name='div', attrs={'class': 'mobile-only sticky-sharebar sharing--colored'})
    remove_tags = [
        dict(name='div', attrs={'class': 'mobile-only sticky-sharebar sharing--colored'}),
        dict(name='aside', attrs={'class': 'page-info-section row'}),
    ]

    extra_css = '''
    body, p {text-align: left;}
    .headline {font-size: x-large;}
    h2 {font-size: medium;}
    h1 {font-size: large;}
    em.Bold {font-weight:bold;font-style:normal;}
    em.Italic {font-style:italic;}
    '''

    def parse_index(self):
        """Locate the issue to download on the overview page and parse it.

        Raises ValueError with a diagnostic message when fewer issue links
        than expected are found (i.e. the site layout changed again),
        instead of failing later with a bare IndexError.
        """
        soup = self.index_to_soup(self.INDEX)
        issue_list = soup.findAll('a', attrs={'class': 'btn-read btn btn-sm btn-block btn-success'})
        if len(issue_list) <= self.last_full_issue:
            raise ValueError(
                'Found only %d issue link(s), need at least %d - the brand eins '
                'site layout may have changed again' % (len(issue_list), self.last_full_issue + 1))
        url = issue_list[self.last_full_issue].get('href')
        self.issue_url = url  # saved so get_cover_url() can reuse the issue page
        self.log('Issue to get: ', url)
        return self.parse_issue(url)

    def parse_issue(self, url):
        """Build the list of (section title, articles) feeds for one issue."""
        soup = self.index_to_soup(url)
        feeds = []
        articles = []
        # <section> elements are either chapter separators or actual article
        # teasers; the two are distinguished by their class attribute.
        section_list = soup.findAll('section', attrs={'class': ['ihv-teaser container', 'container separator']})
        section_title = ''
        for section in section_list:
            # An h2 with class 'separator__title' starts a new section.
            # Be aware: the class name contains two underscores.
            new_section = section.find('h2', attrs={'class': 'separator__title'})
            if new_section is not None:
                # Flush the articles collected so far into the previous section.
                if section_title != '':
                    feeds.append((section_title, articles))
                    articles = []
                section_title = self.tag_to_string(new_section)
            a = section.find('a', href=True)
            if a is None:
                continue
            url = self.PREFIX + a['href']
            title = self.tag_to_string(section.find('h2'))
            desc = ''
            for p in section.findAll('p'):
                desc += self.tag_to_string(p)
            articles.append({'title': title, 'url': url, 'description': desc})
        # Drop the trailing online-shop section; it carries no articles.
        if section_title != self.section_name_shop:
            feeds.append((section_title, articles))
        return feeds

    def preprocess_html(self, soup):
        """Remove the issue-advertisement footer from article pages.

        Some articles contain a footer referencing the issue; all of its
        sections, divs, etc. have dynamically generated IDs and classes, so
        the only stable anchor is the heading text in ``footer_ads``. The
        enclosing div and the picture div preceding it are removed.
        """
        headers = soup.findAll('h3')
        for header in headers:
            if self.tag_to_string(header) == self.footer_ads:
                remove1 = header.findParent('div')
                if remove1 is None:
                    # Heading not wrapped in a div - nothing to remove here.
                    continue
                remove2 = remove1.findPrevious('div', attrs={'class': 'picture'})
                remove1.extract()
                # The picture div may be absent; guard against None to keep
                # a single odd article from aborting the whole download.
                if remove2 is not None:
                    remove2.extract()
        return soup

    def get_cover_url(self):
        """Extract the cover image URL from the issue page saved by parse_index()."""
        soup = self.index_to_soup(self.issue_url)
        img = soup.find('img', attrs={'class': 'issue__cover'})
        if img is None or not img.get('src'):
            # Cover markup changed; let calibre fall back to a generated cover.
            return None
        return self.HTTPS_PREFIX + img['src']

    def preprocess_raw_html(self, raw_html, url):
        """Strip the decorative bullet the site prepends to some paragraphs."""
        return raw_html.replace('<p>• ', '<p>')