MobileRead Forums - View Single Post

Flugschwein · 03-05-2019, 04:58 PM

The default recipe for one of the most famous quality newspapers in Austria, der Standard (or derStandard), is pretty outdated and doesn't work at all anymore.
This is what the current default looks like:

Spoiler:

Code:

#!/usr/bin/env  python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division, absolute_import, print_function

__license__ = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

''' http://www.derstandard.at - Austrian Newspaper '''

import re
import random
from calibre.web.feeds.news import BasicNewsRecipe


class DerStandardRecipe(BasicNewsRecipe):
    """News recipe for the Austrian newspaper derStandard (derstandard.at).

    Articles are collected from the per-section RSS feeds listed in
    ``feeds``. Each downloaded page is reduced to the element whose CSS
    class starts with ``artikel``; pages without such an element are
    aborted. Links that point at a section ("ressort") overview page
    (``/r<digits>``) instead of an article are skipped.
    """

    # --- Metadata shown in the generated book -----------------------------
    title = u'derStandard'
    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
    description = u'Nachrichten aus Österreich'
    publisher = 'derStandard.at'
    category = 'news, politics, nachrichten, Austria'

    # --- Download behaviour ------------------------------------------------
    # Follow the feed links to the full article pages instead of using the
    # (short) content embedded in the feed itself.
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de_AT'

    oldest_article = 1
    max_articles_per_feed = 100
    # Many articles are listed in several sections; keep only one copy.
    ignore_duplicate_articles = {'title', 'url'}

    masthead_url = 'http://images.derstandard.at/2012/06/19/derStandardat_1417x274.gif'

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'),
        (u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'),
        (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
        (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
        (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
        (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
        (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
        (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
        (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
        (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
        (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
        (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
        (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
        (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
        (u'Familie', u'http://derstandard.at/?page=rss&ressort=Familie'),
        (u'Greenlife', u'http://derStandard.at/?page=rss&ressort=Greenlife'),
        (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
        (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
        (u'Automobil', u'http://derstandard.at/?page=rss&ressort=Automobil'),
        (u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'),
        (u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard')
    ]

    # --- HTML clean-up -------------------------------------------------------
    # Keep only the article container (<div class="artikel...">).
    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile('^artikel')})
    ]

    # Strip navigation, sharing widgets, embedded media and similar clutter.
    remove_tags = [
        dict(name=['link', 'iframe', 'style', 'hr']),
        dict(attrs={'class': ['lookup-links', 'media-list']}),
        dict(name='form', attrs={'name': 'sitesearch'}),
        dict(name='div', attrs={'class': ['socialsharing', 'block video',
                                          'blog-browsing section',
                                          'diashow', 'supplemental']}),
        dict(name='div', attrs={'id': 'highlighted'})
    ]

    remove_attributes = ['width', 'height']

    # Textual substitutions applied to the raw page source.
    preprocess_regexps = [
        # Bracketed numeric markers such as "[12]" (also matches "[]").
        (re.compile(r'\[[\d]*\]', re.DOTALL |
                    re.IGNORECASE), lambda match: ''),
        # Inline background colours.
        (re.compile(r'bgcolor="#\w{3,6}"',
                    re.DOTALL | re.IGNORECASE), lambda match: '')
    ]

    # Links to section overview pages ("/r" followed by the section id).
    # The previous pattern '/r[1-9]*' allowed zero digits and excluded '0',
    # so it matched every URL that merely contained "/r".
    filter_regexps = [r'/r[0-9]+']

    # Compiled once; used by get_article_url() for the same purpose.
    _ressort_link_re = re.compile(r'/r[0-9]+')

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        :param article: Parsed feed entry; its ``link`` attribute is used.
        :return: ``article.link``, or ``None`` when the entry has no link
            or the link points at a section overview page (``/r<digits>``)
            rather than at an article.
        """
        link = getattr(article, 'link', None)
        if not link or self._ressort_link_re.search(link):
            return None
        return link

    def preprocess_html(self, soup):
        """Abort pages without an article body and flatten list markup.

        :param soup: Parsed HTML of the downloaded page.
        :return: The same ``soup`` object, modified in place.
        """
        # Pages without an "artikel" container carry no article text.
        if soup.find('div', {'class': re.compile('^artikel')}) is None:
            self.abort_article()
        # Turn list elements into plain blocks.
        for t in soup.findAll(['ul', 'li']):
            t.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the URL of the current e-paper front page thumbnail.

        :return: Absolute cover image URL, or ``None`` when the e-paper
            shelf page contains no matching thumbnail image.
        """
        base_url = 'https://epaper.derstandard.at/'
        # Random query value; presumably a cache buster -- TODO confirm.
        url = base_url + 'shelf.act?s=' + str(random.random() * 10000)
        soup = self.index_to_soup(url)
        img = soup.find(
            'img', {'class': re.compile('^thumbnailBig'), 'src': True})
        if img and img['src']:
            return base_url + img['src']
        return None

While this might've worked a few years ago, it seems the Standard has changed its feeds and some other things about its online presence as well.
As you can see here (note: the page is in German; use a translator if you need to, but it should be fairly self-explanatory), the feeds list is no longer up to date. If I understood that part correctly, it should look like this instead:

Code:

# (section title, RSS feed URL) pairs for the current derStandard sections.
# Each section is listed exactly once; a duplicated entry would make the
# same feed be fetched twice and show up as two identical sections.
feeds = [
    (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
    (u'International', u'http://derstandard.at/?page=rss&ressort=International'),
    (u'Inland', u'http://derstandard.at/?page=rss&ressort=Inland'),
    (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
    (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
    (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
    (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
    (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
    (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
    (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
    (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
    (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
    (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
    (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
    (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
    (u'Familie', u'http://derstandard.at/?page=rss&ressort=Familie'),
    (u'User', u'http://derStandard.at/?page=rss&ressort=User'),
    (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
    (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
    (u'Automobil', u'http://derstandard.at/?page=rss&ressort=Automobil'),
    (u'dieStandard', u'http://derStandard.at/?page=rss&ressort=diestandard'),
]

But even if the feeds are correct, the output you get isn't. Sadly, I currently lack the resources to get it working myself, so I would really appreciate it if a more experienced user could provide me with a working version of the recipe (and maybe even open a pull request against the Git repository ;-) ).
Thanks in advance,
Flugschwein

PS: according to https://calibre-ebook.com/dynamic/recipe-usage derStandard is the 4th most downloaded German (language wise, not nationality) newspaper using calibre