Updates for Austrian news sources

jr17oo · 06-26-2016, 04:48 AM

I have prepared updates for the Austrian news sources "Kleine Zeitung", "Kurier", and "Der Standard". The respective recipes are attached below.

jr17oo · 06-26-2016, 04:49 AM

Update for kleinezeitung.recipe:

new feed addresses (local news feeds are commented out)
make use of print_version
improve formatting

Code:

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

from calibre.web.feeds.news import BasicNewsRecipe

class KleineZeitungRecipe(BasicNewsRecipe):
    """Calibre recipe that fetches the RSS feeds of kleinezeitung.at.

    Articles are downloaded via the URL built in ``print_version`` and
    pages without an ``articletext`` container are skipped.
    """

    # --- recipe metadata -------------------------------------------------
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    version = 1
    language = 'de_AT'

    title = u'Kleine Zeitung'
    description = u'Nachrichten aus \u00D6sterreich'
    publisher = u'Kleine Zeitung GmbH & Co KG'
    category = u'News, Newspaper'

    masthead_url = 'https://cdn-kl.niceshops.com/images/logos/logo_kleine_invoice.jpg'

    # --- download behaviour ----------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    remove_empty_feeds = True
    use_embedded_content = False

    # --- clean-up of the fetched pages -----------------------------------
    remove_javascript = True
    no_stylesheets = True

    remove_tags = [dict(name='hr')]
    remove_tags_before = dict(attrs={'class':'hline'})
    remove_tags_after = [dict(name='div', attrs={'class':'articletext'})]

    extra_css = '''
                h1 {text-align: left;}
                '''

    # Supra-regional feeds are active; the local feeds below are disabled
    # and can be enabled by removing the leading '#'.
    feeds = [
        ('Nachrichten', 'http://www.kleinezeitung.at/rss/nachrichten'),
        ('Politik', 'http://www.kleinezeitung.at/rss/politik'),
        ('Wirtschaft', 'http://www.kleinezeitung.at/rss/wirtschaft'),
        ('Österreich und die Welt', 'http://www.kleinezeitung.at/rss/chronik'),
        ('Leben', 'http://www.kleinezeitung.at/rss/leben'),
        ('Sport', 'http://www.kleinezeitung.at/rss/sport'),
#        ('Ennstal', 'http://www.kleinezeitung.at/rss/rss_ennstal'),
#        ('Graz & Umgebung', 'http://www.kleinezeitung.at/rss/rss_graz'),
#        ('Leoben', 'http://www.kleinezeitung.at/rss/rss_leoben'),
#        ('Murtal', 'http://www.kleinezeitung.at/rss/rss_murtal'),
#        ('Mürztal', 'http://www.kleinezeitung.at/rss/rss_muerztal'),
#        ('Oststeier', 'http://www.kleinezeitung.at/rss/rss_oststeier'),
#        ('Süd & Südwest', 'http://www.kleinezeitung.at/rss/rss_suedsuedwest'),
#        ('Südost & Süd', 'http://www.kleinezeitung.at/rss/rss_sueostsued'),
#        ('Weiz', 'http://www.kleinezeitung.at/rss/rss_weiz'),
#        ('Weststeier', 'http://www.kleinezeitung.at/rss/rss_weststeier'),
#        ('Feldkirchen', 'http://www.kleinezeitung.at/rss/rss_feldkirchen'),
#        ('Klagenfurt', 'http://www.kleinezeitung.at/rss/rss_klagenfurt'),
#        ('Lavanttal', 'http://www.kleinezeitung.at/rss/rss_lavanttal'),
#        ('Oberkärnten', 'http://www.kleinezeitung.at/rss/rss_oberkaernten'),
#        ('Osttirol', 'http://www.kleinezeitung.at/rss/rss_osttirol'),
#        ('St. Veit', 'http://www.kleinezeitung.at/rss/rss_stveit'),
#        ('Villach', 'http://www.kleinezeitung.at/rss/rss_villach'),
#        ('Völkermarkt', 'http://www.kleinezeitung.at/rss/rss_voelkermarkt')
    ]

    def print_version(self, url):
        """Return the print URL for ``url``.

        Everything after the last '/' of ``url`` is dropped and
        '/print.do' is appended to what remains.
        """
        parent = url.rpartition('/')[0]
        return parent + '/print.do'

    def preprocess_html(self, soup):
        """Abort the article when the page has no ``articletext`` div."""
        article_body = soup.find('div', {'class':'articletext'})
        if article_body is None:
            self.abort_article()
        return soup

jr17oo · 06-26-2016, 04:50 AM

Update for kurier.recipe:

new feed addresses
improve formatting

Code:

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'

'''
kurier.at
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Kurier(BasicNewsRecipe):
    """Calibre recipe that fetches the RSS feeds of kurier.at.

    Only the ``main-article`` element of each page is kept; sharing
    widgets, comments, related content and block quotes are removed.
    """

    # --- recipe metadata -------------------------------------------------
    __author__ = 'Darko Miletic'
    title = 'Kurier'
    description = 'News from Austria'
    publisher = 'KURIER'
    category = 'news, politics, Austria'
    language = 'de_AT'
    publication_type = 'newspaper'

    # --- download behaviour ----------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    timeout = 30
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True

    # Metadata handed to the conversion pipeline; mirrors the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    feeds = [
        ('Politik', 'http://kurier.at/politik/xml/rss'),
        ('Wirtschaft', 'http://kurier.at/wirtschaft/xml/rss'),
        ('Chronik', 'http://kurier.at/chronik/xml/rss'),
        ('Kultur', 'http://kurier.at/kultur/xml/rss'),
        ('Leben', 'http://kurier.at/leben/xml/rss'),
        ('Menschen', 'http://kurier.at/menschen/xml/rss'),
        ('Sport', 'http://kurier.at/sport/xml/rss'),
    ]

    # --- clean-up of the fetched pages -----------------------------------
    keep_only_tags = [
        dict(name='article', attrs={'class':re.compile('main-article')}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class':'social-media-container'}),
        dict(name='section', attrs={'class':'tags'}),
        dict(name='section', attrs={'class':re.compile('comment-box')}),
        dict(name='section', attrs={'class':re.compile('related-content')}),
        dict(name='section', attrs={'class':re.compile('article-slider')}),
        dict(name='section', attrs={'class':re.compile('commentcontainer')}),
        dict(name='blockquote'),
    ]

    remove_attributes = ['width', 'height']

    def preprocess_html(self, soup):
        """Strip inline ``style`` attributes, then pass the soup through
        ``adeify_images`` and return its result."""
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        return self.adeify_images(soup)

jr17oo · 06-26-2016, 04:51 AM

Update for der_standard.recipe:

fix get_cover_url
improve formatting
remove some obsolete code

Code:

#!/usr/bin/env  python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division, absolute_import, print_function

__license__   = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

''' http://www.derstandard.at - Austrian Newspaper '''

import re
import random
from calibre.web.feeds.news import BasicNewsRecipe

class DerStandardRecipe(BasicNewsRecipe):
    """Calibre recipe that fetches the RSS feeds of derStandard.at.

    Links that point to a section ("Ressort") overview page rather than
    to an article are skipped, pages without an ``artikel*`` container
    are aborted, and the cover is taken from the e-paper shelf page.
    """

    title = u'derStandard'
    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
    description = u'Nachrichten aus Österreich'
    publisher ='derStandard.at'
    category = 'news, politics, nachrichten, Austria'
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de_AT'

    oldest_article = 1
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'title', 'url'}

    masthead_url = 'http://images.derstandard.at/2012/06/19/derStandardat_1417x274.gif'

    feeds = [
        (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'),
        (u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'),
        (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
        (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
        (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
        (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
        (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
        (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
        (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
        (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
        (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
        (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
        (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
        (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
        (u'Familie', u'http://derstandard.at/?page=rss&ressort=Familie'),
        (u'Greenlife', u'http://derStandard.at/?page=rss&ressort=Greenlife'),
        (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
        (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
        (u'Automobil', u'http://derstandard.at/?page=rss&ressort=Automobil'),
        (u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'),
        (u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class':re.compile('^artikel')})
    ]

    remove_tags = [
        dict(name=['link', 'iframe', 'style', 'hr']),
        dict(attrs={'class':['lookup-links', 'media-list']}),
        dict(name='form',attrs={'name':'sitesearch'}),
        dict(name='div', attrs={'class':['socialsharing', 'block video',
                                         'blog-browsing section',
                                         'diashow', 'supplemental']}),
        dict(name='div', attrs={'id':'highlighted'})
    ]

    remove_attributes = ['width', 'height']

    preprocess_regexps = [
        # Drop bracketed numeric markers such as "[1]".
        (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Drop hard-coded background colours.
        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]

    # Section-overview links look like ".../r<digits>...". At least one
    # digit is required: the previous pattern '/r[1-9]*' also matched a
    # bare '/r' (e.g. '/reisen') and therefore discarded any link that
    # merely contained '/r'; it also did not accept the digit 0.
    _ressort_link_pattern = r'/r\d+'
    _ressort_link_re = re.compile(_ressort_link_pattern)

    filter_regexps = [_ressort_link_pattern]

    def get_article_url(self, article):
        """Return the link of ``article``, or ``None`` to skip it.

        An entry is skipped when its link contains a section-overview
        path ('/r' followed by one or more digits).
        """
        if self._ressort_link_re.search(article.link):
            return None
        return article.link

    def preprocess_html(self, soup):
        """Abort pages without an ``artikel*`` div and flatten lists.

        ``ul`` and ``li`` elements are renamed to ``div`` in place; the
        modified soup is returned.
        """
        if soup.find('div', {'class':re.compile('^artikel')}) is None:
            self.abort_article()
        for t in soup.findAll(['ul', 'li']):
            t.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the cover image URL from the e-paper shelf, or ``None``.

        The shelf page is requested with a random ``s`` query value and
        searched for an ``img`` whose class starts with 'thumbnailBig'.
        Its ``src`` is appended to the e-paper base URL. ``None`` is
        returned when no such image (or an empty ``src``) is found.
        """
        base_url = 'https://epaper.derstandard.at/'
        url = base_url + 'shelf.act?s=' + str(random.random() * 10000)
        soup = self.index_to_soup(url)
        img = soup.find('img', {'class':re.compile('^thumbnailBig'), 'src':True})
        if img and img['src']:
            return base_url + img['src']
        # Explicit: no usable thumbnail was found on the shelf page.
        return None

06-26-2016, 04:48 AM	#1
jr17oo Member Posts: 14 Karma: 10 Join Date: Jan 2012 Device: Sony PRS-T1	Updates for Austrian news sources I have prepared updates for the Austrian news sources "Kleine Zeitung", "Kurier", and "Der Standard". The respective recipes are attached below.

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Automatic news fetch: different news sources to different Kindles?	kcp	Calibre	4	01-24-2015 03:43 PM
News Sources	sisterphonetica	Recipes	5	06-27-2014 12:30 PM
[Enhancement] Add new news sources of ABC NEWS	donnie888	Recipes	0	12-23-2012 01:39 AM
News sources	Lob	Recipes	2	02-17-2011 12:49 PM
Best Free News Sources	Gideon	Deals and Resources (No Self-Promotion or Affiliate Links)	1	08-05-2009 02:25 AM

Advert