Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 06-26-2016, 03:48 AM   #1
jr17oo
Member
jr17oo began at the beginning.
 
Posts: 14
Karma: 10
Join Date: Jan 2012
Device: Sony PRS-T1
Updates for Austrian news sources

I have prepared updates for the Austrian news sources "Kleine Zeitung", "Kurier", and "Der Standard". The respective recipes are attached below.
jr17oo is offline   Reply With Quote
Old 06-26-2016, 03:49 AM   #2
jr17oo
Member
jr17oo began at the beginning.
 
Posts: 14
Karma: 10
Join Date: Jan 2012
Device: Sony PRS-T1
Kleine Zeitung

Update for kleinezeitung.recipe:
  • new feed addresses (local news feeds are commented out)
  • make use of print_version
  • improve formatting

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

from calibre.web.feeds.news import BasicNewsRecipe

class KleineZeitungRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Austrian newspaper "Kleine Zeitung".

    Articles are listed through the RSS feeds in ``feeds``; each article
    URL is rewritten by ``print_version`` before download, and pages that
    lack the article body container are dropped in ``preprocess_html``.
    """

    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'de_AT'
    version = 1

    title = u'Kleine Zeitung'
    publisher = u'Kleine Zeitung GmbH & Co KG'
    category = u'News, Newspaper'
    description = u'Nachrichten aus \u00D6sterreich'

    use_embedded_content = False
    remove_empty_feeds = True
    oldest_article = 2
    max_articles_per_feed = 100

    no_stylesheets = True
    remove_javascript = True

    masthead_url = 'https://cdn-kl.niceshops.com/images/logos/logo_kleine_invoice.jpg'

    # The regional feeds below are disabled on purpose; uncomment the ones
    # you want to have included in the download.
    feeds = [
        ('Nachrichten', 'http://www.kleinezeitung.at/rss/nachrichten'),
        ('Politik', 'http://www.kleinezeitung.at/rss/politik'),
        ('Wirtschaft', 'http://www.kleinezeitung.at/rss/wirtschaft'),
        ('Österreich und die Welt', 'http://www.kleinezeitung.at/rss/chronik'),
        ('Leben', 'http://www.kleinezeitung.at/rss/leben'),
        ('Sport', 'http://www.kleinezeitung.at/rss/sport'),
#        ('Ennstal', 'http://www.kleinezeitung.at/rss/rss_ennstal'),
#        ('Graz & Umgebung', 'http://www.kleinezeitung.at/rss/rss_graz'),
#        ('Leoben', 'http://www.kleinezeitung.at/rss/rss_leoben'),
#        ('Murtal', 'http://www.kleinezeitung.at/rss/rss_murtal'),
#        ('Mürztal', 'http://www.kleinezeitung.at/rss/rss_muerztal'),
#        ('Oststeier', 'http://www.kleinezeitung.at/rss/rss_oststeier'),
#        ('Süd & Südwest', 'http://www.kleinezeitung.at/rss/rss_suedsuedwest'),
#        ('Südost & Süd', 'http://www.kleinezeitung.at/rss/rss_sueostsued'),
#        ('Weiz', 'http://www.kleinezeitung.at/rss/rss_weiz'),
#        ('Weststeier', 'http://www.kleinezeitung.at/rss/rss_weststeier'),
#        ('Feldkirchen', 'http://www.kleinezeitung.at/rss/rss_feldkirchen'),
#        ('Klagenfurt', 'http://www.kleinezeitung.at/rss/rss_klagenfurt'),
#        ('Lavanttal', 'http://www.kleinezeitung.at/rss/rss_lavanttal'),
#        ('Oberkärnten', 'http://www.kleinezeitung.at/rss/rss_oberkaernten'),
#        ('Osttirol', 'http://www.kleinezeitung.at/rss/rss_osttirol'),
#        ('St. Veit', 'http://www.kleinezeitung.at/rss/rss_stveit'),
#        ('Villach', 'http://www.kleinezeitung.at/rss/rss_villach'),
#        ('Völkermarkt', 'http://www.kleinezeitung.at/rss/rss_voelkermarkt')
    ]

    # Page trimming: the headline element (class "hline") and the
    # "articletext" div delimit the part of the page that is kept.
    remove_tags_before = dict(attrs={'class': 'hline'})
    remove_tags_after = [dict(name='div', attrs={'class': 'articletext'})]
    remove_tags = [dict(name='hr')]

    extra_css = '''
                h1 {text-align: left;}
                '''

    def print_version(self, url):
        """Return the print-view URL for the article at ``url``.

        Everything after the last ``/`` in ``url`` is replaced by
        ``print.do``.
        """
        # Only the part before the final slash is needed; indexing the
        # rpartition result avoids unused locals (and shadowing ``id``).
        return url.rpartition('/')[0] + '/print.do'

    def preprocess_html(self, soup):
        """Abort the article when the page has no ``articletext`` div.

        Pages that do contain the div are returned unchanged.
        """
        if soup.find('div', {'class': 'articletext'}) is None:
            self.abort_article()
        return soup
jr17oo is offline   Reply With Quote
Old 06-26-2016, 03:50 AM   #3
jr17oo
Member
jr17oo began at the beginning.
 
Posts: 14
Karma: 10
Join Date: Jan 2012
Device: Sony PRS-T1
Kurier

Update for kurier.recipe:
  • new feed addresses
  • improve formatting

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'

'''
kurier.at
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe

class Kurier(BasicNewsRecipe):
    """Calibre news recipe for the Austrian newspaper Kurier (kurier.at)."""

    # --- publication metadata -------------------------------------------
    title = 'Kurier'
    __author__ = 'Darko Miletic'
    description = 'News from Austria'
    publisher = 'KURIER'
    category = 'news, politics, Austria'
    language = 'de_AT'
    publication_type = 'newspaper'

    # --- download behaviour ---------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    timeout = 30
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True

    # Reuse the metadata above for the generated e-book.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    feeds = [
        ('Politik', 'http://kurier.at/politik/xml/rss'),
        ('Wirtschaft', 'http://kurier.at/wirtschaft/xml/rss'),
        ('Chronik', 'http://kurier.at/chronik/xml/rss'),
        ('Kultur', 'http://kurier.at/kultur/xml/rss'),
        ('Leben', 'http://kurier.at/leben/xml/rss'),
        ('Menschen', 'http://kurier.at/menschen/xml/rss'),
        ('Sport', 'http://kurier.at/sport/xml/rss'),
    ]

    # --- page clean-up ---------------------------------------------------
    keep_only_tags = [
        dict(name='article', attrs={'class': re.compile('main-article')}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'social-media-container'}),
        dict(name='section', attrs={'class': 'tags'}),
        dict(name='section', attrs={'class': re.compile('comment-box')}),
        dict(name='section', attrs={'class': re.compile('related-content')}),
        dict(name='section', attrs={'class': re.compile('article-slider')}),
        dict(name='section', attrs={'class': re.compile('commentcontainer')}),
        dict(name='blockquote'),
    ]

    remove_attributes = ['width', 'height']

    def preprocess_html(self, soup):
        """Drop inline ``style`` attributes, then run ``adeify_images``.

        Returns whatever ``self.adeify_images(soup)`` returns.
        """
        styled_tags = soup.findAll(style=True)
        for tag in styled_tags:
            del tag['style']
        cleaned = self.adeify_images(soup)
        return cleaned
jr17oo is offline   Reply With Quote
Old 06-26-2016, 03:51 AM   #4
jr17oo
Member
jr17oo began at the beginning.
 
Posts: 14
Karma: 10
Join Date: Jan 2012
Device: Sony PRS-T1
Der Standard

Update for der_standard.recipe:
  • fix get_cover_url
  • improve formatting
  • remove some obsolete code

Code:
#!/usr/bin/env  python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division, absolute_import, print_function

__license__   = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

''' http://www.derstandard.at - Austrian Newspaper '''

import re
import random
from calibre.web.feeds.news import BasicNewsRecipe

class DerStandardRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Austrian newspaper derStandard.at.

    Articles come from the RSS feeds in ``feeds``. Feed entries whose link
    matches the section-link pattern are skipped, pages without an
    ``artikel*`` container are aborted, and the cover image is looked up
    on the e-paper shelf page.
    """

    title = u'derStandard'
    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
    description = u'Nachrichten aus Österreich'
    publisher = 'derStandard.at'
    category = 'news, politics, nachrichten, Austria'
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'de_AT'

    oldest_article = 1
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'title', 'url'}

    masthead_url = 'http://images.derstandard.at/2012/06/19/derStandardat_1417x274.gif'

    feeds = [
        (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'),
        (u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'),
        (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
        (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
        (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
        (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
        (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
        (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
        (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
        (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
        (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
        (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
        (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
        (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
        (u'Familie', u'http://derstandard.at/?page=rss&ressort=Familie'),
        (u'Greenlife', u'http://derStandard.at/?page=rss&ressort=Greenlife'),
        (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
        (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
        (u'Automobil', u'http://derstandard.at/?page=rss&ressort=Automobil'),
        (u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'),
        (u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': re.compile('^artikel')})
    ]

    remove_tags = [
        dict(name=['link', 'iframe', 'style', 'hr']),
        dict(attrs={'class': ['lookup-links', 'media-list']}),
        dict(name='form', attrs={'name': 'sitesearch'}),
        dict(name='div', attrs={'class': ['socialsharing', 'block video',
                                         'blog-browsing section',
                                         'diashow', 'supplemental']}),
        dict(name='div', attrs={'id': 'highlighted'})
    ]

    remove_attributes = ['width', 'height']

    # Strip bracketed numeric markers such as "[12]" and inline bgcolor
    # attributes from the raw HTML.
    preprocess_regexps = [
        (re.compile(r'\[[\d]*\]', re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL | re.IGNORECASE), lambda match: '')
    ]

    # Links of the form "/r<digits>" are skipped (presumably section
    # overview pages rather than articles -- confirm against the live
    # feeds). At least one digit is required: the previous pattern
    # '/r[1-9]*' also matched a bare "/r" and therefore any link that
    # merely contained "/r".
    filter_regexps = [r'/r[0-9]+']
    _section_link_re = re.compile(r'/r[0-9]+')

    def get_article_url(self, article):
        """Return the link of a feed entry, or ``None`` to skip the entry.

        An entry is skipped when it has no link or when its link matches
        the ``/r<digits>`` section-link pattern.
        """
        link = getattr(article, 'link', None)
        if not link or self._section_link_re.search(link):
            return None
        return link

    def preprocess_html(self, soup):
        """Abort pages without an ``artikel*`` div; flatten lists to divs.

        Every ``ul`` and ``li`` tag is renamed to ``div`` and the modified
        soup is returned.
        """
        if soup.find('div', {'class': re.compile('^artikel')}) is None:
            self.abort_article()
        for tag in soup.findAll(['ul', 'li']):
            tag.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the URL of the cover thumbnail from the e-paper shelf.

        Returns ``None`` when the shelf page contains no ``img`` whose
        class starts with ``thumbnailBig`` and which has a non-empty
        ``src``.
        """
        # Local import keeps the block self-contained on Python 2 and 3.
        try:
            from urllib.parse import urljoin
        except ImportError:  # Python 2
            from urlparse import urljoin

        base_url = 'https://epaper.derstandard.at/'
        # The random query value presumably acts as a cache buster.
        url = base_url + 'shelf.act?s=' + str(random.random() * 10000)
        soup = self.index_to_soup(url)
        img = soup.find('img', {'class': re.compile('^thumbnailBig'), 'src': True})
        if img is None or not img['src']:
            return None
        # urljoin equals plain concatenation for simple relative paths but
        # also yields a valid URL for root-relative or absolute ``src``.
        return urljoin(base_url, img['src'])
jr17oo is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Automatic news fetch: different news sources to different Kindles? kcp Calibre 4 01-24-2015 02:43 PM
News Sources sisterphonetica Recipes 5 06-27-2014 11:30 AM
[Enhancement] Add new news sources of ABC NEWS donnie888 Recipes 0 12-23-2012 12:39 AM
News sources Lob Recipes 2 02-17-2011 11:49 AM
Best Free News Sources Gideon Deals and Resources (No Self-Promotion or Affiliate Links) 1 08-05-2009 01:25 AM


All times are GMT -4. The time now is 12:21 AM.


MobileRead.com is a privately owned, operated and funded community.