View Single Post
Old 05-05-2012, 02:08 PM   #5
Spindoctor
Junior Member
Spindoctor began at the beginning.
 
Posts: 7
Karma: 10
Join Date: Feb 2012
Device: PRS-T1
Thumbs up Improved recipe

A friend of mine helped me with the title page (the cover image).

Here's the new improved .recipe-file for the Austrian newspaper "Der Standard" (http://www.derstandard.at)

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 -*-

__license__   = 'GPL v3'
__copyright__ = '2009, Gerhard Aigner <gerhard.aigner at gmail.com>'

''' http://www.derstandard.at - Austrian Newspaper '''
import re, urllib
from calibre.web.feeds.news import BasicNewsRecipe
from time import strftime

class DerStandardRecipe(BasicNewsRecipe):
    '''Calibre news recipe for the Austrian newspaper "Der Standard".

    Articles are collected from the RSS feeds listed in ``feeds``. Links to
    section index pages and picture galleries are skipped (see
    ``get_article_url``). The cover is built from the front page of today's
    e-paper edition (see ``get_cover_url``).
    '''

    title = u'derStandard'
    __author__ = 'Gerhard Aigner and Sujata Raman and Marcel Jira and Peter Reschenhofer'
    description = u'Nachrichten aus Österreich'
    publisher ='derStandard.at'
    category = 'news, politics, nachrichten, Austria'
    use_embedded_content = False
    remove_empty_feeds = True
    # Written into the <html> element by preprocess_html().
    lang = 'de-AT'
    no_stylesheets = True
    # Also used for the charset <meta> tag added in preprocess_html().
    encoding = 'utf-8'
    language = 'de'

    oldest_article = 1
    max_articles_per_feed = 100

    # When True, get_cover_url() requests the "_b" (high resolution) variant
    # of the front page image; otherwise the plain low resolution one.
    cover_high_resolution = True

    extra_css = '''
                .artikelBody{font-family:Arial,Helvetica,sans-serif;}
                .artikelLeft{font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                h4{color:#404450;font-size:x-small;}
                h6{color:#404450; font-size:x-small;}
                '''
    feeds          = [
        (u'Newsroom', u'http://derStandard.at/?page=rss&ressort=Seite1'),
        (u'Inland', u'http://derstandard.at/?page=rss&ressort=InnenPolitik'),
        (u'International', u'http://derstandard.at/?page=rss&ressort=InternationalPolitik'),
        (u'Wirtschaft', u'http://derStandard.at/?page=rss&ressort=Wirtschaft'),
        (u'Web', u'http://derStandard.at/?page=rss&ressort=Web'),
        (u'Sport', u'http://derStandard.at/?page=rss&ressort=Sport'),
        (u'Panorama', u'http://derStandard.at/?page=rss&ressort=Panorama'),
        (u'Etat', u'http://derStandard.at/?page=rss&ressort=Etat'),
        (u'Kultur', u'http://derStandard.at/?page=rss&ressort=Kultur'),
        (u'Wissenschaft', u'http://derStandard.at/?page=rss&ressort=Wissenschaft'),
        (u'Gesundheit', u'http://derStandard.at/?page=rss&ressort=Gesundheit'),
        (u'Bildung', u'http://derStandard.at/?page=rss&ressort=Bildung'),
        (u'Meinung', u'http://derStandard.at/?page=rss&ressort=Meinung'),
        (u'Lifestyle', u'http://derStandard.at/?page=rss&ressort=Lifestyle'),
        (u'Reisen', u'http://derStandard.at/?page=rss&ressort=Reisen'),
        (u'Karriere', u'http://derStandard.at/?page=rss&ressort=Karriere'),
        (u'Immobilien', u'http://derstandard.at/?page=rss&ressort=Immobilien'),
        (u'dieStandard', u'http://dieStandard.at/?page=rss&ressort=diestandard'),
        (u'daStandard', u'http://daStandard.at/?page=rss&ressort=dastandard')
                      ]

    # Only the article containers are kept from each downloaded page.
    keep_only_tags = [
                        dict(name='div', attrs={'class':["artikel","artikelLeft","artikelBody"]}) ,
                         ]

    remove_tags = [
                    dict(name='link'), dict(name='meta'),dict(name='iframe'),dict(name='style'),
                    dict(name='form',attrs={'name':'sitesearch'}), dict(name='hr'),
                    dict(name='div', attrs={'class':["diashow"]})]

    # Strip bracketed numbers such as "[12]" (and empty "[]") as well as
    # inline bgcolor attributes from the raw HTML.
    preprocess_regexps = [
        (re.compile(r'\[[\d]*\]', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'bgcolor="#\w{3,6}"', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]

    # Pattern for section ("ressort") index links, presumably of the form
    # ".../r1234/..." -- TODO confirm against the live site. At least one
    # digit is required: the former pattern r'/r[1-9]*' also matched a bare
    # "/r" and therefore rejected every URL that merely contained "/r".
    ressort_url_pattern = r'/r\d+'

    filter_regexps = [ressort_url_pattern]

    def get_article_url(self, article):
        '''Return the URL of a feed entry, or None if it should be skipped.

        An entry is skipped when it has no link, when its link points to an
        index page (ressort) or when its title marks it as a picture
        gallery (ansichtssache).

        :param article: feed entry exposing ``link`` and ``title`` attributes
        :return: the article link, or None to leave the entry out
        '''
        link = getattr(article, 'link', None)
        if not link:
            # Nothing to download; previously this raised AttributeError.
            return None

        title = getattr(article, 'title', None) or ''
        if 'ressort' in link or 'ansichtssache' in title.lower():
            return None

        if re.search(self.ressort_url_pattern, link):
            return None

        return link

    def preprocess_html(self, soup):
        '''Prepare a downloaded article page for conversion.

        Sets the language attributes on the <html> element, puts a
        Content-Type declaration at the start of <head> and renames every
        <ul> and <li> element to <div>.

        :param soup: parsed HTML document of the article
        :return: the same soup object, modified in place
        '''
        soup.html['xml:lang'] = self.lang
        soup.html['lang']     = self.lang
        mtag = '<meta http-equiv="Content-Type" content="text/html; charset=' + self.encoding + '">'
        soup.head.insert(0,mtag)

        for t in soup.findAll(['ul', 'li']):
            t.name = 'div'
        return soup

    def get_cover_url(self):
        '''Return the URL of today's e-paper front page image.

        The e-paper index page of the current day is downloaded and searched
        for the first link of the form
        ``data_ep/STAN/<yyyy>/<yyyymmdd>/V.B1/pages/<code>.htm``, where
        ``<code>`` is 36 characters long, for example
        ``data_ep/STAN/2012/20120504/V.B1/pages/A3B6798F-2751-4D8D-A103-C5EF22F7ACBE.htm``.
        The image then lives at
        ``data_ep/STAN/<yyyy>/<yyyymmdd>/pagejpg/<code>.png`` (low
        resolution) or ``.../<code>_b.png`` (high resolution).

        :return: the cover image URL, or None when no page code can be found
                 on the index page (e.g. there is no issue for today)
        '''
        # Issues of past days can be addressed the same way,
        # e.g. with date = '2012/20120503'.
        date = strftime("%Y/%Y%m%d")

        base_url = 'http://epaper.derstandarddigital.at/'
        issue_path = 'data_ep/STAN/' + date
        edition_path = '/V.B1/'
        index_url = base_url + issue_path + edition_path + 'paper.htm'

        response = urllib.urlopen(index_url)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        # Match the complete link prefix instead of assuming that the first
        # occurrence of the issue path is followed by the page code.
        page_link = re.search(
            re.escape(issue_path + edition_path + 'pages/') + r'([0-9A-Za-z-]{36})',
            html)
        if page_link is None:
            # Without a page code there is no cover URL to build; returning
            # None avoids handing back a URL made of unrelated page content.
            return None

        cover_url = base_url + issue_path + '/pagejpg/' + page_link.group(1)
        if self.cover_high_resolution:
            cover_url = cover_url + '_b'

        return cover_url + '.png'
Is there another place to upload this recipe, so that it can be added to the next release of Calibre?

Thank you for your help and for Calibre.

Last edited by Spindoctor; 05-08-2012 at 02:14 PM. Reason: changed title in recipe
Spindoctor is offline   Reply With Quote