Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 05-14-2011, 12:40 PM   #1
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
recipe for Bild.de - German

Here is a recipe for the German tabloid ("yellow press") newspaper Bild.de:

Code:
import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe that fetches the Bild.de RSS feeds (German)."""

    # General recipe settings.
    title = u'Bild.de'
    __author__ = 'schuster'
    language = 'de'
    oldest_article = 1
    max_articles_per_feed = 50
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Cover image, taken from a MySpace CDN.
    cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'

    # Which part of each page to fetch: from the <h2 id="cover"> element
    # up to the <div class="back"> element.
    remove_tags_before = {'name': 'h2', 'attrs': {'id': 'cover'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'back'}}

    # Used to get rid of the ads (smartadserver.com URLs).
    filter_regexps = [r'.\.smartadserver\.com']

    # RSS sources published by www.bild.de, as (section title, feed URL).
    feeds = [
        (u'Überblick', u'http://rss.bild.de/bild.xml'),
        (u'News', u'http://rss.bild.de/bild-news.xml'),
        (u'Politik', u'http://rss.bild.de/bild-politik.xml'),
        (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
        (u'Sport', u'http://rss.bild.de/bild-sport.xml'),
        (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
        (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
    ]

    def preprocess_html(self, soup):
        """Replace every <a> tag that has a single string child by that string.

        Anchors whose ``.string`` is None are left untouched. The same
        soup object is returned after being modified in place.

        Thanks to kiklop74 for the code (see the sticky thread
        "Recipes - Re-usable code"); it removes a lot of direct links.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup

    def skip_ad_pages(self, soup):
        """Always return None.

        NOTE(review): presumably this tells calibre there is no ad page
        to skip -- confirm against the BasicNewsRecipe documentation.
        """
        return None

    def get_article_url(self, article):
        """Return the article's 'id' entry, else its 'guid', else None.

        Used to get the real article URL instead of the one that points
        at .feedsportal.com.
        """
        guid = article.get('guid', None)
        return article.get('id', guid)
schuster is offline   Reply With Quote
Old 06-03-2011, 06:56 AM   #2
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
**Update**

Here is a new update for bild.de:

1- fetch all regional feeds
2- cleaner layout


Code:
import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe for Bild.de (German), including regional feeds."""

    # General recipe settings.
    title = u'Bild.de'
    __author__ = 'schuster'
    language = 'de'
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Cover and masthead both use the same image from a MySpace CDN.
    cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
    masthead_url = cover_url

    # Which part of each page to fetch: from the <h2 id="cover"> element
    # up to the <div class="back"> element.
    remove_tags_before = {'name': 'h2', 'attrs': {'id': 'cover'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'back'}}

    # Things on the site that we do not want in the output.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'credit'}},
        {'name': 'div', 'attrs': {'class': 'index'}},
        {'name': 'div', 'attrs': {'id': 'zstart31'}},
        {'name': 'div', 'attrs': {'class': 'hentry'}},
        {'name': 'div', 'attrs': {'class': 'back'}},
        {'name': 'div', 'attrs': {'class': 'pagination'}},
        {'name': 'div', 'attrs': {'class': 'header'}},
        {'name': 'div', 'attrs': {'class': 'element floatL'}},
        {'name': 'div', 'attrs': {'class': 'stWrap'}},
    ]

    # Used to get rid of the ads (smartadserver.com URLs).
    filter_regexps = [r'.\.smartadserver\.com']

    # RSS sources published by www.bild.de, as (section title, feed URL).
    # The "Reg. - ..." entries are the regional feeds.
    feeds = [
        (u'Überblick', u'http://rss.bild.de/bild.xml'),
        (u'News', u'http://rss.bild.de/bild-news.xml'),
        (u'Politik', u'http://rss.bild.de/bild-politik.xml'),
        (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
        (u'Sport', u'http://rss.bild.de/bild-sport.xml'),
        (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
        (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
        (u'Reg. - Berlin', u'http://rss.bild.de/bild-berlin.xml'),
        (u'Reg. - Bremen', u'http://rss.bild.de/bild-bremen.xml'),
        (u'Reg. - Dresden', u'http://rss.bild.de/bild-dresden.xml'),
        (u'Reg. - Düsseldorf', u'http://rss.bild.de/bild-duesseldorf.xml'),
        (u'Reg. - Frankfurt-Main', u'http://rss.bild.de/bild-frankfurt-main.xml'),
        (u'Reg. - Hamburg', u'http://rss.bild.de/bild-hamburg.xml'),
        (u'Reg. - Hannover', u'http://rss.bild.de/bild-hannover.xml'),
        (u'Reg. - Köln', u'http://rss.bild.de/bild-koeln.xml'),
        (u'Reg. - Leipzig', u'http://rss.bild.de/bild-leipzig.xml'),
        (u'Reg. - München', u'http://rss.bild.de/bild-muenchen.xml'),
        (u'Reg. - Ruhrgebiet', u'http://rss.bild.de/bild-ruhrgebiet.xml'),
        (u'Reg. - Stuttgart', u'http://rss.bild.de/bild-stuttgart.xml'),
    ]

    def preprocess_html(self, soup):
        """Replace every <a> tag that has a single string child by that string.

        Anchors whose ``.string`` is None are left untouched. The same
        soup object is returned after being modified in place.

        Thanks to kiklop74 for the code (see the sticky thread
        "Recipes - Re-usable code"); it removes a lot of direct links.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup

    def skip_ad_pages(self, soup):
        """Always return None.

        NOTE(review): presumably this tells calibre there is no ad page
        to skip -- confirm against the BasicNewsRecipe documentation.
        """
        return None

    def get_article_url(self, article):
        """Return the article's 'id' entry, else its 'guid', else None.

        Used to get the real article URL instead of the one that points
        at .feedsportal.com.
        """
        guid = article.get('guid', None)
        return article.get('id', guid)
schuster is offline   Reply With Quote
Advert
Old 05-22-2016, 05:00 AM   #3
Aimylios
Member
Aimylios began at the beginning.
 
Posts: 16
Karma: 10
Join Date: Apr 2016
Device: Tolino Vision 3HD
Hi,

the addresses of the bild.de RSS feeds have been changed. Here's an updated version of the bild_de.recipe.

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

'''
bild.de
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe for the Bild.de RSS feeds (German)."""

    # Recipe metadata.
    title = 'Bild.de'
    __author__ = 'schuster'
    description = 'RSS-Feeds von Bild.de'
    language = 'de'

    # Fetch options.
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    masthead_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'

    # By default, no local news feeds will be fetched. To change this,
    # just uncomment the lines for the regions you are interested in.
    # NOTE(review): the commented-out 'Berlin' entry points at
    # feed=Newsticker rather than a Berlin-specific feed -- confirm.
    feeds = [
        ('Politik', 'http://www.bild.de/rss-feeds/rss-16725492,feed=politik.bild.html'),
        ('Unterhaltung', 'http://www.bild.de/rss-feeds/rss-16725492,feed=unterhaltung.bild.html'),
        ('Sport', 'http://www.bild.de/rss-feeds/rss-16725492,feed=sport.bild.html'),
        ('Lifestyle', 'http://www.bild.de/rss-feeds/rss-16725492,feed=lifestyle.bild.html'),
        ('Ratgeber', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ratgeber.bild.html'),
        ('Auto', 'http://www.bild.de/rss-feeds/rss-16725492,feed=auto.bild.html'),
        ('Digital', 'http://www.bild.de/rss-feeds/rss-16725492,feed=digital.bild.html'),
        ('Spiele', 'http://www.bild.de/rss-feeds/rss-16725492,feed=spiele.bild.html'),
        ('Leserreporter', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leserreporter.bild.html'),
        # ('Berlin', 'http://www.bild.de/rss-feeds/rss-16725492,feed=Newsticker.bild.html'),
        # ('Bremen', 'http://www.bild.de/rss-feeds/rss-16725492,feed=bremen.bild.html'),
        # ('Chemnitz', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=ressort-regio-chemnitz.bild.html'),
        # ('Dresden', 'http://www.bild.de/rss-feeds/rss-16725492,feed=dresden.bild.html'),
        # ('Düsseldorf', 'http://www.bild.de/rss-feeds/rss-16725492,feed=duesseldorf.bild.html'),
        # ('Frankfurt/Main', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-frankfurt.bild.html'),
        # ('Hamburg', 'http://www.bild.de/rss-feeds/rss-16725492,feed=hamburg.bild.html'),
        # ('Hannover', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-hannover.bild.html'),
        # ('Köln', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-koeln.bild.html'),
        # ('Leipzig', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leipzig.bild.html'),
        # ('München', 'http://www.bild.de/rss-feeds/rss-16725492,feed=muenchen.bild.html'),
        # ('Ruhrgebiet', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ruhrgebiet.bild.html'),
        # ('Saarland', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=regional-saarland.bild.html'),
        # ('Stuttgart', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-stuttgart.bild.html'),
    ]

    # Tag filters: keep only <article>, drop the listed elements inside it,
    # and use the element with itemprop matching 'articleBody' as the end mark.
    keep_only_tags = [
        {'name': 'article'},
    ]

    remove_tags = [
        {'name': ['aside', 'iframe']},
        {'attrs': {'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}},
        {'name': 'img', 'attrs': {'alt': 'logo'}},
        {'name': 'div', 'attrs': {'class': re.compile('infoEl')}},
        {'name': 'span', 'attrs': {'class': re.compile('loupe')}},
    ]

    remove_tags_after = [
        {'name': 'div', 'attrs': {'itemprop': re.compile('articleBody')}},
    ]

    def preprocess_html(self, soup):
        """Clean up a downloaded page and return the modified soup.

        Steps, in order: call ``self.abort_article()`` when the page has
        no <article> element, delete every ``style`` attribute, replace
        <br> tags inside <h1> headlines by a single space, and replace
        each <a> tag by its rendered contents.
        """
        # Skip articles without relevant content.
        article_tag = soup.find('article')
        if not article_tag:
            self.abort_article()

        # Remove all style attributes.
        for styled in soup.findAll(attrs={'style': True}):
            del styled['style']

        # Remove <br> within headlines.
        for headline in soup.findAll('h1'):
            for line_break in headline.findAll('br'):
                line_break.replaceWith(' ')

        # Remove all links, keeping what they contained.
        for anchor in soup.findAll('a'):
            inner = anchor.renderContents()
            anchor.replaceWith(inner)

        return soup
Aimylios is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Recipe works when mocked up as Python file, fails when converted to Recipe ode Recipes 7 09-04-2011 04:57 AM
recipe for Welt der Physik - German schuster Recipes 0 05-10-2011 09:18 AM
Welches Bild schaut Ihr Euch grade an? beachwanderer Lounge 16 12-10-2010 03:46 AM
Calibre Recipe: Telepolis (Artikel) (German) lena_punkt Calibre 1 09-27-2010 05:03 AM
Bild Fankfurt verlost 20 Cybook Opus beachwanderer Deutsches Forum 3 10-02-2009 04:25 PM


All times are GMT -4. The time now is 12:12 PM.


MobileRead.com is a privately owned, operated and funded community.