Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 05-14-2011, 12:40 PM   #1
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
recipe for Bild.de - German

Here is a recipe for the German newspaper (yellow press) Bild.de:

Code:
import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe that downloads the Bild.de RSS feeds (German)."""

    # --- identification -------------------------------------------------
    title = u'Bild.de'
    __author__ = 'schuster'
    language = 'de'

    # --- download / conversion options ----------------------------------
    oldest_article = 1
    max_articles_per_feed = 50
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Cover image (hosted on a MySpace CDN).
    cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'

    # Page region to keep: bounded by the <h2 id="cover"> headline and the
    # <div class="back"> element.
    remove_tags_before = {'name': 'h2', 'attrs': {'id': 'cover'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'back'}}

    # Ad-server links that should not be followed.
    filter_regexps = [r'.\.smartadserver\.com']

    # RSS sources published by www.bild.de, as (section title, feed URL).
    feeds = [
        (u'Überblick', u'http://rss.bild.de/bild.xml'),
        (u'News', u'http://rss.bild.de/bild-news.xml'),
        (u'Politik', u'http://rss.bild.de/bild-politik.xml'),
        (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
        (u'Sport', u'http://rss.bild.de/bild-sport.xml'),
        (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
        (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
    ]

    def preprocess_html(self, soup):
        """Replace every <a> that has a single string child by that string.

        Based on kiklop74's snippet from the "Recipes - Re-usable code"
        sticky thread; it strips most inline links from the article.
        Anchors whose ``.string`` is None are left untouched.

        Returns the same ``soup`` object after modification.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup

    def skip_ad_pages(self, soup):
        """Ad-page hook; always returns None (no replacement page)."""
        return None

    def get_article_url(self, article):
        """Return the entry's 'id', falling back to 'guid', else None.

        According to the original author this yields the real article
        URL instead of the .feedsportal.com redirect link.
        """
        guid = article.get('guid', None)
        return article.get('id', guid)
schuster is offline   Reply With Quote
Old 06-03-2011, 06:56 AM   #2
schuster
Zealot
schuster doesn't litterschuster doesn't litter
 
Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
**Update**

Here is a new update for bild.de:

1- fetch all regional feeds
2- cleaner layout


Code:
import string, re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe for Bild.de (German), including regional feeds."""

    # --- identification -------------------------------------------------
    title = u'Bild.de'
    __author__ = 'schuster'
    language = 'de'

    # --- download / conversion options ----------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Cover image (hosted on a MySpace CDN); the masthead uses the same file.
    cover_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'
    masthead_url = cover_url

    # Page region to keep: bounded by the <h2 id="cover"> headline and the
    # <div class="back"> element.
    remove_tags_before = {'name': 'h2', 'attrs': {'id': 'cover'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'back'}}

    # Unwanted <div> elements inside the kept region.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'credit'}},
        {'name': 'div', 'attrs': {'class': 'index'}},
        {'name': 'div', 'attrs': {'id': 'zstart31'}},
        {'name': 'div', 'attrs': {'class': 'hentry'}},
        {'name': 'div', 'attrs': {'class': 'back'}},
        {'name': 'div', 'attrs': {'class': 'pagination'}},
        {'name': 'div', 'attrs': {'class': 'header'}},
        {'name': 'div', 'attrs': {'class': 'element floatL'}},
        {'name': 'div', 'attrs': {'class': 'stWrap'}},
    ]

    # Ad-server links that should not be followed.
    filter_regexps = [r'.\.smartadserver\.com']

    # RSS sources published by www.bild.de, as (section title, feed URL).
    # National sections come first, followed by the regional editions.
    feeds = [
        (u'Überblick', u'http://rss.bild.de/bild.xml'),
        (u'News', u'http://rss.bild.de/bild-news.xml'),
        (u'Politik', u'http://rss.bild.de/bild-politik.xml'),
        (u'Unterhaltung', u'http://rss.bild.de/bild-unterhaltung.xml'),
        (u'Sport', u'http://rss.bild.de/bild-sport.xml'),
        (u'Lifestyle', u'http://rss.bild.de/bild-lifestyle.xml'),
        (u'Ratgeber', u'http://rss.bild.de/bild-ratgeber.xml'),
        (u'Reg. - Berlin', u'http://rss.bild.de/bild-berlin.xml'),
        (u'Reg. - Bremen', u'http://rss.bild.de/bild-bremen.xml'),
        (u'Reg. - Dresden', u'http://rss.bild.de/bild-dresden.xml'),
        (u'Reg. - Düsseldorf', u'http://rss.bild.de/bild-duesseldorf.xml'),
        (u'Reg. - Frankfurt-Main', u'http://rss.bild.de/bild-frankfurt-main.xml'),
        (u'Reg. - Hamburg', u'http://rss.bild.de/bild-hamburg.xml'),
        (u'Reg. - Hannover', u'http://rss.bild.de/bild-hannover.xml'),
        (u'Reg. - Köln', u'http://rss.bild.de/bild-koeln.xml'),
        (u'Reg. - Leipzig', u'http://rss.bild.de/bild-leipzig.xml'),
        (u'Reg. - München', u'http://rss.bild.de/bild-muenchen.xml'),
        (u'Reg. - Ruhrgebiet', u'http://rss.bild.de/bild-ruhrgebiet.xml'),
        (u'Reg. - Stuttgart', u'http://rss.bild.de/bild-stuttgart.xml'),
    ]

    def preprocess_html(self, soup):
        """Replace every <a> that has a single string child by that string.

        Based on kiklop74's snippet from the "Recipes - Re-usable code"
        sticky thread; it strips most inline links from the article.
        Anchors whose ``.string`` is None are left untouched.

        Returns the same ``soup`` object after modification.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup

    def skip_ad_pages(self, soup):
        """Ad-page hook; always returns None (no replacement page)."""
        return None

    def get_article_url(self, article):
        """Return the entry's 'id', falling back to 'guid', else None.

        According to the original author this yields the real article
        URL instead of the .feedsportal.com redirect link.
        """
        guid = article.get('guid', None)
        return article.get('id', guid)
schuster is offline   Reply With Quote
Advert
Old 05-22-2016, 05:00 AM   #3
Aimylios
Member
Aimylios began at the beginning.
 
Posts: 17
Karma: 10
Join Date: Apr 2016
Device: Tolino Vision 3HD
Hi,

the addresses of the bild.de RSS feeds have been changed. Here's an updated version of the bild_de.recipe.

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function

'''
bild.de
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1303841067(BasicNewsRecipe):
    """Calibre news recipe for the RSS feeds of Bild.de (German)."""

    # --- identification -------------------------------------------------
    title = 'Bild.de'
    __author__ = 'schuster'
    description = 'RSS-Feeds von Bild.de'
    language = 'de'
    masthead_url = 'http://a3.l3-images.myspacecdn.com/images02/56/0232f842170b4d349779f8379c27e073/l.jpg'

    # --- download / conversion options ----------------------------------
    oldest_article = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    no_stylesheets = True
    remove_javascript = True

    # Feed list as (section title, feed URL).
    # The regional feeds are disabled by default; uncomment the lines of
    # the regions you want to have fetched as well.
    feeds = [
        ('Politik', 'http://www.bild.de/rss-feeds/rss-16725492,feed=politik.bild.html'),
        ('Unterhaltung', 'http://www.bild.de/rss-feeds/rss-16725492,feed=unterhaltung.bild.html'),
        ('Sport', 'http://www.bild.de/rss-feeds/rss-16725492,feed=sport.bild.html'),
        ('Lifestyle', 'http://www.bild.de/rss-feeds/rss-16725492,feed=lifestyle.bild.html'),
        ('Ratgeber', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ratgeber.bild.html'),
        ('Auto', 'http://www.bild.de/rss-feeds/rss-16725492,feed=auto.bild.html'),
        ('Digital', 'http://www.bild.de/rss-feeds/rss-16725492,feed=digital.bild.html'),
        ('Spiele', 'http://www.bild.de/rss-feeds/rss-16725492,feed=spiele.bild.html'),
        ('Leserreporter', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leserreporter.bild.html'),
        # ('Berlin', 'http://www.bild.de/rss-feeds/rss-16725492,feed=Newsticker.bild.html'),
        # ('Bremen', 'http://www.bild.de/rss-feeds/rss-16725492,feed=bremen.bild.html'),
        # ('Chemnitz', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=ressort-regio-chemnitz.bild.html'),
        # ('Dresden', 'http://www.bild.de/rss-feeds/rss-16725492,feed=dresden.bild.html'),
        # ('Düsseldorf', 'http://www.bild.de/rss-feeds/rss-16725492,feed=duesseldorf.bild.html'),
        # ('Frankfurt/Main', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-frankfurt.bild.html'),
        # ('Hamburg', 'http://www.bild.de/rss-feeds/rss-16725492,feed=hamburg.bild.html'),
        # ('Hannover', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-hannover.bild.html'),
        # ('Köln', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-koeln.bild.html'),
        # ('Leipzig', 'http://www.bild.de/rss-feeds/rss-16725492,feed=leipzig.bild.html'),
        # ('München', 'http://www.bild.de/rss-feeds/rss-16725492,feed=muenchen.bild.html'),
        # ('Ruhrgebiet', 'http://www.bild.de/rss-feeds/rss-16725492,feed=ruhrgebiet.bild.html'),
        # ('Saarland', 'http://www.bild.de/rssfeeds/rss3/rss3-20745882,feed=regional-saarland.bild.html'),
        # ('Stuttgart', 'http://www.bild.de/rss-feeds/rss-16725492,feed=regio-stuttgart.bild.html')
    ]

    # --- page clean-up rules --------------------------------------------
    # Only the <article> element is kept.
    keep_only_tags = [dict(name='article')]

    # Cut-off marker: the <div> whose itemprop matches 'articleBody'.
    remove_tags_after = [
        dict(name='div', attrs={'itemprop': re.compile('articleBody')}),
    ]

    # Elements dropped from the kept region: sidebars, iframes, sharing
    # bars, the logo image, info boxes and image-zoom markers.
    remove_tags = [
        dict(name=['aside', 'iframe']),
        dict(attrs={'class': ['socialbar', 'social-sharing flank', 'vel', 'back']}),
        dict(name='img', attrs={'alt': 'logo'}),
        dict(name='div', attrs={'class': re.compile('infoEl')}),
        dict(name='span', attrs={'class': re.compile('loupe')}),
    ]

    def preprocess_html(self, soup):
        """Clean up a downloaded page before conversion.

        Steps, in order:
          1. call ``self.abort_article()`` when the page contains no
             <article> element (nothing relevant to keep);
          2. delete the ``style`` attribute from every element carrying one;
          3. replace each <br> inside an <h1> by a single space;
          4. replace each <a> by its rendered contents, removing the link.

        Returns the same ``soup`` object after modification.
        """
        article_tag = soup.find('article')
        if not article_tag:
            self.abort_article()

        for styled in soup.findAll(attrs={'style': True}):
            del styled['style']

        for headline in soup.findAll('h1'):
            for line_break in headline.findAll('br'):
                line_break.replaceWith(' ')

        for anchor in soup.findAll('a'):
            inner_markup = anchor.renderContents()
            anchor.replaceWith(inner_markup)

        return soup
Aimylios is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Recipe works when mocked up as Python file, fails when converted to Recipe ode Recipes 7 09-04-2011 04:57 AM
recipe for Welt der Physik - German schuster Recipes 0 05-10-2011 09:18 AM
Welches Bild schaut Ihr Euch grade an? beachwanderer Lounge 16 12-10-2010 03:46 AM
Calibre Recipe: Telepolis (Artikel) (German) lena_punkt Calibre 1 09-27-2010 05:03 AM
Bild Fankfurt verlost 20 Cybook Opus beachwanderer Deutsches Forum 3 10-02-2009 04:25 PM


All times are GMT -4. The time now is 01:39 PM.


MobileRead.com is a privately owned, operated and funded community.