View Single Post
Old 03-28-2012, 08:39 AM   #1
veezh
plus ça change
veezh does all things with Zen-like beauty
 
veezh's Avatar
 
Posts: 101
Karma: 32134
Join Date: Dec 2009
Location: France
Device: Kindle PW2, Voyage
Recipe for NRC Handelsblad (RSS feeds)

Code:
__license__   = 'GPL v3'
__copyright__ = '2012'
'''
nrc.nl
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class NRC(BasicNewsRecipe):
    """calibre news recipe for NRC Handelsblad, fed by nrc.nl RSS feeds.

    Every downloaded article page is reduced to its ``div`` with class
    ``article`` and cut off after the element whose id is ``broodtekst``.
    Hyperlinks that wrap a single piece of text are replaced by that text
    (see ``preprocess_html``).
    """

    title                  = 'NRC Handelsblad'
    __author__             = 'veezh'
    description            = 'Nieuws'
    oldest_article         = 1
    max_articles_per_feed  = 100
    no_stylesheets         = True
    use_embedded_content   = False
    encoding               = 'utf-8'
    publisher              = 'nrc.nl'
    category               = 'news, Netherlands, world'
    language               = 'nl_NL'
    timefmt                = ''

    # The value below is CSS, not Python: a leading '#' starts an id
    # selector, it does not comment a rule out.  The two rules that used to
    # read '#.href{...}' and '#.main-article-info{...}' were invalid
    # selectors (dropped by the CSS parser), so they are now written as real
    # CSS comments.
    # NOTE(review): '#h2{...}' may also have been meant as a disabled 'h2'
    # rule, but it is a valid id selector, so it is left untouched - confirm.
    extra_css = '''
                    h1{font-size:130%;}
                    #h2{font-size:100%;font-weight:normal;}
                    /* .href{font-size:xx-small;} */
                    .bijschrift{color:#666666; font-size:x-small;}
                    /* .main-article-info{font-family:Arial,Helvetica,sans-serif;} */
                    #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                    #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                '''

    # Metadata handed to the conversion step; it reuses the attributes
    # defined above so the two cannot drift apart.
    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    remove_empty_feeds = True

    # NOTE(review): 'filterDuplicates' does not appear among the documented
    # BasicNewsRecipe attributes; it is kept in case something else reads
    # it - confirm and drop.  'ignore_duplicate_articles' is the documented
    # switch for skipping articles that show up in more than one feed.
    filterDuplicates = True
    ignore_duplicate_articles = {'title', 'url'}

    def preprocess_html(self, soup):
        """Replace each text-only ``<a>`` element with its text.

        Anchors whose ``.string`` is ``None`` are left in place.

        :param soup: parsed article page, modified in place.
        :return: the same ``soup`` object.
        """
        for alink in soup.findAll('a'):
            link_text = alink.string
            if link_text is not None:
                alink.replaceWith(link_text)
        return soup

    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
    remove_tags_after = [dict(id='broodtekst')]

    # Only the per-category feeds are listed; the general
    # http://www.nrc.nl/rss.php feed was disabled by the original author.
    feeds = [
        ('Binnenland', 'http://www.nrc.nl/nieuws/categorie/binnenland/rss.php'),
        ('Buitenland', 'http://www.nrc.nl/nieuws/categorie/buitenland/rss.php'),
        ('Economie', 'http://www.nrc.nl/nieuws/categorie/economie/rss.php'),
        ('Wetenschap', 'http://www.nrc.nl/nieuws/categorie/wetenschap/rss.php'),
        ('Cultuur', 'http://www.nrc.nl/nieuws/categorie/cultuur/rss.php'),
        ('Boeken', 'http://www.nrc.nl/boeken/rss.php'),
        ('Tech', 'http://www.nrc.nl/tech/rss.php/'),
        ('Klimaat', 'http://www.nrc.nl/klimaat/rss.php/'),
    ]
veezh is offline   Reply With Quote