07-17-2011, 05:58 AM   #1
snarkophilus
Wannabe Connoisseur
 
Posts: 426
Karma: 2516674
Join Date: Apr 2011
Location: Geelong, Australia
Device: Kobo Libra 2, Kobo Aura 2, Sony PRS-T1, Sony PRS-350, Palm TX
AutoSport.com recipe

Hi folks,

Here's a recipe for autosport.com. Please be gentle; this is the first time I've written any Python (although it's mostly cut-and-paste from other recipes).

There are many RSS feeds available on the autosport site, but I've used only the two I happen to be interested in. Is there a way to enable/disable individual feeds within a single recipe? (One possible approach is sketched below.) I also suspect that the recipe might work without a subscription for parts of the autosport site, but I haven't played around with that.
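
For what it's worth, here's one way it might be done: keep a master list of feeds with an on/off flag and build the recipe's feeds attribute from just the enabled entries. This is a minimal, untested sketch; the third feed URL is made up for illustration.

Code:
# Keep every known feed in one master list with an on/off flag, and
# build the recipe's 'feeds' attribute from only the enabled entries.
ALL_FEEDS = [
    # (enabled, title, url)
    (True,  u'F1 News',  u'http://www.autosport.com/rss/f1news.xml'),
    (True,  u'Features', u'http://www.autosport.com/rss/features.xml'),
    (False, u'Rallying', u'http://www.autosport.com/rss/rallynews.xml'),  # hypothetical URL
]

feeds = [(title, url) for enabled, title, url in ALL_FEEDS if enabled]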

This recipe uses the method from https://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10 to download each story only once: an MD5 hash of every article is saved to disk, and on the next run any article whose hash has already been seen is skipped.

Here's the recipe. Feedback appreciated!

Spoiler:
Code:
import re
import os, os.path, urllib
from hashlib import md5

from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe

class AutoSportF1(BasicNewsRecipe):
    title          = u'Autosport'
    oldest_article = 100            # days
    max_articles_per_feed = 1000
    needs_subscription = True       # calibre will prompt for a username/password

    feeds          = [
        (u'F1 News', u'http://www.autosport.com/rss/f1news.xml'),
        (u'Features', u'http://www.autosport.com/rss/features.xml')
    ]

    remove_attributes = ['width','height']

    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?Begin Main Box-->', lambda match: '<body>'),

            ## Remove anything before the body of the article (alternate
            ## pattern, for pages where the one above doesn't match).
            (r'<body.*?-- main column -->', lambda match: '<body>'),

            ## Remove anything after the end of the article.
            (r'<!-- End Main Box.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.autosport.com/subs/login.php')
            # The login form is the third form on the page.
            br.select_form(nr=2)
            br['user_email']    = self.username
            br['user_password'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    # As seen here: https://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        # Per-recipe storage directory, used to remember which articles
        # have already been downloaded.
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # One state file per feed, named after the URL-quoted feed title.
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Hashes of the articles seen on previous runs.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            # Hash each article in the current feed and drop those that
            # were already downloaded on a previous run.
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            # Save the current set of hashes for the next run.
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        # Honour remove_empty_feeds: drop any feed that the de-duplication
        # above has emptied.
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds


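If you want to try the recipe from the command line before scheduling it in the GUI, something like this should work (the file names and credentials below are just placeholders):

Code:
ebook-convert autosport.recipe autosport.epub --username you@example.com --password yourpassword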
Cheers,
Simon.