Hi folks,
Here's a recipe for autosport.com. Please be gentle: this is the first time I've written any Python (although it's mostly cut'n'paste from other recipes).
There are many RSS feeds available on the Autosport site, but I've used only the two that I happen to be interested in. Is there a way to enable/disable individual feeds within a single recipe? I also suspect that the recipe might work without a subscription for parts of the Autosport site, but I haven't played around with that.
This recipe uses the method described here (https://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10) to download each story only once.
Here's the recipe. Feedback appreciated!
Spoiler:
Code:
import re
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5
class AutoSportF1(BasicNewsRecipe):
    """Calibre news recipe for autosport.com (F1 news and features feeds).

    The recipe logs in with the configured username/password, trims the
    markup surrounding the article body, and keeps a per-feed record of
    article hashes on disk so that a story already seen in an earlier run
    is dropped from later runs.
    """

    title = u'Autosport'
    oldest_article = 100
    max_articles_per_feed = 1000
    needs_subscription = True

    feeds = [
        (u'F1 News', u'http://www.autosport.com/rss/f1news.xml'),
        (u'Features', u'http://www.autosport.com/rss/features.xml'),
    ]

    remove_attributes = ['width', 'height']

    # Each entry is (pattern, replacement); patterns are compiled
    # case-insensitively and with DOTALL so '.*?' can span multiple lines.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Remove anything before the body of the article.
            (r'<body.*?Begin Main Box-->', lambda match: '<body>'),
            # Remove anything before the body of the article (alternate if
            # above doesn't work).
            (r'<body.*?-- main column -->', lambda match: '<body>'),
            # Remove anything after the end of the article.
            (r'<!-- End Main Box.*?</body>', lambda match: '</body>'),
        ]
    ]

    def get_browser(self):
        """Return a browser, logged in to autosport.com if credentials are set.

        Raises:
            Exception: if the login response contains 'Please try again',
                which is treated as a rejected username/password.
        """
        # The base implementation is called through the class, so the
        # instance has to be passed explicitly; calling it with no argument
        # fails before any login is attempted.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.autosport.com/subs/login.php')
            # nr is a zero-based index: this picks the third form on the
            # page (presumably the login form -- verify if the site changes).
            br.select_form(nr=2)
            br['user_email'] = self.username
            br['user_password'] = self.password
            raw = br.submit().read()
            if 'Please try again' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    # As seen here: https://www.mobileread.com/forums/showpost.php?p=1295505&postcount=10
    def parse_feeds(self):
        """Parse the feeds and drop articles recorded by a previous run.

        For every feed, a file named after the URL-quoted feed title is kept
        under <config_dir>/recipes/recipe_storage/<recipe title>/ with one
        article hash per line. Articles whose hash is already in that file
        are removed from the feed; the file is then rewritten with the
        hashes of all articles currently present in the feed.

        Returns:
            The list returned by BasicNewsRecipe.parse_feeds, filtered in
            place; empty feeds are removed when remove_empty_feeds is set.
        """
        # NOTE(review): urllib.quote and the bytes-based title handling below
        # are Python 2 idioms; a Python 3 port needs urllib.parse.quote and
        # str paths.
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        # '/' would be read as a path separator, so it is swapped for ':'.
        feed_dir = os.path.join(
            hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # Quote with no safe characters so the title is a valid filename.
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Hashes written by the previous run, if there was one.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            cur_items = set()
            # Iterate over a copy because articles are removed from the
            # original list inside the loop.
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                # Record the hash even for articles that get removed, so they
                # stay suppressed for as long as they remain in the feed.
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        if self.remove_empty_feeds:
            for empty in [f for f in feeds if len(f) == 0]:
                feeds.remove(empty)

        return feeds
Cheers,
Simon.