Until meta-recipes are properly implemented, the following recipe builds a Mobipocket file from the custom recipe classes of all of my favorite blogs. When I send the resulting .mobi to my Kindle, it shows up as a periodical, with the nice table of contents and so on. I thought others might find it useful. Simply add whatever custom recipe classes you need before the final AllBlogsRecipe, then add instances of those classes to the list at the end of the __init__ function. Currently oldest_article and max_articles_per_feed are overridden by the AllBlogsRecipe class, but you can comment those lines out if you want to use different values for each recipe.
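For example, to add another blog you would define its recipe class somewhere above AllBlogsRecipe and then register an instance of it in that list. A rough sketch of the pattern (ExampleBlogRecipe and its feed URL are made-up placeholders, not a real recipe):
Code:
class ExampleBlogRecipe(BasicNewsRecipe):
    # Placeholder recipe: swap in your blog's real title and feed URL
    title = "Example Blog"
    feeds = [('Example Blog', 'http://example.com/feed/')]

# ...and at the end of AllBlogsRecipe.__init__:
self.recipe_objects = [
    NashvillestRecipe(options, log, progress_reporter),
    ExampleBlogRecipe(options, log, progress_reporter),
]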
Note 1: I've never coded in Python before. This code is crude, a hack, and probably has unnecessary lines, but it works for me!
Note 2: I removed most of my custom blog recipes for brevity's sake. Two of the recipes included, Front Porch Republic and First Things, would probably make good additions to Calibre's built-in recipe list.
Question for anyone who knows: How does Calibre know to use only the AllBlogsRecipe class, and not the custom classes before it?
Code:
#!/usr/bin/python
from __future__ import with_statement

import os, re, time

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

class FirstThingsRecipe(BasicNewsRecipe):
    title = "First Things: On the Square"
    description = "Daily columns from First Things' top writers"
    language = 'en'
    use_embedded_content = False
    encoding = "cp1252"
    feeds = [('First Things: On the Square', 'http://www.firstthings.com/rss/onthesquare.php')]
    remove_tags = [dict(name='script', attrs={'type':'text/javascript'})]
    no_stylesheets = True

    def print_version(self, url):
        # First Things serves a clean printable view under /print/
        return url.replace('http://www.firstthings.com/', 'http://www.firstthings.com/print/')

class FPRRecipe(BasicNewsRecipe):
    title = "Front Porch Republic"
    description = "Place, Limits, Liberty"
    language = 'en'
    use_embedded_content = False
    feeds = [('Front Porch Republic', 'http://feeds.feedburner.com/FrontPorchRepublic')]
    no_stylesheets = True
    keep_only_tags = [dict(id=['content', 'heading'])]
    remove_tags = [dict(name='div', attrs={'id':'respond'}),
                   dict(name='div', attrs={'class':'prev_next post_nav'})]
    # Strip the trailing fragment that follows the &middot; separator
    preprocess_regexps = [
        (re.compile(r'&middot;.*?</div>', re.DOTALL|re.IGNORECASE),
         lambda match: '&nbsp;</div>')
    ]

class NashvillestRecipe(BasicNewsRecipe):
    title = "Nashvillest"
    language = "en"
    description = "All you ever wanted to know about the Music City"
    feeds = [('Nashvillest', 'http://nashvillest.com/feed/')]
    remove_tags = [dict(name='a', attrs={'rel':'nofollow'})]

class AllBlogsRecipe(BasicNewsRecipe):
    title = "Justin's Blogs"
    oldest_article = 3
    max_articles_per_feed = 100
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
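
    # build_index() is a modified copy of BasicNewsRecipe.build_index() from
    # calibre's news-download machinery: instead of fetching a single recipe's
    # feeds, it loops over self.recipe_objects, downloads each sub-recipe's
    # articles into numbered feed_N directories, and stitches everything into
    # one index that calibre turns into a single periodical.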
    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
        self.jobs = []
        feeds = []
        feedcount = 0
        for obj in self.recipe_objects:
            obj.jobs = []
            self.jobs = []
            # Rebuild the web2disk options from the current sub-recipe's settings
            self.web2disk_cmdline = ['web2disk',
                '--timeout', str(obj.timeout),
                '--max-recursions', str(obj.recursions),
                '--delay', str(obj.delay),
                ]
            if obj.verbose:
                self.web2disk_cmdline.append('--verbose')
            if obj.no_stylesheets:
                self.web2disk_cmdline.append('--dont-download-stylesheets')
            for reg in obj.match_regexps:
                self.web2disk_cmdline.extend(['--match-regexp', reg])
            for reg in obj.filter_regexps:
                self.web2disk_cmdline.extend(['--filter-regexp', reg])
            self.web2disk_options = web2disk_option_parser().parse_args(self.web2disk_cmdline)[0]
            # Carry the sub-recipe's tag-cleanup hooks over to the fetcher
            for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                          'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
                          'remove_tags_before', 'is_link_wanted'):
                setattr(self.web2disk_options, extra, getattr(obj, extra))
            self.web2disk_options.postprocess_html = obj._postprocess_html
            self.web2disk_options.encoding = obj.encoding
            # Override each sub-recipe's limits with this class's values;
            # comment out these two lines to let each recipe keep its own
            obj.max_articles_per_feed = self.max_articles_per_feed
            obj.oldest_article = self.oldest_article
            try:
                temp_feeds = feeds_from_index(obj.parse_index(),
                        oldest_article=self.oldest_article,
                        max_articles_per_feed=self.max_articles_per_feed,
                        log=self.log)
                self.report_progress(0, _('Got feeds from index page'))
            except NotImplementedError:
                temp_feeds = obj.parse_feeds()
            remove = [f for f in temp_feeds
                      if len(f) == 0 and self.remove_empty_feeds]
            for f in remove:
                temp_feeds.remove(f)
            obj.has_single_feed = len(temp_feeds) == 1
            obj.feed_objects = temp_feeds
            if obj.reverse_article_order:
                for feed in temp_feeds:
                    if hasattr(feed, 'reverse'):
                        feed.reverse()
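
            # Queue a download job for every article; feed_%d directories are
            # numbered with feedcount as an offset so directories from
            # different sub-recipes never collide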
            for f, feed in enumerate(temp_feeds):
                myf = f + feedcount
                feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf)
                if not os.path.isdir(feed_dir):
                    os.makedirs(feed_dir)
                for a, article in enumerate(feed):
                    if a >= self.max_articles_per_feed:
                        break
                    art_dir = os.path.join(feed_dir, 'article_%d'%a)
                    if not os.path.isdir(art_dir):
                        os.makedirs(art_dir)
                    try:
                        url = obj.print_version(article.url)
                    except NotImplementedError:
                        url = article.url
                    except:
                        self.log.exception('Failed to find print version for: '+article.url)
                        url = None
                    if not url:
                        continue
                    func, arg = (self.fetch_embedded_article, article) \
                        if obj.use_embedded_content or (obj.use_embedded_content is None and feed.has_embedded_content()) \
                        else \
                        ((self.fetch_obfuscated_article if obj.articles_are_obfuscated
                          else self.fetch_article), url)
                    req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                            {}, (f, a), self.article_downloaded,
                            self.error_in_article_download)
                    req.feed = feed
                    req.article = article
                    req.feed_dir = feed_dir
                    self.jobs.append(req)
            self.has_single_feed = len(temp_feeds) == 1
            self.feed_objects = temp_feeds
            self.jobs_done = 0
            # Download this sub-recipe's articles before moving on to the next
            tp = ThreadPool(obj.simultaneous_downloads)
            for req in self.jobs:
                tp.putRequest(req, block=True, timeout=0)
            self.report_progress(0, _('Starting download [%d thread(s)]...')%obj.simultaneous_downloads)
            while True:
                try:
                    tp.poll()
                    time.sleep(0.1)
                except NoResultsPending:
                    break
            # Write a per-feed index.html into each numbered feed directory
            for f, feed in enumerate(temp_feeds):
                myf = f + feedcount
                html = self.feed2index(f, temp_feeds)
                feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf)
                with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                    fi.write(html)
            feeds.extend(temp_feeds)
            feedcount += len(temp_feeds)
        # All sub-recipes are done; the rest works on the combined feed list
        remove = [f for f in feeds
                  if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)
        if not feeds:
            raise ValueError('No articles found, aborting')
        #feeds = FeedCollection(feeds)
        self.has_single_feed = len(feeds) == 1
        self.feed_objects = feeds
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, _('Generating masthead...'))
        self.masthead_path = None
        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None
        if murl is not None:
            # Try downloading the user-supplied masthead_url
            # Failure sets self.masthead_path to None
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None
        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')
        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)
        self.feed_objects = feeds
        #feeds.restore_duplicates()
        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s')%index)
        return index
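
    # Register one instance of every custom recipe class here; build_index()
    # walks this list, so adding a new blog only requires defining its class
    # above and appending an instance below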
    def __init__(self, options, log, progress_reporter):
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
        self.recipe_objects = [
            NashvillestRecipe(options, log, progress_reporter),
            FPRRecipe(options, log, progress_reporter),
            FirstThingsRecipe(options, log, progress_reporter),
        ]
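In case it's useful: once the whole thing is saved as a .recipe file, it should be runnable from the command line with something like ebook-convert all_blogs.recipe output.mobi (all_blogs.recipe being whatever you named the file), or you can paste it into Calibre's add-a-custom-news-source dialog.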