Until meta-recipes are properly implemented, the following recipe builds a Mobipocket file from the custom recipe classes of all of my favorite blogs. When I send the resulting .mobi to my Kindle, it shows up as a periodical, with the nice table of contents and so on. I thought others might find it useful. Simply add whatever custom recipe classes you need before the final AllBlogsRecipe, then add instances of those classes to the list at the end of the __init__ function. Currently oldest_article and max_articles_per_feed are overridden by the AllBlogsRecipe class, but you can comment those lines out if you want to use different values for each recipe.
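For example, to add another blog you would define its recipe class somewhere above AllBlogsRecipe and then register an instance of it in that list. A rough sketch of the pattern (ExampleBlogRecipe and its feed URL are made-up placeholders, not a real recipe):
Code:
class ExampleBlogRecipe(BasicNewsRecipe):
    # Placeholder recipe: swap in your blog's real title and feed URL
    title = "Example Blog"
    feeds = [('Example Blog', 'http://example.com/feed/')]

# ...and at the end of AllBlogsRecipe.__init__:
self.recipe_objects = [
    NashvillestRecipe(options, log, progress_reporter),
    ExampleBlogRecipe(options, log, progress_reporter),
]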
Note 1: I've never coded in Python before. This code is crude, a hack, and probably has unnecessary lines, but it works for me!
Note 2: I removed most of my custom blog recipes for brevity's sake. Two of the recipes included, Front Porch Republic and First Things, would probably make good additions to Calibre's built-in recipe list.
Question for anyone who knows: How does Calibre know to use only the AllBlogsRecipe class, and not the custom classes before it?
Code:
#!/usr/bin/python
from __future__ import with_statement

import os, re, time

from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending

class FirstThingsRecipe(BasicNewsRecipe):
    title = "First Things: On the Square"
    description = "Daily columns from First Things' top writers"
    language = 'en'
    use_embedded_content = False
    encoding = "cp1252"
    feeds = [('First Things: On the Square', 'http://www.firstthings.com/rss/onthesquare.php')]
    remove_tags = [dict(name='script', attrs={'type':'text/javascript'})]
    no_stylesheets = True

    def print_version(self, url):
        # First Things serves a clean printable view under /print/
        return url.replace('http://www.firstthings.com/', 'http://www.firstthings.com/print/')

class FPRRecipe(BasicNewsRecipe):
    title = "Front Porch Republic"
    description = "Place, Limits, Liberty"
    language = 'en'
    use_embedded_content = False
    feeds = [('Front Porch Republic', 'http://feeds.feedburner.com/FrontPorchRepublic')]
    no_stylesheets = True
    keep_only_tags = [dict(id=['content', 'heading'])]
    remove_tags = [dict(name='div', attrs={'id':'respond'}),
                   dict(name='div', attrs={'class':'prev_next post_nav'})]
    # Strip the trailing fragment that follows the &middot; separator
    preprocess_regexps = [
        (re.compile(r'&middot;.*?</div>', re.DOTALL|re.IGNORECASE),
         lambda match: '&nbsp;</div>')
    ]

class NashvillestRecipe(BasicNewsRecipe):
    title = "Nashvillest"
    language = "en"
    description = "All you ever wanted to know about the Music City"
    feeds = [('Nashvillest', 'http://nashvillest.com/feed/')]
    remove_tags = [dict(name='a', attrs={'rel':'nofollow'})]

class AllBlogsRecipe(BasicNewsRecipe):
    title = "Justin's Blogs"
    oldest_article = 3
    max_articles_per_feed = 100
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
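
    # build_index() is a modified copy of BasicNewsRecipe.build_index() from
    # calibre's news-download machinery: instead of fetching a single recipe's
    # feeds, it loops over self.recipe_objects, downloads each sub-recipe's
    # articles into numbered feed_N directories, and stitches everything into
    # one index that calibre turns into a single periodical.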
    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
        self.jobs = []
        feeds = []
        feedcount = 0
        for obj in self.recipe_objects:
            obj.jobs = []
            self.jobs = []
            # Rebuild the web2disk options from the current sub-recipe's settings
            self.web2disk_cmdline = ['web2disk',
                '--timeout', str(obj.timeout),
                '--max-recursions', str(obj.recursions),
                '--delay', str(obj.delay),
                ]
            if obj.verbose:
                self.web2disk_cmdline.append('--verbose')
            if obj.no_stylesheets:
                self.web2disk_cmdline.append('--dont-download-stylesheets')
            for reg in obj.match_regexps:
                self.web2disk_cmdline.extend(['--match-regexp', reg])
            for reg in obj.filter_regexps:
                self.web2disk_cmdline.extend(['--filter-regexp', reg])
            self.web2disk_options = web2disk_option_parser().parse_args(self.web2disk_cmdline)[0]
            # Carry the sub-recipe's tag-cleanup hooks over to the fetcher
            for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                          'skip_ad_pages', 'preprocess_html', 'remove_tags_after',
                          'remove_tags_before', 'is_link_wanted'):
                setattr(self.web2disk_options, extra, getattr(obj, extra))
            self.web2disk_options.postprocess_html = obj._postprocess_html
            self.web2disk_options.encoding = obj.encoding
            # Override each sub-recipe's limits with this class's values;
            # comment out these two lines to let each recipe keep its own
            obj.max_articles_per_feed = self.max_articles_per_feed
            obj.oldest_article = self.oldest_article
            try:
                temp_feeds = feeds_from_index(obj.parse_index(),
                        oldest_article=self.oldest_article,
                        max_articles_per_feed=self.max_articles_per_feed,
                        log=self.log)
                self.report_progress(0, _('Got feeds from index page'))
            except NotImplementedError:
                temp_feeds = obj.parse_feeds()
            remove = [f for f in temp_feeds
                      if len(f) == 0 and self.remove_empty_feeds]
            for f in remove:
                temp_feeds.remove(f)
            obj.has_single_feed = len(temp_feeds) == 1
            obj.feed_objects = temp_feeds
            if obj.reverse_article_order:
                for feed in temp_feeds:
                    if hasattr(feed, 'reverse'):
                        feed.reverse()
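
            # Queue a download job for every article; feed_%d directories are
            # numbered with feedcount as an offset so directories from
            # different sub-recipes never collide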
            for f, feed in enumerate(temp_feeds):
                myf = f + feedcount
                feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf)
                if not os.path.isdir(feed_dir):
                    os.makedirs(feed_dir)
                for a, article in enumerate(feed):
                    if a >= self.max_articles_per_feed:
                        break
                    art_dir = os.path.join(feed_dir, 'article_%d'%a)
                    if not os.path.isdir(art_dir):
                        os.makedirs(art_dir)
                    try:
                        url = obj.print_version(article.url)
                    except NotImplementedError:
                        url = article.url
                    except:
                        self.log.exception('Failed to find print version for: '+article.url)
                        url = None
                    if not url:
                        continue
                    func, arg = (self.fetch_embedded_article, article) \
                        if obj.use_embedded_content or (obj.use_embedded_content is None and feed.has_embedded_content()) \
                        else \
                        ((self.fetch_obfuscated_article if obj.articles_are_obfuscated
                          else self.fetch_article), url)
                    req = WorkRequest(func, (arg, art_dir, f, a, len(feed)),
                            {}, (f, a), self.article_downloaded,
                            self.error_in_article_download)
                    req.feed = feed
                    req.article = article
                    req.feed_dir = feed_dir
                    self.jobs.append(req)
            self.has_single_feed = len(temp_feeds) == 1
            self.feed_objects = temp_feeds
            self.jobs_done = 0
            # Download this sub-recipe's articles before moving on to the next
            tp = ThreadPool(obj.simultaneous_downloads)
            for req in self.jobs:
                tp.putRequest(req, block=True, timeout=0)
            self.report_progress(0, _('Starting download [%d thread(s)]...')%obj.simultaneous_downloads)
            while True:
                try:
                    tp.poll()
                    time.sleep(0.1)
                except NoResultsPending:
                    break
            # Write a per-feed index.html into each numbered feed directory
            for f, feed in enumerate(temp_feeds):
                myf = f + feedcount
                html = self.feed2index(f, temp_feeds)
                feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf)
                with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi:
                    fi.write(html)
            feeds.extend(temp_feeds)
            feedcount += len(temp_feeds)
        # All sub-recipes are done; the rest works on the combined feed list
        remove = [f for f in feeds
                  if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)
        if not feeds:
            raise ValueError('No articles found, aborting')
        #feeds = FeedCollection(feeds)
        self.has_single_feed = len(feeds) == 1
        self.feed_objects = feeds
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        self.report_progress(0, _('Generating masthead...'))
        self.masthead_path = None
        try:
            murl = self.get_masthead_url()
        except:
            self.log.exception('Failed to get masthead url')
            murl = None
        if murl is not None:
            # Try downloading the user-supplied masthead_url
            # Failure sets self.masthead_path to None
            self.download_masthead(murl)
        if self.masthead_path is None:
            self.log.info("Synthesizing mastheadImage")
            self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg')
            try:
                self.default_masthead_image(self.masthead_path)
            except:
                self.log.exception('Failed to generate default masthead image')
                self.masthead_path = None
        if self.test:
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        index = os.path.join(self.output_dir, 'index.html')
        html = self.feeds2index(feeds)
        with open(index, 'wb') as fi:
            fi.write(html)
        self.feed_objects = feeds
        #feeds.restore_duplicates()
        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s')%index)
        return index
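
    # Register one instance of every custom recipe class here; build_index()
    # walks this list, so adding a new blog only requires defining its class
    # above and appending an instance below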
    def __init__(self, options, log, progress_reporter):
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)
        self.recipe_objects = [
            NashvillestRecipe(options, log, progress_reporter),
            FPRRecipe(options, log, progress_reporter),
            FirstThingsRecipe(options, log, progress_reporter),
        ]
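In case it's useful: once the whole thing is saved as a .recipe file, it should be runnable from the command line with something like ebook-convert all_blogs.recipe output.mobi (all_blogs.recipe being whatever you named the file), or you can paste it into Calibre's add-a-custom-news-source dialog.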