05-03-2011, 04:05 PM | #1 |
Junior Member
Posts: 2
Karma: 10
Join Date: May 2011
Device: Kindle (3rd gen)
|
First stab at a meta-recipe
Until meta-recipes are properly implemented, the following recipe builds a mobi-pocket file from the custom recipe classes of all of my favorite blogs. When I send the resulting mobi to my Kindle, it appears as a periodical with the nice table of contents and so on. I thought others might find it useful. Simply add whatever custom recipe classes you need before the final AllBlogsRecipe, then add instances of those classes to the list at the end of the __init__ function. Currently oldest_article and max_articles_per_feed are overridden by the AllBlogsRecipe class, but you can comment those lines out if you want to use different values for each recipe.
Note 1: I've never coded in python before. This code is crude, a hack, and probably has unnecessary lines, but it works for me! Note 2: I removed most of my custom blog recipes, for brevity's sake. Two of the recipes included, Front Porch Republic and First Things would probably make good additions to Calibre's built-in recipe list. Question for anyone who knows: How does Calibre know to use only the AllBlogsRecipe class, and not the custom classes before it? Code:
#!/usr/bin/python from __future__ import with_statement import os, time, traceback, re, urlparse, sys, cStringIO from collections import defaultdict from functools import partial from contextlib import nested, closing from calibre import browser, __appname__, iswindows, \ strftime, preferred_encoding, as_unicode from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from calibre.ebooks.metadata.opf2 import OPFCreator from calibre import entity_to_unicode from calibre.web import Recipe from calibre.ebooks.metadata.toc import TOC from calibre.ebooks.metadata import MetaInformation from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed from calibre.web.fetch.simple import option_parser as web2disk_option_parser from calibre.web.fetch.simple import RecursiveFetcher from calibre.utils.threadpool import WorkRequest, ThreadPool, NoResultsPending from calibre.ptempfile import PersistentTemporaryFile from calibre.utils.date import now as nowf from calibre.utils.magick.draw import save_cover_data_to, add_borders_to_image class FirstThingsRecipe(BasicNewsRecipe): title = "First Things: On the Square" description = "Daily columns from First Things' top writers" language = 'en' use_embedded_content = False encoding = "cp1252" feeds = [('First Things: On the Square', 'http://www.firstthings.com/rss/onthesquare.php')] remove_tags = [dict(name='script', attrs={'type':'text/javascript'})] no_stylesheets = True def print_version(self, url): return url.replace('http://www.firstthings.com/', 'http://www.firstthings.com/print/') class FPRRecipe(BasicNewsRecipe): title = "Front Porch Republic" description = "Place, Limits, Liberty" language = 'en' use_embedded_content = False feeds = [('Front Porch Republic', 'http://feeds.feedburner.com/FrontPorchRepublic')] no_stylesheets = True keep_only_tags = [dict(id=['content', 'heading'])] remove_tags = [dict(name='div', attrs={'id':'respond'}), dict(name='div', attrs={'class':'prev_next 
post_nav'})] preprocess_regexps = [ (re.compile(r'middot;.*?</div>', re.DOTALL|re.IGNORECASE), lambda match: 'nbsp;</div>') ] class NashvillestRecipe(BasicNewsRecipe): title = "Nashvillest" language = "en" description = "All you ever wanted to know about the Music City" feeds = [('Nashvillest', 'http://nashvillest.com/feed/')] remove_tags = [dict(name='a', attrs={'rel':'nofollow'})] class AllBlogsRecipe(BasicNewsRecipe): title = "Justin's Blogs" oldest_article = 3 max_articles_per_feed = 100 remove_empty_feeds = True remove_javascript = True no_stylesheets = True def build_index(self): self.report_progress(0, _('Fetching feeds...')) self.jobs = [] feeds = [] feedcount = 0 for obj in self.recipe_objects: obj.jobs = [] self.jobs = [] self.web2disk_cmdline = [ 'web2disk', '--timeout', str(obj.timeout), '--max-recursions', str(obj.recursions), '--delay', str(obj.delay), ] if obj.verbose: self.web2disk_cmdline.append('--verbose') if obj.no_stylesheets: self.web2disk_cmdline.append('--dont-download-stylesheets') for reg in obj.match_regexps: self.web2disk_cmdline.extend(['--match-regexp', reg]) for reg in obj.filter_regexps: self.web2disk_cmdline.extend(['--filter-regexp', reg]) self.web2disk_options = web2disk_option_parser().parse_args(self.web2disk_cmdline)[0] for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps', 'skip_ad_pages', 'preprocess_html', 'remove_tags_after', 'remove_tags_before', 'is_link_wanted'): setattr(self.web2disk_options, extra, getattr(obj, extra)) self.web2disk_options.postprocess_html = obj._postprocess_html self.web2disk_options.encoding = obj.encoding obj.max_articles_per_feed = self.max_articles_per_feed obj.oldest_article = self.oldest_article try: temp_feeds = feeds_from_index(obj.parse_index(), oldest_article=self.oldest_article, max_articles_per_feed=self.max_articles_per_feed, log=self.log) self.report_progress(0, _('Got feeds from index page')) except NotImplementedError: temp_feeds = obj.parse_feeds() remove = [f for f in 
temp_feeds if len(f) == 0 and self.remove_empty_feeds] for f in remove: temp_feeds.remove(f) obj.has_single_feed = len(temp_feeds) == 1 obj.feed_objects = temp_feeds if obj.reverse_article_order: for feed in temp_feeds: if hasattr(feed, 'reverse'): feed.reverse() for f, feed in enumerate(temp_feeds): myf = f + feedcount feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf) if not os.path.isdir(feed_dir): os.makedirs(feed_dir) for a, article in enumerate(feed): if a >= self.max_articles_per_feed: break art_dir = os.path.join(feed_dir, 'article_%d'%a) if not os.path.isdir(art_dir): os.makedirs(art_dir) try: url = obj.print_version(article.url) except NotImplementedError: url = article.url except: self.log.exception('Failed to find print version for: '+article.url) url = None if not url: continue func, arg = (self.fetch_embedded_article, article) \ if obj.use_embedded_content or (obj.use_embedded_content == None and feed.has_embedded_content()) \ else \ ((self.fetch_obfuscated_article if obj.articles_are_obfuscated \ else self.fetch_article), url) req = WorkRequest(func, (arg, art_dir, f, a, len(feed)), {}, (f, a), self.article_downloaded, self.error_in_article_download) req.feed = feed req.article = article req.feed_dir = feed_dir self.jobs.append(req) self.has_single_feed = len(temp_feeds) == 1 self.feed_objects = temp_feeds self.jobs_done = 0 tp = ThreadPool(obj.simultaneous_downloads) for req in self.jobs: tp.putRequest(req, block=True, timeout=0) self.report_progress(0, _('Starting download [%d thread(s)]...')%obj.simultaneous_downloads) while True: try: tp.poll() time.sleep(0.1) except NoResultsPending: break for f, feed in enumerate(temp_feeds): myf = f+feedcount html = self.feed2index(f,temp_feeds) feed_dir = os.path.join(self.output_dir, 'feed_%d'%myf) with open(os.path.join(feed_dir, 'index.html'), 'wb') as fi: fi.write(html) feeds.extend(temp_feeds) feedcount = feedcount + len(temp_feeds) remove = [f for f in feeds if len(f) == 0 and 
self.remove_empty_feeds] for f in remove: feeds.remove(f) if not feeds: raise ValueError('No articles found, aborting') #feeds = FeedCollection(feeds) self.has_single_feed = len(temp_feeds) == 1 self.feed_objects = temp_feeds self.report_progress(0, _('Trying to download cover...')) self.download_cover() self.report_progress(0, _('Generating masthead...')) self.masthead_path = None try: murl = self.get_masthead_url() except: self.log.exception('Failed to get masthead url') murl = None if murl is not None: # Try downloading the user-supplied masthead_url # Failure sets self.masthead_path to None self.download_masthead(murl) if self.masthead_path is None: self.log.info("Synthesizing mastheadImage") self.masthead_path = os.path.join(self.output_dir, 'mastheadImage.jpg') try: self.default_masthead_image(self.masthead_path) except: self.log.exception('Failed to generate default masthead image') self.masthead_path = None if self.test: feeds = feeds[:2] self.has_single_feed = len(feeds) == 1 index = os.path.join(self.output_dir, 'index.html') html = self.feeds2index(feeds) with open(index, 'wb') as fi: fi.write(html) self.feed_objects = feeds #feeds.restore_duplicates() self.create_opf(feeds) self.report_progress(1, _('Feeds downloaded to %s')%index) return index def __init__ (self, options, log, progress_reporter): BasicNewsRecipe.__init__(self, options, log, progress_reporter) self.recipe_objects = [ NashvillestRecipe(options, log, progress_reporter), FPRRecipe(options, log, progress_reporter), FirstThingsRecipe(options, log, progress_reporter) ] |
05-03-2011, 04:19 PM | #2 |
creator of calibre
Posts: 43,871
Karma: 22666666
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
calibre just picks the first or last (don't remember) subclass of BasicNewsRecipe it finds in the module. I'd suggest declaring your sub-recipes within the AllBlogs class, like this
Code:
class AllBlogs(BasicNewsRecipe): class Blog1(BasicNewsRecipe): .... and refer to Blog1 as AllBlogs.Blog1 |
Advert | |
|
05-04-2011, 11:15 AM | #3 |
Junior Member
Posts: 2
Karma: 10
Join Date: May 2011
Device: Kindle (3rd gen)
|
That's a good idea. Thanks.
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Recipe works when mocked up as Python file, fails when converted to Recipe | ode | Recipes | 7 | 09-04-2011 04:57 AM |
Please help with Meta-tags | brodymcd | Calibre | 1 | 07-29-2010 06:13 PM |
set meta data with ebook-meta and ebook-convert | krischik | Calibre | 6 | 01-19-2010 11:40 AM |
Meta info | rambling | Calibre | 0 | 11-26-2008 08:22 PM |
Need help with lrf-meta | shousa | Workshop | 8 | 02-03-2008 06:50 PM |