Old 11-21-2011, 07:41 PM   #1
Barty
Join Date: Sep 2010
Device: Kobo Libra 2, Kindle Voyage
longform.org (My first recipe, please critique)

longform.org is an aggregation/curation site for long general-interest articles from around the web. It has a proper feed, but the links point to summaries on its own site rather than to the original articles. Maybe there's a simple workaround for this, but I don't know of one, so I wrote a recipe. It's my first recipe and also my first time doing anything with Python, so it's probably extremely naive.
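The essential trick, shown here in isolation, is a regex that pulls the href of the "full story" link out of each item's content:encoded field. The sample payload below is an assumption made up for illustration; the real feed markup may differ:

```python
import re

# Hypothetical sample of a feed item's content:encoded payload; the exact
# markup here is an assumption for illustration.
sample = ('<p>A teaser for the piece.</p><br/>'
          '[<a href="http://www.example.com/story">full story</a>]')

def extract_full_story_url(html):
    """Return the href of the 'full story' link, or None if it is absent."""
    m = re.search(r'href="(.+?)">full story<', html, re.I)
    return m.group(1) if m else None

print(extract_full_story_url(sample))  # http://www.example.com/story
```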

Code:
import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1321856301(BasicNewsRecipe):
    title          = u'Longform.org'
    __author__     = 'barty on mobileread.com forum'
    publisher      = 'longform.org'
    category       = 'essay, long form journalism'
    max_articles_per_feed = 100
    oldest_article = 365
    auto_cleanup   = True
    feeds          = [
        (u'Editor\'s Picks', u'http://longform.org/category/editors-pick/feed'),
        (u'More articles', u'http://longform.org/feed'),
    ]

    def parse_index(self):
        self.cover_url = 'http://longform.org/wp-content/themes/grid_focus_april2011/images/longform_flag.jpg'
        seen_urls = set()
        totalfeeds = []
        for feedtitle, feedurl in self.get_feeds():
            articles = []
            soup = self.index_to_soup(feedurl)
            for item in soup.findAll('item'):
                content = item.find('content:encoded')
                if not content:
                    continue
                # The feed's <link> points at a longform.org summary page;
                # the original article URL only appears inside content:encoded,
                # as the href of the "full story" link.
                m = re.search(r' href="(.+?)">full story<', content.contents[0], re.I)
                if not m:
                    continue
                url = m.group(1)
                # skip promotionals and duplicates
                if url.startswith('http://long.fm') or url in seen_urls:
                    continue
                seen_urls.add(url)
                pubdate = item.find('pubdate')
                date = pubdate.contents[0][:16] if pubdate and pubdate.contents else ''
                # There is a <description> tag, but it is always truncated,
                # so prefer the full text in content:encoded and prefix it
                # with the source domain in brackets.
                m = re.search(r'.+?<br\s*/>(.+)\[<a href="http://(www\.)?([^:/]+)',
                              content.contents[0], re.DOTALL | re.I)
                desc = '[' + m.group(3) + '] ' + m.group(1) if m else item.description.contents[0]
                articles.append({'title': item.title.contents[0], 'url': url,
                                 'date': date, 'description': desc})
            totalfeeds.append((feedtitle, articles))
        return totalfeeds
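The description-building regex is the densest part of the recipe, so here it is exercised in isolation. The sample payload is my assumption of what the feed emits; on it, the regex yields the source domain in brackets followed by the summary text:

```python
import re

# Assumed shape of a content:encoded payload: teaser, <br/>, full summary,
# then the bracketed "full story" link (made up for illustration).
content = ('Teaser line<br/>A long essay about something interesting. '
           '[<a href="http://www.newyorker.com/archive/piece">full story</a>]')

m = re.search(r'.+?<br\s*/>(.+)\[<a href="http://(www\.)?([^:/]+)',
              content, re.DOTALL | re.I)
if m:
    # group(3) is the bare domain, group(1) is the summary text
    desc = '[' + m.group(3) + '] ' + m.group(1)
    print(desc)
```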
It mostly works. Articles from vanityfair.com get truncated for some reason. The way I pull the URL out of the content:encoded field is rather ugly (find doesn't work the way I thought/hoped it would). Lastly, when I run in test mode, today's date is added to the book title (which I like), but not when running for real.