8/7/2012
A minor update. Somehow my linux calibre is unable to download cover pic and my windows one is fine. Anyways this is a quick work around. I will keep updating (if issues come up) in this thread.
Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict
class TNR(BasicNewsRecipe):
title = 'The New Republic'
__author__ = 'Rick Shang'
description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
language = 'en'
category = 'news'
encoding = 'UTF-8'
remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
no_javascript = True
no_stylesheets = True
def parse_index(self):
#Go to the issue
soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
issue = soup0.find('div',attrs={'id':'current_issue'})
#Find date
date = self.tag_to_string(issue.find('div',attrs={'class':'date'})).strip()
self.timefmt = u' [%s]'%date
#Go to the main body
current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
soup = self.index_to_soup(current_issue_url)
div = soup.find ('div', attrs={'class':'article_detail_body'})
feeds = OrderedDict()
section_title = ''
subsection_title = ''
for post in div.findAll('p'):
articles = []
em=post.find('em')
b=post.find('b')
a=post.find('a',href=True)
p=post.find('img', src=True)
#Find cover
if p is not None:
self.cover_url = p['src'].strip()
elif em is not None:
section_title = self.tag_to_string(em).strip()
subsection_title = ''
elif b is not None:
subsection_title=self.tag_to_string(b).strip()
elif a is not None:
prefix = (subsection_title+': ') if subsection_title else ''
url=re.sub('www.tnr.com','www.tnr.com/print', a['href'])
author=re.sub('.*by\s', '', self.tag_to_string(post), re.DOTALL)
title=prefix + self.tag_to_string(a).strip()+ u' (%s)'%author
articles.append({'title':title, 'url':url, 'description':'', 'date':''})
if articles:
if section_title not in feeds:
feeds[section_title] = []
feeds[section_title] += articles
ans = [(key, val) for key, val in feeds.iteritems()]
return ans