Chatham House is one of the better-known foreign policy think-tanks in London. Their monthly publication is called The World Today.
I didn't have to do much to get it to work, but I had real problems trying to format the text so that it reads a bit better (e.g. splitting up the paragraphs). Perhaps someone could help? My attempt at this is still in the attached recipe, but commented out.
Code:
__license__ = 'GPL v3'
class ChathamHouseTheWorldToday(BasicNewsRecipe):
    """Calibre recipe for Chatham House's monthly publication, The World Today.

    Articles are taken from a single RSS feed. ``use_embedded_content`` is
    enabled because the feed entries already carry the full article text.
    The cover image is scraped from the publication's landing page.
    """

    # Recipe metadata.
    title = u'Chatham House: The World Today'
    publisher = u'Chatham House'
    __author__ = u'Ben Leavett'
    comments = u'Calibre recipe by Ben Leavett'

    # Feed selection limits (see calibre's BasicNewsRecipe docs for units).
    oldest_article = 40
    max_articles_per_feed = 100
    feeds = [(u'The World Today', u'http://www.chathamhouse.org.uk/rss/16/')]

    # Full content is in the RSS feed.
    use_embedded_content = True

    # Page that get_cover_url() scrapes for the cover image.
    page_with_cover_img = u'http://www.chathamhouse.org.uk/publications/twt/'

    def preprocess_html(self, soup):
        """Return ``soup`` unchanged.

        This hook is meant to insert some line breaks into the HTML, but the
        attempt kept in the comments below is disabled because it does not
        work yet.

        Args:
            soup: Parsed HTML of the article, as passed in by calibre.

        Returns:
            The same ``soup`` object, unmodified.
        """
        # BJL: this is intended to add in some line breaks where it finds
        # '\n' characters. It successfully builds 'newspan' but the final
        # call to 'replaceWith' only results in clearing the contents of
        # 'it'; it doesn't then do the insert part of the replace.
        #
        # NOTE(review): 'it.string' is presumably None for spans that hold
        # child tags rather than a single string, which would make the
        # '.find' call below raise -- confirm before re-enabling.
        #
        # for it in soup.findAll('span'):
        #     # If we find at least one '\n' character in this span
        #     if it.string.find('\n') > -1:
        #         lines = it.string.split('\n')
        #         newspan = Tag(soup, 'span')
        #         i=0
        #         for line in lines:
        #             p = Tag(soup, 'p')
        #             p.insert(0, NavigableString(line))
        #             newspan.insert(i, p)
        #             i+=1
        #         it.replaceWith(newspan)
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Return ``soup`` unchanged; no post-processing is applied.

        Args:
            soup: Parsed HTML of the downloaded page.
            first_fetch: Passed in by calibre; not used here.

        Returns:
            The same ``soup`` object, unmodified.
        """
        return soup

    def get_cover_url(self):
        """Scrape the cover image URL from ``page_with_cover_img``.

        The image is expected at ``div#contentInner_subpage > h2 > img``.

        Returns:
            The value of the image's ``src`` attribute, or ``None`` when the
            expected markup (div, h2, img or its ``src``) is not present, so
            that a change in the page layout does not raise from this method.
        """
        soup = self.index_to_soup(self.page_with_cover_img)

        # Walk the expected structure one step at a time; each lookup yields
        # None when the element is missing, so guard before descending.
        container = soup.find('div', {'id' : 'contentInner_subpage'})
        heading = container.h2 if container is not None else None
        image = heading.img if heading is not None else None
        cover_url = image.get('src') if image is not None else None

        if not cover_url:
            self.log('Could not find cover image on ' + self.page_with_cover_img)
            return None

        self.log('Found cover URL: ' + cover_url)
        return cover_url

    def get_masthead_url(self):
        """Return the fixed URL of the Chatham House logo used as masthead."""
        return u'http://www.chathamhouse.org.uk/images/main_logo.gif'