I have included 3 of the sections of the website. Also, I used auto cleanup, which removes one or two pictures. You can do the cleanup in detail if you wish. For the most part, the auto cleanup works very well.
Hope this helps
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class NYTimes(BasicNewsRecipe):
    """Calibre news recipe that collects articles from 'The Spectator'.

    The recipe builds its table of contents from a fixed list of section
    index pages (Politics, Essays, Columnists) and turns every linked
    ``<h1>`` heading found on those pages into an article entry.

    NOTE(review): the class name does not match the publication in
    ``title``; it is kept unchanged so existing references keep working.
    """

    title = 'The Spectator'
    __author__ = 'Krittika Goyal'
    description = 'UK magazine'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    no_stylesheets = True
    # Author's note: automatic cleanup may drop one or two pictures.
    auto_cleanup = True

    # Prefix used to turn site-relative links into absolute URLs.
    INDEX = 'http://www.spectator.co.uk'

    def articles_in_spec_section(self, section_url):
        """Return the article entries listed on one section index page.

        :param section_url: URL of the section index page to scan.
        :return: A list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings). The list is empty when the page has no element with
            id ``centre`` or no linked ``<h1>`` headings inside it.
        """
        articles = []
        soup = self.index_to_soup(section_url)
        div = soup.find(id='centre')
        if div is None:
            # Layout changed or the page failed to render as expected:
            # skip this section instead of crashing the whole download.
            self.log('\tNo article container found in:', section_url)
            return articles

        # Every article on the index page is announced by an <h1> heading.
        for heading in div.findAll('h1'):
            a = heading.find('a', href=True)
            if a is None:
                # Heading without a link is not an article entry.
                continue
            title = self.tag_to_string(heading)
            self.log('\tFound article:', title)
            url = a['href']
            if url.startswith('/'):
                # Site-relative link: make it absolute.
                url = self.INDEX + url
            articles.append({
                'title': title,
                'url': url,
                'description': '',
                'date': '',
            })
        return articles

    def parse_index(self):
        """Build the table of contents for the recipe.

        :return: A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list returned by
            :meth:`articles_in_spec_section`. Sections that yield no
            articles are left out.
        """
        section_pages = (
            ('Politics', 'http://www.spectator.co.uk/politics/all/'),
            ('Essays', 'http://www.spectator.co.uk/essays/'),
            ('Columnists', 'http://www.spectator.co.uk/columnists/all/'),
        )
        sections = []
        for title, url in section_pages:
            self.log('Processing section:', title)
            articles = self.articles_in_spec_section(url)
            if articles:
                sections.append((title, articles))
        return sections