View Single Post
Old 10-13-2011, 04:43 AM   #5
Junior Member
RichardN began at the beginning.
Posts: 8
Karma: 10
Join Date: Mar 2011
Location: London, UK
Device: Paperwhite
I am happily using a very slightly expanded version of Krittika Goyal's code. There are certain sections it does not yet handle correctly; I will include them once I have debugged the problem. Try using this, which gives most of what is needed:

import re
import string

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(BasicNewsRecipe):
    """calibre news recipe that builds an issue of The Spectator.

    The index is assembled by hand in ``parse_index``: every configured
    section page is fetched and each ``<h1>`` heading inside the element
    with ``id='centre'`` that contains a link is treated as one article.
    """

    title = 'The Spectator'
    __author__ = 'Krittika Goyal'
    description = 'UK magazine'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    no_stylesheets = True
    auto_cleanup = True

    def articles_in_spec_section(self, section_url):
        """Collect the articles linked from one section page.

        :param section_url: URL of the section index page to scan.
        :return: list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings), in page order.
        """
        articles = []
        soup = self.index_to_soup(section_url)
        div = soup.find(id='centre')
        if div is None:
            # Without the main container there is nothing to scan; report
            # it and return an empty list instead of raising AttributeError.
            self.log('\tNo element with id "centre" found in', section_url)
            return articles

        # Each article is announced by an <h1> heading wrapping its link.
        for heading in div.findAll('h1'):
            title = self.tag_to_string(heading)
            self.log('\tFound article:', title)
            a = heading.find('a', href=True)
            if a is None:
                # A heading without a link is not an article entry.
                continue
            url = a['href']
            if url.startswith('/'):
                # NOTE(review): the site prefix literal that belongs here was
                # stripped from this copy of the recipe; restore the site's
                # base URL so that relative links become absolute.
                url = '' + url
            articles.append({'title': title, 'url': url,
                             'description': '', 'date': ''})
        return articles

    def parse_index(self):
        """Build the table of contents for the issue.

        :return: list of ``(section_title, articles)`` tuples; sections in
            which no article was found are left out.
        """
        # NOTE(review): every section URL was stripped from this copy of the
        # recipe; fill in the real section index URLs before running it.
        section_pages = [
            ('Politics', ''),
            ('Essays', ''),
            ('Wit & Wisdom', ''),
            ('Columnists', ''),
            ('Arts', ''),
            # 'Books' was disabled by the author of this version.
            # ('Books', ''),
        ]

        sections = []
        for title, url in section_pages:
            self.log('Processing section:', title)
            articles = self.articles_in_spec_section(url)
            if articles:
                sections.append((title, articles))
        return sections

Last edited by Starson17; 10-13-2011 at 09:08 AM.
RichardN is offline   Reply With Quote