View Single Post
Old 10-13-2011, 04:43 AM   #5
RichardN
Junior Member
RichardN began at the beginning.
 
Posts: 8
Karma: 10
Join Date: Mar 2011
Location: London, UK
Device: Paperwhite
I am happily using a very slightly expanded version of Krittika Goyal's code. There are certain sections it does not parse correctly; I will include them once I have debugged the problem. Try using this, which gives most of what is needed:


=============================================
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(BasicNewsRecipe):
    """Calibre recipe that downloads articles from The Spectator.

    The table of contents is built by scraping a fixed list of section
    index pages on www.spectator.co.uk and collecting the article links
    found in each page's ``<h1>`` headings.

    NOTE(review): the class name does not match the publication; it is
    kept unchanged so anything referring to the recipe by name still works.
    """

    title = 'The Spectator'
    __author__ = 'Krittika Goyal'
    description = 'UK magazine'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False

    no_stylesheets = True
    auto_cleanup = True

    def articles_in_spec_section(self, section_url):
        """Return the articles listed on one section index page.

        :param section_url: URL of the section index page to scrape.
        :return: list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings). The list is empty when the page has no element with
            ``id='centre'`` or no linked ``<h1>`` headings inside it.
        """
        soup = self.index_to_soup(section_url)
        container = soup.find(id='centre')
        if container is None:
            # Without this guard a page missing the container would raise
            # AttributeError and abort the whole download.
            self.log('\tNo article container found in:', section_url)
            return []

        articles = []
        for heading in container.findAll('h1'):
            link = heading.find('a', href=True)
            if link is None:
                # A heading without a link is not an article entry.
                continue
            title = self.tag_to_string(heading)
            self.log('\tFound article:', title)
            url = link['href']
            if url.startswith('/'):
                # Site-relative link: make it absolute.
                url = 'http://www.spectator.co.uk' + url
            articles.append({'title': title, 'url': url,
                             'description': '', 'date': ''})
        return articles

    def parse_index(self):
        """Build the table of contents for the issue.

        :return: list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list returned by
            :meth:`articles_in_spec_section`. Sections that yield no
            articles are left out.
        """
        section_pages = [
            ('Politics', 'http://www.spectator.co.uk/politics/all/'),
            ('Essays', 'http://www.spectator.co.uk/essays/'),
            ('Wit & Wisdom', 'http://www.spectator.co.uk/wit-and-wisdom/all/'),
            ('Columnists', 'http://www.spectator.co.uk/columnists/all/'),
            ('Arts', 'http://www.spectator.co.uk/arts-and-culture/featured/'),
            # Currently disabled section:
            # ('Books', 'http://www.spectator.co.uk/books/'),
        ]

        sections = []
        for title, url in section_pages:
            self.log('Processing section:', title)
            articles = self.articles_in_spec_section(url)
            if articles:
                sections.append((title, articles))
        return sections
==========================================

Last edited by Starson17; 10-13-2011 at 09:08 AM.
RichardN is offline   Reply With Quote