09-04-2011, 06:22 PM   #2
Krittika Goyal
Vox calibre
 
 
Posts: 412
Karma: 1175230
Join Date: Jan 2009
Device: Sony reader prs700, kobo
I have included 3 of the sections of the website. Also, I used auto cleanup, which removes one or two pictures; you can do the cleanup in detail if you wish (see the sketch after the recipe below). For the most part the auto cleanup works very well.

Hope this helps

Code:
from calibre.web.feeds.recipes import BasicNewsRecipe


class Spectator(BasicNewsRecipe):

    title       = 'The Spectator'
    __author__  = 'Krittika Goyal'
    description = 'Weekly UK magazine of politics and culture'
    timefmt = ' [%d %b, %Y]'
    needs_subscription = False
    
    no_stylesheets = True
    auto_cleanup = True


    def articles_in_spec_section(self, section_url):
        # Collect all article links from a single section index page
        articles = []
        soup = self.index_to_soup(section_url)
        div = soup.find(id='centre')
        if div is None:
            return articles
        for x in div.findAll('h1'):
            # Each article is announced by an h1 containing its link
            title = self.tag_to_string(x)
            self.log('\tFound article:', title)
            a = x.find('a', href=True)
            if a is None:
                continue
            url = a['href']
            if url.startswith('/'):
                url = 'http://www.spectator.co.uk' + url
            articles.append({'title':title, 'url':url,
                   'description':'', 'date':''})
        return articles

    def parse_index(self):
        # Build the table of contents: one feed per section of the site
        sections = []
        for title, url in [
              ('Politics', 'http://www.spectator.co.uk/politics/all/'),
              ('Essays', 'http://www.spectator.co.uk/essays/'),
              ('Columnists', 'http://www.spectator.co.uk/columnists/all/'),
                   ]:
            self.log('Processing section:', title)
            articles = self.articles_in_spec_section(url)
            if articles:
                sections.append((title, articles))
        return sections
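
By the way, if you want to do the cleanup in detail instead of relying on auto_cleanup, the recipe API also has keep_only_tags and remove_tags. Here is a rough sketch of what that could look like (keeping the parse_index and articles_in_spec_section methods from the recipe above). Note that 'centre' is the id used on the index pages above, but the names in remove_tags are only placeholders -- you would have to look at the actual Spectator article HTML and adjust them:

Code:
from calibre.web.feeds.recipes import BasicNewsRecipe

class Spectator(BasicNewsRecipe):
    title          = 'The Spectator'
    no_stylesheets = True

    # Instead of auto_cleanup = True: keep only the main content div
    # and strip the bits inside it that you don't want.
    # 'centre' comes from the index pages used above; 'article-tools'
    # and 'comments' are just guesses -- check the real article pages.
    keep_only_tags = [
        dict(name='div', attrs={'id':'centre'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class':'article-tools'}),
        dict(name='div', attrs={'id':'comments'}),
    ]
Either way, you can test the recipe quickly from the command line with
ebook-convert myrecipe.recipe test.epub --test
which only fetches a couple of articles per section so you can check the output.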