MobileRead Forums - View Single Post

Steven630 · 02-11-2013, 05:06 AM

I'm writing a recipe for http://www.economist.com/theworldin/2013

However, I have a problem detecting all the articles, because the first article of each section is marked up differently from the rest. I know how to write two separate recipes that together would cover all the articles, but I haven't figured out a way to do it in a single recipe.

Here is the recipe that fetches all the articles except the first article of each section. I'd appreciate it if someone could take a look and tweak the recipe.

Code:

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
from collections import OrderedDict

import re

class AdvancedUserRecipe1342144530(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from The Economist's
    'The World In 2013' index page.

    The index page (``INDEX``) is parsed by :meth:`parse_index`, which turns
    every ``<section>`` that has an ``<h1>`` title into a feed and every
    ``<li>`` containing a ``headline`` element into an article of that feed.
    """

    title = 'The World In 2013'
    language = 'en'

    __author__ = "Kovid Goyal"
    # Page listing all sections and their articles.
    INDEX = 'http://www.economist.com/theworldin/2013'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css      = '''
        .headline {font-size: large;}
        '''

    # Restrict downloaded article pages to their <article> element.
    keep_only_tags = [dict(name='article')]
    no_stylesheets = True

    # Pause between consecutive downloads (presumably seconds -- see the
    # calibre recipe API documentation).
    delay = 1

    def parse_index(self):
        """Scrape ``INDEX`` and return the list of feeds.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``description`` and ``date``. Sections without an
            ``<h1>`` title, with an empty title, or without any detected
            article are omitted.
        """
        feeds = []
        soup = self.index_to_soup(self.INDEX)

        for section in soup.findAll('section'):
            h1 = section.find('h1')
            if h1 is None:
                continue
            section_title = self.tag_to_string(h1)
            if not section_title:
                continue
            self.log('Found section:', section_title)

            # NOTE(review): only headlines located inside <li> elements are
            # collected. An article the page marks up differently (e.g. a
            # section's lead article) will not be found here -- confirm
            # against the live page markup and extend the lookup if needed.
            articles = []
            for post in section.findAll('li'):
                a = post.find(attrs={'class': 'headline'})
                if a is None:
                    continue
                # A headline element without a link cannot be downloaded;
                # skip it instead of aborting the whole index with KeyError.
                url = a.get('href')
                if not url:
                    continue
                title = self.tag_to_string(a)
                # Site-relative links need the host prepended.
                if url.startswith('/'):
                    url = 'http://www.economist.com' + url
                self.log('\tFound article:', title, 'at', url)
                articles.append({'title': title, 'url': url,
                                 'description': '', 'date': ''})

            if articles:
                feeds.append((section_title, articles))
        return feeds