I'm writing a recipe for
http://www.economist.com/theworldin/2013
However, I'm having trouble picking up all the articles, because the first article of each section is formatted differently from the rest. I know how to write two separate recipes that together would cover every article, but I haven't figured out how to do it in a single recipe.
Here is the recipe, which fetches all the articles except the first article of each section. I'd appreciate it if someone could take a look and tweak it.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
from collections import OrderedDict
import re
class AdvancedUserRecipe1342144530(BasicNewsRecipe):
    """Calibre recipe for The Economist's "The World In 2013" index page.

    The index at ``INDEX`` is scanned for ``<section>`` elements; each
    section with a non-empty ``<h1>`` becomes one feed, and every ``<li>``
    inside it that contains an element with class ``headline`` becomes one
    article of that feed.

    Known limitation: only headlines found inside ``<li>`` elements are
    collected, so an article that is not rendered as a list item (reported
    to be the first article of each section) is not picked up.
    """

    title = 'The World In 2013'
    language = 'en'
    __author__ = "Kovid Goyal"
    # Page that lists every section and its articles.
    INDEX = 'http://www.economist.com/theworldin/2013'
    # NOTE(review): this text reads like a weekly-edition description and
    # may have been carried over from another recipe -- confirm.
    description = ('Global news and current affairs from a European'
                   ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css = '''
.headline {font-size: large;}
'''
    # Keep only the <article> element of each downloaded page.
    keep_only_tags = [dict(name='article')]
    no_stylesheets = True
    delay = 1

    def parse_index(self):
        """Build the list of feeds from the index page.

        Returns:
            A list of ``(section_title, articles)`` tuples. ``articles`` is
            a list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings). Sections without any article are omitted.
        """
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('section'):
            h1 = section.find('h1')
            if h1 is None:
                continue
            section_title = self.tag_to_string(h1)
            if not section_title:
                continue
            self.log('Found section:', section_title)

            articles = []
            for post in section.findAll('li'):
                a = post.find(attrs={'class': 'headline'})
                if a is None:
                    continue
                # A headline element without a link cannot be downloaded;
                # skip it instead of aborting the whole index with KeyError.
                url = a.get('href')
                if not url:
                    continue
                title = self.tag_to_string(a)
                # Site-relative links need the host prepended.
                if url.startswith('/'):
                    url = 'http://www.economist.com' + url
                self.log('\tFound article:', title, 'at', url)
                articles.append({'title': title, 'url': url,
                                 'description': '', 'date': ''})

            if articles:
                feeds.append((section_title, articles))
        return feeds