It looks like they added another articles_list for the letters.
A quick fix would be to find every articles_list (there is now more than one) and then iterate over all of the tables of contents, not just the first one.
Try this for now. This is Kovid's recipe, so he should decide what the final fix should be.
Spoiler:
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
nybooks.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
def find_header(tag):
    '''
    Tell whether ``tag`` is a ``<header>`` element whose parent element has
    the class ``article``.

    Used as a callable ``name`` matcher in ``keep_only_tags``.

    :param tag: Parsed HTML element exposing ``name`` and ``parent``; the
                parent is queried with ``get('class')``.
    :return: ``True`` for a matching header, ``False`` otherwise.
    '''
    if tag.name != 'header':
        return False
    parent = tag.parent
    if parent is None:
        # Detached element: there is no parent whose class could match.
        return False
    # Use get() so that a parent without a class attribute is simply a
    # non-match instead of raising KeyError.
    classes = parent.get('class')
    if isinstance(classes, (list, tuple)):
        # Some parsers expose a multi-valued class attribute as a list;
        # normalise it to the space separated string form before comparing.
        classes = ' '.join(classes)
    return classes == 'article'
class NewYorkReviewOfBooks(BasicNewsRecipe):
    '''
    Recipe that builds a single "Current Issue" feed from the current-issue
    page of nybooks.com.
    '''

    title = u'New York Review of Books (no subscription)'
    description = u'Book reviews'
    language = 'en'

    __author__ = 'Kovid Goyal'

    no_stylesheets = True
    no_javascript = True

    # Parts of an article page to keep: the article body, the header
    # selected by find_header, and the footnotes / subscribers-only notice.
    keep_only_tags = [
        dict(name='section', attrs={'class':'article_body'}),
        dict(name=find_header),
        dict(name='div', attrs={'class':['footnotes', 'for-subscribers-only']}),
    ]

    # Replace the whole <head>...</head> element with an empty one.
    preprocess_regexps = [
        (re.compile(r'<head>.*?</head>', re.DOTALL), lambda m: '<head></head>'),
    ]

    def print_version(self, url):
        '''
        Return ``url`` with the ``?pagination=false`` query string appended.
        '''
        return url+'?pagination=false'

    def preprocess_html(self, soup):
        '''
        Tidy a parsed article page.

        Moves the first ``<header>`` to the start of ``<body>``, removes the
        header's ``details`` div and strips every ``<input>`` element. Each
        step is skipped when the element it needs is missing.

        :param soup: Parsed article page.
        :return: The same ``soup`` object, modified in place.
        '''
        header = soup.find('header')
        body = soup.find('body')
        if header is not None:
            if body is not None:
                body.insert(0, header)
            details = header.find('div', attrs={'class':'details'})
            if details is not None:
                details.extract()
        for input_tag in soup.findAll('input'):
            input_tag.extract()
        return soup

    def parse_index(self):
        '''
        Build the article list from the current-issue page.

        Sets ``self.cover_url`` and ``self.timefmt`` when the cover image and
        the issue date are found, then collects the articles from every
        ``articles_list`` div inside the ``current_issue`` div.

        :return: ``[('Current Issue', articles)]`` where each article is a
                 dict with the keys ``title``, ``url``, ``date`` and
                 ``description``. The list is empty when no table of
                 contents is found.
        '''
        soup = self.index_to_soup('http://www.nybooks.com/current-issue')

        # Find cover
        sidebar = soup.find('div', attrs={'class':'issue_cover'})
        if sidebar is not None:
            img = sidebar.find('img', src=True)
            if img is not None:
                self.cover_url = 'http://www.nybooks.com' + img['src']
                self.log('Found cover at:', self.cover_url)

        # Find date
        time_tag = soup.find('time', pubdate='pubdate')
        if time_tag is not None:
            text = self.tag_to_string(time_tag)
            # Keep only the part before the bullet character.
            date = text.partition(u'\u2022')[0].strip()
            self.timefmt = u' [%s]'%date
            self.log('Issue date:', date)

        # Find TOC. The page can contain several articles_list divs, so
        # every one of them is processed, not just the first.
        issue = soup.find('div', attrs={'class':'current_issue'})
        if issue is None:
            self.log('Could not find the current issue table of contents')
            tocs = []
        else:
            tocs = issue.findAll('div', attrs={'class':'articles_list'})

        articles = []
        for toc in tocs:
            for row in toc.findAll('div', attrs={'class':'row'}):
                h2 = row.find('h2')
                link = h2.find('a', href=True) if h2 is not None else None
                if link is None:
                    # A row without a linked heading has nothing to
                    # download; skip it instead of aborting the whole run.
                    continue

                title = self.tag_to_string(h2).strip()
                author_div = row.find('div', attrs={'class':'author'})
                author = ''
                if author_div is not None:
                    author = self.tag_to_string(author_div).strip()
                if author:
                    # Only add the suffix when there is an author, so the
                    # title never ends in an empty pair of parentheses.
                    title = title + u' (%s)'%author

                url = 'http://www.nybooks.com' + link['href']

                # The description is the text of every <p> whose class
                # attribute contains 'quiet'.
                desc = ''.join(
                    self.tag_to_string(p)
                    for p in row.findAll(
                        'p', attrs={'class':lambda x: x and 'quiet' in x})
                )

                self.log('Found article:', title)
                self.log('\t', url)
                self.log('\t', desc)
                articles.append({'title':title, 'url':url, 'date':'',
                    'description':desc})

        return [('Current Issue', articles)]