View Single Post
Old 01-04-2014, 04:07 PM   #2
aerodynamik
Enthusiast
aerodynamik doesn't litter
 
Posts: 43
Karma: 136
Join Date: Mar 2011
Device: Kindle Paperwhite
It looks like they added another articles_list for the letters.

A quick fix would be to find more than one articles_list and then iterate over all tables of contents, not just the first one.

Try this for now. This is Kovid's recipe so he should decide how the final fix should be.

Spoiler:
Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

'''
nybooks.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe

def find_header(tag):
    '''
    Tell whether ``tag`` is a ``header`` element whose parent has the class
    ``article``.

    Used as a callable tag matcher in ``keep_only_tags``.

    :param tag: object exposing ``name`` and ``parent``; the parent must
        support a mapping-style ``get('class')`` lookup.
    :return: ``True`` when the tag is named ``header`` and its parent's
        ``class`` attribute equals ``'article'``, otherwise ``False``.
    '''
    if tag.name != 'header':
        return False
    parent = tag.parent
    if parent is None:
        return False
    # .get() instead of ['class']: a parent without a class attribute must
    # simply not match, rather than abort the whole match with a KeyError.
    return parent.get('class') == 'article'

class NewYorkReviewOfBooks(BasicNewsRecipe):
    '''
    Recipe for the current issue of the New York Review of Books.

    The article list is scraped from http://www.nybooks.com/current-issue
    and returned as a single feed called "Current Issue".
    '''

    title = u'New York Review of Books (no subscription)'
    description = u'Book reviews'
    language = 'en'

    __author__ = 'Kovid Goyal'

    no_stylesheets = True
    no_javascript = True

    # Tag selectors: the article body section, the <header> that sits
    # directly inside an element of class "article" (see find_header), and
    # the footnotes / for-subscribers-only divs.
    keep_only_tags = [
        dict(name='section', attrs={'class':'article_body'}),
        dict(name=find_header),
        dict(name='div', attrs={'class':['footnotes', 'for-subscribers-only']}),
    ]

    # Replace the whole <head>...</head> element with an empty one.
    preprocess_regexps = [(re.compile(r'<head>.*?</head>', re.DOTALL), lambda
        m:'<head></head>')]

    def print_version(self, url):
        '''
        Return ``url`` with ``?pagination=false`` appended.
        '''
        return url+'?pagination=false'

    def preprocess_html(self, soup):
        '''
        Move the first ``<header>`` to the top of ``<body>``, drop its
        ``details`` div and remove every ``<input>`` element.

        Each step is skipped when the element it needs is absent, so a page
        with an unexpected layout is returned as-is instead of raising.

        :param soup: parsed article page.
        :return: the same ``soup`` object, modified in place.
        '''
        header = soup.find('header')
        if header is not None:
            body = soup.find('body')
            if body is not None:
                body.insert(0, header)
            details = header.find('div', attrs={'class':'details'})
            if details is not None:
                details.extract()
        for i in soup.findAll('input'):
            i.extract()
        return soup

    def parse_index(self):
        '''
        Build the article list for the current issue.

        Sets ``self.cover_url`` and ``self.timefmt`` when the cover image and
        the issue date are present on the page.

        :return: ``[('Current Issue', articles)]`` where every article is a
            dict with the keys ``title``, ``url``, ``date`` and
            ``description``.
        :raises ValueError: if the page has no ``current_issue`` container.
        '''
        soup = self.index_to_soup('http://www.nybooks.com/current-issue')

        # Find cover
        sidebar = soup.find('div', attrs={'class':'issue_cover'})
        if sidebar is not None:
            img = sidebar.find('img', src=True)
            # The cover block may exist without a usable image.
            if img is not None:
                self.cover_url = 'http://www.nybooks.com' + img['src']
                self.log('Found cover at:', self.cover_url)

        # Find date
        div = soup.find('time', pubdate='pubdate')
        if div is not None:
            text = self.tag_to_string(div)
            # Keep only the text before the bullet separator.
            date = text.partition(u'\u2022')[0].strip()
            self.timefmt = u' [%s]'%date
            self.log('Issue date:', date)

        # Find TOC
        issue = soup.find('div', attrs={'class':'current_issue'})
        if issue is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'Could not find the current issue on http://www.nybooks.com/current-issue')

        articles = []
        # The issue page can hold several articles_list blocks (e.g. one for
        # the articles and one for the letters); collect rows from all of them.
        for toc in issue.findAll('div', attrs={'class':'articles_list'}):
            for row in toc.findAll('div', attrs={'class':'row'}):
                h2 = row.find('h2')
                link = h2.find('a', href=True) if h2 is not None else None
                if link is None:
                    # Not an article row (no linked heading): skip it.
                    continue
                title = self.tag_to_string(h2).strip()
                author = self.tag_to_string(row.find('div', attrs={'class':'author'})).strip()
                title = title + u' (%s)'%author
                url = 'http://www.nybooks.com' + link['href']
                desc = ''.join(
                    self.tag_to_string(p)
                    for p in row.findAll('p', attrs={'class':lambda x: x and 'quiet' in x}))
                self.log('Found article:', title)
                self.log('\t', url)
                self.log('\t', desc)
                articles.append({'title':title, 'url':url, 'date':'',
                    'description':desc})

        return [('Current Issue', articles)]
aerodynamik is offline   Reply With Quote