Newsweek recipe fails

NSILMike · 11-20-2018, 11:07 AM

IndexError: list index out of range

lui1 · 11-21-2018, 10:27 PM

This update fixes the problem.

Newsweek Recipe Update:

Code:

from calibre.web.feeds.news import BasicNewsRecipe
from collections import defaultdict

# Site root that href_to_url() prepends to the hrefs scraped from index pages.
BASE = 'https://www.newsweek.com'


def href_to_url(a, add_piano=False):
    """Build a full URL from the ``href`` attribute of link element *a*.

    The href is appended to ``BASE`` verbatim. When *add_piano* is true the
    query string ``?piano_d=1`` is appended to the result.
    """
    url = BASE + a.get('href')
    if add_piano:
        url += '?piano_d=1'
    return url


def class_sels(*args):
    """Return a tag selector dict matching any of the given CSS class names.

    The result has the shape ``{'attrs': {'class': predicate}}``. The
    predicate receives a ``class`` attribute value; for a falsy value it
    returns that value unchanged, otherwise it returns the set of
    whitespace-separated class names that are also in *args* (empty, hence
    falsy, when none match).
    """
    wanted = set(args)

    def matches(value):
        if not value:
            return value
        return set(value.split()) & wanted

    return {'attrs': {'class': matches}}


class Newsweek(BasicNewsRecipe):
    """Calibre news recipe for the Newsweek magazine.

    ``parse_index`` reads the magazine archive page, follows the link of the
    first issue listed there (presumably the most recent one) and collects
    that issue's articles grouped by section.
    """

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'
    description = 'Weekly news and current affairs in the US'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    requires_version = (1, 40, 0)

    # Keep only the magazine article header block and the article
    # header/body containers.
    keep_only_tags = [
        dict(id='block-nw-magazine-article-header'),
        class_sels('article-header', 'article-body')
    ]
    # Strip material that does not belong to the article itself (ads, social
    # widgets, related-story boxes, navigation and the like).
    remove_tags = [
        dict(name=['aside', 'meta', 'source']),
        class_sels(
            'block-openadstream', 'block-ibtmedia-social', 'issue-next',
            'most-popular', 'ibt-media-stories', 'user-btn-group',
            'trial-link', 'trc_related_container',
            'block-ibtmedia-top-stories', 'videocontent', 'newsletter-signup',
            'in-text-slideshows', 'content-correction', 'article-navigation'
        ),
        dict(id=['taboola-below-main-column', 'piano-root',
                 'block-nw-magazine-magazine-more-from-issue']),
    ]
    remove_attributes = ['style']

    def _article_entry(self, a):
        """Return ``(article, entry)`` for the headline link *a*.

        *article* is the first element returned for ``ancestor::article``,
        or None when the link is not inside an ``<article>``. *entry* is the
        dict with ``title``, ``url`` and ``description`` keys that goes into
        the index; the description is empty when no summary div is found.
        """
        desc = ''
        containers = a.xpath('ancestor::article')
        # Compare against None explicitly: lxml elements without children
        # are falsy, so a plain truth test would be wrong here.
        article = containers[0] if containers else None
        if article is not None:
            s = article.xpath('descendant::div[@class="summary"]')
            if s:
                desc = self.tag_to_string(s[0])
        entry = {
            'title': self.tag_to_string(a),
            'url': href_to_url(a),
            'description': desc,
        }
        return article, entry

    def parse_index(self):
        """Build the article index for the first issue on the archive page.

        Returns a list of ``(section_title, articles)`` tuples where each
        article is a dict with ``title``, ``url`` and ``description`` keys.
        A "Features" section, when present, comes first; the remaining
        sections follow in sorted order.

        Side effects: sets ``self.timefmt`` to the text of the issue link
        and, when a cover image is found, ``self.cover_url``.

        Raises IndexError when the archive page has no issue entry or that
        entry has no link, because nothing can be downloaded without them.
        """
        root = self.index_to_soup(
            'https://www.newsweek.com/archive', as_tree=True)
        li = root.xpath(
            '//ul[contains(@class, "magazine-archive-items")]/li')[0]
        a = li.xpath('descendant::a[@href]')[0]
        url = href_to_url(a, add_piano=True)
        self.timefmt = self.tag_to_string(a)
        # A missing cover image must not abort the whole download, so the
        # lookup is guarded instead of indexing the result blindly.
        imgs = li.xpath('descendant::a[@href]//img[@src]')
        if imgs:
            self.cover_url = imgs[0].get('src')
        else:
            self.log('No cover image found on the archive page')
        root = self.index_to_soup(url, as_tree=True)
        href_xpath = 'descendant::*[local-name()="h1" or local-name()="h2" or local-name()="h3" or local-name()="h4"]/a[@href]'

        # The features block is optional on the issue page.
        features = []
        divs = root.xpath('//div[@id="block-nw-magazine-magazine-features"]')
        if divs:
            for a in divs[0].xpath(href_xpath):
                article, entry = self._article_entry(a)
                features.append(entry)
                self.log(entry['title'], entry['url'])

        index = []
        if features:
            index.append(('Features', features))
        sections = defaultdict(list)
        for block in ('magazine-magazine-issue-story-list', 'editors-pick'):
            divs = root.xpath('//div[@id="block-nw-{}"]'.format(block))
            if not divs:
                # Treat these blocks like the features block: a missing one
                # is skipped rather than raising IndexError.
                self.log('Block not found on the issue page, skipping:', block)
                continue
            for a in divs[0].xpath(href_xpath):
                article, entry = self._article_entry(a)
                # Articles without a category div go into a catch-all section.
                sec = 'Articles'
                if article is not None:
                    category = article.xpath('descendant::div[@class="category"]')
                    if category:
                        sec = self.tag_to_string(category[0])
                sections[sec].append(entry)
                self.log(entry['title'], entry['url'])
                if entry['description']:
                    self.log('\t' + entry['description'])
                self.log('')
        for k in sorted(sections):
            index.append((k, sections[k]))
        return index

    def print_version(self, url):
        """Return the URL to download for an article: *url* plus ``?piano_d=1``."""
        return url + '?piano_d=1'

    def preprocess_html(self, soup):
        """Turn parallax background images into ordinary ``<img>`` tags.

        Returns the same *soup* object, modified in place.
        """
        # Parallax images in the articles are loaded as background images
        # on <span> tags. Convert them to normal images.
        for span in soup.findAll('span', attrs={'class': lambda x: x and 'parallax' in x.split()}):
            s = span.find(style=True)
            if s is None:
                continue
            # Take the text between the first "(" and the following ")" and
            # drop optional quotes, so that both url(x) and url('x') work and
            # a trailing ";" or further declarations do not leak into the URL.
            url = s['style'].partition('(')[2].partition(')')[0].strip(' \t\'"')
            if not url:
                # No url(...) in the style: leave the tag untouched rather
                # than producing an <img> with an empty src.
                continue
            s['style'] = 'display: block'
            s.name = 'img'
            s['src'] = url
        return soup

NSILMike · 11-22-2018, 08:37 AM

Thanks, but which lines changed?

lui1 · 11-22-2018, 10:44 AM

Quote:

Originally Posted by NSILMike

Thanks, but which lines changed?

The main problem was the "xpath expression" found on line 49 in the original recipe, which describes the pattern to find the cover of the magazine. The other changes are just cosmetic improvements that remove things that don't belong to the articles, e.g. ads and unrelated material.

Here's a diff of the original and modified versions:

Code:

--- original	2018-11-22 07:22:37.923388857 -0800
+++ modified	2018-11-22 07:24:10.971801489 -0800
@@ -1,7 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 from collections import defaultdict
 
-BASE = 'http://www.newsweek.com'
+BASE = 'https://www.newsweek.com'
 
 
 def href_to_url(a, add_piano=False):
@@ -23,15 +23,18 @@
     no_stylesheets = True
     requires_version = (1, 40, 0)
 
-    keep_only_tags = class_sels(
-        'article-header', 'article-body', 'header-image')
+    keep_only_tags = [
+        dict(id='block-nw-magazine-article-header'),
+        class_sels('article-header', 'article-body')
+    ]
     remove_tags = [
-        dict(name='meta'),
+        dict(name=['aside', 'meta', 'source']),
         class_sels(
             'block-openadstream', 'block-ibtmedia-social', 'issue-next',
             'most-popular', 'ibt-media-stories', 'user-btn-group',
             'trial-link', 'trc_related_container',
             'block-ibtmedia-top-stories', 'videocontent', 'newsletter-signup',
+            'in-text-slideshows', 'content-correction', 'article-navigation'
         ),
         dict(id=['taboola-below-main-column', 'piano-root',
                  'block-nw-magazine-magazine-more-from-issue']),
@@ -46,7 +49,7 @@
         a = li.xpath('descendant::a[@href]')[0]
         url = href_to_url(a, add_piano=True)
         self.timefmt = self.tag_to_string(a)
-        img = li.xpath('descendant::a[@href]/img[@src]')[0]
+        img = li.xpath('descendant::a[@href]//img[@src]')[0]
         self.cover_url = img.get('src')
         root = self.index_to_soup(url, as_tree=True)
         features = []

kovidgoyal · 11-22-2018, 11:34 AM

thanks, I've updated the builtin recipe

11-20-2018, 11:07 AM	#1
NSILMike Guru Posts: 735 Karma: 35936 Join Date: Apr 2011 Location: Shrewsury, MA Device: Lenovo Android Tablet	Newsweek recipe fails IndexError: list index out of range

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Newsweek Recipe Error	lui1	Recipes	3	08-03-2018 07:21 PM
Newsweek recipe now fails	NSILMike	Recipes	6	08-02-2017 06:40 PM
Fails to download Newsweek	snailslow	Recipes	5	04-08-2016 10:58 PM
Newsweek recipe broken?	NSILMike	Recipes	3	08-04-2011 10:02 PM
Newsweek Recipe	SnafuRacer	Calibre	5	07-07-2008 02:35 PM

11-22-2018, 08:37 AM	#3
NSILMike Guru Posts: 735 Karma: 35936 Join Date: Apr 2011 Location: Shrewsury, MA Device: Lenovo Android Tablet	Thanks, but which lines changed?

11-22-2018, 11:34 AM	#5
kovidgoyal creator of calibre Posts: 43,858 Karma: 22666666 Join Date: Oct 2006 Location: Mumbai, India Device: Various	thanks, I've updated the builtin recipe

Advert

Advert