View Single Post
Old 01-08-2019, 07:14 PM   #2
lui1
Enthusiast
lui1 began at the beginning.
 
Posts: 36
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
Update for the Scientific American

Actually, you can still download the first few sentences of each article, so you can see what each one is about — and some articles do download completely. This code works for me if you want to try it.

Update to the Scientific American Recipe
Code:
#!/usr/bin/env  python2
__license__ = 'GPL v3'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.date import now
from css_selectors import Select


def absurl(url):
    """Return *url* as an absolute Scientific American URL.

    Site-relative links (those starting with '/') are prefixed with the
    scientificamerican.com origin; every other URL passes through
    unchanged.
    """
    return ('http://www.scientificamerican.com' + url
            if url.startswith('/') else url)

# CSS classes whose elements form the article body; used by
# ScientificAmerican.keep_only_tags to retain just the article content.
keep_classes = {'article-header', 'article-content',
                'article-media', 'article-author', 'article-text'}
# CSS classes of boilerplate (banners, related-links, footer) stripped
# by ScientificAmerican.remove_tags.
remove_classes = {'aside-banner', 'moreToExplore', 'article-footer'}


class ScientificAmerican(BasicNewsRecipe):
    """Calibre news recipe for the monthly Scientific American magazine.

    Scrapes the current issue's table of contents from
    scientificamerican.com and downloads each article, keeping only the
    article body (see ``keep_classes``/``remove_classes`` above).
    Supports an optional subscription login for full-article access.
    """
    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = ' [%B %Y]'

    # Login is optional: without credentials only article previews are
    # available; with them, subscriber content downloads in full.
    needs_subscription = 'optional'

    # Keep any element whose class list intersects keep_classes.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(keep_classes))}),
    ]
    # Strip boilerplate elements by class, plus the "see also" link box.
    remove_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(remove_classes))}),
        dict(id=['seeAlsoLinks']),
    ]

    def get_browser(self, *args):
        """Return a browser, logged in via the my-account form when a
        username and password were supplied."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username and self.password:
            br.open('https://www.scientificamerican.com/my-account/login/')
            # The login form is identified by its id attribute.
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of feeds for the current issue.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with 'title', 'url' and 'description' keys.
        """
        # Get the cover, date and issue URL
        root = self.index_to_soup(
            'http://www.scientificamerican.com/sciammag/', as_tree=True)
        select = Select(root)
        # The og:url meta tag on the magazine landing page points at the
        # current issue's own page.
        url = [x.get('content', '') for x in select('html > head  meta') if x.get('property',None) == "og:url"][0]
        self.cover_url = [x.get('src', '') for x in select('main .product-detail__image picture img')][0]

        # Now parse the actual issue to get the list of articles
        select = Select(self.index_to_soup(url, as_tree=True))
        feeds = []
        # The first .toc-articles container holds the feature articles;
        # subsequent containers hold the department sections.
        for i, section in enumerate(select('#sa_body .toc-articles')):
            if i == 0:
                feeds.append(
                    ('Features', list(self.parse_sciam_features(select, section))))
            else:
                feeds.extend(self.parse_sciam_departments(select, section))

        return feeds

    def parse_sciam_features(self, select, section):
        """Yield one article dict per feature article in *section*."""
        for article in select('article[data-article-title]', section):
            title = article.get('data-article-title')
            # Use the first link in the article element as its URL.
            for a in select('a[href]', article):
                url = absurl(a.get('href'))
                break
            desc = ''
            # First body paragraph (if any) serves as the description.
            for p in select('p.t_body', article):
                desc = self.tag_to_string(p)
                break
            self.log('Found feature article: %s at %s' % (title, url))
            self.log('\t' + desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_sciam_departments(self, select, section):
        """Yield ``(section_title, articles)`` pairs for department
        articles in *section*, grouped by department-title spans."""
        section_title, articles = 'Unknown', []
        for li in select('li[data-article-title]', section):
            # A department-title span starts a new section: flush the
            # articles collected so far under the previous title.
            for span in select('span.department-title', li):
                if articles:
                    yield section_title, articles
                section_title, articles = self.tag_to_string(span), []
                self.log('\nFound section: %s' % section_title)
                break
            for a in select('h2 a[href]', li):
                title = self.tag_to_string(a)
                url = absurl(a.get('href'))
                articles.append(
                    {'title': title, 'url': url, 'description': ''})
                self.log('\tFound article: %s at %s' % (title, url))
        # Flush the final section.
        if articles:
            yield section_title, articles
lui1 is offline   Reply With Quote