Actually, you can still download the first few sentences, so you can see what's in the articles, and some of them do download completely. This code works for me if you want to try it.
Update to the Scientific American Recipe
Code:
#!/usr/bin/env python2
__license__ = 'GPL v3'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.date import now
from css_selectors import Select
def absurl(url):
    """Return *url* as an absolute URL on the Scientific American site.

    Protocol-relative URLs (``//host/path``) only get a scheme prepended,
    site-relative URLs (``/path``) get the site root prepended, and anything
    else is returned unchanged.
    """
    if url.startswith('//'):
        # The URL already names a host; prefixing the site root would produce
        # a bogus 'http://www.scientificamerican.com//host/...' address.
        return 'http:' + url
    if url.startswith('/'):
        return 'http://www.scientificamerican.com' + url
    return url
# CSS class names used by the recipe's keep_only_tags filter: an element is
# matched when its class attribute contains at least one of these names.
keep_classes = {
    'article-header',
    'article-content',
    'article-media',
    'article-author',
    'article-text',
}
# CSS class names used by the recipe's remove_tags filter: an element is
# matched when its class attribute contains at least one of these names.
remove_classes = {
    'aside-banner',
    'moreToExplore',
    'article-footer',
}
class ScientificAmerican(BasicNewsRecipe):
    """Recipe that builds an index of the current Scientific American issue.

    ``parse_index`` reads the magazine landing page to find the issue URL and
    the cover image, then walks the issue's table of contents: the first
    ``.toc-articles`` section becomes the 'Features' feed and every following
    section is split into one feed per department title.
    """

    title = u'Scientific American'
    description = u'Popular Science. Monthly magazine. Should be downloaded around the middle of each month.'
    category = 'science'
    __author__ = 'Kovid Goyal'
    no_stylesheets = True
    language = 'en'
    publisher = 'Nature Publishing Group'
    remove_empty_feeds = True
    remove_javascript = True
    timefmt = ' [%B %Y]'
    # Login is attempted only when both a username and a password are set
    # (see get_browser).
    needs_subscription = 'optional'
    # Match elements whose class attribute shares a name with keep_classes.
    keep_only_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(keep_classes))}),
    ]
    # Match elements whose class attribute shares a name with remove_classes,
    # plus the element with id 'seeAlsoLinks'.
    remove_tags = [
        dict(attrs={'class': lambda x: x and bool(
            set(x.split()).intersection(remove_classes))}),
        dict(id=['seeAlsoLinks']),
    ]

    def get_browser(self, *args, **kwargs):
        """Return a browser, logged in when credentials were supplied.

        All arguments are forwarded to ``BasicNewsRecipe.get_browser``. When
        both ``self.username`` and ``self.password`` are set, the login form
        (the form whose id is 'login') is filled in and submitted.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        if self.username and self.password:
            br.open('https://www.scientificamerican.com/my-account/login/')
            br.select_form(predicate=lambda f: f.attrs.get('id') == 'login')
            br['emailAddress'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Return the list of ``(feed title, articles)`` for the current issue.

        Also sets ``self.cover_url`` when a cover image is found on the
        landing page.

        Raises:
            ValueError: if the landing page has no usable ``og:url`` meta tag,
                since the issue page cannot be located without it.
        """
        # Get the cover, date and issue URL
        root = self.index_to_soup(
            'http://www.scientificamerican.com/sciammag/', as_tree=True)
        select = Select(root)
        issue_url = next(
            (meta.get('content', '') for meta in select('html > head meta')
             if meta.get('property', None) == "og:url"), '')
        if not issue_url:
            raise ValueError(
                'Could not find the og:url meta tag with the issue URL')
        cover = next(
            (img.get('src', '') for img in
             select('main .product-detail__image picture img')), '')
        if cover:
            self.cover_url = cover
        else:
            # A missing cover should not abort the whole download.
            self.log('Could not find the cover image, continuing without it')

        # Now parse the actual issue to get the list of articles
        select = Select(self.index_to_soup(issue_url, as_tree=True))
        feeds = []
        for i, section in enumerate(select('#sa_body .toc-articles')):
            if i == 0:
                feeds.append(
                    ('Features', list(self.parse_sciam_features(select, section))))
            else:
                feeds.extend(self.parse_sciam_departments(select, section))
        return feeds

    def parse_sciam_features(self, select, section):
        """Yield one article dict per feature article found in *section*.

        Each dict has the keys 'title', 'url' and 'description'. The URL is
        the first ``a[href]`` inside the article and the description is the
        text of the first ``p.t_body`` (empty when there is none). Articles
        without any link are skipped.
        """
        for article in select('article[data-article-title]', section):
            title = article.get('data-article-title')
            url = next(
                (absurl(a.get('href')) for a in select('a[href]', article)),
                None)
            if url is None:
                # Without this guard the name would be unbound on the first
                # article, or silently reuse the previous article's URL.
                self.log('Skipping feature article without a link: %s' % title)
                continue
            desc = next(
                (self.tag_to_string(p) for p in select('p.t_body', article)),
                '')
            self.log('Found feature article: %s at %s' % (title, url))
            self.log('\t' + desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_sciam_departments(self, select, section):
        """Yield ``(section title, articles)`` pairs for the departments.

        Items are read in document order. An item carrying a
        ``span.department-title`` starts a new group; the group collected so
        far is yielded first when it is non-empty. Articles seen before any
        department title are grouped under 'Unknown'.
        """
        section_title, articles = 'Unknown', []
        for li in select('li[data-article-title]', section):
            for span in select('span.department-title', li):
                if articles:
                    yield section_title, articles
                section_title, articles = self.tag_to_string(span), []
                self.log('\nFound section: %s' % section_title)
                break
            for a in select('h2 a[href]', li):
                title = self.tag_to_string(a)
                url = absurl(a.get('href'))
                articles.append(
                    {'title': title, 'url': url, 'description': ''})
                self.log('\tFound article: %s at %s' % (title, url))
        # Flush the last group, which no later department title closes.
        if articles:
            yield section_title, articles