I have created a recipe that works when pointed at an archived edition of the magazine, but when I set needs_subscription to True it does not download any of the articles. The current issue, May 2022, is available only to subscribers.
I have tested my credentials on the website itself, so I know they work, and the recipe does not report a login error either. However, none of the subscriber links become active; it almost feels like the login succeeds but the recipe starts downloading before the links are activated.
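For what it's worth, my get_browser (in the recipe below) does nothing beyond calling the base class, so the credentials calibre collects are never actually submitted to the site. This is the kind of explicit login I would expect to need. It is only a sketch: the login URL and form field names (wp-login.php, log, pwd) are my guesses based on Reason.com appearing to run WordPress, and I have not verified them.
Code:
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            # Guessed: standard WordPress login endpoint and field names
            br.open('https://reason.com/wp-login.php')
            br.select_form(nr=0)
            br['log'] = self.username
            br['pwd'] = self.password
            raw = br.submit().read()
            # Crude sanity check: a logged-in page should mention "logout"
            if b'logout' not in raw.lower():
                raise ValueError('Could not log in to reason.com; check username and password')
        return br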
Does anyone have any tips for troubleshooting subscription recipes? Testing only seems to work without a subscription, so I can only test against archived issues, which do download correctly. I started from the built-in recipe for The Atlantic and modified it; my full recipe is included below. Thanks for any help anyone can provide.
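In case it is relevant, this is the command I have been using to test. As I understand the calibre docs, the --username and --password switches pass credentials to a needs_subscription recipe (the file name and credentials here are placeholders):
Code:
ebook-convert reason.recipe reason.epub --username me@example.com --password secret --test -vv --debug-pipeline debug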
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import unicode_literals
import json
from xml.sax.saxutils import escape, quoteattr
from calibre.web.feeds.news import BasicNewsRecipe
# {{{ parse article JSON
def process_image_block(lines, block):
    caption = block.get('captionText')
    caption_lines = []
    if caption:
        if block.get('attributionText', '').strip():
            caption += ' (' + block['attributionText'] + ')'
        caption_lines.append('<p style="font-style: italic">' + caption + '</p>')
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(caption_lines)
    lines.append('</div>')
def json_to_html(raw):
    data = json.loads(raw)
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    lines.append('<h1 style="text-align: center">' + escape(article['title']) + '</h1>')
    lines.append('<h2 style="text-align: center">' + escape(article['dek']) + '</h2>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p style="text-align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        # skip blocks with no inner HTML and embedded iframes
        if html is None or '</iframe>' in html:
            continue
        tagname = item.get('tagName', 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'
class NoJSON(ValueError):
    pass


def extract_html(soup):
    script = soup.findAll('script', id='__NEXT_DATA__')
    if not script:
        raise NoJSON('No script tag with JSON data found')
    raw = script[0].contents[0]
    return json_to_html(raw)
# }}}
def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


def prefix_classes(classes):
    q = classes.split()

    def test(x):
        if x:
            for cls in x.split():
                for c in q:
                    if cls.startswith(c):
                        return True
        return False
    return dict(attrs={'class': test})
class Reason(BasicNewsRecipe):

    title = 'Reason'
    description = 'Free minds and free markets'
    INDEX = 'https://reason.com/magazine/'
    # INDEX = 'https://reason.com/issue/april-2022/'
    __author__ = 'Howard Cornett'
    language = 'en'
    encoding = 'utf-8'
    needs_subscription = True

    remove_tags = [
        classes(
            'next-post-link the-tags tag rcom-social tools comments-header-show logo-header navbar navbar-expanded-lg primary content-info sidebar magicSidebar advertisement logo entry-subtitle'
        ),
    ]
    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '''
    .credit { text-align: right; font-size: 75%; display: block }
    .figcaption { font-size: 75% }
    .caption { font-size: 75% }
    .lead-img { display: block }
    p.dropcap:first-letter {
        float: left; text-transform: uppercase; font-weight: bold; font-size: 5.55em; line-height: 0.83;
        margin: 0; padding-right: 7px; margin-bottom: -2px; text-align: center;
    }
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
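        # NOTE: needs_subscription = True makes calibre prompt for credentials,
        # but nothing in this method ever submits them to the site
        # (this may be the problem; see the login sketch above)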
        return br
    def preprocess_raw_html(self, raw_html, url):
        try:
            return extract_html(self.index_to_soup(raw_html))
        except NoJSON:
            self.log.warn('No JSON found in: {} falling back to HTML'.format(url))
        except Exception:
            self.log.exception('Failed to extract JSON data from: {} falling back to HTML'.format(url))
        return raw_html
    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            data_lazy_src = img['data-lazy-src']
            if ',' in data_lazy_src:
                img['src'] = data_lazy_src.split(',')[0]
            else:
                img['src'] = data_lazy_src.split()[0]
        return soup
    # def print_version(self, url):
    #     ans = url.partition('?')[0] + '?single_page=true'
    #     if '/video/' in ans:
    #         ans = None
    #     return ans
    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        cover = soup.find('img', title=lambda value: value and value.startswith('Reason Magazine,'))
        if cover is not None:
            self.cover_url = cover['data-lazy-src']
        current_section, current_articles = 'Cover Story', []
        feeds = []
        for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'issue-header-right', 'toc-category-list'})}):
            # print('The value of div is ', div)
            for h3 in div.findAll('h3', attrs={'class': True}):
                cls = h3['class']
                if hasattr(cls, 'split'):
                    cls = cls.split()
                if 'toc-department' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_articles = []
                    current_section = self.tag_to_string(h3)
                    self.log('\nFound section:', current_section)
                title = h3.find_next_sibling().a.text
                url = h3.find_next_sibling().a['href']
                desc = h3.find_next_sibling().p.text
                current_articles.append({
                    'title': title,
                    'url': url,
                    'description': desc
                })
            for h2 in div.findAll('h2', attrs={'class': True}):
                cls = h2['class']
                if hasattr(cls, 'split'):
                    cls = cls.split()
                if 'toc-department' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_articles = []
                    current_section = self.tag_to_string(h2)
                    self.log('\nFound section:', current_section)
            for article in div.findAll('article', attrs={'class': True}):
                h4 = article.find('h4')
                if h4.a is not None:
                    title = h4.a.text
                    url = h4.a['href']
                else:
                    title = ''
                    url = ''
                desc = h4.find_next_sibling().text
                current_articles.append({
                    'title': title,
                    'url': url,
                    'description': desc
                })
        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds
if __name__ == '__main__':
    import sys
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    print(extract_html(BeautifulSoup(open(sys.argv[-1]).read())))
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'