#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import unicode_literals
import json
from xml.sax.saxutils import escape, quoteattr

from calibre.web.feeds.news import BasicNewsRecipe

# {{{ parse article JSON
def process_image_block(lines, block):
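    # Render an image block as a centered <img>, with an optional italic
    # caption that folds in the attribution text when present.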
    caption = block.get('captionText')
    caption_lines = []
    if caption:
        if block.get('attributionText', '').strip():
            caption += ' (' + block['attributionText'] + ')'
        caption_lines.append('<p style="font-style: italic">' + caption + '</p>')
    lines.append('<div style="text-align: center"><img src={}/>'.format(quoteattr(block['url'])))
    lines.extend(caption_lines)
    lines.append('</div>')


def json_to_html(raw):
    data = json.loads(raw)
    # open('/t/p.json', 'w').write(json.dumps(data, indent=2))
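    # The Next.js urqlState cache holds several GraphQL responses; the
    # largest 'data' payload is the one that carries the full article.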
    data = sorted((v['data'] for v in data['props']['pageProps']['urqlState'].values()), key=len)[-1]
    article = json.loads(data)['article']
    lines = []
    lines.append('<h1 style="text-align: center">' + escape(article['title']) + '</h1>')
    if article.get('dek'):
        lines.append('<h2 style="text-align: center">' + escape(article['dek']) + '</h2>')
    auts = ', '.join(x['displayName'] for x in article['authors'])
    if auts:
        lines.append('<p style="text-align: center">by ' + escape(auts) + '</p>')
    if article.get('leadArt') and 'image' in article['leadArt']:
        process_image_block(lines, article['leadArt']['image'])
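    # The body is a list of typed content blocks: images get special
    # handling, embedded iframes are dropped, and everything else is
    # re-wrapped in its original tag.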
    for item in article['content']:
        tn = item.get('__typename', '')
        if tn.endswith('Image'):
            process_image_block(lines, item)
            continue
        html = item.get('innerHtml')
        if html is None or '</iframe>' in html:
            continue
        tagname = item.get('tagName', 'P').lower()
        lines.append('<{0}>{1}</{0}>'.format(tagname, html))
    return '<html><body><div id="from-json-by-calibre">' + '\n'.join(lines) + '</div></body></html>'


class NoJSON(ValueError):
    pass


def extract_html(soup):
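    # Next.js pages embed their complete state as JSON in a
    # <script id="__NEXT_DATA__"> tag; pull it out and render it.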
    script = soup.findAll('script', id='__NEXT_DATA__')
    if not script:
        raise NoJSON('No script tag with JSON data found')
    raw = script[0].contents[0]
    return json_to_html(raw)

# }}}


def classes(classes):
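    # Build a BeautifulSoup attrs matcher that accepts any tag whose class
    # list shares at least one name with the given space-separated classes.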
    q = frozenset(classes.split(' '))
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )


def prefix_classes(classes):
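    # Like classes(), but matches class-name prefixes instead of exact
    # names, which helps with generated class names that carry suffixes.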
    q = classes.split()

    def test(x):
        if x:
            for cls in x.split():
                for c in q:
                    if cls.startswith(c):
                        return True
        return False
    return dict(attrs={'class': test})


class Reason(BasicNewsRecipe):

    title = 'Reason'
    description = 'Free minds and free markets'
    INDEX = 'https://reason.com/magazine/'
    #INDEX = 'https://reason.com/issue/april-2022/'
    __author__ = 'Howard Cornett'
    language = 'en'
    encoding = 'utf-8'
    needs_subscription = True

    remove_tags = [
        classes(
            'next-post-link the-tags tag rcom-social-tools most-read-container comments-header-show logo-header navbar navbar-expanded-lg primary content-info sidebar magicSidebar advertisement logo entry-subtitle'
        ),
    ]

    no_stylesheets = True
    remove_attributes = ['style']
    extra_css = '''
        .credit { text-align: right; font-size: 75%; display: block }
        .figcaption { font-size: 75% }
        .caption { font-size: 75% }
        .lead-img { display: block }
        p.dropcap:first-letter {
        float: left; text-transform: uppercase; font-weight: bold; font-size: 5.55em; line-height: 0.83;
        margin: 0; padding-right: 7px; margin-bottom: -2px; text-align: center;
        }
    '''

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
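            # Log in through the site's login form so that subscriber-only
            # pages are served in full; the field names below mirror the
            # form markup on the login page.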
            br.open('https://reason.com/login')
            br.select_form(id='login_form')
            br['text_username'] = self.username
            br['password_password'] = self.password
            br.submit()
        return br

    def preprocess_raw_html(self, raw_html, url):
        try:
            return extract_html(self.index_to_soup(raw_html))
        except NoJSON:
            self.log.warn('No JSON found in: {}, falling back to HTML'.format(url))
        except Exception:
            self.log.exception('Failed to extract JSON data from: {}, falling back to HTML'.format(url))
        return raw_html

    def preprocess_html(self, soup):
        for img in soup.findAll('img', attrs={'data-lazy-src': True}):
            # The real URL lives in data-lazy-src, which may be a srcset-style
            # list such as 'url1 300w, url2 600w'; keep only the first URL and
            # drop any width descriptor.
            img['src'] = img['data-lazy-src'].split(',')[0].split()[0]
        return soup


    def parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        cover = soup.find('img', title=lambda value: value and value.startswith('Reason Magazine,'))
        if cover is not None:
            self.cover_url = cover['src']
        current_section, current_articles = 'Cover Story', []
        feeds = []
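        # The magazine TOC groups articles under department headings; walk
        # the header and category-list blocks, starting a new feed whenever
        # a 'toc-department' heading is seen.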
        for div in soup.findAll('div', attrs={'class': lambda x: x and set(x.split()).intersection({'issue-header-right', 'toc-category-list'})}):
            for h3 in div.findAll('h3', attrs={'class': True}):
                cls = h3['class']
                if hasattr(cls, 'split'):
                    cls = cls.split()
                if 'toc-department' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_articles = []
                    current_section = self.tag_to_string(h3)
                    self.log('\nFound section:', current_section)
                    sibling = h3.find_next_sibling()
                    title = sibling.a.text
                    url = sibling.a['href']
                    desc = sibling.p.text
                    current_articles.append({
                        'title': title,
                        'url': url,
                        'description': desc
                    })
            for h2 in div.findAll('h2', attrs={'class': True}):
                cls = h2['class']
                if hasattr(cls, 'split'):
                    cls = cls.split()
                if 'toc-department' in cls:
                    if current_articles:
                        feeds.append((current_section, current_articles))
                    current_articles = []
                    current_section = self.tag_to_string(h2)
                    self.log('\nFound section:', current_section)
                for article in div.findAll('article', attrs={'class': True}):
                    h4 = article.find('h4')
                    if h4 is None:
                        continue
                    if h4.a is not None:
                        title = h4.a.text
                        url = h4.a['href']
                    else:
                        title = ''
                        url = ''
                    sibling = h4.find_next_sibling()
                    desc = sibling.text if sibling is not None else ''
                    current_articles.append({
                        'title': title,
                        'url': url,
                        'description': desc
                    })

        if current_articles:
            feeds.append((current_section, current_articles))
        return feeds


if __name__ == '__main__':
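    # Quick test harness: run this file directly (for example via
    # calibre-debug) against a saved article page to exercise the JSON
    # extraction on its own.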
    import sys

    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    with open(sys.argv[-1]) as f:
        print(extract_html(BeautifulSoup(f.read())))


