#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal

import json

from calibre import prepare_string_for_xml
from calibre.web.feeds.recipes import BasicNewsRecipe


# Article JSON parser {{{
def serialize_image(block):
    """Yield the HTML fragments for an image block.

    The image URL is taken from ``originalSrc`` when present, otherwise
    from ``src``. A caption is emitted only when the block carries one
    whose type is ``text``.

    Raises:
        ValueError: if the image model has neither ``originalSrc`` nor
            ``src``.
    """
    yield '<div>'
    block = block['model']
    img = block['image']
    alt = prepare_string_for_xml(img.get('alt') or '', True)
    for q in ('originalSrc', 'src'):
        if q in img:
            src = prepare_string_for_xml(img[q])
            break
    else:
        raise ValueError('No src found in img block: {}'.format(img))
    yield '<img src="{}" alt="{}"/>'.format(src, alt)
    caption = block.get('caption')
    if caption and caption.get('type') == 'text':
        yield '<div>'
        yield from serialize_paragraph(caption)
        yield '</div>'
    yield '</div>'


def block_tag(name, generator):
    """Yield the fragments of ``generator`` wrapped in a ``name`` element."""
    yield '<' + name + '>'
    yield from generator
    yield '</' + name + '>'


def serialize_paragraph(block):
    """Yield the inline HTML fragments for a paragraph-like block.

    ``fragment`` children become escaped text, wrapped in a styled span
    when they carry ``bold`` and/or ``italic``/``italics`` attributes.
    ``urlLink`` children become anchors. Other child types are skipped.
    """
    block = block['model']
    for x in block['blocks']:
        xt = x['type']
        if xt == 'fragment':
            styles = []
            model = x['model']
            for attr in model['attributes']:
                if attr == 'bold':
                    styles.append('font-weight: bold')
                elif attr in ('italic', 'italics'):
                    styles.append('font-style: italic')
            if styles:
                prefix = '<span style="{}">'.format('; '.join(styles))
                suffix = '</span>'
            else:
                prefix = suffix = ''
            yield prefix + prepare_string_for_xml(model['text']) + suffix
        elif xt == 'urlLink':
            model = x['model']
            yield '<a href="{}">{}</a>'.format(
                prepare_string_for_xml(model['locator'], True),
                prepare_string_for_xml(model['text']),
            )


def serialize_list(block):
    """Yield one ``li`` element per ``listItem`` child of a list block."""
    for x in block['model']['blocks']:
        if x['type'] == 'listItem':
            yield from block_tag('li', serialize_paragraph(x))


def serialize_text(block):
    """Yield the HTML fragments for a text block.

    Raises:
        KeyError: if a child is not a paragraph, unordered list or
            ordered list.
    """
    block = block['model']
    for x in block['blocks']:
        xt = x['type']
        if xt == 'paragraph':
            yield from block_tag('p', serialize_paragraph(x))
        elif xt == 'unorderedList':
            yield from block_tag('ul', serialize_list(x))
        elif xt == 'orderedList':
            yield from block_tag('ol', serialize_list(x))
        else:
            raise KeyError('Unknown block type: ' + x['type'])


def serialize_contributor(contributor):
    """Yield HTML for the contributor's title and subtitle, when present."""
    if 'title' in contributor:
        yield '<h3>' + prepare_string_for_xml(contributor['title']) + '</h3>'
    if 'subtitle' in contributor:
        yield '<div>' + prepare_string_for_xml(contributor['subtitle']) + '</div>'


def parse_article_json(root, abort_article):
    """Convert the decoded page data into an HTML document string.

    The article is the first entry of ``root['data']`` whose key starts
    with ``article?``. When there is no such entry but a
    ``media-experience?`` entry exists, ``abort_article`` is called and
    ``None`` is returned.

    Raises:
        KeyError: if neither kind of entry is found.
    """
    data = root['data']
    has_media_experience = False
    for key in data:
        if key.startswith('article?'):
            article = data[key]['data']
            break
        elif key.startswith('media-experience?'):
            has_media_experience = True
    else:
        # Only reached when the loop finished without finding an article.
        if has_media_experience:
            abort_article('Skipping video article')
            return
        raise KeyError('No article found in data keys: {}'.format(data.keys()))
    lines = []
    if article.get('headline'):
        lines.append('<h1>{}</h1>'.format(prepare_string_for_xml(article['headline'])))
    if article.get('contributor'):
        lines.extend(serialize_contributor(article['contributor']))
    for block in article['content']['model']['blocks']:
        bt = block.get('type')
        if bt == 'image':
            lines.extend(serialize_image(block))
        elif bt == 'text':
            lines.extend(serialize_text(block))
    return '<html><body>' + '\n'.join(lines) + '</body></html>'


def parse_raw_html(html, abort_article):
    """Extract the ``window.__INITIAL_DATA__`` payload and render it as HTML.

    The payload is a JSON string literal whose content is itself JSON,
    so it is decoded twice.

    Raises:
        ValueError: if the start or the end of the payload cannot be
            found, or if it is not valid JSON.
    """
    q = '>window.__INITIAL_DATA__="{'
    idx = html.find(q)
    if idx < 0:
        raise ValueError('Failed to find JSON')
    # Back up two characters so the slice begins at the opening quote.
    data = html[idx + len(q) - 2:]
    end = data.find('}";')
    if end < 0:
        raise ValueError('Failed to find end of JSON')
    # Keep the closing brace and quote, drop the trailing semicolon.
    data = data[:end + 2]
    data = json.loads(data)
    root = json.loads(data)
    return parse_article_json(root, abort_article)


if __name__ == '__main__':
    with open('/t/raw.html', encoding='utf-8') as f:
        print(parse_raw_html(f.read(), print))
# }}}


class BBCNews(BasicNewsRecipe):
    """Recipe that builds an ebook from the BBC News RSS feeds.

    Article pages are converted by ``parse_raw_html`` from the JSON
    embedded in each page rather than from the page markup.
    """

    # Select / de-select the feeds you want in your ebook.
    feeds = [
        ('News Home', 'https://feeds.bbci.co.uk/news/rss.xml'),
        ('UK', 'https://feeds.bbci.co.uk/news/uk/rss.xml'),
        ('World', 'https://feeds.bbci.co.uk/news/world/rss.xml'),
        # ("England", "https://feeds.bbci.co.uk/news/england/rss.xml"),
        # ("Scotland", "https://feeds.bbci.co.uk/news/scotland/rss.xml"),
        # ("Wales", "https://feeds.bbci.co.uk/news/wales/rss.xml"),
        # ("N. Ireland", "https://feeds.bbci.co.uk/news/northern_ireland/rss.xml"),
        # ("Africa", "https://feeds.bbci.co.uk/news/world/africa/rss.xml"),
        # ("Asia", "https://feeds.bbci.co.uk/news/world/asia/rss.xml"),
        # ("Europe", "https://feeds.bbci.co.uk/news/world/europe/rss.xml"),
        # ("Latin America", "https://feeds.bbci.co.uk/news/world/latin_america/rss.xml"),
        # ("Middle East", "https://feeds.bbci.co.uk/news/world/middle_east/rss.xml"),
        ('US & Canada', 'https://feeds.bbci.co.uk/news/world/us_and_canada/rss.xml'),
        ('Politics', 'https://feeds.bbci.co.uk/news/politics/rss.xml'),
        ('Science/Environment', 'https://feeds.bbci.co.uk/news/science_and_environment/rss.xml'),
        ('Technology', 'https://feeds.bbci.co.uk/news/technology/rss.xml'),
        ('Magazine', 'https://feeds.bbci.co.uk/news/magazine/rss.xml'),
        ('Entertainment/Arts', 'https://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml'),
        # ("Health", "https://feeds.bbci.co.uk/news/health/rss.xml"),
        # ("Education/Family", "https://feeds.bbci.co.uk/news/education/rss.xml"),
        ('Business', 'https://feeds.bbci.co.uk/news/business/rss.xml'),
        ('Special Reports', 'https://feeds.bbci.co.uk/news/special_reports/rss.xml'),
        ('Also in the News', 'https://feeds.bbci.co.uk/news/also_in_the_news/rss.xml'),
        # ("Newsbeat", "https://www.bbc.co.uk/newsbeat/rss.xml"),
        # ("Click", "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/programmes/click_online/rss.xml"),
        # ("Blog: Mark D'Arcy (Parliamentary Correspondent)", "https://feeds.bbci.co.uk/news/correspondents/markdarcy/rss.sxml"),
        # ("Blog: Robert Peston (Business Editor)", "https://feeds.bbci.co.uk/news/correspondents/robertpeston/rss.sxml"),
        # ("Blog: Stephanie Flanders (Economics Editor)", "https://feeds.bbci.co.uk/news/correspondents/stephanieflanders/rss.sxml"),
        # ('Sport Front Page',
        # 'http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/front_page/rss.xml'),
        # ("Football", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/football/rss.xml"),
        # ("Cricket", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/cricket/rss.xml"),
        # ("Rugby Union", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_union/rss.xml"),
        # ("Rugby League", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/rugby_league/rss.xml"),
        # ("Tennis", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/tennis/rss.xml"),
        # ("Golf", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/golf/rss.xml"),
        # ("Motorsport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/motorsport/rss.xml"),
        # ("Boxing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/boxing/rss.xml"),
        # ("Athletics", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/athletics/rss.xml"),
        # ("Snooker", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/snooker/rss.xml"),
        # ("Horse Racing", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/horse_racing/rss.xml"),
        # ("Cycling", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/cycling/rss.xml"),
        # ("Disability Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/disability_sport/rss.xml"),
        # ("Other Sport", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/rss.xml"),
        # ("Olympics 2012", "http://newsrss.bbc.co.uk/rss/sportonline_uk_edition/other_sports/olympics_2012/rss.xml"),
        # ("N. Ireland Politics", "https://feeds.bbci.co.uk/news/northern_ireland/northern_ireland_politics/rss.xml"),
        # ("Scotland Politics", "https://feeds.bbci.co.uk/news/scotland/scotland_politics/rss.xml"),
        # ("Scotland Business", "https://feeds.bbci.co.uk/news/scotland/scotland_business/rss.xml"),
        # ("E. Scotland, Edinburgh & Fife", "https://feeds.bbci.co.uk/news/scotland/edinburgh_east_and_fife/rss.xml"),
        # ("W. Scotland & Glasgow", "https://feeds.bbci.co.uk/news/scotland/glasgow_and_west/rss.xml"),
        # ("Highlands & Islands", "https://feeds.bbci.co.uk/news/scotland/highlands_and_islands/rss.xml"),
        # ("NE. Scotland, Orkney & Shetland", "https://feeds.bbci.co.uk/news/scotland/north_east_orkney_and_shetland/rss.xml"),
        # ("South Scotland", "https://feeds.bbci.co.uk/news/scotland/south_scotland/rss.xml"),
        # ("Central Scotland & Tayside", "https://feeds.bbci.co.uk/news/scotland/tayside_and_central/rss.xml"),
        # ("Wales Politics", "https://feeds.bbci.co.uk/news/wales/wales_politics/rss.xml"),
        # ("NW. Wales", "https://feeds.bbci.co.uk/news/wales/north_west_wales/rss.xml"),
        # ("NE. Wales", "https://feeds.bbci.co.uk/news/wales/north_east_wales/rss.xml"),
        # ("Mid. Wales", "https://feeds.bbci.co.uk/news/wales/mid_wales/rss.xml"),
        # ("SW. Wales", "https://feeds.bbci.co.uk/news/wales/south_west_wales/rss.xml"),
        # ("SE. Wales", "https://feeds.bbci.co.uk/news/wales/south_east_wales/rss.xml"),
        # ("Newyddion - News in Welsh", "https://feeds.bbci.co.uk/newyddion/rss.xml"),
        # ("Gwleidyddiaeth", "https://feeds.bbci.co.uk/newyddion/gwleidyddiaeth/rss.xml"),
        # ("Gogledd-Ddwyrain", "https://feeds.bbci.co.uk/newyddion/gogledd-ddwyrain/rss.xml"),
        # ("Gogledd-Orllewin", "https://feeds.bbci.co.uk/newyddion/gogledd-orllewin/rss.xml"),
        # ("Canolbarth", "https://feeds.bbci.co.uk/newyddion/canolbarth/rss.xml"),
        # ("De-Ddwyrain", "https://feeds.bbci.co.uk/newyddion/de-ddwyrain/rss.xml"),
        # ("De-Orllewin", "https://feeds.bbci.co.uk/newyddion/de-orllewin/rss.xml"),
    ]

    #    **** SELECT YOUR USER PREFERENCES ****

    # Title to use for the ebook.
    #
    title = 'BBC News'

    # A brief description for the ebook.
    #
    description = u'BBC web site ebook created using rss feeds.'

    # The max number of articles which may be downloaded from each feed.
    # I've never seen more than about 70 articles in a single feed in the
    # BBC feeds.
    #
    max_articles_per_feed = 100

    # The max age of articles which may be downloaded from each feed. This is
    # specified in days - note fractions of days are allowed, e.g. 2.5 (2 and a
    # half days). My default of 1.5 days is the last 36 hours, the point at
    # which I've decided 'news' becomes 'old news', but be warned this is not
    # so good for the blogs, technology, magazine, etc., and sports feeds.
    # You may wish to extend this to 2-5 but watch out: ebook creation time
    # will increase as well. Setting this to 30 will get everything (AFAICT)
    # as long as max_articles_per_feed remains set high (except for 'Click',
    # which is very low volume; its currently oldest article is 4th Feb 2011).
    #
    oldest_article = 1.5

    recipe_specific_options = {
        'days': {
            'short': 'Oldest article to download from this news source. In days ',
            'long': 'For example, 0.5, gives you articles from the past 12 hours',
            'default': str(oldest_article)
        }
    }

    def __init__(self, *args, **kwargs):
        """Initialise the recipe and apply a 'days' option given as a string."""
        BasicNewsRecipe.__init__(self, *args, **kwargs)
        d = self.recipe_specific_options.get('days')
        # The class-level default is a dict, so only a string value is
        # treated as an override of oldest_article.
        if d and isinstance(d, str):
            self.oldest_article = float(d)

    # Number of simultaneous downloads. 20 is consistently working fine on the
    # BBC News feeds with no problems. Speeds things up from the default of 5.
    # If you have a lot of feeds and/or have increased oldest_article above 2
    # then you may wish to try increasing simultaneous_downloads to 25-30,
    # or, of course, if you are in a hurry. [I've not tried beyond 20.]
    #
    simultaneous_downloads = 20

    # The format string for the date shown on the ebook's first page.
    # List of all values: https://docs.python.org/library/time.html
    # Default in news.py has a leading space so that's mirrored here.
    # As with 'feeds', select/de-select by adding/removing the initial '#';
    # only one timefmt should be selected. Here are a few to choose from.
    #
    # [Fri, 14 Nov 2011] (Calibre default)
    timefmt = ' [%a, %d %b %Y]'
    # timefmt = ' [%a, %d %b %Y %H:%M]'       # [Fri, 14 Nov 2011 18:30]
    # timefmt = ' [%a, %d %b %Y %I:%M %p]'    # [Fri, 14 Nov 2011 06:30 PM]
    # timefmt = ' [%d %b %Y]'                 # [14 Nov 2011]
    # timefmt = ' [%d %b %Y %H:%M]'           # [14 Nov 2011 18:30]
    # timefmt = ' [%Y-%m-%d]'                 # [2011-11-14]
    # timefmt = ' [%Y-%m-%d-%H-%M]'           # [2011-11-14-18-30]

    #
    #    **** IMPORTANT ****
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    DO NOT EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING.
    #
    #    I MEAN IT, YES I DO, ABSOLUTELY, AT YOUR OWN RISK. :)
    #
    #    **** IMPORTANT ****
    #

    # Author of this recipe.
    __author__ = 'Kovid Goyal'

    # Specify English as the language of the RSS feeds (ISO-639 code).
    language = 'en_GB'

    # Set publisher and publication type.
    publication_type = 'newspaper'
    encoding = 'utf-8'
    use_embedded_content = False

    # Removes empty feeds - why keep them!?
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}
    resolve_internal_links = True

    def preprocess_raw_html(self, raw_html, url):
        """Replace the downloaded page with HTML built from its embedded JSON."""
        return parse_raw_html(raw_html, self.abort_article)