from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from html5_parser import parse
from calibre.ptempfile import PersistentTemporaryFile
import json
import random
import time

class Bloomberg(BasicNewsRecipe):
    '''
    Bloomberg news recipe.

    Article links are discovered through Google News RSS queries (which
    sidestep Bloomberg's own feeds) and each story is rebuilt from the
    JSON payload Bloomberg embeds in a <script> tag on the page.
    '''
    title = u'Bloomberg'
    language = 'en'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url', 'title'}
    resolve_internal_links = True

    # delay = 7 # seconds

    extra_css = '''
        #auth {font-size:small; font-weight:bold;}
        #time, .chart {font-size:small;}
        #subhead {font-style:italic; color:#404040;}
        #cat {font-size:small; color:gray;}
        .news-figure-caption-text, #cap, #img {font-size:small; text-align:center;}
        .news-figure-credit {font-size:small; text-align:center; color:#202020;}
    '''

    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        '''Resolve a Google News redirect to the real Bloomberg URL and
        download the article HTML into a temporary file.

        :param url: Google News article URL from one of the feeds.
        :return: path of a temporary ``.html`` file holding the raw page.
        '''
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # Redirect handling is disabled (see get_browser), so the 3xx
            # response surfaces as an exception whose headers carry the
            # real target location.
            url = e.hdrs.get('location')
        soup = self.index_to_soup(url)
        link = soup.find('a', attrs={'href': lambda x: x and x.startswith('https://www.bloomberg.com')})
        if link is None:
            # FIX: a missing anchor previously raised TypeError on link['href'];
            # abort the article cleanly instead so the rest of the feed builds.
            self.abort_article('No Bloomberg link found in redirect page')
        skip_sections = [  # add sections you want to skip
            '/video/', '/videos/', '/media/', 'podcast'
        ]
        if any(x in link['href'] for x in skip_sections):
            self.abort_article('Aborting Video article')
        self.log('Found link: ', link['href'])
        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        pt.write(html)
        pt.close()
        return pt.name

    def get_browser(self):
        '''Return a browser with redirect handling disabled, so that
        get_obfuscated_article() can read the redirect Location header
        itself.'''
        br = browser()
        br.set_handle_redirect(False)
        return br

    feeds = [
        ('Features',
            'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Ffeatures%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Opinion', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fopinion%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Newsletters',
            'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Fnewsletters%2F&hl=en-US&gl=US&ceid=US:en'),
        ('News',
            'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com%2Fnews%2Farticles%2F&hl=en-US&gl=US&ceid=US:en'),
        ('Others', 'https://news.google.com/rss/search?q=when:27h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en')
    ]

    def preprocess_raw_html(self, raw, *a):
        '''Extract the embedded JSON story payload and rebuild clean HTML.

        Three payload locations are tried in order: the ArticleBody and
        FeatureBody component scripts (older pages, body is ready-made
        HTML) and the __NEXT_DATA__ script (newer pages, body must be
        reassembled from a structured content tree).
        '''
        root = parse(raw)
        m2 = []  # FIX: ensure m2 is always bound (was conditional before)
        m = root.xpath('//script[@data-component-props="ArticleBody"]')
        if not m:
            m = root.xpath('//script[@data-component-props="FeatureBody"]')
            if not m:
                m2 = root.xpath('//script[@id="__NEXT_DATA__"]')

        if m:
            data = json.loads(m[0].text)
            data = data['story']
        elif m2:
            data = json.loads(m2[0].text)
            data = data['props']['pageProps']['story']
        else:
            # FIX: previously fell through and raised NameError on 'data';
            # abort_article raises, so execution does not continue past here.
            self.abort_article('No story data found in page')

        title = '<h1>' + data['headline'] + '</h1>'

        cat = subhead = lede = auth = caption = ''

        if 'primaryCategory' in data:
            if data['primaryCategory'] is not None:
                cat = '<p id="cat">' + data['primaryCategory'] + '</p>'

        if data.get('abstract'):
            # FIX: render every abstract paragraph; the old code only handled
            # exactly two items and silently dropped 1- or 3-item abstracts.
            # Output is byte-identical for the common 2-item case.
            subhead = '<div id="subhead">' + ''.join(
                '<p>' + para + '</p>' for para in data['abstract']) + '</div>'
        elif 'summary' in data:
            subhead = '<div id="subhead"><p>' + data['summary'] + '</p></div>'

        if 'byline' in data:
            if data['byline'] is not None:
                auth = '<div><span id="auth">' + data['byline']\
                 + '</span> | <span id="time">' + data['publishedAt'][:-14] + '</span></div>'

        if 'ledeImageUrl' in data:
            if data['ledeImageUrl'] is not None:
                lede = '<p id="img"><img src="{}">'.format(data['ledeImageUrl'])

        if 'ledeDescription' in data:
            if data['ledeDescription'] is not None:
                caption = '<span id="cap">' + data['ledeDescription'] + '</span>'
        else:
            if 'lede' in data:
                if data['lede'] is not None:
                    if 'alt' in data['lede']:
                        if data['lede']['alt'] is not None:
                            caption = '<span id="cap">' + data['lede']['alt'] + '</span>'

        if m:
            body = data['body']
        else:
            body = ''
            body_data = data['body']['content']

            for objects in body_data:

                # NOTE(review): this sleep throttles per content node even
                # though no network request happens in this loop — kept for
                # behavioral parity, but it looks removable; confirm intent.
                pause = random.choice((1, 1.25, 1.5, 1.75, 2, 2.25, 2.5))
                time.sleep(pause)

                if objects['type'] == 'media' and objects['subType'] == 'photo':
                    body += '<p id="img"><img src="{}">'.format(objects['data']['photo']['src'])
                    body += '<span>' + objects['data']['photo']['caption'] + '</span></p>'
                if objects['type'] == 'media' and objects['subType'] == 'chart':
                    if objects['data'] and objects['data']['chart']:
                        body += '<p id="img"><img src="{}">'.format(objects['data']['chart']['fallback'])

                # FIX: the original condition was `== 'paragraph' or 'heading'`,
                # which is always true ('heading' is a truthy literal), so every
                # object — including media — fell into this branch.
                if objects['type'] in ('paragraph', 'heading'):     # lists are missed :(
                    body += '<p>'

                    if 'content' not in objects:
                        continue

                    for item in objects['content']:

                        if item['type'] == 'text' and item['value']:
                            body += item['value']

                        if item['type'] == 'link' and item['data']:
                            if 'href' not in item['data']:
                                continue
                            if item['content'] and item['content'][0] and item['content'][0]['value']:
                                body += '<a href="' + item['data']['href'] + '">' + item['content'][0]['value'] + '</a>'

                        if item['type'] == 'entity':
                            if item['content'] and item['content'][0] and item['content'][0]['value']:
                                if item['subType'] == 'story':
                                    if item['data'] and item['data']['link'] and item['data']['link']['destination']:
                                        if 'web' in item['data']['link']['destination']:
                                            body += '<a href="' + item['data']['link']['destination']['web'] + '">' + item['content'][0]['value'] + '</a>'
                                # FIX: was `== 'person' or 'security'` (always true);
                                # now only person/security entities emit plain text.
                                elif item['subType'] in ('person', 'security'):
                                    body += item['content'][0]['value']

                if objects['type'] == 'quote':
                    if 'content' not in objects:
                        continue
                    for item in objects['content']:
                        if item['type'] == 'paragraph' and item['content'] and item['content'][0]:
                            if 'value' not in item['content'][0]:
                                continue
                            body += '<blockquote id="subhead">' + item['content'][0]['value'] + '</blockquote>'

        html = '<html><body>' + cat + title + subhead + auth + lede + caption + '<div>' + body
        return html

    def preprocess_html(self, soup):
        '''Clean the rebuilt soup: drop video play-icon images, surface
        chart fallbacks hidden inside <noscript>, resolve lazy-loaded
        image sources and request higher-resolution renditions.'''
        for icon in soup.findAll('img', attrs={'class': 'video-player__play-icon'}):
            icon.decompose()
        for div in soup.findAll('div', attrs={'class': 'chart'}):
            nos = div.find('noscript')
            if nos:
                nos.name = 'span'
        for img in soup.findAll('img', attrs={'data-native-src': True}):
            # Use the real source unless it points at a video asset.
            if 'videos' not in img['data-native-src']:
                img['src'] = img['data-native-src']
            else:
                img['src'] = ''
        for img in soup.findAll('img', attrs={'src': lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}):
            # Ask for a 750px-wide rendition instead of the 1px placeholder.
            img['src'] = img['src'].replace('-1x-1', '750x-1')
        return soup
