from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ptempfile import PersistentTemporaryFile
import json
import re


class Bloomberg(BasicNewsRecipe):
    """Calibre news recipe for Bloomberg.

    Articles are discovered through a Google News RSS search restricted to
    bloomberg.com. Each feed link is resolved to the real bloomberg.com URL
    in ``get_obfuscated_article``, and the article HTML is rebuilt from the
    JSON embedded in the page in ``preprocess_raw_html``.
    """

    title = u'Bloomberg'
    language = 'en'
    __author__ = 'unkn0wn'
    no_stylesheets = True
    use_embedded_content = False
    remove_attributes = ['style', 'height', 'width']
    ignore_duplicate_articles = {'url'}
    resolve_internal_links = True
    oldest_article = 2  # days
    delay = 1.5

    # The ids styled here are the ones emitted by preprocess_raw_html.
    extra_css = '''
        #auth {font-size:small; font-weight:bold;}
        #time {font-size:small;}
        #subhead {font-style:italic; color:#404040;}
        #cat {font-size:small; color:gray;}
        .news-figure-caption-text, #cap {font-size:small; text-align:center;}
        .news-figure-credit {font-size:small; text-align:center; color:#202020;}
    '''

    articles_are_obfuscated = True

    def get_obfuscated_article(self, url):
        """Download the bloomberg.com article behind a feed link.

        :param url: Article URL taken from the feed.
        :return: Path of a persistent temporary ``.html`` file holding the
            raw article page.
        :raises ValueError: If the intermediate page contains no link to
            ``https://www.bloomberg.com``.
        """
        br = self.get_browser()
        try:
            br.open(url)
        except Exception as e:
            # get_browser() disables redirect handling, so a redirect surfaces
            # as an exception; the target is read from its Location header.
            # Anything without such a header is a genuine failure: re-raise it
            # instead of masking it with an AttributeError on ``e.hdrs``.
            headers = getattr(e, 'hdrs', None)
            location = headers.get('location') if headers is not None else None
            if not location:
                raise
            url = location

        soup = self.index_to_soup(url)
        link = soup.find(
            'a',
            attrs={'href': lambda x: x and x.startswith('https://www.bloomberg.com')}
        )
        if link is None:
            raise ValueError('No bloomberg.com link found at %r' % (url,))

        html = br.open(link['href']).read()
        pt = PersistentTemporaryFile('.html')
        try:
            pt.write(html)
        finally:
            # Always release the handle, even if the write fails.
            pt.close()
        return pt.name

    def get_browser(self):
        """Return a browser that does not follow redirects.

        ``get_obfuscated_article`` relies on redirects being reported rather
        than followed so that it can read the redirect target itself.
        """
        br = browser()
        br.set_handle_redirect(False)
        return br

    feeds = [
        ('Articles', 'https://news.google.com/rss/search?q=when:24h+allinurl:bloomberg.com&hl=en-US&gl=US&ceid=US:en'),
    ]

    def preprocess_raw_html(self, raw, *a):
        """Rebuild a minimal article page from the JSON embedded in ``raw``.

        The page carries the story as JSON right after an element tagged
        ``data-component-props="ArticleBody"`` (or ``"FeatureBody"``). The
        ``story`` object of that JSON is turned into simple HTML: category,
        headline, sub-heading, byline/date, lede image, caption and body.

        NOTE(review): SOURCE reached review with the HTML markup stripped out
        of the string literals below. The tags and ids used here were
        reconstructed from the selectors in ``extra_css`` -- confirm them
        against the upstream recipe.

        :param raw: Raw HTML of the downloaded article page.
        :return: HTML string for the article.
        :raises ValueError: If neither marker is present in ``raw``.
        """
        m = re.search('data-component-props="ArticleBody">', raw)
        if not m:
            m = re.search('data-component-props="FeatureBody">', raw)
        if not m:
            raise ValueError('No ArticleBody/FeatureBody data found in page')

        # The marker ends with the tag's closing '>', so the JSON payload
        # starts right after the match. raw_decode() parses one JSON value and
        # ignores the trailing markup, but does not skip leading whitespace.
        raw = raw[m.end():].lstrip()
        data = json.JSONDecoder().raw_decode(raw)[0]
        data = data['story']

        title = '<h1>' + data['headline'] + '</h1>'
        cat = subhead = lede = auth = caption = ''

        category = data.get('primaryCategory')
        if category is not None:
            cat = '<p id="cat">' + category + '</p>'

        # Render every abstract entry as its own paragraph (the original only
        # handled exactly two entries); fall back to the summary when there
        # is no abstract at all.
        abstract = data.get('abstract') or []
        if isinstance(abstract, str):
            abstract = [abstract]
        if abstract:
            subhead = (
                '<div id="subhead">'
                + ''.join('<p>' + item + '</p>' for item in abstract)
                + '</div>'
            )
        elif data.get('summary'):
            subhead = '<div id="subhead"><p>' + data['summary'] + '</p></div>'

        byline = data.get('byline')
        if byline is not None:
            # Drop the last 14 characters of the timestamp -- presumably the
            # time-of-day part of an ISO-8601 value; TODO confirm.
            published = (data.get('publishedAt') or '')[:-14]
            auth = (
                '<div><span id="auth">' + byline
                + '</span> | <span id="time">' + published + '</span></div>'
            )

        lede_url = data.get('ledeImageUrl')
        if lede_url is not None:
            # The URL arrives with backslash escapes; strip them.
            lede = '<p><img src="{}"></p>'.format(lede_url.replace('\\', ''))

        # NOTE(review): indentation was lost in SOURCE, so it is unclear
        # whether this check was nested under the image check; it is kept
        # independent here.
        lede_description = data.get('ledeDescription')
        if lede_description is not None:
            caption = '<div id="cap">' + lede_description + '</div>'

        body = data['body'].replace('\\', '')

        html = (
            '<html><body>' + cat + title + subhead + auth + lede + caption
            + '<div>' + body + '</div></body></html>'
        )
        return html

    def preprocess_html(self, soup):
        """Clean up images and charts in the parsed article.

        * removes video play-icon images;
        * renames the ``noscript`` element inside chart divs to ``span``;
        * copies ``data-native-src`` into ``src`` (blanking it for values
          containing ``videos``);
        * rewrites ``-1x-1`` image names to ``750x-1``.

        :param soup: Parsed article.
        :return: The same ``soup`` object, modified in place.
        """
        for icon in soup.findAll('img', attrs={'class': 'video-player__play-icon'}):
            icon.decompose()

        for div in soup.findAll('div', attrs={'class': 'chart'}):
            nos = div.find('noscript')
            if nos:
                nos.name = 'span'

        for img in soup.findAll('img', attrs={'data-native-src': True}):
            native_src = img['data-native-src']
            img['src'] = native_src if 'videos' not in native_src else ''

        for img in soup.findAll(
            'img',
            attrs={'src': lambda x: x and x.endswith(('-1x-1.jpg', '-1x-1.png'))}
        ):
            img['src'] = img['src'].replace('-1x-1', '750x-1')

        return soup


# NOTE(review): indentation was lost in SOURCE, so it is unclear whether this
# assignment belongs at module level or inside the class; it is kept at module
# level here. Nothing in this file reads it.
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'