import json
import re
from urllib.parse import quote

from calibre.web.feeds.news import BasicNewsRecipe


class ft(BasicNewsRecipe):
    """Calibre recipe that builds an issue from the Financial Times RSS feeds.

    Article pages are not used as-is: ``preprocess_raw_html`` rebuilds each
    article from the JSON-LD ``NewsArticle`` metadata embedded in the page.
    """

    title = 'Financial Times'
    language = 'en'
    __author__ = "Kovid Goyal"
    description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    oldest_article = 1.15
    max_articles_per_feed = 50
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    # '#fig-cap' is the id that preprocess_html assigns to paragraphs
    # containing a <span>.
    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'

    def get_cover_url(self):
        """Return the cover image URL, scraping today's front page if possible.

        Looks for the first ``<img>`` inside the ``elementor-image`` div of the
        todayspapers.co.uk FT page. When it is found, ``self.cover_url`` is
        updated; otherwise the existing ``cover_url`` (or ``None``) is
        returned.
        """
        soup = self.index_to_soup(
            'https://www.todayspapers.co.uk/the-financial-times-front-page-today/'
        )
        tag = soup.find('div', attrs={'class': 'elementor-image'})
        img = tag.find('img') if tag is not None else None
        # Guard against a layout change: a div without an <img>, or an <img>
        # without a src, must not abort the whole download.
        if img is not None and img.get('src'):
            self.cover_url = img['src']
        return getattr(self, 'cover_url', None)

    feeds = [
        ('World', 'https://www.ft.com/world?format=rss'),
        ('US', 'https://www.ft.com/us?format=rss'),
        ('Companies', 'https://www.ft.com/companies?format=rss'),
        ('Tech', 'https://www.ft.com/technology?format=rss'),
        ('Markets', 'https://www.ft.com/markets?format=rss'),
        ('Climate', 'https://www.ft.com/climate-capital?format=rss'),
        ('Opinion', 'https://www.ft.com/opinion?format=rss'),
        ('Life & Arts', 'https://www.ft.com/life-arts?format=rss'),
        ('How to spend it', 'https://www.ft.com/htsi?format=rss'),
    ]

    def preprocess_raw_html(self, raw, *a):
        """Rebuild an article as minimal HTML from its JSON-LD metadata.

        Args:
            raw: The downloaded page source as text.
            *a: Extra positional arguments supplied by the caller; unused.

        Returns:
            An HTML string containing the headline, the optional description,
            the author line, an optional lead image and the article body.

        Raises:
            ValueError: If the page has no JSON-LD ``NewsArticle`` block.
        """
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        if m is None:
            raise ValueError('No JSON-LD NewsArticle block found in the page')
        # The match begins inside the opening <script> tag; drop everything up
        # to and including its closing '>' so the text starts at the JSON.
        raw = raw[m.start():].split('>', 1)[1]
        # raw_decode parses the leading JSON value and ignores the trailing
        # page markup that follows it.
        data = json.JSONDecoder().raw_decode(raw)[0]

        title = data['headline']
        # Blank lines in the body text become paragraph breaks.
        body = data['articleBody'].replace('\n\n', '<p>')

        author = ''
        if 'author' in data:
            try:
                author = data['author']['name']
            except TypeError:
                # 'author' is a list of author objects rather than a single one.
                author = ' and '.join(x['name'] for x in data['author'])

        image = desc = title_image_url = ''

        def resize_img(img):
            """Return an FT image-service URL serving ``img`` scaled down."""
            prefix = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            # The source URL is embedded as a single path segment, so every
            # reserved character (including '/') must be percent-encoded.
            encoded = quote(img, safe='')
            # use width = 200, 300, 400,.. 700...
            params = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            return prefix + encoded + params

        if data.get('image'):
            image_url = data['image']['url']
            # Only add a lead image when the body does not already embed it.
            if image_url not in body:
                title_image_url = resize_img(image_url)
                image = '<p><img src="{}">'.format(title_image_url)

        # embedded image links
        def insert_image(m):
            """Replace a bracketed ``[https://...]`` link with an image tag."""
            url = m.group()[1:-1]  # strip the surrounding square brackets
            # URLs containing 'studio' are used as-is rather than routed
            # through the resizing service.
            if 'studio' not in url:
                url = resize_img(url)
            # The <span> lets preprocess_html find the enclosing paragraph.
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'
        html = (
            '<html><body><h1>' + title + '</h1>' + desc
            + '<h3>' + author + '</h3>' + image + '<p>' + body
        )
        return html

    def preprocess_html(self, soup):
        """Give every paragraph that contains a ``<span>`` the id ``fig-cap``.

        The id is the hook for the caption styling declared in ``extra_css``.
        """
        for span in soup.findAll('span'):
            p = span.findParent('p')
            if p:
                p['id'] = 'fig-cap'
        return soup

    # NOTE(review): presumably read by the BasicNewsRecipe base class as the
    # browser user-agent string; confirm it belongs at class scope.
    calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'