import json
import re
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe, classes
from urllib.parse import quote


class ft(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the FT "today's newspaper" page.

    The section index is scraped from the newspaper landing page, and each
    article is rebuilt from the ``NewsArticle`` JSON-LD block embedded in the
    article page rather than from the rendered page markup.
    """

    title = 'Financial Times - Print Edition'
    language = 'en'
    __author__ = "Kovid Goyal"
    description = 'The Financial Times is one of the world’s leading news organisations, recognised internationally for its authority, integrity and accuracy.'
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
    # Applied to the paragraphs that preprocess_html() tags with id="fig-cap".
    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'

    def get_cover_url(self):
        """Return the URL of today's front-page image, if one can be found.

        The first ``<img>`` inside the ``div.elementor-image`` element of the
        todayspapers.co.uk page is used. When the element or its ``src`` is
        missing, whatever ``cover_url`` is already set on the recipe is
        returned unchanged.
        """
        soup = self.index_to_soup(
            'https://www.todayspapers.co.uk/the-financial-times-front-page-today/'
        )
        tag = soup.find('div', attrs={'class': 'elementor-image'})
        img = tag.find('img') if tag is not None else None
        # Guard against a wrapper div without an image (or an image without a
        # src) instead of failing with a TypeError/KeyError.
        if img is not None and img.get('src'):
            self.cover_url = img['src']
        return getattr(self, 'cover_url', None)

    def parse_index(self):
        """Return the list of ``(section, articles)`` feeds for today's paper.

        Raises:
            ValueError: if no sections with articles were found; the message
                distinguishes Sundays from other days.
        """
        soup = self.index_to_soup('https://www.ft.com/todaysnewspaper/uk')
        # International edition: https://www.ft.com/todaysnewspaper/international
        ans = self.ft_parse_index(soup)
        if not ans:
            # date.weekday() numbers Monday as 0, so 6 is Sunday.
            if date.today().weekday() == 6:
                raise ValueError(
                    'The Financial Times Newspaper is not published on Sundays.'
                )
            raise ValueError(
                'The Financial Times Newspaper is not published today.'
            )
        return ans

    def ft_parse_index(self, soup):
        """Extract sections and their article teasers from the index soup.

        Args:
            soup: parsed newspaper landing page.

        Returns:
            A list of ``(section_name, articles)`` tuples where each article is
            a dict with ``title``, ``url`` and ``description`` keys. Sections
            without any article links are omitted.
        """
        feeds = []
        for section in soup.findAll(**classes('o-teaser-collection')):
            h2 = section.find('h2')
            secname = self.tag_to_string(h2)
            self.log(secname)
            articles = []
            for a in section.findAll(
                'a', href=True, **classes('js-teaser-heading-link')
            ):
                url = a['href']
                # Teaser links are site-relative; leave an already absolute
                # href untouched rather than producing a doubled host.
                if url.startswith('/'):
                    url = 'https://www.ft.com' + url
                title = self.tag_to_string(a)
                desc = ''
                desc_parent = a.findParent('div')
                div = None
                if desc_parent is not None:
                    div = desc_parent.find_previous_sibling(
                        'div', **classes('o-teaser__meta')
                    )
                if div is not None:
                    # The teaser tag (and its optional prefix) serves as the
                    # article description.
                    desc = div.find('a', **classes('o-teaser__tag'))
                    desc = self.tag_to_string(desc)
                    prefix = div.find('span', **classes('o-teaser__tag-prefix'))
                    if prefix is not None:
                        prefix = self.tag_to_string(prefix)
                        desc = prefix + ' ' + desc
                articles.append({
                    'title': title,
                    'url': url,
                    'description': desc
                })
                self.log('\t', desc)
                self.log('\t', title)
                self.log('\t\t', url)
            if articles:
                feeds.append((secname, articles))
        return feeds

    def preprocess_raw_html(self, raw, *a):
        """Rebuild a minimal HTML document from the article's JSON-LD data.

        Args:
            raw: the downloaded article page as text.
            *a: extra positional arguments passed by the caller; unused.

        Returns:
            An HTML string containing the headline, optional description,
            author line, optional lead image and the article body.

        Raises:
            ValueError: if the page has no ``NewsArticle`` JSON-LD block.
        """
        # with open('/t/raw.html', 'w') as f:
        #     f.write(raw)
        m = re.search(
            r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw
        )
        if m is None:
            # Previously this surfaced as an opaque AttributeError on None.
            raise ValueError(
                'No NewsArticle JSON-LD data found in the article page.'
            )
        # Skip to just after the closing '>' of the <script> opening tag so the
        # text starts at the JSON object; raw_decode ignores what follows it.
        raw = raw[m.start():]
        raw = raw.split('>', 1)[1]
        # with open('/t/raw.json', 'w') as f:
        #     f.write(raw)
        data = json.JSONDecoder().raw_decode(raw)[0]
        title = data['headline']
        body = data['articleBody']
        # NOTE(review): the HTML inside the string literals of this method was
        # lost from the reviewed copy of the file and has been reconstructed
        # from the surrounding code -- confirm against the upstream recipe.
        body = body.replace('\n\n', '<p>')

        author = ''
        if 'author' in data:
            try:
                author = data['author']['name']
            except TypeError:
                # 'author' is a list of author objects rather than one object.
                author = ' and '.join(x['name'] for x in data['author'])

        image = desc = title_image_url = ''

        def resize_img(img):
            """Return the FT image-service URL serving *img* scaled down."""
            a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            b = quote(img, safe='')
            # use width = 200, 300, 400,.. 700...
            c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            return a + b + c

        if data.get('image'):
            image_url = data['image']['url']
            # Only add a lead image when the body does not already embed it.
            if image_url not in body:
                title_image_url = resize_img(image_url)
                image = '<p><img src="{}">'.format(title_image_url)

        # embedded image links
        def insert_image(m):
            """Turn a bracketed ``[https://...]`` link into an image element."""
            url = m.group()[1:-1]  # strip the surrounding square brackets
            if 'studio' not in url:
                url = resize_img(url)
            return '<span><img src="{}"></span></p><p>'.format(url)

        body = re.sub(r'\[https://\S+?\]', insert_image, body)

        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'
        html = (
            '<html><body><h1>' + title + '</h1>' + desc
            + '<h3>' + author + '</h3>' + image + '<p>' + body
        )
        return html

    def preprocess_html(self, soup):
        """Tag every paragraph that contains a ``<span>`` as a figure caption.

        The enclosing ``<p>`` of each span gets ``id="fig-cap"`` so that the
        rule in ``extra_css`` applies to it.
        """
        for span in soup.findAll('span'):
            p = span.findParent('p')
            if p is not None:
                p['id'] = 'fig-cap'
        return soup


calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'