import json
import re

from calibre.web.feeds.news import BasicNewsRecipe, classes

# Site root used to absolutise the site-relative links found on the index pages.
_BASE_URL = 'https://www.outlookbusiness.com'

# Marks the opening <script> tag whose body is the page's embedded JSON state.
_NEXT_DATA_RE = re.compile(r'id="__NEXT_DATA__" type="application/json">')


def _class_startswith(prefix):
    """Return a matcher accepting class attribute values that begin with *prefix*."""
    return lambda value: value and value.startswith(prefix)


class outlook(BasicNewsRecipe):
    """Recipe that builds the current issue of Outlook Business Magazine.

    ``parse_index`` reads the magazine landing page to locate the latest
    issue and lists its articles; ``preprocess_raw_html`` replaces every
    downloaded article page with a small HTML document built from the JSON
    state embedded in that page.
    """

    title = 'Outlook Business Magazine'
    __author__ = 'unkn0wn'
    description = (
        'Outlook Business (Monthly) Magazine produces Business, Market, Startup and Leadership'
        ' content that is differentiated to offer a deeper understanding of trends shaping India. Read to hone your leadership skills.'
    )
    language = 'en_IN'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ['height', 'width', 'style']
    ignore_duplicate_articles = {'url'}
    masthead_url = 'https://imgnew.outlookindia.com/uploadimage/library/free_files/jpg/logo_2022_04_30_092331.jpg'
    # Styles the byline element emitted by preprocess_raw_html.
    extra_css = '.author{font-size:small;}'

    def parse_index(self):
        """Return the article list of the latest issue.

        Side effects: sets ``self.cover_url`` from the issue banner image and
        ``self.timefmt`` from the banner's ``h6`` text.

        Returns:
            A one-element list ``[('Articles', articles)]`` where each article
            is a dict with ``title``, ``description`` and ``url`` keys.
        """
        soup = self.index_to_soup(_BASE_URL + '/magazine/')
        banner = soup.find('div', attrs={'class': 'SplWapper'})
        issue_path = banner.find('a', href=True)['href']
        self.cover_url = banner.find('img', srcset=True)['srcset']
        self.timefmt = '[' + self.tag_to_string(banner.find('h6')) + ']'

        soup = self.index_to_soup(_BASE_URL + issue_path)
        articles = []
        for section in soup.findAll(**classes('category-banner-content')):
            content = section.find('p', attrs={'class': _class_startswith('styled__Content')})
            if content is None:
                # Without the content paragraph there is no way to reach the link.
                continue
            link = content.findParent('a', href=True)
            if link is None:
                continue
            heading = section.find('p', attrs={'class': _class_startswith('styled__Heading')})
            title = self.tag_to_string(heading)
            desc = self.tag_to_string(content)
            href = link['href']
            # Compute the URL for every entry: previously a non-relative href
            # silently reused the URL left over from an earlier assignment.
            url = _BASE_URL + href if href.startswith('/') else href
            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
            articles.append({'title': title, 'description': desc, 'url': url})
        return [('Articles', articles)]

    def preprocess_raw_html(self, raw, *a):
        """Rebuild the article HTML from the page's embedded ``__NEXT_DATA__`` JSON.

        Args:
            raw: The downloaded page source as text.
            *a: Extra positional arguments supplied by the caller; unused.

        Returns:
            An HTML string containing category, title, excerpt, author, lead
            image and article body.

        Raises:
            ValueError: If the ``__NEXT_DATA__`` marker is not present in *raw*.
            KeyError: If the decoded JSON lacks the expected article structure.
        """
        m = _NEXT_DATA_RE.search(raw)
        if m is None:
            raise ValueError('__NEXT_DATA__ JSON payload not found in article page')
        # raw_decode parses the leading JSON value and ignores the trailing markup.
        data = json.JSONDecoder().raw_decode(raw[m.end():])[0]
        data = data['props']['initialState']['dashboard']['ARTICLE_POST_DETAIL_API']['data']['article_data']

        title = data['title']
        body = data['description']
        # Optional fields: fall back to '' when missing or null.
        cat = data.get('category_name') or ''
        excerpt = data.get('excerpt') or ''
        try:
            author = data['author'][0]['name'] or ''
        except (KeyError, IndexError, TypeError):
            author = ''
        try:
            image_src = data['images'][0]['image']
        except (KeyError, IndexError, TypeError):
            image_src = None

        # NOTE(review): the original literals here held only line breaks (their
        # markup appears to have been lost, and the image format string had no
        # placeholder, so the URL was dropped). The tags below are a
        # reconstruction; only the "author" class is dictated by extra_css.
        # Confirm the exact markup against the upstream recipe.
        desc = '<p>' + excerpt + '</p>' if excerpt else ''
        image = '<p><img src="{}"></p>'.format(image_src) if image_src else ''
        return ''.join((
            '<html><body>',
            '<div>', cat, '</div>',
            '<h1>', title, '</h1>',
            desc,
            '<p class="author">', author, '</p>',
            image,
            body,
            '</body></html>',
        ))

    calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'