I mixed and matched most things from your other recipes.
I don't even know why I used some things here, like:
for script in soup.findAll('script'):
script.extract()
But the recipe works great.
Code:
from calibre.web.feeds.news import BasicNewsRecipe, classes
class LiveMint(BasicNewsRecipe):
    """Calibre news recipe for Live Mint (livemint.com), driven by RSS feeds.

    Articles are discovered through the section feeds listed in ``feeds``.
    Each downloaded page is first cleaned up by ``preprocess_raw_html`` and
    is then reduced to the elements selected by ``keep_only_tags`` minus
    those matched by ``remove_tags``.
    """

    title = u'Live Mint - test'
    language = 'en_IN'
    __author__ = 'Krittika Goyal'
    oldest_article = 1  # days
    max_articles_per_feed = 50
    encoding = 'utf-8'
    # The feeds only link to the articles; the full pages are downloaded.
    use_embedded_content = False
    # Drop inline presentation attributes from every tag.
    remove_attributes = ['style', 'height', 'width']

    # Elements that make up an article: headline, lead picture with its
    # caption, and the containers matched by the listed CSS class names.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='picture'),
        dict(name='figcaption'),
        classes('articleInfo FirstEle summary highlights paywall'),
    ]
    # Containers (matched by CSS class name) stripped from the kept content.
    remove_tags = [
        classes(
            'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk'
        ),
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        ('Companies', 'https://www.livemint.com/rss/companies'),
        ('Opinion', 'https://www.livemint.com/rss/opinion'),
        ('Money', 'https://www.livemint.com/rss/money'),
        ('Economy', 'https://www.livemint.com/rss/economy/'),
        ('Politics', 'https://www.livemint.com/rss/politics'),
        ('Science', 'https://www.livemint.com/rss/science'),
        ('Industry', 'https://www.livemint.com/rss/industry'),
        ('Lounge', 'https://www.livemint.com/rss/lounge'),
        ('Education', 'https://www.livemint.com/rss/education'),
        ('Sports', 'https://www.livemint.com/rss/sports'),
        ('Technology', 'https://www.livemint.com/rss/technology'),
        ('News', 'https://www.livemint.com/rss/news'),
        # A raw space is not a valid URL character, so it is percent-encoded.
        ('Mutual Funds', 'https://www.livemint.com/rss/Mutual%20Funds'),
        ('Markets', 'https://www.livemint.com/rss/markets'),
        ('AI', 'https://www.livemint.com/rss/AI'),
        ('Insurance', 'https://www.livemint.com/rss/insurance'),
        ('Budget', 'https://www.livemint.com/rss/budget'),
        ('Elections', 'https://www.livemint.com/rss/elections'),
    ]

    def preprocess_raw_html(self, raw_html, url):
        """Clean the raw HTML of a downloaded page before further parsing.

        Removes every ``<script>`` and ``<style>`` element and copies each
        image's ``data-src`` attribute into ``src`` (presumably the page
        lazy-loads its images, so ``src`` is not the real picture).

        :param raw_html: HTML of the downloaded page.
        :param url: Address the page was fetched from (unused here).
        :return: The cleaned document serialized back to a string.
        """
        from calibre.ebooks.BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(raw_html)
        # findAll returns a materialized result list, so extracting nodes
        # while looping over it is safe.
        for unwanted in soup.findAll(['script', 'style']):
            unwanted.extract()
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return str(soup)

    # Fixed desktop Chrome user-agent string; presumably overrides the
    # user agent calibre would otherwise pick for this recipe's requests.
    calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'