I still get "HTTP Error 401: HTTP Forbidden".
Maybe it just happened to work for you that one time.
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>
from calibre.web.feeds.news import BasicNewsRecipe
def prefixed_classes(classes):
    """Build a tag filter that matches elements by class-name prefix.

    ``classes`` is a whitespace-separated string of class-name prefixes.

    Returns a dict of the form ``{'attrs': {'class': matcher}}``, where
    ``matcher(x)`` returns True when any whitespace-separated token of ``x``
    starts with one of the prefixes, and False otherwise (including when
    ``x`` is empty or None).
    """
    # split() without an argument discards empty strings. An empty prefix
    # must never reach startswith(), because every string starts with ''
    # and the filter would then match every element that has a class.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(token.startswith(prefixes) for token in x.split())

    return {'attrs': {'class': matcher}}
class Reuters(BasicNewsRecipe):
    """Calibre news recipe that builds a Reuters issue from RSS feeds.

    The class only configures the BasicNewsRecipe base class (feeds, tag
    filters, CSS) and adds an HTML preprocessing step for images.
    """

    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    # Elements whose class starts with one of these prefixes are the parts
    # of the page that are kept.
    keep_only_tags = [
        prefixed_classes('article-body__container__ article-header__container__'),
    ]
    # Page chrome (toolbars, prompts, social embeds, ...) and non-content
    # elements that are dropped from the kept markup.
    remove_tags = [
        prefixed_classes(
            'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
            ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
        ),
        dict(name=['button', 'link', 'svg']),
    ]
    remove_attributes = ['style', 'height', 'width']
    # The literal is kept flush-left so the CSS text is exactly what was
    # in the original recipe source.
    extra_css = '''
img { max-width: 100%; }
[class^="article-header__tags__"],
[class^="author-bio__author-card__"],
[class^="article-header__author-date__"] {
font-size:small;
}
[data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
'''

    # (section title, feed URL) pairs.
    feeds = [
        ('World', 'https://rsshub.app/reuters/world'),
        ('Business', 'https://rsshub.app/reuters/business'),
        ('Finance', 'https://rsshub.app/reuters/business/finance'),
        ('Markets', 'https://rsshub.app/reuters/markets'),
        ('Technology', 'https://rsshub.app/reuters/technology'),
        ('Sports', 'https://rsshub.app/reuters/sports'),
        ('Science', 'https://rsshub.app/reuters/science'),
        ('Lifestyle', 'https://rsshub.app/reuters/lifestyle'),
    ]

    def preprocess_html(self, soup):
        """Normalize image markup in ``soup`` and return the same object.

        * Every ``<noscript>`` that contains an ``<img>`` is renamed to
          ``<div>`` (presumably so the fallback image is handled as
          ordinary content -- confirm against calibre's pipeline).
        * Every ``<img>`` carrying a ``srcset`` attribute gets its ``src``
          set to the first whitespace-separated token of that ``srcset``.
        """
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                noscript.name = 'div'
        for img in soup.findAll('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            # An empty or whitespace-only srcset has no candidate URL;
            # leave src untouched instead of raising IndexError.
            if candidates:
                img['src'] = candidates[0]
        return soup