View Single Post
Old 06-12-2024, 04:23 AM   #4
unkn0wn
Guru
unkn0wn understands the Henderson-Hasselbalch Equation.
 
Posts: 631
Karma: 85520
Join Date: May 2021
Device: kindle
I still get "HTTP Error 401: HTTP Forbidden".
Maybe it just happened to work for you that one time.
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>

from calibre.web.feeds.news import BasicNewsRecipe


def prefixed_classes(classes):
    """Build a tag-matching spec that selects elements by class-name prefix.

    *classes* is a whitespace-separated string of class-name prefixes. The
    return value is a dict of the form ``{'attrs': {'class': matcher}}``,
    where ``matcher(value)`` returns True when at least one of the
    whitespace-separated names in *value* starts with one of the prefixes,
    and False otherwise (including when *value* is empty or None).
    """
    # split() with no argument drops empty tokens. Splitting on a literal
    # ' ' would turn doubled/leading/trailing spaces into an empty prefix,
    # and every string starts with '', so the matcher would accept anything.
    prefixes = tuple(frozenset(classes.split()))

    def matcher(x):
        if not x:
            return False
        # str.startswith accepts a tuple and tests all prefixes in one call.
        return any(candidate.startswith(prefixes) for candidate in x.split())

    return {'attrs': {'class': matcher}}


class Reuters(BasicNewsRecipe):
    """Recipe for Reuters articles, fed from per-section rsshub.app feeds."""

    title = 'Reuters'
    description = 'News from all over'
    __author__ = 'Kovid Goyal'
    language = 'en'

    # Only the article header and body containers are kept; class names are
    # matched by prefix (see prefixed_classes).
    keep_only_tags = [
        prefixed_classes('article-body__container__ article-header__container__'),
    ]
    # Elements removed from the kept content: anything whose class starts with
    # one of these prefixes, plus all <button>, <link> and <svg> tags.
    remove_tags = [
        prefixed_classes(
            'context-widget__tabs___ article-header__toolbar__ read-next-mobile__container__ toolbar__container__ button__link__'
            ' ArticleBody-read-time-and-social Slideshow-expand-button- TwoColumnsLayout-footer- RegistrationPrompt__container___'
            ' SocialEmbed__inner___ trust-badge author-bio__social__ with-spinner__spinner__ author-bio__author-image__'
        ),
        dict(name=['button', 'link', 'svg']),
    ]
    remove_attributes = ['style', 'height', 'width']

    extra_css = '''
        img { max-width: 100%; }
        [class^="article-header__tags__"],
        [class^="author-bio__author-card__"],
        [class^="article-header__author-date__"] {
            font-size:small;
        }
        [data-testid="primary-gallery"], [data-testid="primary-image"] { font-size:small; text-align:center; }
    '''

    # (section title, feed URL) pairs.
    feeds = [
        ('World', 'https://rsshub.app/reuters/world'),
        ('Business', 'https://rsshub.app/reuters/business'),
        ('Finance', 'https://rsshub.app/reuters/business/finance'),
        ('Markets', 'https://rsshub.app/reuters/markets'),
        ('Technology', 'https://rsshub.app/reuters/technology'),
        ('Sports', 'https://rsshub.app/reuters/sports'),
        ('Science', 'https://rsshub.app/reuters/science'),
        ('Lifestyle', 'https://rsshub.app/reuters/lifestyle'),
    ]

    def preprocess_html(self, soup):
        """Normalise image markup in *soup* and return the same soup.

        Every <noscript> that contains an <img> is renamed to <div>, and
        every <img> with a ``srcset`` gets its ``src`` set to the first
        whitespace-separated token of that ``srcset``.
        """
        # NOTE(review): presumably the rename keeps the <noscript> fallback
        # images as ordinary content -- confirm against calibre's handling.
        for noscript in soup.findAll('noscript'):
            if noscript.findAll('img'):
                noscript.name = 'div'
        for img in soup.findAll('img', attrs={'srcset': True}):
            candidates = img['srcset'].split()
            # An empty or whitespace-only srcset has no candidates; leave the
            # existing src untouched rather than raising IndexError.
            if candidates:
                img['src'] = candidates[0]
        return soup
unkn0wn is offline   Reply With Quote