MobileRead Forums - View Single Post - (mostly) Russian and Ukrainian sources: state of built-in recipes, fixes, new recipes

bugmen00t · 08-08-2023, 11:25 AM

UA-Футбол: soccer news from Ukraine and around the world. Favicon.
Fixes needed:

Text artifacts in articles with complex formatting (live feeds etc.)

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe, classes

class UAFootball(BasicNewsRecipe):
    """Calibre news recipe for UA-Football (ua-football.com).

    The Ukrainian-language feed is the active configuration. The matching
    settings for the Russian-language feed are kept as commented-out
    alternatives next to the values they would replace.
    """

    # ---- Ukrainian version (active) ----
    language = 'uk'
    title = 'UA-\u0424\u0443\u0442\u0431\u043E\u043B'
    description = '\u0410\u043A\u0442\u0443\u0430\u043B\u044C\u043D\u0456 \u0442\u0435\u043C\u0438 \u0444\u0443\u0442\u0431\u043E\u043B\u044C\u043D\u043E\u0433\u043E \u0436\u0438\u0442\u0442\u044F \u0423\u043A\u0440\u0430\u0457\u043D\u0438 \u0442\u0430 \u0432\u0441\u044C\u043E\u0433\u043E \u0441\u0432\u0456\u0442\u0443.'
    feeds = [
        ('\u041D\u043E\u0432\u0438\u043D\u0438', 'https://www.ua-football.com/ua/rss/all.xml'),
    ]

    # ---- Russian version (disabled) ----
    # language = 'ru_UK'
    # title = 'UA-\u0424\u0443\u0442\u0431\u043E\u043B'
    # description = '\u0410\u043A\u0442\u0443\u0430\u043B\u044C\u043D\u044B\u0435 \u0442\u0435\u043C\u044B \u0444\u0443\u0442\u0431\u043E\u043B\u044C\u043D\u043E\u0439 \u0436\u0438\u0437\u043D\u0438 \u0423\u043A\u0440\u0430\u0438\u043D\u044B \u0438 \u0432\u0441\u0435\u0433\u043E \u043C\u0438\u0440\u0430.'
    # feeds = [
    #     ('\u041D\u043E\u0432\u043E\u0441\u0442\u0438 \u0444\u0443\u0442\u0431\u043E\u043B\u0430', 'https://www.ua-football.com/rss/all.xml')
    # ]

    # Recipe metadata.
    __author__ = 'bugmen00t'
    category = 'news'
    publisher = '1766 TEAM EOOD'
    cover_url = u'https://yt3.googleusercontent.com/11FSvKeWcjFhzKrO7nXZdc-I__UeZ0mhZwbwyOHtnx_1-q6d0zQ2LbOt2duNCY06JVg2cGXS-g=s900-c-k-c0x00ffffff-no-rj'

    # Download limits: article age is in days.
    oldest_article = 7
    max_articles_per_feed = 200
    remove_empty_feeds = True

    # Page handling: stylesheets and scripts are kept, and the automatic
    # clean-up is off because the content is selected manually below.
    auto_cleanup = False
    no_stylesheets = False
    remove_javascript = False

    # Article boundaries: from the first <h1> to the "show-post" <div>.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'show-post'}}

    # Elements dropped from the article body.
    remove_tags = [
        {'name': 'form'},
        {'name': 'iframe'},
        {'name': 'div', 'attrs': {'class': 'language'}},
        {'name': 'div', 'attrs': {'class': 'article__read-also'}},
        {'name': 'div', 'attrs': {'class': 'card-player'}},
        {'name': 'div', 'attrs': {'class': 'show-post-socials'}},
    ]

    # For the Russian feed only: rewrite Ukrainian article URLs by dropping
    # the '/ua/' path segment.
    # def print_version(self, url):
    #     return url.replace('ua-football.com/ua/', 'ua-football.com/')

Football.ua: soccer news portal from Ukraine. Favicon.

Spoiler:

UNIAN.net: Ukrainian Independent Information Agency of News, one of the most cited sources of news from across Ukraine. Favicon.
Russian version (fixed)

Spoiler:

Ukrainian version

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe


class Unian(BasicNewsRecipe):
    """Calibre news recipe for the Ukrainian-language UNIAN news feed."""

    # Recipe metadata.
    __author__ = 'bugmen00t'
    title = '\u0423\u041D\u0406\u0410\u041D'
    description = '\u0423\u041D\u0406\u0410\u041D (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0435 \u041D\u0435\u0437\u0430\u043B\u0435\u0436\u043D\u0435 \u0406\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0456\u0439\u043D\u0435 \u0410\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E \u041D\u043E\u0432\u0438\u043D) - \u043F\u0435\u0440\u0448\u0435 \u0432 \u0423\u043A\u0440\u0430\u0457\u043D\u0456 \u0442\u0430 \u043D\u0430\u0439\u0431\u0456\u043B\u044C\u0448\u0435 \u043D\u0435\u0437\u0430\u043B\u0435\u0436\u043D\u0435 \u0456\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0456\u0439\u043D\u0435 \u0430\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E, \u0437\u0430\u0441\u043D\u043E\u0432\u0430\u043D\u0435 1993 \u0440\u043E\u043A\u0443, \u043B\u0456\u0434\u0435\u0440 \u0441\u0435\u0440\u0435\u0434 \u043D\u043E\u0432\u0438\u043D\u043D\u0438\u0445 \u043C\u0435\u0434\u0456\u0430 \u043A\u0440\u0430\u0457\u043D\u0438, \u043D\u0430\u0439\u0431\u0456\u043B\u044C\u0448 \u0446\u0438\u0442\u043E\u0432\u0430\u043D\u0435 \u0434\u0436\u0435\u0440\u0435\u043B\u043E \u043D\u043E\u0432\u0438\u043D \u043F\u0440\u043E \u043F\u043E\u0434\u0456\u0457 \u0432 \u043A\u0440\u0430\u0457\u043D\u0456.'
    language = 'uk'
    publication_type = 'newspaper'
    cover_url = 'https://www.unian.ua/images/unian-512x512.png'

    # Single RSS source for the Ukrainian edition.
    feeds = [
        (u'\u0423\u041D\u0406\u0410\u041D', u'https://rss.unian.net/site/news_ukr.rss'),
    ]

    # Download limits: article age is in days.
    oldest_article = 7
    max_articles_per_feed = 100

    # Site stylesheets are skipped; the automatic clean-up is off because the
    # content is selected manually below.
    no_stylesheets = True
    auto_cleanup = False

    # Article boundaries: from the first <h1> to the "article-text" <div>.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'article-text'}}

    # Elements dropped from the article body.
    remove_tags = [
        {'name': 'span', 'attrs': {'class': 'article__info-item comments'}},
        {'name': 'span', 'attrs': {'class': 'article__info-item views'}},
        {'name': 'div', 'attrs': {'class': 'read-also-slider'}},
        {'name': 'div', 'attrs': {'class': 'nts-video-wrapper'}},
    ]

    def preprocess_html(self, soup):
        """Copy ``data-src`` into ``src`` on every image that carries it.

        Only ``<img>`` tags with a ``data-src`` attribute are touched; the
        same soup object is returned after being modified in place.
        """
        deferred_images = soup.findAll('img', attrs={'data-src': True})
        for tag in deferred_images:
            tag['src'] = tag['data-src']
        return soup

English version (no updates since April 2023)

Spoiler:

ЭХО: reincarnated news portal by former Echo Moskvy journalists. Favicon. Replacement for the older, defunct recipe.
Fixes needed:

No images in articles (webp)

Spoiler:

Продолжение следует: digital media founded by Novaya Gazeta journalist Pavel Kanygin. Favicon.
Fixes needed:

Some images are missing

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class ProSleduet(BasicNewsRecipe):
    """Calibre news recipe for prosleduet.media, one feed per site category."""

    # Recipe metadata.
    __author__ = 'bugmen00t'
    title = '\u041F\u0440\u043E\u0434\u043E\u043B\u0436\u0435\u043D\u0438\u0435 \u0441\u043B\u0435\u0434\u0443\u0435\u0442'
    description = '\u0414\u0438\u0434\u0436\u0438\u0442\u0430\u043B-\u043F\u0440\u043E\u0435\u043A\u0442 \u0436\u0443\u0440\u043D\u0430\u043B\u0438\u0441\u0442\u043E\u0432 \u00AB\u041D\u043E\u0432\u043E\u0439 \u0433\u0430\u0437\u0435\u0442\u044B\u00BB'
    language = 'ru'
    category = 'news'
    publisher = 'Pavel Kanygin, Natalia Zhdanova'
    cover_url = u'https://prosleduet.media/wp-content/themes/prosle/assets/img/logo.svg'

    # Download limits: article age is in days.
    oldest_article = 7
    max_articles_per_feed = 20

    # Site stylesheets are skipped, scripts are kept, and the automatic
    # clean-up is off because the content is selected manually below.
    no_stylesheets = True
    remove_javascript = False
    auto_cleanup = False

    # Both article boundaries are keyed on the "container" <div>.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'container'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'container'}}

    # Elements dropped from the article body.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'ya-share2 ya-share2_inited'}},
    ]

    # Per-category feeds; the site-wide feed is left disabled.
    feeds = [
        # ('\u041F\u0440\u043E\u0434\u043E\u043B\u0436\u0435\u043D\u0438\u0435 \u0441\u043B\u0435\u0434\u0443\u0435\u0442', 'https://prosleduet.media/feed/'),
        (
            '\u041D\u043E\u0432\u043E\u0441\u0442\u0438',
            'https://prosleduet.media/category/news/feed/',
        ),
        (
            '\u041B\u044E\u0434\u0438',
            'https://prosleduet.media/category/people/feed/',
        ),
        (
            '\u0421\u044E\u0436\u0435\u0442\u044B',
            'https://prosleduet.media/category/syuzhety/feed/',
        ),
        (
            '\u041F\u043E\u0434\u043A\u0430\u0441\u0442\u044B',
            'https://prosleduet.media/category/podcasts/feed/',
        ),
        (
            '\u0420\u0430\u0437\u0431\u043E\u0440\u044B',
            'https://prosleduet.media/category/details/feed/',
        ),
        (
            '\u0413\u043B\u0443\u0431\u0438\u043D\u043D\u0430\u044F \u0420\u043E\u0441\u0441\u0438\u044F',
            'https://prosleduet.media/category/glubinnaya-rossiya/feed/',
        ),
    ]