View Single Post
Old 08-08-2023, 11:25 AM   #27
bugmen00t
Connoisseur
bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!
 
bugmen00t's Avatar
 
Posts: 82
Karma: 100000
Join Date: Aug 2015
Device: Kindle Keyboard 3G + Kindle Voyage WiFi + Kindle PW11 Kids WiFi
New/fixed Russian and Ukrainian recipes (part 14)

UA-Футбол: soccer news from Ukraine and around the world. Favicon.
Fixes needed:
  • Text artifacts in articles with complex formatting (live feeds etc.)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe, classes

class UAFootball(BasicNewsRecipe):
    """Calibre recipe for ua-football.com, set up for the Ukrainian edition.

    The page is cut down to the span from the first ``<h1>`` to the
    ``show-post`` container, and forms, iframes and a handful of site
    widgets are stripped.  The settings for the Russian edition are kept
    as a commented-out block at the end of the class.
    """

    # ---- Identity ----------------------------------------------------
    __author__ = 'bugmen00t'
    title = 'UA-\u0424\u0443\u0442\u0431\u043E\u043B'
    description = '\u0410\u043A\u0442\u0443\u0430\u043B\u044C\u043D\u0456 \u0442\u0435\u043C\u0438 \u0444\u0443\u0442\u0431\u043E\u043B\u044C\u043D\u043E\u0433\u043E \u0436\u0438\u0442\u0442\u044F \u0423\u043A\u0440\u0430\u0457\u043D\u0438 \u0442\u0430 \u0432\u0441\u044C\u043E\u0433\u043E \u0441\u0432\u0456\u0442\u0443.'
    publisher = '1766 TEAM EOOD'
    category = 'news'
    language = 'uk'
    cover_url = 'https://yt3.googleusercontent.com/11FSvKeWcjFhzKrO7nXZdc-I__UeZ0mhZwbwyOHtnx_1-q6d0zQ2LbOt2duNCY06JVg2cGXS-g=s900-c-k-c0x00ffffff-no-rj'

    # ---- Feeds and fetch limits --------------------------------------
    feeds = [
        ('\u041D\u043E\u0432\u0438\u043D\u0438', 'https://www.ua-football.com/ua/rss/all.xml'),
    ]
    oldest_article = 7
    max_articles_per_feed = 200
    remove_empty_feeds = True

    # ---- Page clean-up -----------------------------------------------
    # Automatic clean-up is off; the explicit rules below are used instead.
    auto_cleanup = False
    no_stylesheets = False
    remove_javascript = False

    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'show-post'}}
    remove_tags = [
        {'name': 'form'},
        {'name': 'iframe'},
        {'name': 'div', 'attrs': {'class': 'language'}},
        {'name': 'div', 'attrs': {'class': 'article__read-also'}},
        {'name': 'div', 'attrs': {'class': 'card-player'}},
        {'name': 'div', 'attrs': {'class': 'show-post-socials'}},
    ]

    # ---- Russian edition (disabled) ----------------------------------
    # To build the Russian edition, replace the matching attributes above
    # with the values below and enable print_version(), which rewrites
    # article URLs from the Ukrainian '/ua/' path to the site root.
    #
    # title = 'UA-\u0424\u0443\u0442\u0431\u043E\u043B'
    # description = '\u0410\u043A\u0442\u0443\u0430\u043B\u044C\u043D\u044B\u0435 \u0442\u0435\u043C\u044B \u0444\u0443\u0442\u0431\u043E\u043B\u044C\u043D\u043E\u0439 \u0436\u0438\u0437\u043D\u0438 \u0423\u043A\u0440\u0430\u0438\u043D\u044B \u0438 \u0432\u0441\u0435\u0433\u043E \u043C\u0438\u0440\u0430.'
    # language = 'ru_UK'
    # feeds = [
    #     ('\u041D\u043E\u0432\u043E\u0441\u0442\u0438 \u0444\u0443\u0442\u0431\u043E\u043B\u0430', 'https://www.ua-football.com/rss/all.xml')
    # ]
    #
    # def print_version(self, url):
    #     return url.replace('ua-football.com/ua/', 'ua-football.com/')


Football.ua: soccer news portal from Ukraine. Favicon.
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe, classes

class FootballUA(BasicNewsRecipe):
    """Calibre recipe for the football.ua news feed (Ukrainian).

    Only the ``<article>`` element of each page is kept; the bottom info
    bar and the social-button strip inside it are removed.
    """

    # ---- Identity ----------------------------------------------------
    __author__ = 'bugmen00t'
    title = 'Football.UA'
    description = '\u0421\u043F\u043E\u0440\u0442\u0438\u0432\u043D\u0438\u0439 \u043F\u043E\u0440\u0442\u0430\u043B \u0432 \u0423\u043A\u0440\u0430\u0457\u043D\u0456, \u043F\u0440\u0438\u0441\u0432\u044F\u0447\u0435\u043D\u0438\u0439 \u043B\u0438\u0448\u0435 \u0444\u0443\u0442\u0431\u043E\u043B\u0443.'
    publisher = 'United Media Holding group'
    category = 'news'
    language = 'uk'
    cover_url = 'https://s.ill.in.ua/i/news/570x380/212/212438.jpg'

    # ---- Feeds and fetch limits --------------------------------------
    feeds = [
        ('\u041D\u043E\u0432\u0438\u043D\u0438', 'https://football.ua/rss2.ashx'),
    ]
    oldest_article = 3
    max_articles_per_feed = 20
    remove_empty_feeds = True

    # ---- Page clean-up -----------------------------------------------
    auto_cleanup = False
    no_stylesheets = False
    remove_javascript = False

    # Same tag on both sides: drop everything before and after <article>.
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'bottom-info'}},
        {'name': 'div', 'attrs': {'class': 'social-buttons'}},
    ]


UNIAN.net: Ukrainian Independent News Agency of News, one of the most cited sources of news from across Ukraine. Favicon.
Russian version (fixed)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe


class Unian(BasicNewsRecipe):
    """Calibre recipe for the Russian-language UNIAN news feed.

    Each article is cut down to the span from the first ``<h1>`` to the
    ``article-text`` container; comment/view counters, the "read also"
    slider and video wrappers are removed.
    """

    # No trailing whitespace here: the title is used verbatim as the
    # name of the generated publication.
    title = '\u0423\u041D\u0418\u0410\u041D'
    description = 'Украинское Независимое Информационное Агентство Новостей – первое в Украине и самое большое независимое информационное агентство, основанное в 1993 году, лидер среди новостных медиа страны, самый цитируемый источник новостей о событиях в стране.'  # noqa
    __author__ = 'bugmen00t'
    publication_type = 'newspaper'
    oldest_article = 7
    max_articles_per_feed = 100
    language = 'ru_UK'
    cover_url = 'https://www.unian.net/images/unian-512x512.png'
    # Automatic clean-up is off; the explicit rules below are used instead.
    auto_cleanup = False
    no_stylesheets = True

    remove_tags_before = dict(name='h1')
    remove_tags_after = dict(name='div', attrs={'class': 'article-text'})
    remove_tags = [
        dict(name='span', attrs={'class': 'article__info-item comments'}),
        dict(name='span', attrs={'class': 'article__info-item views'}),
        dict(name='div', attrs={'class': 'read-also-slider'}),
        dict(name='div', attrs={'class': 'nts-video-wrapper'})
    ]

    feeds = [
        (u'\u0423\u041D\u0418\u0410\u041D', u'https://rss.unian.net/site/news_rus.rss')
    ]

    def preprocess_html(self, soup):
        """Copy every image's ``data-src`` attribute into ``src``.

        Only ``<img>`` tags that carry a ``data-src`` attribute are
        touched (presumably the site lazy-loads images and keeps the real
        URL there -- verify against a live page).

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            img['src'] = img['data-src']
        return soup

Ukrainian version
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe


class Unian(BasicNewsRecipe):
    """Calibre recipe for the Ukrainian-language UNIAN news feed.

    Each article is cut down to the span from the first ``<h1>`` to the
    ``article-text`` container; comment/view counters, the "read also"
    slider and video wrappers are removed.
    """

    # ---- Identity ----------------------------------------------------
    __author__ = 'bugmen00t'
    title = '\u0423\u041D\u0406\u0410\u041D'
    description = '\u0423\u041D\u0406\u0410\u041D (\u0423\u043A\u0440\u0430\u0457\u043D\u0441\u044C\u043A\u0435 \u041D\u0435\u0437\u0430\u043B\u0435\u0436\u043D\u0435 \u0406\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0456\u0439\u043D\u0435 \u0410\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E \u041D\u043E\u0432\u0438\u043D) - \u043F\u0435\u0440\u0448\u0435 \u0432 \u0423\u043A\u0440\u0430\u0457\u043D\u0456 \u0442\u0430 \u043D\u0430\u0439\u0431\u0456\u043B\u044C\u0448\u0435 \u043D\u0435\u0437\u0430\u043B\u0435\u0436\u043D\u0435 \u0456\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0456\u0439\u043D\u0435 \u0430\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E, \u0437\u0430\u0441\u043D\u043E\u0432\u0430\u043D\u0435 1993 \u0440\u043E\u043A\u0443, \u043B\u0456\u0434\u0435\u0440 \u0441\u0435\u0440\u0435\u0434 \u043D\u043E\u0432\u0438\u043D\u043D\u0438\u0445 \u043C\u0435\u0434\u0456\u0430 \u043A\u0440\u0430\u0457\u043D\u0438, \u043D\u0430\u0439\u0431\u0456\u043B\u044C\u0448 \u0446\u0438\u0442\u043E\u0432\u0430\u043D\u0435 \u0434\u0436\u0435\u0440\u0435\u043B\u043E \u043D\u043E\u0432\u0438\u043D \u043F\u0440\u043E \u043F\u043E\u0434\u0456\u0457 \u0432 \u043A\u0440\u0430\u0457\u043D\u0456.'
    publication_type = 'newspaper'
    language = 'uk'
    cover_url = 'https://www.unian.ua/images/unian-512x512.png'

    # ---- Feeds and fetch limits --------------------------------------
    feeds = [
        ('\u0423\u041D\u0406\u0410\u041D', 'https://rss.unian.net/site/news_ukr.rss'),
    ]
    oldest_article = 7
    max_articles_per_feed = 100

    # ---- Page clean-up -----------------------------------------------
    auto_cleanup = False
    no_stylesheets = True

    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'article-text'}}
    remove_tags = [
        {'name': 'span', 'attrs': {'class': 'article__info-item comments'}},
        {'name': 'span', 'attrs': {'class': 'article__info-item views'}},
        {'name': 'div', 'attrs': {'class': 'read-also-slider'}},
        {'name': 'div', 'attrs': {'class': 'nts-video-wrapper'}},
    ]

    def preprocess_html(self, soup):
        """Give every ``<img>`` that has a ``data-src`` a matching ``src``.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        deferred_images = soup.findAll('img', attrs={'data-src': True})
        for tag in deferred_images:
            tag['src'] = tag['data-src']
        return soup

English version (no updates since April 2023)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from calibre.web.feeds.news import BasicNewsRecipe

class Unian(BasicNewsRecipe):
    """Calibre recipe for the English-language UNIAN news feed.

    Each article is cut down to the span from the first ``<h1>`` to the
    ``article-text`` container; comment/view counters, the "read also"
    slider and video wrappers are removed.
    """

    # ---- Identity ----------------------------------------------------
    __author__ = 'bugmen00t'
    title = 'UNIAN'
    description = "UNIAN (Ukrainian Independent News Agency of News) is the largest independent news agency, first in Ukraine, founded in 1993, remaining the leader among the country's news media, being the most cited source of news from across Ukraine."
    publication_type = 'newspaper'
    language = 'en_UK'
    cover_url = 'https://www.unian.info/images/unian-512x512.png'

    # ---- Feeds and fetch limits --------------------------------------
    feeds = [
        ('News Agency UNIAN', 'https://rss.unian.net/site/news_eng.rss'),
    ]
    # Wider window than the sibling editions (30 days instead of 7).
    oldest_article = 30
    max_articles_per_feed = 100

    # ---- Page clean-up -----------------------------------------------
    auto_cleanup = False
    no_stylesheets = True

    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'article-text'}}
    remove_tags = [
        {'name': 'span', 'attrs': {'class': 'article__info-item comments'}},
        {'name': 'span', 'attrs': {'class': 'article__info-item views'}},
        {'name': 'div', 'attrs': {'class': 'read-also-slider'}},
        {'name': 'div', 'attrs': {'class': 'nts-video-wrapper'}},
    ]

    def preprocess_html(self, soup):
        """Give every ``<img>`` that has a ``data-src`` a matching ``src``.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        deferred_images = soup.findAll('img', attrs={'data-src': True})
        for tag in deferred_images:
            tag['src'] = tag['data-src']
        return soup


ЭХО: reincarnated news portal by former Echo Moskvy journalists. Favicon. Replacement for the older, defunct recipe.
Fixes needed:
  • No images in articles (webp)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class EchoMsk(BasicNewsRecipe):
    """Calibre recipe for echofm.online (Russian).

    Only the ``<article>`` element of each page is kept, minus two
    site-specific elements listed in ``remove_tags``.
    """

    title = '\u042D\u0425\u041E'
    __author__ = 'bugmen00t'
    description = '\u042D\u0425\u041E - \u043A\u0430\u043A \u043D\u0430 \u0441\u0442\u0430\u0440\u043E\u043C \u0434\u043E\u0431\u0440\u043E\u043C \u0440\u0430\u0434\u0438\u043E'
    publisher = 'Radio Echo GmbH'
    category = 'news'
    cover_url = u'https://echofm.online/logo.png'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = False
    # Automatic clean-up is off; the explicit rules below are used instead.
    auto_cleanup = False
    oldest_article = 7
    max_articles_per_feed = 50

    # Same tag on both sides: drop everything before and after <article>.
    remove_tags_before = dict(name='article')
    remove_tags_after = dict(name='article')

    # NOTE(review): these class names look machine-generated and may
    # change when the site is rebuilt -- re-check if clutter reappears.
    # Each rule is listed once; a repeated rule adds nothing.
    remove_tags = [
        dict(name='span', attrs={'class': 'sc-7b4cbb79-0 guzUFC'}),
        dict(name='div', attrs={'class': 'sc-f94c4ef5-0 frGiYu'})
    ]

    feeds = [
        ('\u0413\u043B\u0430\u0432\u043D\u043E\u0435', 'https://echofm.online/feed'),
        ('\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://echofm.online/news/feed'),
        ('\u041C\u043D\u0435\u043D\u0438\u044F', 'https://echofm.online/opinions/feed'),
        ('\u0414\u043E\u043A\u0443\u043C\u0435\u043D\u0442\u044B', 'https://echofm.online/documents/feed')
    ]


Продолжение следует: digital media outlet founded by Novaya Gazeta journalist Pavel Kanygin. Favicon.
Fixes needed:
  • Some images are missing
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class ProSleduet(BasicNewsRecipe):
    """Calibre recipe for prosleduet.media (Russian).

    Articles are pulled from the per-category feeds and cut down to the
    ``container`` div; the share widget is removed.
    """

    # ---- Identity ----------------------------------------------------
    __author__ = 'bugmen00t'
    title = '\u041F\u0440\u043E\u0434\u043E\u043B\u0436\u0435\u043D\u0438\u0435 \u0441\u043B\u0435\u0434\u0443\u0435\u0442'
    description = '\u0414\u0438\u0434\u0436\u0438\u0442\u0430\u043B-\u043F\u0440\u043E\u0435\u043A\u0442 \u0436\u0443\u0440\u043D\u0430\u043B\u0438\u0441\u0442\u043E\u0432 \u00AB\u041D\u043E\u0432\u043E\u0439 \u0433\u0430\u0437\u0435\u0442\u044B\u00BB'
    publisher = 'Pavel Kanygin, Natalia Zhdanova'
    category = 'news'
    language = 'ru'
    # NOTE(review): the cover is an SVG logo -- confirm calibre can turn
    # it into a usable cover image.
    cover_url = 'https://prosleduet.media/wp-content/themes/prosle/assets/img/logo.svg'

    # ---- Feeds and fetch limits --------------------------------------
    # The site-wide feed is intentionally left out in favour of the
    # per-category feeds below:
    #     ('\u041F\u0440\u043E\u0434\u043E\u043B\u0436\u0435\u043D\u0438\u0435 \u0441\u043B\u0435\u0434\u0443\u0435\u0442', 'https://prosleduet.media/feed/'),
    feeds = [
        ('\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://prosleduet.media/category/news/feed/'),
        ('\u041B\u044E\u0434\u0438', 'https://prosleduet.media/category/people/feed/'),
        ('\u0421\u044E\u0436\u0435\u0442\u044B', 'https://prosleduet.media/category/syuzhety/feed/'),
        ('\u041F\u043E\u0434\u043A\u0430\u0441\u0442\u044B', 'https://prosleduet.media/category/podcasts/feed/'),
        ('\u0420\u0430\u0437\u0431\u043E\u0440\u044B', 'https://prosleduet.media/category/details/feed/'),
        ('\u0413\u043B\u0443\u0431\u0438\u043D\u043D\u0430\u044F \u0420\u043E\u0441\u0441\u0438\u044F', 'https://prosleduet.media/category/glubinnaya-rossiya/feed/'),
    ]
    oldest_article = 7
    max_articles_per_feed = 20

    # ---- Page clean-up -----------------------------------------------
    auto_cleanup = False
    no_stylesheets = True
    remove_javascript = False

    # Same tag on both sides: keep only the 'container' div.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'container'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'container'}}

    # NOTE(review): 'ya-share2_inited' looks like a class added by the
    # share script at run time -- verify this rule matches fetched HTML.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'ya-share2 ya-share2_inited'}},
    ]
Attached Images
       
Attached Files
File Type: recipe ua_fooball.recipe (2.3 KB, 441 views)
File Type: recipe footballua.recipe (1.2 KB, 451 views)
File Type: recipe unian_net.recipe (1.6 KB, 434 views)
File Type: recipe unian_net_ua.recipe (2.3 KB, 441 views)
File Type: recipe unian_net_en.recipe (1.3 KB, 443 views)
File Type: recipe echo_moskvy.recipe (1.5 KB, 442 views)
File Type: recipe prosleduet.recipe (2.1 KB, 452 views)
bugmen00t is offline   Reply With Quote