MobileRead Forums - View Single Post - (mostly) Russian and Ukrainian sources: state of built-in recipes, fixes, new recipes

bugmen00t · 08-09-2022, 01:41 PM

More Russian recipes

Люди Байкала: blog about life in rural Siberia. Favicon.
Fixes needed:

No lead image, part of lead text is missing

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class BaikalJournal(BasicNewsRecipe):
    """Calibre news recipe for "Lyudi Baikala" (baikal-journal.ru).

    Articles come from the site's single feed; page clutter is stripped
    with the explicit ``remove_tags`` rules below.
    """

    # NOTE: known issues reported by the recipe author: no lead image, and
    # part of the lead text is missing.

    # --- Recipe metadata -------------------------------------------------
    title = '\u041B\u044E\u0434\u0438 \u0411\u0430\u0439\u043A\u0430\u043B\u0430'
    __author__ = 'bugmen00t'
    description = '\u041D\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043C\u044B\u0439 \u0430\u0432\u0442\u043E\u0440\u0441\u043A\u0438\u0439 \u0438\u043D\u0442\u0435\u0440\u043D\u0435\u0442-\u0436\u0443\u0440\u043D\u0430\u043B \u043E \u0436\u0438\u0437\u043D\u0438 \u0432 \u0421\u0438\u0431\u0438\u0440\u0438 \u0438 \u0432\u043E\u043A\u0440\u0443\u0433 \u0411\u0430\u0439\u043A\u0430\u043B\u0430, \u043E\u043F\u0438\u0441\u044B\u0432\u0430\u044E\u0449\u0438\u0439 \u0436\u0438\u0437\u043D\u044C \u0432 \u0441\u0438\u0431\u0438\u0440\u0441\u043A\u043E\u0439 \u0433\u043B\u0443\u0431\u0438\u043D\u043A\u0435.'
    publisher = '\u041E\u043B\u044C\u0433\u0430 \u041C\u0443\u0442\u043E\u0432\u0438\u043D\u0430, \u0415\u043B\u0435\u043D\u0430 \u0422\u0440\u0438\u0444\u043E\u043D\u043E\u0432\u0430'
    category = 'blog'
    language = 'ru'
    cover_url = 'https://baikal-journal.ru/wp-content/themes/baikal/assets/img/logo-full.svg'

    # --- Download limits -------------------------------------------------
    oldest_article = 30  # days
    max_articles_per_feed = 10

    # --- Clean-up --------------------------------------------------------
    # Stylesheets and scripts are kept and calibre's automatic clean-up is
    # off; unwanted page elements are removed by the rules in remove_tags.
    no_stylesheets = False
    remove_javascript = False
    auto_cleanup = False

    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'distance-badge'}},
        {'name': 'div', 'attrs': {'class': 'lead-footer__sharing'}},
        {'name': 'div', 'attrs': {'class': 'm-block-ctaline'}},
        {'name': 'div', 'attrs': {'class': 'm-block-readmore format-inline'}},
        {'name': 'footer'},
        {'name': 'div', 'attrs': {'class': 'related-block'}},
        {'name': 'div', 'attrs': {'class': 'selection-block'}},
        {'name': 'div', 'attrs': {'class': 'last-cta'}},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        (
            '\u041B\u044E\u0434\u0438 \u0411\u0430\u0439\u043A\u0430\u043B\u0430',
            'https://baikal-journal.ru/feed/',
        ),
    ]

Настоящее время: Prague-based Russian-language TV channel founded by RFE/RL & VoA. Favicon.
Fixes needed:

In some articles, all text is conglomerated into a single paragraph
Sometimes live text broadcasts (live blogs) come out empty

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class CurrentTime(BasicNewsRecipe):
    """Calibre news recipe for "Nastoyashchee Vremya" / Current Time (currenttime.tv).

    Each article page is cut down to the span between the first ``<h1>``
    and the ``body-container`` div, and the remaining clutter is removed
    with the ``remove_tags`` rules. Sections are fetched from the site's
    per-section ``/api/`` feed URLs.
    """

    # NOTE: known issues reported by the recipe author: in some articles all
    # text ends up in a single paragraph, and live text broadcasts are
    # sometimes empty.

    # --- Recipe metadata -------------------------------------------------
    title = '\u041D\u0430\u0441\u0442\u043E\u044F\u0449\u0435\u0435 \u0432\u0440\u0435\u043C\u044F'
    __author__ = 'bugmen00t'
    description = '\u0422\u0435\u043B\u0435\u043A\u0430\u043D\u0430\u043B "\u041D\u0430\u0441\u0442\u043E\u044F\u0449\u0435\u0435 \u0412\u0440\u0435\u043C\u044F" \u0440\u0430\u0441\u0441\u043A\u0430\u0437\u044B\u0432\u0430\u0435\u0442 \u043E \u0432\u0430\u0436\u043D\u044B\u0445 \u043D\u043E\u0432\u043E\u0441\u0442\u044F\u0445 \u0438 \u0437\u043B\u043E\u0431\u043E\u0434\u043D\u0435\u0432\u043D\u044B\u0445 \u0442\u0435\u043C\u0430\u0445, \u043F\u0440\u0435\u0434\u043E\u0441\u0442\u0430\u0432\u043B\u044F\u044F \u0430\u0443\u0434\u0438\u0442\u043E\u0440\u0438\u0438 \u0442\u043E, \u0447\u0442\u043E \u043D\u0435 \u0432\u0441\u0435\u0433\u0434\u0430 \u043C\u043E\u0433\u0443\u0442 \u043E\u0431\u0435\u0441\u043F\u0435\u0447\u0438\u0442\u044C \u043C\u0435\u0441\u0442\u043D\u044B\u0435 \u0421\u041C\u0418: \u043D\u043E\u0432\u043E\u0441\u0442\u0438 \u0431\u0435\u0437 \u0446\u0435\u043D\u0437\u0443\u0440\u044B, \u043E\u0442\u0432\u0435\u0442\u0441\u0442\u0432\u0435\u043D\u043D\u044B\u0439 \u043E\u0431\u043C\u0435\u043D \u043C\u043D\u0435\u043D\u0438\u044F\u043C\u0438, \u043E\u0442\u043A\u0440\u044B\u0442\u043E\u0435 \u043E\u0431\u0441\u0443\u0436\u0434\u0435\u043D\u0438\u0435 \u043F\u0440\u043E\u0431\u043B\u0435\u043C.'
    publisher = 'RFE/RL ("\u0420\u0430\u0434\u0438\u043E \u0421\u0432\u043E\u0431\u043E\u0434\u043D\u0430\u044F \u0415\u0432\u0440\u043E\u043F\u0430"/"\u0420\u0430\u0434\u0438\u043E \u0421\u0432\u043E\u0431\u043E\u0434\u0430") \u043F\u0440\u0438 \u0443\u0447\u0430\u0441\u0442\u0438\u0438 VoA ("\u0413\u043E\u043B\u043E\u0441 \u0410\u043C\u0435\u0440\u0438\u043A\u0438")'
    category = 'newspaper'
    cover_url = 'https://www.currenttime.tv/Content/responsive/RFE/ru-RU-TV/img/top_logo_news.png'
    language = 'ru'

    # --- Download limits -------------------------------------------------
    remove_empty_feeds = True
    oldest_article = 14  # days
    max_articles_per_feed = 20

    # --- Clean-up --------------------------------------------------------
    # Stylesheets and scripts are kept and calibre's automatic clean-up is
    # off; the page is trimmed with the explicit rules below instead.
    no_stylesheets = False
    remove_javascript = False
    auto_cleanup = False

    remove_tags_before = dict(name='h1')

    remove_tags_after = dict(name='div', attrs={'class': 'body-container'})

    remove_tags = [
        # The selector used to be 'publishing-details ' with a trailing
        # space. Beautiful Soup splits the class attribute on whitespace, so
        # a value ending in a space should never match a class token.
        # NOTE(review): confirm against a live article page.
        dict(name='div', attrs={'class': 'publishing-details'}),
        dict(name='div', attrs={'class': 'separator'}),
        dict(name='div', attrs={'class': 'links'}),
        # Listed once; the original repeated this rule further down.
        dict(name='div', attrs={'class': 'share--box'}),
        dict(name='aside'),
        dict(name='div', attrs={'class': 'media-block also-read'}),
        dict(name='div', attrs={'class': 'media-block-wrap'}),
        dict(name='div', attrs={'class': 'media-download'}),
        dict(name='div', attrs={'class': 'wsw__embed'}),
        dict(name='div', attrs={'class': 'prog-hdr'}),
        dict(name='div', attrs={'class': 'dropdown__holder'}),
        dict(name='div', attrs={'id': 'ymla-section'}),
        dict(name='div', attrs={'id': 'comments'}),
        dict(name='a', attrs={'class': 'back-to-top-link'}),
        dict(name='footer'),
        dict(name='li', attrs={'class': 'socials block-socials'}),
        dict(name='div', attrs={'data-sp_api': 'pangea-video'}),
    ]

    # (feed title, feed URL) pairs; the English gloss is in the comment.
    feeds = [
        ('\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://www.currenttime.tv/api/zgbip_e_tpp_'),  # News
        ('\u0421\u0435\u043C\u044C \u0441\u043E\u0440\u043E\u043A', 'https://www.currenttime.tv/api/ztktpyeimupt'),  # Seven Forty
        ('\u042D\u043A\u0441\u043A\u043B\u044E\u0437\u0438\u0432', 'https://www.currenttime.tv/api/zpyomoe-rimi'),  # Exclusive
        ('\u0412\u044B\u0431\u043E\u0440 \u0440\u0435\u0434\u0430\u043A\u0446\u0438\u0438', 'https://www.currenttime.tv/api/zqk-poekpbpo'),  # Editors' choice
        ('\u0420\u043E\u0441\u0441\u0438\u044F', 'https://www.currenttime.tv/api/zuvmpvepo_pv'),  # Russia
        ('\u0423\u043A\u0440\u0430\u0438\u043D\u0430', 'https://www.currenttime.tv/api/zkvmptemo_pt'),  # Ukraine
        ('\u0411\u0435\u043B\u0430\u0440\u0443\u0441\u044C', 'https://www.currenttime.tv/api/zvvmm_eoo_mt'),  # Belarus
        ('\u0410\u0437\u0438\u044F', 'https://www.currenttime.tv/api/zbvtpqetoupq'),  # Asia
        ('\u0415\u0432\u0440\u043E\u043F\u0430', 'https://www.currenttime.tv/api/z-vmpoevo_pi'),  # Europe
        ('\u0410\u043C\u0435\u0440\u0438\u043A\u0430', 'https://www.currenttime.tv/api/zbvmpieto_pp'),  # America
        ('\u0411\u043B\u0438\u0436\u043D\u0438\u0439 \u0412\u043E\u0441\u0442\u043E\u043A', 'https://www.currenttime.tv/api/zrvtppeuqupm'),  # Middle East
        ('\u041B\u043E\u043D\u0433\u0440\u0438\u0434\u044B', 'https://www.currenttime.tv/api/zibmmyejv_my'),  # Longreads
        ('\u0420\u0435\u043F\u043E\u0440\u0442\u0430\u0436', 'https://www.currenttime.tv/api/zrpppqeujppo'),  # Reportage
        ('\u0420\u0435\u043F\u043E\u0440\u0442\u0430\u0436\u0438 \u043F\u0440\u043E\u0433\u0440\u0430\u043C\u043C\u044B \u0412\u0435\u0447\u0435\u0440', 'https://www.currenttime.tv/api/zvrrmoeourmp'),  # Reports from the "Vecher" programme
        ('\u0418\u043D\u0442\u0435\u0440\u0432\u044C\u044E', 'https://www.currenttime.tv/api/zqpppoekjppi'),  # Interviews
        ('\u0418\u043D\u0444\u043E\u0433\u0440\u0430\u0444\u0438\u043A\u0430', 'https://www.currenttime.tv/api/zmqmpyebumpv'),  # Infographics
    ]

Фонтанка: Saint Petersburg news portal. Favicon.
Fixes needed:

Broken formatting & missing text in articles with complex layout

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class Fontanka(BasicNewsRecipe):
    """Calibre news recipe for Fontanka (fontanka.ru).

    Each article page is cut down to the span between the first ``<h1>``
    and the ``<section itemprop="articleBody">`` element; the remaining
    clutter is removed with the ``remove_tags`` rules.
    """

    # NOTE: known issue reported by the recipe author: broken formatting and
    # missing text in articles with a complex layout.

    # --- Recipe metadata -------------------------------------------------
    title = '\u0424\u043E\u043D\u0442\u0430\u043D\u043A\u0430'
    __author__ = 'bugmen00t'
    description = '\u0415\u0436\u0435\u0434\u043D\u0435\u0432\u043D\u043E\u0435 \u043F\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433\u0441\u043A\u043E\u0435 \u043E\u0431\u0449\u0435\u0441\u0442\u0432\u0435\u043D\u043D\u043E-\u043F\u043E\u043B\u0438\u0442\u0438\u0447\u0435\u0441\u043A\u043E\u0435 \u0441\u0435\u0442\u0435\u0432\u043E\u0435 \u0438\u0437\u0434\u0430\u043D\u0438\u0435.'
    publisher = '\u0410\u041E "\u0410\u0416\u0423\u0420-\u041C\u0415\u0414\u0418\u0410"'
    category = 'blog'
    language = 'ru'
    cover_url = 'https://www.fontanka.ru/longreads/69505589/2015/images/tild3834-3362-4166-b239-366134363733____-01.png'

    # --- Download limits -------------------------------------------------
    oldest_article = 7  # days
    max_articles_per_feed = 30

    # --- Clean-up --------------------------------------------------------
    # Stylesheets and scripts are kept and calibre's automatic clean-up is
    # off; the page is trimmed with the explicit rules below instead.
    no_stylesheets = False
    remove_javascript = False
    auto_cleanup = False

    remove_tags_before = {'name': 'h1'}

    remove_tags_after = {'name': 'section', 'attrs': {'itemprop': 'articleBody'}}

    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'ADdj ADc5'}},
        {'name': 'div', 'attrs': {'class': 'DLj1'}},
        {'name': 'div', 'attrs': {'class': 'DTrp'}},
        {'name': 'div', 'attrs': {'class': 'EHed'}},
        {'name': 'div', 'attrs': {'class': 'FHwp'}},
        {'name': 'div', 'attrs': {'class': 'ENr-'}},
        {'name': 'div', 'attrs': {'class': 'ENt7'}},
        {'name': 'div', 'attrs': {'class': 't004'}},
        {'name': 'div', 'attrs': {'class': 't120'}},
        {'name': 'div', 'attrs': {'class': 't123'}},
        {'name': 'div', 'attrs': {'class': 't405'}},
        {'name': 'div', 'attrs': {'class': 't463'}},
        # For articles from https://doctorpiter.ru
        {'name': 'div', 'attrs': {'class': 'article__block article__block_type-links'}},
        {'name': 'div', 'attrs': {'class': 'feedback-request-form__before'}},
        {'name': 'div', 'attrs': {'class': 'related-entities-container'}},
        {'name': 'div', 'attrs': {'class': 'tags article-footer__tags'}},
        {'name': 'hr', 'attrs': {'class': 'article-footer-divider'}},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        ('Fontanka.ru', 'https://www.fontanka.ru/fontanka.rss'),
    ]

The Bell: business news. Favicon.
Fixes needed:

Minor in-text ads

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class TheBell(BasicNewsRecipe):
    """Calibre news recipe for The Bell (thebell.io), a business-news outlet.

    Each article page is cut down to the span between the first ``<h1>``
    and the ``post`` div; the remaining clutter is removed with the
    ``remove_tags`` rules.
    """

    # NOTE: known issue reported by the recipe author: minor in-text ads
    # remain in the output.

    # --- Recipe metadata -------------------------------------------------
    title = 'The Bell'
    __author__ = 'bugmen00t'
    description = '\u0418\u043D\u0442\u0435\u0440\u043D\u0435\u0442-\u0438\u0437\u0434\u0430\u043D\u0438\u0435 \u0438 \u0438\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0438\u043E\u043D\u043D\u044B\u0439 \u0431\u044E\u043B\u043B\u0435\u0442\u0435\u043D\u044C, \u0441\u043F\u0435\u0446\u0438\u0430\u043B\u0438\u0437\u0438\u0440\u0443\u044E\u0449\u0435\u0435\u0441\u044F \u043D\u0430 \u0431\u0438\u0437\u043D\u0435\u0441-\u043D\u043E\u0432\u043E\u0441\u0442\u044F\u0445.'
    publisher = '\u0415\u043B\u0438\u0437\u0430\u0432\u0435\u0442\u0430 \u041E\u0441\u0435\u0442\u0438\u043D\u0441\u043A\u0430\u044F, \u0418\u0440\u0438\u043D\u0430 \u041C\u0430\u043B\u043A\u043E\u0432\u0430'
    category = 'newspaper'
    language = 'ru'
    cover_url = 'https://thebell.io/wp-content/uploads/2018/03/thebell-cover.png'

    # --- Download limits -------------------------------------------------
    oldest_article = 7  # days
    max_articles_per_feed = 20

    # --- Clean-up --------------------------------------------------------
    # Stylesheets are dropped, scripts are kept, and calibre's automatic
    # clean-up is off; the page is trimmed with the explicit rules below.
    no_stylesheets = True
    remove_javascript = False
    auto_cleanup = False

    remove_tags_before = {'name': 'h1'}

    remove_tags_after = {'name': 'div', 'attrs': {'class': 'post'}}

    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'post_bottom ng-tns-c84-1 ng-star-inserted'}},
        {'name': 'div', 'attrs': {'class': 'post_tags ng-tns-c84-1 ng-star-inserted'}},
        {'name': 'div', 'attrs': {'class': 'post_subscribe ng-tns-c84-1 ng-star-inserted'}},
        {'name': 'div', 'attrs': {'class': 'author__content_image author__content_image_small ng-star-inserted'}},
        {'name': 'div', 'attrs': {'class': 'email email_small ng-star-inserted'}},
        {'name': 'div', 'attrs': {'class': 'post_share ng-tns-c84-1'}},
        {'name': 'div', 'attrs': {'class': 'social_media'}},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        ('The Bell', 'https://thebell.io/feed'),
    ]

Fixed recipe for Сноб (snob.recipe): URL transformation subroutine lacked the last line

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class Snob(BasicNewsRecipe):
    """Calibre news recipe for the "Snob" magazine (snob.ru).

    Each article page is cut down to the span between the first ``<h1>``
    and the ``<article>`` element, and tag/like widgets are removed.
    Article links taken from the feed have their ``?utm...`` tracking
    query stripped before download.
    """

    # --- Recipe metadata -------------------------------------------------
    title = '\u0421\u043D\u043E\u0431'
    __author__ = 'bugmen00t'
    description = '\u0414\u0438\u0441\u043A\u0443\u0441\u0441\u0438\u043E\u043D\u043D\u043E\u0435, \u0438\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0438\u043E\u043D\u043D\u043E\u0435 \u0438 \u043E\u0431\u0449\u0435\u0441\u0442\u0432\u0435\u043D\u043D\u043E\u0435 \u043F\u0440\u043E\u0441\u0442\u0440\u0430\u043D\u0441\u0442\u0432\u043E \u0434\u043B\u044F \u043B\u044E\u0434\u0435\u0439, \u043A\u043E\u0442\u043E\u0440\u044B\u0435 \u0436\u0438\u0432\u0443\u0442 \u0432 \u0440\u0430\u0437\u043D\u044B\u0445 \u0441\u0442\u0440\u0430\u043D\u0430\u0445, \u0433\u043E\u0432\u043E\u0440\u044F\u0442 \u043D\u0430 \u0440\u0430\u0437\u043D\u044B\u0445 \u044F\u0437\u044B\u043A\u0430\u0445, \u043D\u043E \u0434\u0443\u043C\u0430\u044E\u0442 \u043F\u043E-\u0440\u0443\u0441\u0441\u043A\u0438.'
    publisher = '\u041E\u041E\u041E \u00AB\u0421\u043D\u043E\u0431 \u041C\u0435\u0434\u0438\u0430\u00BB'
    category = 'magazine'
    cover_url = 'https://snob.ru/indoc/tilda/995317/images/tild3233-6631-4664-b663-353636373235__e3057a5fee932ada1aaf.png'
    language = 'ru'

    # --- Download limits -------------------------------------------------
    oldest_article = 5  # days
    max_articles_per_feed = 50

    # --- Clean-up --------------------------------------------------------
    # Stylesheets and scripts are kept and calibre's automatic clean-up is
    # off; the page is trimmed with the explicit rules below instead.
    no_stylesheets = False
    remove_javascript = False
    auto_cleanup = False

    remove_tags_before = dict(name='h1')

    remove_tags_after = dict(name='article')

    remove_tags = [
        dict(name='div', attrs={'class': 'entry__tags'}),
        dict(name='div', attrs={'class': 'entry__likes'}),
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        ('\u0421\u043D\u043E\u0431', 'https://snob.ru/rss/'),
    ]

    def get_article_url(self, article):
        """Return the article's link with any ``?utm...`` tracking query removed.

        :param article: feed entry exposing a dict-style ``get``; only its
            ``'link'`` key is read.
        :return: the link cut at the first ``'?utm'`` when it contains
            ``utm_source``; otherwise the link unchanged (``None`` when the
            entry has no link at all).
        """
        link = article.get('link', None)
        # Guard against a missing link: `'utm_source' in None` would raise.
        if link and 'utm_source' in link:
            return link.split('?utm')[0]
        # Links without tracking parameters must be returned as-is; falling
        # off the end here would return None for every such article.
        return link