View Single Post
Old 07-21-2022, 04:08 PM   #8
bugmen00t
Connoisseur
bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!bugmen00t rocks like Gibraltar!
 
bugmen00t's Avatar
 
Posts: 82
Karma: 100000
Join Date: Aug 2015
Device: Kindle Keyboard 3G + Kindle Voyage WiFi + Kindle PW11 Kids WiFi
New Russian Recipes (part 01 of ??)

A couple of new recipes. I'm not sure how to correctly specify the language for recipes that are downloaded in Russian but whose originating source is outside Russia. To avoid fragmentation, it would probably be easier to change the language from "ru_UK", "ru_DE", "ru_GB" back to just "ru".

UNIAN.net (Russian version): Ukrainian Independent Information Agency of News, one of the most cited sources of news from across Ukraine. Favicon

Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class Unian(BasicNewsRecipe):
    """calibre recipe for the Russian-language RSS feed of UNIAN (unian.net)."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    # Cyrillic spelling of "UNIAN"; the literal carries a trailing space.
    title = u'\u0423\u041D\u0418\u0410\u041D '
    # Russian-language blurb describing the agency.
    description = u'\u0423\u043A\u0440\u0430\u0438\u043D\u0441\u043A\u043E\u0435 \u041D\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043C\u043E\u0435 \u0418\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0438\u043E\u043D\u043D\u043E\u0435 \u0410\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E \u041D\u043E\u0432\u043E\u0441\u0442\u0435\u0439 \u2013 \u043F\u0435\u0440\u0432\u043E\u0435 \u0432 \u0423\u043A\u0440\u0430\u0438\u043D\u0435 \u0438 \u0441\u0430\u043C\u043E\u0435 \u0431\u043E\u043B\u044C\u0448\u043E\u0435 \u043D\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043C\u043E\u0435 \u0438\u043D\u0444\u043E\u0440\u043C\u0430\u0446\u0438\u043E\u043D\u043D\u043E\u0435 \u0430\u0433\u0435\u043D\u0442\u0441\u0442\u0432\u043E, \u043E\u0441\u043D\u043E\u0432\u0430\u043D\u043D\u043E\u0435 \u0432 1993 \u0433\u043E\u0434\u0443, \u043B\u0438\u0434\u0435\u0440 \u0441\u0440\u0435\u0434\u0438 \u043D\u043E\u0432\u043E\u0441\u0442\u043D\u044B\u0445 \u043C\u0435\u0434\u0438\u0430 \u0441\u0442\u0440\u0430\u043D\u044B, \u0441\u0430\u043C\u044B\u0439 \u0446\u0438\u0442\u0438\u0440\u0443\u0435\u043C\u044B\u0439 \u0438\u0441\u0442\u043E\u0447\u043D\u0438\u043A \u043D\u043E\u0432\u043E\u0441\u0442\u0435\u0439 \u043E \u0441\u043E\u0431\u044B\u0442\u0438\u044F\u0445 \u0432 \u0441\u0442\u0440\u0430\u043D\u0435.'
    publication_type = 'newspaper'
    # NOTE(review): 'UK' is not Ukraine's ISO 3166 country code ('UA');
    # confirm whether this should be 'ru_UA' or plain 'ru'.
    language = 'ru_UK'
    cover_url = 'https://www.unian.net/images/unian-512x512.png'

    # ---- Download limits -------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100

    # ---- HTML cleanup (hand-written rules, automatic cleanup is off) -----
    auto_cleanup = False
    no_stylesheets = True
    # Keep the page from the <h1> headline through the article text block.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'article-text'}}
    # Drop the comment/view counters and the "read also" slider.
    remove_tags = [
        {'name': 'span', 'attrs': {'class': 'article__info-item comments'}},
        {'name': 'span', 'attrs': {'class': 'article__info-item views'}},
        {'name': 'div', 'attrs': {'class': 'read-also-slider'}},
    ]

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        (
            u'\u0423\u041D\u0418\u0410\u041D',
            u'https://rss.unian.net/site/news_rus.rss',
        ),
    ]


Old-Games.ru: community project devoted to preservation of old computer games. Favicon

Fixes needed:
  • Replace list elements with <div> tags
  • Find a less brutal way of removing the attribute style='display:none ' from all <div> tags rather than just nuking all styles
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes


class OGRU(BasicNewsRecipe):
    """calibre recipe for the news and articles feeds of Old-Games.RU."""

    title = u'Old-Games.RU'
    __author__ = 'bugmen00t'
    # Russian-language site blurb. It is written as adjacent literals inside
    # parentheses because a single-quoted literal cannot span physical lines;
    # the concatenated text is one string with no embedded newline.
    description = (
        u'Old-Games.RU \u2014 \u043A\u0440\u0443\u043F\u043D\u0435\u0439\u0448\u0438\u0439 \u0440\u043E\u0441\u0441\u0438\u0439\u0441\u043A\u0438\u0439 \u0430\u0440\u0445\u0438\u0432 \u0441\u0442\u0430\u0440\u044B\u0445 \u043A\u043E\u043C\u043F\u044C\u044E\u0442\u0435\u0440\u043D\u044B\u0445 \u0438\u0433\u0440. '
        u'\u041C\u044B \u043D\u0435 \u0441\u0442\u0430\u0432\u0438\u043C \u043F\u0435\u0440\u0435\u0434 \u0441\u043E\u0431\u043E\u0439 \u0446\u0435\u043B\u0438 \u0441\u043E\u0431\u0440\u0430\u0442\u044C \u0432\u0441\u0435 \u0438\u0433\u0440\u044B, \u0447\u0442\u043E \u0435\u0441\u0442\u044C \u0432 \u043C\u0438\u0440\u0435, \u043D\u043E \u043C\u044B \u0441\u0442\u0430\u0440\u0430\u0435\u043C\u0441\u044F, \u0447\u0442\u043E\u0431\u044B \u043D\u0430 \u0441\u0430\u0439\u0442\u0435 \u0431\u044B\u043B\u043E \u043F\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043B\u0435\u043D\u043E \u0431\u043E\u043B\u044C\u0448\u0438\u043D\u0441\u0442\u0432\u043E \u0448\u0435\u0434\u0435\u0432\u0440\u043E\u0432, \u0440\u0435\u0434\u043A\u043E\u0441\u0442\u0435\u0439 \u0438 \u043F\u0440\u043E\u0441\u0442\u043E \u0438\u043D\u0442\u0435\u0440\u0435\u0441\u043D\u044B\u0445 \u043F\u0440\u043E\u0435\u043A\u0442\u043E\u0432 \u043F\u0440\u043E\u0448\u043B\u044B\u0445 \u043B\u0435\u0442. '
        u'\u0421 \u0442\u0435\u0447\u0435\u043D\u0438\u0435\u043C \u0432\u0440\u0435\u043C\u0435\u043D\u0438 \u0433\u0440\u0430\u0444\u0438\u0447\u0435\u0441\u043A\u043E\u0435 \u0438 \u0437\u0432\u0443\u043A\u043E\u0432\u043E\u0435 \u043E\u0444\u043E\u0440\u043C\u043B\u0435\u043D\u0438\u0435 \u0438\u0433\u0440 \u043D\u0430\u0448\u0435\u0433\u043E \u0430\u0440\u0445\u0438\u0432\u0430 \u0437\u0430\u043C\u0435\u0442\u043D\u043E \u0443\u0441\u0442\u0430\u0440\u0435\u043B\u043E, \u043D\u043E \u0438\u0433\u0440\u043E\u0432\u043E\u0439 \u043F\u0440\u043E\u0446\u0435\u0441\u0441 \u043E\u0441\u0442\u0430\u043B\u0441\u044F \u043F\u0440\u0435\u0436\u043D\u0438\u043C, \u0438 \u043F\u043E\u0440\u043E\u0439 \u043E\u043D \u0433\u043E\u0440\u0430\u0437\u0434\u043E \u0438\u043D\u0442\u0435\u0440\u0435\u0441\u043D\u0435\u0435, \u0447\u0435\u043C \u0432\u043E \u043C\u043D\u043E\u0433\u0438\u0445 \u0441\u043E\u0432\u0440\u0435\u043C\u0435\u043D\u043D\u044B\u0445 \u00AB\u0445\u0438\u0442\u0430\u0445\u00BB.'
    )
    publisher = 'Old-Games.RU'
    publication_type = 'blog'
    category = 'news, games, retro'
    language = 'ru'
    cover_url = 'https://www.old-games.ru/forum/styles/default/old-games/logo.og.png'
    oldest_article = 50
    max_articles_per_feed = 50
    no_stylesheets = True
    auto_cleanup = False

    # Keep only the <article> element of each page.
    remove_tags_before = dict(name='article')
    remove_tags_after = dict(name='article')

    # Strips every inline style attribute. The recipe author did this to get
    # rid of style='display:none ' on <div> tags and flagged it as a blunt
    # workaround that still needs a narrower replacement.
    remove_attributes = ['style']

    # Forum chrome removed from each article: page description, navigation,
    # tags, polls, likes, edit dates, attachments, post numbers, user info.
    remove_tags = [
        dict(name='p', attrs={'id': 'pageDescription'}),
        dict(name='div', attrs={'class': 'pageNavLinkGroup'}),
        dict(name='div', attrs={'class': 'tagBlock TagContainer'}),
        dict(name='div', attrs={'class': 'NoAutoHeader PollContainer'}),
        dict(name='div', attrs={'class': 'likesSummary secondaryContent'}),
        dict(name='div', attrs={'class': 'editDate'}),
        dict(name='div', attrs={'class': 'attachedFiles'}),
        dict(name='div', attrs={'class': 'item muted postNumber hashPermalink OverlayTrigger'}),
        dict(name='div', attrs={'class': 'messageUserInfo'}),
    ]

    # (section title, RSS URL): "News" and "Articles".
    feeds = [
        (u'\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://feeds.feedburner.com/Old-games-ru-news'),
        (u'\u0421\u0442\u0430\u0442\u044C\u0438', 'https://feeds.feedburner.com/Old-games-ru-articles'),
    ]


Новая Газета. Европа (Russian version): the European reincarnation of the Новая Газета newspaper. Favicon

Fixes needed:
  • No images in articles (webp format)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class NovayaGazetaEurope(BasicNewsRecipe):
    """calibre recipe for the Russian-language RSS feed of novayagazeta.eu."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    title = u'\u041D\u043E\u0432\u0430\u044F \u0413\u0430\u0437\u0435\u0442\u0430. \u0415\u0432\u0440\u043E\u043F\u0430'
    # Russian-language blurb describing the publication.
    description = u'\u0413\u043E\u0432\u043E\u0440\u0438\u043C \u043A\u0430\u043A \u0435\u0441\u0442\u044C. \u041F\u0438\u0448\u0435\u043C \u043E \u043F\u0440\u043E\u0438\u0441\u0445\u043E\u0434\u044F\u0449\u0435\u043C \u0432 \u0420\u043E\u0441\u0441\u0438\u0438, \u0423\u043A\u0440\u0430\u0438\u043D\u0435 \u0438 \u0415\u0432\u0440\u043E\u043F\u0435. \u041D\u043E\u0432\u043E\u0441\u0442\u0438, \u0430\u043D\u0430\u043B\u0438\u0442\u0438\u043A\u0430, \u043C\u043D\u0435\u043D\u0438\u044F \u044D\u043A\u0441\u043F\u0435\u0440\u0442\u043E\u0432, \u0441\u043F\u0435\u0446\u0438\u0430\u043B\u044C\u043D\u044B\u0435 \u0440\u0435\u043F\u043E\u0440\u0442\u0430\u0436\u0438 \u0438 \u0436\u0443\u0440\u043D\u0430\u043B\u0438\u0441\u0442\u0441\u043A\u0438\u0435 \u0440\u0430\u0441\u0441\u043B\u0435\u0434\u043E\u0432\u0430\u043D\u0438\u044F.'
    publisher = '\u041A\u0438\u0440\u0438\u043B\u043B \u041C\u0430\u0440\u0442\u044B\u043D\u043E\u0432'
    publication_type = 'newspaper'
    category = 'news'
    language = 'ru'
    cover_url = 'https://bucketeer-e05bbc84-baa3-437e-9518-adb32be77984.s3.amazonaws.com/public/images/5dc71e2d-9763-4f05-8f4e-92049fa32af7_513x513.png'

    # ---- Download limits -------------------------------------------------
    oldest_article = 15
    max_articles_per_feed = 50

    # ---- HTML cleanup (hand-written rules, automatic cleanup is off) -----
    auto_cleanup = False
    # Keep the page from the <h1> headline through the article blocks wrapper.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {
        'name': 'div',
        'attrs': {'class': 'ArticleBlocks_wrapperNoAside__11_bu'},
    }
    # Drop native embeds.
    remove_tags = [
        {'name': 'div', 'attrs': {'class': 'EmbedNative_root__2lgsH'}},
    ]

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        (u'\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://novayagazeta.eu/feed/rss/ru'),
    ]

    def preprocess_html(self, soup):
        """Replace text-only hyperlinks with their plain text.

        Every <a> element whose ``.string`` is not None is swapped for that
        string; anchors whose ``.string`` is None are left untouched.

        :param soup: parsed article document.
        :return: the same ``soup`` object, modified in place.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup



Вёрстка: socio-political online media researching Russian society. Favicon.
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class Verstka(BasicNewsRecipe):
    """calibre recipe for the category RSS feeds of verstka.media."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    title = u'\u0412\u0451\u0440\u0441\u0442\u043A\u0430'
    # Russian-language blurb describing the publication.
    description = u'\u041E\u0431\u0449\u0435\u0441\u0442\u0432\u0435\u043D\u043D\u043E-\u043F\u043E\u043B\u0438\u0442\u0438\u0447\u0435\u0441\u043A\u043E\u0435 \u0438\u0437\u0434\u0430\u043D\u0438\u0435, \u043A\u043E\u0442\u043E\u0440\u043E\u0435 \u0438\u0441\u0441\u043B\u0435\u0434\u0443\u0435\u0442 \u0438 \u043E\u043F\u0438\u0441\u044B\u0432\u0430\u0435\u0442, \u043A\u0430\u043A \u0444\u0443\u043D\u043A\u0446\u0438\u043E\u043D\u0438\u0440\u0443\u0435\u0442 \u043E\u0431\u0449\u0435\u0441\u0442\u0432\u043E \u0432 \u0420\u043E\u0441\u0441\u0438\u0438.'
    publication_type = 'newspaper'
    language = 'ru'
    cover_url = 'https://secureservercdn.net/160.153.137.128/yji.7dd.myftpupload.com/wp-content/uploads/2022/04/Screenshot-2022-04-26-at-22.19.30-300x68.png'

    # ---- Download limits -------------------------------------------------
    oldest_article = 21
    max_articles_per_feed = 20

    # ---- HTML cleanup (hand-written rules, site stylesheets are kept) ----
    auto_cleanup = False
    no_stylesheets = False
    # Keep the page from the <h1> headline through the spacer block.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'wp-block-spacer'}}

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        # News
        (u'\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'https://verstka.media/category/news/feed/'),
        # Articles
        (u'\u0421\u0442\u0430\u0442\u044C\u0438', 'https://verstka.media/category/article/feed/'),
        # Columns
        (u'\u041A\u043E\u043B\u043E\u043D\u043A\u0438', 'https://verstka.media/category/column/feed/'),
        # Interviews
        (u'\u0418\u043D\u0442\u0435\u0440\u0432\u044C\u044E', 'https://verstka.media/category/interview/feed/'),
    ]


Кедр: independent environmental media. Favicon.

Fixes needed:
  • No images in some articles (<figure> tag and/or webp format)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class Cedar(BasicNewsRecipe):
    """calibre recipe for the category RSS feeds of kedr.media."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    title = u'\u041A\u0435\u0434\u0440'
    # Russian-language blurb describing the publication.
    description = u'\u041D\u0435\u0437\u0430\u0432\u0438\u0441\u0438\u043C\u043E\u0435 \u043C\u0435\u0434\u0438\u0430 \u043E\u0431 \u043E\u043A\u0440\u0443\u0436\u0430\u044E\u0449\u0435\u0439 \u0441\u0440\u0435\u0434\u0435'
    publication_type = 'blog'
    language = 'ru'
    # NOTE(review): the cover is an SVG file; confirm calibre renders it.
    cover_url = 'https://kedr.media/wp-content/themes/kedrmedia_gutenberg/assets/img/logo-bg.svg'

    # ---- Download limits -------------------------------------------------
    oldest_article = 30
    max_articles_per_feed = 20

    # ---- HTML cleanup ----------------------------------------------------
    # NOTE(review): automatic cleanup is enabled here alongside the manual
    # remove_tags_* rules below; confirm that combining them is intended.
    auto_cleanup = True
    no_stylesheets = False
    # Keep the page from the post header through the post content block.
    remove_tags_before = {'name': 'div', 'attrs': {'class': 'post-header'}}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'post-content'}}

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        # Research
        (u'\u0418\u0441\u0441\u043B\u0435\u0434\u043E\u0432\u0430\u043D\u0438\u044F', 'https://kedr.media/category/research/feed'),
        # Stories
        (u'\u0418\u0441\u0442\u043E\u0440\u0438\u0438', 'https://kedr.media/category/stories/feed'),
        # Opinions
        (u'\u041C\u043D\u0435\u043D\u0438\u044F', 'https://kedr.media/category/opinions/feed'),
        # Interviews
        (u'\u0418\u043D\u0442\u0435\u0440\u0432\u044C\u044E', 'https://kedr.media/category/interview/feed'),
        # Explainers
        (u'\u041E\u0431\u044A\u044F\u0441\u043D\u044F\u0435\u043C', 'https://kedr.media/category/explain/feed'),
    ]


Deutsche Welle на русском: Russian version of Deutsche Welle. Favicon.
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class DeutscheWelle(BasicNewsRecipe):
    """calibre recipe for the Russian-language RSS feeds of Deutsche Welle."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    title = u'Deutsche Welle \u043D\u0430 \u0440\u0443\u0441\u0441\u043A\u043E\u043C'
    # Russian-language blurb describing the service.
    description = u'\u0420\u0443\u0441\u0441\u043A\u0430\u044F \u0440\u0435\u0434\u0430\u043A\u0446\u0438\u044F Deutsche Welle: \u043D\u043E\u0432\u043E\u0441\u0442\u0438, \u0430\u043D\u0430\u043B\u0438\u0442\u0438\u043A\u0430, \u043A\u043E\u043C\u043C\u0435\u043D\u0442\u0430\u0440\u0438\u0438 \u0438 \u0440\u0435\u043F\u043E\u0440\u0442\u0430\u0436\u0438 \u0438\u0437 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0415\u0432\u0440\u043E\u043F\u044B, \u043D\u0435\u043C\u0435\u0446\u043A\u0438\u0439 \u0438 \u0435\u0432\u0440\u043E\u043F\u0435\u0439\u0441\u043A\u0438\u0439 \u0432\u0437\u0433\u043B\u044F\u0434 \u043D\u0430 \u0441\u043E\u0431\u044B\u0442\u0438\u044F \u0432 \u0420\u043E\u0441\u0441\u0438\u0438 \u0438 \u043C\u0438\u0440\u0435,  \u043F\u0440\u0430\u043A\u0442\u0438\u0447\u0435\u0441\u043A\u0438\u0435 \u0441\u043E\u0432\u0435\u0442\u044B \u0434\u043B\u044F \u0442\u0443\u0440\u0438\u0441\u0442\u043E\u0432 \u0438 \u0442\u0435\u0445, \u043A\u0442\u043E \u0436\u0435\u043B\u0430\u0435\u0442 \u0443\u0447\u0438\u0442\u044C\u0441\u044F \u0438\u043B\u0438 \u0440\u0430\u0431\u043E\u0442\u0430\u0442\u044C \u0432 \u0413\u0435\u0440\u043C\u0430\u043D\u0438\u0438 \u0438 \u0434\u0440\u0443\u0433\u0438\u0445 \u0441\u0442\u0440\u0430\u043D\u0430\u0445 \u0415\u0432\u0440\u043E\u0441\u043E\u044E\u0437\u0430.'
    publication_type = 'newspaper'
    language = 'ru_DE'
    cover_url = 'https://www.dw.com/cssi/dwlogo-print.gif'

    # ---- Download limits -------------------------------------------------
    oldest_article = 14
    max_articles_per_feed = 100

    # ---- HTML cleanup (hand-written rules, site stylesheets are kept) ----
    auto_cleanup = False
    no_stylesheets = False
    # Keep the page from the <h1> headline through the long-text block.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'div', 'attrs': {'class': 'longText'}}

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    # NOTE(review): the first feed uses https://rss.dw.com while the rest use
    # http://rss.dw.de; confirm whether the hosts/schemes should be unified.
    feeds = [
        # Whole site
        (u'\u0412\u0435\u0441\u044C \u0441\u0430\u0439\u0442', 'https://rss.dw.com/xml/rss-ru-all'),
        # News
        (u'\u041D\u043E\u0432\u043E\u0441\u0442\u0438', 'http://rss.dw.de/xml/rss-ru-news'),
        # Politics and society
        (u'\u041F\u043E\u043B\u0438\u0442\u0438\u043A\u0430 \u0438 \u043E\u0431\u0449\u0435\u0441\u0442\u0432\u043E', 'http://rss.dw.de/xml/rss-ru-pol'),
        # Economy
        (u'\u042D\u043A\u043E\u043D\u043E\u043C\u0438\u043A\u0430', 'http://rss.dw.de/xml/rss-ru-eco'),
        # Cars
        (u'\u0410\u0432\u0442\u043E\u043C\u043E\u0431\u0438\u043B\u044C', 'http://rss.dw.de/xml/rss-ru-auto'),
        # Culture and lifestyle
        (u'\u041A\u0443\u043B\u044C\u0442\u0443\u0440\u0430 \u0438 \u0441\u0442\u0438\u043B\u044C \u0436\u0438\u0437\u043D\u0438', 'http://rss.dw.de/xml/rss-ru-cul'),
        # Russia
        (u'\u0420\u043E\u0441\u0441\u0438\u044F', 'http://rss.dw.de/xml/rss-ru-rus'),
        # Germany
        (u'\u0413\u0435\u0440\u043C\u0430\u043D\u0438\u044F', 'http://rss.dw.de/xml/rss-ru-ger'),
        # Europe
        (u'\u0415\u0432\u0440\u043E\u043F\u0430', 'http://rss.dw.de/xml/rss-ru-eu'),
        # Belarus
        (u'\u0411\u0435\u043B\u0430\u0440\u0443\u0441\u044C', 'http://rss.dw.de/xml/rss-ru-bel'),
        # Study and career
        (u'\u0423\u0447\u0435\u0431\u0430 \u0438 \u043A\u0430\u0440\u044C\u0435\u0440\u0430', 'http://rss.dw.de/xml/rss-ru-campus-karriere'),
        # Study (the title literal carries a trailing space)
        (u'\u0423\u0447\u0435\u0431\u0430 ', 'http://rss.dw.de/xml/rss-ru-campus'),
        # Career (the title literal carries a trailing space)
        (u'\u041A\u0430\u0440\u044C\u0435\u0440\u0430 ', 'http://rss.dw.de/xml/rss-ru-karriere'),
        # Notes for tourists
        (u'\u0422\u0443\u0440\u0438\u0441\u0442\u0443 \u043D\u0430 \u0437\u0430\u043C\u0435\u0442\u043A\u0443', 'http://rss.dw.de/xml/rss-ru-discover-ger'),
    ]


Русская служба BBC: BBC News in Russian. Favicon.

Fixes needed:
  • No images in some articles (lazyload)
  • No images in some articles (webp format)
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe, classes

class BBC(BasicNewsRecipe):
    """calibre recipe for the BBC Russian service RSS feed."""

    # ---- Recipe metadata -------------------------------------------------
    __author__ = 'bugmen00t'
    title = u'BBC Russian'
    # Russian for "BBC Russian service".
    description = u'\u0420\u0443\u0441\u0441\u043A\u0430\u044F \u0441\u043B\u0443\u0436\u0431\u0430 BBC'
    publication_type = 'newspaper'
    language = 'ru_GB'
    cover_url = 'https://news.files.bbci.co.uk/ws/img/logos/og/russian.png'

    # ---- Download limits -------------------------------------------------
    oldest_article = 14
    max_articles_per_feed = 50

    # ---- HTML cleanup (hand-written rules, automatic cleanup is off) -----
    auto_cleanup = False
    no_stylesheets = True
    # Keep the page from the <h1> headline through <main aria-hidden="true">.
    remove_tags_before = {'name': 'h1'}
    remove_tags_after = {'name': 'main', 'attrs': {'aria-hidden': 'true'}}
    # Drop region sections, footers and asides.
    remove_tags = [
        {'name': 'section', 'attrs': {'role': 'region'}},
        {'name': 'footer'},
        {'name': 'aside'},
    ]

    # ---- Feeds: (section title, RSS URL) ---------------------------------
    feeds = [
        (u'\u041D\u043E\u0432\u043E\u0441\u0442\u0438 BBC', 'https://feeds.bbci.co.uk/russian/rss.xml'),
    ]

Last edited by bugmen00t; 07-21-2022 at 04:18 PM.
bugmen00t is offline   Reply With Quote