Thread: [New] La Presse
View Single Post
Old 07-08-2025, 08:21 PM   #1
quatorze
Junior Member
quatorze began at the beginning.
 
Posts: 2
Karma: 10
Join Date: Jun 2023
Device: Kobo Sage
[New] La Presse

La Presse is a French-language daily from Montreal, Canada.

Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8

# Recipe metadata: licensing, authorship and revision information.
__license__ = 'GPL v3'
__author__ = 'quatorze'
__copyright__ = '2025, quatorze'

# Revision of this recipe and the date it was written.
__version__ = 'v1.00'
__date__ = '8 July 2025'

# Short description of the recipe.
__description__ = 'La Presse '

import re

from calibre.web.feeds.news import BasicNewsRecipe

class LaPresse(BasicNewsRecipe):
    """Calibre news recipe for La Presse (www.lapresse.ca).

    La Presse is a French-language daily from Montreal, Canada. The recipe
    is purely declarative: it lists the RSS feeds to fetch, restricts each
    downloaded page to its ``<article class="mainStory">`` element, strips
    interactive widgets with regular expressions and applies a small
    stylesheet to what remains.
    """

    # --- Publication metadata -------------------------------------------
    title                          = u'La Presse'
    timefmt                        = ' %Y-%m-%d'
    language                       = 'fr'
    encoding                       = 'utf-8'
    publisher                      = 'www.lapresse.ca'
    publication_type               = 'newspaper'
    category                       = 'News, finance, economy, politics'

    # --- Download behaviour ---------------------------------------------
    # The same story is listed by several feeds; de-duplicate on both keys.
    ignore_duplicate_articles      = {'title', 'url'}
    oldest_article                 = 1.0
    max_articles_per_feed          = 100
    min_articles_per_feed          = 0
    # Clean-up is done by hand below (keep_only_tags + preprocess_regexps).
    auto_cleanup                   = False
    remove_empty_feeds             = True
    use_embedded_content           = False
    needs_subscription             = False
    remove_javascript              = True

    # --- Image handling ---------------------------------------------------
    compress_news_images           = True
    scale_news_images_to_device    = True
    compress_news_images_auto_size = 4

    # --- Styling ----------------------------------------------------------
    # The site's own stylesheets are dropped; only the rules below apply.
    no_stylesheets                 = True
    extra_css                      = '''
        a { text-decoration: none; }
        a.badge { font-size: 80%; }
        div.capsuleModule { border-style: solid; margin:0% 8%; padding:0% 2%; }
        div.quote { border-style: none none none solid; }
        h2.textModule--type-chapter { font-weight: bold; font-size: 110%; }
        p { font-size: 100%; }
        p.chapter { font-weight: bold; }
        p.credit { font-size: 80% }
        p.description { font-size: 80% }
        p.lead { font-weight: bold; font-size: 120%; }
        p.quoteSource { padding-left:7%; font-size: 110%; }
        p.quoteText { padding-left:5%; font-weight: bold; font-style: italic; font-size: 110%; }
        p.teaser { font-weight: bold; font-size: 120%; }
        small.suptitle { display: block; font-size: 50%; }
        span.authorModule__name { display: block; font-size: 80%; }
        span.authorModule__organisation { display: block; font-style: italic; font-size: 80%; }
        span.title { display: block; padding-top:2%; font-weight: bold; font-style: italic; }
        time.publicationsDate--type-publication { display: block; font-size: 80%; }
        time.publicationsDate--type-update { display: block; font-size: 80%; }
        div.complementaryInformation { border-style: solid; margin:0% 8%; padding:0% 2%; }
        div.complementaryInformation__title { font-weight: bold; font-size: 120%; }
        dt { font-weight: bold; font-size: 100%; }
        dd { font-size: 100%; }
        div.source { font-size: 80%; }
    '''

    # --- Content selection ------------------------------------------------
    # Keep only the main story element. Written as a list of tag
    # specifications, which is the form the calibre API documents.
    keep_only_tags                 = [dict(name='article', class_='mainStory')]

    # Drop every href so links in the article text are no longer clickable.
    remove_attributes              = [ 'href' ]

    # Clean up some of the extraneous/interactives inside the article boundaries.
    # Each entry is (compiled pattern, callable returning the replacement text).
    preprocess_regexps = [
        # Replace whatever sits between </small> and the next <span> with a
        # paragraph holding a single space.
        (re.compile(r'</small>.*?<span>', re.DOTALL|re.IGNORECASE), lambda match: '</small><p> </p><span>'),
        # The remaining patterns delete the matched markup outright.
        (re.compile(r'<div id="socialShare_.*?</div>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<a class="linkModule ACT".*?</a>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<span class="linkModule__content">.*?</span>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<ul class="buttons ">.*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<div class="visual__title.*?">.*?</div>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<div class="photoGalleryModule__wrapper">.*?</div>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        (re.compile(r'<ul class="photoGalleryModule__navigation">.*?</ul>', re.DOTALL|re.IGNORECASE), lambda match: '')
    ]

    # --- Feeds ------------------------------------------------------------
    # (section title, RSS URL) pairs. All URLs use https.
    # NOTE: the title 'Chroniques' is used by four different sections
    # (Actualités, International, Dialogue, Contexte); only the URLs differ.
    feeds = [
        # Actualités - Reporting
        (u'Politique',                  'https://www.lapresse.ca/actualites/politique/rss'),
        (u'National',                   'https://www.lapresse.ca/actualites/national/rss'),
        (u'Analyses',                   'https://www.lapresse.ca/actualites/analyses/rss'),
        (u'Grand Montréal',             'https://www.lapresse.ca/actualites/grand-montreal/rss'),
        (u'Régional',                   'https://www.lapresse.ca/actualites/regional/rss'),
        (u'Justice et faits divers',    'https://www.lapresse.ca/actualites/justice-et-faits-divers/rss'),
        (u'Santé',                      'https://www.lapresse.ca/actualites/sante/rss'),
        (u'Éducation',                  'https://www.lapresse.ca/actualites/education/rss'),
        (u'Environnement',              'https://www.lapresse.ca/actualites/environnement/rss'),
        (u'Sciences',                   'https://www.lapresse.ca/actualites/sciences/rss'),
        # Actualités - Opinions
        (u'Chroniques',                 'https://www.lapresse.ca/actualites/chroniques/rss'),
        (u'Caricatures',                'https://www.lapresse.ca/actualites/caricatures/rss'),
        (u'Éditoriaux',                 'https://www.lapresse.ca/actualites/editoriaux/rss'),
        (u'Manchettes - Actualités',    'https://www.lapresse.ca/actualites/rss'),
        # International - Reporting
        (u'États-Unis',                 'https://www.lapresse.ca/international/etats-unis/rss'),
        (u'Europe',                     'https://www.lapresse.ca/international/europe/rss'),
        (u'Moyen-Orient',               'https://www.lapresse.ca/international/moyen-orient/rss'),
        (u'Caraïbes',                   'https://www.lapresse.ca/international/caraibes/rss'),
        (u'Amérique latine',            'https://www.lapresse.ca/international/amerique-latine/rss'),
        (u'Asie et Océanie',            'https://www.lapresse.ca/international/asie-et-oceanie/rss'),
        (u'Afrique',                    'https://www.lapresse.ca/international/afrique/rss'),
        (u'Manchettes - International', 'https://www.lapresse.ca/international/rss'),
        # International - Opinions
        (u'Chroniques',                 'https://www.lapresse.ca/international/chroniques/rss'),
        # Dialogue
        (u'Chroniques',                 'https://www.lapresse.ca/dialogue/chroniques/rss'),
        (u'Opinions',                   'https://www.lapresse.ca/dialogue/opinions/rss'),
        (u'Courrier des lecteurs',      'https://www.lapresse.ca/dialogue/courrier-des-lecteurs/rss'),
        (u'Témoignages',                'https://www.lapresse.ca/dialogue/temoignages/rss'),
        # Contexte
        (u'Chroniques',                 'https://www.lapresse.ca/contexte/chroniques/rss'),
        # En vrac / In bulk ...
        (u'Affaires',                   'https://www.lapresse.ca/affaires/rss'),
        (u'Sports',                     'https://www.lapresse.ca/sports/rss'),
        (u'Auto',                       'https://www.lapresse.ca/auto/rss'),
        (u'Arts',                       'https://www.lapresse.ca/arts/rss'),
        (u'Cinéma',                     'https://www.lapresse.ca/cinema/rss'),
        (u'Société',                    'https://www.lapresse.ca/societe/rss'),
        (u'Gourmand',                   'https://www.lapresse.ca/gourmand/rss'),
        (u'Voyage',                     'https://www.lapresse.ca/voyage/rss'),
        (u'Maison',                     'https://www.lapresse.ca/maison/rss')
    ]

# Browser user-agent string defined at module level; it is not referenced
# anywhere else in the visible code.
# NOTE(review): presumably read by calibre's recipe machinery -- confirm.
calibre_most_common_ua = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/80.0.3987.87 Safari/537.36'
)

Regards,

-xiv
quatorze is offline   Reply With Quote