View Single Post
Old Yesterday, 04:37 PM   #1
alphonk
Member
alphonk began at the beginning.
 
Posts: 20
Karma: 10
Join Date: Dec 2024
Device: kindle scribe
Le Canard Enchaîné (update)

Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Recette Calibre — Le Canard Enchaîné
# Auteur : Kabonix
# Le contenu premium est dans le HTML (CSS paywall) — pas besoin de login

from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

class LeCanardEnchaine(BasicNewsRecipe):
    """Calibre recipe for Le Canard Enchaîné.

    The premium article body is already present in the served HTML and is
    only hidden client-side (a pure CSS paywall), so no login is needed.
    The recipe impersonates Googlebot — the site serves full content to
    Google's crawler for indexing — then strips the paywall CSS hooks in
    :meth:`preprocess_html`.
    """

    title       = 'Le Canard Enchaîné'
    __author__  = 'Kabonix'
    description = 'Articles du Canard Enchaîné (sans login — CSS paywall)'
    language    = 'fr'
    no_stylesheets    = True
    auto_cleanup      = False
    remove_javascript = True
    max_image_width   = 600
    max_image_height  = 800

    # Base URLs used for resolving relative links.
    BASE_URL     = 'https://www.lecanardenchaine.fr'
    BOUTIQUE_URL = 'https://boutique.lecanardenchaine.fr'

    # Googlebot spoof — the site serves the complete article body to
    # Google's crawler (for indexing), which bypasses the CSS paywall.
    browser_user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

    def get_browser(self, *args, **kwargs):
        """Return a browser configured to look like Googlebot.

        Disables robots.txt handling and sets a Google referer plus a
        Googlebot ``X-Forwarded-For`` so the server treats us as a crawler.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_handle_robots(False)
        br.addheaders = [
            ('User-Agent', self.browser_user_agent),
            ('Referer',         'https://www.google.fr/'),
            ('X-Forwarded-For', '66.249.66.1'),  # published Googlebot IP
            ('Accept-Language', 'fr-FR,fr;q=0.9'),
        ]
        return br

    # ------------------------------------------------------------------ #
    #  Cover — scraped from the shop's "buy a single issue" page          #
    # ------------------------------------------------------------------ #
    def get_cover_url(self):
        """Return the current issue's cover URL, or a static fallback.

        Best-effort: any scraping failure is logged (not raised) and the
        fallback image is used instead.
        """
        try:
            br   = self.get_browser()
            soup = self.index_to_soup(
                br.open(self.BOUTIQUE_URL + '/acheter-au-numero/').read()
            )
            li = soup.find('li', {'class': 'list-item'})
            if li:
                img = li.find('img')
                # Prefer srcset (first candidate URL) over plain src.
                for attr in ('srcset', 'src'):
                    if img and img.get(attr):
                        url = img[attr].split()[0]
                        # Only prefix the domain for relative paths; the
                        # shop may serve absolute URLs.
                        if not url.startswith('http'):
                            url = self.BOUTIQUE_URL + url
                        return url
        except Exception as e:
            # Log instead of silently swallowing, so failures are visible.
            self.log.warning(f'Cover scraping failed: {e}')
        return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'

    # ------------------------------------------------------------------ #
    #  Content selection                                                  #
    # ------------------------------------------------------------------ #
    keep_only_tags = [
        dict(name='div', attrs={'class': 'article__heading'}),
        dict(name='div', attrs={'class': 'editorial'}),
    ]

    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer', 'button', 'form']),
        dict(name='div', attrs={'class': [
            'share-mobile', 'share-sticky', 'article__author',
            'article__tags', 'list-breadcrumb', 'modal',
        ]}),
    ]

    extra_css = '''
        h1, h2 { font-size: 1.2em !important; font-weight: bold; }
        .editorial__chapo { font-style: italic; margin-bottom: 1em; }
        p { line-height: 1.5; }
        a { color: black !important; text-decoration: none !important; }
        .zoom { border-left: 3px solid #ccc; padding-left: 1em; margin: 1em 0; }
    '''

    # ------------------------------------------------------------------ #
    #  CSS-paywall bypass: the full text is already in the HTML, hidden   #
    #  via CSS — removing the paywall classes/ids unhides it.             #
    # ------------------------------------------------------------------ #
    def preprocess_html(self, soup):
        """Unhide content masked by the CSS paywall."""
        for div in soup.findAll('div', attrs={'id': 'paywall'}):
            div['class'] = ''
            div['id']    = ''
        for div in soup.findAll('div', attrs={'class': 'paywall'}):
            div['class'] = ''
        # Also neutralise the paywall overlay wrappers.
        for div in soup.findAll('div', attrs={'class': 'non-paywall'}):
            div['class'] = ''
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip every tag attribute except href/src/class for clean output."""
        for tag in soup.findAll(True):
            for attr in list(tag.attrs):
                if attr not in ['href', 'src', 'class']:
                    del tag[attr]
        return soup

    # ------------------------------------------------------------------ #
    #  Section index                                                      #
    # ------------------------------------------------------------------ #
    SECTIONS = {
        'Politique':             '/politique/',
        'Économie':              '/economie/',
        'International':         '/international/',
        'Défense':               '/defense/',
        'Société':               '/societe/',
        'Police-Justice':        '/police-justice/',
        'Santé':                 '/sante/',
        'Éducation':             '/education/',
        'Environnement':         '/environnement/',
        'Technologie-Sciences':  '/technologie-sciences/',
        'Culture-Idées':         '/culture-idees/',
        'Médias':                '/medias/',
        'Sport':                 '/sport/',
        'Social':                '/social/',
        'Brèves':                '/breves/',
    }

    def _parse_article_item(self, article, cutoff):
        """Extract one article entry from an ``<article>`` tile.

        Returns the dict expected by ``parse_index`` feeds, or ``None``
        when the tile is incomplete, its timestamp is malformed, or the
        article is older than *cutoff* (a ``date``).
        """
        link     = article.find('a', href=True)
        date_div = article.find('div', {'class': 'article-item__date'})
        if not (link and date_div):
            return None

        time_element = date_div.find('time')
        if not (time_element and time_element.get('datetime')):
            return None

        try:
            # fromisoformat rejects a trailing 'Z' before Python 3.11;
            # normalise it to an explicit UTC offset.
            article_date = datetime.fromisoformat(
                time_element['datetime'].replace('Z', '+00:00')
            )
        except ValueError:
            # Malformed timestamp: skip this tile only, not the section.
            return None

        if article_date.date() <= cutoff:
            return None

        href = link['href']
        if not href.startswith('http'):
            href = self.BASE_URL + href

        return {
            'title':       link.get_text(separator=' ').strip(),
            'url':         href,
            'description': f"Publié le {article_date.strftime('%d/%m/%Y')}",
            'date':        article_date.strftime('%a, %d %b %Y %H:%M:%S %z'),
        }

    def parse_index(self):
        """Build the index: one feed per section, last-7-days articles only.

        Sections that fail to load are logged and skipped; raises
        ``ValueError`` when no section yields any article at all.
        """
        br     = self.get_browser()
        feeds  = []
        today  = datetime.now(ZoneInfo('Europe/Paris'))
        cutoff = (today - timedelta(days=7)).date()

        for section_title, section_url in self.SECTIONS.items():
            articles = []
            try:
                soup = self.index_to_soup(br.open(self.BASE_URL + section_url).read())
                for item in soup.findAll('article', {'class': 'article-item'}):
                    entry = self._parse_article_item(item, cutoff)
                    if entry is not None:
                        articles.append(entry)
            except Exception as e:
                self.log.warning(f'Erreur sur {section_title}: {e}')

            if articles:
                feeds.append((section_title, articles))

        if not feeds:
            raise ValueError('Aucun article trouvé')
        return feeds
alphonk is offline   Reply With Quote