Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old Yesterday, 04:37 PM   #1
alphonk
Member
alphonk began at the beginning.
 
Posts: 20
Karma: 10
Join Date: Dec 2024
Device: kindle scribe
Le Canard Enchainé (update)

Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Recette Calibre — Le Canard Enchaîné
# Auteur : Kabonix
# Le contenu premium est dans le HTML (CSS paywall) — pas besoin de login

from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

class LeCanardEnchaine(BasicNewsRecipe):
    """Calibre recipe for Le Canard Enchaîné.

    The premium article text is present in the served HTML and merely hidden
    by CSS (a pure CSS paywall), so no login is required: we spoof Googlebot
    (the site serves full pages to Google's crawler for indexing) and strip
    the paywall classes in preprocess_html.
    """

    title       = 'Le Canard Enchaîné'
    __author__  = 'Kabonix'
    description = 'Articles du Canard Enchaîné (sans login — CSS paywall)'
    language    = 'fr'
    no_stylesheets    = True
    auto_cleanup      = False
    remove_javascript = True
    max_image_width   = 600
    max_image_height  = 800

    # Googlebot spoof — the site serves the full article body to Google's
    # robots (for indexation), which sidesteps the CSS paywall entirely.
    browser_user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

    def get_browser(self, *args, **kwargs):
        """Return a browser disguised as Googlebot.

        Sets the Googlebot user agent, a Google referer and a Googlebot
        source address in X-Forwarded-For, and disables robots.txt handling.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_handle_robots(False)
        br.addheaders = [
            ('User-Agent', self.browser_user_agent),
            ('Referer',         'https://www.google.fr/'),
            ('X-Forwarded-For', '66.249.66.1'),  # published Googlebot IP range
            ('Accept-Language', 'fr-FR,fr;q=0.9'),
        ]
        return br

    # ------------------------------------------------------------------ #
    #  Cover — scraped from the shop page                                  #
    # ------------------------------------------------------------------ #
    def get_cover_url(self):
        """Scrape the current issue's cover from the boutique page.

        Best effort: any failure is logged and a static fallback image is
        returned so the download never aborts on the cover alone.
        """
        try:
            br   = self.get_browser()
            soup = self.index_to_soup(
                br.open('https://boutique.lecanardenchaine.fr/acheter-au-numero/').read()
            )
            li = soup.find('li', {'class': 'list-item'})
            if li:
                img = li.find('img')
                # srcset entries are "url descriptor, url descriptor, ...";
                # the first whitespace-separated token is the first URL.
                if img and img.get('srcset'):
                    return 'https://boutique.lecanardenchaine.fr' + img['srcset'].split()[0]
                if img and img.get('src'):
                    return 'https://boutique.lecanardenchaine.fr' + img['src']
        except Exception as e:
            # Deliberately non-fatal, but leave a trace instead of hiding it.
            self.log.warning(f'Cover scrape failed, using fallback: {e}')
        return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'

    # ------------------------------------------------------------------ #
    #  Content selection                                                   #
    # ------------------------------------------------------------------ #
    keep_only_tags = [
        dict(name='div', attrs={'class': 'article__heading'}),
        dict(name='div', attrs={'class': 'editorial'}),
    ]

    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer', 'button', 'form']),
        dict(name='div', attrs={'class': [
            'share-mobile', 'share-sticky', 'article__author',
            'article__tags', 'list-breadcrumb', 'modal',
        ]}),
    ]

    extra_css = '''
        h1, h2 { font-size: 1.2em !important; font-weight: bold; }
        .editorial__chapo { font-style: italic; margin-bottom: 1em; }
        p { line-height: 1.5; }
        a { color: black !important; text-decoration: none !important; }
        .zoom { border-left: 3px solid #ccc; padding-left: 1em; margin: 1em 0; }
    '''

    # ------------------------------------------------------------------ #
    #  CSS-paywall bypass: just neutralize the paywall class/id            #
    #  The content is already in the HTML — it is a pure CSS paywall.      #
    # ------------------------------------------------------------------ #
    def preprocess_html(self, soup):
        # Unlock content hidden by CSS: blank the class/id the stylesheet
        # matches on, leaving the text itself (already present) visible.
        for div in soup.findAll('div', attrs={'id': 'paywall'}):
            div['class'] = ''
            div['id']    = ''
        for div in soup.findAll('div', attrs={'class': 'paywall'}):
            div['class'] = ''
        # Clean up the paywall overlay wrappers as well.
        for div in soup.findAll('div', attrs={'class': 'non-paywall'}):
            div['class'] = ''
        return soup

    def postprocess_html(self, soup, first_fetch):
        # Strip every attribute except the ones the e-book actually needs.
        for tag in soup.findAll(True):
            for attr in list(tag.attrs):
                if attr not in ['href', 'src', 'class']:
                    del tag[attr]
        return soup

    # ------------------------------------------------------------------ #
    #  Section index                                                       #
    # ------------------------------------------------------------------ #
    SECTIONS = {
        'Politique':             '/politique/',
        'Économie':              '/economie/',
        'International':         '/international/',
        'Défense':               '/defense/',
        'Société':               '/societe/',
        'Police-Justice':        '/police-justice/',
        'Santé':                 '/sante/',
        'Éducation':             '/education/',
        'Environnement':         '/environnement/',
        'Technologie-Sciences':  '/technologie-sciences/',
        'Culture-Idées':         '/culture-idees/',
        'Médias':                '/medias/',
        'Sport':                 '/sport/',
        'Social':                '/social/',
        'Brèves':                '/breves/',
    }

    def parse_index(self):
        """Build the feed list: one feed per section, articles from the last 7 days.

        Raises ValueError when no section yields any article (so calibre
        reports a clear failure instead of producing an empty book).
        """
        br       = self.get_browser()
        feeds    = []
        today    = datetime.now(ZoneInfo('Europe/Paris'))
        week_ago = today - timedelta(days=7)

        for section_title, section_url in self.SECTIONS.items():
            articles = []
            try:
                url  = 'https://www.lecanardenchaine.fr' + section_url
                soup = self.index_to_soup(br.open(url).read())

                for article in soup.findAll('article', {'class': 'article-item'}):
                    link     = article.find('a', href=True)
                    date_div = article.find('div', {'class': 'article-item__date'})

                    if not (link and date_div):
                        continue

                    href         = link['href']
                    title        = link.get_text(separator=' ').strip()
                    time_element = date_div.find('time')

                    if not (time_element and time_element.get('datetime')):
                        continue

                    # fromisoformat() only accepts a trailing 'Z' (UTC
                    # designator) from Python 3.11; normalize it first.
                    raw_dt = time_element['datetime'].replace('Z', '+00:00')
                    article_date = datetime.fromisoformat(raw_dt)
                    # Keep only articles strictly newer than one week ago.
                    if article_date.date() <= week_ago.date():
                        continue

                    if not href.startswith('http'):
                        href = 'https://www.lecanardenchaine.fr' + href

                    articles.append({
                        'title':       title,
                        'url':         href,
                        'description': f"Publié le {article_date.strftime('%d/%m/%Y')}",
                        'date':        article_date.strftime('%a, %d %b %Y %H:%M:%S %z'),
                    })

            except Exception as e:
                # A broken section must not abort the whole download.
                self.log.warning(f'Erreur sur {section_title}: {e}')

            if articles:
                feeds.append((section_title, articles))

        if not feeds:
            raise ValueError('Aucun article trouvé')
        return feeds
alphonk is offline   Reply With Quote
Old Yesterday, 10:40 PM   #2
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 46,099
Karma: 29579912
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
https://github.com/kovidgoyal/calibr...7ee24c0fae5b75
kovidgoyal is online now   Reply With Quote
Advert
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Tolino Shine 2 HD bricked after failed 12.2.0 update – stuck in update loop, no USB a Minzmarshmallow Tolino 4 08-01-2025 05:15 PM
Le Canard Enchainé (French) - 2025 recipe alphonk Recipes 0 01-18-2025 10:52 AM
Le Canard Enchainé (French) - recipe alphonk Recipes 1 01-18-2025 10:51 AM
How can I update creation time for a title when I update the book info or convert it setherd Calibre 3 10-30-2010 01:51 PM


All times are GMT -4. The time now is 09:34 AM.


MobileRead.com is a privately owned, operated and funded community.