Old 01-18-2025, 10:52 AM   #1
alphonk
Member
 
Posts: 10
Karma: 10
Join Date: Dec 2024
Device: kindle scribe
Le Canard Enchaîné (French) - 2025 recipe


Code:
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe

class LeCanardEnchaine(BasicNewsRecipe):
    title = 'Le Canard Enchaîné'
    __author__ = 'Kabonix'
    description = 'Articles from Le Canard Enchaîné'
    language = 'fr'
    no_stylesheets = True
    remove_javascript = True
    
    # Ask calibre for username/password (set in the recipe's preferences)
    needs_subscription = True
    
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_robots(False)  # ignore robots.txt so the login flow works
        
        if self.username and self.password:
            br.open('https://www.lecanardenchaine.fr/coin/identification?u=/')
            # The login form is selected by position; nr=13 matches the current page layout
            br.select_form(nr=13)
            br['_username'] = self.username
            br['_password'] = self.password
            br.submit()
        else:
            raise Exception('Login credentials are required. Please configure them in the recipe preferences.')
        
        return br

    # The rest of the code is unchanged
    keep_only_tags = [
        dict(name='div', attrs={'class': ['editorial', 'article__core']}),
        dict(name='div', attrs={'class': ['non-paywall', 'paywall']})
    ]
    
    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer']),
        dict(name='div', attrs={'class': ['social-share', 'comments', 'share-mobile', 'article__author', 'article__tags']})
    ]
    
    extra_css = '''
    body, p, div, h1, h2, h3, 
    .article__subtitle, .article__chapeau, .chapeau {
        font-size: 1em !important;
        line-height: 1.5 !important;
    }
    '''
    
    def get_cover_url(self):
        """Récupère dynamiquement l'URL de la dernière une"""
        br = self.get_browser()
        try:
            soup = self.index_to_soup(br.open('https://boutique.lecanardenchaine.fr/acheter-au-numero/').read())
            
            list_item = soup.find('li', {'class': 'list-item'})
            if list_item:
                img = list_item.find('img')
                if img and img.get('srcset'):
                    return 'https://boutique.lecanardenchaine.fr' + img['srcset'].split()[0]
                elif img and img.get('src'):
                    return 'https://boutique.lecanardenchaine.fr' + img['src']
            
            self.log.warn('No cover found, falling back to the default image')
            return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'
            
        except Exception:
            self.log.exception('Error while fetching the cover')
            return 'https://image.ausha.co/2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'

    SECTIONS = {
        'Politique': '/politique/',
        'Économie': '/economie/',
        'International': '/international/',
        'Défense': '/defense/',
        'Société': '/societe/',
        'Police-Justice': '/police-justice/',
        'Santé': '/sante/',
        'Éducation': '/education/',
        'Environnement': '/environnement/',
        'Technologie-Sciences': '/technologie-sciences/',
        'Culture-Idées': '/culture-idees/',
        'Médias': '/medias/',
        'Sport': '/sport/',
        'Social': '/social/',
        'Brèves': '/breves/'
    }
    
    def parse_index(self):
        br = self.get_browser()
        feeds = []
        
        for section_title, section_url in self.SECTIONS.items():
            print(f"Exploration de la rubrique : {section_title}")
            articles = []
            try:
                url = 'https://www.lecanardenchaine.fr' + section_url
                raw = br.open(url).read()
                soup = self.index_to_soup(raw)
                
                for link in soup.findAll('a', href=True):
                    href = link.get('href', '')
                    if section_url[1:-1] in href and href.count('/') == 2:
                        title = link.get_text().strip()
                        if title:
                            if not href.startswith('http'):
                                href = 'https://www.lecanardenchaine.fr' + href
                            articles.append({
                                'title': title,
                                'url': href,
                                'description': ''
                            })
                
                seen_urls = set()
                unique_articles = []
                for article in articles:
                    if article['url'] not in seen_urls:
                        seen_urls.add(article['url'])
                        unique_articles.append(article)
                
                if unique_articles:
                    feeds.append((section_title, unique_articles))
                    print(f"  {len(unique_articles)} articles trouvés")
                
            except Exception as e:
                self.log.error(f"Error in section {section_title}: {e}")
            
        return feeds
        
    def preprocess_html(self, soup):
        # Clear class attributes so paywall-related styling/scripts no longer hide the text
        for div in soup.findAll('div', attrs={'class': ['unlocked', 'paywall']}):
            div['class'] = ''
        return soup
    
    def postprocess_html(self, soup, first_fetch):
        # Keep only href, src and class attributes for a cleaner e-book
        for tag in soup.findAll(True):
            for attr in list(tag.attrs):
                if attr not in ['href', 'src', 'class']:
                    del tag[attr]
        return soup
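
To try it (standard calibre workflow): save the code above as a .recipe file and add it through calibre's "Add a custom news source" dialog (from the Fetch news menu), entering your credentials in the schedule dialog. You can also test it from the command line with ebook-convert; the filename below is only an example, and --test fetches just a couple of articles per section to keep the run short:

Code:
ebook-convert "Le Canard Enchaine.recipe" .epub --test --username you@example.com --password yourpassword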

Last edited by PeterT; 01-18-2025 at 05:59 PM.