#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Recette Calibre — Le Canard Enchaîné
# Auteur : Kabonix
# Le contenu premium est dans le HTML (CSS paywall) — pas besoin de login
from datetime import datetime, timedelta
from urllib.parse import urljoin
from zoneinfo import ZoneInfo

from calibre.web.feeds.news import BasicNewsRecipe
class LeCanardEnchaine(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from recent Le Canard Enchaîné articles.

    Each section page listed in ``SECTIONS`` is scraped for article teasers;
    teasers newer than ``oldest_article`` days are kept.  Requests are sent
    with Googlebot-like headers and no login is performed: according to the
    original author's notes, the full article text is present in the HTML and
    merely hidden by CSS classes, which ``preprocess_html`` clears.
    """

    title = 'Le Canard Enchaîné'
    __author__ = 'Kabonix'
    description = 'Articles du Canard Enchaîné (sans login — CSS paywall)'
    language = 'fr'
    no_stylesheets = True
    auto_cleanup = False
    remove_javascript = True
    max_image_width = 600
    max_image_height = 800
    # Maximum article age in days; parse_index() uses it as its cutoff window.
    oldest_article = 7

    # Site endpoints and fallbacks, gathered here so they are defined once.
    BASE_URL = 'https://www.lecanardenchaine.fr'
    SHOP_URL = 'https://boutique.lecanardenchaine.fr/acheter-au-numero/'
    DEFAULT_COVER_URL = (
        'https://image.ausha.co/'
        '2x1H3rkhwjmSwAa8KzIFfcN0G9GxfJWY83UafXn8_400x400.jpeg'
    )
    # Time zone in which "today" and article dates are compared.
    TIMEZONE = 'Europe/Paris'

    # Googlebot user agent: per the original author's note, the site serves
    # the complete content to Google's crawler for indexing purposes.
    browser_user_agent = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'

    def get_browser(self, *args, **kwargs):
        """Return the base browser with robots.txt handling off and crawler-like headers.

        All default headers are replaced by the user agent above, a Google
        referer, an ``X-Forwarded-For`` address and a French
        ``Accept-Language``.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_handle_robots(False)
        br.addheaders = [
            ('User-Agent', self.browser_user_agent),
            ('Referer', 'https://www.google.fr/'),
            # Described by the original author as an official Googlebot IP.
            ('X-Forwarded-For', '66.249.66.1'),
            ('Accept-Language', 'fr-FR,fr;q=0.9'),
        ]
        return br

    # ------------------------------------------------------------------ #
    # Cover — scraped from the shop page                                   #
    # ------------------------------------------------------------------ #
    def get_cover_url(self):
        """Return the cover image URL of the first issue listed on the shop page.

        The first ``<li class="list-item">`` is inspected; its image's first
        ``srcset`` candidate is preferred over ``src``.  The value is resolved
        against the shop page URL, so relative, root-relative and absolute
        image URLs all yield a valid result.  Any failure is logged and the
        static default cover is returned instead (best effort: a missing
        cover must not abort the download).
        """
        try:
            br = self.get_browser()
            soup = self.index_to_soup(br.open(self.SHOP_URL).read())
            item = soup.find('li', {'class': 'list-item'})
            img = item.find('img') if item is not None else None
            if img is not None:
                # srcset is "url descriptor, url descriptor, ..."; take the
                # first URL and drop a comma glued to it, if any.
                candidates = (img.get('srcset') or '').split()
                source = candidates[0].rstrip(',') if candidates else ''
                source = source or img.get('src')
                if source:
                    return urljoin(self.SHOP_URL, source)
        except Exception as e:
            self.log.warning(f'Couverture indisponible, image par défaut utilisée : {e}')
        return self.DEFAULT_COVER_URL

    # ------------------------------------------------------------------ #
    # Content selection                                                    #
    # ------------------------------------------------------------------ #
    keep_only_tags = [
        dict(name='div', attrs={'class': 'article__heading'}),
        dict(name='div', attrs={'class': 'editorial'}),
    ]
    remove_tags = [
        dict(name=['script', 'style', 'nav', 'header', 'footer', 'button', 'form']),
        dict(name='div', attrs={'class': [
            'share-mobile', 'share-sticky', 'article__author',
            'article__tags', 'list-breadcrumb', 'modal',
        ]}),
    ]
    extra_css = '''
        h1, h2 { font-size: 1.2em !important; font-weight: bold; }
        .editorial__chapo { font-style: italic; margin-bottom: 1em; }
        p { line-height: 1.5; }
        a { color: black !important; text-decoration: none !important; }
        .zoom { border-left: 3px solid #ccc; padding-left: 1em; margin: 1em 0; }
    '''

    # ------------------------------------------------------------------ #
    # CSS paywall bypass: only the paywall markers are cleared, since the  #
    # content is (per the original author) already present in the HTML.    #
    # ------------------------------------------------------------------ #
    def preprocess_html(self, soup):
        """Blank the ``id``/``class`` markers of paywall-related ``<div>`` elements.

        Returns the same soup, modified in place.
        """
        # Unlock content hidden by CSS.
        for div in soup.findAll('div', attrs={'id': 'paywall'}):
            div['class'] = ''
            div['id'] = ''
        # Same treatment for the class-based markers, including the
        # "non-paywall" overlay blocks.
        for marker in ('paywall', 'non-paywall'):
            for div in soup.findAll('div', attrs={'class': marker}):
                div['class'] = ''
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip every attribute except ``href``, ``src`` and ``class`` from all tags."""
        kept = {'href', 'src', 'class'}
        for tag in soup.findAll(True):
            # Iterate over a snapshot: attributes are deleted while looping.
            for attr in list(tag.attrs):
                if attr not in kept:
                    del tag[attr]
        return soup

    # ------------------------------------------------------------------ #
    # Section index                                                        #
    # ------------------------------------------------------------------ #
    SECTIONS = {
        'Politique': '/politique/',
        'Économie': '/economie/',
        'International': '/international/',
        'Défense': '/defense/',
        'Société': '/societe/',
        'Police-Justice': '/police-justice/',
        'Santé': '/sante/',
        'Éducation': '/education/',
        'Environnement': '/environnement/',
        'Technologie-Sciences': '/technologie-sciences/',
        'Culture-Idées': '/culture-idees/',
        'Médias': '/medias/',
        'Sport': '/sport/',
        'Social': '/social/',
        'Brèves': '/breves/',
    }

    @staticmethod
    def _parse_timestamp(value, tz):
        """Parse an ISO-8601 ``datetime`` attribute; return ``None`` if it is malformed.

        A trailing ``Z`` is rewritten to ``+00:00`` because
        ``datetime.fromisoformat`` only accepts it from Python 3.11 on.
        Aware values are converted to *tz* so that their calendar date is
        comparable with the cutoff; naive values are returned unchanged.
        """
        text = value.strip()
        if text[-1:] in ('Z', 'z'):
            text = text[:-1] + '+00:00'
        try:
            parsed = datetime.fromisoformat(text)
        except ValueError:
            return None
        if parsed.tzinfo is not None:
            parsed = parsed.astimezone(tz)
        return parsed

    def _article_entry(self, node, page_url, cutoff, tz):
        """Build the article dict for one ``<article>`` teaser, or ``None`` to skip it.

        A teaser is skipped when it has no link, no ``<time datetime=...>``
        inside its date ``<div>``, an unparseable timestamp, or a date on or
        before *cutoff*.  The link is resolved against *page_url*.
        """
        link = node.find('a', href=True)
        date_div = node.find('div', {'class': 'article-item__date'})
        if link is None or date_div is None:
            return None
        time_element = date_div.find('time')
        stamp = time_element.get('datetime') if time_element is not None else None
        if not stamp:
            return None
        published = self._parse_timestamp(stamp, tz)
        if published is None:
            # One bad timestamp must not discard the rest of the section.
            self.log.warning(f'Date illisible ignorée : {stamp}')
            return None
        if published.date() <= cutoff:
            return None
        return {
            'title': link.get_text(separator=' ').strip(),
            'url': urljoin(page_url, link['href']),
            'description': f"Publié le {published.strftime('%d/%m/%Y')}",
            # strip(): %z expands to nothing for naive timestamps.
            'date': published.strftime('%a, %d %b %Y %H:%M:%S %z').strip(),
        }

    def parse_index(self):
        """Scrape every section page and return ``[(section_title, [article, ...]), ...]``.

        Only articles dated strictly after "today minus ``oldest_article``
        days" (calendar dates in ``TIMEZONE``) are kept.  A failure on one
        section is logged and does not prevent the other sections from being
        processed; articles already collected for that section are kept.

        Raises:
            ValueError: if no section yielded any article.
        """
        br = self.get_browser()
        tz = ZoneInfo(self.TIMEZONE)
        cutoff = (datetime.now(tz) - timedelta(days=self.oldest_article)).date()
        feeds = []
        for section_title, section_path in self.SECTIONS.items():
            page_url = urljoin(self.BASE_URL, section_path)
            articles = []
            try:
                soup = self.index_to_soup(br.open(page_url).read())
                for node in soup.findAll('article', {'class': 'article-item'}):
                    entry = self._article_entry(node, page_url, cutoff, tz)
                    if entry is not None:
                        articles.append(entry)
            except Exception as e:
                self.log.warning(f'Erreur sur {section_title}: {e}')
            if articles:
                feeds.append((section_title, articles))
        if not feeds:
            raise ValueError('Aucun article trouvé')
        return feeds