Alternatives Economiques (French) - recipe
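This recipe pulls up to 10 recent articles from each section of alternatives-economiques.fr (about 90 thematic sections, plus the 'grands formats' and 'dessin' pages) and grabs the current issue's cover from the MLP distributor catalogue.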
Code:
#!/usr/bin/env python
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AlternativesEconomiques(BasicNewsRecipe):
title = 'Alternatives Économiques'
__author__ = 'Kabonix'
    description = 'Articles from all sections (up to 10 articles per section)'
publisher = 'Alternatives Économiques'
language = 'fr'
oldest_article = 90
max_articles_per_feed = 10
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
auto_cleanup = False
remove_empty_feeds = True
remove_images = False
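    # MLP (presumably Messageries Lyonnaises de Presse, the magazine's press
    # distributor) publishes a catalogue page showing the current issue's cover.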
def get_cover_url(self):
"""Récupère dynamiquement l'URL de la dernière une depuis MLP"""
br = self.get_browser()
try:
# Accéder à la page du magazine sur MLP
soup = self.index_to_soup(br.open('https://catalogueproduits.mlp.fr/produit.aspx?tit_code=1BMwusijpg0%3D').read())
# Chercher la div qui contient les images
gallery = soup.find('div', id='gallery')
if gallery:
img = gallery.find('img', id='couverture_1')
if img and img.get('src'):
cover_url = img['src']
if not cover_url.startswith('http'):
cover_url = 'https://catalogueproduits.mlp.fr/' + cover_url
self.log('Cover URL found:', cover_url)
return cover_url
            self.log('No cover found, using the default image')
return 'https://www.alternatives-economiques.fr/sites/all/themes/alternatives-economiques-main/assets/logo-alternatives-economiques.svg'
except Exception as e:
            self.log.error('Error while fetching the cover:', str(e))
return 'https://www.alternatives-economiques.fr/sites/all/themes/alternatives-economiques-main/assets/logo-alternatives-economiques.svg'
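    # Article URLs end in an eight-digit identifier that starts with "00",
    # e.g. /economie/00112233 (the ID here is only illustrative).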
def is_article_url(self, url):
article_pattern = re.compile(r'/[^/]+/00\d{6}$')
return bool(article_pattern.search(url))
def parse_index(self):
articles = []
base_url = 'https://www.alternatives-economiques.fr'
thematiques = [
'biodiversite', 'ideesdebats', 'entreprise', 'europe', 'direct-de-recherche',
'asie', 'a-la-carte', 'chine', 'culture', 'des-idees-pour-sortir-de-la-crise',
'amerique-du-sud', 'idees-0', 'transport', 'services-publics', 'allemagne',
'face-a-face', 'politique-monetaire', 'logement', 'climat', 'innovation',
'agir', 'economie-sociale-et-solidaire', 'societe', 'theorie', 'revenus',
'tribune', 'conditions-de-travail', 'familles', 'politique', 'numerique',
'histoire', 'sociorama', 'mondialisation', 'opinions', 'consommation',
'dette', 'assurance-chomage', 'finance', 'temps-de-travail', 'emploi',
'protection-sociale', 'refugies', 'royaume-uni', 'conjoncture', 'population',
'libertes', 'jeunes', 'droit-du-travail', 'responsabilite-sociale', 'industrie',
'economie', 'ukraine', 'immigration', 'travail', 'etats-unis',
'inflation', 'relations-sociales', 'inegalites', 'management', 'energie',
'environnement', 'fiscalite', 'social', 'elections-europeennes', 'retraites',
'international', 'commerce-exterieur', 'sante', 'gestion', 'salaires',
'education', 'lanceurs-dalerte', 'japon', 'geopolitique', 'afrique',
'commerce', 'politiques-publiques', 'budget', 'grece', 'genre',
'services', 'pollution', 'agriculture', 'legislatives', 'chomage',
'graphorama', 'formation', 'budget-2025', 'territoires', 'espagne'
]
special_sections = {
'grands-formats': '/grands-formats',
'dessin': '/dessin'
}
sections = {t: f'/thematiques/{t}' for t in thematiques}
sections.update(special_sections)
for section_name, section_path in sections.items():
url = f'{base_url}{section_path}'
self.log('Analyzing section:', url)
try:
soup = self.index_to_soup(url)
feed_articles = self.extract_articles(soup, base_url)
if feed_articles:
display_name = section_name.replace('-', ' ').title()
articles.append((display_name, feed_articles[:self.max_articles_per_feed]))
except Exception as e:
self.log.error(f'Error processing {section_name}: {str(e)}')
continue
return articles
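    # Each article page is fetched up front just to read its <h1>; with ~90
    # sections this is slow, but it gives cleaner titles than the link text.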
def extract_articles(self, soup, base_url):
feed_articles = []
processed_urls = set()
for link in soup.find_all('a', href=True):
article_url = link['href']
if self.is_article_url(article_url):
if not article_url.startswith('http'):
article_url = base_url + article_url
if article_url in processed_urls:
continue
processed_urls.add(article_url)
try:
article_soup = self.index_to_soup(article_url)
h1_title = article_soup.find('h1', class_='o-head__title')
if h1_title:
title = h1_title.get_text().strip()
else:
title_elem = link.find('h2')
if title_elem:
title = title_elem.get_text().strip()
else:
title = link.get_text().strip()
if not title:
title = article_url.split('/')[-2].replace('-', ' ').title()
if title:
feed_articles.append({
'title': title,
'url': article_url,
'description': ''
})
except Exception as e:
self.log.error(f'Error getting H1 title for {article_url}: {str(e)}')
continue
return feed_articles
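    # The selectors below match the site's current article markup; if the
    # templates change, these class names will need updating.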
keep_only_tags = [
dict(name='h1', class_='o-head__title'),
dict(name='div', class_='chapo'),
dict(name='time', class_='o-infos__date-full'),
dict(name='div', class_='o-page__content__who'),
dict(name='div', class_='field-item even'),
dict(name='div', attrs={'property': 'content:encoded'})
]
remove_tags = [
dict(name=['script', 'style', 'iframe', 'svg', 'audio', 'video', 'button', 'form', 'input']),
dict(name='div', class_=[
'c-article__social', 'social-buttons', 'social-sharing', 'social-media',
'share-buttons', 'share-links', 'social-links', 'social-icons',
'embedded-content', 'embed-container', 'embed-wrapper', 'media-embed',
'twitter-embed', 'facebook-embed', 'social-embed',
'c-kiosk--single', 'c-comments', 'c-article__toolbar',
'c-article__related', 'c-epigraph', 'newsletter-signup',
'twitter-tweet', 'twitter-timeline', 'twitter-follow-button',
'c-footer__promo', 'o-page__block--offset--invert',
'newsletter-form', 'newsletter-block', 'newsletter',
'c-kiosk--single__content', 'c-kiosk--single__figure',
'c-kiosk--single__body', 'c-kiosk--single__cta',
'field-name-field-issue-cover'
])
]
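    # CSS tuned for e-readers: keep figures and captions together on a page
    # and style the standfirst ('chapo') and byline blocks kept above.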
extra_css = '''
body { line-height: 1.6; margin: 1em; }
h1 { font-size: 1.8em; margin-bottom: 0.5em; font-weight: bold; }
.chapo { font-style: italic; margin: 1em 0; font-size: 1.2em; }
.o-infos__date-full { color: #666; margin: 0.5em 0; font-size: 0.9em; }
.o-page__content__who { color: #333; margin: 0.5em 0; font-weight: bold; }
p { margin: 0.8em 0; }
a {
text-decoration: none !important;
color: inherit !important;
}
.o-page__figure-full {
break-inside: avoid;
margin: 1em 0;
page-break-inside: avoid;
}
.o-page__figure-full figcaption {
font-style: italic;
text-align: center;
margin-top: 0.5em;
font-size: 0.9em;
color: #666;
}
'''
    def preprocess_html(self, soup):
        # Remove unwanted tags
        for tag in soup.find_all(['script', 'style', 'iframe', 'svg', 'audio', 'video']):
            tag.decompose()
        # Strip clutter attributes, but keep 'class' so the selectors in
        # extra_css (.chapo, .o-infos__date-full, ...) still match
        allowed_attrs = {'src', 'href', 'alt', 'title', 'class'}
        for tag in soup.find_all(True):
            if tag.name not in ['a', 'img']:
                tag.attrs = {k: v for k, v in tag.attrs.items() if k in allowed_attrs}
        return soup
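To try the recipe from the command line before adding it to calibre (the file name here is just an example):
Code:
ebook-convert alternatives_economiques.recipe output.epub --test -vv
The --test flag restricts the download to a couple of articles from the first two feeds, so you don't have to wait for all ~90 sections.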