Member
Posts: 10
Karma: 10
Join Date: Dec 2024
Device: kindle scribe
|
Zérodeux (revue d'art contemporain) (French) -recipe
Zérodeux (revue d'art contemporain) (French) -recipe
Quote:
Code:
#!/usr/bin/env python
'''
zerodeux.fr
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ZeroDeuxRecipe(BasicNewsRecipe):
    """Calibre recipe for Zérodeux (zerodeux.fr), a French contemporary-art review.

    Articles are collected from the site's per-category RSS feeds. Each
    article page is reduced to its ``<article id="single-post">`` element,
    share/related-content blocks are removed, and inline styling is stripped
    so that ``extra_css`` fully controls the rendering.
    """

    title = 'Zérodeux'
    __author__ = 'Kabonix'
    description = 'Revue d\'art contemporain trimestrielle'
    publisher = 'Zérodeux'
    category = 'art, contemporary art, criticism'
    language = 'fr'
    encoding = 'utf-8'

    oldest_article = 60
    max_articles_per_feed = 25
    no_stylesheets = True
    remove_javascript = True
    auto_cleanup = False

    # Site root: fetched for the cover image and used to absolutize
    # root-relative image paths.
    _SITE_URL = 'https://www.zerodeux.fr'

    feeds = [
        ('Essais', 'https://www.zerodeux.fr/category/essais/feed/'),
        ('Guests', 'https://www.zerodeux.fr/category/guests/feed/'),
        ('Interviews', 'https://www.zerodeux.fr/category/interviews/feed/'),
        ('Reviews', 'https://www.zerodeux.fr/category/reviews/feed/'),
        ('News', 'https://www.zerodeux.fr/category/news/feed/'),
        ('Special Web', 'https://www.zerodeux.fr/category/specialweb/feed/'),
    ]

    keep_only_tags = [
        dict(name='article', attrs={'id': 'single-post'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['single-associate', 'single-info', 'transition']}),
        dict(name='div', attrs={'id': ['sidebar', 'menu_footer']}),
        # Removes the "Partage" (share) block.
        dict(name='ul', attrs={'class': ['single-info']}),
        # Removes the "Du même auteur" (by the same author) heading.
        dict(name='li', text='Du même auteur :'),
        # Removes the related-articles paragraph.
        dict(name='p', attrs={'class': ['recomand']}),
    ]

    extra_css = '''
        h1 { font-size: 1.8em; font-weight: bold; margin: 0 0 1em 0; }
        h2 { font-size: 1.4em; font-weight: bold; margin: 1em 0; }
        .single-author { font-style: italic; margin-bottom: 1.5em; color: #666; }
        img { max-width: 100%; height: auto; margin: 1em auto; }
        figcaption { font-size: 0.9em; font-style: italic; color: #666; margin: 0.5em 0 1.5em 0; }
        p { margin-bottom: 1em; line-height: 1.5; }
        ol { margin: 1em 0 1em 2em; }
        blockquote { margin: 1em 0; padding: 0 1em; border-left: 3px solid #ccc; }
        .wp-block-image { margin: 1.5em 0; }
        .has-small-font-size { font-size: 0.9em; }
    '''

    @classmethod
    def _absolute_url(cls, url):
        """Return ``url`` made absolute against the site root.

        ``//host/path`` (protocol-relative) only needs a scheme; prefixing
        the site root to it would yield a broken address, so it is handled
        before the plain root-relative ``/path`` case. Any other value is
        returned unchanged.
        """
        if url.startswith('//'):
            return 'https:' + url
        if url.startswith('/'):
            return cls._SITE_URL + url
        return url

    def get_cover_url(self):
        """Return the cover image URL found on the home page, or ``None``.

        The cover is the ``img.rl-image-widget-image`` inside the
        ``#responsive_lightbox_image_widget-2`` widget. ``None`` is returned
        when the widget, the image, or its ``src`` is missing.
        """
        soup = self.index_to_soup(self._SITE_URL)
        widget = soup.find('div', {'id': 'responsive_lightbox_image_widget-2'})
        if widget is None:
            return None
        cover_img = widget.find('img', {'class': 'rl-image-widget-image'})
        if cover_img is None:
            return None
        src = cover_img.get('src')
        return self._absolute_url(src) if src else None

    def preprocess_html(self, soup):
        """Clean an article's markup and return the same ``soup``.

        Removes inline ``style`` attributes, drops responsive/lazy-loading
        and sizing attributes from images, makes site-relative image URLs
        absolute, and deletes the leftover "Partage :" / "Du même auteur :"
        text labels.
        """
        # Strip inline styles so only extra_css applies.
        for tag in soup.findAll(True):
            if 'style' in tag.attrs:
                del tag['style']

        # Simplify images down to a plain, absolute ``src``.
        unwanted_img_attrs = (
            'srcset', 'sizes', 'loading', 'class',
            'width', 'height', 'decoding', 'fetchpriority',
        )
        for img in soup.findAll('img'):
            for attr in unwanted_img_attrs:
                if attr in img.attrs:
                    del img[attr]
            src = img.get('src', '')
            if src.startswith('/'):
                img['src'] = self._absolute_url(src)

        # Remove the bare "share" / "by the same author" labels that remain
        # as text nodes once their surrounding blocks are gone.
        for text in soup.findAll(text=True):
            if text.strip() in ('Partage :', 'Du même auteur :'):
                text.extract()

        return soup

    def populate_article_metadata(self, article, soup, first):
        """Set ``article.author`` from the page's ``div.single-author`` byline.

        A leading "par " ("by "), in either capitalization, is dropped.
        ``article.author`` is left untouched when there is no byline element
        or when the byline is empty. ``first`` is not used.
        """
        byline = soup.find('div', {'class': 'single-author'})
        if byline is None:
            return
        author = byline.get_text().strip()
        if author[:4].lower() == 'par ':
            author = author[4:].strip()
        if author:
            article.author = author
|
Last edited by PeterT; 01-18-2025 at 05:57 PM.
|