Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 04-25-2025, 11:21 AM   #1
alphonk
Member
alphonk began at the beginning.
 
Posts: 11
Karma: 10
Join Date: Dec 2024
Device: kindle scribe
(uk) Frieze (international contemporary art magazine)

(uk) Frieze (international contemporary art magazine)


Spoiler:
#!/usr/bin/env python
'''
frieze.com - Contemporary art magazine
'''
import re
from datetime import datetime
from urllib.parse import urljoin
from zoneinfo import ZoneInfo

from calibre.web.feeds.news import BasicNewsRecipe

class FriezeMagazineRecipe(BasicNewsRecipe):
    """Calibre news recipe for the latest issue of Frieze magazine.

    ``parse_index`` opens the magazine listing page, follows the first link
    that looks like an issue, records a cover image when one is found and
    returns the issue's articles as a single feed. ``preprocess_html`` then
    strips sharing/advertising markup and normalises images in each article.
    """

    title = 'Frieze Magazine'
    # calibre reads the recipe author from ``__author__``; ``author`` is kept
    # so anything that already reads the old attribute keeps working.
    __author__ = 'Kabonix'
    author = 'Kabonix'
    description = 'Magazine international d\'art contemporain'
    language = 'en'
    oldest_article = 60
    max_articles_per_feed = 50
    auto_cleanup = False
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    scale_news_images_to_device = True

    base_url = 'https://www.frieze.com'

    # URL fragments that mark a link as a real article rather than
    # navigation or another internal page.
    _ARTICLE_PATH_MARKERS = ('/article/', '/feature/', '/review/')

    # "ad-" only counts at the start of a name or after a separator, so
    # unrelated names such as "head-1", "lead-text" or "load-more" survive.
    _AD_TOKEN = re.compile(r'(?:^|[-_\s])ad-')

    keep_only_tags = [
        {'class': 'article-header-container'},
        {'class': 'article-header-title'},
        {'class': 'article-header-headline'},
        {'class': 'article-header-author'},
        {'class': 'body-text'},
        {'class': 'body-field'},
    ]

    remove_tags = [
        {'class': 'social-share-container'},
        {'class': 'share-buttons'},
        {'class': 'article-header-social-responsive'},
        {'class': 'newsletter-subscribe-container'},
        {'class': 'suggested-articles-container'},
        {'class': 'article-footer-container'},
        {'class': 'ad-surround'},
        {'id': _AD_TOKEN},
        {'class': 'hidden'},
    ]

    extra_css = '''
        img { max-width: 100%; height: auto; display: block; margin: 1em auto; }
        h1 { font-size: 2em; margin: 1em 0; font-weight: bold; }
        h2 { font-size: 1.5em; margin: 1em 0; font-weight: bold; }
        p { font-size: 1.1em; line-height: 1.6; margin-bottom: 1em; }
        figcaption { font-style: italic; font-size: 0.9em; color: #555; text-align: center; }
        em { font-style: italic; }
        .article-header-title { margin-bottom: 0.5em; }
        .article-header-headline { margin-bottom: 1.5em; }
        .article-header-author { font-size: 0.9em; margin-bottom: 2em; color: #555; }
    '''

    def parse_index(self):
        """Build the index for the most recent issue.

        Side effect: sets ``self.cover_url`` when a cover image is found on
        the issue page.

        Returns:
            A one-element list ``[(issue_title, articles)]``; each article is
            a dict with ``title``, ``url``, ``description``, ``author`` and
            ``date`` keys.
        """
        magazine_url = self.base_url + '/magazines/frieze-magazine'
        listing_soup = self.index_to_soup(magazine_url)

        issue_url = self._find_latest_issue_url(listing_soup, magazine_url)
        self.log.info(f"Accès au numéro: {issue_url}")
        issue_soup = self.index_to_soup(issue_url)

        cover_image = self._find_cover_image(issue_soup)
        if cover_image:
            # urljoin also copes with protocol-relative ("//cdn/...") and
            # path-relative sources, which plain concatenation mangles.
            self.cover_url = urljoin(self.base_url, cover_image)
            self.log.info(f"Couverture trouvée: {self.cover_url}")
        else:
            self.log.warning("Pas d'image de couverture trouvée")

        issue_title = self._find_issue_title(issue_soup)
        articles = self._collect_articles(issue_soup, issue_url)

        self.log.info(f"Nombre d'articles trouvés: {len(articles)}")
        return [(issue_title, articles)]

    def _find_latest_issue_url(self, soup, magazine_url):
        """Return the URL of the first issue linked from the listing page.

        Falls back to ``magazine_url`` itself when no issue link is found.
        """
        containers = soup.find_all('div', {'class': re.compile('teaser-search-col')})
        if not containers:
            # The specific class was not found: widen the search.
            self.log.warning("Recherche élargie des numéros du magazine")
            containers = soup.find_all('div', {'class': re.compile('teaser')})

        issue_href = re.compile('/magazines/|/issues/')
        for container in containers:
            # Use the anchor that actually matched the issue pattern, not
            # merely the first anchor inside the container.
            link = container.find('a', href=issue_href)
            if link is None:
                continue
            issue_url = urljoin(magazine_url, link['href'])
            if issue_url.rstrip('/') == magazine_url.rstrip('/'):
                # A link back to the listing page is not an issue.
                continue
            self.log.info(f"Dernier numéro trouvé: {issue_url}")
            return issue_url

        self.log.warning("Aucun numéro spécifique trouvé, utilisation de la page principale")
        return magazine_url

    def _find_cover_image(self, issue_soup):
        """Return the ``src`` of the most likely cover image, or ``None``."""
        # Attempts 1 and 2: first image of the issue header, then of the
        # main content section.
        header = issue_soup.find(
            'div', class_=re.compile('magazine-header-image|issue-header-image'))
        main_section = issue_soup.find('section', class_=re.compile('main|content'))
        for scope in (header, main_section):
            if scope is None:
                continue
            img_tag = scope.find('img')
            if img_tag and img_tag.get('src'):
                return img_tag['src']

        # Attempt 3: among the first five images on the page, take one whose
        # source mentions "cover" or "header".
        for img_tag in issue_soup.find_all('img', src=True)[:5]:
            source = img_tag['src']
            lowered = source.lower()
            if 'cover' in lowered or 'header' in lowered:
                return source
        return None

    def _find_issue_title(self, issue_soup):
        """Return a title for the issue, using a fixed default as last resort."""
        headings = issue_soup.find_all(['h1', 'h2'])

        # Prefer a heading that explicitly mentions the issue or magazine.
        for heading in headings:
            lowered = heading.text.lower()
            if 'issue' in lowered or 'magazine' in lowered:
                return heading.text.strip()

        # Otherwise take the first non-empty heading among the first three.
        for heading in headings[:3]:
            text = heading.text.strip()
            if text:
                return text

        return 'Frieze Magazine - Dernier numéro'

    def _find_article_url(self, container, page_url):
        """Return the first article-like absolute URL in ``container``, or ``None``."""
        for link in container.find_all('a', href=True):
            href = link['href']
            if not href:
                continue
            candidate = urljoin(page_url, href)
            if any(marker in candidate for marker in self._ARTICLE_PATH_MARKERS):
                return candidate
        return None

    @staticmethod
    def _extract_author(container):
        """Return the author text of a teaser, or ``''`` when there is none."""
        author_element = container.find(
            ['div', 'span'], {'class': re.compile('author|byline')})
        if author_element is None:
            return ''
        author_links = author_element.find_all('a')
        if author_links:
            return ', '.join(a.text.strip() for a in author_links)
        return author_element.text.strip()

    def _collect_articles(self, issue_soup, issue_url):
        """Return the article dicts found on the issue page, without duplicates."""
        containers = issue_soup.find_all(
            'div', {'class': re.compile('teaser-content|article-teaser|article-item')})
        if not containers:
            # Widen the search when the specific teaser classes are absent.
            containers = issue_soup.find_all('div', {'class': re.compile('teaser|article')})

        # Every article carries the same date stamp, so compute it once.
        today = datetime.now(ZoneInfo('Europe/Paris')).strftime('%Y-%m-%d')

        articles = []
        seen_urls = set()
        for container in containers:
            article_url = self._find_article_url(container, issue_url)
            # Nested containers match the same article several times; keep
            # only the first occurrence of each URL.
            if article_url is None or article_url in seen_urls:
                continue
            seen_urls.add(article_url)

            title_element = container.find(
                ['div', 'h2', 'h3', 'h4'], {'class': re.compile('title|heading')})
            title = title_element.text.strip() if title_element else 'Sans titre'

            desc_element = container.find(
                ['div', 'p'], {'class': re.compile('deck|description|summary|excerpt')})
            description = desc_element.text.strip() if desc_element else ''

            articles.append({
                'title': title,
                'url': article_url,
                'description': description,
                'author': self._extract_author(container),
                'date': today,
            })
            self.log.debug(f"Article trouvé: {title} - {article_url}")

        return articles

    def get_cover_url(self):
        """Return the cover URL recorded by ``parse_index``, or ``None``."""
        return getattr(self, 'cover_url', None)

    def preprocess_html(self, soup):
        """Clean up one downloaded article page and return the same soup.

        Removes sharing/advertising elements and script/style tags, reduces
        every ``<img>`` to ``src`` and ``alt``, and applies inline styles to
        the article header blocks.
        """
        ad_token = self._AD_TOKEN

        def is_clutter(css_class):
            return bool(css_class) and (
                'share' in css_class or ad_token.search(css_class) is not None)

        # Drop sharing widgets and advertising blocks.
        for element in soup.find_all(class_=is_clutter):
            element.decompose()

        for tag in soup.find_all(['script', 'style']):
            tag.decompose()

        # Keep only src and alt on images; src falls back to the lazy-load
        # attributes when it is missing.
        for img in soup.find_all('img'):
            source = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
            alt_text = img.get('alt', '')
            kept = {}
            if source:
                kept['src'] = source
            if alt_text:
                kept['alt'] = alt_text
            img.attrs = kept

        # Style the header blocks. The class is retained so that the
        # matching ``.article-header-*`` rules in ``extra_css`` still apply.
        header_styles = (
            ('article-header-title', 'font-size: 2em; font-weight: bold;'),
            ('article-header-headline', 'font-style: italic;'),
            ('article-header-author', 'color: #555;'),
        )
        for class_name, style in header_styles:
            element = soup.find('div', class_=class_name)
            if element:
                element.attrs = {'class': [class_name], 'style': style}

        return soup

Last edited by theducks; 04-25-2025 at 11:52 AM. Reason: SPOILERED files
alphonk is offline   Reply With Quote
Old 04-25-2025, 12:44 PM   #2
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 45,345
Karma: 27182818
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
https://github.com/kovidgoyal/calibr...f47d8d888225c9
kovidgoyal is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Amazon Art: Online Marketplace for serious art fjtorres News 7 08-07-2013 07:25 AM
International Speculative Fiction magazine kennyc Reading Recommendations 8 07-06-2012 07:27 AM
Kindle 3 3G International Coverage US vs International model fidjit Amazon Kindle 4 02-04-2012 07:19 AM
magazine subscription - international location hurricane Amazon Kindle 0 09-01-2010 06:30 AM


All times are GMT -4. The time now is 06:57 PM.


MobileRead.com is a privately owned, operated and funded community.