Thanks a lot; sadly it's not working for me. It's perfect in Calibre's e-book viewer, but the articles are randomly cut off on my Libra. It always cuts just before the links to other articles ("à lire aussi"). Kepub conversion didn't help, so I ran a book check and got 19 "Parsing failed: redefinition of the xmlns prefix is forbidden" errors (screenshot attached).
Here's the full recipe:
Spoiler:
#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe:
# 1) Summaries of the articles are now available
# 2) Additional sections International, France, Economie and Culture have
# been added through custom entries in the function my_parse_index.
# 3) Fix the cover image so it doesn't disappear from the Kindle menu
# (cover image format is changed to .jpeg)
# 14 Jan 2021 - Add Mediapart logo URL as masthead_url and change cover
# by overlaying the date on top of the Mediapart cover
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
'''
Mediapart
'''
import re
from datetime import date, datetime, timezone, timedelta
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
q = frozenset(classes.split(' '))
return dict(
attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
)
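# e.g. classes('a b') -> {'attrs': {'class': <match function>}}, matching any
# tag whose class list contains 'a' or 'b'; used in keep_only_tags below.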
class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Loïc Houpert'
description = 'Global news in French from news site Mediapart'
publication_type = 'newspaper'
language = 'fr'
needs_subscription = True
oldest_article = 2
use_embedded_content = False
no_stylesheets = True
keep_only_tags = [
dict(name='h1'),
dict(name='div', **classes('author')),
classes('news__heading__top__intro news__body__center__article')
]
remove_tags = [classes('login-subscribe print-source_url')]
conversion_options = {'smarten_punctuation': True}
masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
# cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
# --
# Get date in french time zone format
today = datetime.now(timezone.utc) + timedelta(hours=1)
oldest_article_date = today - timedelta(days=oldest_article)
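    # Note: UTC+1 approximates Paris time (CET) but ignores daylight saving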
feeds = [
('La Une', 'http://www.mediapart.fr/articles/feed'),
]
    # The feed at 'http://www.mediapart.fr/articles/feed' only returns the
    # last 10 items, so additional articles are indexed from the section
    # pages in the function my_parse_index. There the articles are parsed
    # by the nested function get_articles, driven by dict_article_sources.
def parse_feeds(self):
feeds = super(Mediapart, self).parse_feeds()
feeds += feeds_from_index(self.my_parse_index(feeds))
return feeds
def my_parse_index(self, la_une):
dict_article_sources = [
{
'type': 'Brèves',
'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
'separador': {
'page': 'ul',
'thread': 'li'
}
},
{
'type': 'International',
'webpage': 'https://www.mediapart.fr/journal/international',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'France',
'webpage': 'https://www.mediapart.fr/journal/france',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'Économie',
'webpage': 'https://www.mediapart.fr/journal/economie',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'Culture',
'webpage': 'https://www.mediapart.fr/journal/culture-idees',
'separador': {
'page': 'div',
'thread': 'div'
}
},
]
def get_articles(
type_of_article, webpage, separador_page='ul', separador_thread='li'
):
specific_articles = []
webpage_article = []
soup = self.index_to_soup(webpage)
page = soup.find('main', {'class': 'global-wrapper'})
fils = page.find(separador_page, {'class': 'post-list universe-journal'})
all_articles = fils.findAll(separador_thread)
for article in all_articles:
try:
title = article.find('h3', recursive=False)
if title is None or ''.join(title['class']) == 'title-specific':
# print(f"[BAD title entry] Print value of title:\n {title}")
continue
# print(f"\n[OK title entry] Print value of title:\n {title}\n")
try:
article_mot_cle = article.find(
'a', {
'href': re.compile(r'.*\/mot-cle\/.*')
}
).renderContents().decode('utf-8')
except Exception:
article_mot_cle = ''
try:
article_type = article.find(
'a', {
'href': re.compile(r'.*\/type-darticles\/.*')
}
).renderContents().decode('utf-8')
except Exception:
article_type = ''
for s in title('span'):
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href']
date = article.find('time', datetime=True)['datetime']
article_date = datetime.strptime(date, '%Y-%m-%d')
                    # Attach the French timezone to the article date for the age check
article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
if article_date < self.oldest_article_date:
print("article_date < self.oldest_article_date\n")
continue
# print("-------- Recent article added to the list ------- \n")
all_authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')}
)
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <a>: {authors}")
                    # If no link to the author profile is available, the
                    # html separator is a span tag
if not all_authors:
try:
all_authors = article.findAll(
'span', {'class': re.compile(r'\bjournalist\b')}
)
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <span>: {authors}")
                        except Exception:
                            # keep a list so ', '.join(authors) below works
                            authors = ['unknown']
description = article.find('p').renderContents().decode('utf-8')
# print(f" <p> in article : {self.tag_to_string(description).strip()} ")
summary = {
'title': self.tag_to_string(title).strip(),
'description': description,
'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
'author': ', '.join(authors),
'article_type': article_type,
'mot_cle': article_mot_cle.capitalize(),
'url': 'https://www.mediapart.fr' + url,
}
webpage_article.append(summary)
except Exception:
pass
specific_articles += [(type_of_article,
webpage_article)] if webpage_article else []
return specific_articles
articles = []
for category in dict_article_sources:
articles += get_articles(
category['type'], category['webpage'], category['separador']['page'],
category['separador']['thread']
)
return articles
    # Non-locale-specific date parsing (strptime('%d %b %Y', s) would work
    # under a French locale)
def parse_french_date(self, date_str):
date_arr = date_str.lower().split()
return date(
day=int(date_arr[0]),
year=int(date_arr[2]),
month=[
None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
].index(date_arr[1])
)
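    # e.g. parse_french_date('11 janvier 2021') -> datetime.date(2021, 1, 11)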
def get_browser(self):
# -- Handle login
def is_form_login(form):
return "id" in form.attrs and form.attrs['id'] == "logFormEl"
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('https://www.mediapart.fr/login')
br.select_form(predicate=is_form_login)
br['name'] = self.username
br['password'] = self.password
br.submit()
return br
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that don't have a cover
'''
from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data
def init_environment():
ensure_app()
load_builtin_fonts()
def create_cover_mediapart(date):
' Create a cover for mediapart adding the date on Mediapart Cover'
init_environment()
# Get data
image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
data = self.index_to_soup(image_url, raw=True)
# Get date and hour corresponding to french time zone
today = datetime.now(timezone.utc) + timedelta(hours=1)
wkd = today.weekday()
            french_weekday = {0: 'Lun', 1: 'Mar', 2: 'Mer', 3: 'Jeu', 4: 'Ven', 5: 'Sam', 6: 'Dim'}
day = french_weekday[wkd]+'.'
date = day + ' ' + today.strftime('%d %b. %Y')
edition = today.strftime('Édition de %Hh')
# Get Cover data
img = QImage()
img.loadFromData(data)
# Overlay date on cover
p = QPainter(img)
pen = QPen(Qt.black)
pen.setWidth(6)
p.setPen(pen)
font = QFont()
font.setFamily('Times')
font.setPointSize(78)
p.setFont(font)
            r = QRect(0, 600, 744, 100)
p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, date)
p.end()
# Overlay edition information on cover
p = QPainter(img)
pen = QPen(Qt.black)
pen.setWidth(4)
p.setPen(pen)
font = QFont()
font.setFamily('Times')
font.setItalic(True)
font.setPointSize(66)
p.setFont(font)
            # Add edition information
            r = QRect(0, 720, 744, 100)
p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, edition)
p.end()
return pixmap_to_data(img)
try:
            today = datetime.today()
            date = today.strftime('%d %b %Y')
img_data = create_cover_mediapart(date)
cover_file.write(img_data)
cover_file.flush()
except Exception:
self.log.exception('Failed to generate default cover')
return False
return True
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'
I'm assuming those links are the issue, at least on the Kobo Libra (that's where the articles cut off and where the errors show up), but I have no idea how to fix this.
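Would stripping the inline namespace declarations before conversion be the right direction? A completely untested sketch, added to the recipe class (preprocess_raw_html is a standard BasicNewsRecipe hook; the regex is only my guess at what the duplicated declarations look like):

def preprocess_raw_html(self, raw_html, url):
    # Drop inline xmlns / xmlns:prefix declarations from the downloaded
    # pages; duplicated declarations are what triggers 'redefinition of
    # the xmlns prefix is forbidden' when the book is checked.
    return re.sub(r'\s+xmlns(:[-\w]+)?="[^"]*"', '', raw_html)

And if the "à lire aussi" boxes themselves have to go, maybe an extra class in remove_tags would drop them before conversion ('news__related' is just a placeholder, I don't know the real class name):

remove_tags = [classes('login-subscribe print-source_url news__related')]

Can anyone confirm whether that's the right approach?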