Thanks a lot; sadly it's not working for me. It's perfect in Calibre's e-book viewer, but the articles are randomly cut off on my Libra. It always cuts just before the links to other articles ("à lire aussi"). Kepub conversion didn't help, so I ran a book check and got 19 "Parsing failed: redefinition of the xmlns prefix is forbidden" errors (screenshot attached).
Here's the full recipe:
Spoiler:
#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# 11 Jan 2021 - L. Houpert - Major changes in the Mediapart recipe:
# 1) Summaries of the articles are now available
# 2) Additional sections International, France, Economie and Culture have
# been added through custom entries in the function my_parse_index.
# 3) Fix the cover image so it doesn't disappear from the Kindle menu
# (cover image format is changed to .jpeg)
# 14 Jan 2021 - Add Mediapart logo URL as masthead_url and change cover
# by overlaying the date on top of the Mediapart cover
from __future__ import unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert' # noqa
'''
Mediapart
'''
import re
from datetime import date, datetime, timezone, timedelta
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
q = frozenset(classes.split(' '))
return dict(
attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
)
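# e.g. classes('a b') -> {'attrs': {'class': <match function>}}, matching any
# tag whose class list contains 'a' or 'b'; used in keep_only_tags below.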
class Mediapart(BasicNewsRecipe):
title = 'Mediapart'
__author__ = 'Loïc Houpert'
description = 'Global news in French from news site Mediapart'
publication_type = 'newspaper'
language = 'fr'
needs_subscription = True
oldest_article = 2
use_embedded_content = False
no_stylesheets = True
keep_only_tags = [
dict(name='h1'),
dict(name='div', **classes('author')),
classes('news__heading__top__intro news__body__center__article')
]
remove_tags = [classes('login-subscribe print-source_url')]
conversion_options = {'smarten_punctuation': True}
masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
# cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
# --
# Get date in french time zone format
today = datetime.now(timezone.utc) + timedelta(hours=1)
oldest_article_date = today - timedelta(days=oldest_article)
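    # Note: UTC+1 approximates Paris time (CET) but ignores daylight saving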
feeds = [
('La Une', 'http://www.mediapart.fr/articles/feed'),
]
    # The feed at 'http://www.mediapart.fr/articles/feed' only returns the
    # last 10 items, so additional articles are indexed from the section
    # pages in the function my_parse_index. There the articles are parsed
    # by the nested function get_articles, driven by dict_article_sources.
def parse_feeds(self):
feeds = super(Mediapart, self).parse_feeds()
feeds += feeds_from_index(self.my_parse_index(feeds))
return feeds
def my_parse_index(self, la_une):
dict_article_sources = [
{
'type': 'Brèves',
'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
'separador': {
'page': 'ul',
'thread': 'li'
}
},
{
'type': 'International',
'webpage': 'https://www.mediapart.fr/journal/international',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'France',
'webpage': 'https://www.mediapart.fr/journal/france',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'Économie',
'webpage': 'https://www.mediapart.fr/journal/economie',
'separador': {
'page': 'div',
'thread': 'div'
}
},
{
'type': 'Culture',
'webpage': 'https://www.mediapart.fr/journal/culture-idees',
'separador': {
'page': 'div',
'thread': 'div'
}
},
]
def get_articles(
type_of_article, webpage, separador_page='ul', separador_thread='li'
):
specific_articles = []
webpage_article = []
soup = self.index_to_soup(webpage)
page = soup.find('main', {'class': 'global-wrapper'})
fils = page.find(separador_page, {'class': 'post-list universe-journal'})
all_articles = fils.findAll(separador_thread)
for article in all_articles:
try:
title = article.find('h3', recursive=False)
if title is None or ''.join(title['class']) == 'title-specific':
# print(f"[BAD title entry] Print value of title:\n {title}")
continue
# print(f"\n[OK title entry] Print value of title:\n {title}\n")
try:
article_mot_cle = article.find(
'a', {
'href': re.compile(r'.*\/mot-cle\/.*')
}
).renderContents().decode('utf-8')
except Exception:
article_mot_cle = ''
try:
article_type = article.find(
'a', {
'href': re.compile(r'.*\/type-darticles\/.*')
}
).renderContents().decode('utf-8')
except Exception:
article_type = ''
for s in title('span'):
s.replaceWith(s.renderContents().decode('utf-8') + "\n")
url = title.find('a', href=True)['href']
date = article.find('time', datetime=True)['datetime']
article_date = datetime.strptime(date, '%Y-%m-%d')
                    # Attach the French timezone to the article date for the age check
article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
if article_date < self.oldest_article_date:
print("article_date < self.oldest_article_date\n")
continue
# print("-------- Recent article added to the list ------- \n")
all_authors = article.findAll(
'a', {'class': re.compile(r'\bjournalist\b')}
)
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <a>: {authors}")
                    # If no link to the author profile is available, the
                    # html separator is a span tag
if not all_authors:
try:
all_authors = article.findAll(
'span', {'class': re.compile(r'\bjournalist\b')}
)
authors = [self.tag_to_string(a) for a in all_authors]
# print(f"Authors in tag <span>: {authors}")
                        except Exception:
                            # keep a list so ', '.join(authors) below works
                            authors = ['unknown']
description = article.find('p').renderContents().decode('utf-8')
# print(f" <p> in article : {self.tag_to_string(description).strip()} ")
summary = {
'title': self.tag_to_string(title).strip(),
'description': description,
'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
'author': ', '.join(authors),
'article_type': article_type,
'mot_cle': article_mot_cle.capitalize(),
'url': 'https://www.mediapart.fr' + url,
}
webpage_article.append(summary)
except Exception:
pass
specific_articles += [(type_of_article,
webpage_article)] if webpage_article else []
return specific_articles
articles = []
for category in dict_article_sources:
articles += get_articles(
category['type'], category['webpage'], category['separador']['page'],
category['separador']['thread']
)
return articles
    # Non-locale-specific date parsing (strptime('%d %b %Y', s) would work
    # under a French locale)
def parse_french_date(self, date_str):
date_arr = date_str.lower().split()
return date(
day=int(date_arr[0]),
year=int(date_arr[2]),
month=[
None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
].index(date_arr[1])
)
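    # e.g. parse_french_date('11 janvier 2021') -> datetime.date(2021, 1, 11)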
def get_browser(self):
# -- Handle login
def is_form_login(form):
return "id" in form.attrs and form.attrs['id'] == "logFormEl"
br = BasicNewsRecipe.get_browser(self)
if self.username is not None and self.password is not None:
br.open('https://www.mediapart.fr/login')
br.select_form(predicate=is_form_login)
br['name'] = self.username
br['password'] = self.password
br.submit()
return br
def default_cover(self, cover_file):
'''
Create a generic cover for recipes that don't have a cover
'''
from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data
def init_environment():
ensure_app()
load_builtin_fonts()
def create_cover_mediapart(date):
' Create a cover for mediapart adding the date on Mediapart Cover'
init_environment()
# Get data
image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
data = self.index_to_soup(image_url, raw=True)
# Get date and hour corresponding to french time zone
today = datetime.now(timezone.utc) + timedelta(hours=1)
wkd = today.weekday()
            french_weekday = {0: 'Lun', 1: 'Mar', 2: 'Mer', 3: 'Jeu', 4: 'Ven', 5: 'Sam', 6: 'Dim'}
day = french_weekday[wkd]+'.'
date = day + ' ' + today.strftime('%d %b. %Y')
edition = today.strftime('Édition de %Hh')
# Get Cover data
img = QImage()
img.loadFromData(data)
# Overlay date on cover
p = QPainter(img)
pen = QPen(Qt.black)
pen.setWidth(6)
p.setPen(pen)
font = QFont()
font.setFamily('Times')
font.setPointSize(78)
p.setFont(font)
            r = QRect(0, 600, 744, 100)
p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, date)
p.end()
# Overlay edition information on cover
p = QPainter(img)
pen = QPen(Qt.black)
pen.setWidth(4)
p.setPen(pen)
font = QFont()
font.setFamily('Times')
font.setItalic(True)
font.setPointSize(66)
p.setFont(font)
            # Add edition information
            r = QRect(0, 720, 744, 100)
p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, edition)
p.end()
return pixmap_to_data(img)
try:
            today = datetime.today()
            date = today.strftime('%d %b %Y')
img_data = create_cover_mediapart(date)
cover_file.write(img_data)
cover_file.flush()
except Exception:
self.log.exception('Failed to generate default cover')
return False
return True
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'
I'm assuming those links are the issue, at least on the Kobo Libra (that's where the articles cut off and where the errors show up), but I have no idea how to fix this.
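Would stripping the inline namespace declarations before conversion be the right direction? A completely untested sketch, added to the recipe class (preprocess_raw_html is a standard BasicNewsRecipe hook; the regex is only my guess at what the duplicated declarations look like):

def preprocess_raw_html(self, raw_html, url):
    # Drop inline xmlns / xmlns:prefix declarations from the downloaded
    # pages; duplicated declarations are what triggers 'redefinition of
    # the xmlns prefix is forbidden' when the book is checked.
    return re.sub(r'\s+xmlns(:[-\w]+)?="[^"]*"', '', raw_html)

And if the "à lire aussi" boxes themselves have to go, maybe an extra class in remove_tags would drop them before conversion ('news__related' is just a placeholder, I don't know the real class name):

remove_tags = [classes('login-subscribe print-source_url news__related')]

Can anyone confirm whether that's the right approach?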