MobileRead Forums - View Single Post

MoloBolo · 12-05-2021, 07:35 AM

I added this :

Code:

remove_tags = [dict(name='svg')]

and ran a book check and now I have 150+ new errors

I tried to add stuff like "class" which didn't help, and eventually decided to test it on my Libra anyway...

It's working

I didn't notice any cuts and the table of contents is functional.

The full recipe :

Spoiler:

Code:

#!/usr/bin/env python
# vim:fileencoding=utf-8
#
# 11 Jan 2021 -  L. Houpert - Major changes in the Mediapart recipe:
#   1) Summaries of the articles are now available
#   2) Additional sections  International, France, Economie and Culture have
# been added through custom entries in the function my_parse_index.
#   3) Fix the cover image so it doesn't disappear from the Kindle menu
# ( cover image format is changed to .jpeg)
# 14 Jan 2021 - Add Mediapart Logo url as masthead_url and change cover
#   by overlaying the date on top of the Mediapart cover
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2021, Loïc Houpert <houpertloic at gmail .com>. Adapted from: 2016, Daniel Bonnery; 2009, Mathieu Godlewski; 2010-2012, Louis Gesbert'  # noqa
'''
Mediapart
'''

import re
from datetime import date, datetime, timezone, timedelta
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a space-separated string of class names.  The returned
    dict has the shape ``{'attrs': {'class': predicate}}``; the predicate
    receives an element's ``class`` attribute value and yields the set of
    wanted class names found in it (empty when none match).  A falsy
    attribute value (``None`` or ``''``) is returned unchanged.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Mirror ``value and ...``: falsy values are passed straight back.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}


class Mediapart(BasicNewsRecipe):
    """Calibre recipe that downloads news from the French site Mediapart.

    Articles come from two places: the RSS feed listed in ``feeds`` and
    the section pages scraped by ``my_parse_index``.  The recipe declares
    ``needs_subscription``; ``get_browser`` submits the login form when a
    username and password are configured.
    """

    title = 'Mediapart'
    __author__ = 'Loïc Houpert'
    description = 'Global news in French from news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', **classes('author')),
        classes('news__heading__top__intro news__body__center__article'),
    ]
    # Both removal rules must live in ONE list: assigning ``remove_tags``
    # a second time replaces the first value instead of extending it.
    remove_tags = [
        classes('login-subscribe print-source_url'),
        dict(name='svg'),
    ]
    conversion_options = {'smarten_punctuation': True}

    masthead_url = "https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart_masthead.png"
    # cover_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'

    # --

    # Current time, evaluated once when the class body runs: an aware UTC
    # datetime shifted by one hour to approximate French time.
    # NOTE(review): the fixed +1h offset ignores daylight saving - confirm
    # this is acceptable.
    today = datetime.now(timezone.utc) + timedelta(hours=1)
    oldest_article_date = today - timedelta(days=oldest_article)

    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    # The feed at 'http://www.mediapart.fr/articles/feed' only displays the
    # 10 last elements, so the articles are also indexed from specific
    # section pages in the function my_parse_index. In this function the
    # articles are parsed using the function get_articles and the entries
    # of dict_article_sources.

    def parse_feeds(self):
        """Return the RSS feeds extended with the scraped section feeds."""
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        """Scrape the Mediapart section pages into an index.

        ``la_une`` (the already-parsed RSS feeds) is accepted for
        interface compatibility but is not used.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``,
        ``description``, ``date``, ``author``, ``article_type``,
        ``mot_cle`` and ``url``.  Sections without any usable article are
        left out.
        """
        dict_article_sources = [
            {
                'type': 'Brèves',
                'webpage': 'https://www.mediapart.fr/journal/fil-dactualites',
                'separador': {
                    'page': 'ul',
                    'thread': 'li'
                }
            },
            {
                'type': 'International',
                'webpage': 'https://www.mediapart.fr/journal/international',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'France',
                'webpage': 'https://www.mediapart.fr/journal/france',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Économie',
                'webpage': 'https://www.mediapart.fr/journal/economie',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
            {
                'type': 'Culture',
                'webpage': 'https://www.mediapart.fr/journal/culture-idees',
                'separador': {
                    'page': 'div',
                    'thread': 'div'
                }
            },
        ]

        # Compiled once; reused for every article entry.
        mot_cle_href = re.compile(r'.*\/mot-cle\/.*')
        type_href = re.compile(r'.*\/type-darticles\/.*')
        journalist_class = re.compile(r'\bjournalist\b')

        def link_text(article, href_pattern):
            # Text of the first <a> whose href matches, or '' when the
            # link is missing or cannot be rendered.
            try:
                link = article.find('a', {'href': href_pattern})
                return link.renderContents().decode('utf-8')
            except Exception:
                return ''

        def get_articles(
            type_of_article, webpage, separador_page='ul', separador_thread='li'
        ):
            # Parse one section page.  Returns [(type_of_article, articles)]
            # or [] when the page yields no usable article.
            webpage_article = []
            soup = self.index_to_soup(webpage)
            page = soup.find('main', {'class': 'global-wrapper'})
            fils = None
            if page is not None:
                fils = page.find(
                    separador_page, {'class': 'post-list universe-journal'}
                )
            if fils is None:
                # A changed page layout must not abort the whole download:
                # skip this section and keep the others.
                self.log.warn('No article list found on ' + webpage)
                return []

            for article in fils.findAll(separador_thread):
                try:
                    title = article.find('h3', recursive=False)
                    if title is None or ''.join(title['class']) == 'title-specific':
                        continue

                    article_mot_cle = link_text(article, mot_cle_href)
                    article_type = link_text(article, type_href)

                    # Flatten <span> wrappers inside the title into text.
                    for s in title('span'):
                        s.replaceWith(s.renderContents().decode('utf-8') + "\n")
                    url = title.find('a', href=True)['href']

                    date_str = article.find('time', datetime=True)['datetime']
                    article_date = datetime.strptime(date_str, '%Y-%m-%d')
                    # Apply the same UTC + 1h convention as
                    # oldest_article_date so the two are comparable.
                    article_date = article_date.replace(tzinfo=timezone.utc) + timedelta(hours=1)
                    if article_date < self.oldest_article_date:
                        self.log.debug('article_date < self.oldest_article_date')
                        continue

                    all_authors = article.findAll('a', {'class': journalist_class})
                    authors = [self.tag_to_string(a) for a in all_authors]

                    # If no link to the author profile is available, the
                    # author is held in a span tag instead.
                    if not all_authors:
                        try:
                            all_authors = article.findAll(
                                'span', {'class': journalist_class}
                            )
                            authors = [self.tag_to_string(a) for a in all_authors]
                        except Exception:
                            # Must be a list: joining a plain string below
                            # would interleave ', ' between its characters.
                            authors = ['unknown']

                    description = article.find('p').renderContents().decode('utf-8')

                    webpage_article.append({
                        'title': self.tag_to_string(title).strip(),
                        'description': description,
                        'date': article_date.strftime("%a, %d %b, %Y %H:%M"),
                        'author': ', '.join(authors),
                        'article_type': article_type,
                        'mot_cle': article_mot_cle.capitalize(),
                        'url': 'https://www.mediapart.fr' + url,
                    })
                except Exception as err:
                    # Best effort: one malformed entry is skipped, but the
                    # failure is reported instead of silently swallowed.
                    self.log.warn(
                        'Skipping unparsable article entry on %s: %r' % (webpage, err)
                    )

            return [(type_of_article, webpage_article)] if webpage_article else []

        articles = []
        for category in dict_article_sources:
            articles += get_articles(
                category['type'], category['webpage'], category['separador']['page'],
                category['separador']['thread']
            )
        return articles

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with
    # french locale)
    def parse_french_date(self, date_str):
        """Parse a date written as ``'<day> <French month name> <year>'``.

        Matching is case-insensitive.  Returns a ``datetime.date``.
        Raises ``ValueError`` when the month name is not one of the twelve
        listed French names (or the numbers are invalid) and
        ``IndexError`` when fewer than three fields are given.
        """
        months = (
            None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
            'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'
        )
        date_arr = date_str.lower().split()
        return date(
            day=int(date_arr[0]),
            year=int(date_arr[2]),
            # The leading None makes the list index equal the month number.
            month=months.index(date_arr[1])
        )

    def get_browser(self):
        """Return a browser, logged in when credentials are configured."""
        # -- Handle login

        def is_form_login(form):
            return "id" in form.attrs and form.attrs['id'] == "logFormEl"

        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(predicate=is_form_login)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def default_cover(self, cover_file):
        '''
        Create the cover by overlaying the current date and edition hour
        on the Mediapart cover image.

        The image data is written to ``cover_file``.  Returns True on
        success; on any failure the error is logged and False is returned.
        '''
        from PyQt5.Qt import QImage, QPainter, QPen, Qt, QFont, QRect
        from calibre.gui2 import ensure_app, load_builtin_fonts, pixmap_to_data

        def init_environment():
            ensure_app()
            load_builtin_fonts()

        def draw_centered(img, text, top, pen_width, point_size, italic=False):
            # Paint ``text`` in black Times inside a 744x100 band whose
            # upper edge is ``top`` pixels from the top of the image.
            p = QPainter(img)
            pen = QPen(Qt.black)
            pen.setWidth(pen_width)
            p.setPen(pen)
            font = QFont()
            font.setFamily('Times')
            if italic:
                font.setItalic(True)
            font.setPointSize(point_size)
            p.setFont(font)
            r = QRect(0, top, 744, 100)
            p.drawText(r, Qt.AlignmentFlag.AlignJustify | Qt.AlignmentFlag.AlignVCenter | Qt.AlignmentFlag.AlignCenter, text)
            p.end()

        def create_cover_mediapart():
            ' Create a cover for mediapart adding the date on Mediapart Cover'
            init_environment()
            # Get data
            image_url = 'https://raw.githubusercontent.com/lhoupert/calibre_contrib/main/mediapart.jpeg'
            data = self.index_to_soup(image_url, raw=True)
            # Get date and hour using the same UTC + 1h convention as the
            # class-level ``today``.
            today = datetime.now(timezone.utc) + timedelta(hours=1)
            # French weekday abbreviations, keyed by datetime.weekday()
            # (0 is Monday, hence 'Lun' for lundi).
            french_weekday = {
                0: 'Lun', 1: 'Mar', 2: 'Mer', 3: 'Jeu', 4: 'Ven', 5: 'Sam', 6: 'Dim'
            }
            day = french_weekday[today.weekday()] + '.'
            # NOTE(review): %b follows the process locale, so the month
            # abbreviation may not be French - confirm.
            cover_date = day + ' ' + today.strftime('%d %b. %Y')
            edition = today.strftime('Édition de %Hh')

            # Get Cover data
            img = QImage()
            if not img.loadFromData(data):
                # Caught by the caller, which logs it and returns False
                # rather than writing an unusable cover.
                raise ValueError('Could not decode the Mediapart cover image')

            # Overlay date, then edition information, on the cover
            draw_centered(img, cover_date, top=600, pen_width=6, point_size=78)
            draw_centered(
                img, edition, top=720, pen_width=4, point_size=66, italic=True
            )
            return pixmap_to_data(img)

        try:
            img_data = create_cover_mediapart()
            cover_file.write(img_data)
            cover_file.flush()
        except Exception:
            self.log.exception('Failed to generate default cover')
            return False
        return True


# Module-level browser user-agent string. Nothing in this file reads it;
# presumably it is consumed by calibre when the recipe is loaded - TODO confirm.
calibre_most_common_ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'

I can post screenshots of the errors if anyone is interested, errors are :

Missing standard property 'border-bottom-right-radius' to go along with '-webkit-border-bottom-right-radius'. [stylesheet.css]
Link points to a location not present in the target file [feed_0/article_0/index_u27.html]
The linked resource 'w.colibris-lemouvement.org' does not exist [feed_0/article_2/index_u1.html]

Thanks a lot for the help !