Old 03-07-2016, 12:11 PM   #16
bernard.ryefield
Connoisseur
Quote:
Originally Posted by DanielBonnery
Hi,
thank you for your answer. The Mediapart recipe may not have been updated in calibre yet; have you tried the one posted above?

On my PC it works fine; the log is posted below.

Best,
Daniel
Yes, it's the one I used.
Old 03-07-2016, 04:09 PM   #17
DanielBonnery
Member
Quote:
Originally Posted by bernard.ryefield
Yes, it's the one I used.
Hi,
thank you for your message.
I am reposting the recipe without the print statements.
Code:
# -*- mode:python -*-
from __future__ import unicode_literals

__license__   = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''

__author__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date,timedelta

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

# --

    oldest_article_date = date.today() - timedelta(days=oldest_article)  # computed once, at class-definition time

# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
#    the 10 last elements :/)

    feeds =  [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    def parse_feeds(self):
        # Keep the RSS feed ('La Une') and append the extra sections scraped
        # from the fil-dactualites page by my_parse_index.
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        articles = []

        breves = []
        liens = []
        confidentiels = []
       
        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'class':'page-content bust'})
        fils = page.find('ul', {'class':'post-list universe-journal'})

        for article in fils.findAll('li'):
            try:
                title = article.find('h3',recursive=False)

                if title is None or title['class'] == 'title-specific':
                    continue

                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
    
                for s in title('span'):
                    s.replaceWith(s.renderContents() + "\n")
                url = title.find('a', href=True)['href']
                              
                #article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                #print("################################# 9")
                #print(article_date)

                #if article_date < self.oldest_article_date:
                    # print "too old"
                #    continue

                authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
                authors = [self.tag_to_string(a) for a in authors]

                # 'description' is used in the summary dict below, so it must stay active
                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')


                summary = {
                    'title': self.tag_to_string(title).strip(),
                    'author': ', '.join(authors),
                    'url': url,
                    #'date': u'' + article_date.strftime("%A %d %b %Y"),
                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
                }
                sections = {
                    "Brève": breves,
                    "Lien": liens,
                    "Confidentiel": confidentiels,
                }
                if article_type in sections:
                    sections[article_type].append(summary)
            except Exception:
                # skip malformed entries instead of aborting the whole fetch
                pass

        # print 'La Une: ', len(la_une), ' articles'
        # for a in la_une: print a["title"]
        # print 'Brèves: ', len(breves), ' articles'
        # print 'Revue web: ', len(liens), ' articles'
        # print 'Confidentiel: ', len(confidentiels), ' articles'

        articles += [('Brèves', breves)] if breves else []
        articles += [('Revue du Web', liens)] if liens else []
        articles += [('Confidentiel', confidentiels)] if confidentiels else []
        return articles
# -- print-version

    conversion_options = {'smarten_punctuation' : True}

    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with french locale)
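    # e.g. parse_french_date('12 mars 2016') == date(2016, 3, 12)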
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                       'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        # Filter old articles (currently disabled)
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None
        tools = soup.find('li', {'class':'print'})
        if tools is None:
            return url  # no print version available; keep the normal page
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            return url
        return 'https://mediapart.fr' + link['href']
  
# -- Handle login
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(nr=1)  # the login form is the second form on the page
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    # This is a workaround for articles with scribd content that include
    # <body></body> tags _within_ the body
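    # e.g. '<body><p>a</p><body>doc</body><p>b</p></body>'
    #  ->  '<body><p>a</p>doc<p>b</p></body>'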
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
         lambda match:
             match.group(1) + re.sub(
                 re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'', match.group(2)) + '</body>')
    ]
It works fine on my PC (see attachment).
Sorry I could not help more.
Attached: Capture d'écran de 2016-03-07 16:03:43.png (screenshot of the successful fetch)
Old 03-08-2016, 08:15 AM   #18
Frenchdummy
Enthusiast
Hi, I am a total beginner at retrieving news articles, but I also subscribe to Mediapart, so I intend to follow this thread and its updates. I do not know anything about these recipes; I suspect they are rather reserved for connoisseurs and hackers. Are they easy to use/implement on a Mac?
Old 03-08-2016, 04:31 PM   #19
DanielBonnery
Member
Quote:
Originally Posted by Frenchdummy
Hi, I am a total beginner at retrieving news articles, but I also subscribe to Mediapart, so I intend to follow this thread and its updates. I do not know anything about these recipes; I suspect they are rather reserved for connoisseurs and hackers. Are they easy to use/implement on a Mac?

Hi,
you may just wait for the next calibre update; the recipe may or may not be added by the developers. Otherwise, in calibre, go to
Fetch news > Add a custom news source > New Recipe > Switch to advanced mode
and paste the recipe there.
That is it.
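If you prefer the command line, a saved recipe file can also be tested with calibre's ebook-convert tool (a sketch; --test fetches only a couple of articles per feed, and the file name and credentials are placeholders):
Code:
ebook-convert mediapart.recipe mediapart.epub --test --username YOUR_EMAIL --password YOUR_PASSWORD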

Old 03-08-2016, 04:36 PM   #20
DanielBonnery
Member
Hi,

it is me again.

So I tried this recipe on Windows. It failed, although it works on Ubuntu.

I get a different error message from the one I used to receive (the previous message came from the fact that they had changed the web page code). The message I receive now is:

urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)>

So apparently it is a certificate problem external to calibre and the recipe. Kovid Goyal wrote about it earlier; I will try to see if I can apply what he proposes and post the result here.

Best,
Daniel
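To check that the problem really is external to the recipe, here is a minimal reproduction with the same urllib2 module that appears in the traceback (a sketch, Python 2 as in the traceback above):
Code:
# If this also raises CERTIFICATE_VERIFY_FAILED, the failure comes from
# the system certificate store, not from the recipe itself.
import urllib2
try:
    urllib2.urlopen('https://www.mediapart.fr')
    print 'certificate verified fine'
except urllib2.URLError as e:
    print 'same failure outside the recipe:', e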
Old 03-11-2016, 11:13 AM   #21
bernard.ryefield
Connoisseur
I don't know what the validation process for recipes is, but it seems a new and still incorrect recipe for Mediapart was posted this Friday. It will help neither Calibre nor Mediapart users.
Old 03-18-2016, 10:52 AM   #22
DanielBonnery
Member
Hi Bernard,
as I mentioned in a previous message, the recipe works on Ubuntu but not on Windows for me. The problem may come from the certificate and not from the recipe, so it may still be helpful for Mediapart users who do not have certificate issues. I am very happy with the Mediapart recipe that was posted this Friday; it works on one of my PCs, so it helps at least one Calibre and Mediapart user. I tried to figure out how to solve this certificate issue on Windows, but unfortunately I don't understand anything about these things.
Best,
Daniel
Old 03-18-2016, 11:47 PM   #23
kovidgoyal
creator of calibre
A certificate error just means that the root certificate of the certificate authority that issued the website's https certificate is not installed on the Windows system. You can simply install that root certificate yourself to fix it; a bit of googling will show you how. (Usually it is enough to just install all Windows updates, including the optional ones.)

Note that on my Windows system, I get no certificate errors for this recipe.
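To find out which root certificate is missing, one possible starting point is to fetch the site's certificate without verification and read its issuer (a sketch using the standard ssl module; inspecting the PEM needs an external tool such as openssl):
Code:
# Fetch the server certificate WITHOUT verifying it, so it can be read
# even on a machine where verification fails; its issuer tells you which
# CA root certificate to install.
import ssl
pem = ssl.get_server_certificate(('www.mediapart.fr', 443))
print pem  # inspect with e.g.: openssl x509 -noout -issuer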
Old 03-20-2016, 07:37 AM   #24
bernard.ryefield
Connoisseur
First, I want to acknowledge the effort from DanielBonnery and Kovid, especially since the original developers have abandoned the project. I'm happy this recipe works for you. So basically it works for some and fails for others. As Kovid mentioned Windows updates: I'm up to date, and I don't want to tweak my system, especially not its certificates. I'm still concerned about a newcomer to Calibre using the Mediapart recipe and seeing it fail; this thread might be useful in that case.
Old 12-14-2017, 04:58 PM   #25
DanielBonnery
Member
Hi,

I created another recipe for calibre by disabling SSL certificate verification.
I have read that this is not safe, but I have limited motivation to go further.
The following recipe works fine on Windows machines. I think it should not replace the current Mediapart recipe, which works fine on Linux without disabling SSL.


Code:
# -*- mode:python -*-
# ----- This is an INSECURE recipe that works on Windows machines: SSL
#       certificate verification is disabled to cope with
#       "urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed"

from __future__ import unicode_literals

__license__   = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''

__author__ = '2017, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date,timedelta

# -- Disable SSL certificate verification process-wide (adapted from
#    https://gist.github.com/jchirschy/7717cb067bee7e081070). This is the
#    insecure part: every https connection made afterwards skips
#    certificate checks.
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart sans ssl!!!'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart, without SSL verification'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True
    # plain login page (from the gist above), used to bypass the oAuth flow
    index_url = 'https://mediapart.fr/login'

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

    legacy_login_url = index_url  # used to bypass the oAuth login flow

# --

    oldest_article_date = date.today() - timedelta(days=oldest_article)

# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
#    the 10 last elements :/)

    feeds =  [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    def parse_feeds(self):
        # Keep the RSS feed ('La Une') and append the extra sections scraped
        # from the fil-dactualites page by my_parse_index.
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        articles = []

        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'class':' col-left fractal-desktop fractal-10-desktop collapse-7-desktop fractal-tablet fractal-6-tablet collapse-4-tablet '})
        fils = page.find('ul', {'class':'post-list universe-journal'})
        try:
            for article in fils.findAll('li'):
                try:
                    title = article.find('h3',recursive=False)

                    if title is None or title['class'] == 'title-specific':
                        continue

                    article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()

                    for s in title('span'):
                        s.replaceWith(s.renderContents() + "\n")
                    url = title.find('a', href=True)['href']

                    # Filter old articles (currently disabled)
                    # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                    # if article_date < self.oldest_article_date:
                    #     continue

                    authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
                    authors = [self.tag_to_string(a) for a in authors]

                    # 'description' is used in the summary dict below, so it must stay active
                    description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')

                    summary = {
                        'title': self.tag_to_string(title).strip(),
                        'author': ', '.join(authors),
                        'url': url,
                        #'date': u'' + article_date.strftime("%A %d %b %Y"),
                        'description': '\n'.join([self.tag_to_string(d) for d in description]),
                    }
                    sections = {
                        "Brève": breves,
                        "Lien": liens,
                        "Confidentiel": confidentiels,
                    }
                    if article_type in sections:
                        sections[article_type].append(summary)
                except Exception:
                    # skip malformed entries instead of aborting the whole fetch
                    pass
            # print 'La Une: ', len(la_une), ' articles'
            # for a in la_une: print a["title"]
            # print 'Brèves: ', len(breves), ' articles'
            # print 'Revue web: ', len(liens), ' articles'
            # print 'Confidentiel: ', len(confidentiels), ' articles'

            articles += [('Brèves', breves)] if breves else []
            articles += [('Revue du Web', liens)] if liens else []
            articles += [('Confidentiel', confidentiels)] if confidentiels else []
        except Exception:
            # if the index page layout changed, fall back to the RSS feed alone
            pass
        return articles
# -- print-version
    conversion_options = {'smarten_punctuation' : True}

    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with french locale)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                       'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        # Filter old articles (currently disabled)
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None

        tools = soup.find('li', {'class':'print'})
        if tools is None:
            return url  # no print version available; keep the normal page
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            return url
        return 'https://mediapart.fr' + link['href']
  
# -- Handle login
    def get_browser(self, *args, **kwargs):
        """
        We need to pretend to be a recent version of safari for the mac to
        prevent User-Agent checks Pocket api requires username and password so
        fail loudly if it's missing from the config.
        """
        br = BasicNewsRecipe.get_browser(self,
                user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; \
                        en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) \
                        Version/5.0.3 Safari/533.19.4')
        # br = BasicNewsRecipe.get_browser(self)
        print '#########################################'
        
        if self.username is not None and self.password is not None:
            br.open(self.legacy_login_url)
            # select the login form by its id rather than by position; the
            # break belongs inside the if, otherwise only the first form is checked
            for form in br.forms():
                if form.attrs.get('id') == 'logFormEl':
                    br.form = form
                    break

            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        else:
            raise Exception('This recipe requires a Mediapart username and password')
        return br

    # This is a workaround for articles with scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
         lambda match:
             match.group(1) + re.sub(
                 re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'', match.group(2)) + '</body>')
    ]
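A less drastic alternative, if you manage to obtain the missing root certificate, would be to keep verification enabled and point Python at that certificate explicitly, instead of the ssl._create_unverified_context line above (a sketch; the .pem path is hypothetical):
Code:
# Keep certificate verification ON, but trust an explicitly supplied root
# certificate. 'C:/certs/mediapart-root.pem' is a hypothetical path to the
# downloaded root CA file.
import ssl
ssl._create_default_https_context = (
    lambda: ssl.create_default_context(cafile='C:/certs/mediapart-root.pem'))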