#17
Member
Posts: 11
Karma: 10
Join Date: Jan 2016
Device: no
Hi,
thank you for your message. I am reposting the recipe without the print statements. Code:
# -*- mode:python -*-
from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''
__author__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date, timedelta


class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in french from news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2
    use_embedded_content = False
    no_stylesheets = True

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

    # --
    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
    # the 10 last elements :/)
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        articles = []

        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'class': 'page-content bust'})
        fils = page.find('ul', {'class': 'post-list universe-journal'})

        for article in fils.findAll('li'):
            try:
                title = article.find('h3', recursive=False)
                if title is None or title['class'] == 'title-specific':
                    continue

                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()

                for s in title('span'):
                    s.replaceWith(s.renderContents() + "\n")
                url = title.find('a', href=True)['href']

                # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                # if article_date < self.oldest_article_date:
                #     continue

                authors = article.findAll('a', {'class': re.compile(r'\bjournalist\b')})
                authors = [self.tag_to_string(a) for a in authors]

                # restored: 'description' is referenced in the summary dict below
                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')

                summary = {
                    'title': self.tag_to_string(title).strip(),
                    'author': ', '.join(authors),
                    'url': url,
                    # 'date': u'' + article_date.strftime("%A %d %b %Y"),
                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
                }

                {
                    "Brève": breves,
                    "Lien": liens,
                    "Confidentiel": confidentiels,
                }.get(article_type).append(summary)
            except:
                pass

        articles += [('Brèves', breves)] if breves else []
        articles += [('Revue du Web', liens)] if liens else []
        articles += [('Confidentiel', confidentiels)] if confidentiels else []

        return articles

    # -- print-version

    conversion_options = {'smarten_punctuation': True}

    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # non-locale specific date parse (strptime("%d %b %Y", s) would work with french locale)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))

        # Filter old articles
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None

        tools = soup.find('li', {'class': 'print'})
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        # if link is None:
        #     return None
        return 'https://mediapart.fr' + link['href']

    # -- Handle login
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    # This is a workaround for articles with scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
         lambda match: match.group(1) + re.sub(
             re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '',
             match.group(2)) + '</body>')
    ]

Sorry I could not help more.
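As a side note, the recipe's non-locale date parser can be checked on its own outside calibre. The standalone function below is my own restatement of parse_french_date, not part of the posted recipe, and only demonstrates the month lookup. Code:

# -*- coding: utf-8 -*-
from datetime import date

def parse_french_date(date_str):
    # Same logic as the recipe's parse_french_date, extracted for a quick check.
    date_arr = date_str.lower().split()
    months = [None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
              'août', 'septembre', 'octobre', 'novembre', 'décembre']
    return date(day=int(date_arr[0]), year=int(date_arr[2]),
                month=months.index(date_arr[1]))

print(parse_french_date('8 mars 2016'))  # -> 2016-03-08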
#18
Enthusiast
Posts: 29
Karma: 8300
Join Date: Apr 2013
Location: France
Device: Kobo glo, Apple devices
Hi, I am a total beginner at retrieving news articles, but I also subscribe to Mediapart, so I intend to follow this thread and its updates. I do not know anything about third-party recipes; I think they are rather reserved for connoisseurs and hackers. Is this one easy to use/implement on a Mac?
#19
Member
Posts: 11
Karma: 10
Join Date: Jan 2016
Device: no
[QUOTE]
Hi, I am a total beginner at retrieving news articles, but I also subscribe to Mediapart, so I intend to follow this thread and its updates. I do not know anything about third-party recipes; I think they are rather reserved for connoisseurs and hackers. Is this one easy to use/implement on a Mac?
[/QUOTE]
Hi, you may just wait for the next update of calibre; the recipe may or may not be added by the developers. Otherwise, go to Add a custom news source > New recipe > Switch to advanced mode, and paste the recipe there. That is it.

Last edited by DanielBonnery; 03-08-2016 at 04:37 PM.
#20
Member
Posts: 11
Karma: 10
Join Date: Jan 2016
Device: no
Hi,
it is me again. I tried this recipe on Windows and it failed, although it works on Ubuntu. I get a different error message from the one I used to receive (the previous message came from the fact that they changed the web page code). The message I receive now is:

urllib2.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)>

So apparently it is a certificate problem external to calibre and the recipe. Kovid Goyal wrote about it earlier; I will try to see if I can apply what he proposes and post the result here.
Best,
Daniel
#21
Connoisseur
Posts: 93
Karma: 32466
Join Date: Jul 2013
Location: Paris
Device: Kobo Desktop, Kindle Desktop, Kobo Forma
I don't know what the validation process for recipes is, but it seems a new and still incorrect recipe for Mediapart was posted this Friday. It won't help either Calibre or Mediapart users.
#22
Member
Posts: 11
Karma: 10
Join Date: Jan 2016
Device: no
Hi Bernard,
as I mentioned in a previous message, the recipe works on Ubuntu but not on Windows for me. The problem may come from the certificate and not from the recipe, so it may still be helpful for Mediapart users who do not have certificate issues. I am very happy with the Mediapart recipe that was posted this Friday: it works on one of my PCs, so it helps at least one Calibre and Mediapart user. I tried to figure out how to solve this certificate issue on Windows, but unfortunately I don't understand anything about these things.
Best,
Daniel
#23
creator of calibre
Posts: 45,351
Karma: 27182818
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
A certificate error just means that the root certificate of the certificate authority that issued the https certificate for the website is not installed on the Windows system. You can simply install that root certificate yourself to fix it; a bit of googling will show you how. (Usually it is enough to just install all Windows updates, including optional ones.)
Note that on my Windows system, I get no certificate errors for this recipe.
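To confirm that it really is a system trust-store problem rather than something in the recipe, here is a minimal check using only the Python standard library (a sketch, not part of calibre; run it on the affected Windows machine). Code:

import socket
import ssl

# Attempt a verified TLS handshake against the site the recipe fetches.
ctx = ssl.create_default_context()
sock = ctx.wrap_socket(socket.socket(), server_hostname='www.mediapart.fr')
try:
    sock.connect(('www.mediapart.fr', 443))
    print('certificate chain verified OK')
except ssl.SSLError as e:
    # On an affected machine this should reproduce CERTIFICATE_VERIFY_FAILED.
    print('verification failed: %s' % e)
finally:
    sock.close()

If this fails outside calibre too, the fix belongs to the operating system's certificate store, as described above.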
#24
Connoisseur
Posts: 93
Karma: 32466
Join Date: Jul 2013
Location: Paris
Device: Kobo Desktop, Kindle Desktop, Kobo Forma
First, I want to acknowledge the effort from DanielBonnery and Kovid, especially since the original developers have abandoned the project. I'm happy this recipe works for you. So basically it works for some and fails for others. Since Kovid mentioned Windows updates: I'm up to date, and I don't want to tweak my system, especially with certificates. I'm still concerned about a newcomer to Calibre trying the Mediapart recipe and seeing it fail; this thread might be useful in that case.
#25
Member
Posts: 11
Karma: 10
Join Date: Jan 2016
Device: no
Hi,
I created another recipe for calibre by disabling SSL certificate verification. I have read somewhere that this is not safe, but I have limited motivation to go further. The following recipe works fine on Windows machines; I don't think it should replace the current Mediapart recipe, which works fine on Linux without disabling SSL. Code:
# -*- mode:python -*-

# ----- this is a non-secure recipe that works on windows machines. SSL verification was
# ----- disabled to cope with: urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed

from __future__ import unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''
__author__ = '2017, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date, timedelta

# ----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
from calibre import strftime
from string import Template
import json
import operator
import tempfile
import urllib
import urllib2
import ssl
# ssl._create_default_https_context = ssl._create_unverified_context
# ----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

# Disable certificate verification globally (this is what makes the recipe "sans ssl")
ssl._create_default_https_context = ssl._create_unverified_context


class Mediapart(BasicNewsRecipe):
    title = 'Mediapart sans ssl!!!'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in french from news site Mediapart without ssl'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2
    use_embedded_content = False
    no_stylesheets = True

    # ----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
    index_url = 'https://mediapart.fr/login'
    # ----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

    # ----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
    legacy_login_url = index_url  # We use this to cheat oAuth
    # ----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

    # --
    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
    # the 10 last elements :/)
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        print '1.############################### ca va.'
        feeds += feeds_from_index(self.my_parse_index(feeds))
        print '2.############################### ca va.'
        return feeds

    def my_parse_index(self, la_une):
        articles = []

        # ----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
        ssl._create_default_https_context = ssl._create_unverified_context
        # ----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'class': ' col-left fractal-desktop fractal-10-desktop collapse-7-desktop fractal-tablet fractal-6-tablet collapse-4-tablet '})
        fils = page.find('ul', {'class': 'post-list universe-journal'})
        print '1.1. ########################################## et la ca va'
        print len(fils)

        try:
            for article in fils.findAll('li'):
                try:
                    title = article.find('h3', recursive=False)
                    print '1.1.1 ########################################## et la ca va'

                    if title is None or title['class'] == 'title-specific':
                        continue

                    # print "found fil ", title
                    article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
                    # print "kind: ", article_type

                    for s in title('span'):
                        s.replaceWith(s.renderContents() + "\n")
                    url = title.find('a', href=True)['href']
                    print '1.1.2 ########################################## et la ca aussi'

                    # article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                    # if article_date < self.oldest_article_date:
                    #     continue

                    authors = article.findAll('a', {'class': re.compile(r'\bjournalist\b')})
                    authors = [self.tag_to_string(a) for a in authors]

                    # restored: 'description' is referenced in the summary dict below
                    description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')

                    summary = {
                        'title': self.tag_to_string(title).strip(),
                        'author': ', '.join(authors),
                        'url': url,
                        # 'date': u'' + article_date.strftime("%A %d %b %Y"),
                        'description': '\n'.join([self.tag_to_string(d) for d in description]),
                    }

                    {
                        "Brève": breves,
                        "Lien": liens,
                        "Confidentiel": confidentiels,
                    }.get(article_type).append(summary)
                except:
                    pass

            print '1#################################################################### ca va.'

            articles += [('Brèves', breves)] if breves else []
            articles += [('Revue du Web', liens)] if liens else []
            articles += [('Confidentiel', confidentiels)] if confidentiels else []
        except:
            pass

        return articles

    # -- print-version

    print '0#################################################################### ca va.'

    conversion_options = {'smarten_punctuation': True}

    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # non-locale specific date parse (strptime("%d %b %Y", s) would work with french locale)
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                           'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))

        # Filter old articles
        # article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        # if article_date < self.oldest_article_date:
        #     return None

        tools = soup.find('li', {'class': 'print'})
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        print(link['href'])
        # if link is None:
        #     return None
        return 'https://mediapart.fr' + link['href']

    # -- Handle login
    def get_browser(self, *args, **kwargs):
        """
        We need to pretend to be a recent version of safari for the mac to prevent User-Agent checks.
        Pocket api requires username and password so fail loudly if it's missing from the config.
        """
        br = BasicNewsRecipe.get_browser(
            self,
            user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-us) '
                       'AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4')
        # br = BasicNewsRecipe.get_browser(self)
        print '#########################################'
        if self.username is not None and self.password is not None:
            br.open(self.legacy_login_url)
            for form in br.forms():
                if form.attrs['id'] == 'logFormEl':
                    br.form = form
                    break
            # br.select_form(id=logFormEl)
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        else:
            self.user_error("This Recipe requires authentication")
        print 'jusqu ici tout va bien'
        return br

    # This is a workaround for articles with scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE | re.DOTALL),
         lambda match: match.group(1) + re.sub(
             re.compile(r'</?body>', re.IGNORECASE | re.DOTALL), '',
             match.group(2)) + '</body>')
    ]