Hi,

I created another recipe for calibre by removing the SSL certificate verification.
I have read that this is not safe, but I had limited motivation to go further.
The following recipe works fine on Windows machines; I don't think it should replace the current Mediapart recipe, which works fine on Linux without removing SSL.
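
The whole workaround is essentially one line, shown here in isolation as a minimal sketch: swap the standard library's default HTTPS context for an unverified one before anything opens an HTTPS connection (it is the same line that appears near the top of the full recipe below).

Code:
import ssl

# WARNING: this disables certificate verification for every HTTPS request
# opened through urllib/urllib2 in this process. Convenient when the
# certificate check fails on Windows, but insecure.
ssl._create_default_https_context = ssl._create_unverified_context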


Code:
# -*- mode:python -*-
#----- this is a non-secure recipe that works on Windows machines. SSL certificate verification was removed to cope with: urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed

from __future__ import unicode_literals

__license__   = 'GPL v3'
__copyright__ = '2016, Daniel Bonnery (contact: DanielBonnery sur mobileread.com) 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''

__author__ = '2017, Daniel Bonnery (contact: DanielBonnery sur mobileread.com), 2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
from datetime import date,timedelta

#----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
from calibre import strftime
from string import Template
import json
import operator
import tempfile
import urllib
import urllib2
import ssl
#ssl._create_default_https_context = ssl._create_unverified_context

#----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

ssl._create_default_https_context = ssl._create_unverified_context
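# The line above monkey-patches the standard library so that any HTTPS
# connection opened without an explicit context (urllib / urllib2) skips
# certificate verification -- this is the whole "no SSL" workaround.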

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart (no SSL)'
    __author__ = 'Daniel Bonnery from a version by Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart, fetched without SSL verification'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2

    use_embedded_content = False
    no_stylesheets = True
    #----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
    index_url= 'https://mediapart.fr/login'
    #----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

#----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
    legacy_login_url = index_url   # We use this to cheat oAuth
  #----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

# --

    oldest_article_date = date.today() - timedelta(days=oldest_article)

# -- get the index (the feed at 'http://www.mediapart.fr/articles/feed' only has
#    the 10 last elements :/)

    feeds =  [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]
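    # parse_feeds() below first fetches the RSS feed declared above, then appends
    # the extra sections ('Brèves', 'Revue du Web', 'Confidentiel') that
    # my_parse_index() scrapes from the 'fil d'actualites' page, converted into
    # Feed objects with calibre's feeds_from_index helper.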

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        print '1.############################### still ok.'
        feeds += feeds_from_index(self.my_parse_index(feeds))
        print '2.############################### still ok.'
        return feeds

    def my_parse_index(self, la_une):
        
        articles = []
       
      #----- this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070
        ssl._create_default_https_context = ssl._create_unverified_context
      #----- end of this was copied from https://gist.github.com/jchirschy/7717cb067bee7e081070

       
        breves = []
        liens = []
        confidentiels = []
        
        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'class':' col-left fractal-desktop fractal-10-desktop collapse-7-desktop fractal-tablet fractal-6-tablet collapse-4-tablet '})
        fils = page.find('ul', {'class':'post-list universe-journal'})
        print '1.1. ########################################## still ok here'
        print len(fils)
        try:
            for article in fils.findAll('li'):
                try:
                    title = article.find('h3',recursive=False)
                    print '1.1.1 ########################################## still ok here'

                    if title is None or title['class'] == 'title-specific':
                        continue

                    # print "found fil ",title
                    article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
                    # print "kind: ",article_type

                    for s in title('span'):
                        s.replaceWith(s.renderContents() + "\n")
                    url = title.find('a', href=True)['href']
                    print '1.1.2 ########################################## still ok here too'

                    #article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                    #print("################################# 9")
                    #print(article_date)

                    #if article_date < self.oldest_article_date:
                        # print "too old"
                    #    continue

                    authors = article.findAll('a',{'class':re.compile(r'\bjournalist\b')})
                    authors = [self.tag_to_string(a) for a in authors]

                    # restored (with a None guard): the summary dict below needs 'description'
                    desc_div = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False)
                    description = desc_div.findAll('p') if desc_div is not None else []

                    # print "fil ",title," by ",authors," : ",description

                    summary = {
                        'title': self.tag_to_string(title).strip(),
                        'author': ', '.join(authors),
                        'url': url,
                        #'date': u'' + article_date.strftime("%A %d %b %Y"),
                        'description': '\n'.join([self.tag_to_string(d) for d in description]),
                    }
                    sections = {
                        "Brève": breves,
                        "Lien": liens,
                        "Confidentiel": confidentiels,
                    }
                    if article_type in sections:
                        sections[article_type].append(summary)
                except Exception:
                    # skip malformed entries instead of aborting the whole index
                    pass
                print '1#################################################################### still ok.'
            # print 'La Une: ', len(la_une), ' articles'
            # for a in la_une: print a["title"]
            # print 'Brèves: ', len(breves), ' articles'
            # print 'Revue web: ', len(liens), ' articles'
            # print 'Confidentiel: ', len(confidentiels), ' articles'

            articles += [('Brèves', breves)] if breves else []
            articles += [('Revue du Web', liens)] if liens else []
            articles += [('Confidentiel', confidentiels)] if confidentiels else []
        except Exception:
            # if the page layout changed, just return whatever was collected so far
            pass
        return articles
# -- print-version
    print '0#################################################################### still ok.'
    conversion_options = {'smarten_punctuation' : True}

    remove_tags = [dict(name='div', attrs={'class':'print-source_url'})]

    # non-locale specific date parse (strptime("%d %b %Y",s) would work with french locale)
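    # example (assumed input format): parse_french_date(u'14 décembre 2017') -> date(2017, 12, 14)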
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet',
                       'août', 'septembre', 'octobre', 'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        # Filter old articles
 #       article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))

  #      if article_date < self.oldest_article_date:
   #         return None

        tools = soup.find('li', {'class':'print'})
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        print(link['href'])
      #       if link is None:
 #           print 'Error: print link not found'
 #           return None
        return 'https://mediapart.fr' + link['href']
#        return url
  
# -- Handle login
    def get_browser(self, *args, **kwargs):
        """
        We need to pretend to be a recent version of safari for the mac to
        prevent User-Agent checks Pocket api requires username and password so
        fail loudly if it's missing from the config.
        """
        br = BasicNewsRecipe.get_browser(self,
                user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; \
                        en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) \
                        Version/5.0.3 Safari/533.19.4')
        # br = BasicNewsRecipe.get_browser(self)
        print '#########################################'
        
        if self.username is not None and self.password is not None:
            br.open(self.legacy_login_url)
            for form in br.forms():
                if form.attrs.get('id') == 'logFormEl':
                    br.form = form
                    break
            #br.select_form(id=logFormEl)
            
            br['name'] = self.username
            br['password'] = self.password
            br.submit()
        else:
            self.user_error("This Recipe requires authentication")
        print 'so far so good'
        return br

    # This is a workaround articles with scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
         lambda match:
             match.group(1) + re.sub(
                 re.compile(r'</?body>', re.IGNORECASE|re.DOTALL),'', match.group(2)) + '</body>')
    ]
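
If you want to try it, one way to test a recipe from the command line (assuming calibre's command line tools are installed; the file name mediapart_nossl.recipe is just an example) is:

Code:
ebook-convert mediapart_nossl.recipe mediapart.epub --username YOUR_LOGIN --password YOUR_PASSWORD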