Finally got the time to update the rule
Hi there! Sorry for the delay, I finally got the time to update the Mediapart rule. It's an improved version that adds sections for 'Brèves', 'Links', etc.
The rule can be found here:
https://github.com/AltGr/Calibre-fre...diapart.recipe
Thanks, Malah, for being quicker! Feel free to update the rule if you have improvements.
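If you want to try it before the update ships with calibre, save the code below as mediapart.recipe and feed it to ebook-convert. The recipe needs a subscription, hence the --username/--password options; --test only fetches a couple of articles. (Replace YOUR_EMAIL and YOUR_PASSWORD with your own credentials.)
Code:
ebook-convert mediapart.recipe mediapart.epub --username YOUR_EMAIL --password YOUR_PASSWORD --test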
Full code:
Spoiler:
Code:
# -*- mode: python; coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''
__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>'
import re
from datetime import date, timedelta

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds import feeds_from_index
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title = 'Mediapart'
    __author__ = 'Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in French from the news site Mediapart'
    publication_type = 'newspaper'
    language = 'fr'
    needs_subscription = True
    oldest_article = 2
    use_embedded_content = False
    no_stylesheets = True
    cover_url = 'https://static.mediapart.fr/files/M%20Philips/logo-mediapart.png'

    oldest_article_date = date.today() - timedelta(days=oldest_article)

    # -- get the index ourselves: the feed at 'http://www.mediapart.fr/articles/feed'
    # only carries the 10 most recent articles :/
    feeds = [
        ('La Une', 'http://www.mediapart.fr/articles/feed'),
    ]

    def parse_feeds(self):
        feeds = super(Mediapart, self).parse_feeds()
        feeds += feeds_from_index(self.my_parse_index(feeds))
        return feeds

    def my_parse_index(self, la_une):
        articles = []
        breves = []
        liens = []
        confidentiels = []

        soup = self.index_to_soup('https://www.mediapart.fr/journal/fil-dactualites')
        page = soup.find('div', {'id': 'pageFirstContent'})
        fils = page.find('div', {'class': re.compile(r'\bcontent-journal\b')})

        for article in fils.findAll('div'):
            try:
                title = article.find('h2', recursive=False)
                if title is None or title['class'] == 'title-specific':
                    continue
                print "found fil", title
                article_type = article.find('a', {'href': re.compile(r'.*\/type-darticles\/.*')}).renderContents()
                print "kind:", article_type

                for s in title('span'):
                    s.replaceWith(s.renderContents() + "\n")
                url = title.find('a', href=True)['href']

                article_date = self.parse_french_date(article.find("span", "article-date").renderContents())
                if article_date < self.oldest_article_date:
                    print "too old"
                    continue

                authors = article.findAll('a', {'class': re.compile(r'\bjournalist\b')})
                authors = [self.tag_to_string(a) for a in authors]

                description = article.find('div', {'class': lambda c: c != 'taxonomy-teaser'}, recursive=False).findAll('p')
                print "fil", title, "by", authors, ":", description

                summary = {
                    'title': self.tag_to_string(title).strip(),
                    'author': ', '.join(authors),
                    'url': url,
                    'date': u'' + article_date.strftime("%A %d %b %Y"),
                    'description': '\n'.join([self.tag_to_string(d) for d in description]),
                }
                # file the entry under its section; unknown kinds are dropped
                {
                    "Brève": breves,
                    "Lien": liens,
                    "Confidentiel": confidentiels,
                }.get(article_type, []).append(summary)
            except:
                # entries that don't match the expected structure are skipped
                pass

        # print 'La Une:', len(la_une), 'articles'
        # for a in la_une: print a["title"]
        print 'Brèves:', len(breves), 'articles'
        print 'Revue web:', len(liens), 'articles'
        print 'Confidentiel:', len(confidentiels), 'articles'

        articles += [('Brèves', breves)] if breves else []
        articles += [('Revue du Web', liens)] if liens else []
        articles += [('Confidentiel', confidentiels)] if confidentiels else []
        return articles

    # -- print-version
    conversion_options = {'smarten_punctuation': True}
    remove_tags = [dict(name='div', attrs={'class': 'print-source_url'})]

    # non-locale-specific date parsing (strptime("%d %b %Y", s) would work with a French locale)
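    # e.g. parse_french_date('3 juillet 2012') -> date(2012, 7, 3)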
    def parse_french_date(self, date_str):
        date_arr = date_str.lower().split()
        return date(day=int(date_arr[0]),
                    year=int(date_arr[2]),
                    month=[None, 'janvier', 'février', 'mars', 'avril', 'mai',
                           'juin', 'juillet', 'août', 'septembre', 'octobre',
                           'novembre', 'décembre'].index(date_arr[1]))

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))

        # filter out articles older than oldest_article_date
        article_date = self.parse_french_date(self.tag_to_string(soup.find('span', 'article-date')))
        if article_date < self.oldest_article_date:
            return None

        tools = soup.find('div', {'class': 'menu-tools'})
        link = tools.find('a', {'href': re.compile(r'\/print\/.*')})
        if link is None:
            print 'Error: print link not found'
            return None
        return 'https://mediapart.fr/' + link['href']

    # -- Handle login
    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.mediapart.fr/user')
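            # nr=1 picks the second form on the page (mechanize counts from 0)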
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
            print "LOGIN"
        return br

    # This is a workaround for articles with Scribd content that include
    # <body></body> tags _within_ the body
    preprocess_regexps = [
        (re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
         lambda match:
             match.group(1)
             + re.sub(re.compile(r'</?body>', re.IGNORECASE|re.DOTALL), '',
                      match.group(2))
             + '</body>')
    ]
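By the way, calibre applies these preprocess_regexps to each article's downloaded HTML before parsing it, so the rule above unwraps the extra <body> pairs that Scribd embeds leave behind. For the curious, here's a minimal standalone check of that last substitution, runnable outside calibre (plain Python 2, same regex as in the recipe; the sample HTML is made up):
Code:
import re

# sample page with a stray <body> pair left by an embedded document
html = '<body><p>intro</p><body>scribd embed</body><p>outro</p></body>'

fixed = re.sub(re.compile(r'(<body.*?>)(.*)</body>', re.IGNORECASE|re.DOTALL),
               lambda match: match.group(1)
                             + re.sub(r'</?body>', '', match.group(2))
                             + '</body>',
               html)

print fixed  # -> <body><p>intro</p>scribd embed<p>outro</p></body>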