![]() |
#1 |
plus ça change
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 101
Karma: 32134
Join Date: Dec 2009
Location: France
Device: Kindle PW2, Voyage
|
Improved recipe for Le Monde
Slightly improved version of the existing recipe: excludes video, blog and chat content from the download and reduces font sizes for bylines, captions, etc.
Code:
__license__ = 'GPL v3'
__copyright__ = '2011'
'''
lemonde.fr
'''

import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class LeMonde(BasicNewsRecipe):
    """Calibre news recipe for lemonde.fr.

    Downloads the main RSS feeds, keeps only the article body
    (div.contenu), drops chat/blog/video/sport/portfolio/visuel items,
    and applies a long pipeline of French-typography fixups
    (guillemets, apostrophes, spacing around punctuation).
    """

    title = 'Le Monde'
    __author__ = 'veezh'
    description = 'Actualités'
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    # Feed/article pages are declared as Windows-1252.
    encoding = 'cp1252'
    publisher = 'lemonde.fr'
    category = 'news, France, world'
    language = 'fr_FR'

    # Shrink breadcrumbs/bylines/captions relative to body text.
    # Lines starting with '#' are not valid CSS and are effectively
    # disabled rules kept from the earlier version of the recipe.
    extra_css = '''
        h1{font-size:130%;}
        .ariane{font-size:xx-small;}
        .source{font-size:xx-small;}
        #.href{font-size:xx-small;}
        .LM_caption{color:#666666; font-size:x-small;}
        #.main-article-info{font-family:Arial,Helvetica,sans-serif;}
        #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
        #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
    '''

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    remove_empty_feeds = True
    filterDuplicates = True

    def preprocess_html(self, soup):
        """Flatten hyperlinks: replace each <a> that wraps plain text
        with just its text, so articles read cleanly on e-ink."""
        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
        return soup

    # French typography pipeline, applied in order to the raw HTML.
    # NOTE(review): several replacement strings almost certainly used
    # non-breaking spaces (U+00A0) in the original source (e.g. before
    # %, :, ;, ?, !, and inside « »); the forum rendering this was
    # recovered from collapsed them to plain spaces — verify against a
    # pristine copy of the recipe before relying on the exact spacing.
    preprocess_regexps = [
        # Number/percent spacing and thousands grouping.
        (re.compile(r'([0-9])%'), lambda m: m.group(1) + ' %'),
        (re.compile(r'([0-9])([0-9])([0-9]) ([0-9])([0-9])([0-9])'),
         lambda m: m.group(1) + m.group(2) + m.group(3) + ' ' + m.group(4) + m.group(5) + m.group(6)),
        (re.compile(r'([0-9]) ([0-9])([0-9])([0-9])'),
         lambda m: m.group(1) + ' ' + m.group(2) + m.group(3) + m.group(4)),
        (re.compile(r'<span>'), lambda match: ' <span>'),
        # Straight/curly double quotes -> guillemets.
        (re.compile(r'\("'), lambda match: '(« '),
        (re.compile(r'"\)'), lambda match: ' »)'),
        (re.compile(r'“'), lambda match: '(« '),
        (re.compile(r'”'), lambda match: ' »)'),
        # Straight apostrophes -> typographic quotes.
        (re.compile(r'>\''), lambda match: '>‘'),
        (re.compile(r' \''), lambda match: ' ‘'),
        (re.compile(r'\''), lambda match: '’'),
        (re.compile(r'"<em>'), lambda match: '<em>« '),
        (re.compile(r'"<em>"</em><em>'), lambda match: '<em>« '),
        (re.compile(r'"<a href='), lambda match: '« <a href='),
        (re.compile(r'</em>"'), lambda match: ' »</em>'),
        (re.compile(r'</a>"'), lambda match: ' »</a>'),
        (re.compile(r'"</'), lambda match: ' »</'),
        (re.compile(r'>"'), lambda match: '>« '),
        (re.compile(r'"<'), lambda match: ' »<'),
        (re.compile(r'’"'), lambda match: '’« '),
        (re.compile(r' "'), lambda match: ' « '),
        (re.compile(r'" '), lambda match: ' » '),
        (re.compile(r'"\.'), lambda match: ' ».'),
        (re.compile(r'",'), lambda match: ' »,'),
        (re.compile(r'"\?'), lambda match: ' »?'),
        (re.compile(r'":'), lambda match: ' »:'),
        (re.compile(r'";'), lambda match: ' »;'),
        (re.compile(r'"\!'), lambda match: ' »!'),
        # Spacing before tall punctuation (French convention).
        (re.compile(r' :'), lambda match: ' :'),
        (re.compile(r' ;'), lambda match: ' ;'),
        (re.compile(r' \?'), lambda match: ' ?'),
        (re.compile(r' \!'), lambda match: ' !'),
        (re.compile(r'\s»'), lambda match: ' »'),
        (re.compile(r'«\s'), lambda match: '« '),
        (re.compile(r' %'), lambda match: ' %'),
        # Undo guillemet insertion inside img attributes.
        (re.compile(r'\.jpg » border='), lambda match: '.jpg'),
        (re.compile(r'\.png » border='), lambda match: '.png'),
        # Dash normalization.
        (re.compile(r' – '), lambda match: ' – '),
        (re.compile(r' – '), lambda match: ' – '),
        (re.compile(r' - '), lambda match: ' – '),
        (re.compile(r' -,'), lambda match: ' –,'),
        (re.compile(r'»:'), lambda match: '» :'),
    ]

    # Keep only the article body; cut everything after the
    # reader-testimonial call-out.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['contenu']}),
    ]
    remove_tags_after = [dict(id='appel_temoignage')]

    def get_article_url(self, article):
        """Return the article URL from the feed entry's guid, or None to
        skip chat, blog, video, sport, portfolio and visuel items.

        Fix over the original: a feed entry without a 'guid' left url as
        None, and the substring tests below then raised TypeError
        ('in' against None); such entries are now skipped cleanly.
        """
        url = article.get('guid', None)
        if url is None:
            return None
        if ('/chat/' in url or '.blog' in url or '/video/' in url
                or '/sport/' in url or '/portfolio/' in url
                or '/visuel/' in url):
            return None
        return url

    feeds = [
        ('A la une', 'http://www.lemonde.fr/rss/une.xml'),
        ('International', 'http://www.lemonde.fr/rss/tag/international.xml'),
        ('Europe', 'http://www.lemonde.fr/rss/tag/europe.xml'),
        (u'Société', 'http://www.lemonde.fr/rss/tag/societe.xml'),
        ('Economie', 'http://www.lemonde.fr/rss/tag/economie.xml'),
        (u'Médias', 'http://www.lemonde.fr/rss/tag/actualite-medias.xml'),
        (u'Planète', 'http://www.lemonde.fr/rss/tag/planete.xml'),
        ('Culture', 'http://www.lemonde.fr/rss/tag/culture.xml'),
        ('Technologies', 'http://www.lemonde.fr/rss/tag/technologies.xml'),
        ('Livres', 'http://www.lemonde.fr/rss/tag/livres.xml'),
    ]

    def get_cover_url(self):
        """Scrape the print-edition (PDF) page for today's cover image;
        return None when the expected div/img is absent."""
        cover_url = None
        soup = self.index_to_soup(
            'http://www.lemonde.fr/web/monde_pdf/0,33-0,1-0,0.html')
        link_item = soup.find('div', attrs={'class': 'pg-gch'})
        if link_item and link_item.img:
            cover_url = link_item.img['src']
        return cover_url
![]() |
![]() |
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
Updated recipe for Le Monde? | veezh | Recipes | 5 | 01-20-2011 09:06 PM |
Le Monde | peg32 | Recipes | 2 | 12-27-2010 12:59 PM |
Request for Le Monde Diplo En archive recipe | michaelernst | Recipes | 6 | 10-17-2010 11:13 AM |
salut tout le monde | annacover | Introduce Yourself | 1 | 12-01-2009 03:11 PM |
Seriously thoughtful Le livre numérique et le piratage (Le Monde) | roger64 | Lounge français | 7 | 11-08-2009 04:58 PM |