10-30-2010, 06:54 AM   #1
Metapioca
Recipes for mediapart.fr and rue89.com (French news websites)

One new recipe for rue89.com, a free French news website:
Spoiler:

Code:
__license__   = 'GPL v3'
__copyright__ = '2010, Louis Gesbert <meta at antislash dot info>'
'''
Rue89
'''

import re, string
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe

class Rue89(BasicNewsRecipe):
    title = u'Rue89'
    __author__ = '2010, Louis Gesbert <meta at antislash dot info>'
    description = 'Popular free French news website'
    language = 'fr'
    oldest_article = 7
    max_articles_per_feed = 50

    feeds = [(u'La Une', u'http://www.rue89.com/homepage/feed')]

    no_stylesheets = True

    preprocess_regexps = [
        (re.compile(r'<(/?)h2>', re.IGNORECASE|re.DOTALL),
         lambda match : '<'+match.group(1)+'h3>'),
        (re.compile(r'<div class="print-title">([^>]+)</div>', re.IGNORECASE|re.DOTALL),
         lambda match : '<h2>'+match.group(1)+'</h2>'),
        (re.compile(r'<img[^>]+src="[^"]*/numeros/(\d+)[^0-9.">]*.gif"[^>]*/>', re.IGNORECASE|re.DOTALL),
         lambda match : '<span style="font-family: Sans-serif; color: red; font-size:24pt; padding: 2pt;">'+match.group(1)+'</span>'),
        (re.compile(r'\''), lambda match: '&rsquo;'),
        ]

    def preprocess_html(self,soup):
        body = Tag(soup, 'body')
        title = soup.find('h1', {'class':'title'})
        content = soup.find('div', {'class':'content'})
        soup.body.replaceWith(body)
        body.insert(0, title)
        body.insert(1, content)
        return soup

    remove_tags = [ #dict(name='div', attrs={'class':'print-source_url'}),
                    #dict(name='div', attrs={'class':'print-links'}),
                    #dict(name='img', attrs={'class':'print-logo'}),
                    dict(name='div', attrs={'class':'content_top'}),
                    dict(name='div', attrs={'id':'sidebar-left'}), ]

# -- print-version has poor quality on this website, better do the conversion ourselves
#    def print_version(self, url):
#        return re.sub('^.*-([0-9]+)$', 'http://www.rue89.com/print/\\1',url)


And a much improved one (original version by Mathieu Godlewski) for Mediapart, a well-known online-only newspaper that requires a paid subscription:
Spoiler:

Code:
__license__   = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''

import re, string
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title          = 'Mediapart'
    __author__ = 'Mathieu Godlewski <mathieu at godlewski.fr>'
    description = 'Global news in french from online newspapers'
    oldest_article = 7
    language = 'fr'
    needs_subscription = True

    max_articles_per_feed = 50
    no_stylesheets = True

    cover_url = 'http://www.mediapart.fr/sites/all/themes/mediapart/mediapart/images/annonce.jpg'

    feeds =  [
        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
    ]

# -- print-version has poor quality on this website, better do the conversion ourselves
#
#     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
#         [
#             (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
#             (r'<span class=\'auteur_staff\'>[^>]+<a title=\'[^\']*\'[^>]*>([^<]*)</a>[^<]*</span>',
#              lambda match : '<i>'+match.group(1)+'</i>'),
#             (r'\'', lambda match: '&rsquo;'),
#         ]
#      ]
#
#     remove_tags    = [ dict(name='div', attrs={'class':'print-source_url'}),
#                        dict(name='div', attrs={'class':'print-links'}),
#                        dict(name='img', attrs={'src':'entete_article.png'}),
#                        dict(name='br') ]
#
#     def print_version(self, url):
#         raw = self.browser.open(url).read()
#         soup = BeautifulSoup(raw.decode('utf8', 'replace'))
#         div = soup.find('div', {'id':re.compile('node-\d+')})
#         if div is None:
#             return None
#         article_id = string.replace(div['id'], 'node-', '')
#         if article_id is None:
#             return None
#         return 'http://www.mediapart.fr/print/'+article_id

# -- Non-print version [dict(name='div', attrs={'class':'advert'})]

    keep_only_tags = [
        dict(name='h1', attrs={'class':'title'}),
        dict(name='div', attrs={'class':'page_papier_detail'}),
        ]

    def preprocess_html(self,soup):
        for title in soup.findAll('div', {'class':'titre'}):
            tag = Tag(soup, 'h3')
            title.replaceWith(tag)
            tag.insert(0,title)
        return soup

# -- Handle login

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.mediapart.fr/')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br


I've been testing them for a few days, but there's probably room for improvement.

05-22-2011, 06:56 AM   #2
zuzur
Hello, Mediapart recently changed their home page, making the login form the first one on the page, which broke the Mediapart download recipe.

I have updated the existing recipe to account for that and it works pretty well: http://arzur.net/2011/05/22/calibre-mediapart-ftw/ (in French)

I just wish br.select_form could address the form's id attribute, or that Mediapart would put a name= on their form :-)
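
(Side note, a sketch only: mechanize's select_form() also accepts a predicate argument, so a form can be picked by its HTML attributes instead of by position. The id value below is a hypothetical placeholder, not taken from Mediapart's markup.)

Code:
# Sketch: select a mechanize form by its id attribute instead of nr=.
# 'user-login-form' is a made-up id; replace it with the real one from the page.
def select_form_by_id(br, form_id):
    br.select_form(predicate=lambda form: form.attrs.get('id') == form_id)

# usage inside get_browser(), instead of br.select_form(nr=1):
# select_form_by_id(br, 'user-login-form')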

05-30-2011, 02:33 PM   #3
Metapioca
Great, thanks for the update, I saw it wasn't working anymore but was too busy recently to take the time to fix it.

And I'm happy to see there are other users around.

02-11-2012, 10:37 AM   #4
rogerben
This recipe seems broken; at least for me it fetches only rubbish. Does it still work for you?

Best regards
-br

02-19-2012, 07:34 AM   #5
Metapioca
Oops, sorry, I made an update to the Mediapart one some time ago but forgot to share it.
I've switched to the print version, which they have much improved on the site (most of the code already existed but was commented out).
Spoiler:

Code:
__license__   = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010, 2011, Louis Gesbert <meta at antislash dot info>'
'''
Mediapart
'''

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title          = 'Mediapart'
    __author__ = 'Mathieu Godlewski, Louis Gesbert'
    description = 'Global news in french from news site Mediapart'
    oldest_article = 7
    language = 'fr'
    needs_subscription = True

    max_articles_per_feed = 50
    no_stylesheets = True

    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'

    feeds =  [
        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
    ]

# -- print-version

    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE|re.DOTALL), i[1]) for i in
        [
            (r'<div class="print-title">([^>]+)</div>', lambda match : '<h2>'+match.group(1)+'</h2>'),
            (r'\'', lambda match: '&rsquo;')
        ]
    ]

    remove_tags    = [ dict(name='div', attrs={'class':'print-source_url'}) ]

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        link = soup.find('a', {'title':'Imprimer'})
        if link is None:
            return None
        return link['href']

# -- Handle login

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.mediapart.fr/')
            br.select_form(nr=0)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br


I don't have a fix for rue89 right now, though; I'll try to find the time to look into it.

02-24-2012, 09:47 AM   #6
rogerben
Hi,

Great work, thanks a lot! That will make Mediapart far more comfortable to read.

Have a nice week-end!

-br

02-24-2012, 01:37 PM   #7
Metapioca
OK, it still needs some polishing (it picks up a little garbage in some articles), but I've made the rue89 recipe work again.

I've put both recipes in a git repo: https://github.com/AltGr/Calibre-french-news-rules

The Mediapart one there has been updated as well.

02-25-2012, 08:29 AM   #8
Metapioca
OK, I think I got rid of the garbage; anyone is welcome to test and report on the new rue89 recipe.

Video articles ("zapnet") should be removed, but it seems they're not parsed correctly (the soup has everything within <script> tags); any hints on how to detect and remove them?

02-25-2012, 08:34 AM   #9
kovidgoyal (creator of calibre)
Add a preprocess_regexps entry to your recipe to remove <script>.*?</script>.
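
(A minimal sketch of such an entry, in the same style as the preprocess_regexps already used in the rue89 recipe above; append it to the existing list rather than redefining it.)

Code:
    preprocess_regexps = [
        (re.compile(r'<script[^>]*>.*?</script>', re.IGNORECASE|re.DOTALL),
         lambda match: ''),   # strip inline scripts so the zapnet markup disappears
        ]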

02-29-2012, 01:44 PM   #10
Metapioca
That helped, thanks. I think the recipe is alright now -- except for a margin to the left that I can't get rid of.
I removed the different feeds from that site because they mostly overlap (they are more like tags than sections); there is no way at the moment to detect multiple links to the same article and make them point to the same place in the ebook, is there?

02-29-2012, 01:49 PM   #11
kovidgoyal (creator of calibre)
You can have calibre ignore duplicate links easily; see the sticky for a technique to do that. But there is no easy way to have the entries point to a single place in the book.
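
(A sketch of the general idea, not necessarily the sticky's exact technique: override parse_feeds() and drop any article whose URL has already appeared in an earlier feed. Recent calibre versions may also expose an ignore_duplicate_articles recipe attribute that does this for you, if it is available in your version.)

Code:
    def parse_feeds(self):
        # Sketch: filter out articles that appear in more than one feed, by URL.
        feeds = BasicNewsRecipe.parse_feeds(self)
        seen = set()
        for feed in feeds:
            for article in list(feed.articles):
                if article.url in seen:
                    feed.articles.remove(article)
                else:
                    seen.add(article.url)
        return feeds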

03-30-2012, 07:14 AM   #12
Metapioca
I haven't worked on the multi-RSS issue yet, but I just pushed a few fixes to the git repo.

07-05-2013, 11:22 AM   #13
malah
Hello, Mediapart has adopted a new editorial layout and I think that has broken the current recipe.

Sorry for my broken English.

Here is the error:

Quote:
Fetch news from Mediapart
Resolved conversion options
calibre version: 0.9.37
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': None,
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_compress': False,
'dont_download_recipe': False,
'duplicate_links_in_toc': False,
'embed_all_fonts': False,
'embed_font_family': None,
'enable_heuristics': False,
'extra_css': None,
'extract_to': None,
'filter_css': None,
'fix_indents': True,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0xa039e8c>,
'insert_blank_line': False,
'insert_blank_line_size': 0.5,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'mobi_file_type': 'old',
'mobi_ignore_margins': False,
'mobi_keep_original_images': False,
'mobi_toc_at_start': False,
'no_chapters_in_toc': False,
'no_inline_navbars': False,
'no_inline_toc': False,
'output_profile': <calibre.customize.profiles.GenericEink object at 0xa0754ac>,
'page_breaks_before': None,
'personal_doc': '[PDOC]',
'prefer_author_sort': False,
'prefer_metadata_cover': False,
'pretty_print': False,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_fake_margins': True,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'search_replace': None,
'series': None,
'series_index': None,
'share_not_sync': False,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'start_reading_at': None,
'subset_embedded_fonts': False,
'tags': None,
'test': False,
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'toc_title': None,
'unsmarten_punctuation': False,
'unwrap_lines': True,
'use_auto_toc': False,
'verbose': 2}
InputFormatPlugin: Recipe Input running
Using custom recipe
Traceback (most recent call last):
File "/usr/bin/calibre-parallel", line 20, in <module>
sys.exit(main())
File "/usr/lib/calibre/calibre/utils/ipc/worker.py", line 189, in main
result = func(*args, **kwargs)
File "/usr/lib/calibre/calibre/gui2/convert/gui_conversion.py", line 25, in gui_convert
plumber.run()
File "/usr/lib/calibre/calibre/ebooks/conversion/plumber.py", line 1023, in run
accelerators, tdir)
File "/usr/lib/calibre/calibre/customize/conversion.py", line 239, in __call__
log, accelerators)
File "/usr/lib/calibre/calibre/ebooks/conversion/plugins/recipe_input.py", line 113, in convert
ro = recipe(opts, log, self.report_progress)
File "/usr/lib/calibre/calibre/web/feeds/news.py", line 869, in __init__
self.browser = self.get_browser()
File "<string>", line 50, in get_browser
File "/usr/lib/python2.7/site-packages/mechanize/_mechanize.py", line 499, in select_form
global_form = self._factory.global_form
File "/usr/lib/python2.7/site-packages/mechanize/_html.py", line 544, in __getattr__
self.forms()
File "/usr/lib/python2.7/site-packages/mechanize/_html.py", line 557, in forms
self._forms_factory.forms())
File "/usr/lib/python2.7/site-packages/mechanize/_html.py", line 237, in forms
_urlunparse=_rfc3986.urlunsplit,
File "/usr/lib/python2.7/site-packages/mechanize/_form.py", line 844, in ParseResponseEx
_urlunparse=_urlunparse,
File "/usr/lib/python2.7/site-packages/mechanize/_form.py", line 1017, in _ParseFileEx
type, name, attrs, select_default=select_default, index=ii*10)
File "/usr/lib/python2.7/site-packages/mechanize/_form.py", line 2735, in new_control
control = klass(type, name, a, index)
File "/usr/lib/python2.7/site-packages/mechanize/_form.py", line 2336, in __init__
if self.value is None: self.value = ""
File "/usr/lib/python2.7/site-packages/mechanize/_form.py", line 1221, in __setattr__
raise AttributeError("control '%s' is disabled" % self.name)
AttributeError: control 'None' is disabled

07-07-2013, 06:43 AM   #14
malah
Hello, I have updated the recipe. This is the first time I've tried this, but it works.

What I changed:
Code:
#1
# old:
link = soup.find('a', {'title':'Imprimer'})
# new:
link = soup.find('a', {'href':re.compile('^/print/[0-9]+')})
#2
# old:
return link['href']
# new:
return 'http://www.mediapart.fr' + link['href']
#3
# old:
br.open('http://www.mediapart.fr/')
# new:
br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
#4
# old:
br.select_form(nr=0)
# new:
br.select_form(nr=1)
#5 I have also added:
masthead_url       = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
The resulting recipe:
Code:
__license__   = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'
'''
Mediapart
'''

__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf.fr>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title          = 'Mediapart'
    __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah'
    description = 'Global news in french from news site Mediapart'
    oldest_article = 7
    language = 'fr'
    needs_subscription = True
    max_articles_per_feed = 50

    use_embedded_content = False
    no_stylesheets = True

    masthead_url       = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'

    feeds =  [
        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
    ]

# -- print-version

    conversion_options = { 'smarten_punctuation' : True }

    remove_tags = [ dict(name='div', attrs={'class':'print-source_url'}) ]

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        link = soup.find('a', {'href':re.compile('^/print/[0-9]+')})
        if link is None:
            return None
        return 'http://www.mediapart.fr' + link['href']

# -- Handle login

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        for title in soup.findAll('p', {'class':'titre_page'}):
            title.name = 'h3'
        for legend in soup.findAll('span', {'class':'legend'}):
            legend.insert(0, Tag(soup, 'br', []))
            legend.name = 'small'
        return soup
Or download the recipe: https://www.dropbox.com/s/7yysniz5q1...0130707.recipe

07-25-2013, 07:47 AM   #15
malah
Hello, two days ago the printed version of Mediapart changed slightly and that breaks the recipe. I haven't found how to use it easily, which is why the new recipe does not use the printed version; I don't know if that's the best way, but it works.

Code:
__license__   = 'GPL v3'
__copyright__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf dot fr>'
'''
Mediapart
'''

__author__ = '2009, Mathieu Godlewski <mathieu at godlewski.fr>; 2010-2012, Louis Gesbert <meta at antislash dot info>; 2013, Malah <malah at neuf dot fr>'

import re
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe

class Mediapart(BasicNewsRecipe):
    title          = 'Mediapart'
    __author__ = 'Mathieu Godlewski, Louis Gesbert, Malah'
    description = 'Global news in french from news site Mediapart'
    oldest_article = 7
    language = 'fr'
    needs_subscription = True
    max_articles_per_feed = 50

    use_embedded_content = False
    no_stylesheets = True

    masthead_url       = 'https://upload.wikimedia.org/wikipedia/fr/2/23/Mediapart.png'
    cover_url = 'http://static.mediapart.fr/files/pave_mediapart.jpg'

    feeds =  [
        ('Les articles', 'http://www.mediapart.fr/articles/feed'),
    ]

# -- full-page-version

    conversion_options = { 'smarten_punctuation' : True }

    keep_only_tags = [ 
        dict(name='div', attrs={'class':'col-left fractal-desktop fractal-10-desktop collapse-7-desktop fractal-tablet fractal-6-tablet collapse-4-tablet'}),
        dict(name='div', attrs={'id':'pageFirstContent'})
    ]
    remove_tags = [ 
        dict(name='div', attrs={'id':'lire-aussi'}),
        dict(name='div', attrs={'class':'col-right-content'})
    ]

    def print_version(self, url):
        raw = self.browser.open(url).read()
        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
        link = soup.find('a', {'href':re.compile('^.*?onglet=full$')})
        if link is None:
            return None
        return link['href']

# -- Handle login

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://blogs.mediapart.fr/editions/guide-du-coordonnateur-d-edition')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        for title in soup.findAll('p', {'class':'titre_page'}):
            title.name = 'h3'
        for legend in soup.findAll('span', {'class':'legend'}):
            legend.insert(0, Tag(soup, 'br', []))
            legend.name = 'small'
        return soup
Or download the recipe: https://www.dropbox.com/s/jwugf1oym5...0130725.recipe
