View Single Post
Old 06-23-2012, 10:48 PM   #17
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 45,434
Karma: 27757438
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
Yes, it will be included in the next calibre release. And just for completeness, here is a version that avoids double fetching the article HTML.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.ipc.simple_worker import fork_job
from calibre.ptempfile import PersistentTemporaryFile

# Source code of a small helper module, kept as a string so it can be passed
# to fork_job(..., module_is_source_code=True) by the recipe below, which then
# calls its grab(url) function (presumably in a separate worker process, going
# by the simple_worker module name -- confirm against calibre's docs).
# grab() opens the URL in calibre's jsbrowser Browser, calls
# run_for_a_time(10), and returns browser.html after closing the browser.
# NOTE: the text between the triple quotes is runtime data, not comments;
# editing it changes what the worker executes.
js_fetcher = '''

import calibre.web.jsbrowser.browser as jsbrowser

def grab(url):
    browser = jsbrowser.Browser()
    #10 second timeout
    browser.visit(url, 10)
    browser.run_for_a_time(10)
    html = browser.html
    browser.close()
    return html

    '''
class MarketingSensoriale(BasicNewsRecipe):
    """calibre recipe for the 'Marketing sensoriale' blog.

    Article pages are not downloaded directly. Instead, each article URL
    is handed to the ``grab`` function of the ``js_fetcher`` helper module
    via ``fork_job``, and the HTML it returns is written to a temporary
    file whose path is given back to calibre (see
    ``get_obfuscated_article``).
    """

    title                 = u'Marketing sensoriale'
    description           = 'Marketing Sensoriale, il Blog'
    category              = 'Blog'
    oldest_article        = 7
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf8'
    use_embedded_content  = False
    language              = 'it'
    remove_empty_feeds    = True
    recursions = 0
    requires_version = (0, 8, 58)
    auto_cleanup = False
    simultaneous_downloads = 1
    # Makes calibre obtain article content through get_obfuscated_article()
    # (per the BasicNewsRecipe API) instead of fetching the URL itself.
    articles_are_obfuscated = True

    remove_tags_after    = [dict(name='div', attrs={'class':['article-footer']})]

    feeds          = [(u'Marketing sensoriale', u'http://feeds.feedburner.com/MarketingSensoriale?format=xml')]

    def get_article_url(self, article):
        """Return the article's ``feedburner_origlink`` value, or None.

        ``article`` is the parsed feed entry; only its ``get`` method is
        used here.
        """
        return article.get('feedburner_origlink',  None)

    def get_obfuscated_article(self, url):
        """Fetch ``url`` via the ``js_fetcher`` helper and save it to disk.

        Runs ``grab(url)`` from the ``js_fetcher`` source through
        ``fork_job`` and writes the resulting HTML to a persistent
        temporary ``.html`` file.

        Returns the path of that temporary file.
        """
        result = fork_job(js_fetcher, 'grab', (url,), module_is_source_code=True)

        html = result['result']
        # The file is written as bytes, so encode unicode text first.
        if isinstance(html, type(u'')):
            html = html.encode('utf-8')
        pt = PersistentTemporaryFile('.html')
        try:
            pt.write(html)
        finally:
            # Always release the file handle, even if the write fails.
            pt.close()
        return pt.name
kovidgoyal is offline   Reply With Quote