View Single Post
Old 11-01-2010, 09:36 AM   #1
marbs
Zealot
marbs began at the beginning.
 
Posts: 122
Karma: 10
Join Date: Jul 2010
Device: nook
new recipe all done. and an idea.

The idea is that some of the websites we use for recipes earn money from advertising. If we skip the article page and go straight to the print version, the site will suffer. In this recipe, and in all my future ones, I will download the article page before I go to the print version.

So this recipe is ready to be built in.
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re

class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    """Calibre news recipe for Calcalist (calcalist.co.il), in Hebrew.

    Articles are taken from the site's RSS feeds and downloaded in their
    print-preview layout.  Before the print URL is handed back to calibre,
    the regular article page is requested once so that the site still
    receives a hit on its normal (advertising-carrying) page.
    """

    title = u'Calcalist'
    __author__ = 'marbs'
    description = 'This is a recipe of Calcalist.co.il. The recipe downloads the article page to not hurt the sites advertising income.'
    cover_url = 'http://ftp5.bizportal.co.il/web/giflib/news/calcalist.JPG'
    # calibre expects an ISO-639 language code here, not a display name.
    language = 'he'

    # Right-to-left layout for the generated pages.  The rule blocks used to
    # be separated by stray commas, which is not valid CSS.
    # NOTE(review): `title`, `article_description` and
    # `calibre_feed_description` are written as element selectors; they may
    # have been meant as class selectors (leading '.') -- confirm.
    extra_css = (
        'img {max-width:100%;} '
        'body{direction: rtl;} '
        'title{direction: rtl;} '
        'article_description{direction: rtl;} '
        'a.article{direction: rtl;} '
        'calibre_feed_description{direction: rtl;}'
    )

    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 5

    remove_javascript = True
    remove_attributes = ['width']
    # calibre documents keep_only_tags as a list of tag specifications.
    keep_only_tags = [dict(name='div', attrs={'id': 'articleContainer'})]
    # NOTE(review): this spec matches <p> tags by an attribute named "text";
    # it may have been intended to match paragraphs whose content is blank.
    remove_tags = [dict(name='p', attrs={'text': [' ']})]
    # Drop paragraphs that contain nothing but a non-breaking space entity.
    preprocess_regexps = [
        (re.compile(r'<p>&nbsp;</p>', re.DOTALL | re.IGNORECASE), lambda match: ''),
    ]

    feeds = [
        (u'\u05d3\u05e3 \u05d4\u05d1\u05d9\u05ea', u'http://www.calcalist.co.il/integration/StoryRss8.xml'),
        # NOTE(review): the next two feeds share the same URL
        # (StoryRss3674.xml); one of them is probably wrong -- verify.
        (u'24/7', u'http://www.calcalist.co.il/integration/StoryRss3674.xml'),
        (u'\u05d1\u05d0\u05d6\u05d6', u'http://www.calcalist.co.il/integration/StoryRss3674.xml'),
        (u'\u05de\u05d1\u05d6\u05e7\u05d9\u05dd', u'http://www.calcalist.co.il/integration/StoryRss184.xml'),
        (u'\u05d4\u05e9\u05d5\u05e7', u'http://www.calcalist.co.il/integration/StoryRss2.xml'),
        (u'\u05d1\u05d0\u05e8\u05e5', u'http://www.calcalist.co.il/integration/StoryRss14.xml'),
        (u'\u05d4\u05db\u05e1\u05e3', u'http://www.calcalist.co.il/integration/StoryRss9.xml'),
        (u'\u05e0\u05d3\u05dc"\u05df', u'http://www.calcalist.co.il/integration/StoryRss7.xml'),
        (u'\u05e2\u05d5\u05dc\u05dd', u'http://www.calcalist.co.il/integration/StoryRss13.xml'),
        (u'\u05e4\u05e8\u05e1\u05d5\u05dd \u05d5\u05e9\u05d9\u05d5\u05d5\u05e7', u'http://www.calcalist.co.il/integration/StoryRss5.xml'),
        (u'\u05e4\u05e0\u05d0\u05d9', u'http://www.calcalist.co.il/integration/StoryRss3.xml'),
        (u'\u05d8\u05db\u05e0\u05d5\u05dc\u05d5\u05d2\u05d9', u'http://www.calcalist.co.il/integration/StoryRss4.xml'),
        (u'\u05e2\u05e1\u05e7\u05d9 \u05e1\u05e4\u05d5\u05e8\u05d8', u'http://www.calcalist.co.il/integration/StoryRss18.xml'),
    ]

    def print_version(self, url):
        """Return the print-preview URL for the article at ``url``.

        The regular article page is requested first and its response is
        discarded; the request exists only so the site sees a visit to the
        normal page before the print layout is fetched.

        The print URL is built from the text between the first and second
        ``-`` in ``url``.  If ``url`` contains no ``-`` it is returned
        unchanged, so the article is still fetched from its original address.
        """
        # Ask this recipe instance for a browser instead of calling
        # get_browser() on the BasicNewsRecipe class without an instance.
        br = self.get_browser()
        br.open(url)

        parts = url.split('-')
        if len(parts) < 2:
            self.log.warn('Cannot derive print URL, no "-" in:', url)
            return url

        print_url = 'http://www.calcalist.co.il/Ext/Comp/ArticleLayout/CdaArticlePrintPreview/1,2506,L-' + parts[1]
        self.log.debug('Article URL:', url, '-> print URL:', print_url)
        return print_url
marbs is offline   Reply With Quote