# Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
import mechanize
class AdvancedUserRecipe1283848012(BasicNewsRecipe):
    """Calibre news recipe for the Hebrew-language site Ynet (ynet.co.il).

    Articles are discovered through the RSS feeds listed in ``feeds`` and are
    downloaded from the site's print-preview pages (see ``print_version``).
    Before the print URL is returned, the regular article page is requested
    once (see ``_visit_article_page`` and ``description``).
    """

    title = u'Ynet'
    __author__ = 'marbs'
    description = 'This is a recipe of Ynet.co.il. The recipe opens the article page and clicks on an advertisement to not hurt the sites advertising income.'
    cover_url = 'http://www.bneiakiva.net/uploads/images/ynet%282%29.jpg'
    # calibre documents this attribute as an ISO-639 language code rather
    # than a translated display name.
    language = 'he'

    # Download limits and date formatting.
    oldest_article = 1
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    timefmt = '[%a, %d %b, %Y]'

    # Page clean-up.
    remove_javascript = True
    remove_attributes = ['width']
    # calibre documents keep_only_tags as a list of tag specifications.
    keep_only_tags = [dict(name='div', attrs={'id': 'articleContainer'})]
    # NOTE(review): this spec matches <p> tags carrying a ``text`` attribute,
    # not <p> tags whose text content is a space -- confirm the intent.
    remove_tags = [dict(name='p', attrs={'text': [' ']})]
    # Drop paragraphs that contain nothing but a single space.
    preprocess_regexps = [
        (re.compile(r'<p> </p>', re.DOTALL | re.IGNORECASE), lambda match: ''),
    ]

    # Force right-to-left layout for the article content.
    extra_css = 'img {max-width:100%;direction: rtl;} #article{direction: rtl;} div{direction: rtl;} title{direction: rtl; } article_description{direction: rtl; } a.article{direction: rtl; } calibre_feed_description{direction: rtl; } body{direction: ltr;}'

    feeds = [
        (u'חדשות', u'http://www.ynet.co.il/Integration/StoryRss2.xml'),
        (u'כלכלה', u'http://www.ynet.co.il/Integration/StoryRss6.xml'),
        (u'צרכנות', u'http://www.ynet.co.il/Integration/StoryRss437.xml'),
        (u'ספורט', u'http://www.ynet.co.il/Integration/StoryRss3.xml'),
        (u'תרבות', u'http://www.ynet.co.il/Integration/StoryRss538.xml'),
        (u'מעורבות וחברה', u'http://www.ynet.co.il/Integration/StoryRss3262.xml'),
        (u'בריאות', u'http://www.ynet.co.il/Integration/StoryRss1208.xml'),
        (u'ירוק', u'http://www.ynet.co.il/Integration/StoryRss4872.xml'),
        (u'מחשבים', u'http://www.ynet.co.il/Integration/StoryRss544.xml'),
        (u'רכב', u'http://www.ynet.co.il/Integration/StoryRss550.xml'),
        (u'תיירות', u'http://www.ynet.co.il/Integration/StoryRss598.xml'),
        (u'הורים', u'http://www.ynet.co.il/Integration/StoryRss3052.xml'),
        (u'אוכל', u'http://www.ynet.co.il/Integration/StoryRss975.xml'),
        (u'יהדות', u'http://www.ynet.co.il/Integration/StoryRss4403.xml'),
        (u'מדע וטבע', u'http://www.ynet.co.il/Integration/StoryRss2142.xml'),
        (u'יחסים', u'http://www.ynet.co.il/Integration/StoryRss3925.xml'),
        (u'דעות', u'http://www.ynet.co.il/Integration/StoryRss194.xml'),
    ]

    def preprocess_html(self, soup):
        """Set ``dir="rtl"`` on the <html> and <body> tags and return *soup*.

        A tag that is missing from the document is skipped instead of
        raising.
        """
        for tag in (soup.html, soup.body):
            if tag is not None:
                tag['dir'] = 'rtl'
        return soup

    def _visit_article_page(self, url):
        """Request the regular article page at *url* once (best effort).

        Opens *url* and then follows a synthetic ``<a id="buzzerATop">`` link
        that points at the same URL. According to ``description`` this is
        meant to register the visit / advertisement click with the site.
        NOTE(review): the followed link targets the article URL itself, not an
        advertisement URL -- confirm this is the intended request.

        Any failure is logged and swallowed so that it cannot prevent the
        print version from being resolved.
        """
        try:
            # Must be called on the instance: calling it on the class fails
            # when get_browser is an instance method.
            br = self.get_browser()
            br.open(url)
            # mechanize expects attrs as a sequence of (name, value) pairs.
            ad_link = mechanize.Link(base_url='', url=url, text='', tag='a',
                                     attrs=[('id', 'buzzerATop')])
            br.follow_link(ad_link)
        except Exception as err:
            self.log.warning('Could not visit article page', url, '-', err)

    def print_version(self, url):
        """Return the print-preview URL for the article at *url*.

        The print URL is a fixed prefix followed by the part of *url* that
        lies between its first and second ``-``. If *url* contains no ``-``
        it is returned unchanged, so the regular page is used.
        """
        # Remove this call to stop requesting the regular article page.
        self._visit_article_page(url)

        parts = url.split('-')
        if len(parts) < 2:
            self.log.warning('No article id found in URL, using it as is:', url)
            return url

        print_url = 'http://www.ynet.co.il/Ext/Comp/ArticleLayout/CdaArticlePrintPreview/1,2506,L-' + parts[1]
        self.log.debug('Print version of', url, 'is', print_url)
        return print_url