Thank you for the link. I did read it, but understood very little. However, in the section
"4. Getting obfuscated content" they mention a JavaScript function. But where should I copy this? (If I add it to my recipe, I get an error.) So I think something is missing.
I made some changes to the recipe - just copied and pasted from those instructions:
Spoiler:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010'
'''
dnevnik.si
'''
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
class Dnevnik(BasicNewsRecipe):
    '''
    Calibre news recipe for dnevnik.si.

    Articles are treated as obfuscated: instead of the regular article page,
    the recipe downloads the "?version=print" variant of each article URL
    (see get_obfuscated_article).
    '''
    title = u'Dnevnik.si'
    __author__ = 'Test'
    description = 'News'
    oldest_article = 5
    max_articles_per_feed = 20
    no_stylesheets = True
    use_embedded_content = False
    # Makes calibre fetch every article through get_obfuscated_article().
    articles_are_obfuscated = True

    cover_url = 'http://www.dnevnik.si/dsg/dnevnik.si.gif'

    # NOTE(review): these selectors are applied to whatever page
    # get_obfuscated_article() saved. If the print version does not use the
    # same ids/classes, keep_only_tags will strip everything and the ebook
    # will be empty - confirm against the actual print page markup.
    keep_only_tags = [dict(name='div', attrs={'id': ['content', 'heading']})]
    remove_tags = [
        dict(name='div', attrs={'id': 'header'}),
        dict(name='div', attrs={'class': ['related', 'tools', 'inside']}),
        dict(name='dl', attrs={'class': 'ad'}),
    ]
    remove_tags_after = [dict(id='_iprom_inStream')]

    feeds = [
        (u'Izpostavljene novice', u'http://www.dnevnik.si/rss/?articleType=9'),
        (u'Slovenija', u'http://www.dnevnik.si/rss/?articleType=13'),
        (u'Svet', u'http://www.dnevnik.si/rss/?articleType=14'),
        (u'Kronika', u'http://www.dnevnik.si/rss/?articleType=15'),
        (u'Pop/kultura', u'http://www.dnevnik.si/rss/?articleType=17'),
        (u'Zdravje', u'http://www.dnevnik.si/rss/?articleType=18'),
    ]

    def get_obfuscated_article(self, url):
        '''
        Download the print version of an article into a temporary file.

        :param url: URL of the article as found in the feed.
        :return: Path of a temporary HTML file holding the downloaded page.
        '''
        # Local imports, matching the original's local "import mechanize".
        # PersistentTemporaryFile was used without being imported, which
        # raised NameError for every article.
        import mechanize
        from calibre.ptempfile import PersistentTemporaryFile

        br = self.get_browser()
        # Visit the regular page first so the follow-up request is made from
        # an established browser session.
        br.open(url)

        # Append the print switch without corrupting an existing query string.
        separator = '&' if '?' in url else '?'
        print_url = url + separator + 'version=print'
        response = br.follow_link(
            mechanize.Link(base_url='', url=print_url, text='', tag='', attrs=[])
        )
        html = response.read()

        # Register the file in self.temp_files before writing, as the
        # original did; the suffix no longer contains a stray space.
        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(html)
        finally:
            # Close even if the write fails so the handle is not leaked.
            tmp.close()
        return tmp.name
Now my ebook is empty.
I tried TamperData, and the "javascript:window.print()" link calls the same URL as the article. So there is no way - at least none that I know of - to see the "print version" in the browser.