[[SOLVED]]
--Hi all,
I have been trying to create a recipe (modified from an outdated one by Mr. Mellink) to download articles from the Dutch newspaper Volkskrant.
I have been able to adapt the title reading and index creation of the original script to the new newspaper layout. However, the login part doesn't seem to work. This may have something to do with the fact that the site uses specific functions for its form. Perhaps you can help me.
I have pasted the login code of the recipe and the form code from the login URL below. The form loads (I can print the hidden values), but after the submit command, articles still refer to the login page (i.e. not logged in).
I just can't seem to persuade the login to work correctly. I hope someone has some ideas.
The part of the recipe that loads the form:
Code:
class Volkskrant_full(BasicNewsRecipe):
    """Calibre recipe for de Volkskrant (login part only).

    Holds the recipe settings and the ``get_browser`` override that signs in
    through the ``UserLogin`` form served at ``LOGIN``.
    """

    title = strftime('Volkskrant: %Y%m%d')
    __author__ = u'Jaap Mellink'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True
    # Index of today's edition (first page) and the base URL of its articles.
    INDEX_MAIN = strftime('http://www.volkskrant.nl/vk-online/VK/%Y%m%d___/1_001/#text')
    INDEX_ARTICLE = strftime('http://www.volkskrant.nl/vk-online/VK/%Y%m%d___/1_001/')
    # Page that serves the login form.
    LOGIN = 'http://www.volkskrant.nl/gatekeeper/login.jsp'
    remove_tags = [dict(name='address')]

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        When both ``self.username`` and ``self.password`` are set, the login
        page is opened and the form named ``UserLogin`` is filled in and
        submitted. The browser is returned in every case, so a recipe run
        without credentials still gets a usable (anonymous) browser.
        """
        # The base implementation is called with the recipe instance, as in
        # the calibre news-recipe documentation.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            # Select the form once; the original's second select_form call
            # existed only to feed a debug print.
            br.select_form(name="UserLogin")
            br['userName'] = self.username
            br['password'] = self.password
            br.submit()
        return br
Then there is the form from the website (source at
http://www.volkskrant.nl/gatekeeper/login.jsp):
Code:
<!-- Login form of the gatekeeper page. It is POSTed to /action, and its name "UserLogin" is what the recipe passes to select_form(name="UserLogin"). -->
<form action="/action" method="post" name="UserLogin" id="UserLogin">
<!-- Hidden fields submitted together with the credentials. -->
<input type="hidden" name="action" value="login"/>
<input type="hidden" name="goto" value="/gatekeeper/view-profile.jsp"/>
<input type="hidden" name="source" value="/gatekeeper/login.jsp"/>
<input type="hidden" name="entree" value="nwsl"/>
<input type="hidden" name="success" value="/gatekeeper/view-profile.jsp"/>
<div class="left">
<h3 class="">Gebruikersnaam:<span class="mandatory">*</span></h3>
</div>
<div class="right">
<!-- Username field ("Gebruikersnaam"); the recipe sets it via br['userName']. Limited to 28 characters. -->
<input type="text" name="userName" id="userName" size="30" maxlength="28" value="" class="formfield"/>
</div>
<br />
<div class="left">
<h3 class="">Wachtwoord:<span class="mandatory">*</span></h3>
</div>
<div class="right">
<!-- Password field ("Wachtwoord"); the recipe sets it via br['password']. Limited to 28 characters. -->
<input type="password" name="password" id="password" maxlength="28" size="30" class="formfield"/>
</div>
<div class="clear"></div>
<div class="plain">
<!-- "Save your username and password on this computer (recommended)" checkbox; checked by default. -->
<input name="saveuserIdPassword" type="checkbox" value="yes" checked="checked"/>Uw gebruikersnaam en wachtwoord opslaan op deze computer (aanbevolen)
</div>
<br/>
<!-- Image submit button. Its onclick handler calls validateData(), whose definition is not part of this snippet. -->
<!-- NOTE(review): a client that does not execute JavaScript never runs validateData(); confirm whether it changes the form before submission. -->
<input type="image" src="/gatekeeper/images/but-login.gif" alt="login" onclick="validateData()"/>
<div class="plain">
<ul class="links">
<!-- Link for users without an account ("No login yet? Register"). -->
<li><b><a href="/gatekeeper/register_only.jsp">Nog geen login? Registreren</a></b></li>
</ul>
</div>
<br/>
<br/>
</form>
And finally, for reference, the full recipe so far:
Code:
from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from BeautifulSoup import BeautifulStoneSoup
from calibre.web.feeds.news import BasicNewsRecipe
class Volkskrant_full(BasicNewsRecipe):
    """Calibre recipe that downloads today's edition of de Volkskrant.

    ``get_browser`` signs in through the ``UserLogin`` form served at
    ``LOGIN``; ``parse_index`` then walks the page selector of the online
    paper and returns one feed per newspaper page.
    """

    title = strftime('Volkskrant: %Y%m%d')
    __author__ = u'Jaap Mellink'
    description = u"Volkskrant"
    oldest_article = 30
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 1
    delay = 1
    needs_subscription = True
    # Index of today's edition (first page) and the base URL of its articles.
    INDEX_MAIN = strftime('http://www.volkskrant.nl/vk-online/VK/%Y%m%d___/1_001/#text')
    INDEX_ARTICLE = strftime('http://www.volkskrant.nl/vk-online/VK/%Y%m%d___/1_001/')
    # Page that serves the login form.
    LOGIN = 'http://www.volkskrant.nl/gatekeeper/login.jsp'
    # Example article URL and alternative tag filters, kept for reference:
    #TEST = 'http://www.volkskrant.nl/vk-online/VK/20100109___/1_001/article9_text.html'
    #keep_only_tags = [ dict(name='div', attrs={'class':'page'})]
    #keep_only_tags = []
    #remove_tags = [{'class':['info']}, dict(name='address')]
    remove_tags = [dict(name='address')]
    #keep_only_tags = [{'class':['article HorizontalHeader',
    #                   'articlecontent','photoBox', 'article columnist first']}, ]

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        When both ``self.username`` and ``self.password`` are set, the login
        page is opened and the form named ``UserLogin`` is filled in and
        submitted. The browser is returned in every case.
        """
        # The base implementation is called with the recipe instance, as in
        # the calibre news-recipe documentation.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            # Select the form once; the original's second select_form call
            # existed only to feed a debug print.
            br.select_form(name="UserLogin")
            br['userName'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of feeds for today's edition.

        Returns:
            A list of ``(feed_title, articles)`` tuples, one per ``<option>``
            of the page selector. ``articles`` is a list of dicts with the
            keys ``title``, ``date``, ``url`` and ``description``.

        Raises:
            ValueError: if the index page has no ``td#select_page_top``
                element, so no page list can be read from it.
        """
        print('Processing ' + self.INDEX_MAIN)
        soup = self.index_to_soup(self.INDEX_MAIN)
        mainsoup = soup.find('td', attrs={'id': 'select_page_top'})
        if mainsoup is None:
            # Fail with an explicit message instead of an AttributeError on
            # None. NOTE(review): presumably this is what happens when the
            # login did not succeed and another page is served - confirm.
            raise ValueError(
                'No page selector (td#select_page_top) found at '
                + self.INDEX_MAIN
            )

        # Both values are the same for every page, so compute them once.
        edition_url = strftime('http://www.volkskrant.nl/vk-online/VK/%Y%m%d___/')
        date = strftime(' %B %Y')

        krant = []
        for option in mainsoup.findAll('option'):
            page = option['value']
            page_url = edition_url + page + '/'
            print('Processing ' + page)
            page_soup = self.index_to_soup(page_url + '#text')

            articles = []
            for item in page_soup.findAll('area'):
                # assumes the class attribute is a plain string - TODO
                # confirm for the BeautifulSoup version in use.
                art_nr = item['class']
                # The class of the matching title <div> is derived from the
                # <area> class: its first 11 characters, '_section' plus the
                # first character of the page value, then '_' and everything
                # after character 12.
                attrname = (art_nr[:11] + '_section' + page[:1]
                            + '_' + art_nr[12:])
                index_title = page_soup.find('div', attrs={'class': attrname})
                if index_title is None:
                    # No title block for this area: nothing to list.
                    continue
                # Keep only the text before the first colon. partition()
                # returns the whole string when there is no colon, where the
                # original index loop ran past the end and raised IndexError.
                title = (index_title.get('title') or '').partition(':')[0]
                if title != '':
                    articles.append({
                        'title': title,
                        'date': date,
                        'url': page_url + attrname + '.html#text',
                        'description': '',
                    })
            krant.append((option.string, articles))
        return krant