I'm guessing you still can't figure out the login code? You should just log in using a browser, save the index page to disk and use that to develop the parse_index function. Here's a working recipe:
Code:
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import time
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class WallStreetJournalPaper(BasicNewsRecipe):
    title = 'Wall Street Print Edition'
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1
    max_articles_per_feed = 200
    INDEX = 'http://online.wsj.com/page/us_in_todays_paper.html'
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']
    issue_date = time.ctime()

    ## Don't grab articles more than 7 days old
    oldest_article = 7
    def get_browser(self):
        br = DefaultProfile.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]
    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        for a in soup.findAll('a', href=True, attrs={'class':'bold80'}):
            url = a['href']
            url = 'http://online.wsj.com' + url.replace('/article', '/article_print')
            title = self.tag_to_string(a)
            description = ''
            articles.append({
                'title'       : title,
                'date'        : '',
                'url'         : url,
                'description' : description,
            })
        return [("Today's Paper", articles)]
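If you saved the logged-in index page to disk as suggested above, you can exercise the same link extraction outside of a full download before wiring it into parse_index. This is just a minimal standalone sketch; the filename wsj_index.html is only an example, and inside the recipe you'd keep using self.index_to_soup and self.tag_to_string rather than the raw BeautifulSoup calls.
Code:
## Standalone test of the parse_index logic against a saved copy of the
## index page (assumes you saved it as 'wsj_index.html'; adjust the name).
from calibre.ebooks.BeautifulSoup import BeautifulSoup

raw = open('wsj_index.html', 'rb').read()
soup = BeautifulSoup(raw)
for a in soup.findAll('a', href=True, attrs={'class':'bold80'}):
    url = 'http://online.wsj.com' + a['href'].replace('/article', '/article_print')
    title = ''.join(a.findAll(text=True)).strip()
    print(title + ' -> ' + url)
Once the printed titles and URLs look right, drop the same findAll loop into parse_index as shown in the recipe above.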