The attached recipe works for me when run from the command line.
Recipe:
Code:
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import time
import re
## from libprs500.ebooks.lrf.web.profiles import DefaultProfile
## from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class WallStreetJournalPaper(BasicNewsRecipe):
    """Fetch the current print edition of the Wall Street Journal.

    Scrapes the print-edition index page for article links, rewrites each
    link to its printer-friendly variant, and logs in first when the recipe
    is given a username/password.
    """

    title = 'Wall Street Print Edition'
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1
    max_articles_per_feed = 200
    # Landing page that lists today's print-edition articles.
    INDEX = 'http://online.wsj.com/page/2_0133.html'
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = False
    # NOTE: the original wrapped the string in redundant parentheses
    # (('--ignore-tables') is just a string, not a tuple).
    html2lrf_options = ['--ignore-tables']
    # Timestamp taken when the recipe class is created.
    issue_date = time.ctime()
    # Don't grab articles more than 7 days old.
    oldest_article = 7

    def get_browser(self):
        """Return a browser, logged in to online.wsj.com when credentials are set.

        BUG FIX: the original called DefaultProfile.get_browser() without an
        instance (and the DefaultProfile import was commented out at module
        level); use the inherited BasicNewsRecipe implementation instead.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    # Strip everything outside the article body and drop inset boxes before
    # the HTML is converted.
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
            # Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
            # Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def parse_index(self):
        """Build the feed list by scraping the print-edition index page.

        Returns a single-section list: [('Todays Paper', [article dicts])].
        """
        articles = []
        soup = self.index_to_soup(self.INDEX)
        issue_date = time.ctime()
        for item in soup.findAll('a', attrs={'class': 'bold80'}):
            # BUG FIX: `item` is already the <a> tag; the original searched
            # for a nested <a> (always None) before reading item['href'].
            if not item.has_key('href'):
                continue
            url = 'http://online.wsj.com' + \
                item['href'].replace('/article', '/article_print')
            title = self.tag_to_string(item)
            articles.append({
                'title': title,
                # BUG FIX: the original referenced an undefined name `date`,
                # which raised NameError; use the timestamp computed above.
                'date': issue_date,
                'url': url,
                'description': '',
            })
        return [('Todays Paper', articles)]