MobileRead Forums - View Single Post

kovidgoyal · 07-09-2008, 12:56 PM

I'm guessing you still can't figure out the log out code? You should just log in using a browser, save the index page to disk, and use that saved copy to develop the parse_index function. Here's a working recipe:

Code:

##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class WallStreetJournalPaper(BasicNewsRecipe):
    """Recipe that builds a single feed from the WSJ "Today's Paper" index.

    The index page at ``INDEX`` is scanned for article links, each of which
    is rewritten to its print-friendly URL before being downloaded.
    """

    title = 'Wall Street Print Edition'
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1
    max_articles_per_feed = 200
    INDEX = 'http://online.wsj.com/page/us_in_todays_paper.html'
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']
    # Evaluated once, when the class body is executed.
    issue_date = time.ctime()

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    def get_browser(self):
        """Return the base-class browser, logged in when credentials are set.

        The login form is only submitted when both ``self.username`` and
        ``self.password`` are not None; otherwise the browser is returned
        unauthenticated.
        """
        # Delegate to the parent recipe class. Names imported in a class body
        # are not visible from inside its methods, so the parent is reached
        # through super() rather than through a class-level import.
        br = super(WallStreetJournalPaper, self).get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    # (pattern, replacement) pairs; every pattern is compiled
    # case-insensitively and with '.' matching newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def parse_index(self):
        """Build the article list from the index page.

        Returns a one-element list holding a ``(feed_title, articles)``
        tuple, where ``articles`` is a list of dicts with the keys
        ``title``, ``date``, ``url`` and ``description``.
        """
        soup = self.index_to_soup(self.INDEX)
        articles = []

        for a in soup.findAll('a', href=True, attrs={'class': 'bold80'}):
            # NOTE(review): assumes the href is site-relative, since the host
            # is prepended unconditionally -- confirm against the live page.
            # Swapping '/article' for '/article_print' selects the print view.
            url = 'http://online.wsj.com' + a['href'].replace(
                '/article', '/article_print')
            articles.append({
                'title': self.tag_to_string(a),
                'date': '',
                'url': url,
                'description': '',
            })
        return [('Todays Paper', articles)]