The attached recipe works for me when run from the command line.
Recipe:
Code:
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import time
import re
## from libprs500.ebooks.lrf.web.profiles import DefaultProfile
## from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class WallStreetJournalPaper(BasicNewsRecipe):
    """Fetch the current print edition of the Wall Street Journal.

    Scrapes the print-edition index page for article links, rewrites each
    link to its printer-friendly variant, and logs in first when the recipe
    is given a username/password.
    """

    title = 'Wall Street Print Edition'
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1
    max_articles_per_feed = 200
    # Landing page that lists today's print-edition articles.
    INDEX = 'http://online.wsj.com/page/2_0133.html'
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = False
    # NOTE: the original wrapped the string in redundant parentheses
    # (('--ignore-tables') is just a string, not a tuple).
    html2lrf_options = ['--ignore-tables']
    # Timestamp taken when the recipe class is created.
    issue_date = time.ctime()
    # Don't grab articles more than 7 days old.
    oldest_article = 7

    def get_browser(self):
        """Return a browser, logged in to online.wsj.com when credentials are set.

        BUG FIX: the original called DefaultProfile.get_browser() without an
        instance (and the DefaultProfile import was commented out at module
        level); use the inherited BasicNewsRecipe implementation instead.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    # Strip everything outside the article body and drop inset boxes before
    # the HTML is converted.
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            # Remove anything before the body of the article.
            (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
            # Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
            # Remove anything after the end of the article.
            (r'<!-- article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    def parse_index(self):
        """Build the feed list by scraping the print-edition index page.

        Returns a single-section list: [('Todays Paper', [article dicts])].
        """
        articles = []
        soup = self.index_to_soup(self.INDEX)
        issue_date = time.ctime()
        for item in soup.findAll('a', attrs={'class': 'bold80'}):
            # BUG FIX: `item` is already the <a> tag; the original searched
            # for a nested <a> (always None) before reading item['href'].
            if not item.has_key('href'):
                continue
            url = 'http://online.wsj.com' + \
                item['href'].replace('/article', '/article_print')
            title = self.tag_to_string(item)
            articles.append({
                'title': title,
                # BUG FIX: the original referenced an undefined name `date`,
                # which raised NameError; use the timestamp computed above.
                'date': issue_date,
                'url': url,
                'description': '',
            })
        return [('Todays Paper', articles)]