Thread: Calibre recipes
Old 07-07-2008, 02:08 PM   #26
kovidgoyal
creator of calibre
The attached recipe works for me with the command line:
Code:
feeds2lrf test.py
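Since the recipe logs in to WSJ, you will also need to supply your subscription credentials. Assuming the --username/--password switches of the feeds2* tools (feeds2lrf --help will confirm the exact option names), something like:
Code:
feeds2lrf --username you@example.com --password yourpassword test.py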
Recipe:
Code:
##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 

import time
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class WallStreetJournalPaper(BasicNewsRecipe):
    
    title = 'Wall Street Print Edition' 
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1    
    max_articles_per_feed = 200
    INDEX = 'http://online.wsj.com/page/2_0133.html'
    timefmt  = ' [%a, %b %d, %Y]' 
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']
    issue_date = time.ctime()

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user']     = self.username
            br['password'] = self.password
            br.submit()
        return br
   
    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in  
        [ 
        ## Remove anything before the body of the article. 
        (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'), 
 
        ## Remove any insets from the body of the article. 
        (r'<div id="inset".*?</div>.?</div>.?<p', lambda match : '<p'), 
 
        ## Remove anything after the end of the article. 
        (r'<!-- article end.*?</body>', lambda match : '</body>'), 
        ] 
    ] 
 
 
     
    def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        issue_date = time.ctime()

        ## The index page links each article with an <a class="bold80"> tag.
        for item in soup.findAll('a', attrs={'class':'bold80'}):
            if not item.has_key('href'):
                continue
            url = item['href']
            url = 'http://online.wsj.com' + url.replace('/article', '/article_print')
            title = self.tag_to_string(item)
            description = ''
            articles.append({
                'title':title,
                'date':issue_date,
                'url':url,
                'description':description,
                })

        return [("Today's Paper", articles)]
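If you want to tweak the cleanup patterns, here is a minimal standalone sketch of the same preprocess_regexps run against a made-up HTML snippet (not real WSJ markup), just to show what each substitution strips:
Code:
import re

## Same patterns as in the recipe above.
regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
    [
    ## Remove anything before the body of the article.
    (r'<body.*?<!-- article start', lambda match: '<body><!-- article start'),
    ## Remove any insets from the body of the article.
    (r'<div id="inset".*?</div>.?</div>.?<p', lambda match: '<p'),
    ## Remove anything after the end of the article.
    (r'<!-- article end.*?</body>', lambda match: '</body>'),
    ]
]

## Made-up page: navigation junk, the article, an inset box, footer junk.
html = ('<html><body><div id="nav">junk</div><!-- article start -->'
        '<p>First paragraph.</p>'
        '<div id="inset"><p>Inset box</p></div> </div> <p>Second paragraph.</p>'
        '<!-- article end --><div id="footer">junk</div></body></html>')

for pat, func in regexps:
    html = pat.sub(func, html)

## Leaves only:
## <html><body><!-- article start --><p>First paragraph.</p><p>Second paragraph.</p></body></html>
print(html)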