Thread: Calibre recipes
View Single Post
Old 06-12-2008, 08:12 PM   #16
ddavtian
Addict
ddavtian has a complete set of Star Wars action figures.ddavtian has a complete set of Star Wars action figures.ddavtian has a complete set of Star Wars action figures.ddavtian has a complete set of Star Wars action figures.
 
Posts: 262
Karma: 332
Join Date: Nov 2003
Location: San Francisco, USA
Device: Sony 505 & 900, Kindle DX, Samsung Galaxy Tab, EVO
This is based on the published WSJ profile.
I have PM'ed you my login name and password; feel free to use them for testing/reading.


PHP Code:
##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License along
##    with this program; if not, write to the Free Software Foundation, Inc.,
##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 

import re
import time

## from libprs500.ebooks.lrf.web.profiles import DefaultProfile
## from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.lrf.web.profiles import DefaultProfile
from calibre.web.feeds.news import BasicNewsRecipe

class WallStreetJournalPaper(BasicNewsRecipe):
    """Recipe that collects the Wall Street Journal print-edition articles.

    The article list is scraped from the page at ``INDEX``; each article
    link is rewritten to its ``/article_print`` variant before download.
    """

    title = 'Wall Street Print Edition'
    __author__ = 'Kovid Goyal'
    simultaneous_downloads = 1
    max_articles_per_feed = 200
    INDEX = 'http://online.wsj.com/page/2_0133.html'
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = False
    html2lrf_options = ['--ignore-tables']

    ## Timestamp taken once, when the class body is executed.
    issue_date = time.ctime()
    ## Kept from the original post; single-argument call form prints the
    ## same text on Python 2 and Python 3.
    print(issue_date)

    ## Don't grab articles more than 7 days old
    oldest_article = 7

    def get_browser(self):
        """Return a browser, logged in to online.wsj.com when possible.

        The login form is only submitted when both ``self.username`` and
        ``self.password`` are set (presumably supplied by the base class
        or the caller -- verify).
        """
        ## NOTE(review): the browser is obtained from DefaultProfile even
        ## though this class derives from BasicNewsRecipe -- confirm that
        ## this is the intended factory for the targeted calibre version.
        br = DefaultProfile.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://online.wsj.com/login')
            br.select_form(name='login_form')
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    ## (compiled pattern, replacement function) pairs; every pattern is
    ## compiled case-insensitively and with '.' matching newlines.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ## Remove anything before the body of the article.
            (r'<body.*?<!-- article start',
             lambda match: '<body><!-- article start'),

            ## Remove any insets from the body of the article.
            (r'<div id="inset".*?</div>.?</div>.?<p',
             lambda match: '<p'),

            ## Remove anything after the end of the article.
            (r'<!-- article end.*?</body>',
             lambda match: '</body>'),
        ]
    ]

    def parse_index(self):
        """Build the article list from the index page at ``INDEX``.

        Every ``<a class="bold80">`` element that carries an ``href`` becomes
        one article dict with the keys ``title``, ``date``, ``url`` and
        ``description``.

        Returns a dict mapping the feed title ``'Todays Paper'`` to the
        list of article dicts.
        """
        soup = self.index_to_soup(self.INDEX)
        issue_date = time.ctime()

        articles = []
        for item in soup.findAll('a', attrs={'class': 'bold80'}):
            ## The matched element is itself the anchor, so the link is
            ## read from it directly; anchors without a link are skipped.
            href = item.get('href')
            if not href:
                continue
            articles.append({
                'title': self.tag_to_string(item),
                ## The original referenced an undefined name ``date`` here.
                'date': issue_date,
                'url': 'http://online.wsj.com'
                       + href.replace('/article', '/article_print'),
                'description': '',
            })

        ## NOTE(review): the dict return shape is kept from the original
        ## post; newer calibre documentation describes a list of
        ## (feed title, articles) tuples -- confirm for the target version.
        return {'Todays Paper': articles}
ddavtian is offline   Reply With Quote