View Single Post
Old 09-18-2012, 07:16 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
London Review of Books recipe updated

This is an update to the built-in recipe. I made some aesthetic changes.

Code:

__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
lrb.co.uk
'''
import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class LondonReviewOfBooksPayed(BasicNewsRecipe):
    """Calibre recipe for the subscription edition of the London Review of Books.

    The recipe logs in with the configured credentials, follows the cover
    link on the site's home page to the current issue, and builds a single
    feed from the links in that issue's article list.
    """

    title                 = 'London Review of Books'
    __author__            = 'Rich Shang, Darko Miletic'
    description           = 'Subscription content. Literary review publishing essay-length book reviews and topical articles on politics, literature, history, philosophy, science and the arts by leading writers and thinkers'
    category              = 'news, literature, UK'
    publisher             = 'LRB Ltd.'
    max_articles_per_feed = 100
    language              = 'en_GB'
    no_stylesheets        = True
    delay                 = 1
    use_embedded_content  = False
    encoding              = 'utf-8'
    INDEX                 = 'http://www.lrb.co.uk'
    LOGIN                 = INDEX + '/login'
    masthead_url          = INDEX + '/assets/images/lrb_logo_big.gif'
    needs_subscription    = True
    publication_type      = 'magazine'
    extra_css             = ' body{font-family: Georgia,Palatino,"Palatino Linotype",serif} '

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [dict(name='div', attrs={'class': ['article-body indent', 'letters']})]
    remove_attributes = ['width', 'height']

    # Greedy on purpose: removes everything from the first '<' to the last
    # '>' in a fragment, matching the behaviour of the original pattern.
    _TAG_RE = re.compile(r'<.*>')
    # HTML serializers differ in whether a line break is written as
    # '<br />', '<br/>' or '<br>'; accept all of them.
    _BR_RE = re.compile(r'<br\s*/?>')

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        The login form is selected by position (``nr=1``, i.e. the second
        form on the login page) and filled with ``self.username`` and
        ``self.password``.
        """
        # The base implementation is an instance method, so ``self`` must be
        # passed explicitly when calling it through the class.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=1)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the article list for the current issue.

        Side effects: sets ``self.timefmt`` from the cover date shown on the
        home page (when it can be parsed) and ``self.cover_url`` from the
        cover image (when a cover block is present).

        Returns:
            A one-element list ``[(feed_title, articles)]`` where each
            article is a dict with ``title``, ``date``, ``url`` and
            ``description`` keys. ``articles`` is empty when the home page
            has no cover block or the issue page has no article list.
        """
        articles = []
        lrbtitle = self.title
        soup = self.index_to_soup(self.INDEX)

        # The issue date is the text after the line break inside the
        # 'coverdate' span. Leave timefmt untouched if the span is missing
        # or contains no line break, instead of raising IndexError.
        date_parts = self._BR_RE.split(
            str(soup.find('span', attrs={'class': 'coverdate'})))
        if len(date_parts) > 1:
            self.timefmt = ' [%s]' % self._TAG_RE.sub('', date_parts[1])

        cover_item = soup.find('p', attrs={'class': 'cover'})
        if not cover_item:
            return [(lrbtitle, articles)]

        # Swap the '/m/' path segment for '/l/' in the cover image URL.
        self.cover_url = re.sub('/m/', '/l/', cover_item.a.img['src'])
        issue_soup = self.index_to_soup(self.INDEX + cover_item.a['href'])
        lrbtitle = issue_soup.head.title.string

        article_list = issue_soup.find(attrs={'class': 'article-list'})
        if article_list is None:
            return [(lrbtitle, articles)]

        # Same value for every article, so compute it once.
        date = strftime(self.timefmt)

        for link in article_list.findAll('a', attrs={'class': 'title'}):
            href = link.get('href')
            if href is None:
                continue
            url = self.INDEX + href

            # A title link may hold two parts separated by a line break;
            # join them with ' - '. Otherwise use the link's plain text.
            title_parts = self._BR_RE.split(str(link))
            if len(title_parts) > 1:
                title = (self._TAG_RE.sub('', title_parts[0]) + ' - ' +
                         self._TAG_RE.sub('', title_parts[1]))
            else:
                title = self.tag_to_string(link)

            # Use the next <li> as the description only when it contains a
            # <cite> and does not contain a nested <ul>.
            description = u''
            desc = link.findNext('li')
            if (desc is not None and desc.find('cite') is not None
                    and desc.find('ul') is None):
                description = self.tag_to_string(desc)

            articles.append({
                'title': title,
                'date': date,
                'url': url,
                'description': description,
            })
        return [(lrbtitle, articles)]
rainrdx is offline   Reply With Quote