MobileRead Forums - View Single Post

rainrdx · 09-18-2012, 07:16 PM

This is an update to the built-in recipe. I made some aesthetic changes.

Code:


__license__   = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
lrb.co.uk
'''
import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

class LondonReviewOfBooksPayed(BasicNewsRecipe):
    """Recipe for the subscriber edition of the London Review of Books.

    When both a username and a password are supplied the browser logs in
    first.  The current issue is located through the cover link on the
    front page, and every titled link in that issue's article list becomes
    one article of a single feed.
    """

    title                 = 'London Review of Books'
    __author__            = 'Rich Shang, Darko Miletic'
    description           = (
        'Subscription content. Literary review publishing essay-length book '
        'reviews and topical articles on politics, literature, history, '
        'philosophy, science and the arts by leading writers and thinkers'
    )
    category              = 'news, literature, UK'
    publisher             = 'LRB Ltd.'
    max_articles_per_feed = 100
    language              = 'en_GB'
    no_stylesheets        = True
    delay                 = 1
    use_embedded_content  = False
    encoding              = 'utf-8'
    INDEX                 = 'http://www.lrb.co.uk'
    LOGIN                 = INDEX + '/login'
    masthead_url          = INDEX + '/assets/images/lrb_logo_big.gif'
    needs_subscription    = True
    publication_type      = 'magazine'
    extra_css             = ' body{font-family: Georgia,Palatino,"Palatino Linotype",serif} '

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    keep_only_tags = [dict(name='div', attrs={'class': ['article-body indent', 'letters']})]
    remove_attributes = ['width', 'height']

    # Greedy on purpose: removes everything from the first '<' to the last
    # '>' in a fragment, which is what the original pattern did.
    _TAG_RE = re.compile(r'<.*>')
    # Accept both '<br />' and '<br/>' so the split does not depend on how
    # the parser serialises the line break.
    _BR_RE = re.compile(r'<br\s*/?>')

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        The login page is opened and its second form (``nr=1``) is filled
        in and submitted only if both ``self.username`` and
        ``self.password`` are set.
        """
        # Called through the class, so the instance has to be passed
        # explicitly.
        # NOTE(review): current calibre documents get_browser as an instance
        # method - confirm against the calibre version being targeted.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(nr=1)
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the feed for the issue linked from the front-page cover.

        Side effects: sets ``self.timefmt`` from the cover-date span when
        one with a line break is present, and ``self.cover_url`` from the
        cover image (with ``/m/`` replaced by ``/l/`` in its path).

        Returns a one-element list ``[(feed_title, articles)]`` where each
        article is a dict with ``title``, ``date``, ``url`` and
        ``description`` keys.  The article list is empty when the front
        page has no cover link or the issue page has no article list.
        """
        articles = []
        soup = self.index_to_soup(self.INDEX)

        # The text after the line break in the cover-date span becomes the
        # date shown next to the title.  If the span is missing or has no
        # line break, the inherited timefmt is left untouched.
        date_tag = soup.find('span', attrs={'class': 'coverdate'})
        if date_tag is not None:
            date_parts = self._BR_RE.split(str(date_tag))
            if len(date_parts) > 1:
                self.timefmt = ' [%s]' % self._TAG_RE.sub('', date_parts[1])

        lrbtitle = self.title
        cover_item = soup.find('p', attrs={'class': 'cover'})
        cover_link = cover_item.a if cover_item is not None else None
        if cover_link is None:
            return [(lrbtitle, articles)]

        if cover_link.img is not None:
            self.cover_url = re.sub('/m/', '/l/', cover_link.img['src'])

        issue_soup = self.index_to_soup(self.INDEX + cover_link['href'])
        lrbtitle = issue_soup.head.title.string
        article_list = issue_soup.find(attrs={'class': 'article-list'})
        if article_list is None:
            return [(lrbtitle, articles)]

        # Same value for every article, so compute it once.
        date = strftime(self.timefmt)
        strip_tags = self._TAG_RE.sub

        for link in article_list.findAll('a', attrs={'class': 'title'}):
            href = link.get('href')
            if href is None:
                continue

            # A link whose markup contains a line break is rendered as
            # "<first part> - <second part>"; otherwise its text is used.
            parts = self._BR_RE.split(str(link))
            if len(parts) > 1:
                title = strip_tags('', parts[0]) + ' - ' + strip_tags('', parts[1])
            else:
                title = self.tag_to_string(link)

            # The next <li> is used as the description only when it holds
            # a <cite> and no nested <ul>.
            description = u''
            blurb = link.findNext('li')
            if (blurb is not None
                    and blurb.find('cite') is not None
                    and blurb.find('ul') is None):
                description = self.tag_to_string(blurb)

            articles.append({
                'title': title,
                'date': date,
                'url': self.INDEX + href,
                'description': description,
            })

        return [(lrbtitle, articles)]