08-24-2012, 01:15 PM
rainrdx
Time Magazine Recipe

I'd been wondering for a while why this recipe was broken, so I took some time and fixed it up.

Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'

'''
time.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from lxml import html

class Time(BasicNewsRecipe):
    #recipe_disabled = ('This recipe has been disabled as TIME no longer'
    #        ' publish complete articles on the web.')
    title                 = u'Time'
    __author__            = 'Kovid Goyal, Rick Shang'
    description           = ('Weekly US magazine.')
    encoding = 'utf-8'
    no_stylesheets        = True
    language = 'en'
    remove_javascript     = True
    needs_subscription = True

    keep_only_tags = [
            {
                'class':['tout1', 'entry-content', 'external-gallery-img', 'image-meta']
            },
        ]
    remove_tags = [
            {'class':['thumbnail', 'button']},
            ]

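    # Follow links that match the patterns below so the remaining pages of
    # multi-page articles (and special-package articles) are fetched as well.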
    recursions = 10
    match_regexps = [r'/[0-9,]+-(2|3|4|5|6|7|8|9)(,\d+){0,1}.html',r'http://www.time.com/time/specials/packages/article/.*']

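    # Strip self-closing <meta .../> tags from the downloaded pages before parsing.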
    preprocess_regexps = [(re.compile(
        r'<meta .+/>'), lambda m:'')]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        # This site uses javascript in its login process
        if self.username is not None and self.password is not None:
            br.open('http://www.time.com/time/magazine')
            br.select_form(predicate=lambda f: 'action' in f.attrs and
                    f.attrs['action'] == 'https://auth.time.com/login.php')
            br['username'] = self.username
            br['password'] = self.password
            br['magcode'] = ['TD']
            # turl/rurl are readonly redirect fields; unlock them before setting
            br.find_control('turl').readonly = False
            br['turl'] = 'http://www.time.com/time/magazine'
            br.find_control('rurl').readonly = False
            br['rurl'] = 'http://www.time.com/time/magazine'
            br['remember'] = False
            br.submit()

        return br

    def parse_index(self):
        # The index page is fetched twice: once raw for lxml parsing below, and
        # once as a soup to pull the issue date for the title.
        raw = self.index_to_soup('http://www.time.com/time/magazine', raw=True)
        soup = self.index_to_soup('http://www.time.com/time/magazine')
        dates = self.tag_to_string(soup.find('time', attrs={'class':'updated'}))
        self.timefmt = ' [%s]' % dates
        root = html.fromstring(raw)
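        # Follow the "View Large Cover" link to pick up the full-size cover image.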
        img = root.xpath('//a[.="View Large Cover" and @href]')
        if img:
            cover_url = 'http://www.time.com' + img[0].get('href')
            try:
                nsoup = self.index_to_soup(cover_url)
                img = nsoup.find('img', src=re.compile('archive/covers'))
                if img is not None:
                    self.cover_url = img['src']
            except:
                self.log.exception('Failed to fetch cover')


        feeds = []
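        # Each "sec-mag-section" block on the index page becomes one section of the ebook.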
        parent = root.xpath('//div[@class="content-main-aside"]')[0]
        for sec in parent.xpath(
                'descendant::section[contains(@class, "sec-mag-section")]'):
            h3 = sec.xpath('./h3')
            if h3:
                section = html.tostring(h3[0], encoding=unicode,
                        method='text').strip().capitalize()
                self.log('Found section', section)
                articles = list(self.find_articles(sec))
                if articles:
                    feeds.append((section, articles))

        return feeds

    def find_articles(self, sec):

        for article in sec.xpath('./article'):
            h2 = article.xpath('./*[@class="entry-title"]')
            if not h2: continue
            a = h2[0].xpath('./a[@href]')
            if not a: continue
            title = html.tostring(a[0], encoding=unicode,
                        method='text').strip()
            if not title: continue
            # Rewrite article URLs to the subscriber printout version, which
            # should carry the full article text for subscribers.
            url = re.sub('/magazine/article/0,9171',
                    '/subscriber/printout/0,8816', a[0].get('href'))
            if url.startswith('/'):
                url = 'http://www.time.com'+url
            desc = ''
            p = article.xpath('./*[@class="entry-content"]')
            if p:
                desc = html.tostring(p[0], encoding=unicode,
                        method='text')
            self.log('\t', title, ':\n\t\t', desc)
            yield {
                    'title' : title,
                    'url'   : url,
                    'date'  : '',
                    'description' : desc
                    }
