View Single Post
Old 01-02-2013, 08:36 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
The Economist Recipe Update

Fixed the cover

Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict

import re

class Economist(BasicNewsRecipe):

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css      = '''
        .headline {font-size: x-large;}
        h2 { font-size: small;  }
        h1 { font-size: medium;  }
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside:avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        '''
    oldest_article = 7.0
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

    # economist.com has started throttling after about 60% of the total has
    # downloaded with connection reset by peer (104) errors.
    delay = 1

    needs_subscription = False
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username and self.password:
            br.open('http://www.economist.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            if '>Log out<' not in raw:
                raise ValueError('Failed to login to economist.com. '
                        'Check your username and password.')
        return br
    '''


    def parse_index(self):
        return self.economist_parse_index()

    def economist_parse_index(self):
        soup = self.index_to_soup(self.INDEX)
        div = soup.find('div', attrs={'class':'issue-image'})
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                self.cover_url = re.sub('thumbnail','full',img['src'])
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
            x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s'%section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class':'article'}):
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection+': ') if subsection else ''
                a = node.find('a', href=True)
                if a is not None:
                    url = a['href']
                    if url.startswith('/'): url = 'http://www.economist.com'+url
                    url += '/print'
                    title = self.tag_to_string(a)
                    if title:
                        title = prefix + title
                        self.log('\tFound article:', title)
                        articles.append({'title':title, 'url':url,
                        'description':'', 'date':''})

            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        ans = [(key, val) for key, val in feeds.iteritems()]
        if not ans:
            raise Exception('Could not find any articles, either the '
                    'economist.com server is having trouble and you should '
                    'try later or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans

    def eco_find_image_tables(self, soup):
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        body = soup.find('body')
        for name, val in body.attrs:
            del body[name]

        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
rainrdx is offline   Reply With Quote