Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 01-02-2013, 08:36 PM   #1
rainrdx
Enthusiast
rainrdx can differentiate black from dark navy blue
 
Posts: 49
Karma: 13316
Join Date: Jul 2012
Device: iPad
The Economist Recipe Update

Fixed the cover

Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict

import re

class Economist(BasicNewsRecipe):
    """Calibre news recipe for the print edition of The Economist.

    The issue index at ``INDEX`` is scraped for sections and their articles,
    ``/print`` is appended to every article URL, and after download the
    image-holding ``<table>`` elements are rewritten as captioned ``<div>``s.
    """

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    # Page listing the current issue; fetched by economist_parse_index().
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css      = '''
        .headline {font-size: x-large;}
        h2 { font-size: small;  }
        h1 { font-size: medium;  }
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside:avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        '''
    oldest_article = 7.0
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(id='ec-article-body')]
    no_stylesheets = True
    # Discard everything that follows the closing </html> tag.
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

    # economist.com has started throttling after about 60% of the total has
    # downloaded with connection reset by peer (104) errors.
    delay = 1

    needs_subscription = False
    # The login code below is disabled: it lives inside a bare string literal
    # and is never executed. Kept for reference only.
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username and self.password:
            br.open('http://www.economist.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            if '>Log out<' not in raw:
                raise ValueError('Failed to login to economist.com. '
                        'Check your username and password.')
        return br
    '''


    def parse_index(self):
        """Return the feed list produced by :meth:`economist_parse_index`."""
        return self.economist_parse_index()

    def economist_parse_index(self):
        """Scrape the issue index into a list of feeds.

        Also sets ``self.cover_url`` when an ``issue-image`` div with an
        image is present on the index page.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where each article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``.

        Raises:
            Exception: if no article could be found in any section.
        """
        soup = self.index_to_soup(self.INDEX)
        div = soup.find('div', attrs={'class':'issue-image'})
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                # Swap the thumbnail image URL for its full-size variant.
                self.cover_url = re.sub('thumbnail','full',img['src'])
        # Ordered so that sections keep the order they have on the page.
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
            x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s'%section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class':'article'}):
                # The nearest preceding <h5> sibling names the subsection; it
                # stays in effect until another article finds a new one.
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection+': ') if subsection else ''
                a = node.find('a', href=True)
                if a is None:
                    continue
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.economist.com'+url
                url += '/print'
                title = self.tag_to_string(a)
                if title:
                    title = prefix + title
                    self.log('\tFound article:', title)
                    articles.append({'title':title, 'url':url,
                        'description':'', 'date':''})

            if articles:
                # Sections sharing a title are merged into a single feed.
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() exists on both Python 2 and 3, unlike iteritems().
        ans = list(feeds.items())
        if not ans:
            raise Exception('Could not find any articles, either the '
                    'economist.com server is having trouble and you should '
                    'try later or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans

    def eco_find_image_tables(self, soup):
        """Yield tables that look like image-plus-caption wrappers.

        A table qualifies when it is aligned ``right`` or ``center`` and
        contains one or two ``<font>`` tags and exactly one ``<img>``.
        """
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        """Strip ``<body>`` attributes and flatten image tables.

        Args:
            soup: parsed article document; modified in place.
            first: not used by this implementation.

        Returns:
            The same ``soup`` object.
        """
        body = soup.find('body')
        if body is not None:
            # Snapshot the names first: ``del body[name]`` removes entries
            # from ``body.attrs``, and deleting while iterating that same
            # sequence skips every other attribute.
            for name in [name for name, _ in body.attrs]:
                del body[name]

        # Materialise the matches up front because the loop replaces the
        # tables in the tree it was searching.
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            # Detach the image from the table before re-parenting it.
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
rainrdx is offline   Reply With Quote
Old 01-17-2013, 10:17 PM   #2
rainrdx
Enthusiast
rainrdx can differentiate black from dark navy blue
 
Posts: 49
Karma: 13316
Join Date: Jul 2012
Device: iPad
Fixed the empty article issue

Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from collections import OrderedDict

import re

class Economist(BasicNewsRecipe):
    """Calibre news recipe for the print edition of The Economist.

    The issue index at ``INDEX`` is scraped for sections and their articles,
    ``/print`` is appended to every article URL, and after download the
    image-holding ``<table>`` elements are rewritten as captioned ``<div>``s.
    """

    title = 'The Economist'
    language = 'en'

    __author__ = "Kovid Goyal"
    # Page listing the current issue; fetched by economist_parse_index().
    INDEX = 'http://www.economist.com/printedition'
    description = ('Global news and current affairs from a European'
            ' perspective. Best downloaded on Friday mornings (GMT)')
    extra_css      = '''
        .headline {font-size: x-large;}
        h2 { font-size: small;  }
        h1 { font-size: medium;  }
        .pullquote {
            float: right;
            font-size: larger;
            font-weight: bold;
            font-style: italic;
            page-break-inside:avoid;
            border-bottom: 3px solid black;
            border-top: 3px solid black;
            width: 228px;
            margin: 0px 0px 10px 15px;
            padding: 7px 0px 9px;
        }
        '''
    oldest_article = 7.0
    remove_tags = [
            dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
            dict(attrs={'class':['dblClkTrk', 'ec-article-info',
                'share_inline_header', 'related-items', 'bottom-links', 'expanded-list white-palette typog-list-exp related-items', 'source', 'secondary-header grey-header size-compact']}),
            {'class': lambda x: x and 'share-links-header' in x},
    ]
    keep_only_tags = [dict(name='article'), dict(id='ec-article-body')]
    no_stylesheets = True
    # Discard everything that follows the closing </html> tag.
    preprocess_regexps = [(re.compile('</html>.*', re.DOTALL),
        lambda x:'</html>')]

    # economist.com has started throttling after about 60% of the total has
    # downloaded with connection reset by peer (104) errors.
    delay = 1
    simultaneous_downloads = 1

    needs_subscription = False
    # The login code below is disabled: it lives inside a bare string literal
    # and is never executed. Kept for reference only.
    '''
    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username and self.password:
            br.open('http://www.economist.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            res = br.submit()
            raw = res.read()
            if '>Log out<' not in raw:
                raise ValueError('Failed to login to economist.com. '
                        'Check your username and password.')
        return br
    '''


    def parse_index(self):
        """Return the feed list produced by :meth:`economist_parse_index`."""
        return self.economist_parse_index()

    def economist_parse_index(self):
        """Scrape the issue index into a list of feeds.

        Also sets ``self.cover_url`` when an ``issue-image`` div with an
        image is present on the index page.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where each article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``.

        Raises:
            Exception: if no article could be found in any section.
        """
        soup = self.index_to_soup(self.INDEX)
        div = soup.find('div', attrs={'class':'issue-image'})
        if div is not None:
            img = div.find('img', src=True)
            if img is not None:
                # Swap the thumbnail image URL for its full-size variant.
                self.cover_url = re.sub('thumbnail','full',img['src'])
        # Ordered so that sections keep the order they have on the page.
        feeds = OrderedDict()
        for section in soup.findAll(attrs={'class':lambda x: x and 'section' in
            x}):
            h4 = section.find('h4')
            if h4 is None:
                continue
            section_title = self.tag_to_string(h4).strip()
            if not section_title:
                continue
            self.log('Found section: %s'%section_title)
            articles = []
            subsection = ''
            for node in section.findAll(attrs={'class':'article'}):
                # The nearest preceding <h5> sibling names the subsection; it
                # stays in effect until another article finds a new one.
                subsec = node.findPreviousSibling('h5')
                if subsec is not None:
                    subsection = self.tag_to_string(subsec)
                prefix = (subsection+': ') if subsection else ''
                a = node.find('a', href=True)
                if a is None:
                    continue
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.economist.com'+url
                url += '/print'
                title = self.tag_to_string(a)
                if title:
                    title = prefix + title
                    self.log('\tFound article:', title)
                    articles.append({'title':title, 'url':url,
                        'description':'', 'date':''})

            if articles:
                # Sections sharing a title are merged into a single feed.
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() exists on both Python 2 and 3, unlike iteritems().
        ans = list(feeds.items())
        if not ans:
            raise Exception('Could not find any articles, either the '
                    'economist.com server is having trouble and you should '
                    'try later or the website format has changed and the '
                    'recipe needs to be updated.')
        return ans

    def eco_find_image_tables(self, soup):
        """Yield tables that look like image-plus-caption wrappers.

        A table qualifies when it is aligned ``right`` or ``center`` and
        contains one or two ``<font>`` tags and exactly one ``<img>``.
        """
        for x in soup.findAll('table', align=['right', 'center']):
            if len(x.findAll('font')) in (1,2) and len(x.findAll('img')) == 1:
                yield x

    def postprocess_html(self, soup, first):
        """Strip ``<body>`` attributes and flatten image tables.

        Args:
            soup: parsed article document; modified in place.
            first: not used by this implementation.

        Returns:
            The same ``soup`` object.
        """
        body = soup.find('body')
        if body is not None:
            # Snapshot the names first: ``del body[name]`` removes entries
            # from ``body.attrs``, and deleting while iterating that same
            # sequence skips every other attribute.
            for name in [name for name, _ in body.attrs]:
                del body[name]

        # Materialise the matches up front because the loop replaces the
        # tables in the tree it was searching.
        for table in list(self.eco_find_image_tables(soup)):
            caption = table.find('font')
            img = table.find('img')
            div = Tag(soup, 'div')
            div['style'] = 'text-align:left;font-size:70%'
            ns = NavigableString(self.tag_to_string(caption))
            div.insert(0, ns)
            div.insert(1, Tag(soup, 'br'))
            del img['width']
            del img['height']
            # Detach the image from the table before re-parenting it.
            img.extract()
            div.insert(2, img)
            table.replaceWith(div)
        return soup
rainrdx is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Economist recipe jdomingos76 Recipes 1 03-08-2011 08:33 AM
Problems with economist recipe lady kay Calibre 1 08-06-2010 07:49 AM
Economist (Free) Recipe geneaber Calibre 2 01-08-2010 09:21 PM
Economist Free Recipe geneaber Calibre 10 12-31-2009 03:45 PM
Economist Recipe - broken? dieterpops Calibre 1 02-20-2009 09:14 PM


All times are GMT -4. The time now is 09:36 AM.


MobileRead.com is a privately owned, operated and funded community.