MobileRead Forums - View Single Post

duhduhduh · 04-02-2017, 10:40 AM

Hi,

Code:

#!/usr/bin/env  python2
# -*- mode: python -*-
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2010-2017, Darko Miletic <darko.miletic at gmail.com>'
'''
www.ft.com/international-edition
'''

from calibre.web.feeds.news import BasicNewsRecipe
from collections import OrderedDict
from urllib import unquote


def classes(classes):
    """Return a tag-matching spec keyed on CSS class names.

    ``classes`` is a string of class names separated by single spaces. The
    result is ``{'attrs': {'class': predicate}}``. The predicate receives a
    tag's ``class`` attribute value and returns the set of names it shares
    with ``classes`` (empty, hence falsy, when there is no overlap). A falsy
    attribute value (``None`` or ``''``) is returned unchanged.
    """
    wanted = frozenset(classes.split(' '))

    def shares_wanted_class(value):
        # Tags without a class attribute (or with an empty one) never match.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': shares_wanted_class}}


class FinancialTimes(BasicNewsRecipe):
    """Recipe for the Financial Times international edition.

    The section/article index is scraped from ``INDEX``. When a username and
    password are configured, ``get_browser`` signs in through the two-step
    form at ``LOGIN`` and ``cleanup`` opens ``LOGOUT`` afterwards.
    """

    # Recipe metadata and download options consumed by the BasicNewsRecipe
    # base class (see the calibre recipe API documentation for their meaning).
    title = 'Financial Times (International) printed edition'
    __author__ = 'Darko Miletic'
    description = "The Financial Times (FT) is one of the world's leading business news and information organisations, recognised internationally for its authority, integrity and accuracy."  # noqa
    publisher = 'The Financial Times Ltd.'
    category = 'news, finances, politics, World'
    oldest_article = 2
    scale_news_images_to_device = True
    language = 'en'
    max_articles_per_feed = 250
    no_stylesheets = True
    use_embedded_content = False
    needs_subscription = True
    encoding = 'utf8'
    publication_type = 'newspaper'
    handle_gzip = True

    # URLs used by the methods below.
    LOGIN = 'https://accounts.ft.com/login?location=https%3A%2F%2Fwww.ft.com%2F'
    LOGOUT = 'https://myaccount.ft.com/logout'
    INDEX = 'http://www.ft.com/international-edition'
    # NOTE(review): PREFIX, useHighResImages and excludeSections are not
    # referenced by any method in this class; they are kept for compatibility.
    PREFIX = 'http://www.ft.com'
    useHighResImages = False
    compress_news_images = True
    compress_news_images_auto_size = 5
    excludeSections = ['life-arts']

    # Only elements carrying one of these classes are kept from each article.
    keep_only_tags = [
        classes('article__header--wrapper article__time-byline article__body n-content-image barrier-grid__heading')
    ]

    # Elements carrying one of these classes are dropped from each article.
    remove_tags = [
        classes('n-content-related-box tour-tip')
    ]

    remove_attributes = ['width', 'height', 'lang', 'style']

    def get_browser(self):
        """Return the base-class browser, signed in when credentials are set.

        The index page is always opened first. If both ``self.username`` and
        ``self.password`` are set, the login page is opened and the e-mail
        form and then the password form are filled in and submitted.
        """
        br = BasicNewsRecipe.get_browser(self)
        br.open(self.INDEX)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            # Step 1: submit the e-mail address.
            br.select_form(name='enter-email-form')
            br['email'] = self.username
            br.submit()
            # Step 2: submit the password on the page that follows.
            br.select_form(name='enter-password-form')
            br['password'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Scrape ``INDEX`` and return ``[(section_title, articles), ...]``.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys (the last two are always empty strings). Sections are
        returned in the order they are first seen on the page. Feed boxes
        without a ``<ul>`` and list items without a usable link are skipped
        rather than raising.
        """
        feeds = OrderedDict()
        soup = self.index_to_soup(self.INDEX)
        section_title = 'Untitled'

        for column in soup.findAll('div', attrs={'class': 'feedBoxes clearfix'}):
            for section in column.findAll('div', attrs={'class': 'feedBox'}):
                heading = self.tag_to_string(section.find('h4'))
                # A heading containing '...' does not start a new section; its
                # articles are filed under the most recent section title.
                if '...' not in heading:
                    section_title = heading

                listing = section.find('ul')
                if listing is None:
                    # Box without an article list: nothing to collect.
                    continue

                for item in listing.findAll('li'):
                    link = item.find('a')
                    url = link.get('href') if link is not None else None
                    if not url:
                        # No anchor, or an anchor without a target.
                        continue
                    feeds.setdefault(section_title, []).append({
                        'title': self.tag_to_string(link),
                        'url': url,
                        'description': '',
                        'date': '',
                    })

        # list(...items()) yields the same list of pairs on Python 2 and 3.
        return list(feeds.items())

    def preprocess_html(self, soup):
        """Give every ``<img>`` that has a ``srcset`` a derived ``src``.

        The first comma-separated ``srcset`` entry is taken; its text after
        the last ``/`` and before the first ``?`` is URL-decoded and stored
        as ``src``. Returns the modified soup.
        """
        for img in soup.findAll('img', srcset=True):
            src = img['srcset'].split(',')[0].strip()
            # NOTE(review): presumably the last path segment is the
            # percent-encoded URL of the original image - confirm against the
            # current FT markup.
            src = unquote(src.rpartition('/')[2].partition('?')[0])
            img['src'] = src
        return soup

    def cleanup(self):
        """Open the ``LOGOUT`` URL with the recipe's browser."""
        self.browser.open(self.LOGOUT)

Did I put it in the wrong place?