View Single Post
Old 05-17-2016, 06:42 AM   #2
Bola de Fogo
Bola de Fogo began at the beginning.
Posts: 21
Karma: 10
Join Date: May 2016
Device: Kindle Paper White
Hi guys,

I've improved this old recipe for Folha de São Paulo (no login required).

Cover, sections and columnists are all updated.

Note: I did not include all available blogs, but it is easy to add the ones you like.

# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from urllib2 import Request, urlopen, URLError

from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from calibre.web.feeds.news import BasicNewsRecipe

class FolhaOnline(BasicNewsRecipe):
    """Calibre news recipe for the Brazilian newspaper Folha de São Paulo.

    NOTE(review): every URL literal in this copy of the recipe is an empty
    string (cover prefix, screenshot, masthead and all feed addresses).
    They must be filled in before the recipe can download anything.
    """

    # Value sent as ``userid=`` in the User-agent header of the cover
    # request; empty by default, put your own key here.
    THUMBALIZR_API = ''
    LANGUAGE = 'pt_br'   # passed to conversion_options['language']
    language = 'pt_BR'
    LANGHTM = 'pt-br'    # written into the Content-Language <meta> tag
    ENCODING = 'cp1252'
    ENCHTM = 'iso-8859-1'  # written into the Content-Type <meta> tag
    directionhtm = 'ltr'
    requires_version = (0, 7, 47)
    news = True

    title = u'Folha de S\xE3o Paulo improved'
    __author__ = 'Euler Alves and Alex Mitrani, improved by Bola de Fogo'
    description = u'Brazilian news from Folha de S\xE3o Paulo'
    publisher = u'Folha de S\xE3o Paulo'
    category = 'news, rss'

    oldest_article = 4
    max_articles_per_feed = 200
    summary_length = 1000

    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False
    remove_empty_feeds = True
    timefmt = ' [%d %b %Y (%a)]'

    html2lrf_options = [
        '--comment', description,
        '--category', category,
        '--publisher', publisher,
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    # Evaluated once, when the class body is executed.
    hoje = datetime.now()
    pubdate = hoje.strftime('%a, %d %b')
    # Before 06:00 the cover path is built from the previous day's date.
    if hoje.hour < 6:
        hoje = hoje - timedelta(days=1)
    # NOTE(review): the leading '' is where the cover site's base URL belongs;
    # it is empty in this copy, so CAPA is only a relative path.
    CAPA = '' + hoje.strftime('%Y') + '/' + hoje.strftime('%m') + '/' + hoje.strftime('%d') + '/br/br_folha_spaulo.200.jpg'
    SCREENSHOT = ''
    cover_margins = (0, 0, 'white')
    masthead_url = ''

    keep_only_tags = [
        dict(name='div', attrs={'id': 'articleNew'}),
        dict(name='article', attrs={'id': 'news'}),
    ]

    # (title, feed URL) pairs.  NOTE(review): all feed URLs are empty here.
    feeds = [
        (u'Em cima da hora', u''),
        (u'Poder', u''),
        (u'Cotidiano', u''),
        (u'Mercado', u''),
        (u'Mundo', u''),
        (u'Esporte', u''),
        (u'Comida', u''),
        (u'Tec', u''),
        (u'Ilustrada', u''),
        (u'Ambiente', u''),
        (u'Opiniao', u''),
        (u'Ci\xEAncia', u''),
        (u'Equil\xEDbrio e Sa\xFAde', u''),
        (u'Elio Gaspari', u''),
        (u'Tati Bernardi', u''),
        (u'PVC', u''),
        (u'Clóvis Rossi', u''),
        (u'Hélio Schwartsman', u''),
        (u'Humberto Luiz Peron', u''),
        (u'João Pereira Coutinho', u''),
        (u'Cony', u''),
        (u'Juca', u''),
        (u'Viniciu Torres Freitas', u''),
        (u'Monica Bergamo', u''),
        (u'Vinicius Mota', u''),
        (u'Bernardo Guimaraes', u''),
        (u'Tostao', u''),
        (u'Valdo Cruz', u''),
    ]

    conversion_options = {
        'title': title,
        'comments': description,
        'publisher': publisher,
        'tags': category,
        'language': LANGUAGE,
        'linearize_tables': True,
    }

    def preprocess_html(self, soup):
        """Strip inline styles and make sure language/charset metas exist.

        Removes every ``style`` attribute, then adds ``Content-Language`` and
        ``Content-Type`` ``<meta http-equiv>`` tags at the top of ``<head>``
        when the document does not already carry them.

        :param soup: parsed article page.
        :return: the same soup object, modified in place.
        """
        for item in soup.findAll(style=True):
            del item['style']

        head = soup.find('head')
        if head is not None:
            if not soup.find(attrs={'http-equiv': 'Content-Language'}):
                meta0 = Tag(soup, 'meta', [("http-equiv", "Content-Language"), ("content", self.LANGHTM)])
                # The tag was previously built and then discarded.
                head.insert(0, meta0)
            if not soup.find(attrs={'http-equiv': 'Content-Type'}):
                meta1 = Tag(soup, 'meta', [("http-equiv", "Content-Type"), ("content", "text/html; charset=" + self.ENCHTM)])
                head.insert(0, meta1)
        return soup

    def postprocess_html(self, soup, first):
        """Rotate wide images so they fit a portrait screen.

        Every ``<img>`` with a ``src`` is opened from that path; an image
        that is wider than tall and wider than 590 pixels is rotated by
        -90 degrees and written back to the same path.

        :param soup: parsed article page; ``src`` values are assumed to be
            paths the image library can open.
        :param first: unused here.
        :return: the soup, unchanged (only the image files are rewritten).
        """
        # Select by tag name and attribute presence; the former filter
        # ``'img' and 'src' in tag`` reduced to ``'src' in tag``.
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            img = Image()
            # Without loading the file the size is that of an empty image
            # and the rotation branch can never trigger.
            img.open(iurl)
            width, height = img.size
            print('img is:  %s width is:  %s height is:  %s' % (iurl, width, height))
            if width > height and width > 590:
                print('Rotate image')
                img.rotate(PixelWand(), -90)
                # Persist the rotation; otherwise it is lost with ``img``.
                img.save(iurl)
        return soup

    def get_cover_url(self):
        """Return the URL of today's front-page image.

        The cover address is probed with a browser-like User-agent; the
        method returns ``self.CAPA`` whether or not the probe succeeds.

        :return: ``self.CAPA``.
        """
        cover_url = self.CAPA
        pedido = Request(self.CAPA)
        pedido.add_header('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; ' + self.LANGHTM + '; userid=' + self.THUMBALIZR_API + ') Calibre/0.8.47 (like Gecko)')
        try:
            resposta = urlopen(pedido)
            try:
                soup = BeautifulSoup(resposta)
            finally:
                resposta.close()
            if soup.find('body') is not None:
                # NOTE(review): the statement that belonged to this branch is
                # missing from this copy of the recipe, so an HTML answer is
                # currently treated the same as a successful image download.
                pass
            return cover_url
        except URLError:
            return cover_url
Bola de Fogo is offline   Reply With Quote