Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes


Thread Tools Search this Thread
Old 09-25-2017, 03:00 PM   #1
desUBIKado began at the beginning.
Posts: 22
Karma: 12
Join Date: Feb 2009
Location: Zaragoza, Spain
Device: prs-505, iliad
Updates - El Periódico de Aragón, El Correo, Heraldo de Aragón [ES]

Hi there:

Here are updated recipes for these three Spanish-language newspapers from Spain:

El Periódico de Aragón


#!/usr/bin/env  python
# -*- coding: utf-8 -*-

# Module metadata for the "El Periodico de Aragon" calibre recipe.
__license__     = 'GPL v3'
__copyright__   = '04 December 2010, desUBIKado'
__author__      = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__     = 'v0.10'
__date__        = '09, September 2017'

import re

# The module path was missing from the import statement (a bare
# "from import ..." is a syntax error); calibre recipes import their base
# class from calibre.web.feeds.news.
from calibre.web.feeds.news import BasicNewsRecipe

class elperiodicodearagon(BasicNewsRecipe):
    """Calibre news recipe for the newspaper "El Periodico de Aragon".

    NOTE(review): every URL literal in this copy of the recipe (feed
    addresses, masthead, cover index page and cover base URL) is an empty
    string. They look like they were lost when the recipe was pasted and
    must be restored before the recipe can download anything.
    """

    title                 = u'El Periodico de Aragon'
    __author__            = u'desUBIKado'
    description           = u'Noticias desde Aragon'
    publisher             = u''
    category              = u'news, politics, Spain, Aragon'
    oldest_article        = 1
    delay                 = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    language              = 'es'
    masthead_url          = ''
    encoding              = 'iso-8859-1'
    remove_empty_feeds    = True
    remove_javascript     = True

    # (section title, feed URL) pairs; the list was previously left unclosed.
    feeds = [
        (u'Portada', u''),
        (u'Arag\xf3n', u''),
        (u'Internacional', u''),
        (u'Espa\xf1a', u''),
        (u'Econom\xeda', u''),
        (u'Deportes', u''),
        (u'Real Zaragoza', u''),
        (u'Tecnyconta Zaragoza', u''),
        (u'Monta\xf1ismo', u''),
        (u'Opini\xf3n', u''),
        (u'Tema del d\xeda', u''),
        (u'Escenarios', u''),
        (u'Sociedad', u''),
        (u'Gente', u''),
        (u'Espacio 3', u''),
        (u'Fiestas del Pilar', u''),
        (u'Semana Santa', u''),
        (u'La crónica de Valdejal\xf3n', u''),
        (u'La crónica de Campo de Borja', u''),
        (u'La crónica de Ejea y sus pueblos', u''),
        (u'La crónica del Bajo Gállego', u''),
        (u'La crónica del Campo de Cariñena', u''),
        (u'La crónica de la Ribera Alta del Ebro', u''),
        (u'La crónica del Campo de Belchite', u''),
    ]

    # Keep only the article container and drop everything outside it.
    remove_tags_before = dict(name='div', attrs={'class': 'Pagina'})
    remove_tags_after  = dict(name='div', attrs={'class': 'ComentariosNew'})
    keep_only_tags     = [dict(name='div', attrs={'class': 'Pagina'})]

    # Elements removed from inside the kept content; the list was previously
    # left unclosed.
    remove_tags = [
        dict(name='nav', attrs={'class': ['Compartir', 'HerramientasConversacion Herramientas']}),
        dict(name='h5', attrs={'class': ['CintilloBox']}),
        dict(name='div', attrs={'class': ['BoxMenu BoxMenuConFoto', 'BxGalerias', 'ConStick', 'HerramientasComentarioNew Herramientas', 'NumeroComentarioNew']}),
        dict(name='div', attrs={'class': ['BoxPestanas', 'Box', 'ColumnaDerecha', 'NoticiasRelacionadasDeNoticia', 'CintilloNoticiasRelacionadasDeNoticia']}),
        dict(name='a', attrs={'class': ['IrA BotonLink']}),
    ]

    # Fetch the print-edition front page (the format=1 image has a higher
    # resolution).
    def get_cover_url(self):
        """Return the cover image URL found on the index page, or None.

        The index page is parsed and the first ``<img>`` whose ``src``
        starts with ``/funciones/img-public.php?key=`` is returned, prefixed
        with the site base URL.
        """
        # NOTE(review): the index URL and the base URL prepended to the image
        # path are both empty in this copy; restore them.
        index = ''
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            if image['src'].startswith('/funciones/img-public.php?key='):
                return '' + image['src']
        return None

    # Heading styles; the triple-quoted string was previously unterminated.
    extra_css = '''
                    h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
                    h2 {font-family:Arial,Helvetica,sans-serif; font-style:italic;font-size:14px;color:#4D4D4D;}
                    h3 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:18px;}
                '''

El Correo

#!/usr/bin/env  python
# -*- coding: utf-8 -*-
# The encoding declaration above is needed because the recipe contains
# non-ASCII characters in string literals and comments.

# Module metadata for the "El Correo" calibre recipe.
__license__     = 'GPL v3'
__copyright__   = '08 Januery 2011, desUBIKado'
__author__      = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__     = 'v0.14'
__date__        = '10, September 2017'

import time
import re

# The module path was missing from the import statement (a bare
# "from import ..." is a syntax error); calibre recipes import their base
# class from calibre.web.feeds.news.
from calibre.web.feeds.news import BasicNewsRecipe

class elcorreo(BasicNewsRecipe):
    """Calibre news recipe for the newspaper "El Correo".

    NOTE(review): every URL-like literal in this copy of the recipe (feed
    addresses, masthead, team host names and URL prefixes, cover URLs) is an
    empty string. They look like they were lost when the recipe was pasted
    and must be restored before the recipe can work.
    """

    author                = 'desUBIKado'
    description           = 'Daily newspaper from Biscay'
    title                 = u'El Correo'
    publisher             = 'Vocento'
    category              = 'News, politics, culture, economy, general interest'
    oldest_article        = 1
    delay                 = 1
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    masthead_url          = ''
    language              = 'es'
    timefmt               = '[%a, %d %b, %Y]'
    encoding              = 'utf-8'
    remove_empty_feeds    = True
    remove_javascript     = True

    # (section title, feed URL) pairs; the list was previously left unclosed.
    feeds = [
        (u'Portada', u''),
        (u'Mundo', u''),
        (u'Bizkaia', u''),
        (u'Guipuzkoa', u''),
        (u'Araba', u''),
        (u'La Rioja', u''),
        (u'Miranda', u''),
        (u'Economía', u''),
        (u'Culturas', u''),
        (u'Politica', u''),
        (u'Tecnología', u''),
        (u'Gente - Estilo', u''),
        (u'Planes', u''),
        (u'Athletic', u''),
        (u'Alavés', u''),
        (u'Bilbao Basket', u''),
        (u'Baskonia', u''),
        (u'Deportes', u''),
        (u'Jaiak', u''),
        (u'La Blanca', u''),
        (u'Aste Nagusia', u''),
        (u'Semana Santa', u''),
        (u'Festivales', u''),
    ]

    # Keep only the main article column (list previously left unclosed).
    keep_only_tags = [
        dict(name='div', attrs={'class': ['col-xs-12 col-sm-12 col-md-8 col-lg-8']}),
    ]

    # Elements removed from inside the kept content (list previously left
    # unclosed).
    remove_tags = [
        dict(name='div', attrs={'class': ['voc-topics voc-detail-grid ', 'voc-newsletter ', 'voc-author-social']}),
        dict(name='section', attrs={'class': ['voc-ficha-detail voc-file-sports']}),
    ]

    remove_tags_before = dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'})
    remove_tags_after  = dict(name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'})

    # Article URLs already handed out, used to drop articles that appear in
    # more than one feed.
    _processed_links = []

    # (feed host name, replacement URL prefix, "external" query value) for the
    # sports-team feeds.
    # NOTE(review): host names and prefixes are empty in this copy; restore
    # them, otherwise no team link is ever rewritten.
    _team_rewrites = (
        ('', '', 'deportes/athletic'),
        ('', '', 'deportes/baskonia'),
        ('', '', 'deportes/bilbaobasket'),
        ('', '', 'deportes/alaves'),
    )

    def get_article_url(self, article):
        """Return the URL to download for ``article``, or None to skip it.

        The feed link is rewritten for the sports-team feeds, Alava links are
        unified with the Bizkaia ones, and a link that was already returned
        for another feed yields None so the article is not included twice.
        """
        link = article.get('link', None)
        if link is None:
            # No link available: return None (a URL is expected here, not the
            # article object itself).
            return None

        # Rewrite the URL of the sports-team news so that it works. The
        # pieces are taken from the original link split on '/'.
        parte = link.split('/')
        if len(parte) > 6:
            for team_host, prefix, external in self._team_rewrites:
                if parte[2] == team_host:
                    link = prefix + '/'.join(parte[3:7]) + '?external=' + external
                    break

        # Sometimes the same article appears in both the Alava and the
        # Bizkaia edition. To catch those duplicates the URLs are unified so
        # that they are always the Bizkaia ones (except for the "araba"
        # section).
        if len(parte) > 4 and parte[3] == 'alava' and parte[4] != 'araba':
            # NOTE(review): both replace() arguments are empty in this copy,
            # which makes this a no-op; restore the Alava/Bizkaia prefixes.
            link = link.replace('', '')

        # Skip the article if it was already included through another feed;
        # otherwise remember it. (Previously the link was never recorded and
        # every unseen link was turned into None, dropping all articles.)
        if link in self._processed_links:
            return None
        self._processed_links.append(link)
        return link

    # Fetch the print-edition front page (the format=1 image has a higher
    # resolution).
    def get_cover_url(self):
        """Return the URL of today's cover PDF, or the fallback image URL.

        The URL is built from the local date as ``<base>DDMMYYYY-viz.pdf``
        and probed with the recipe's browser; if opening it fails, the
        failure is logged and the fallback cover is returned instead.
        """
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        # NOTE(review): the base URL and the fallback cover URL are empty in
        # this copy; restore them.
        cover = '' + day + month + year + '-viz.pdf'
        br = BasicNewsRecipe.get_browser(self)
        # NOTE(review): the try/except and the br.open() probe were missing
        # (only the indented handler body was left); reconstructed from the
        # otherwise unused browser and the "cover not available" fallback.
        try:
            br.open(cover)
        except Exception:
            self.log("\nPortada no disponible")
            cover = ''
        return cover

    # Text styling; the triple-quoted string was previously unterminated.
    extra_css = '''
                    h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
                    h2 {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:16px;color:#4D4D4D;}
                    h3 {font-family:georgia,serif; font-weight:bold;font-size:18px;}
                '''

    # (pattern, replacement) pairs; the list was previously left unclosed.
    preprocess_regexps = [
        # Show the still image of embedded videos. The dot before "jpg" is
        # escaped so that only a literal ".jpg'," matches.
        (re.compile(r'stillURLVideo: \'', re.DOTALL | re.IGNORECASE), lambda match: '</script><img src="'),
        (re.compile(r'\.jpg\',', re.DOTALL | re.IGNORECASE), lambda match: '.jpg"><SCRIPT TYPE="text/JavaScript"'),
        # Remove the list bullet.
        (re.compile(r'<li class="destacada">', re.DOTALL | re.IGNORECASE), lambda match: '<div class="destacada"></div>'),
    ]


Heraldo de Aragón

#!/usr/bin/env  python
# -*- coding: utf-8 -*-
# The encoding declaration above is needed because the recipe contains
# non-ASCII characters in its comments.

# Module metadata for the "Heraldo de Aragon" calibre recipe.
__license__     = 'GPL v3'
__copyright__   = '04 December 2010, desUBIKado'
__author__      = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__     = 'v0.08'
__date__        = '10, September 2017'

import time
import re

# The module path was missing from the import statement (a bare
# "from import ..." is a syntax error); calibre recipes import their base
# class from calibre.web.feeds.news.
from calibre.web.feeds.news import BasicNewsRecipe

class heraldo(BasicNewsRecipe):
    """Calibre news recipe for the newspaper "Heraldo de Aragon".

    NOTE(review): every URL literal in this copy of the recipe (feed address,
    masthead, cover URLs) is an empty string. They look like they were lost
    when the recipe was pasted and must be restored before the recipe can
    work.
    """

    author                = 'desUBIKado'
    description           = 'Daily newspaper from Aragon'
    title                 = u'Heraldo de Aragon'
    publisher             = 'Grupo Heraldo'
    category              = 'News, politics, culture, economy, general interest'
    language              = 'es'
    timefmt               = '[%a, %d %b, %Y]'
    oldest_article        = 2
    delay                 = 1
    max_articles_per_feed = 100
    use_embedded_content  = False
    masthead_url          = ''
    remove_empty_feeds    = True
    remove_javascript     = True
    no_stylesheets        = True

    # (section title, feed URL) pairs; the list was previously left unclosed.
    feeds = [
        (u'Noticias', u''),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': ['row-f2 brd-row-f4 bck-row-f1-f1 padd-t padd-btt con n-marg-btt']}),
        dict(name='div', attrs={'id': ['dts', 'com']}),
        dict(name='img', attrs={'class': ['lazy']}),
    ]

    remove_tags = [
        dict(name='a', attrs={'class': ['com flo-r', 'enl-if', 'enl-df', 'next_com']}),
        dict(name='div', attrs={'class': ['brb-b-s con marg-btt', 'cnt-rel con', 'col5-f1', 'tit txt-wh f-s con', 'con cont-top ', 'col5-f1 flo-l', 'cnt-rel brr', 'caj_part con', 'caj_topic con']}),
        dict(name='div', attrs={'id': ['cont-Top-8760', 'caj-pub', '8760-cpt1', 'caj_topic con', 'slider-oferplan', 'cont-Top-']}),
        dict(name='form', attrs={'class': 'form'}),
        dict(name='ul', attrs={'class': ['tabs-nav', 'men_nav con hg_2n', 'lst-not-f2 con  ']}),
        dict(name='span', attrs={'class': ['flo-r']}),
        dict(name='ul', attrs={'id': ['cont-tags', 'pag-1', 'pag-cnt-I-']}),
    ]

    remove_tags_before = dict(name='div', attrs={'id': 'dts'})
    remove_tags_after  = dict(name='div', attrs={'id': 'com'})

    def get_cover_url(self):
        """Return the URL of today's cover image, or the fallback image URL.

        The URL is built from the local date as
        ``<base>YYYY/MM/DD/es/heraldo_aragon.750.jpg`` and probed with the
        recipe's browser; if opening it fails, the failure is logged and the
        fallback cover is returned instead.
        """
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        # NOTE(review): the base URL and the fallback cover URL are empty in
        # this copy; restore them.
        cover = '' + year + '/' + month + '/' + day + '/es/heraldo_aragon.750.jpg'
        br = BasicNewsRecipe.get_browser(self)
        # NOTE(review): the try/except and the br.open() probe were missing
        # (only the indented handler body was left); reconstructed from the
        # otherwise unused browser and the "cover not available" fallback.
        try:
            br.open(cover)
        except Exception:
            self.log("\nPortada no disponible")
            cover = ''
        return cover

    # Text styling; the triple-quoted string was previously unterminated.
    # The last selector of the byline rule was the quoted string
    # ".com flo-r", which is not valid CSS and invalidates the whole rule;
    # it is now the compound class selector .com.flo-r.
    extra_css = '''
                    h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
                    h2 {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:22px;color:#4D4D4D;}
                    .ladillo {font-family:georgia,serif; font-weight:bold;font-size:18px;}
                    .firm, .sp, .fech, .com.flo-r {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;}
                    img{margin-bottom: 0.4em}
                '''

    # (pattern, replacement) pairs; the list was previously left unclosed.
    preprocess_regexps = [
        # Separate the comments with a blank line.
        (re.compile(r'<div class="tit-f2">', re.DOTALL | re.IGNORECASE), lambda match: '<br /><br /><div class="tit-f2">'),
        (re.compile(r'<div id="com"', re.DOTALL | re.IGNORECASE), lambda match: '<br><div id="com"'),
        # Make the lazily-loaded article images visible.
        (re.compile(r'<img class="lazy" data-original="', re.DOTALL | re.IGNORECASE), lambda match: '<img src="'),
    ]



desUBIKado is offline   Reply With Quote

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Updated recipe: Heraldo de Aragon [ES] desUBIKado Recipes 0 12-02-2013 02:33 PM
Updated recipes - El Correo & El periódico de Aragón [ES] desUBIKado Recipes 0 08-09-2013 05:33 AM
Updated recipe: Heraldo de Aragon [ES] desUBIKado Recipes 0 06-30-2012 09:58 AM
Updated recipe: El periódico de Aragón (Spanish) desUBIKado Recipes 0 02-10-2011 02:17 PM

All times are GMT -4. The time now is 11:24 AM. MobileRead is a privately owned, operated and funded community.