Member
Posts: 22
Karma: 12
Join Date: Feb 2009
Location: Zaragoza, Spain
Device: prs-505, iliad
|
Updates for 4 Spanish news sources
elcorreo.com
Spoiler:
Code:
#!/usr/bin/env python
# Module-level metadata for the El Correo recipe: license, copyright notice,
# author, short description, recipe version and date of this revision.
# NOTE(review): 'Januery' in __copyright__ looks like a typo for 'January'.
__license__ = 'GPL v3'
__copyright__ = '08 Januery 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.13'
__date__ = '28, July 2016'
'''
http://www.elcorreo.com/
'''
import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class elcorreo(BasicNewsRecipe):
    """Calibre recipe for El Correo (elcorreo.com), a daily newspaper from Biscay.

    Articles are listed from the site's Atom feeds and downloaded from the
    mobile edition. A link that appears in several feeds is only kept the
    first time it is seen.
    """

    author = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 1
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'utf-8'
    remove_empty_feeds = True
    remove_javascript = True

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/bizkaia/rss/atom/portada'),
        (u'Mundo', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=internacional'),
        (u'Bizkaia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=bizkaia'),
        (u'Guipuzkoa', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=gipuzkoa'),
        (u'Araba', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=araba'),
        (u'La Rioja', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=larioja'),
        (u'Miranda', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=miranda'),
        (u'Economía', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=economia'),
        (u'Culturas', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=culturas'),
        (u'Politica', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=politica'),
        (u'Tecnología', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=tecnologia'),
        (u'Gente - Estilo', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=gente-estilo'),
        (u'Planes', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=planes'),
        (u'Athletic', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=athletic'),
        (u'Alavés', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=alaves'),
        (u'Bilbao Basket', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=bilbaobasket'),
        (u'Baskonia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=baskonia'),
        (u'Deportes', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=deportes'),
        (u'Jaiak', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=jaiak'),
        (u'La Blanca', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=la-blanca-vitoria'),
        (u'Aste Nagusia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=aste-nagusia-bilbao'),
        (u'Semana Santa', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=semana-santa'),
        (u'Festivales', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=festivales')
    ]

    keep_only_tags = [
        dict(name='article', attrs={'class': ['story media-list ']})
    ]

    remove_tags = [
        dict(name='span', attrs={'class': ['no-comments']}),
        dict(name='div', attrs={'class': ['compApoyosText compNoticiasR']})
    ]

    remove_tags_before = dict(name='article', attrs={'class': 'story media-list '})
    remove_tags_after = dict(name='article', attrs={'class': 'story media-list '})

    # Sports-team subdomains whose articles must be fetched through the mobile
    # site, mapped to the section name used in the "external" query parameter.
    _SPORTS_SECTIONS = {
        'athletic.elcorreo.com': 'athletic',
        'baskonia.elcorreo.com': 'baskonia',
        'bilbaobasket.elcorreo.com': 'bilbaobasket',
        'alaves.elcorreo.com': 'alaves',
    }

    # Links already returned by get_article_url (used to drop duplicates).
    _processed_links = set()

    def print_version(self, url):
        """Return the mobile-edition URL for *url* (www. -> m.)."""
        return url.replace('http://www.', 'http://m.')

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        * Entries without a link are skipped.
        * Links on a sports-team subdomain are rewritten to the mobile site so
          that they work, for example
          http://athletic.elcorreo.com/noticias/201407/27/x.html becomes
          http://m.elcorreo.com/noticias/201407/27/x.html?external=deportes/athletic
        * The same story can be published under both /alava/ and /bizkaia/;
          /alava/ links are normalised to /bizkaia/ (except for the "araba"
          section) so duplicates can be detected.
        * A link that was already returned for another feed yields None.
        """
        link = article.get('link', None)
        if not link:
            # Nothing to download; the entry itself is not a valid URL.
            return None

        parte = link.split('/')
        host = parte[2] if len(parte) > 2 else ''

        section = self._SPORTS_SECTIONS.get(host)
        # Only rewrite when all four path components are present; otherwise
        # leave the link untouched instead of failing on a short URL.
        if section is not None and len(parte) >= 7:
            link = ('http://m.elcorreo.com/' + '/'.join(parte[3:7]) +
                    '?external=deportes/' + section)

        # Slices avoid an IndexError on URLs with fewer path components.
        if parte[3:4] == ['alava'] and parte[4:5] != ['araba']:
            link = link.replace('elcorreo.com/alava', 'elcorreo.com/bizkaia')

        # Drop the article if another feed already included it.
        if link in self._processed_links:
            return None
        self._processed_links.add(link)
        return link

    def get_cover_url(self):
        """Return the URL of today's print front page, or the masthead logo.

        The front page is published as
        http://info.elcorreo.com/pdf/DDMMYYYY-viz.pdf (for example
        http://info.elcorreo.com/pdf/07082013-viz.pdf). If it cannot be
        opened, the site logo is used instead.
        """
        cover = 'http://info.elcorreo.com/pdf/' + time.strftime('%d%m%Y') + '-viz.pdf'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except Exception:
            # Best effort: any failure to fetch the PDF falls back to the logo.
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    # Text styling
    extra_css = '''
h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
.place {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;}
.name {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
.overhead, .compLadillo, description {font-family:georgia,serif; font-weight:bold;font-size:18px;}
.compSumario, .detalle-titular {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:22px;color:#4D4D4D;}
img{margin-bottom: 0.4em}
'''

    preprocess_regexps = [
        # Show the still image of embedded videos
        (re.compile(r'stillURLVideo: \'', re.DOTALL | re.IGNORECASE), lambda match: '</script><img src="'),
        # The dot is escaped so only a literal ".jpg'," is rewritten
        (re.compile(r'\.jpg\',', re.DOTALL | re.IGNORECASE), lambda match: '.jpg"><SCRIPT TYPE="text/JavaScript"'),
        # Remove the list bullet
        (re.compile(r'<li class="destacada">', re.DOTALL | re.IGNORECASE), lambda match: '<div class="destacada"></div>')
    ]
Hola.com
Spoiler:
Code:
#!/usr/bin/env python
# Module-level metadata for the Hola.com recipe: license, copyright notice,
# author, short description, recipe version and date of this revision.
__license__ = 'GPL v3'
__copyright__ = '30 June 2012, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Diario de actualidad, moda y belleza'
__version__ = 'v0.03'
__date__ = '28, Jul 2016'
'''
http://www.hola.com/
'''
import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
class hola_es(BasicNewsRecipe):
    """Calibre recipe for hola.com (celebrity, fashion and beauty news).

    Articles are taken from the per-section RSS feeds, using each entry's
    guid as the article URL.
    """

    author = 'desUBIKado'
    description = 'Diario de actualidad, moda y belleza'
    title = u'¡Hola!'
    publisher = 'Hola S.L.'
    category = 'Spanish celebrities, Entertainment News, Royalty, Daily Variety, Hollywood'
    language = 'es'
    masthead_url = 'http://imagenes.hola.com/comunes/2008/logo-holacom.gif'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    delay = 1
    encoding = 'utf-8'
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        (u'Famosos', u'http://www.hola.com/famosos/rss.xml'),
        (u'Realeza', u'http://www.hola.com/realeza/rss.xml'),
        (u'Cine', u'http://www.hola.com/cine/rss.xml'),
        (u'M\xfasica', u'http://www.hola.com/musica/rss.xml'),
        (u'Moda y modelos', u'http://www.hola.com/moda/portada/rss.xml'),
        (u'Belleza y salud', u'http://www.hola.com/belleza/portada/rss.xml'),
        (u'Ni\xf1os', u'http://www.hola.com/ninos/rss.xml'),
    ]

    keep_only_tags = [dict(name='article', attrs={'class': ['body col-md-8 col-xs-12']})]

    remove_tags = [
        dict(name='div', attrs={'class': ['comments', 'news-share', 'sponsored-news']}),
        dict(name='div', attrs={'itemprop': ['logo']}),
        dict(name='span', attrs={'class': ['hidden']}),
        dict(name='p', attrs={'class': ['hidden']}),
        dict(name='section', attrs={'class': ['news-tags']}),
    ]

    remove_tags_after = dict(name='div', attrs={'class': 'comments'})

    # Markup being commented out by the first two rules below:
    # <span>VER GALERÍA<i data-icon="1" class="icon"></i></span>
    preprocess_regexps = [
        # Remove the "VER GALERIA" label by wrapping it in an HTML comment
        (re.compile(r'<span>VER GALER', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'class="icon"></i></span>', re.DOTALL | re.IGNORECASE), lambda m: '-->'),
        # Remove assorted promotional links the same way
        (re.compile(r'<p><a href="http://www.hola.com', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'<p style="text-align: center;">', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'<p style="line-height: 20.8px;"><a href="http://www.hola.com', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'</strong></a></p>', re.DOTALL | re.IGNORECASE), lambda m: '-->'),
    ]

    def get_cover_url(self):
        """Return the URL of the current print-magazine cover, or None.

        The digital-edition page is scanned for an image whose src ends in
        'portada-revista-hola-520.jpg' (the 520 image has the higher
        resolution), for example
        http://www.hola.com/imagenes/revista/3727/portada-revista-hola-520.jpg
        """
        index = 'http://www.hola.com/abono/ediciondigital/'
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            src = image['src']
            if not src.endswith('portada-revista-hola-520.jpg'):
                continue
            # Only site-relative paths need the host prepended; an absolute
            # src is already a complete URL.
            if src.startswith(('http://', 'https://')):
                return src
            return 'http://www.hola.com' + src
        return None

    def get_article_url(self, article):
        """Use the feed entry's guid as the article URL (None if absent)."""
        return article.get('guid', None)

    extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
'''
Weblogssl
Spoiler:
Code:
#!/usr/bin/env python
# Module-level metadata for the Weblogs SL recipe: license, copyright notice,
# author, recipe version and date of this revision.
__license__ = 'GPL v3'
__copyright__ = '4 February 2011, desUBIKado'
__author__ = 'desUBIKado'
__version__ = 'v0.12'
__date__ = '28, Jul 2016'
'''
http://www.weblogssl.com/
'''
import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
class weblogssl(BasicNewsRecipe):
    """Calibre recipe for the Weblogs SL blog network (Xataka, Genbeta, ...).

    Articles are taken from each blog's feed (entry guid as URL) and
    downloaded from the mobile ("m.") version of the site. Embedded YouTube
    players are replaced by a still image of the video.
    """

    __author__ = 'desUBIKado'
    description = u'Weblogs colectivos dedicados a seguir la actualidad sobre tecnologia, entretenimiento, estilos de vida, motor, deportes y economia.'
    title = u'Weblogs SL (Xataka, Genbeta, VidaExtra, Blog de Cine y otros)'
    publisher = 'Weblogs SL'
    category = 'Gadgets, Tech news, Product reviews, mobiles, science, cinema, entertainment, culture, tv, food, recipes, life style, motor, F1, sports, economy'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    # To skip a blog, comment out its line by putting a # in front of it, e.g.
    #   # (u'Applesfera', u'http://feeds.weblogssl.com/applesfera'),
    # would stop Applesfera from being downloaded.
    feeds = [
        (u'Xataka', u'http://feeds.weblogssl.com/xataka2'),
        (u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil'),
        (u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto'),
        (u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid'),
        (u'Xataka Smart Home', u'http://feeds.weblogssl.com/Xatakahome'),
        (u'Xataka Windows', u'http://feeds.weblogssl.com/xatakawindows'),
        (u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia'),
        (u'Applesfera', u'http://feeds.weblogssl.com/applesfera'),
        (u'Vida Extra', u'http://feeds.weblogssl.com/vidaextra'),
        (u'Genbeta', u'http://feeds.weblogssl.com/genbeta'),
        (u'Genbeta Dev', u'http://feeds.weblogssl.com/genbetadev'),
        (u'Magnet', u'http://feeds.weblogssl.com/xatakamagnet2'),
        (u'Tendencias', u'http://feeds.weblogssl.com/trendencias'),
        (u'Tendencias Belleza', u'http://feeds.weblogssl.com/trendenciasbelleza'),
        (u'Tendencias Hombre', u'http://feeds.weblogssl.com/trendenciashombre'),
        (u'Tendencias Lifestyle', u'http://feeds.weblogssl.com/trendenciaslifestyle'),
        (u'Directo al paladar', u'http://feeds.weblogssl.com/directoalpaladar'),
        (u'Beb\xe9s y m\xe1s', u'http://feeds.weblogssl.com/bebesymas'),
        (u'Vit\xf3nica', u'http://feeds.weblogssl.com/vitonica'),
        (u'Decoesfera', u'http://feeds.weblogssl.com/decoesfera'),
        (u'Embelezzia', u'http://feeds.weblogssl.com/embelezzia'),
        (u'Pop rosa', u'http://feeds.weblogssl.com/poprosa'),
        (u'Motorpasi\xf3n', u'http://feeds.weblogssl.com/motorpasion'),
        (u'Motorpasi\xf3n Moto', u'http://feeds.weblogssl.com/motorpasionmoto'),
        (u'Motorpasi\xf3n Futuro', u'http://feeds.weblogssl.com/motorpasionfuturo'),
        (u'Blog de Cine', u'http://feeds.weblogssl.com/blogdecine'),
        (u'Vaya tele', u'http://feeds.weblogssl.com/vayatele2'),
        (u'Diario del viajero', u'http://feeds.weblogssl.com/diariodelviajero'),
        (u'Papel en blanco', u'http://feeds.weblogssl.com/papelenblanco'),
        (u'El blog salm\xf3n', u'http://feeds.weblogssl.com/elblogsalmon2'),
        (u'Pymes y aut\xf3nomos', u'http://feeds.weblogssl.com/pymesyautonomos'),
        (u'Ahorro diario', u'http://feeds.weblogssl.com/ahorrodiario'),
        (u'Xataka México', u'http://feeds.weblogssl.com/xatakamx'),
        (u'Xataka Android México', u'http://feeds.weblogssl.com/xatakandroidmx'),
        (u'Vida Extra México', u'http://feeds.weblogssl.com/vidaextramx'),
        (u'Xataka Colombia', u'http://feeds.weblogssl.com/xatakaco'),
        (u'Directo al paladar México', u'http://feeds.weblogssl.com/directoalpaladarmx'),
        (u'Vit\xf3nica México', u'http://feeds.weblogssl.com/vitonicamx'),
        (u'Tendencias Hombre México', u'http://feeds.weblogssl.com/trendenciashombremx'),
        (u'Motorpasi\xf3n México', u'http://feeds.weblogssl.com/motorpasionmx'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'content-container'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'article-social-share m-v1 js-article-share js-article-social-share'}),
        dict(name='div', attrs={'class': 'article-social-share m-v1 js-article-social-share'}),
        dict(name='div', attrs={'class': 'social-widgets'}),
        dict(name='div', attrs={'class': 'article-social-share m-in-normal'}),
        dict(name='div', attrs={'class': 'article-comments'}),
        dict(name='div', attrs={'class': 'article-links'}),
        dict(name='div', attrs={'class': 'article-topics-list'}),
        dict(name='div', attrs={'class': 'ad-box'}),
        dict(name='blockquote', attrs={'class': 'instagram-media'}),
        dict(name='img', attrs={'alt': 'Código QR'}),
        dict(name='div', attrs={'id': 'comments'})
    ]

    remove_tags_after = dict(name='div', attrs={'id': 'comments'})

    def print_version(self, url):
        """Return the mobile-site URL: 'www.' becomes 'm.', else 'm.' is prepended to the host."""
        if url.startswith('http://www'):
            return url.replace('http://www.', 'http://m.')
        return url.replace('http://', 'http://m.')

    preprocess_regexps = [
        # Put a blank line between one comment and the next
        (re.compile(r'<li id="c', re.DOTALL | re.IGNORECASE), lambda match: '<br><br><li id="c'),
        # Unwrap <noscript> so images show up in m.xataka.com articles
        (re.compile(r'<noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        (re.compile(r'</noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Move the "more" marker
        (re.compile(r'<div class="article-content">', re.DOTALL | re.IGNORECASE), lambda m: '<div class="article-content"><!--more-->'),
        (re.compile(r'<div class="\/n<!--more-->">', re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Show the first image of the article (turn srcset into a plain src)
        (re.compile(r' srcset="http://i.blogs.es/', re.DOTALL | re.IGNORECASE), lambda match: ' src="http://i.blogs.es/'),
        (re.compile(r' 450w, http://i.blogs.es', re.DOTALL | re.IGNORECASE), lambda match: '"><!--'),
        (re.compile(r'1366w"><span></span>', re.DOTALL | re.IGNORECASE), lambda match: '-->'),
        (re.compile(r'1366w" sf-src="http://i.blogs.es', re.DOTALL | re.IGNORECASE), lambda match: '--> sf-src="http://i.blogs.es')
    ]

    def _youtube_iframe_to_image(self, iframe):
        """Turn an embedded-player iframe tag into an <img> of the video still."""
        src = iframe.get('src')
        if not src:
            # Without a src there is no video id to build a thumbnail from.
            return
        # Matching from '//' covers http, https and protocol-relative URLs.
        src = src.replace('//www.youtube.com/embed/', '//img.youtube.com/vi/')
        # Player options (?rel=0, ?autoplay=1, ...) are not part of the
        # thumbnail path, so drop the whole query string.
        src = src.split('?', 1)[0]
        iframe.name = 'img'
        iframe['src'] = src + '/0.jpg'

    def preprocess_html(self, soup):
        """Replace embedded YouTube videos with a still image and return soup.

        Two kinds of iframe are converted: those titled
        'YouTube video player', and any other iframe carrying an
        'allowfullscreen' attribute whose src mentions YouTube.
        """
        for video_yt in soup.findAll('iframe', {'title': 'YouTube video player'}):
            self._youtube_iframe_to_image(video_yt)
        # Tags converted above are now <img>, so this search does not see them.
        for video_yt2 in soup.findAll('iframe', {'allowfullscreen': True}):
            if 'youtube' in (video_yt2.get('src') or ''):
                self._youtube_iframe_to_image(video_yt2)
        return soup

    def get_article_url(self, article):
        """Use the feed entry's guid as the article URL (None if absent)."""
        return article.get('guid', None)
JotDown.es
Spoiler:
Code:
#!/usr/bin/env python
# Module-level metadata for the Jot Down recipe: license, copyright notice,
# author, short description, recipe version and date of this revision.
__license__ = 'GPL v3'
__copyright__ = '23 June 2013, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Contemporary Culture Magazine'
__version__ = 'v0.03'
__date__ = '28, July 2016'
'''
http://www.jotdown.es/
'''
import time
import re
from calibre.web.feeds.news import BasicNewsRecipe
class jotdown(BasicNewsRecipe):
    """Calibre recipe for Jot Down (jotdown.es), a contemporary culture magazine.

    Purely declarative: a single front-page feed plus tag filters and regex
    clean-ups applied to each downloaded article.
    """

    # --- Publication details -------------------------------------------
    title = u'Jot Down - Contemporary Culture Magazine'
    author = 'desUBIKado'
    publisher = 'Wabi Sabi Investments, S.C.'
    description = 'Revista digital con magníficos y extensos artículos'
    category = 'Opinion, culture, science, movies, TV shows, music, blogs'
    language = 'es'
    masthead_url = 'http://www.jotdown.es/wp-content/uploads/2011/04/logoJotDown.png'
    timefmt = '[%a, %d %b, %Y]'

    # --- Download settings ---------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 20
    delay = 1
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [(u'Portada', u'http://www.jotdown.es/feed/')]

    # --- Page clean-up --------------------------------------------------
    # Keep the article body and its comments section.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['content']}),
        dict(name='div', attrs={'id': ['comments']}),
    ]

    remove_tags = [
        dict(name='a', attrs={'href': ['http://alternativaseconomicas.coop/']}),
        dict(
            name='div',
            attrs={'class': ['reply', 'after-meta', 'tags_list', 'wp_rp_wrap wp_rp_plain', 'after-meta', 'share_box']},
        ),
        dict(name='div', attrs={'align': ['center']}),
        dict(name='span', attrs={'class': ['fbreplace', 'says']}),
        dict(name='img', attrs={'class': ['avatar avatar-60 photo']}),
        dict(name='li', attrs={'class': ['post pingback']}),
        dict(name='div', attrs={'id': 'respond'}),
    ]

    remove_tags_after = dict(name='div', attrs={'id': 'respond'})

    preprocess_regexps = [
        # Enlarge text the site renders in a small font
        (re.compile(r'font-size: small', re.DOTALL | re.IGNORECASE),
         lambda m: 'font-size: medium'),

        # Swap an embedded Flash YouTube player for the video's still image
        (re.compile(r'<object type="application/x-shockwave-flash" data="http://www.youtube.com/v', re.DOTALL | re.IGNORECASE),
         lambda m: '<img src="http://img.youtube.com/vi'),
        (re.compile(r'&rel=0&fs=1"', re.DOTALL | re.IGNORECASE),
         lambda m: '/0.jpg"><object'),

        # Strip the category link by commenting out its markup
        (re.compile(r'<div class="meta">', re.DOTALL | re.IGNORECASE),
         lambda m: '<div class="meta"><!-- '),
        (re.compile(r'</a>, <a href="http://www.jotdown.es/category', re.DOTALL | re.IGNORECASE),
         lambda m: ', <!--'),
        (re.compile(r'"category tag">', re.DOTALL | re.IGNORECASE),
         lambda m: '--> '),
        (re.compile(r'</a> —', re.DOTALL | re.IGNORECASE),
         lambda m: ''),

        # Strip the link around the title (the anchor becomes a div)
        (re.compile(r'<h1 class="title"><a href="', re.DOTALL | re.IGNORECASE),
         lambda m: '<h1 class="title"><div class="'),
        (re.compile(r'</a></h1>', re.DOTALL | re.IGNORECASE),
         lambda m: '</div></h1>'),
    ]
|