#1
Member
Posts: 22
Karma: 12
Join Date: Feb 2009
Location: Zaragoza, Spain
Device: prs-505, iliad
Updates for 4 Spanish news sources
elcorreo.com
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '08 January 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.13'
__date__ = '28, July 2016'

'''
http://www.elcorreo.com/
'''

import time
import re
from calibre.web.feeds.recipes import BasicNewsRecipe


class elcorreo(BasicNewsRecipe):
    author = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 1
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'utf-8'
    remove_empty_feeds = True
    remove_javascript = True

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/bizkaia/rss/atom/portada'),
        (u'Mundo', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=internacional'),
        (u'Bizkaia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=bizkaia'),
        (u'Guipuzkoa', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=gipuzkoa'),
        (u'Araba', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=araba'),
        (u'La Rioja', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=larioja'),
        (u'Miranda', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=miranda'),
        (u'Economía', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=economia'),
        (u'Culturas', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=culturas'),
        (u'Politica', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=politica'),
        (u'Tecnología', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=tecnologia'),
        (u'Gente - Estilo', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=gente-estilo'),
        (u'Planes', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=planes'),
        (u'Athletic', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=athletic'),
        (u'Alavés', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=alaves'),
        (u'Bilbao Basket', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=bilbaobasket'),
        (u'Baskonia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=baskonia'),
        (u'Deportes', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=deportes'),
        (u'Jaiak', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=jaiak'),
        (u'La Blanca', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=la-blanca-vitoria'),
        (u'Aste Nagusia', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=aste-nagusia-bilbao'),
        (u'Semana Santa', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=semana-santa'),
        (u'Festivales', u'http://www.elcorreo.com/bizkaia/rss/atom?seccion=festivales')
    ]

    keep_only_tags = [
        dict(name='article', attrs={'class': ['story media-list ']})
    ]

    remove_tags = [
        dict(name='span', attrs={'class': ['no-comments']}),
        dict(name='div', attrs={'class': ['compApoyosText compNoticiasR']})
    ]

    remove_tags_before = dict(name='article', attrs={'class': 'story media-list '})
    remove_tags_after = dict(name='article', attrs={'class': 'story media-list '})

    # Use the mobile version of the site
    def print_version(self, url):
        return url.replace('http://www.', 'http://m.')

    _processed_links = []

    def get_article_url(self, article):
        link = article.get('link', None)
        if link is None:
            return article

        # Rewrite the URLs of the sports-team sub-sites so that they work, for example:
        # http://athletic.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html
        # becomes
        # http://m.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html?external=deportes/athletic
        parte = link.split('/')
        if parte[2] == 'athletic.elcorreo.com':
            link = 'http://m.elcorreo.com/' + parte[3] + '/' + parte[4] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/athletic'
        elif parte[2] == 'baskonia.elcorreo.com':
            link = 'http://m.elcorreo.com/' + parte[3] + '/' + parte[4] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/baskonia'
        elif parte[2] == 'bilbaobasket.elcorreo.com':
            link = 'http://m.elcorreo.com/' + parte[3] + '/' + parte[4] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/bilbaobasket'
        elif parte[2] == 'alaves.elcorreo.com':
            link = 'http://m.elcorreo.com/' + parte[3] + '/' + parte[4] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/alaves'

        # Sometimes the same article appears in both the Alava and the Bizkaia edition, for example:
        # http://www.elcorreo.com/alava/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html
        # http://www.elcorreo.com/bizkaia/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html
        # To control duplicates, normalise the URLs so they always point to bizkaia (except for the "araba" section)
        if ((parte[3] == 'alava') and (parte[4] != 'araba')):
            link = link.replace('elcorreo.com/alava', 'elcorreo.com/bizkaia')

        # Drop the article if it has already been picked up by another feed
        if not (link in self._processed_links):
            self._processed_links.append(link)
        else:
            link = None
        return link

    # Fetch the print-edition front page (the format=1 image has the higher resolution)
    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        # http://info.elcorreo.com/pdf/07082013-viz.pdf
        cover = 'http://info.elcorreo.com/pdf/' + day + month + year + '-viz.pdf'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(cover)
        except:
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    # Text styling
    extra_css = '''
        h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
        .place {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:12px;}
        .name {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
        .overhead, .compLadillo, description {font-family:georgia,serif; font-weight:bold;font-size:18px;}
        .compSumario, .detalle-titular {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:22px;color:#4D4D4D;}
        img{margin-bottom: 0.4em}
    '''

    preprocess_regexps = [
        # Show the still image of embedded videos
        (re.compile(r'stillURLVideo: \'', re.DOTALL | re.IGNORECASE), lambda match: '</script><img src="'),
        (re.compile(r'.jpg\',', re.DOTALL | re.IGNORECASE), lambda match: '.jpg"><SCRIPT TYPE="text/JavaScript"'),
        # Remove the bullet from the highlighted list item
        (re.compile(r'<li class="destacada">', re.DOTALL | re.IGNORECASE), lambda match: '<div class="destacada"></div>')
    ]

Hola.com
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '30 June 2012, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Diario de actualidad, moda y belleza'
__version__ = 'v0.03'
__date__ = '28, Jul 2016'

'''
http://www.hola.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


class hola_es(BasicNewsRecipe):
    author = 'desUBIKado'
    description = 'Diario de actualidad, moda y belleza'
    title = u'¡Hola!'
    publisher = 'Hola S.L.'
    category = 'Spanish celebrities, Entertainment News, Royalty, Daily Variety, Hollywood'
    language = 'es'
    masthead_url = 'http://imagenes.hola.com/comunes/2008/logo-holacom.gif'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    delay = 1
    encoding = 'utf-8'
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        (u'Famosos', u'http://www.hola.com/famosos/rss.xml')
        ,(u'Realeza', u'http://www.hola.com/realeza/rss.xml')
        ,(u'Cine', u'http://www.hola.com/cine/rss.xml')
        ,(u'M\xfasica', u'http://www.hola.com/musica/rss.xml')
        ,(u'Moda y modelos', u'http://www.hola.com/moda/portada/rss.xml')
        ,(u'Belleza y salud', u'http://www.hola.com/belleza/portada/rss.xml')
        ,(u'Ni\xf1os', u'http://www.hola.com/ninos/rss.xml')
    ]

    keep_only_tags = [dict(name='article', attrs={'class': ['body col-md-8 col-xs-12']})]

    remove_tags = [dict(name='div', attrs={'class': ['comments', 'news-share', 'sponsored-news']}),
                   dict(name='div', attrs={'itemprop': ['logo']}),
                   dict(name='span', attrs={'class': ['hidden']}),
                   dict(name='p', attrs={'class': ['hidden']}),
                   dict(name='section', attrs={'class': ['news-tags']})
                   ]

    remove_tags_after = dict(name='div', attrs={'class': 'comments'})

    # <span>VER GALERÍA<i data-icon="1" class="icon"></i></span>
    preprocess_regexps = [
        # Remove the "VER GALERÍA" (view gallery) label
        (re.compile(r'<span>VER GALER', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'class="icon"></i></span>', re.DOTALL | re.IGNORECASE), lambda m: '-->'),
        # Remove assorted links
        (re.compile(r'<p><a href="http://www.hola.com', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'<p style="text-align: center;">', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'<p style="line-height: 20.8px;"><a href="http://www.hola.com', re.DOTALL | re.IGNORECASE), lambda m: '<!--'),
        (re.compile(r'</strong></a></p>', re.DOTALL | re.IGNORECASE), lambda m: '-->')
    ]

    # Fetch the print-edition cover (the 520 image has the higher resolution)
    # http://www.hola.com/imagenes/revista/3727/portada-revista-hola-520.jpg
    def get_cover_url(self):
        index = 'http://www.hola.com/abono/ediciondigital/'
        soup = self.index_to_soup(index)
        for image in soup.findAll('img', src=True):
            if image['src'].endswith('portada-revista-hola-520.jpg'):
                return 'http://www.hola.com' + image['src']
        return None

    def get_article_url(self, article):
        url = article.get('guid', None)
        return url

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
    '''

Weblogssl
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '4 February 2011, desUBIKado'
__author__ = 'desUBIKado'
__version__ = 'v0.12'
__date__ = '28, Jul 2016'

'''
http://www.weblogssl.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


class weblogssl(BasicNewsRecipe):
    __author__ = 'desUBIKado'
    description = u'Weblogs colectivos dedicados a seguir la actualidad sobre tecnologia, entretenimiento, estilos de vida, motor, deportes y economia.'
    title = u'Weblogs SL (Xataka, Genbeta, VidaExtra, Blog de Cine y otros)'
    publisher = 'Weblogs SL'
    category = 'Gadgets, Tech news, Product reviews, mobiles, science, cinema, entertainment, culture, tv, food, recipes, life style, motor, F1, sports, economy'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 1
    max_articles_per_feed = 100
    encoding = 'utf-8'
    use_embedded_content = False
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    # If you do not want to fetch all of the blogs, you can skip any of them by putting
    # a # character in front of its line, i.e.
    #     ,(u'Applesfera', u'http://feeds.weblogssl.com/applesfera')
    # would stop Applesfera from being downloaded.
    feeds = [
        (u'Xataka', u'http://feeds.weblogssl.com/xataka2')
        ,(u'Xataka M\xf3vil', u'http://feeds.weblogssl.com/xatakamovil')
        ,(u'Xataka Foto', u'http://feeds.weblogssl.com/xatakafoto')
        ,(u'Xataka Android', u'http://feeds.weblogssl.com/xatakandroid')
        ,(u'Xataka Smart Home', u'http://feeds.weblogssl.com/Xatakahome')
        ,(u'Xataka Windows', u'http://feeds.weblogssl.com/xatakawindows')
        ,(u'Xataka Ciencia', u'http://feeds.weblogssl.com/xatakaciencia')
        ,(u'Applesfera', u'http://feeds.weblogssl.com/applesfera')
        ,(u'Vida Extra', u'http://feeds.weblogssl.com/vidaextra')
        ,(u'Genbeta', u'http://feeds.weblogssl.com/genbeta')
        ,(u'Genbeta Dev', u'http://feeds.weblogssl.com/genbetadev')
        ,(u'Magnet', u'http://feeds.weblogssl.com/xatakamagnet2')
        ,(u'Tendencias', u'http://feeds.weblogssl.com/trendencias')
        ,(u'Tendencias Belleza', u'http://feeds.weblogssl.com/trendenciasbelleza')
        ,(u'Tendencias Hombre', u'http://feeds.weblogssl.com/trendenciashombre')
        ,(u'Tendencias Lifestyle', u'http://feeds.weblogssl.com/trendenciaslifestyle')
        ,(u'Directo al paladar', u'http://feeds.weblogssl.com/directoalpaladar')
        ,(u'Beb\xe9s y m\xe1s', u'http://feeds.weblogssl.com/bebesymas')
        ,(u'Vit\xf3nica', u'http://feeds.weblogssl.com/vitonica')
        ,(u'Decoesfera', u'http://feeds.weblogssl.com/decoesfera')
        ,(u'Embelezzia', u'http://feeds.weblogssl.com/embelezzia')
        ,(u'Pop rosa', u'http://feeds.weblogssl.com/poprosa')
        ,(u'Motorpasi\xf3n', u'http://feeds.weblogssl.com/motorpasion')
        ,(u'Motorpasi\xf3n Moto', u'http://feeds.weblogssl.com/motorpasionmoto')
        ,(u'Motorpasi\xf3n Futuro', u'http://feeds.weblogssl.com/motorpasionfuturo')
        ,(u'Blog de Cine', u'http://feeds.weblogssl.com/blogdecine')
        ,(u'Vaya tele', u'http://feeds.weblogssl.com/vayatele2')
        ,(u'Diario del viajero', u'http://feeds.weblogssl.com/diariodelviajero')
        ,(u'Papel en blanco', u'http://feeds.weblogssl.com/papelenblanco')
        ,(u'El blog salm\xf3n', u'http://feeds.weblogssl.com/elblogsalmon2')
        ,(u'Pymes y aut\xf3nomos', u'http://feeds.weblogssl.com/pymesyautonomos')
        ,(u'Ahorro diario', u'http://feeds.weblogssl.com/ahorrodiario')
        ,(u'Xataka México', u'http://feeds.weblogssl.com/xatakamx')
        ,(u'Xataka Android México', u'http://feeds.weblogssl.com/xatakandroidmx')
        ,(u'Vida Extra México', u'http://feeds.weblogssl.com/vidaextramx')
        ,(u'Xataka Colombia', u'http://feeds.weblogssl.com/xatakaco')
        ,(u'Directo al paladar México', u'http://feeds.weblogssl.com/directoalpaladarmx')
        ,(u'Vit\xf3nica México', u'http://feeds.weblogssl.com/vitonicamx')
        ,(u'Tendencias Hombre México', u'http://feeds.weblogssl.com/trendenciashombremx')
        ,(u'Motorpasi\xf3n México', u'http://feeds.weblogssl.com/motorpasionmx')
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'content-container'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'article-social-share m-v1 js-article-share js-article-social-share'}),
        dict(name='div', attrs={'class': 'article-social-share m-v1 js-article-social-share'}),
        dict(name='div', attrs={'class': 'social-widgets'}),
        dict(name='div', attrs={'class': 'article-social-share m-in-normal'}),
        dict(name='div', attrs={'class': 'article-comments'}),
        dict(name='div', attrs={'class': 'article-links'}),
        dict(name='div', attrs={'class': 'article-topics-list'}),
        dict(name='div', attrs={'class': 'ad-box'}),
        dict(name='blockquote', attrs={'class': 'instagram-media'}),
        dict(name='img', attrs={'alt': 'Código QR'}),
        dict(name='div', attrs={'id': 'comments'})
    ]

    remove_tags_after = dict(name='div', attrs={'id': 'comments'})

    def print_version(self, url):
        if url.startswith('http://www'):
            return url.replace('http://www.', 'http://m.')
        else:
            return url.replace('http://', 'http://m.')

    preprocess_regexps = [
        # Put a blank line between one comment and the next
        (re.compile(r'<li id="c', re.DOTALL | re.IGNORECASE), lambda match: '<br><br><li id="c'),
        # Show the images in articles from m.xataka.com
        (re.compile(r'<noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        (re.compile(r'</noscript>', re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Move the "more" marker
        (re.compile(r'<div class="article-content">', re.DOTALL | re.IGNORECASE), lambda m: '<div class="article-content"><!--more-->'),
        (re.compile(r'<div class="\/n<!--more-->">', re.DOTALL | re.IGNORECASE), lambda m: ''),
        # Show the first image of the article
        (re.compile(r' srcset="http://i.blogs.es/', re.DOTALL | re.IGNORECASE), lambda match: ' src="http://i.blogs.es/'),
        (re.compile(r' 450w, http://i.blogs.es', re.DOTALL | re.IGNORECASE), lambda match: '"><!--'),
        (re.compile(r'1366w"><span></span>', re.DOTALL | re.IGNORECASE), lambda match: '-->'),
        (re.compile(r'1366w" sf-src="http://i.blogs.es', re.DOTALL | re.IGNORECASE), lambda match: '--> sf-src="http://i.blogs.es')
    ]

    # Replace embedded YouTube videos with a still image
    def preprocess_html(self, soup):
        for video_yt in soup.findAll('iframe', {'title': 'YouTube video player'}):
            if video_yt:
                video_yt.name = 'img'
                fuente = video_yt['src']
                fuente2pre = fuente.replace('http://www.youtube.com/embed/', 'http://img.youtube.com/vi/')
                fuente2 = fuente2pre.replace('https://www.youtube.com/embed/', 'https://img.youtube.com/vi/')
                fuente3 = fuente2.replace('?rel=0', '')
                video_yt['src'] = fuente3 + '/0.jpg'

        for video_yt2 in soup.findAll('iframe', {'allowfullscreen'}):
            if video_yt2:
                esyt = video_yt2.find('youtube')
                if esyt:
                    video_yt2.name = 'img'
                    fuente = video_yt2['src']
                    fuente2pre = fuente.replace('http://www.youtube.com/embed/', 'http://img.youtube.com/vi/')
                    fuente2 = fuente2pre.replace('https://www.youtube.com/embed/', 'https://img.youtube.com/vi/')
                    video_yt2['src'] = fuente2 + '/0.jpg'

        return soup

    def get_article_url(self, article):
        return article.get('guid', None)

JotDown.es
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '23 June 2013, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Contemporary Culture Magazine'
__version__ = 'v0.03'
__date__ = '28, July 2016'

'''
http://www.jotdown.es/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


class jotdown(BasicNewsRecipe):
    author = 'desUBIKado'
    description = 'Revista digital con magníficos y extensos artículos'
    title = u'Jot Down - Contemporary Culture Magazine'
    publisher = 'Wabi Sabi Investments, S.C.'
    category = 'Opinion, culture, science, movies, TV shows, music, blogs'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 7
    delay = 1
    max_articles_per_feed = 20
    masthead_url = 'http://www.jotdown.es/wp-content/uploads/2011/04/logoJotDown.png'
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        (u'Portada', u'http://www.jotdown.es/feed/')
    ]

    keep_only_tags = [dict(name='div', attrs={'id': ['content']}),
                      dict(name='div', attrs={'id': ['comments']})
                      ]

    remove_tags = [dict(name='a', attrs={'href': ['http://alternativaseconomicas.coop/']}),
                   dict(name='div', attrs={'class': ['reply', 'after-meta', 'tags_list', 'wp_rp_wrap wp_rp_plain', 'after-meta', 'share_box']}),
                   dict(name='div', attrs={'align': ['center']}),
                   dict(name='span', attrs={'class': ['fbreplace', 'says']}),
                   dict(name='img', attrs={'class': ['avatar avatar-60 photo']}),
                   dict(name='li', attrs={'class': ['post pingback']}),
                   dict(name='div', attrs={'id': 'respond'})
                   ]

    remove_tags_after = dict(name='div', attrs={'id': 'respond'})

    preprocess_regexps = [
        # To change the small size of the text
        (re.compile(r'font-size: small', re.DOTALL | re.IGNORECASE), lambda match: 'font-size: medium'),
        # To present the image of the embedded video
        (re.compile(r'<object type="application/x-shockwave-flash" data="http://www.youtube.com/v', re.DOTALL | re.IGNORECASE), lambda match: '<img src="http://img.youtube.com/vi'),
        (re.compile(r'&rel=0&fs=1"', re.DOTALL | re.IGNORECASE), lambda match: '/0.jpg"><object'),
        # To remove the link of the category
        (re.compile(r'<div class="meta">', re.DOTALL | re.IGNORECASE), lambda match: '<div class="meta"><!-- '),
        (re.compile(r'</a>, <a href="http://www.jotdown.es/category', re.DOTALL | re.IGNORECASE), lambda match: ', <!--'),
        (re.compile(r'"category tag">', re.DOTALL | re.IGNORECASE), lambda match: '--> '),
        (re.compile(r'</a> —', re.DOTALL | re.IGNORECASE), lambda match: ''),
        # To remove the link of the title
        (re.compile(r'<h1 class="title"><a href="', re.DOTALL | re.IGNORECASE), lambda match: '<h1 class="title"><div class="'),
        (re.compile(r'</a></h1>', re.DOTALL | re.IGNORECASE), lambda match: '</div></h1>')
    ]
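To try any of these recipes locally before scheduling them, calibre's ebook-convert tool can build a test ebook straight from a recipe file. A minimal sketch, assuming the El Correo recipe above has been saved as elcorreo.recipe (the file name is arbitrary):

Code:
ebook-convert elcorreo.recipe elcorreo.epub --test -vv

The --test switch fetches only a couple of articles from a couple of feeds, which keeps the turnaround short while you check the parsing. The same recipes can also be loaded through calibre's Fetch news > Add a custom news source dialog.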