MobileRead Forums - View Single Post - One new recipe and other one updated (In Spanish)

Thread: One new recipe and other one updated (In Spanish)

View Single Post

01-09-2011, 01:36 PM	#1
# desUBIKado Member Posts: 22 Karma: 12 Join Date: Feb 2009
# Location: Zaragoza, Spain Device: prs-505, iliad
#
# One new recipe and other one updated (In Spanish)
#
# Hi there, I bring a newspaper from northern Spain, elcorreo.com
#
# NOTE(review): this post contains TWO independent calibre recipes. Both
# declare a class called ``heraldo``; save each one as its own .recipe file,
# otherwise the second definition shadows the first within a single module.
#
# Spoiler:

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '08 Januery 2011, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Biscay'
__version__ = 'v0.08'
__date__ = '08, Januery 2011'
'''
http://www.elcorreo.com/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


class heraldo(BasicNewsRecipe):
    """Calibre recipe that downloads the El Correo RSS feeds listed in ``feeds``."""

    author = 'desUBIKado'
    description = 'Daily newspaper from Biscay'
    title = u'El Correo'
    publisher = 'Vocento'
    category = 'News, politics, culture, economy, general interest'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    encoding = 'iso-8859-1'
    remove_empty_feeds = True
    # JavaScript is kept on purpose: preprocess_regexps below rewrites the
    # inline script variables into an <img> tag for embedded videos.
    remove_javascript = False

    feeds = [
        (u'Portada', u'http://www.elcorreo.com/vizcaya/portada.xml'),
        (u'Local', u'http://www.elcorreo.com/vizcaya/rss/feeds/vizcaya.xml'),
        # Fixed: the scheme was misspelled as 'hhttp://'.
        (u'Internacional', u'http://www.elcorreo.com/vizcaya/rss/feeds/internacional.xml'),
        (u'Econom\xeda', u'http://www.elcorreo.com/vizcaya/rss/feeds/economia.xml'),
        (u'Pol\xedtica', u'http://www.elcorreo.com/vizcaya/rss/feeds/politica.xml'),
        (u'Opini\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/opinion.xml'),
        (u'Deportes', u'http://www.elcorreo.com/vizcaya/rss/feeds/deportes.xml'),
        (u'Sociedad', u'http://www.elcorreo.com/vizcaya/rss/feeds/sociedad.xml'),
        (u'Cultura', u'http://www.elcorreo.com/vizcaya/rss/feeds/cultura.xml'),
        (u'Televisi\xf3n', u'http://www.elcorreo.com/vizcaya/rss/feeds/television.xml'),
        (u'Gente', u'http://www.elcorreo.com/vizcaya/rss/feeds/gente.xml'),
    ]

    # NOTE(review): the forum software had inserted a space inside
    # 'contenido_comentarios', 'separa' and 'articulopiniones'; they are
    # restored here as single class names -- confirm against the site markup.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['grouphead', 'date', 'art_head', 'story-texto', 'text',
                                          'colC_articulo', 'contenido_comentarios']}),
        dict(name='div', attrs={'id': ['articulo', 'story-texto', 'story-entradilla']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['art_barra', 'detalles-opinion', 'formdenunciar',
                                          'modulo calculadoras', 'nubetags', 'pie']}),
        dict(name='div', attrs={'class': ['mod_lomas', 'bloque_lomas', 'blm_header', 'link-app3',
                                          'link-app4', 'botones_listado']}),
        dict(name='div', attrs={'class': ['navegacion_galeria', 'modulocanalpromocion', 'separa',
                                          'separacion', 'compartir', 'tags_relacionados']}),
        dict(name='div', attrs={'class': ['moduloBuscadorDeportes', 'modulo-gente', 'moddestacadopeq',
                                          'OpcArt', 'articulopiniones']}),
        dict(name='div', attrs={'class': ['modulo-especial', 'publiEspecial']}),
        dict(name='div', attrs={'id': ['articulopina']}),
        dict(name='br', attrs={'class': 'clear'}),
        dict(name='form', attrs={'name': 'frm_conversor2'}),
    ]

    remove_tags_before = dict(name='div', attrs={'class': 'articulo '})
    remove_tags_after = dict(name='div', attrs={'class': 'comentarios'})

    def get_cover_url(self):
        """Return the URL of today's front-page PDF, or of the logo if it cannot be opened."""
        # Example URLs:
        # http://img.kiosko.net/2011/01/02/es/elcorreo.750.jpg
        # http://info.elcorreo.com/pdf/06012011-viz.pdf
        # The PDF name is day + month + year, each zero-padded (ddmmYYYY).
        cover = 'http://info.elcorreo.com/pdf/' + time.strftime('%d%m%Y', time.localtime()) + '-viz.pdf'
        # Called on the instance so it works whether get_browser is declared
        # as a classmethod or as a regular method in the installed calibre.
        br = self.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best effort: any failure to open the PDF falls back to the logo.
            self.log("\nPortada no disponible")
            cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
        return cover

    extra_css = '''
        h1, .headline {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        h2, .subhead {font-family:Arial,Helvetica,sans-serif; font-style:italic; font-weight:normal;font-size:18px;}
        h3, .overhead {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h4 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        h5 {font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
        h6 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:16px;}
        .date,.byline, .photo {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:14px;}
        img{margin-bottom: 0.4em}
    '''

    # Flags are combined with a plain '|'; the pasted text had an escaped
    # '\|', which is not valid Python.
    preprocess_regexps = [
        # To present the image of the embedded video
        (re.compile(r'var RUTA_IMAGEN', re.DOTALL | re.IGNORECASE), lambda match: '</script><img src'),
        # The dot is escaped so only a literal '.jpg";' is rewritten.
        (re.compile(r'\.jpg";', re.DOTALL | re.IGNORECASE), lambda match: '.jpg">'),
        (re.compile(r'var SITIO = "elcorreo";', re.DOTALL | re.IGNORECASE),
         lambda match: '<SCRIPT TYPE="text/JavaScript"'),

        # To separate paragraphs with a blank line
        (re.compile(r'<div class="p"', re.DOTALL | re.IGNORECASE), lambda match: '<p></p><div class="p"'),

        # To put a blank line between the subtitle and the date and time of the news
        (re.compile(r'<div class="date">', re.DOTALL | re.IGNORECASE), lambda match: '<br><div class="date">'),

        # To put a blank line between the intro of the embedded videos and the previous text
        (re.compile(r'<div class="video"', re.DOTALL | re.IGNORECASE), lambda match: '<br><div class="video"'),

        # To view photos from the first when these are presented as a gallery
        (re.compile(r'src="/img/shim.gif"', re.DOTALL | re.IGNORECASE), lambda match: ''),
        (re.compile(r'rel=', re.DOTALL | re.IGNORECASE), lambda match: 'src='),

        # To remove the link of the title
        (re.compile(r'<h1 class="headline">\n<a href="', re.DOTALL | re.IGNORECASE), lambda match: '<h1 class="'),
        (re.compile(r'</a>\n</h1>', re.DOTALL | re.IGNORECASE), lambda match: '</h1>'),
    ]


# And an update to heraldo.es
#
# Spoiler:

#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '04 December 2010, desUBIKado'
__author__ = 'desUBIKado'
__description__ = 'Daily newspaper from Aragon'
__version__ = 'v0.04'
__date__ = '6, Januery 2011'
'''
http://www.heraldo.es/
'''

import time
import re
from calibre.web.feeds.news import BasicNewsRecipe


class heraldo(BasicNewsRecipe):
    """Calibre recipe that downloads the Heraldo de Aragon front-page RSS feed."""

    author = 'desUBIKado'
    description = 'Daily newspaper from Aragon'
    title = u'Heraldo de Aragon'
    publisher = 'OJD Nielsen'
    category = 'News, politics, culture, economy, general interest'
    language = 'es'
    timefmt = '[%a, %d %b, %Y]'
    oldest_article = 2
    delay = 1
    max_articles_per_feed = 100
    use_embedded_content = False
    remove_javascript = True
    no_stylesheets = True

    feeds = [
        (u'Portadas', u'http://www.heraldo.es/index.php/mod.portadas/mem.rss'),
    ]

    keep_only_tags = [dict(name='div', attrs={'id': ['dts', 'com']})]

    remove_tags = [
        dict(name='a', attrs={'class': ['com flo-r', 'enl-if', 'enl-df']}),
        dict(name='div', attrs={'class': ['brb-b-s con marg-btt', 'cnt-rel con']}),
        dict(name='form', attrs={'class': 'form'}),
        dict(name='ul', attrs={'id': ['cont-tags', 'pag-1']}),
    ]

    remove_tags_before = dict(name='div', attrs={'id': 'dts'})
    remove_tags_after = dict(name='div', attrs={'id': 'com'})

    def get_cover_url(self):
        """Return the URL of today's front-page PDF, or of the logo if it cannot be opened."""
        # Example URL (truncated in the original post):
        # http://oldorigin-www.heraldo.es/2010...ada_aragon.pdf
        # The directory name is year + month + day, zero-padded (YYYYmmdd).
        cover = ('http://oldorigin-www.heraldo.es/' + time.strftime('%Y%m%d', time.localtime())
                 + '/primeras/portada_aragon.pdf')
        # Called on the instance so it works whether get_browser is declared
        # as a classmethod or as a regular method in the installed calibre.
        br = self.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best effort: any failure to open the PDF falls back to the logo.
            self.log("\nPortada no disponible")
            cover = 'http://www.heraldo.es/MODULOS/global/publico/interfaces/img/logo-Heraldo.png'
        return cover

    extra_css = '''
        .con strong{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:16px;}
        .con h2{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:30px;}
        .con span{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:12px;}
        .ent {font-family:Arial,Helvetica,sans-serif; font-weight:normal; font-style:italic; font-size:18px;}
        img{margin-bottom: 0.4em}
    '''

    # Flags are combined with a plain '|'; the pasted text had an escaped
    # '\|', which is not valid Python.
    preprocess_regexps = [
        # To separate the comments with a blank line
        (re.compile(r'<div id="com"', re.DOTALL | re.IGNORECASE), lambda match: '<br><div id="com"'),
    ]


# Greetings