View Single Post
Old 02-27-2011, 03:49 PM   #1
partymonkey
Junior Member
partymonkey began at the beginning.
 
Posts: 9
Karma: 10
Join Date: Feb 2011
Device: Kindle
Recipe for La Nacion (argentina) has issues

In the last few days, I've noticed issues with the La Nacion feed:
1) Some articles are downloading as gobbly gook, whereas others don't. Also, it doesn't seem to be consistent as to which ones get scrambled.
2) Articles contain additional content, such as related articles, comments, recommended articles, etc.
3) The logo/header directory location changed.

I tried to figure out both of these issues, so I've made some changes to the recipe, but I'm still getting cases of #1. I've been able to fix several of #2, and fixed #3.

Can anyone help me with #1? This is new to me so I'm not sure where to go next.

Here's my edited recipe, based on the built-in one:
__license__ = 'GPL v3'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
lanacion.com.ar
'''

from calibre.web.feeds.news import BasicNewsRecipe

class Lanacion(BasicNewsRecipe):
title = u'La Nacion'
__author__ = u'Darko Miletic'
description = "lanacion.com - Informacion actualizada las 24 horas, con noticias de Argentina y del mundo"
publisher = u'La Nacion S.A.'
category = 'news, politics, Argentina'
oldest_article = 1
max_articles_per_feed = 100
use_embedded_content = False
no_stylesheets = True
language = 'es_AR'
publication_type = 'newspaper'
remove_empty_feeds = True
cover_url = 'http://www.lanacion.com.ar/_ui/desktop/imgs/layout/logos/ln341x47.gif'
masthead_url = 'http://www.lanacion.com.ar/_ui/desktop/imgs/layout/logos/ln341x47.gif'
extra_css = """ h1{font-family: Georgia,serif}
h2{color: #626262}
body{font-family: Arial,sans-serif}
img{margin-top: 0.5em; margin-bottom: 0.2em; display: block}
.notaFecha{color: #808080}
.notaEpigrafe{font-size: x-small}
.topNota h1{font-family: Arial,sans-serif}
"""


conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
}

keep_only_tags = [dict(name='div', attrs={'class':['nota floatFix','nota floatfix','topNota','nota','post']})]
remove_tags = [
dict(name='div' , attrs={'class':'notaComentario floatFix noprint' })
,dict(name='ul' , attrs={'class':['cajaHerramientas cajaTop noprint','herramientas noprint']})
,dict(name='li' , attrs={'class':'floatFix'})
,dict(name='div' , attrs={'class':['cajaHerramientas noprint','cajaHerramientas floatFix'] })
,dict(attrs={'class':['titulosMultimedia','derecha','techo color','encuesta','izquierda compartir','floatFix','videoCentro','leyo','relaci onadas', 'relac noprint']})
,dict(name=['iframe','embed','object','form','base','hr','meta ','link','input'])
]
remove_tags_after = dict(attrs={'class':['tags','nota-destacado','leyo','relacionadas','floatFix ultimasNoticias']})
remove_attributes = ['height','width','visible','onclick','data-count','name']

feeds = [
(u'Ultimas noticias' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?origen=2' )
,(u'Politica' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=30' )
,(u'Economia' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=272' )
,(u'Deportes' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=131' )
,(u'Informacion General' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=21' )
,(u'Cultura' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=1' )
,(u'Opinion' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=28' )
,(u'Espectaculos' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=120' )
,(u'Exterior' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=7' )
,(u'Ciencia&Salud' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=498' )
,(u'Revista' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=494' )
,(u'Enfoques' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=421' )
,(u'Comercio Exterior' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=347' )
,(u'Tecnologia' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=432' )
,(u'Arquitectura' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=366' )
,(u'Turismo' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=504' )
,(u'Al volante' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=371' )
,(u'El Campo' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=337' )
,(u'Moda y Belleza' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=1312' )
,(u'Inmuebles Comerciales', u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=1363' )
,(u'Countries' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=1348' )
,(u'adnCultura' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=6734' )
,(u'The Wall Street Journal Americas', u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=6373' )
,(u'Management' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=7380' )
,(u'Bicentenario' , u'http://www.lanacion.com.ar/herramientas/rss/index.asp?categoria_id=7276' )
]

def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)
partymonkey is offline   Reply With Quote