View Single Post
Old 03-07-2011, 10:47 AM   #4
oneillpt began at the beginning.
Posts: 56
Karma: 46
Join Date: Feb 2011
Device: Kindle 3 (cracked screen!); PW1
Originally Posted by luiscc View Post
Hi, could anyone create a recipe for Babelia?

I tried to adapt the El País Semanal recipe, but I'm a newbie in Python and recipes; the result is far from good, although I do get all the articles.

Can anyone help?

Kind regards
Here is a recipe. I have also posted this recipe with explanatory comments you may find useful in reply to another post just after yours, "How to convert newspaper which do not have RSS feed?"

from calibre.ebooks.BeautifulSoup import Tag, NavigableString
from calibre.web.feeds.news import BasicNewsRecipe

class ElPaisBabelia(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping the page at ``INDEX``.

    The index page is split into sections (``div.contenedor_nuevo``); every
    site-relative link inside a section that carries a non-empty ``class``
    attribute is collected as an article of that section.
    """

    title = 'El Pais Babelia'
    __author__ = 'oneillpt'
    description = 'El Pais Babelia'
    # NOTE(review): INDEX is empty as found; it looks like the index page URL
    # was lost when this recipe was pasted. parse_index() fetches this value,
    # so it must be set to the real index URL before the recipe can work.
    INDEX = ''
    language = 'es'

    # Tag selectors (name + attribute match) used to trim each article page.
    remove_tags_before = dict(name='div', attrs={'class': 'estructura_2col'})
    # NOTE(review): calibre documents this setting as `keep_only_tags`;
    # `keep_tags` may be silently ignored -- confirm before renaming.
    keep_tags = [dict(name='div', attrs={'class': 'estructura_2col'})]
    remove_tags = [
        dict(name='div', attrs={'class': 'votos estirar'}),
        dict(name='div', attrs={'id': 'utilidades'}),
        dict(name='div', attrs={'class': 'info_relacionada'}),
        dict(name='div', attrs={'class': 'mod_apoyo'}),
        dict(name='div', attrs={'class': 'contorno_f'}),
        dict(name='div', attrs={'class': 'pestanias'}),
        dict(name='div', attrs={'class': 'otros_webs'}),
        dict(name='div', attrs={'id': 'pie'}),
    ]
    #no_stylesheets = True
    remove_javascript = True

    def parse_index(self):
        """Return the feeds found on the index page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts. Sections that
            yield no articles are omitted.
        """
        soup = self.index_to_soup(self.INDEX)
        feeds = []
        for section in soup.findAll('div', attrs={'class': 'contenedor_nuevo'}):
            section_title = self.tag_to_string(section.find('h1'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                # Only site-relative links are treated as articles.
                if not url.startswith('/'):
                    continue
                # NOTE(review): the prefix is empty as found; presumably the
                # site's scheme+host belongs here to make the URL absolute.
                url = '' + url
                # Read the anchor's own class attribute directly. Searching
                # the rendered markup for 'class=' also matches nested tags
                # and then fails when indexing the anchor itself.
                klass = post.get('class')
                if not klass:
                    continue
                title = self.tag_to_string(post)
                self.log('--> post:  ', post)
                self.log('--> url:   ', url)
                self.log('--> title: ', title)
                self.log('--> class: ', klass)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section_title, articles))
        return feeds
oneillpt is offline   Reply With Quote