View Single Post
Old 06-04-2010, 09:19 AM   #2044
gambarini
Connoisseur
gambarini began at the beginning.
 
Posts: 98
Karma: 22
Join Date: Mar 2010
Device: IRiver Story, Ipod Touch, Android SmartPhone
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class LaStampaParseIndex(BasicNewsRecipe):

 title                 = u'Debug Parse Index'
 cover_url             = 'http://www.lastampa.it/edicola/PDF/1.pdf'
 remove_javascript     = True
 no_stylesheets        = True


        
 def nz_parse_section(self, url):

            def get_article_url(self, article):
              link = article.get('links')
              print link
              if link:
               return link[0]['href']
            soup  = self.index_to_soup(url)
            head  = soup.findAll('div',attrs= {'class': 'entry'})
            descr = soup.findAll('div',attrs= {'class': 'feedEntryConteny'})
            dt    = soup.findAll('div',attrs= {'class': 'lastUpdated'})
            print head
            print descr
            print dt
            current_articles = []
#            a = head.find('a', href = True)
#            title       = self.tag_to_string(a)
#            url         = a.get('href', False)
#            description = self.tag_to_string(descr)
#            date        = self.tag_to_string(dt)
#            self.log('title ', title)
#            self.log('url ', url)
#            self.log('description ', description)
#            self.log('date ', date)
#            current_articles.append({'title': title, 'url': url, 'description':description, 'date':date})
            current_articles.append({'title': '', 'url':'', 'description':'', 'date':''})


            return current_articles
 keep_only_tags = [dict(attrs={'class':['boxocchiello2','titoloRub','titologir','catenaccio','sezione','articologirata']}),
                   dict(name='div', attrs={'id':'corpoarticolo'})
                  ]

 remove_tags = [dict(name='div', attrs={'id':'menutop'}),
                dict(name='div', attrs={'id':'fwnetblocco'}),
                dict(name='table', attrs={'id':'strumenti'}),
                dict(name='table', attrs={'id':'imgesterna'}),
                dict(name='a', attrs={'class':'linkblu'}),
                dict(name='a', attrs={'class':'link'}),
                dict(name='span', attrs={'class':['boxocchiello','boxocchiello2','sezione']})
               ]
 def parse_index(self):
            feeds = []
            for title, url in [(u'Politica', u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'),
                               (u'Torino', u'http://rss.feedsportal.com/c/32418/f/466938/index.rss')
                              ]:
               print url
               articles = self.nz_parse_section(url)

               if articles:
                   feeds.append((title, articles))
            return feeds
I don't know why but the soup.findall don't find anything.
Probably it's the same problem that calibre find when parse itself the feed and don't put the correct values into title.

I don't understand why...
I am don't understand to use the normal method to parse the feeds (using get_article('links')) and override only the title.
gambarini is offline