View Single Post
Old 06-02-2010, 04:48 AM   #2025
gambarini
Connoisseur
gambarini began at the beginning.
 
Posts: 98
Karma: 22
Join Date: Mar 2010
Device: IRiver Story, Ipod Touch, Android SmartPhone
Quote:
Originally Posted by Starson17 View Post
I suspect there might be some questions here that I can help with.... but perhaps not

More info about whether there's a question and what it is might help me decide.
This is my recipe:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class LaStampaParseIndex(BasicNewsRecipe):
    """Debug recipe that builds its article index by hand via ``parse_index``.

    Each section URL listed in ``parse_index`` is fetched and handed to
    ``nz_parse_section``, which extracts a single article (title, link,
    description, date) from the page.
    """

    title = u'Debug Parse Index'
    cover_url = 'http://www.lastampa.it/edicola/PDF/1.pdf'
    remove_javascript = True
    no_stylesheets = True

    # Only these elements of a downloaded article page are kept.
    keep_only_tags = [
        dict(attrs={'class': ['boxocchiello2', 'titoloRub', 'titologir',
                              'catenaccio', 'sezione', 'articologirata']}),
        dict(name='div', attrs={'id': 'corpoarticolo'}),
    ]

    # These elements are stripped from a downloaded article page.
    remove_tags = [
        dict(name='div', attrs={'id': 'menutop'}),
        dict(name='div', attrs={'id': 'fwnetblocco'}),
        dict(name='table', attrs={'id': 'strumenti'}),
        dict(name='table', attrs={'id': 'imgesterna'}),
        dict(name='a', attrs={'class': 'linkblu'}),
        dict(name='a', attrs={'class': 'link'}),
        dict(name='span', attrs={'class': ['boxocchiello', 'boxocchiello2',
                                           'sezione']}),
    ]

    def nz_parse_section(self, url):
        """Extract the first article found on the page at *url*.

        Returns a list holding one dict with the keys ``title``, ``url``,
        ``description`` and ``date``. Returns an empty list when the page
        has no element of class ``entry`` or that element contains no link,
        instead of failing with ``AttributeError`` on a ``None`` lookup.
        """
        soup = self.index_to_soup(url)

        head = soup.find(attrs={'class': 'entry'})
        if head is None:
            self.log('no entry element found in ', url)
            return []

        link = head.find('a', href=True)
        if link is None:
            self.log('no article link found in ', url)
            return []

        # NOTE(review): 'feedEntryConteny' looks like a typo for
        # 'feedEntryContent' -- confirm against the real page markup.
        descr = soup.find(attrs={'class': 'feedEntryConteny'})
        dt = soup.find(attrs={'class': 'lastUpdated'})

        title = self.tag_to_string(link)
        # Separate name so the section URL passed in is not overwritten.
        article_url = link.get('href', False)
        # Description and date are optional; fall back to an empty string.
        description = self.tag_to_string(descr) if descr is not None else ''
        date = self.tag_to_string(dt) if dt is not None else ''

        self.log('title ', title)
        self.log('url ', article_url)
        self.log('description ', description)
        self.log('date ', date)

        return [{
            'title': title,
            'url': article_url,
            'description': description,
            'date': date,
        }]

    def parse_index(self):
        """Return a list of ``(section title, articles)`` tuples.

        Sections for which ``nz_parse_section`` yields no articles are
        left out of the result.
        """
        sections = [
            (u'Politica', u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'),
            (u'Torino', u'http://rss.feedsportal.com/c/32418/f/466938/index.rss'),
        ]

        feeds = []
        for title, url in sections:
            articles = self.nz_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds
gambarini is offline