Code:
from calibre.web.feeds.news import BasicNewsRecipe
class LaStampaParseIndex(BasicNewsRecipe):
title = u'Debug Parse Index'
cover_url = 'http://www.lastampa.it/edicola/PDF/1.pdf'
remove_javascript = True
no_stylesheets = True
def nz_parse_section(self, url):
def get_article_url(self, article):
link = article.get('links')
print link
if link:
return link[0]['href']
soup = self.index_to_soup(url)
head = soup.findAll('div',attrs= {'class': 'entry'})
descr = soup.findAll('div',attrs= {'class': 'feedEntryConteny'})
dt = soup.findAll('div',attrs= {'class': 'lastUpdated'})
print head
print descr
print dt
current_articles = []
# a = head.find('a', href = True)
# title = self.tag_to_string(a)
# url = a.get('href', False)
# description = self.tag_to_string(descr)
# date = self.tag_to_string(dt)
# self.log('title ', title)
# self.log('url ', url)
# self.log('description ', description)
# self.log('date ', date)
# current_articles.append({'title': title, 'url': url, 'description':description, 'date':date})
current_articles.append({'title': '', 'url':'', 'description':'', 'date':''})
return current_articles
keep_only_tags = [dict(attrs={'class':['boxocchiello2','titoloRub','titologir','catenaccio','sezione','articologirata']}),
dict(name='div', attrs={'id':'corpoarticolo'})
]
remove_tags = [dict(name='div', attrs={'id':'menutop'}),
dict(name='div', attrs={'id':'fwnetblocco'}),
dict(name='table', attrs={'id':'strumenti'}),
dict(name='table', attrs={'id':'imgesterna'}),
dict(name='a', attrs={'class':'linkblu'}),
dict(name='a', attrs={'class':'link'}),
dict(name='span', attrs={'class':['boxocchiello','boxocchiello2','sezione']})
]
def parse_index(self):
feeds = []
for title, url in [(u'Politica', u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'),
(u'Torino', u'http://rss.feedsportal.com/c/32418/f/466938/index.rss')
]:
print url
articles = self.nz_parse_section(url)
if articles:
feeds.append((title, articles))
return feeds
I don't know why but the soup.findall don't find anything.
Probably it's the same problem that calibre find when parse itself the feed and don't put the correct values into title.
I don't understand why...
I am don't understand to use the normal method to parse the feeds (using get_article('links')) and override only the title.