Quote:
Originally Posted by Starson17
I suspect there might be some questions here that I can help with... but perhaps not.
More info about whether there's a question and what it is might help me decide. 
|
This is my recipe:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class LaStampaParseIndex(BasicNewsRecipe):
    """Debug recipe that builds its feed list by hand in ``parse_index``.

    Each configured section URL is fetched and scraped by
    ``nz_parse_section``; sections that yield no article are omitted.
    """

    title = u'Debug Parse Index'
    cover_url = 'http://www.lastampa.it/edicola/PDF/1.pdf'
    remove_javascript = True
    no_stylesheets = True

    def nz_parse_section(self, url):
        """Scrape one article entry from the page at *url*.

        Only the first element with class ``entry`` is examined, so at
        most one article is returned per section.

        Returns a list of article dicts with the keys ``title``, ``url``,
        ``description`` and ``date``. The list is empty when the page has
        no ``entry`` element or that element contains no link, instead of
        raising ``AttributeError`` on a ``None`` lookup result.
        """
        soup = self.index_to_soup(url)

        head = soup.find(attrs={'class': 'entry'})
        if head is None:
            self.log('no entry element found in ', url)
            return []

        link = head.find('a', href=True)
        if link is None:
            self.log('no article link found in ', url)
            return []

        # NOTE(review): 'feedEntryConteny' looks like a typo (perhaps
        # 'feedEntryContent') -- confirm against the page markup before
        # changing it; as written the lookup may simply find nothing.
        descr = soup.find(attrs={'class': 'feedEntryConteny'})
        dt = soup.find(attrs={'class': 'lastUpdated'})

        title = self.tag_to_string(link)
        # Kept in its own local so the section URL parameter is not clobbered.
        article_url = link.get('href', False)
        # A missing description/date element degrades to an empty string.
        description = self.tag_to_string(descr) if descr is not None else ''
        date = self.tag_to_string(dt) if dt is not None else ''

        self.log('title ', title)
        self.log('url ', article_url)
        self.log('description ', description)
        self.log('date ', date)

        return [{
            'title': title,
            'url': article_url,
            'description': description,
            'date': date,
        }]

    # Tag selectors applied to downloaded article pages (see the calibre
    # BasicNewsRecipe documentation for keep_only_tags / remove_tags).
    keep_only_tags = [
        dict(attrs={'class': ['boxocchiello2', 'titoloRub', 'titologir',
                              'catenaccio', 'sezione', 'articologirata']}),
        dict(name='div', attrs={'id': 'corpoarticolo'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'id': 'menutop'}),
        dict(name='div', attrs={'id': 'fwnetblocco'}),
        dict(name='table', attrs={'id': 'strumenti'}),
        dict(name='table', attrs={'id': 'imgesterna'}),
        dict(name='a', attrs={'class': 'linkblu'}),
        dict(name='a', attrs={'class': 'link'}),
        dict(name='span', attrs={'class': ['boxocchiello', 'boxocchiello2',
                                           'sezione']}),
    ]

    def parse_index(self):
        """Return ``[(section_title, articles), ...]`` for the sections below.

        A section is included only when ``nz_parse_section`` returns a
        non-empty article list for its URL.
        """
        sections = [
            (u'Politica',
             u'http://www.lastampa.it/redazione/cmssezioni/politica/rss_politica.xml'),
            (u'Torino',
             u'http://rss.feedsportal.com/c/32418/f/466938/index.rss'),
        ]
        feeds = []
        for title, url in sections:
            articles = self.nz_parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds