Thanks Scissors, I know it does — I guess it's my way of learning how calibre works. I wanted to parse the links myself from the webpage so that I can do some testing for duplicates etc. (when I combine multiple RSS feeds) and manually identify which ones I want to include. My latest version for the RTE website is:
Spoiler:
#The following recipe extracts the text from all the RSS articles that are linked. The photos on the RTE website do not lend themselves to being included in a recipe
from BeautifulSoup import BeautifulSoup
import urllib2
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.feedparser import parse
#import sys
# Module-level accumulators, filled in once at class-definition time when
# the RSS feeds are fetched below.  Each entry is a (title, link, date)
# tuple; masterUrl holds every link seen so far and is used to drop
# duplicates when the same story appears in more than one feed.
# NOTE: `global` statements at module scope are no-ops and were removed.
newsUrl = []
sportUrl = []
businessUrl = []
masterUrl = []
class RTE(BasicNewsRecipe):
title = 'RTE (Ireland)'
description = 'Morning Newspaper from Ireland'
__author__ = 'Edward Roche'
language = 'en'
oldest_article = 1.0
#Start by getting the rss feeds and saving them in lists by section
#News Headlines
entries = parse('http://www.rte.ie/rss/news.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
newsUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#Business Headlines
entries = parse('http://www.rte.ie/rss/business.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
businessUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#Sports Headlines
entries = parse('http://www.rte.ie/rss/sport.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
sportUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#The saved lists will each make up an article group in the ebook. For each article group add the headins to the TOC
def parse_index(self):
feeds = []
articles = self.RTE_parse_section(newsUrl)
feeds.append(('News Headlines', articles))
articles = self.RTE_parse_section(businessUrl)
feeds.append(('Business Headlines', articles))
articles = self.RTE_parse_section(sportUrl)
feeds.append(('Sport Headlines', articles))
return feeds
#Each article group will be made up of articles, set up the articles based on the URLS that we have already gotten
def RTE_parse_section(self, link):
current_articles = []
for file in link:
current_articles.append({'title': file[0], 'url': file[1], 'description':'', 'date':file[2]})
return current_articles
#Clean up the output
keep_only_tags = [
dict(name='div',attrs={'id': ['news-article-container']})
#,dict(name='article',attrs={'class': ['rte-sport-article']})
,dict(name='div',attrs={'class': ['rte_gr_8']})
]
remove_tags_after = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='div',attrs={'id': 'storyBody'})
]
remove_tags = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='div',attrs={'id': ['user-options-top','tab-group','related','photography','user-options-bottom']})
,dict(name='div',attrs={'class': ['clear','photo-count','thumbnails','news-gallery-regular','side-content multimedia video','side-content multimedia audio']})
,dict(name='a',attrs={'class': ['photo-prev','photo-next']})
, dict(name='meta')
, dict(name='link')
, dict(name='script')
,dict(name='figure')
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='section',attrs={'id': 'article-media-box'})
,dict(name='footer',attrs={'class': 'clearfix'})
,dict(name='nav',attrs={'id': 'breadcrumb'})
]
no_stylesheets = True
extra_css = '''
body {
#color: rgb(0,0,0);
#background-color:rgb(174,174,174);
text-align:justify;
line-spacing:1.8;
#margin-top:0px;
#margin-bottom:4px;
#margin-right:50px;
#margin-left:50px;
#text-indent:2em;
}
h1, h2, h3, h4, h5, h6 {
#color:white;
text-align:center;
font-style:italic;
font-weight:bold;
}
p {
text-align:left;
}
ul{
list-style: none
}
li {
list-style: none
padding-top:5px;
}
img {
}
'''
def preprocess_html(self, soup):
#outputFile = 'D:\My Python Sample Code\Calibre Recipes\RTE\RawSoup\output'+soup.title.string+'.ht ml'
#print "out " +outputFile
#if 'Final Countdown' in soup.title.string:
# sys.exit()
#f = open(outputFile,"w")
#f.write(soup.prettify())
#f.close()
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def get_cover_url(self):
url = 'http://dramafestival.ie/index.php_files/images/RTE%20logo.gif'
return url
This recipe extracts all the text from the news, business, and sport RSS feeds. It ignores the pictures, as they are difficult to handle on this site.