Thanks Scissors, I know it does — I guess it's my way of learning how calibre works. I wanted to parse the links myself from the webpage so that I can do some testing for duplicates etc. (when I combine multiple RSS feeds) and manually identify which ones I want to include. My latest version for the RTE website is:
Spoiler:
#The following recipe extracts the text from all the RSS articles that are linked. The photos on the RTE website do not lend themselves to being included in a recipe
from BeautifulSoup import BeautifulSoup
import urllib2
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.feedparser import parse
#import sys
# Module-level accumulators, filled in once at class-definition time when
# the RSS feeds are fetched below.  Each entry is a (title, link, date)
# tuple; masterUrl holds every link seen so far and is used to drop
# duplicates when the same story appears in more than one feed.
# NOTE: `global` statements at module scope are no-ops and were removed.
newsUrl = []
sportUrl = []
businessUrl = []
masterUrl = []
class RTE(BasicNewsRecipe):
title = 'RTE (Ireland)'
description = 'Morning Newspaper from Ireland'
__author__ = 'Edward Roche'
language = 'en'
oldest_article = 1.0
#Start by getting the rss feeds and saving them in lists by section
#News Headlines
entries = parse('http://www.rte.ie/rss/news.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
newsUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#Business Headlines
entries = parse('http://www.rte.ie/rss/business.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
businessUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#Sports Headlines
entries = parse('http://www.rte.ie/rss/sport.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
sportUrl.append( ( feedtitle , link, date))
masterUrl.append(link)
#The saved lists will each make up an article group in the ebook. For each article group add the headins to the TOC
def parse_index(self):
feeds = []
articles = self.RTE_parse_section(newsUrl)
feeds.append(('News Headlines', articles))
articles = self.RTE_parse_section(businessUrl)
feeds.append(('Business Headlines', articles))
articles = self.RTE_parse_section(sportUrl)
feeds.append(('Sport Headlines', articles))
return feeds
#Each article group will be made up of articles, set up the articles based on the URLS that we have already gotten
def RTE_parse_section(self, link):
current_articles = []
for file in link:
current_articles.append({'title': file[0], 'url': file[1], 'description':'', 'date':file[2]})
return current_articles
#Clean up the output
keep_only_tags = [
dict(name='div',attrs={'id': ['news-article-container']})
#,dict(name='article',attrs={'class': ['rte-sport-article']})
,dict(name='div',attrs={'class': ['rte_gr_8']})
]
remove_tags_after = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='div',attrs={'id': 'storyBody'})
]
remove_tags = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='div',attrs={'id': ['user-options-top','tab-group','related','photography','user-options-bottom']})
,dict(name='div',attrs={'class': ['clear','photo-count','thumbnails','news-gallery-regular','side-content multimedia video','side-content multimedia audio']})
,dict(name='a',attrs={'class': ['photo-prev','photo-next']})
, dict(name='meta')
, dict(name='link')
, dict(name='script')
,dict(name='figure')
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='section',attrs={'id': 'article-media-box'})
,dict(name='footer',attrs={'class': 'clearfix'})
,dict(name='nav',attrs={'id': 'breadcrumb'})
]
no_stylesheets = True
extra_css = '''
body {
#color: rgb(0,0,0);
#background-color:rgb(174,174,174);
text-align:justify;
line-spacing:1.8;
#margin-top:0px;
#margin-bottom:4px;
#margin-right:50px;
#margin-left:50px;
#text-indent:2em;
}
h1, h2, h3, h4, h5, h6 {
#color:white;
text-align:center;
font-style:italic;
font-weight:bold;
}
p {
text-align:left;
}
ul{
list-style: none
}
li {
list-style: none
padding-top:5px;
}
img {
}
'''
def preprocess_html(self, soup):
#outputFile = 'D:\My Python Sample Code\Calibre Recipes\RTE\RawSoup\output'+soup.title.string+'.ht ml'
#print "out " +outputFile
#if 'Final Countdown' in soup.title.string:
# sys.exit()
#f = open(outputFile,"w")
#f.write(soup.prettify())
#f.close()
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup
def get_cover_url(self):
url = 'http://dramafestival.ie/index.php_files/images/RTE%20logo.gif'
return url
This recipe extracts all the text from the news, business, and sport RSS feeds. It ignores the pictures, as they are difficult to handle on this site.