View Single Post
Old 09-07-2012, 04:13 AM   #4
eroche
Junior Member
eroche began at the beginning.
 
Posts: 4
Karma: 10
Join Date: Sep 2012
Device: sony ereader
Thanks Scissors, I know it does — I guess it's my way of learning how calibre works. I wanted to parse the links myself from the webpage so that I can do some testing for duplicates etc. (when I combine multiple RSS feeds) and manually identify which ones I want to include. My latest version for the RTE website is:

Spoiler:
#The following recipe extracts the text from all the RSS articles that are linked. The photos on the RTE website do not lend themselves to being included in a recipe

from BeautifulSoup import BeautifulSoup
import urllib2
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.feedparser import parse
#import sys

global newsUrl
newsUrl = []
global sportUrl
sportUrl = []
global businessUrl
businessUrl = []
global masterUrl
masterUrl = []


class RTE(BasicNewsRecipe):
title = 'RTE (Ireland)'
description = 'Morning Newspaper from Ireland'
__author__ = 'Edward Roche'
language = 'en'
oldest_article = 1.0


#Start by getting the rss feeds and saving them in lists by section

#News Headlines
entries = parse('http://www.rte.ie/rss/news.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
newsUrl.append( ( feedtitle , link, date))
masterUrl.append(link)

#Business Headlines
entries = parse('http://www.rte.ie/rss/business.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
businessUrl.append( ( feedtitle , link, date))
masterUrl.append(link)

#Sports Headlines
entries = parse('http://www.rte.ie/rss/sport.xml').entries
for i, item in enumerate(entries):
feedtitle = item.get('title')
link = item.get('link')
description = item.get('description')
author = item.get('author')
date = item.get('date')
duplicateInd = False
for i in masterUrl:
if link == i:
duplicateInd = True
print "duplicate found =, ", link
if duplicateInd == False:
sportUrl.append( ( feedtitle , link, date))
masterUrl.append(link)

#The saved lists will each make up an article group in the ebook. For each article group add the headins to the TOC

def parse_index(self):
feeds = []
articles = self.RTE_parse_section(newsUrl)
feeds.append(('News Headlines', articles))
articles = self.RTE_parse_section(businessUrl)
feeds.append(('Business Headlines', articles))
articles = self.RTE_parse_section(sportUrl)
feeds.append(('Sport Headlines', articles))
return feeds

#Each article group will be made up of articles, set up the articles based on the URLS that we have already gotten
def RTE_parse_section(self, link):
current_articles = []
for file in link:
current_articles.append({'title': file[0], 'url': file[1], 'description':'', 'date':file[2]})
return current_articles


#Clean up the output
keep_only_tags = [
dict(name='div',attrs={'id': ['news-article-container']})
#,dict(name='article',attrs={'class': ['rte-sport-article']})
,dict(name='div',attrs={'class': ['rte_gr_8']})
]



remove_tags_after = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='div',attrs={'id': 'storyBody'})
]

remove_tags = [
dict(name='ul',attrs={'class': 'keywords'})
,dict(name='div',attrs={'id': ['user-options-top','tab-group','related','photography','user-options-bottom']})
,dict(name='div',attrs={'class': ['clear','photo-count','thumbnails','news-gallery-regular','side-content multimedia video','side-content multimedia audio']})
,dict(name='a',attrs={'class': ['photo-prev','photo-next']})
, dict(name='meta')
, dict(name='link')
, dict(name='script')
,dict(name='figure')
,dict(name='p',attrs={'class': 'sticky-footer-leadin'})
,dict(name='section',attrs={'id': 'article-media-box'})
,dict(name='footer',attrs={'class': 'clearfix'})
,dict(name='nav',attrs={'id': 'breadcrumb'})
]

no_stylesheets = True

extra_css = '''
body {
#color: rgb(0,0,0);
#background-color:rgb(174,174,174);
text-align:justify;
line-spacing:1.8;
#margin-top:0px;
#margin-bottom:4px;
#margin-right:50px;
#margin-left:50px;
#text-indent:2em;
}
h1, h2, h3, h4, h5, h6 {
#color:white;
text-align:center;
font-style:italic;
font-weight:bold;
}
p {
text-align:left;
}
ul{
list-style: none
}
li {
list-style: none
padding-top:5px;
}
img {
}

'''


def preprocess_html(self, soup):
#outputFile = 'D:\My Python Sample Code\Calibre Recipes\RTE\RawSoup\output'+soup.title.string+'.ht ml'
#print "out " +outputFile
#if 'Final Countdown' in soup.title.string:
# sys.exit()
#f = open(outputFile,"w")
#f.write(soup.prettify())
#f.close()
for alink in soup.findAll('a'):
if alink.string is not None:
tstr = alink.string
alink.replaceWith(tstr)
return soup

def get_cover_url(self):
url = 'http://dramafestival.ie/index.php_files/images/RTE%20logo.gif'
return url



This recipe extracts all the text from the news, business and sport rss feeds. It ignores the pictures as they are difficult to handle from this site.

Last edited by eroche; 09-07-2012 at 07:03 AM.
eroche is offline   Reply With Quote