MobileRead Forums - View Single Post

cfholbert · 02-10-2011, 01:39 PM

Added delay=1 and that seemed to help reduce the number of articles that were showing up as garbage. Also added code to download a cover page. If anyone is interested, the recipe is posted below. Still looking for advice on how to solve the remaining problems.

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1278347258(BasicNewsRecipe):
    """calibre news recipe for the Salt Lake City Tribune.

    Articles are pulled from the sltrib.com RSS feeds listed in ``feeds``;
    each article page is reduced to the elements named in
    ``keep_only_tags`` minus those in ``remove_tags``. The cover image is
    looked up at download time by ``get_cover_url``.
    """

    title = u'Salt Lake City Tribune'
    __author__ = 'Charles Holbert'
    oldest_article = 1
    max_articles_per_feed = 100

    description = '''Utah's independent news source since 1871'''
    publisher = 'http://www.sltrib.com/'
    category = 'news, Utah, SLC'
    language = 'en'
    encoding = 'utf-8'
    # A one second delay between downloads reduced the number of articles
    # that came back as garbage (per the author's note accompanying the
    # recipe).
    delay = 1
    #simultaneous_downloads = 1
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True

    # Static alternatives kept for reference; the cover is resolved
    # dynamically in get_cover_url() instead.
    #masthead_url = 'http://www.sltrib.com/csp/cms/sites/sltrib/assets/images/logo_main.png'
    #cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg9/lg/UT_SLT.jpg'

    # Elements stripped from each article page.
    remove_tags = [
        dict(name='div', attrs={'id': ['teaser', 'adCol', 'keywordStories']}),
        dict(name='div', attrs={'class': 'tripleWide datos'}),
    ]

    # Only these elements of each article page are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'theImage'}),
        dict(name='div', attrs={'id': 'topImageCaption'}),
        dict(name='div', attrs={'class': 'theHeadline entry-title'}),
        dict(name='div', attrs={'class': 'byline'}),
        dict(name='div', attrs={'id': 'storytext'}),
    ]

    feeds = [
        (u'SL Tribune Today', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=All'),
        (u'Utah News', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=UtahNews'),
        (u'Business News', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=Money'),
        (u'Most Popular', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rsspopular.csp'),
        (u'Sports', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=Sports'),
    ]

    # keep_only_tags selects the story body by id ('storytext'), so the
    # body rule targets the id as well as the class of the same name.
    extra_css = '''
.theHeadline{font-family:Arial,Helvetica,sans-serif; font-size:xx-large; font-weight: bold; color:#0E5398;}
.byline{font-family:Arial,Helvetica,sans-serif; color:#333333; font-size:xx-small;}
#storytext, .storytext{font-family:Arial,Helvetica,sans-serif; font-size:medium;}
.articleText{font-family:Arial,Helvetica,sans-serif; font-size:medium;}
.caption{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; margin-bottom: 1em;}
'''

    def get_cover_url(self):
        """Return the URL of today's front-page image, or None.

        Fetches the Newseum "today's front pages" page for UT_SLT and
        returns the ``src`` of the first ``<img>`` inside the
        ``div.tfpLrgView_container`` element. Returns None when the
        container, the image, or its ``src`` attribute is missing.
        """
        href = 'http://www.newseum.org/todaysfrontpages/hr.asp?fpVname=UT_SLT&ref_pge=lst'
        soup = self.index_to_soup(href)
        # The class name is a single token; the version of this recipe
        # pasted to the forum had a stray space inserted into it, which
        # prevented the lookup from ever matching.
        container = soup.find('div', attrs={'class': 'tfpLrgView_container'})
        if container is None:
            return None
        image = container.img
        if image is None:
            return None
        return image.get('src')

02-10-2011, 01:39 PM	#2
cfholbert Junior Member Posts: 5 Karma: 10 Join Date: Feb 2011 Device: kindle, nook, nookcolor, PDN	Added delay=1 and that seemed to help reduce the number of articles that were showing up as garbage. Also added a code to download a cover page. If anyone is interested, the recipe is posted below. Still looking for advice on how to solve the remaining problems. from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1278347258(BasicNewsRecipe): title = u'Salt Lake City Tribune' __author__ = 'Charles Holbert' oldest_article = 1 max_articles_per_feed = 100 description = '''Utah's independent news source since 1871''' publisher = 'http://www.sltrib.com/' category = 'news, Utah, SLC' language = 'en' encoding = 'utf-8' delay = 1 #simultaneous_downloads = 1 remove_javascript = True use_embedded_content = False no_stylesheets = True #masthead_url = 'http://www.sltrib.com/csp/cms/sites/sltrib/assets/images/logo_main.png' #cover_url = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg9/lg/UT_SLT.jpg' remove_tags = [dict(name='div',attrs={'id':['teaser','adCol', 'keywordStories']}) ,dict(name='div',attrs={'class':'tripleWide datos'})] keep_only_tags = [dict(name='div',attrs={'class':'theImage'}) ,dict(name='div',attrs={'id':'topImageCaption'}) ,dict(name='div',attrs={'class':'theHeadline entry-title'}) ,dict(name='div',attrs={'class':'byline'}) ,dict(name='div',attrs={'id':'storytext'})] feeds = [(u'SL Tribune Today', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=All'), (u'Utah News', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=UtahNews'), (u'Business News', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=Money'), (u'Most Popular', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rsspopular.csp'), (u'Sports', u'http://www.sltrib.com/csp/cms/sites/sltrib/RSS/rss.csp?cat=Sports')] extra_css = ''' .theHeadline{font-family:Arial,Helvetica,sans-serif; font-size:xx-large; font-weight: bold; color:#0E5398;} 
.byline{font-family:Arial,Helvetica,sans-serif; color:#333333; font-size:xx-small;} .storytext{font-family:Arial,Helvetica,sans-serif; font-size:medium;} .articleText{font-family:Arial,Helvetica,sans-serif; font-size:medium;} .caption{font-family:Arial,Helvetica,sans-serif; font-size:xx-small; margin-bottom: 1em;} ''' def get_cover_url(self): cover_url = None href = 'http://www.newseum.org/todaysfrontpages/hr.asp?fpVname=UT_SLT&ref_pge=lst' soup = self.index_to_soup(href) div = soup.find('div',attrs={'class':'tfpLrgView_contain er'}) if div: cover_url = div.img['src'] return cover_url