Junior Member
Posts: 7
Karma: 10
Join Date: Dec 2009
Device: Kindle
|
This recipe for The Hartford Courant is a little more complete, with the addition of national news, sports, etc. The user can edit it to their liking, adding more columnists, etc., by going to www.courant.com, clicking on RSS at the bottom, and getting the correct URLs for the RSS feeds to add. For example, Politics is included with this line:
('Politics', 'http://feeds.feedburner.com/courant-politics/'),
Here's the complete recipe:
Code:
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
class ChicagoTribune(BasicNewsRecipe):
    """calibre news recipe that downloads The Hartford Courant via its RSS feeds.

    The class keeps the name it was given when it was adapted from another
    recipe; the publication it actually fetches is named by ``title``.
    To add or remove sections, edit ``feeds`` below with
    ``(section title, feed URL)`` pairs.
    """

    title = 'The Hartford Courant'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Politics, local and business news from Hartford'
    language = 'en'

    # The feeds only carry summaries, so the full article pages are fetched.
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Containers holding the article body; everything outside them is dropped.
    keep_only_tags = [
        dict(name='div', attrs={'class': ["story", "entry-asset asset hentry"]}),
        dict(name='div', attrs={'id': ["pagebody", "story", "maincontentcontainer"]}),
    ]

    # Anything following the article photo block is discarded.
    remove_tags_after = [{'class': ['photo_article']}]

    # Page furniture (tool bars, related links, comments, promos, footer)
    # that is stripped from inside the kept containers.
    remove_tags = [
        {'id': [
            "moduleArticleTools", "content-bottom", "rail",
            "articleRelates module", "toolSet", "relatedrailcontent",
            "div-wrapper", "beta", "atp-comments", "footer",
        ]},
        {'class': [
            "clearfix", "relatedTitle", "articleRelates module",
            "asset-footer", "tools", "comments", "featurePromo",
            "featurePromo fp-topjobs brownBackground",
            "clearfix fullSpan brownBackground", "curvedContent",
        ]},
        dict(name='font', attrs={'id': ["cr-other-headlines"]}),
    ]

    # The original `.entry-asset asset hentry` rule was a descendant selector
    # for <asset>/<hentry> elements and could never match; `.entry-asset`
    # targets the div whose class attribute is "entry-asset asset hentry".
    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    # (section title, feed URL) pairs, in the order they appear in the output.
    feeds = [
        ('Breaking News', 'http://feeds.feedburner.com/courant-breaking-news/'),
        ('Nation/World News', 'http://feeds.feedburner.com/courant-nation-world/'),
        ('Connecticut News', 'http://feeds.feedburner.com/courant-connecticut-news/'),
        ('Hartford News', 'http://feeds.feedburner.com/courant-hartford/'),
        ('West Hartford News', 'http://feeds.feedburner.com/courant-west-hartford/'),
        ('Bristol', 'http://feeds.feedburner.com/courant-bristol/'),
        ('Politics', 'http://feeds.feedburner.com/courant-politics/'),
        ('Opinion', 'http://feeds.feedburner.com/courant-opinion/'),
        ('Editorials', 'http://feeds.feedburner.com/courant-editorials/'),
        ('Letters', 'http://feeds.feedburner.com/courant-letters/'),
        ('Bob Englehart', 'http://feeds2.feedburner.com/BobEnglehartEnglehartsView'),
        ('Business', 'http://feeds.feedburner.com/courant-business/'),
        ('Sports', 'http://feeds.feedburner.com/courant-sports/'),
        ('Features', 'http://feeds.feedburner.com/courant-features/'),
        ('Consumer', 'http://feeds.feedburner.com/courant-consumer/'),
        ('Shopping', 'http://feeds.feedburner.com/courant-shopping/'),
        # NOTE(review): the next two titles look swapped relative to their
        # feed slugs (entertainment vs. stage) -- confirm before changing.
        ('Arts & Theater', 'http://feeds.feedburner.com/courant-entertainment/'),
        ('Entertainment', 'http://feeds.feedburner.com/courant-stage/'),
        ('Music', 'http://feeds.feedburner.com/courant-music/'),
        ('TV', 'http://feeds.feedburner.com/courant-tv/'),
        ('Movies', 'http://feeds.feedburner.com/courant-movies/'),
        #('Metromix headlines', 'http://feeds.feedburner.com/metromix/topheadlines/'),
        #('Metromix events', 'http://feeds.feedburner.com/metromix/events/'),
        #('Metromix restaurants', 'http://feeds.feedburner.com/metromix/restaurants/'),
        ('Outdoors', 'http://feeds.feedburner.com/courant-outdoors/'),
        ('Peter Marteka', 'http://feeds.feedburner.com/courant-marteka-column/'),
        ('Susan Campbell', 'http://feeds.feedburner.com/courant-campbell-column/'),
        ('Helen Ubinas', 'http://feeds.feedburner.com/courant-helen-ubinas-column/'),
        ('Jim Shea', 'http://feeds.feedburner.com/courant-jim-shea-column/'),
        ('Tom Condon', 'http://feeds.feedburner.com/courant-tom-condon-column/'),
        ('Colin McEnroe', 'http://feeds.feedburner.com/courant-colin-mcenroe-column/'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for a parsed feed entry.

        Prefers the ``feedburner_origlink`` field, then ``guid``, then
        ``link``; returns ``None`` when the entry has none of them.
        """
        # A debug ``print`` statement used to sit here; it was Python-2-only
        # syntax and wrote one line to stdout for every article.
        return article.get('feedburner_origlink',
                           article.get('guid', article.get('link')))

    def postprocess_html(self, soup, first_fetch):
        """Flatten table layout and strip leftover page furniture.

        :param soup: parsed article HTML; modified in place.
        :param first_fetch: unused by this recipe.
        :return: the same ``soup`` object.
        """
        # Turn table markup into plain divs so the text reflows.
        for cell in soup.findAll(['table', 'tr', 'td']):
            cell.name = 'div'

        # ``attrs`` must be passed by keyword: the previous positional
        # ``dict(attrs=...)`` searched for an HTML attribute literally called
        # "attrs", so neither of these elements was ever removed.
        for form in soup.findAll('form', attrs={'name': ["comments_form"]}):
            form.extract()
        for headlines in soup.findAll('font', attrs={'id': ["cr-other-headlines"]}):
            headlines.extract()
        return soup
Last edited by kovidgoyal; 12-27-2009 at 11:57 AM.
|