Recipe for The New London Day

Being · 01-06-2011, 05:10 PM

Kovid Goyal: The following is a recipe for The Day, the newspaper of New London, Connecticut. It works, but it still needs cleaning up: it should strip the extraneous page content that surrounds the articles, and it contains some redundant lines that should be removed. Please post it (and fix it up, if you like).

from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1294342201(BasicNewsRecipe):
    """Calibre news recipe for The Day (New London, Connecticut).

    Articles are collected from the theday.com section RSS feeds, fetched in
    their print layout, and stripped of page furniture (tool bars, related
    links, comments, footers) before conversion.
    """

    title = u'New London Day'
    __author__ = 'Kovid Goyal, Sujata Raman and Being'
    description = 'State, local and business news from New London, CT'
    language = 'en'
    # Only articles at most this many days old are downloaded.
    oldest_article = 1
    max_articles_per_feed = 200

    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    # Drop everything that precedes the element with id="article".
    remove_tags_before = dict(id='article')
    # Drop everything that follows the article photo block.
    # NOTE: an earlier, overridden assignment used dict(id='article'); only
    # the last assignment ever took effect, so that is the one kept here.
    remove_tags_after = [{'class': ['photo_article']}]
    # Page furniture removed from inside the article.  Each entry is passed
    # to the HTML parser as a tag filter.  As above, only the effective
    # (last) of two competing lists is kept.
    remove_tags = [
        {'id': [
            "moduleArticleTools",
            "content-bottom",
            "rail",
            "articleRelates module",
            "toolSet",
            "relatedrailcontent",
            "div-wrapper",
            "beta",
            "atp-comments",
            "footer",
        ]},
        {'class': [
            "clearfix",
            "relatedTitle",
            "articleRelates module",
            "asset-footer",
            "tools",
            "comments",
            "featurePromo",
            "feature Promo fp-topjobs brownBackground",
            "clearfix fullSpan brownBackground",
            "curvedContent",
        ]},
        dict(name='font', attrs={'id': ["cr-other-headlines"]}),
    ]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        .date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center}
        .story{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .entry-asset{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    # (feed title, feed URL) pairs.
    feeds = [
        (u'All News', u'http://www.theday.com/section/rss'),
        (u'Breaking News', u'http://www.theday.com/section/rss01'),
        (u'Police and Courts', u'http://www.theday.com/section/rss02'),
        (u'State News', u'http://www.theday.com/section/rss03'),
        (u'Local Business', u'http://www.theday.com/section/rss04'),
        (u'Entertainment', u'http://www.theday.com/section/rss05'),
        (u'Opinion', u'http://www.theday.com/section/rss06'),
        (u'Casinos', u'http://www.theday.com/section/rss12'),
        (u'Defense and Military', u'http://www.theday.com/section/rss14'),
        (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'),
        (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'),
        (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'),
        (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),
    ]

    def print_version(self, url):
        """Return the URL of the print layout for the article at *url*.

        URLs that do not contain '/index.html' are returned unchanged.
        """
        return url.replace('/index.html', '/print.html')

    def get_article_url(self, article):
        """Return the URL to fetch for a parsed feed entry.

        Preference order: the FeedBurner original link, then the entry's
        guid, then its plain link.  Returns None when none is present.
        """
        fallback = article.get('guid', article.get('link'))
        return article.get('feedburner_origlink', fallback)

    def postprocess_html(self, soup, first_fetch):
        """Flatten table layout and remove leftover non-article elements.

        :param soup: parsed HTML of the downloaded page; modified in place.
        :param first_fetch: unused here.
        :return: the same *soup* object.
        """
        # Turn table scaffolding into plain blocks so the text reflows.
        for cell in soup.findAll(['table', 'tr', 'td']):
            cell.name = 'div'

        # The attribute filter must be passed as ``attrs=``; wrapping it in
        # another dict makes the parser look for an HTML attribute literally
        # named "attrs", which never matches.
        for form in soup.findAll('form', attrs={'name': ["comments_form"]}):
            form.extract()
        for font in soup.findAll('font', attrs={'id': ["cr-other-headlines"]}):
            font.extract()

        return soup

01-06-2011, 05:10 PM	#1
Being Junior Member Posts: 7 Karma: 10 Join Date: Dec 2009 Device: Kindle	Recipe for The New London Day Kovid Goyal: The following is a recipe for the New London Day in New London, Connecticut. It works, but it needs cleaning up to remove extraneous stuff besides the articles, and to remove some redundant lines. Please post it (and fix it up, if you like). from __future__ import with_statement __license__ = 'GPL 3' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __docformat__ = 'restructuredtext en' from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1294342201(BasicNewsRecipe): title = u'New London Day' __author__ = 'Kovid Goyal, Sujata Raman and Being' description = 'State, local and business news from New London, CT' language = 'en' oldest_article = 1 max_articles_per_feed = 200 use_embedded_content = False no_stylesheets = True remove_javascript = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), dict(name=['script', 'noscript', 'style'])] remove_tags_after = [ {'class':['photo_article',]} ] remove_tags = [{'id':["moduleArticleTools","content-bottom","rail","articleRelates module","toolSet","relatedrailcontent","div-wrapper","beta","atp-comments","footer"]}, {'class':["clearfix","relatedTitle","articleRelates module","asset-footer","tools","comments","featurePromo","feature Promo fp-topjobs brownBackground","clearfix fullSpan brownBackground","curvedContent"]}, dict(name='font',attrs={'id':["cr-other-headlines"]})] extra_css = ''' h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;} h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;} .byline {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} 
.date {font-family:Arial,Helvetica,sans-serif; font-size:xx-small;} p{font-family:Arial,Helvetica,sans-serif;font-size:small;} .copyright {font-family:Arial,Helvetica,sans-serif;font-size:xx-small;text-align:center} .story{font-family:Arial,Helvetica,sans-serif;font-size:small;} .entry-asset asset hentry{font-family:Arial,Helvetica,sans-serif;font-size:small;} .pagebody{font-family:Arial,Helvetica,sans-serif;font-size:small;} .maincontentcontainer{font-family:Arial,Helvetica,sans-serif;font-size:small;} .story-body{font-family:Arial,Helvetica,sans-serif;font-size:small;} body{font-family:Helvetica,Arial,sans-serif;font-size:small;} ''' feeds = [ (u'All News', u'http://www.theday.com/section/rss'), (u'Breaking News', u'http://www.theday.com/section/rss01'), (u'Police and Courts', u'http://www.theday.com/section/rss02'), (u'State News', u'http://www.theday.com/section/rss03'), (u'Local Business', u'http://www.theday.com/section/rss04'), (u'Entertainment', u'http://www.theday.com/section/rss05'), (u'Opinion', u'http://www.theday.com/section/rss06'), (u'Casinos', u'http://www.theday.com/section/rss12'), (u'Defense and Military', u'http://www.theday.com/section/rss14'), (u'Ann Baldelli Ruminations', u'http://www.theday.com/section/rss20'), (u'Paul Choiniere Ruminations', u'http://www.theday.com/section/rss21'), (u'Michael Costanza Omnivore', u'http://www.theday.com/section/rss23'), (u'Rebecca Dangelo Reel Life', u'http://www.theday.com/section/rss25'),] def print_version(self, url): return url.replace('/index.html', '/print.html') def get_article_url(self, article): print article.get('feedburner_origlink', article.get('guid', article.get('link'))) return article.get('feedburner_origlink', article.get('guid', article.get('link'))) def postprocess_html(self, soup, first_fetch): for t in soup.findAll(['table', 'tr', 'td']): t.name = 'div' for tag in soup.findAll('form', dict(attrs={'name':["comments_form"]})): tag.extract() for tag in soup.findAll('font', 
dict(attrs={'id':["cr-other-headlines"]})): tag.extract() return soup

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Recipe works when mocked up as Python file, fails when converted to Recipe	ode	Recipes	7	09-04-2011 04:57 AM
Other Fiction Day, Holman: The Rainy Day Railroad War. V1. 31 Dec 2010	crutledge	Kindle Books	0	12-31-2010 12:43 PM
Other Fiction Day, Holman: The Rainy Day Railroad War. V1. 31 Dec 2010	crutledge	ePub Books	0	12-31-2010 12:42 PM
Other Fiction Day, Holman: The Rainy Day Railroad War. V1. 31 Dec 2010	crutledge	BBeB/LRF Books	0	12-31-2010 12:39 PM
Hello from London	PaulJWK	Introduce Yourself	10	07-09-2010 09:50 AM