Thanks man, it's a great start! I tried to refine it somewhat further.
Key changes:
- Removed more "value-add" content from the article (social and related links, etc.)
- Fixed multi-page layout issues; it should always pull the full article now
- Fixed sports and auto feed naming
- Fixed commentary feed link
- Added code to find and eliminate duplicate articles across multiple feeds (e.g., the same article showing up in Business and Investing)
- Added Toronto section
- Changed cover image
Outstanding issues:
- Failing articles: ~5% of articles randomly end up being empty, any ideas on how to debug?
- Articles with only media content, would be great to identify and eliminate
Thoughts? Comments?
- guterm
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2011, Szing, guterm'
__docformat__ = 'restructuredtext en'
'''
globeandmail.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class TheGlobeAndMailAdvancedRecipe(BasicNewsRecipe):
    """Calibre news recipe for The Globe and Mail (globeandmail.com).

    Pulls a fixed list of section RSS feeds, rewrites article links to the
    mobile rendering, strips sharing/related-link clutter, collapses pages
    that carry two ``id="article"`` containers down to one, and drops
    articles whose URL already appeared in an earlier feed.
    """

    title = u'The Globe And Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing, guterm'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'

    # (section title, RSS URL) pairs, in the order they are listed here.
    feeds = [
        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
        (u'Toronto', u'http://www.theglobeandmail.com/news/national/toronto/?service=rss'),
        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
    ]

    # Tag matchers for the parts of the page that make up the article.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlemeta', 'articlecopy']}),
        # NOTE(review): this matches an element *named* "id", which is not an
        # HTML tag. postprocess_html() looks for id="article", so this was
        # presumably meant to be dict(attrs={'id': 'article'}). Left unchanged
        # until confirmed against a live page.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    # Tag matchers for social/related links, tool bars and pagination.
    remove_tags = [
        dict(name='ul', attrs={'class': ['pillboxcontainer arttoolsbpbx']}),
        dict(name='div', attrs={'class': ['relcont', 'articleTools', 'ypad fontsmall', 'pagination']}),
        dict(name='a', attrs={'href': ['javascript:void(0);', 'http://m.yp.ca?tracking=globeandmail']}),
        dict(name='div', attrs={'id': ['ShareArticles', 'topStories']}),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Keep a single ``id="article"`` element when the page has two.

        Multi-page articles can carry both a paginated and a single-page
        copy of the text; the single-page copy may come first or last. When
        exactly two elements with ``id="article"`` are present, the one with
        fewer direct children is removed (on a tie the first one is removed).
        Any other count leaves the document untouched.

        :param soup: parsed article document; modified in place.
        :param first_fetch: unused here; part of the hook signature.
        :return: the same ``soup`` object.
        """
        candidates = soup.findAll(True, {'id': 'article'})
        if len(candidates) == 2:
            first, second = candidates
            if len(first.contents) > len(second.contents):
                second.extract()
            else:
                first.extract()
        return soup

    def parse_feeds(self, *args, **kwargs):
        """Parse all feeds, then drop articles already seen in earlier feeds.

        An article is a duplicate when its ``url`` matches one from an
        earlier feed (or earlier in the same feed); the first occurrence
        wins. Arguments are forwarded unchanged to
        ``BasicNewsRecipe.parse_feeds``.

        :return: the parsed feeds, each with ``articles`` replaced by its
            de-duplicated list.
        """
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)

        seen_urls = set()
        for feed in parsed_feeds:
            unique_articles = []
            # Build the filtered list without touching feed.articles during
            # the loop. The original called feed.articles.remove() here; if
            # iterating a feed walks that same list, each removal makes the
            # loop skip the next article, which was then lost when
            # feed.articles was reassigned below.
            for article in feed:
                if article.url in seen_urls:
                    continue
                seen_urls.add(article.url)
                unique_articles.append(article)
            feed.articles = unique_articles
        return parsed_feeds

    # Cover image for the generated periodical.
    cover_url = 'http://www.freewarepocketpc.net/wp7/img/the-globe-and-mail.png'

    # Use the mobile version rather than the web version
    def print_version(self, url):
        """Return ``url`` with ``cmpid=rss1`` replaced by ``service=mobile``.

        URLs that do not contain ``cmpid=rss1`` are returned unchanged.
        """
        return url.replace('cmpid=rss1', 'service=mobile')