View Single Post
Old 01-15-2011, 10:43 PM   #3
guterm
Junior Member
guterm began at the beginning.
 
Posts: 9
Karma: 10
Join Date: Jan 2011
Device: Sony PRS-650
And here is an even more polished recipe.
Failing downloads are fixed, requests now go straight to the mobile site to avoid the redirect, and there are other minor tweaks.

/guterm

Code:
#!/usr/bin/env  python
# Module-level metadata for this recipe file: license, copyright holders and
# the docstring markup format.
__license__   = 'GPL v3'

__copyright__ = '2011, Szing, guterm'
__docformat__ = 'restructuredtext en'

# NOTE: this string comes after the assignments above, so it is a bare
# expression statement naming the target site, not the module docstring.
'''
globeandmail.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheGlobeAndMailAdvancedRecipe(BasicNewsRecipe):
    """Recipe that builds an e-book from The Globe and Mail RSS feeds.

    Articles are fetched from the mobile site (see ``print_version``),
    articles whose URL was already seen in an earlier position are dropped
    (see ``parse_feeds``), and pages that contain two ``id="article"``
    elements are reduced to the larger one (see ``postprocess_html``).
    """

    title = u'The Globe And Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing, guterm'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'
    cover_url = 'http://www.freewarepocketpc.net/wp7/img/the-globe-and-mail.png'

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
        (u'Toronto', u'http://www.theglobeandmail.com/news/national/toronto/?service=rss'),
        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
    ]

    # Element selectors for the parts of each page that are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlemeta', 'articlecopy', 'columnist', 'blog']}),
        # NOTE(review): no HTML element is named 'id'; this entry presumably
        # meant attrs={'id': 'article'}. Left unchanged - confirm against the
        # mobile site's markup before altering it.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    # Element selectors for the parts that are stripped from the kept content.
    remove_tags = [
        dict(name='ul', attrs={'class': ['pillboxcontainer arttoolsbpbx']}),
        dict(name='div', attrs={'class': ['relcont', 'articleTools', 'ypad fontsmall', 'pagination']}),
        dict(name='a', attrs={'href': ['javascript:void(0);', 'http://m.yp.ca?tracking=globeandmail']}),
        dict(name='div', attrs={'id': ['ShareArticles', 'topStories', 'seealsobottom']}),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Keep only the larger of two ``id="article"`` elements.

        The single-page article layout can be either the first or the last
        of the two, so the one with more child nodes is preserved and the
        other is removed from the tree. On a tie the second one is kept.
        Pages with any other number of such elements are left untouched.

        :param soup: parsed page to clean up (modified in place).
        :param first_fetch: unused here.
        :return: the same ``soup`` object.
        """
        candidates = soup.findAll(True, {'id': 'article'})
        if len(candidates) == 2:
            first, second = candidates
            if len(first.contents) > len(second.contents):
                second.extract()
            else:
                first.extract()
        return soup

    def parse_feeds(self, *args, **kwargs):
        """Parse the feeds and drop articles whose URL was already seen.

        The first occurrence of a URL wins, across all feeds in the order
        returned by ``BasicNewsRecipe.parse_feeds``.

        :return: the parsed feeds, each with ``articles`` replaced by its
            de-duplicated list.
        """
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        seen_urls = set()

        for feed in parsed_feeds:
            unique_articles = []
            for article in feed:
                if article.url in seen_urls:
                    # Just skip it: removing from feed.articles here would
                    # mutate a list that may be under iteration and make the
                    # loop jump over the following article.
                    continue
                seen_urls.add(article.url)
                unique_articles.append(article)
            feed.articles = unique_articles

        return parsed_feeds

    def print_version(self, url):
        """Return ``url`` rewritten to point at the mobile site.

        Every ``'cmpid=rss1'`` is replaced with ``'service=mobile'`` and
        every ``'http://www.'`` with ``'http://m.'``, so the download goes
        to the mobile version rather than the web version.
        """
        mobile_query = url.replace('cmpid=rss1', 'service=mobile')
        return mobile_query.replace('http://www.', 'http://m.')
guterm is offline   Reply With Quote