View Full Version : Globe and Mail Recipe Rewrite..


Szing
11-15-2010, 01:00 PM
Here is a rewrite of the Globe & Mail recipe.

It has no ads
It has pictures related to the article
News download size is around 1.5-2 MB (epub)
It may have a problem with some multi page articles
It does not have author pictures



class AdvancedUserRecipe1287083651(BasicNewsRecipe):
    """Calibre news recipe for the Globe & Mail.

    Articles are discovered through the RSS feeds listed in ``feeds``; each
    article URL is rewritten by ``print_version`` to request the mobile
    rendering of the page.
    """

    title = u'Globe & Mail'
    __license__ = 'GPL v3'
    __copyright__ = '2010, Szing'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    category = 'news, Canada, world'
    language = 'en_CA'
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/report-on-business/commentary/?service=rss'),
        (u'Blogs', u'http://www.theglobeandmail.com/blogs/?service=rss'),
        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Top Political Stories', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        # The 'Auto' and 'Sports' titles used to point at each other's feed.
        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlemeta', 'articlecopy']}),
        # NOTE(review): this matches a tag *named* 'id' with class 'article';
        # possibly attrs={'id': 'article'} was intended -- confirm before changing.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': ['tabInside', 'ShareArticles', 'topStories']}),
    ]

    # This has to be here or the text in the article appears twice.
    remove_tags_after = [dict(id='article')]

    def print_version(self, url):
        """Return ``url`` with ``&service=mobile`` appended.

        Used to fetch the mobile version rather than the web version.
        NOTE(review): assumes ``url`` already carries a query string, since
        the parameter is joined with ``&`` -- confirm against the feed URLs.
        """
        return url + '&service=mobile'


-Szing

guterm
01-12-2011, 02:57 PM
Thanks man, it's a great start! I tried to refine it somewhat further.
Key changes:

Removed more "value-add" content from the article (social and related links, etc.)
Fixed multi-page layout issues, should always pull the full article now
Fixed sports and auto feed naming
Fixed commentary feed link
Added code to find and eliminate duplicate articles across multiple feeds (i.e., the same article showing up in Business and Investing)
Added Toronto section
Changed cover image


Outstanding issues:

Failing articles: ~5% of articles randomly end up being empty, any ideas on how to debug?
Articles with only media content, would be great to identify and eliminate


Thoughts? Comments?

- guterm

#!/usr/bin/env python
__license__ = 'GPL v3'

__copyright__ = '2011, Szing, guterm'
__docformat__ = 'restructuredtext en'

'''
globeandmail.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheGlobeAndMailAdvancedRecipe(BasicNewsRecipe):
    """Calibre news recipe for The Globe And Mail.

    Articles come from the RSS feeds in ``feeds``. Duplicate URLs across
    feeds are dropped in ``parse_feeds``, article links are rewritten by
    ``print_version`` and the fetched HTML is tidied in ``postprocess_html``.
    """

    title = u'The Globe And Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing, guterm'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
        (u'Toronto', u'http://www.theglobeandmail.com/news/national/toronto/?service=rss'),
        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlemeta', 'articlecopy']}),
        # NOTE(review): this matches a tag *named* 'id' with class 'article';
        # possibly attrs={'id': 'article'} was intended -- confirm before changing.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    remove_tags = [
        dict(name='ul', attrs={'class': ['pillboxcontainer arttoolsbpbx']}),
        dict(name='div', attrs={'class': ['relcont', 'articleTools', 'ypad fontsmall', 'pagination']}),
        dict(name='a', attrs={'href': ['javascript:void(0);', 'http://m.yp.ca?tracking=globeandmail']}),
        dict(name='div', attrs={'id': ['ShareArticles', 'topStories']}),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Keep only the larger of two ``id="article"`` elements.

        Find and preserve the single page article layout, which can be the
        first or the last match. When exactly two elements carry
        ``id="article"``, the one with fewer direct children is extracted
        (the first one on a tie). Any other match count leaves ``soup``
        untouched.

        :param soup: parsed article page
        :param first_fetch: unused here
        :return: the same ``soup`` object
        """
        candidates = soup.findAll(True, {'id': 'article'})
        if len(candidates) == 2:
            first, second = candidates
            if len(first.contents) > len(second.contents):
                second.extract()
            else:
                first.extract()
        return soup

    def parse_feeds(self, *args, **kwargs):
        """Parse the feeds and drop articles whose URL was already seen.

        The first occurrence of a URL (in feed order) is kept; later
        occurrences in the same or other feeds are discarded.

        :return: the feed list from ``BasicNewsRecipe.parse_feeds`` with each
            feed's ``articles`` replaced by its de-duplicated list
        """
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        seen_urls = set()
        for feed in parsed_feeds:
            unique_articles = []
            # Build a new list rather than removing from feed.articles while
            # iterating: in-place removal makes the iterator skip the article
            # that follows every duplicate, silently losing it.
            for article in feed:
                if article.url not in seen_urls:
                    seen_urls.add(article.url)
                    unique_articles.append(article)
            feed.articles = unique_articles
        return parsed_feeds

    # Cover image
    cover_url = 'http://www.freewarepocketpc.net/wp7/img/the-globe-and-mail.png'

    def print_version(self, url):
        """Return ``url`` with ``cmpid=rss1`` replaced by ``service=mobile``.

        Used to fetch the mobile version rather than the web version.
        """
        return url.replace('cmpid=rss1', 'service=mobile')

guterm
01-15-2011, 10:43 PM
And here is an even more polished recipe.
Failing downloads are fixed, sending right away to the mobile site avoiding redirect, other minor tweaks.

/guterm


#!/usr/bin/env python
__license__ = 'GPL v3'

__copyright__ = '2011, Szing, guterm'
__docformat__ = 'restructuredtext en'

'''
globeandmail.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheGlobeAndMailAdvancedRecipe(BasicNewsRecipe):
    """Calibre news recipe for The Globe And Mail (mobile site).

    Articles come from the RSS feeds in ``feeds``. Duplicate URLs across
    feeds are dropped in ``parse_feeds``, article links are pointed straight
    at the mobile host by ``print_version`` and the fetched HTML is tidied in
    ``postprocess_html``.
    """

    title = u'The Globe And Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing, guterm'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    extra_css = 'p.meta {font-size:75%}\n .redtext {color: red;}\n .byline {font-size: 70%}'

    # (section title, RSS URL) pairs.
    feeds = [
        (u'Top National Stories', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'Business', u'http://www.theglobeandmail.com/report-on-business/?service=rss'),
        (u'Investing', u'http://www.theglobeandmail.com/globe-investor/?service=rss'),
        (u'Politics', u'http://www.theglobeandmail.com/news/politics/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
        (u'Toronto', u'http://www.theglobeandmail.com/news/national/toronto/?service=rss'),
        (u'Facts & Arguments', u'http://www.theglobeandmail.com/life/facts-and-arguments/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Real Estate', u'http://www.theglobeandmail.com/real-estate/?service=rss'),
        (u'Auto', u'http://www.theglobeandmail.com/auto/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'meta', 'leadImage', 'redtext byline', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlemeta', 'articlecopy', 'columnist', 'blog']}),
        # NOTE(review): this matches a tag *named* 'id' with class 'article';
        # possibly attrs={'id': 'article'} was intended -- confirm before changing.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    remove_tags = [
        dict(name='ul', attrs={'class': ['pillboxcontainer arttoolsbpbx']}),
        dict(name='div', attrs={'class': ['relcont', 'articleTools', 'ypad fontsmall', 'pagination']}),
        dict(name='a', attrs={'href': ['javascript:void(0);', 'http://m.yp.ca?tracking=globeandmail']}),
        dict(name='div', attrs={'id': ['ShareArticles', 'topStories', 'seealsobottom']}),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Keep only the larger of two ``id="article"`` elements.

        Find and preserve the single page article layout, which can be the
        first or the last match. When exactly two elements carry
        ``id="article"``, the one with fewer direct children is extracted
        (the first one on a tie). Any other match count leaves ``soup``
        untouched.

        :param soup: parsed article page
        :param first_fetch: unused here
        :return: the same ``soup`` object
        """
        candidates = soup.findAll(True, {'id': 'article'})
        if len(candidates) == 2:
            first, second = candidates
            if len(first.contents) > len(second.contents):
                second.extract()
            else:
                first.extract()
        return soup

    def parse_feeds(self, *args, **kwargs):
        """Parse the feeds and drop articles whose URL was already seen.

        The first occurrence of a URL (in feed order) is kept; later
        occurrences in the same or other feeds are discarded.

        :return: the feed list from ``BasicNewsRecipe.parse_feeds`` with each
            feed's ``articles`` replaced by its de-duplicated list
        """
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        seen_urls = set()
        for feed in parsed_feeds:
            unique_articles = []
            # Build a new list rather than removing from feed.articles while
            # iterating: in-place removal makes the iterator skip the article
            # that follows every duplicate, silently losing it.
            for article in feed:
                if article.url not in seen_urls:
                    seen_urls.add(article.url)
                    unique_articles.append(article)
            feed.articles = unique_articles
        return parsed_feeds

    # Cover image
    cover_url = 'http://www.freewarepocketpc.net/wp7/img/the-globe-and-mail.png'

    def print_version(self, url):
        """Return the mobile-site URL for an article.

        Replaces ``cmpid=rss1`` with ``service=mobile`` and swaps the
        ``http://www.`` prefix for ``http://m.`` so the mobile version is
        requested directly rather than the web version.
        """
        mobile_query = url.replace('cmpid=rss1', 'service=mobile')
        return mobile_query.replace('http://www.', 'http://m.')

mufc
01-16-2011, 12:20 AM
"Added code to find and eliminate duplicating articles across multiple feeds (i.e., the same article showing up in Business and Investing)"
def parse_feeds(self, *args, **kwargs):
    """Parse the feeds and drop articles whose URL was already seen.

    The first occurrence of a URL (in feed order) is kept; later occurrences
    in the same or other feeds are discarded.

    :return: the feed list from ``BasicNewsRecipe.parse_feeds`` with each
        feed's ``articles`` replaced by its de-duplicated list
    """
    parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
    seen_urls = set()
    for feed in parsed_feeds:
        unique_articles = []
        # Build a new list rather than removing from feed.articles while
        # iterating: in-place removal makes the iterator skip the article
        # that follows every duplicate, silently losing it.
        for article in feed:
            if article.url not in seen_urls:
                seen_urls.add(article.url)
                unique_articles.append(article)
        feed.articles = unique_articles
    return parsed_feeds


If you run the recipe without this and with it you will notice that when it deletes an article it also deletes the article below it that does not occur anywhere else so you actually lose articles.

Also
def postprocess_html(self, soup, first_fetch):
    """Keep only the larger of two ``id="article"`` elements.

    Find and preserve the single page article layout, which can be the first
    or the last match. When exactly two elements carry ``id="article"``, the
    one with fewer direct children is extracted (the first one on a tie).
    Any other match count leaves ``soup`` untouched.

    :param soup: parsed article page
    :param first_fetch: unused here
    :return: the same ``soup`` object
    """
    candidates = soup.findAll(True, {'id': 'article'})
    if len(candidates) == 2:
        first, second = candidates
        if len(first.contents) > len(second.contents):
            second.extract()
        else:
            first.extract()
    return soup


All this did was get rid of the links to the rest of the pages but did not add the rest of the article from the other pages

kovidgoyal
01-16-2011, 01:19 AM
Guys, when you settle on a final recipe, let me know so I can update the builtin one in calibre.

guterm
01-16-2011, 01:49 PM
mufc, are you trying to cut and paste pieces into old recipe? It will not work.
It feels like you still have this in your recipe:

# This has to be here or the text in the article appears twice.
remove_tags_after = [dict(id='article')]

Give the whole new recipe a try without pasting pieces.

mufc
01-16-2011, 03:02 PM
I was testing that code in another globe recipe and having problems. I tried yours this morning and it works fine. My apologies.
Let me ask you a question.
The code for removing articles: is it carried out prior to downloading? The reason I ask is that my recipe does not include all sections of the G + M. The extra articles I lose could be because they show up first in a section that I do not download. Is that possible?
Could not get the single page layout to work at all.

Maybe I should just modify your recipe to suit my Sony 650.

Your recipe seems to have given me more questions than answers.

How much of this is generic, and could it be adapted to other recipes?
def postprocess_html(self, soup, first_fetch):
    """Keep only the larger of two ``id="article"`` elements.

    Find and preserve the single page article layout, which can be the first
    or the last match. When exactly two elements carry ``id="article"``, the
    one with fewer direct children is extracted (the first one on a tie).
    Any other match count leaves ``soup`` untouched.

    :param soup: parsed article page
    :param first_fetch: unused here
    :return: the same ``soup`` object
    """
    candidates = soup.findAll(True, {'id': 'article'})
    if len(candidates) == 2:
        first, second = candidates
        if len(first.contents) > len(second.contents):
            second.extract()
        else:
            first.extract()
    return soup

def parse_feeds(self, *args, **kwargs):
    """Parse the feeds and drop articles whose URL was already seen.

    The first occurrence of a URL (in feed order) is kept; later occurrences
    in the same or other feeds are discarded.

    :return: the feed list from ``BasicNewsRecipe.parse_feeds`` with each
        feed's ``articles`` replaced by its de-duplicated list
    """
    parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
    seen_urls = set()
    for feed in parsed_feeds:
        unique_articles = []
        # Build a new list rather than removing from feed.articles while
        # iterating: in-place removal makes the iterator skip the article
        # that follows every duplicate, silently losing it.
        for article in feed:
            if article.url not in seen_urls:
                seen_urls.add(article.url)
                unique_articles.append(article)
        feed.articles = unique_articles
    return parsed_feeds

Also this is the first instance I have seen of using the mobile instead of web version. What is your reasoning and could it be adapted for other recipes ?

mufc
01-16-2011, 04:39 PM
I use your recipe now with a couple of tweaks for my Sony 650. Your recipe is quite brilliant actually. I only wish I could learn how to adapt your ideas into a couple of my own recipes.

guterm
01-21-2011, 01:05 PM
Thanks man, I really appreciate the kind words!
Would you mind sharing back your "couple of tweaks" for all us Globe fans?

mufc
01-21-2011, 09:06 PM
This is made for my Sony 650. Gets rid of images and hyperlinks etc.
#!/usr/bin/env python
__license__ = 'GPL v3'

__copyright__ = '2011, Szing, guterm'
__docformat__ = 'restructuredtext en'

'''
globeandmail.com
'''

from calibre.web.feeds.news import BasicNewsRecipe

class TheGlobeAndMailAdvancedRecipe(BasicNewsRecipe):
    """Calibre news recipe for The Globe And Mail, text-only variant.

    Articles come from the RSS feeds in ``feeds``. Images and several
    caption/metadata elements are stripped through ``remove_tags``. Duplicate
    URLs across feeds are dropped in ``parse_feeds`` and article links are
    pointed straight at the mobile host by ``print_version``.
    """

    title = u'The Globe And Mail'
    __license__ = 'GPL v3'
    __author__ = 'Szing, guterm'
    oldest_article = 2
    no_stylesheets = True
    max_articles_per_feed = 100
    encoding = 'utf8'
    publisher = 'Globe & Mail'
    language = 'en_CA'
    # Built from adjacent literals so the resulting CSS string stays exactly
    # what the unindented triple-quoted original produced.
    extra_css = (
        '\n'
        'h1{font-family:Georgia,serif; font-weight:bold;font-size:large;}\n'
        'h2{font-family:Georgia,serif; font-weight:bold;font-size:large;}\n'
        'p{font-family:Georgia,serif;font-size:small;}\n'
        'body{font-family:Georgia,serif;font-size:small;}\n'
    )

    # (section title, RSS URL) pairs.
    feeds = [
        (u'National', u'http://www.theglobeandmail.com/news/national/?service=rss'),
        (u'World', u'http://www.theglobeandmail.com/news/world/?service=rss'),
        (u'Commentary', u'http://www.theglobeandmail.com/news/opinions/?service=rss'),
        (u'Sports', u'http://www.theglobeandmail.com/sports/?service=rss'),
        (u'Technology', u'http://www.theglobeandmail.com/news/technology/?service=rss'),
        (u'Personal Tech', u'http://www.theglobeandmail.com/news/technology/personal-tech/?service=rss'),
        (u'Arts', u'http://www.theglobeandmail.com/news/arts/?service=rss'),
        (u'Life', u'http://www.theglobeandmail.com/life/?service=rss'),
        (u'Opinion', u'http://www.theglobeandmail.com/news/opinions/opinion/?service=rss'),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'id': 'articletitle'}),
        dict(name='p', attrs={'class': ['leadText', 'bodyText']}),
        dict(name='div', attrs={'class': ['news', 'articlecopy', 'columnist', 'blog']}),
        # NOTE(review): this matches a tag *named* 'id' with class 'article';
        # possibly attrs={'id': 'article'} was intended -- confirm before changing.
        dict(name='id', attrs={'class': 'article'}),
        dict(name='table', attrs={'class': 'todays-market'}),
        dict(name='header', attrs={'id': 'leadheader'}),
    ]

    remove_tags = [
        dict(name='ul', attrs={'class': ['pillboxcontainer arttoolsbpbx']}),
        dict(name='div', attrs={'class': ['relcont', 'articleTools', 'ypad fontsmall', 'pagination', 'meta mb10']}),
        dict(name='a', attrs={'href': ['javascript:void(0);', 'http://m.yp.ca?tracking=globeandmail']}),
        dict(name='div', attrs={'id': ['ShareArticles', 'topStories', 'seealsobottom']}),
        dict(name='p', attrs={'class': ['leadCaption fontxsmall', 'lastMod fontxsmall mt10']}),
        dict(name='img'),
    ]

    def postprocess_html(self, soup, first_fetch):
        """Keep only the larger of two ``id="article"`` elements.

        Find and preserve the single page article layout, which can be the
        first or the last match. When exactly two elements carry
        ``id="article"``, the one with fewer direct children is extracted
        (the first one on a tie). Any other match count leaves ``soup``
        untouched.

        :param soup: parsed article page
        :param first_fetch: unused here
        :return: the same ``soup`` object
        """
        candidates = soup.findAll(True, {'id': 'article'})
        if len(candidates) == 2:
            first, second = candidates
            if len(first.contents) > len(second.contents):
                second.extract()
            else:
                first.extract()
        return soup

    def parse_feeds(self, *args, **kwargs):
        """Parse the feeds and drop articles whose URL was already seen.

        The first occurrence of a URL (in feed order) is kept; later
        occurrences in the same or other feeds are discarded.

        :return: the feed list from ``BasicNewsRecipe.parse_feeds`` with each
            feed's ``articles`` replaced by its de-duplicated list
        """
        parsed_feeds = BasicNewsRecipe.parse_feeds(self, *args, **kwargs)
        seen_urls = set()
        for feed in parsed_feeds:
            unique_articles = []
            # Build a new list rather than removing from feed.articles while
            # iterating: in-place removal makes the iterator skip the article
            # that follows every duplicate, silently losing it.
            for article in feed:
                if article.url not in seen_urls:
                    seen_urls.add(article.url)
                    unique_articles.append(article)
            feed.articles = unique_articles
        return parsed_feeds

    # Cover image
    cover_url = 'http://www.freewarepocketpc.net/wp7/img/the-globe-and-mail.png'

    def print_version(self, url):
        """Return the mobile-site URL for an article.

        Replaces ``cmpid=rss1`` with ``service=mobile`` and swaps the
        ``http://www.`` prefix for ``http://m.`` so the mobile version is
        requested directly rather than the web version.
        """
        mobile_query = url.replace('cmpid=rss1', 'service=mobile')
        return mobile_query.replace('http://www.', 'http://m.')