View Single Post
Old 01-20-2010, 08:21 PM   #1204
nickredding
onlinenewsreader.net
nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'
 
Posts: 328
Karma: 10143
Join Date: Dec 2009
Location: Phoenix, AZ & Victoria, BC
Device: Kindle 3, Kindle Fire, IPad3, iPhone4, Playbook, HTC Inspire
Canadian Newspapers--CanWest chain

The Canadian newspapers in the CanWest chain all use the same web format. Here is a recipe that will handle any of them--just un-comment the three lines in the header corresponding to the paper you want.
Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'

'''
www.canada.com
'''
import string, re
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe

import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


class CanWestPaper(BasicNewsRecipe):
    """Calibre recipe for the "today's paper" index of a CanWest newspaper.

    Select the paper by un-commenting exactly one title/url_prefix/description
    triple below. ``parse_index`` reads ``self.url_prefix``, which this class
    defines only through those lines.
    """

    # un-comment the following three lines for the Victoria Times Colonist
    #title = u'Victoria Times Colonist'
    #url_prefix = 'http://www.timescolonist.com'
    #description = u'News from Victoria, BC'

    # un-comment the following three lines for the Vancouver Province
    #title = u'Vancouver Province'
    #url_prefix = 'http://www.theprovince.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Vancouver Sun
    #title = u'Vancouver Sun'
    #url_prefix = 'http://www.vancouversun.com'
    #description = u'News from Vancouver, BC'

    # un-comment the following three lines for the Edmonton Journal
    #title = u'Edmonton Journal'
    #url_prefix = 'http://www.edmontonjournal.com'
    #description = u'News from Edmonton, AB'

    # un-comment the following three lines for the Calgary Herald
    #title = u'Calgary Herald'
    #url_prefix = 'http://www.calgaryherald.com'
    #description = u'News from Calgary, AB'

    # un-comment the following three lines for the Regina Leader-Post
    #title = u'Regina Leader-Post'
    #url_prefix = 'http://www.leaderpost.com'
    #description = u'News from Regina, SK'

    # un-comment the following three lines for the Saskatoon Star-Phoenix
    #title = u'Saskatoon Star-Phoenix'
    #url_prefix = 'http://www.thestarphoenix.com'
    #description = u'News from Saskatoon, SK'

    # un-comment the following three lines for the Windsor Star
    #title = u'Windsor Star'
    #url_prefix = 'http://www.windsorstar.com'
    #description = u'News from Windsor, ON'

    # un-comment the following three lines for the Ottawa Citizen
    #title = u'Ottawa Citizen'
    #url_prefix = 'http://www.ottawacitizen.com'
    #description = u'News from Ottawa, ON'

    # un-comment the following three lines for the Montreal Gazette
    #title = u'Montreal Gazette'
    #url_prefix = 'http://www.montrealgazette.com'
    #description = u'News from Montreal, QC'

    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    # Only the story header and story body are kept from each article page.
    keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
    # Page furniture (comments, navigation, sharing tools, ...) to strip.
    remove_tags = [{'class':'comments'},
                   dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
                   dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
                   dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
                   dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
                   dict(name='div', attrs={'class':'rule_grey_solid'}),
                   dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]

    def preprocess_html(self, soup):
        """Remove empty ``id`` attributes from every ``div`` and return ``soup``.

        Empty ids break the generated table of contents for unknown reasons.
        """
        for div in soup.findAll('div', attrs={'id': ''}):
            del div['id']
        return soup

    def parse_index(self):
        """Build the feed list from the paper's "today's paper" index page.

        Returns a list of ``(section_title, articles)`` tuples, ordered by the
        first appearance of each section on the page. Sections that ended up
        with no articles are omitted. Each article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.
        """
        soup = self.index_to_soup(self.url_prefix + '/news/todays-paper/index.html')

        articles = {}
        # Articles that appear before any section heading are filed under 'News'.
        key = 'News'
        sections = ['News']

        # Walk section-title and article divs in document order; a section
        # title applies to every article div that follows it.
        for divtag in soup.findAll('div', attrs={'class': ["section_title02", "featurecontent"]}):
            css_class = divtag['class']
            if isinstance(css_class, (list, tuple)):
                # Newer BeautifulSoup versions return multi-valued attributes
                # such as class as a list rather than a single string.
                css_class = ' '.join(css_class)

            if css_class.startswith('section_title'):
                # div contains a section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3, False)
                # A heading repeated on the page must not produce the same
                # feed twice in the result.
                if key not in sections:
                    sections.append(key)
                self.log("Section name %s" % key)
                continue

            # div contains article data; skip it unless it has a linked headline
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a', href=True)
            if not atag:
                continue
            url = self.url_prefix + '/news/todays-paper/' + atag['href']
            title = self.tag_to_string(atag, False)

            # Description and author are optional on the index page.
            description = ''
            ptag = divtag.find('p')
            if ptag:
                description = self.tag_to_string(ptag, False)
            author = ''
            autag = divtag.find('h4')
            if autag:
                author = self.tag_to_string(autag, False)

            articles.setdefault(key, []).append(
                dict(title=title, url=url, date='', description=description,
                     author=author, content=''))

        return [(name, articles[name]) for name in sections if name in articles]
nickredding is offline