View Single Post
Old 10-19-2013, 03:37 PM   #6
mauropiccolo
Member
mauropiccolo began at the beginning.
 
Posts: 12
Karma: 10
Join Date: Sep 2013
Device: kindle
Hi,
Can you try this recipe?
Awaiting feedback

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'

'''
www.timescolonist.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import locale
from pprint import pprint
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


class TimesColonist(BasicNewsRecipe):
    """Calibre news recipe for the Victoria Times Colonist web site.

    For every entry in ``section_list`` the section index page is fetched,
    the headline links inside each ``<article>`` element are collected, and
    each linked story is opened to read its publication timestamp. Stories
    whose timestamp is missing, unparseable, or older than ``oldest_article``
    days are dropped; the remaining ones are sorted by date per section.
    """

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly this recipe will not include images because the
    # resulting file is too large. If you have one of these and want
    # images you can set kindle_omit_images = False
    # and remove sections (typically the e-ink Kindles will
    # work with about a dozen of these, but your mileage may vary).

    kindle_omit_images = True

    # Read by prepare_masthead_image(): when True the masthead is composed
    # onto a canvas through calibre.utils.magick, otherwise the default
    # BasicNewsRecipe masthead handling is used.
    Kindle_Fire = False

    # (path relative to url_prefix, section title shown in the e-book).
    # Un-comment an entry to include that section.
    section_list = [
        ('', 'Web Front Page'),
        ('news/', 'News Headlines'),
#        ('news/b-c/','BC News'),
#         ('news/national/','National News'),
#         ('news/world/','World News'),
#         ('opinion/','Opinion'),
#         ('opinion/letters/','Letters'),
#         ('business/','Business'),
#         ('business/money/','Money'),
#         ('business/technology/','Technology'),
#         ('business/working/','Working'),
#         ('sports/','Sports'),
#         ('sports/hockey/','Hockey'),
#         ('sports/football/','Football'),
#         ('sports/basketball/','Basketball'),
#         ('sports/golf/','Golf'),
#         ('entertainment/','entertainment'),
#         ('entertainment/go/','Go!'),
#         ('entertainment/music/','Music'),
#         ('entertainment/books/','Books'),
#         ('entertainment/Movies/','Movies'),
#         ('entertainment/television/','Television'),
#         ('life/','Life'),
#         ('life/health/','Health'),
#         ('life/travel/','Travel'),
#         ('life/driving/','Driving'),
#         ('life/homes/','Homes'),
#         ('life/food-drink/','Food & Drink')
    ]

    # Only these containers of a downloaded article page are kept.
    keep_only_tags = [
        dict(name='section', attrs={'class': "story-carousel row-fluid"}),
        dict(name='div', attrs={'class': "wrapper appendbottom-10"}),
        dict(name='div', attrs={'class': "story-content row-fluid"}),
    ]

    remove_tags = [dict(name='ul', attrs={'class': 'tools no-list'})]
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    # Newspaper code used to build the front-page cover URL in get_cover_url().
    fp_tag = 'CAN_TC'

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    # Maximum article age in days; parse_index() turns it into a cut-off time.
    oldest_article = 1.0
    # URLs already seen during this run; parse_index() replaces it with a
    # fresh per-instance list so state is not shared through the class.
    url_list = []
    language = 'en_CA'
    # read_article() parses timestamps with "%B" (full month name), which is
    # locale dependent -- presumably the reason an English locale is forced
    # here. Not every platform provides en_US.UTF-8, so a missing locale must
    # not prevent the recipe from loading; the current locale is kept instead.
    try:
        locale.setlocale(locale.LC_ALL, '%s.UTF-8' % "en_US")
    except locale.Error:
        pass
    __author__ = 'Nick Redding' + '& Mauropiccolo'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    # Strip HTML comments from every page before it is parsed.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    encoding = 'utf-8'
#     extra_css = '''
#                 .byline { font-size:xx-small; font-weight: bold;}
#                 h3 { margin-bottom: 6px; }
#                 .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
#                 '''

    def get_cover_url(self):
        """Return the URL of the newest reachable front-page image, or None.

        The cover URL is built from a day-of-month and ``fp_tag``. Today's
        image is tried first, then each of the previous six days; the first
        URL that can be opened is returned. If all seven attempts fail,
        "Cover unavailable" is logged and None is returned.
        """
        from datetime import timedelta, date
        today = date.today()
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
            br = BasicNewsRecipe.get_browser(self)
            try:
                br.open(cover)
            except Exception:
                # Image for that day could not be opened; try an older one.
                continue
            return cover
        self.log("\nCover unavailable")
        return None

    def prepare_masthead_image(self, path_to_image, out_path):
        """Write the masthead image found at path_to_image to out_path.

        With ``Kindle_Fire`` set, the image is composed onto a canvas of the
        same size and saved as is; otherwise the work is delegated to
        BasicNewsRecipe.prepare_masthead_image.
        """
        if self.Kindle_Fire:
            from calibre.utils.magick import Image, create_canvas
            img = Image()
            img.open(path_to_image)
            width, height = img.size
            img2 = create_canvas(width, height)
            img2.compose(img)
            img2.save(out_path)
        else:
            # Called through the class, so the instance must be passed
            # explicitly as the first argument.
            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)

    def read_article(self, url):
        """Open an article page and extract its publication time.

        :param url: absolute URL of the article.
        :return: tuple ``(author, data, content)``. ``author`` and
            ``content`` are always empty strings; ``data`` is a
            ``datetime.datetime`` parsed from the last child of the
            ``<p class="ancillary light-border-bottom">`` element, or None
            when the page cannot be fetched, the element is missing, or the
            text does not match ``"%B %d, %Y %I:%M %p"``.
        """
        author = ''
        data = None
        content = ''
        try:
            soup = self.index_to_soup(url)
        except Exception as e:
            # Without a parsed page there is nothing to inspect; report the
            # article as undated so the caller skips it.
            self.log('Unable to fetch article %s: %s' % (url, e))
            return (author, data, content)
        info = soup.find("p", attrs={"class": "ancillary light-border-bottom"})
        if info:
            try:
                data = datetime.datetime.strptime(
                    str(info.contents[-1].strip()), "%B %d, %Y %I:%M %p")
            except Exception as e:
                # Best effort: an unparseable timestamp leaves data as None.
                self.log('Unable to parse date of article %s: %s' % (url, e))
        return (author, data, content)

    def handle_articles(self, article, article_list, sectitle):
        """Append the stories linked from one ``<article>`` element.

        :param article: parsed ``<article>`` tag from a section index page.
        :param article_list: list that receives one dict per accepted story
            (keys: title, url, date, description, author, content).
        :param sectitle: section title, used only in the progress output.

        Processing of the element stops at the first link that was already
        seen, whose title contains VIDEO, GALLERY or PHOTOS, that is a
        second RAESIDE entry, or whose date is missing or older than
        ``self.limite``.
        """
        # TODO: take header->hgroup-> h2|3 -> a
        hgroup = article.hgroup
        if hgroup is None:
            # No headline group, hence no headline links to follow.
            return
        for atag in hgroup.findAll("a"):
            url = atag.get('href')
            if not url:
                continue
            url = url.strip()
            if url.startswith('/'):
                url = self.url_prefix + url
            if url in self.url_list:
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag, False)
            upper_title = title.upper()
            if any(word in upper_title for word in ('VIDEO', 'GALLERY', 'PHOTOS')):
                return
            if 'RAESIDE' in upper_title:
                # Only the first RAESIDE entry of a run is kept. The flag may
                # not exist yet if parse_index() has not initialised it.
                if getattr(self, 'raeside', False):
                    return
                self.raeside = True
            description = ''
            author, data, content = self.read_article(url)
            if not (data and data >= self.limite):
                return
            data = data.isoformat()
            article_list.append(dict(title=title, url=url, date=data,
                                     description=description, author=author,
                                     content=content))
            print(sectitle+title+": description = "+description+" URL="+url+'\n\r')

    def add_section_index(self, ans, securl, sectitle):
        """Fetch one section index and append ``(sectitle, articles)`` to ans.

        :param ans: list of ``(section title, article list)`` tuples built so far.
        :param securl: section path relative to ``url_prefix``.
        :param sectitle: section title.
        :return: ``ans``; it is returned unchanged when the index page cannot
            be fetched.
        """
        print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix + '/' + securl)
        except Exception as e:
            self.log('Unable to fetch section %s: %s' % (securl, e))
            return ans
        article_list = []
        for article in soup.findAll('article'):
            self.handle_articles(article, article_list, sectitle)
        ans.append((sectitle, article_list))
        return ans

    def parse_index(self):
        """Build the recipe index.

        :return: list of ``(section title, article list)`` tuples, one per
            fetched entry of ``section_list``, each article list sorted by date.
        """
        ans = []
        # Per-run state: start with a clean set of seen URLs and no RAESIDE
        # entry accepted yet.
        self.url_list = []
        self.raeside = False
        # Articles dated before this moment are rejected by handle_articles().
        self.limite = datetime.datetime.now() - datetime.timedelta(days=self.oldest_article)
        for (url, title) in self.section_list:
            ans = self.add_section_index(ans, url, title)
        ans = self.sort_index(ans)
        return ans

    def sort_index(self, index):
        """Return a copy of index with each section's articles sorted by date.

        The ``date`` values are the ISO-format strings produced in
        handle_articles(), so they are compared as strings.
        """
        return [(section, sorted(list_, key=lambda k: k['date']))
                for section, list_ in index]
mauropiccolo is offline   Reply With Quote