#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import locale
from pprint import pprint
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup
class TimesColonist(BasicNewsRecipe):
    '''Calibre news recipe for the Victoria Times Colonist (www.timescolonist.com).'''
    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly this recipe will not include images because the
    # resulting file is too large. If you have one of these and want
    # images you can set kindle_omit_images = False
    # and remove sections (typically the e-ink Kindles will
    # work with about a dozen of these, but your mileage may vary).
    kindle_omit_images = True
    # (relative URL under url_prefix, section title) pairs scanned by
    # parse_index; uncomment entries to include those sections.
    section_list = [
        ('','Web Front Page'),
        ('news/','News Headlines'),
        # ('news/b-c/','BC News'),
        # ('news/national/','National News'),
        # ('news/world/','World News'),
        # ('opinion/','Opinion'),
        # ('opinion/letters/','Letters'),
        # ('business/','Business'),
        # ('business/money/','Money'),
        # ('business/technology/','Technology'),
        # ('business/working/','Working'),
        # ('sports/','Sports'),
        # ('sports/hockey/','Hockey'),
        # ('sports/football/','Football'),
        # ('sports/basketball/','Basketball'),
        # ('sports/golf/','Golf'),
        # ('entertainment/','entertainment'),
        # ('entertainment/go/','Go!'),
        # ('entertainment/music/','Music'),
        # ('entertainment/books/','Books'),
        # ('entertainment/Movies/','Movies'),
        # ('entertainment/television/','Television'),
        # ('life/','Life'),
        # ('life/health/','Health'),
        # ('life/travel/','Travel'),
        # ('life/driving/','Driving'),
        # ('life/homes/','Homes'),
        # ('life/food-drink/','Food & Drink')
    ]
    # Page elements that contain the article proper; everything else is dropped.
    keep_only_tags = [dict(name='section',attrs={'class':"story-carousel row-fluid"}),
                      dict(name='div', attrs={'class':"wrapper appendbottom-10"}),
                      dict(name='div', attrs={'class':"story-content row-fluid"}),
                      # ,
                      ]
    # Strip the share/print tool bar from article pages.
    remove_tags = [dict(name='ul', attrs={'class':'tools no-list'})]
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    # Newseum front-page identifier, used by get_cover_url.
    fp_tag = 'CAN_TC'
    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'
    oldest_article = 1.0  # days; articles older than this are skipped
    # URLs already indexed (deduplication in handle_articles).
    # NOTE(review): class-level mutable -- shared across instances; acceptable
    # here only because calibre builds one recipe instance per run -- verify.
    url_list = []
    language = 'en_CA'
    # Side effect at class-definition time: forces English month names so the
    # '%B' strptime pattern in read_article parses the site's timestamps.
    locale.setlocale(locale.LC_ALL, '%s.UTF-8' % "en_US")
    __author__ = 'Nick Redding' + '& Mauropiccolo'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    # Strip HTML comments before parsing.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    encoding = 'utf-8'
    # extra_css = '''
    # .byline { font-size:xx-small; font-weight: bold;}
    # h3 { margin-bottom: 6px; }
    # .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
    # '''
def get_cover_url(self):
from datetime import timedelta, date
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser(self)
daysback=1
try:
br.open(cover)
except:
while daysback<7:
cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
br = BasicNewsRecipe.get_browser(self)
try:
br.open(cover)
except:
daysback = daysback+1
continue
break
if daysback==7:
self.log("\nCover unavailable")
cover = None
return cover
def prepare_masthead_image(self, path_to_image, out_path):
if self.Kindle_Fire:
from calibre.utils.magick import Image, create_canvas
img = Image()
img.open(path_to_image)
width, height = img.size
img2 = create_canvas(width, height)
img2.compose(img)
img2.save(out_path)
else:
BasicNewsRecipe.prepare_masthead_image(path_to_image, out_path)
def read_article(self,url):
author = ''
data = None
content = ''
try:
soup = self.index_to_soup(url)
except:
pass
#content = self.tag_to_string(soup.find("div",attrs={'class':"story-content row-fluid"}))
#print content
info = soup.find("p",attrs={"class":"ancillary light-border-bottom"})
if info:
try:
data = datetime.datetime.strptime(str(info.contents[-1].strip()),"%B %d, %Y %I:%M %p")
except Exception as e:
print e
# now = datetime.datetime.now()
# print now.strftime("%B %d, %Y %I:%M %p")
# data = info.contents[-1].strip()
#print "data = "+data
return (author,data,content)
def handle_articles(self,article,article_list,sectitle):
#TODO: prendere header->hgroup-> h2|3 -> a
#print article
for atag in article.hgroup.findAll("a"):
#print atag
url = atag['href']
url = url.strip()
# print("Checking >>"+url+'<<\n\r')
if url.startswith('/'):
url = self.url_prefix+url
if url in self.url_list:
return
self.url_list.append(url)
title = self.tag_to_string(atag,False)
if 'VIDEO' in title.upper():
return
if 'GALLERY' in title.upper():
return
if 'PHOTOS' in title.upper():
return
if 'RAESIDE' in title.upper():
if self.raeside:
return
self.raeside = True
description=''
data = self.limite
author,data,content = self.read_article(url)
if data and data >= self.limite:
data = data.isoformat()
else:
return
article_list.append(dict(title=title,url=url,date=data,description=description,author=author,content=content))
print(sectitle+title+": description = "+description+" URL="+url+'\n\r')
def add_section_index(self,ans,securl,sectitle):
print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
try:
soup = self.index_to_soup(self.url_prefix+'/'+securl)
except:
return ans
article_list = []
for article in soup.findAll('article'):
self.handle_articles(article,article_list,sectitle)
ans.append((sectitle,article_list))
return ans
def parse_index(self):
ans = []
self.limite = datetime.datetime.now()-datetime.timedelta(self.oldest_article)
for (url,title) in self.section_list:
ans = self.add_section_index(ans,url,title)
ans = self.sort_index(ans)
#pprint( ans)
return ans
def sort_index(self,index):
''' sort by date '''
ans = []
for section,list_ in index:
newlist = sorted( list_, key=lambda k: k['date'])
ans.append((section,newlist))
return ans