Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 09-29-2013, 10:28 AM   #1
kmcgladr
Junior Member
kmcgladr began at the beginning.
 
Posts: 2
Karma: 10
Join Date: Sep 2013
Device: Kindle
Victoria Times Colonist old articles

Hi,

Long time Calibre user here. One of the papers I receive is the Victoria Times Colonist. For some time, I've been receiving really, really old articles in this publication.

I no longer want to receive these out of date articles. I only want articles up to 3 days old.

For example, today is September 29th.

My copy of the Victoria Times Colonist has an article from May 15th in it. That's not the oldest, though.

And December 8, 2012. "Victoria's unemployment rate improves", from http://www.timescolonist.com/busines...proves-1.23175


I've tried hacking the recipe, which follows here, but that does not work. You'll notice I've pulled out a lot of sections that I wasn't interested in. If there's an easy way to set a time limit, I have not found it. This only affects this publication; other newspapers are working correctly and normally and only sending the day's news.




Code:
#!/usr/bin/env  python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'

'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe

from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


class TimesColonist(BasicNewsRecipe):
    """Calibre recipe for the Victoria Times Colonist (www.timescolonist.com).

    Builds the issue by scraping the section index pages in ``section_list``
    rather than RSS feeds. Because the ``date`` field of each article dict is
    left empty in handle_articles(), calibre's built-in ``oldest_article``
    filtering has no effect with this recipe.
    """

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly this recipe will not include images because the
    # resulting file is too large. If you have one of these and want
    # images you can set kindle_omit_images = False
    # and remove sections (typically the e-ink Kindles will
    # work with about a dozen of these, but your mileage may vary).

    kindle_omit_images = True

    # (URL path relative to url_prefix, section title) pairs; the
    # commented-out entries are sections deliberately excluded.
    section_list = [
        ('','Web Front Page'),
        ('news/','News Headlines'),
        ('news/b-c/','BC News'),
#        ('news/national/','National News'),
#        ('news/world/','World News'),
#        ('opinion/','Opinion'),
#        ('opinion/letters/','Letters'),
        ('business/','Business'),
        ('business/money/','Money'),
        ('business/technology/','Technology'),
        ('business/working/','Working'),
#        ('sports/','Sports'),
#        ('sports/hockey/','Hockey'),
#        ('sports/football/','Football'),
#        ('sports/basketball/','Basketball'),
#        ('sports/golf/','Golf'),
#        ('entertainment/','entertainment'),
#        ('entertainment/go/','Go!'),
#        ('entertainment/music/','Music'),
#        ('entertainment/books/','Books'),
#        ('entertainment/Movies/','Movies'),
#        ('entertainment/television/','Television'),
#        ('life/','Life'),
#        ('life/health/','Health'),
#        ('life/travel/','Travel'),
#        ('life/driving/','Driving'),
#        ('life/homes/','Homes'),
#        ('life/food-drink/','Food & Drink')
    ]

    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'  # Newseum front-page tag used by get_cover_url()

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    url_list = []  # URLs already queued this run, to suppress duplicates
    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt =  ' [%b %d]'
    encoding = 'utf-8'
    extra_css = '''
                .byline { font-size:xx-small; font-weight: bold;}
                h3 { margin-bottom: 6px; }
                .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
                '''
    keep_only_tags = [dict(name='div', attrs={'class':re.compile('main.content')})]

    def __init__(self, options, log, progress_reporter):
        # remove_tags is built at run time so that image stripping can
        # depend on the kindle_omit_images setting (and, optionally, on
        # the selected output profile -- see the commented test below).
        self.remove_tags = [{'class':'comments'},
                       {'id':'photocredit'},
                       dict(name='div', attrs={'class':re.compile('top.controls')}),
                       dict(name='div', attrs={'class':re.compile('^comments')}),
                       dict(name='div', attrs={'class':re.compile('social')}),
                       dict(name='div', attrs={'class':re.compile('tools')}),
                       dict(name='div', attrs={'class':re.compile('bottom.tools')}),
                       dict(name='div', attrs={'class':re.compile('window')}),
                       dict(name='div', attrs={'class':re.compile('related.news.element')})]
        print("PROFILE NAME = "+options.output_profile.short_name)
#        if self.kindle_omit_images and options.output_profile.short_name in ['kindle', 'kindle_dx', 'kindle_pw']:
        if self.kindle_omit_images:
            self.remove_tags.append(dict(name='div', attrs={'class':re.compile('image-container')}))
        BasicNewsRecipe.__init__(self, options, log, progress_reporter)

    def get_cover_url(self):
        """Return the Newseum front-page image URL for today, falling back
        day by day up to a week; return None when no cover is reachable."""
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
        try:
            br.open(cover)
        except:
            # BUG FIX: this loop previously stopped at daysback<3, so the
            # daysback==7 "unavailable" test below could never fire and a
            # dead URL was returned on total failure. Scan the full week.
            while daysback<7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback+1
                    continue
                break
        if daysback==7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def prepare_masthead_image(self, path_to_image, out_path):
        """Flatten the masthead image onto an opaque canvas for the Kindle
        Fire; otherwise defer to the base implementation."""
        # NOTE(review): self.Kindle_Fire is never set in this recipe --
        # presumably assigned externally; confirm before relying on it.
        if self.Kindle_Fire:
            from calibre.utils.magick import Image, create_canvas
            img = Image()
            img.open(path_to_image)
            width, height = img.size
            img2 = create_canvas(width, height)
            img2.compose(img)
            img2.save(out_path)
        else:
            # BUG FIX: the unbound base-class call was missing ``self``.
            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)

    def fixChars(self,string):
        """Map stray cp1252 punctuation bytes to proper Unicode characters."""
        # Replace lsquo (\x91)
        fixed = re.sub("\x91","‘",string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92","’",fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93","“",fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94","”",fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96","–",fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97","—",fixed)
        fixed = re.sub("&#x2019;","’",fixed)
        return fixed

    def massageNCXText(self, description):
        """Clean up TOC descriptions -- Kindle NCX won't render certain
        characters, and bare ampersands break the XML."""
        if description:
            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # BUG FIX: re-escape '&' as '&#38;'. The posted code had the
            # no-op re.sub("&","&") -- the replacement entity had been
            # stripped when the recipe was pasted into HTML.
            massaged = re.sub("&","&#38;", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def populate_article_metadata(self, article, soup, first):
        """Use the first <img> of the first page as the TOC thumbnail and
        fall back to the og:description meta tag for empty summaries."""
        if first:
            picdiv = soup.find('body').find('img')
            if picdiv is not None:
                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
        xtitle = article.text_summary.strip()
        if len(xtitle) == 0:
            desc = soup.find('meta',attrs={'property':'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self,soup):
        """Replace text-only <a> tags with their contents (image links kept)."""
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self,soup):
        """Normalize bylines and captions into styled <div>s and drop
        empty paragraphs before rendering."""
        byline = soup.find('p',attrs={'class':re.compile('ancillary')})
        if byline is not None:
            authstr = self.tag_to_string(byline,False)
            authstr = re.sub('/ *Times Colonist','/',authstr, flags=re.IGNORECASE)
            authstr = re.sub('BY */','',authstr, flags=re.IGNORECASE)
            newdiv = Tag(soup,'div')
            newdiv.insert(0,authstr)
            newdiv['class']='byline'
            byline.replaceWith(newdiv)
        for caption in soup.findAll('p',attrs={'class':re.compile('caption')}):
            capstr = self.tag_to_string(caption,False)
            capstr = re.sub('Photograph by.*$','',capstr, flags=re.IGNORECASE)
            newdiv = Tag(soup,'div')
            newdiv.insert(0,capstr)
            newdiv['class']='caption'
            caption.replaceWith(newdiv)
        for ptag in soup.findAll('p'):
            ptext = self.tag_to_string(ptag,use_alt=False, normalize_whitespace=True)
            ptext = re.sub(r'\s+','', ptext)
            if (ptext=='') or (ptext=='&nbsp;'):
                ptag.extract()
        return self.strip_anchors(soup)

    # Only the first Raeside (cartoon) link per run is kept.
    raeside = False

    def handle_articles(self,htag,article_list,sectitle):
        """Extract one article link from a section-page heading tag and
        append it to article_list, skipping videos, galleries, photo
        features, duplicate URLs and repeat Raeside links.

        NOTE: ``date`` is left empty here, which is why oldest_article
        cannot filter stale items with this recipe.
        """
        atag = htag.a
        if atag is not None:
            url = atag['href']
            url = url.strip()
            # print("Checking >>"+url+'<<\n\r')
            if url.startswith('/'):
                url = self.url_prefix+url
            if url in self.url_list:
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag,False)
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            if 'RAESIDE' in title.upper():
                if self.raeside:
                    return
                self.raeside = True
            dtag = htag.findNext('p')
            description=''
            if dtag is not None:
                description = self.tag_to_string(dtag,False)
            article_list.append(dict(title=title,url=url,date='',description=description,author='',content=''))
            print(sectitle+title+": description = "+description+" URL="+url+'\n\r')

    def add_section_index(self,ans,securl,sectitle):
        """Scrape one section index page (featured stories, then leading
        articles) and append (sectitle, articles) to ``ans``."""
        print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix+'/'+securl)
        except:
            # Section page unreachable: skip it rather than abort the run.
            return ans
        mainsoup = soup.find('div',attrs={'class':re.compile('main.content')})
        article_list = []
        for wdiv in mainsoup.findAll('div',attrs={'id':re.compile('featured.story')}):
            for htag in wdiv.findAll('h3'):
                self.handle_articles(htag,article_list,sectitle)
        for ladiv in mainsoup.findAll(attrs={'class':re.compile('leading.articles')}):
            for wdiv in mainsoup.findAll('div',attrs={'class':re.compile('article.row')}):
                for htag in wdiv.findAll('h2'):
                    self.handle_articles(htag,article_list,sectitle)
        ans.append((sectitle,article_list))
        return ans

    def parse_index(self):
        """Build the feed structure: one (title, article list) per section."""
        ans = []
        for (url,title) in self.section_list:
            ans = self.add_section_index(ans,url,title)
        return ans
kmcgladr is offline   Reply With Quote
Old 09-29-2013, 12:49 PM   #2
mauropiccolo
Member
mauropiccolo began at the beginning.
 
Posts: 12
Karma: 10
Join Date: Sep 2013
Device: kindle
try

http://manual.calibre-ebook.com/news...oldest_article
mauropiccolo is offline   Reply With Quote
Advert
Old 09-29-2013, 03:45 PM   #3
kmcgladr
Junior Member
kmcgladr began at the beginning.
 
Posts: 2
Karma: 10
Join Date: Sep 2013
Device: Kindle
Nope

Hi,

Thanks for the prompt reply; however, that didn't solve it. I'm hoping this is an easy fix, though, as I'm not a Python coder, so I may have made a mistake.

I've added the following:

Code:
    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    oldest_article = 3.0

    url_list = []
The masthead_url and url_list were there before, so I just put in the oldest_article in between.

If that's where I was to have put it, it did not solve the problem. There are still articles from December 2012 coming into the news feed.
kmcgladr is offline   Reply With Quote
Old 09-30-2013, 10:22 AM   #4
mauropiccolo
Member
mauropiccolo began at the beginning.
 
Posts: 12
Karma: 10
Join Date: Sep 2013
Device: kindle
seems correct to me,
but I too have problems :-(

If you had to rewrite populate_article_metadata,
it may be that your news items do not have dates,
or your recipe is too customized.
mauropiccolo is offline   Reply With Quote
Old 10-02-2013, 11:31 PM   #5
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 43,869
Karma: 22666666
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
oldest_article will not work unless your articles have correct dates. You must somehow get the correct article date in parse_index()
kovidgoyal is online now   Reply With Quote
Advert
Old 10-19-2013, 03:37 PM   #6
mauropiccolo
Member
mauropiccolo began at the beginning.
 
Posts: 12
Karma: 10
Join Date: Sep 2013
Device: kindle
Hi,
Can you try this recipe?
Awaiting feedback

Code:
#!/usr/bin/env  python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'

'''
www.canada.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
import datetime
import locale
from pprint import pprint
from calibre.ebooks.BeautifulSoup import Tag, BeautifulStoneSoup


class TimesColonist(BasicNewsRecipe):
    """Victoria Times Colonist recipe that fetches every candidate article
    page to read its publication timestamp, so that items older than
    ``oldest_article`` days can be dropped in parse_index() -- unlike the
    earlier revision of this recipe, where dates were left empty.
    """

    # Customization -- remove sections you don't want.
    # If your e-reader is an e-ink Kindle and your output profile is
    # set properly this recipe will not include images because the
    # resulting file is too large. If you have one of these and want
    # images you can set kindle_omit_images = False
    # and remove sections (typically the e-ink Kindles will
    # work with about a dozen of these, but your mileage may vary).

    kindle_omit_images = True

    # (URL path relative to url_prefix, section title) pairs; the
    # commented-out entries are sections deliberately excluded.
    section_list = [
        ('','Web Front Page'),
        ('news/','News Headlines'),
#        ('news/b-c/','BC News'),
#         ('news/national/','National News'),
#         ('news/world/','World News'),
#         ('opinion/','Opinion'),
#         ('opinion/letters/','Letters'),
#         ('business/','Business'),
#         ('business/money/','Money'),
#         ('business/technology/','Technology'),
#         ('business/working/','Working'),
#         ('sports/','Sports'),
#         ('sports/hockey/','Hockey'),
#         ('sports/football/','Football'),
#         ('sports/basketball/','Basketball'),
#         ('sports/golf/','Golf'),
#         ('entertainment/','entertainment'),
#         ('entertainment/go/','Go!'),
#         ('entertainment/music/','Music'),
#         ('entertainment/books/','Books'),
#         ('entertainment/Movies/','Movies'),
#         ('entertainment/television/','Television'),
#         ('life/','Life'),
#         ('life/health/','Health'),
#         ('life/travel/','Travel'),
#         ('life/driving/','Driving'),
#         ('life/homes/','Homes'),
#         ('life/food-drink/','Food & Drink')
    ]
    keep_only_tags = [dict(name='section',attrs={'class':"story-carousel row-fluid"}),
                      dict(name='div', attrs={'class':"wrapper appendbottom-10"}),
                      dict(name='div', attrs={'class':"story-content row-fluid"}),
                                             # ,
                                             ]

    remove_tags = [dict(name='ul', attrs={'class':'tools no-list'})]
    title = u'Victoria Times Colonist'
    url_prefix = 'http://www.timescolonist.com'
    description = u'News from Victoria, BC'
    fp_tag = 'CAN_TC'  # Newseum front-page tag used by get_cover_url()

    masthead_url = 'http://www.timescolonist.com/gmg/img/global/logoTimesColonist.png'

    oldest_article = 1.0  # days; older articles are skipped in handle_articles()
    url_list = []  # URLs already queued this run, to suppress duplicates
    language = 'en_CA'
    # The article timestamps use English month names; force an English
    # locale so strptime's %B parses them regardless of system locale.
    locale.setlocale(locale.LC_ALL, '%s.UTF-8' % "en_US")
    __author__ = 'Nick Redding' + '& Mauropiccolo'
    no_stylesheets = True
    timefmt =  ' [%b %d]'
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
    encoding = 'utf-8'
#     extra_css = '''
#                 .byline { font-size:xx-small; font-weight: bold;}
#                 h3 { margin-bottom: 6px; }
#                 .caption { font-size: xx-small; font-style: italic; font-weight: normal; }
#                 '''

    # BUG FIX: this flag was never initialized in this revision, so
    # handle_articles() raised AttributeError on the first 'RAESIDE' title.
    # Only the first Raeside (cartoon) link per run is kept.
    raeside = False

    def get_cover_url(self):
        """Return the Newseum front-page image URL for today, falling back
        day by day up to a week; return None when no cover is reachable."""
        from datetime import timedelta, date
        cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
        br = BasicNewsRecipe.get_browser(self)
        daysback=1
        try:
            br.open(cover)
        except:
            while daysback<7:
                cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
                br = BasicNewsRecipe.get_browser(self)
                try:
                    br.open(cover)
                except:
                    daysback = daysback+1
                    continue
                break
        if daysback==7:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def prepare_masthead_image(self, path_to_image, out_path):
        """Flatten the masthead image onto an opaque canvas for the Kindle
        Fire; otherwise defer to the base implementation."""
        # NOTE(review): self.Kindle_Fire is never set in this recipe --
        # presumably assigned externally; confirm before relying on it.
        if self.Kindle_Fire:
            from calibre.utils.magick import Image, create_canvas
            img = Image()
            img.open(path_to_image)
            width, height = img.size
            img2 = create_canvas(width, height)
            img2.compose(img)
            img2.save(out_path)
        else:
            # BUG FIX: the unbound base-class call was missing ``self``.
            BasicNewsRecipe.prepare_masthead_image(self, path_to_image, out_path)

    def read_article(self,url):
        """Fetch an article page and return (author, date, content).

        author and content are currently always empty strings; date is a
        ``datetime`` parsed from the 'ancillary light-border-bottom'
        paragraph, or None when the page or timestamp cannot be read.
        """
        author = ''
        data = None
        content = ''
        try:
            soup = self.index_to_soup(url)
        except:
            # BUG FIX: the original fell through with ``soup`` unbound and
            # crashed on the next line; treat an unreachable page as dateless.
            return (author,data,content)
        #content = self.tag_to_string(soup.find("div",attrs={'class':"story-content row-fluid"}))
        #print content
        info = soup.find("p",attrs={"class":"ancillary light-border-bottom"})
        if info:
            try:
                data = datetime.datetime.strptime(str(info.contents[-1].strip()),"%B %d, %Y %I:%M %p")
            except Exception as e:
                # BUG FIX: was Python-2-only ``print e``; the rest of the
                # file uses the function form.
                print(e)
        return (author,data,content)

    def handle_articles(self,article,article_list,sectitle):
        """Collect the links inside an <article>'s hgroup, date each one via
        read_article(), and keep only items newer than self.limite (the
        cutoff computed in parse_index)."""
        for atag in article.hgroup.findAll("a"):
            url = atag['href']
            url = url.strip()
            if url.startswith('/'):
                url = self.url_prefix+url
            if url in self.url_list:
                return
            self.url_list.append(url)
            title = self.tag_to_string(atag,False)
            if 'VIDEO' in title.upper():
                return
            if 'GALLERY' in title.upper():
                return
            if 'PHOTOS' in title.upper():
                return
            if 'RAESIDE' in title.upper():
                if self.raeside:
                    return
                self.raeside = True
            description=''
            author,data,content = self.read_article(url)
            # Drop undated articles and anything older than the cutoff.
            if data and data >= self.limite:
                data = data.isoformat()
            else:
                return
            article_list.append(dict(title=title,url=url,date=data,description=description,author=author,content=content))
            print(sectitle+title+": description = "+description+" URL="+url+'\n\r')

    def add_section_index(self,ans,securl,sectitle):
        """Scrape one section index page and append (sectitle, articles)
        to ``ans``; an unreachable page is skipped, not fatal."""
        print("Add section url="+self.url_prefix+'/'+securl+'\n\r')
        try:
            soup = self.index_to_soup(self.url_prefix+'/'+securl)
        except:
            return ans
        article_list = []
        for article in soup.findAll('article'):
            self.handle_articles(article,article_list,sectitle)
        ans.append((sectitle,article_list))
        return ans

    def parse_index(self):
        """Build the feed structure, computing the date cutoff first so
        handle_articles() can filter stale items, then sorting by date."""
        ans = []
        # Cutoff: articles dated before now - oldest_article days are dropped.
        self.limite = datetime.datetime.now()-datetime.timedelta(self.oldest_article)
        for (url,title) in self.section_list:
            ans = self.add_section_index(ans,url,title)
        ans = self.sort_index(ans)
        return ans

    def sort_index(self,index):
        """Return the index with each section's articles sorted by date
        (ISO-8601 strings sort chronologically)."""
        ans = []
        for section,list_ in index:
            newlist = sorted(list_, key=lambda k: k['date'])
            ans.append((section,newlist))
        return ans
mauropiccolo is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Updated Victoria Times Colonist nickredding Recipes 1 04-11-2013 01:00 AM
NY Times - Multiple articles problem Waldo3 Recipes 13 03-29-2013 01:38 PM
Victoria Times Colonist Recipe Error chrystyna Recipes 10 02-05-2013 01:34 PM
New York Times recipe skipping some articles? gianfri Recipes 20 02-18-2012 03:29 AM
(another) FIX: New York Times Missing Articles bcollier Recipes 11 02-11-2011 03:16 PM


All times are GMT -4. The time now is 10:53 PM.


MobileRead.com is a privately owned, operated and funded community.