Recipe for The Register -- a UK information technology news site. Instead of pulling RSS feeds, it overrides parse_index to scrape the section index pages directly, and it fetches the print version of each article.
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Nick Redding'
'''
www.theregister.co.uk
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from datetime import timedelta, date

class TheRegister(BasicNewsRecipe):
    title = u'The Register'
    language = 'en_GB'
    __author__ = 'Nick Redding'
    oldest_article = 2
    timefmt = ''  # '[%b %d]'
    needs_subscription = False
    no_stylesheets = True

    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    #remove_tags_before = []
    remove_tags = [
        {'id': ['related-stories', 'ad-mpu1-spot']},
        {'class': ['orig-url', 'article-nav', 'wptl btm', 'wptl top']},
    ]
    #remove_tags_after = []

    extra_css = '''
        h2 {font-size: x-large; }
        h3 {font-size: large; font-weight: bold; }
        .byline {font-size: x-small; }
        .dateline {font-size: x-small; }
    '''

    def get_browser(self):
        # No login is needed; this pass-through is kept as a hook in case
        # cookies or authentication are ever required.
        br = BasicNewsRecipe.get_browser(self)
        return br

    def get_masthead_url(self):
        masthead = 'http://www.theregister.co.uk/Design/graphics/std/logo_414_80.png'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(masthead)
        except Exception:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def preprocess_html(self, soup):
        # The print pages spell out each link's target in a <span class="URL">
        # wrapped in parentheses; drop the span and the surrounding " (" and ")".
        for span_tag in soup.findAll('span', 'URL'):
            span_tag.previous.replaceWith(re.sub(r" \($", "", self.tag_to_string(span_tag.previous)))
            span_tag.next.next.replaceWith(re.sub(r"^\)", "", self.tag_to_string(span_tag.next.next)))
            span_tag.extract()
        return soup

    def parse_index(self):

        def decode_date(datestr):
            # Datelines give day and month only, e.g. "14 Jan"; no year.
            udate = datestr.strip().lower().split()
            m = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'].index(udate[1][:3]) + 1
            d = int(udate[0])
            parsed = date(date.today().year, m, d)
            # A future date must belong to last year (a December article
            # read in January, for instance).
            if parsed > date.today():
                parsed = parsed.replace(year=parsed.year - 1)
            return parsed

        articles = {}
        ans = []

        def parse_index_page(page_name, page_title):

            def article_title(tag):
                atag = tag.find('a', href=True)
                return ''.join(atag.findAll(text=True, recursive=False)).strip()

            def article_date(tag):
                t = tag.find(True, {'class': 'date'})
                if t:
                    return ''.join(t.findAll(text=True, recursive=False)).strip()
                return ''

            def article_summary(tag):
                t = tag.find(True, {'class': 'standfirst'})
                if t:
                    return ''.join(t.findAll(text=True, recursive=False)).strip()
                return ''

            def article_url(tag):
                atag = tag.find('a', href=True)
                return atag['href']

            mainurl = 'http://www.theregister.co.uk'
            soup = self.index_to_soup(mainurl + page_name)
            # Each div whose class begins with "story-ref" holds one article's
            # link, dateline and standfirst.
            for div in soup.findAll('div', attrs={'class': re.compile('^story-ref')}):
                # Skip anything older than oldest_article days
                datetag = div.find('span', 'date')
                if datetag:
                    dateline_string = self.tag_to_string(datetag, False)
                    a_date = decode_date(dateline_string)
                    earliest_date = date.today() - timedelta(days=self.oldest_article)
                    if a_date < earliest_date:
                        self.log("Skipping article dated %s" % dateline_string)
                        continue
                url = article_url(div)
                if 'http' in url:
                    # Absolute URL: an off-site link, not a Register article
                    continue
                url = mainurl + url + 'print.html'
                self.log("URL %s" % url)
                title = article_title(div)
                self.log("Title %s" % title)
                pubdate = article_date(div)
                self.log("Date %s" % pubdate)
                description = article_summary(div)
                self.log("Description %s" % description)
                if page_title not in articles:
                    articles[page_title] = []
                articles[page_title].append(dict(title=title, url=url, date=pubdate,
                                                 description=description, author='', content=''))

        # Scrape each section front in turn; the list fixes the section order.
        for page_name, page_title in [
            ('', 'Front Page'),
            ('/hardware', 'Hardware'),
            ('/software', 'Software'),
            ('/music_media', 'Music & Media'),
            ('/networks', 'Networks'),
            ('/security', 'Security'),
            ('/public_sector', 'Public Sector'),
            ('/business', 'Business'),
            ('/science', 'Science'),
            ('/odds', 'Odds & Sods'),
        ]:
            parse_index_page(page_name, page_title)
            ans.append(page_title)
        return [(key, articles[key]) for key in ans if key in articles]
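
To try the recipe, save the code above to a file -- say the_register.recipe, the name is arbitrary -- and hand it to calibre's ebook-convert, which runs recipe files directly. The --test switch downloads only a couple of articles per section, which makes a quick sanity check cheap:

ebook-convert the_register.recipe the_register.epub --test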