View Single Post
Old 02-04-2010, 04:25 PM   #1361
nickredding
onlinenewsreader.net
nickredding knows the difference between 'who' and 'whom'
 
Posts: 328
Karma: 10143
Join Date: Dec 2009
Location: Phoenix, AZ & Victoria, BC
Device: Kindle 3, Kindle Fire, IPad3, iPhone4, Playbook, HTC Inspire
The Register (biting the hand that feeds IT)

Recipe for The Register -- a UK Information Technology news site.

Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2010, Nick Redding'
'''
www.theregister.co.uk
'''
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import timedelta, datetime, date


class TheRegister(BasicNewsRecipe):
    """Calibre recipe for The Register (www.theregister.co.uk).

    Builds the issue from the front page plus the main section index
    pages, skips stories older than ``oldest_article`` days, and
    downloads the printer-friendly version of each article.
    """
    title = u'The Register'
    language = 'en_GB'
    __author__ = 'Nick Redding'
    oldest_article = 2          # days; older stories are filtered out
    timefmt = ''  # '[%b %d]'
    needs_subscription = False
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    remove_tags = [
        {'id': ['related-stories', 'ad-mpu1-spot']},
        {'class': ['orig-url', 'article-nav', 'wptl btm', 'wptl top']},
    ]

    no_stylesheets = True
    extra_css = '''
                h2 {font-size: x-large; }
                h3 {font-size: large; font-weight: bold; }
                .byline {font-size: x-small; }
                .dateline {font-size: x-small; }
                '''

    # (index path, feed title) for every section that goes into the issue,
    # in the order the feeds should appear.
    INDEX_PAGES = [
        ('', 'Front Page'),
        ('/hardware', 'Hardware'),
        ('/software', 'Software'),
        ('/music_media', 'Music & Media'),
        ('/networks', 'Networks'),
        ('/security', 'Security'),
        ('/public_sector', 'Public Sector'),
        ('/business', 'Business'),
        ('/science', 'Science'),
        ('/odds', 'Odds & Sods'),
    ]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        return br

    def get_masthead_url(self):
        """Return the site logo URL, or None when it cannot be fetched."""
        masthead = 'http://www.theregister.co.uk/Design/graphics/std/logo_414_80.png'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Network/HTTP failure -- fall back to no masthead rather than
            # aborting the whole download. (Was a bare except:, which also
            # swallowed KeyboardInterrupt/SystemExit.)
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def preprocess_html(self, soup):
        # The print pages spell out each link's target in a
        # <span class="URL"> wrapped in " (...)"; remove the span and the
        # surrounding parentheses so links read normally.
        for span_tag in soup.findAll('span', 'URL'):
            span_tag.previous.replaceWith(
                re.sub(r"\ \($", "", self.tag_to_string(span_tag.previous)))
            span_tag.next.next.replaceWith(
                re.sub(r"^\)", "", self.tag_to_string(span_tag.next.next)))
            span_tag.extract()
        return soup

    def parse_index(self):
        """Scrape the section indexes and return [(feed_title, articles)]."""

        def decode_date(datestr):
            # Datelines look like "4 Feb"; no year is given. Assume the
            # most recent occurrence: a parsed date in the future must be
            # from last year (e.g. a "31 Dec" story read on 2 Jan), which
            # the original code mis-dated into the future.
            udate = datestr.strip().lower().split()
            m = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'].index(udate[1]) + 1
            d = int(udate[0])
            today = date.today()
            result = date(today.year, m, d)
            if result > today:
                result = date(today.year - 1, m, d)
            return result

        articles = {}   # feed title -> list of article dicts
        ans = []        # feed titles in presentation order

        def parse_index_page(page_name, page_title):
            """Scrape one section index page into articles[page_title]."""

            def class_text(tag, klass):
                # Direct text of the first descendant with the given class.
                t = tag.find(True, {'class': klass})
                if t:
                    return ''.join(t.findAll(text=True, recursive=False)).strip()
                return ''

            mainurl = 'http://www.theregister.co.uk'
            soup = self.index_to_soup(mainurl + page_name)
            # Hoisted out of the loop: the cutoff is the same for every story.
            earliest_date = date.today() - timedelta(days=self.oldest_article)
            # Each story teaser lives in a <div class="story-ref ...">.
            for div in soup.findAll('div', attrs={'class': re.compile('^story-ref')}):
                # Skip stories older than the configured cutoff.
                datetag = div.find('span', 'date')
                if datetag:
                    dateline_string = self.tag_to_string(datetag, False)
                    if decode_date(dateline_string) < earliest_date:
                        self.log("Skipping article dated %s" % dateline_string)
                        continue

                atag = div.find('a', href=True)
                if atag is None:
                    # Teaser without a link -- nothing to fetch (the
                    # original raised TypeError here).
                    continue
                url = atag['href']
                if 'http' in url:
                    # Absolute URLs point off-site; only relative
                    # site-local paths are wanted.
                    continue
                url = mainurl + url + 'print.html'  # printer-friendly page
                self.log("URL %s" % url)
                title = ''.join(atag.findAll(text=True, recursive=False)).strip()
                self.log("Title %s" % title)
                pubdate = class_text(div, 'date')
                self.log("Date %s" % pubdate)
                description = class_text(div, 'standfirst')
                self.log("Description %s" % description)
                articles.setdefault(page_title, []).append(dict(
                    title=title, url=url, date=pubdate,
                    description=description, author='', content=''))

        # Data-driven replacement for ten copy-pasted call/append pairs.
        for page_name, page_title in self.INDEX_PAGES:
            parse_index_page(page_name, page_title)
            ans.append(page_title)
        # Keep only sections that actually yielded articles.
        return [(key, articles[key]) for key in ans if key in articles]
nickredding is offline