Recipe for The Register -- a UK information technology news site. Instead of pulling RSS feeds, it overrides parse_index to scrape the section index pages directly, and it fetches the print version of each article.
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Nick Redding'
'''
www.theregister.co.uk
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from datetime import timedelta, date

class TheRegister(BasicNewsRecipe):
    title = u'The Register'
    language = 'en_GB'
    __author__ = 'Nick Redding'
    oldest_article = 2
    timefmt = ''  # '[%b %d]'
    needs_subscription = False
    no_stylesheets = True

    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]
    #remove_tags_before = []
    remove_tags = [
        {'id': ['related-stories', 'ad-mpu1-spot']},
        {'class': ['orig-url', 'article-nav', 'wptl btm', 'wptl top']},
    ]
    #remove_tags_after = []

    extra_css = '''
        h2 {font-size: x-large; }
        h3 {font-size: large; font-weight: bold; }
        .byline {font-size: x-small; }
        .dateline {font-size: x-small; }
    '''

    def get_browser(self):
        # No login is needed; this pass-through is kept as a hook in case
        # cookies or authentication are ever required.
        br = BasicNewsRecipe.get_browser(self)
        return br

    def get_masthead_url(self):
        masthead = 'http://www.theregister.co.uk/Design/graphics/std/logo_414_80.png'
        br = BasicNewsRecipe.get_browser(self)
        try:
            br.open(masthead)
        except Exception:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def preprocess_html(self, soup):
        # The print pages spell out each link's target in a <span class="URL">
        # wrapped in parentheses; drop the span and the surrounding " (" and ")".
        for span_tag in soup.findAll('span', 'URL'):
            span_tag.previous.replaceWith(re.sub(r" \($", "", self.tag_to_string(span_tag.previous)))
            span_tag.next.next.replaceWith(re.sub(r"^\)", "", self.tag_to_string(span_tag.next.next)))
            span_tag.extract()
        return soup

    def parse_index(self):

        def decode_date(datestr):
            # Datelines give day and month only, e.g. "14 Jan"; no year.
            udate = datestr.strip().lower().split()
            m = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'].index(udate[1][:3]) + 1
            d = int(udate[0])
            parsed = date(date.today().year, m, d)
            # A future date must belong to last year (a December article
            # read in January, for instance).
            if parsed > date.today():
                parsed = parsed.replace(year=parsed.year - 1)
            return parsed

        articles = {}
        ans = []

        def parse_index_page(page_name, page_title):

            def article_title(tag):
                atag = tag.find('a', href=True)
                return ''.join(atag.findAll(text=True, recursive=False)).strip()

            def article_date(tag):
                t = tag.find(True, {'class': 'date'})
                if t:
                    return ''.join(t.findAll(text=True, recursive=False)).strip()
                return ''

            def article_summary(tag):
                t = tag.find(True, {'class': 'standfirst'})
                if t:
                    return ''.join(t.findAll(text=True, recursive=False)).strip()
                return ''

            def article_url(tag):
                atag = tag.find('a', href=True)
                return atag['href']

            mainurl = 'http://www.theregister.co.uk'
            soup = self.index_to_soup(mainurl + page_name)
            # Each div whose class begins with "story-ref" holds one article's
            # link, dateline and standfirst.
            for div in soup.findAll('div', attrs={'class': re.compile('^story-ref')}):
                # Skip anything older than oldest_article days
                datetag = div.find('span', 'date')
                if datetag:
                    dateline_string = self.tag_to_string(datetag, False)
                    a_date = decode_date(dateline_string)
                    earliest_date = date.today() - timedelta(days=self.oldest_article)
                    if a_date < earliest_date:
                        self.log("Skipping article dated %s" % dateline_string)
                        continue
                url = article_url(div)
                if 'http' in url:
                    # Absolute URL: an off-site link, not a Register article
                    continue
                url = mainurl + url + 'print.html'
                self.log("URL %s" % url)
                title = article_title(div)
                self.log("Title %s" % title)
                pubdate = article_date(div)
                self.log("Date %s" % pubdate)
                description = article_summary(div)
                self.log("Description %s" % description)
                if page_title not in articles:
                    articles[page_title] = []
                articles[page_title].append(dict(title=title, url=url, date=pubdate,
                                                 description=description, author='', content=''))

        # Scrape each section front in turn; the list fixes the section order.
        for page_name, page_title in [
            ('', 'Front Page'),
            ('/hardware', 'Hardware'),
            ('/software', 'Software'),
            ('/music_media', 'Music & Media'),
            ('/networks', 'Networks'),
            ('/security', 'Security'),
            ('/public_sector', 'Public Sector'),
            ('/business', 'Business'),
            ('/science', 'Science'),
            ('/odds', 'Odds & Sods'),
        ]:
            parse_index_page(page_name, page_title)
            ans.append(page_title)
        return [(key, articles[key]) for key in ans if key in articles]
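
To try the recipe, save the code above to a file -- say the_register.recipe, the name is arbitrary -- and hand it to calibre's ebook-convert, which runs recipe files directly. The --test switch downloads only a couple of articles per section, which makes a quick sanity check cheap:

ebook-convert the_register.recipe the_register.epub --test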