A Recipe for The Sun tabloid UK using the google reader recipe.
It uses Google Reader because the feeds keep disappearing when they are fetched directly (I suspect the site monitors access).
Anyway - I set up a gmail account user called sunreader solely for the reader.
I then subscribed to the Sun's RSS feeds at
http://www.thesun.co.uk/sol/homepage/hygiene/rss_sign_up/article247949.ece
examples are
News
http://www.thesun.co.uk/sol/homepage...icle312900.ece
Sport
http://www.thesun.co.uk/sol/homepage...icle247732.ece
ShowBiz
http://www.thesun.co.uk/sol/homepage...cle1999685.ece
Bizarre
http://www.thesun.co.uk/sol/homepage...icle247767.ece
Then, in Google Reader, for each feed you have subscribed to, click "Feed settings" and select "New folder". The name you enter here is the name that will appear in the TOC.
Code:
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class GoogleReader(BasicNewsRecipe):
    """Calibre recipe that reads The Sun (UK) through a Google Reader account.

    The recipe logs in to Google with the username/password supplied to
    calibre, lists the tags of that Reader account and turns each one into a
    feed. Downloaded article images are converted to greyscale.
    """

    title = 'The Sun UK Via Google Reader'
    # last updated 2/11/11 images to greyscale - by Starson17
    cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
    description = 'A Recipe for The Sun tabloid UK using the google reader recipe. You need to set up a gmail account solely for the reader, then subscribe to the suns RSS feeds at http://www.thesun.co.uk/sol/homepage/hygiene/rss_sign_up/article247949.ece'
    needs_subscription = True
    __author__ = ' Dave Asbury, davec, rollercoaster, Starson17'

    # Every feed URL is base_url + quoted tag id + get_options.
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 1
    max_articles_per_feed = 20
    # n caps the number of items requested per feed.
    # NOTE(review): xt=.../read presumably excludes already-read items -
    # confirm against the Google Reader API.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed
    # use_embedded_content = True
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    # encoding = 'iso-8859-1'
    encoding = 'cp1252'
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

    # Strip the copyright footer from the raw HTML before it is parsed.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    # Only the headline, sub-headline, centred block and body text are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'medium centered'}),
        dict(name='div', attrs={'class': 'text-center'}),
        dict(name='div', attrs={'id': 'bodyText'}),
        # dict(name='p')
    ]

    remove_tags = [
        # dict(name='head'),
        dict(attrs={'class': ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class': 'cf'}),
        dict(attrs={'title': 'download flash'}),
        dict(attrs={'style': 'padding: 5px'}),
    ]

    def get_browser(self):
        """Return a browser authorised against Google Reader.

        Without credentials the stock calibre browser is returned unchanged.
        With credentials, they are posted to Google's ClientLogin endpoint and
        a fresh mechanize opener is returned that sends the resulting token in
        an ``Authorization`` header on every request.

        Raises:
            ValueError: if the login response contains no ``Auth=`` token.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            return br

        request = urllib.urlencode([
            ('Email', self.username),
            ('Passwd', self.password),
            ('service', 'reader'),
            ('accountType', 'HOSTED_OR_GOOGLE'),
            ('source', __appname__),
        ])
        response = br.open('https://www.google.com/accounts/ClientLogin', request)
        token = re.search(r'Auth=(\S*)', response.read())
        if token is None:
            # Fail with a readable message instead of an AttributeError on
            # None when the response carries no token.
            raise ValueError('Google ClientLogin response did not contain an Auth token')

        # The login browser is replaced by a plain opener that carries the
        # token on every request.
        cookies = mechanize.CookieJar()
        br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        br.addheaders = [('Authorization', 'GoogleLogin auth=' + token.group(1))]
        return br

    def get_feeds(self):
        """Build the feed list from the tags of the Reader account.

        Returns:
            A list of ``(title, url)`` tuples, one per element of the tag list
            whose ``name`` attribute is ``id``. The title is the part of the
            id after its last ``/``; the url is ``base_url`` followed by the
            URL-quoted id and ``get_options``.
        """
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for tag in soup.findAll(True, attrs={'name': ['id']}):
            if not tag.contents:
                # An empty id element has nothing to build a feed from.
                continue
            url = tag.contents[0]
            # Text after the last slash; an id without a slash is used whole.
            label = url.rpartition('/')[2]
            feeds.append((label, self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds

    def print_soup(self, soup):
        """Print ``soup`` to standard output (debugging aid)."""
        print(soup)

    def postprocess_html(self, soup, first):
        """Convert every image referenced by ``soup`` to greyscale.

        Each ``img`` tag that has a ``src`` attribute is opened from that
        location, switched to ``GrayscaleType`` and saved back to the same
        location. ``first`` is not used. The parsed document is returned
        unchanged.
        """
        # process all the images
        for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            # The former ``img < 0`` out-of-memory guard was dropped: comparing
            # an Image object with an integer is not a meaningful failure test.
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
    # auto_cleanup = True