Thread: The Sun UK
View Single Post
Old 10-02-2011, 10:29 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
The Sun UK

A Recipe for The Sun tabloid UK using the google reader recipe.

It uses Google Reader because the feeds keep disappearing when fetched directly. (I think the site monitors access?)

Anyway - I set up a gmail account user called sunreader solely for the reader.

I then subscribed to The Sun's RSS feeds at

http://www.thesun.co.uk/sol/homepage...icle247949.ece

examples are

News http://www.thesun.co.uk/sol/homepage...icle312900.ece
Sport http://www.thesun.co.uk/sol/homepage...icle247732.ece
ShowBiz http://www.thesun.co.uk/sol/homepage...cle1999685.ece
Bizarre http://www.thesun.co.uk/sol/homepage...icle247767.ece

Then, in Google Reader, for each feed you have subscribed to, click "Feed settings" and select "New folder". The name you enter here is the name that will appear in the TOC.



Code:
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand

class GoogleReader(BasicNewsRecipe):
    """Calibre recipe that reads The Sun (UK) through a Google Reader account.

    The recipe logs in to Google with the supplied username/password,
    lists the tags (folders) of that Reader account and uses each one as a
    feed.  After download, every image in an article is converted to
    greyscale.
    """

    title = 'The Sun UK Via Google Reader'

    # last updated 2/11/11 images to greyscale - by Starson17
    cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'

    description = 'A Recipe for The Sun tabloid UK using the google reader recipe. You need to set up a gmail account solely for the reader, then subscribe to the suns RSS feeds at http://www.thesun.co.uk/sol/homepage/hygiene/rss_sign_up/article247949.ece'
    # The username/password are consumed by get_browser() below.
    needs_subscription = True
    __author__ = ' Dave Asbury, davec, rollercoaster, Starson17'
    base_url = 'http://www.google.com/reader/atom/'
    oldest_article = 1
    max_articles_per_feed = 20
    # Query string appended to every feed URL: n is the item count; the xt
    # parameter presumably excludes items already marked read - verify
    # against the Google Reader API.
    get_options = '?n=%d&xt=user/-/state/com.google/read' % max_articles_per_feed

    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'

    encoding = 'cp1252'
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''

    # Strip the copyright footer from the raw HTML before it is parsed.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    # Headline, sub-headline, centred block and the article body.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'medium centered'}),
        dict(name='div', attrs={'class': 'text-center'}),
        dict(name='div', attrs={'id': 'bodyText'}),
    ]

    remove_tags = [
        dict(attrs={'class': ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class': 'cf'}),
        dict(attrs={'title': 'download flash'}),
        dict(attrs={'style': 'padding: 5px'}),
    ]

    def get_browser(self):
        """Return a browser authorised against Google Reader.

        Without credentials the stock calibre browser is returned
        unchanged.  With credentials, a ClientLogin request is made and a
        new opener carrying the resulting ``Authorization`` header is
        returned instead.

        Raises:
            ValueError: if the login response contains no ``Auth=`` token.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is None or self.password is None:
            return br

        request = urllib.urlencode([
            ('Email', self.username),
            ('Passwd', self.password),
            ('service', 'reader'),
            ('accountType', 'HOSTED_OR_GOOGLE'),
            ('source', __appname__),
        ])
        response = br.open('https://www.google.com/accounts/ClientLogin', request)
        token = re.search(r'Auth=(\S*)', response.read())
        if token is None:
            # Fail with a readable message instead of an AttributeError
            # on None.group().
            raise ValueError('Google ClientLogin response did not contain an Auth token')

        # Replace the browser with an opener that sends the auth header on
        # every request.
        cookies = mechanize.CookieJar()
        br = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        br.addheaders = [('Authorization', 'GoogleLogin auth=' + token.group(1))]
        return br

    def get_feeds(self):
        """Build the feed list from the tags of the Reader account.

        Returns:
            A list of ``(title, url)`` tuples.  The title is the last path
            segment of the tag id; the url is ``base_url`` plus the quoted
            tag id plus ``get_options``.
        """
        feeds = []
        soup = self.index_to_soup('http://www.google.com/reader/api/0/tag/list')
        for id_tag in soup.findAll(True, attrs={'name': ['id']}):
            if not id_tag.contents:
                # Empty element: nothing to build a feed from.
                continue
            url = id_tag.contents[0]
            last_segment = re.search(r'/([^/]*)$', url)
            if last_segment is None:
                # No '/' in the id, so no folder name can be derived.
                continue
            feeds.append((last_segment.group(1),
                          self.base_url + urllib.quote(url.encode('utf-8')) + self.get_options))
        return feeds

    def print_soup(self, soup):
        """Debugging helper: print the given soup."""
        print(soup)

    def postprocess_html(self, soup, first):
        """Convert every image referenced by the article to greyscale.

        Each ``<img>`` that has a ``src`` attribute is opened from that
        ``src``, switched to greyscale and saved back to the same
        location (so ``src`` is treated as a local path here - presumably
        because calibre has already downloaded the images; confirm against
        the calibre recipe API).

        Args:
            soup: parsed article HTML.
            first: unused.

        Returns:
            The same ``soup`` object.
        """
        def is_image_with_src(tag):
            return tag.name.lower() == 'img' and tag.get('src') is not None

        for tag in soup.findAll(is_image_with_src):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup

Last edited by scissors; 11-02-2011 at 03:52 PM. Reason: images to greyscale - by Starson17
scissors is offline   Reply With Quote