MobileRead Forums - View Single Post

scissors · 04-30-2012, 04:56 PM

The Sun (RSS working again)

Spoiler:

Code:

import urllib, re, mechanize, random
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre recipe that builds an e-book of The Sun (UK) from its RSS feeds.

    The cover is scraped from the newspaper front-page gallery on
    politicshome.com.  When that image cannot be located or fetched, one of
    several stock images hosted by The Sun is picked at random instead.
    Every article image is converted to greyscale after download.
    """

    title = u'The Sun UK'
    description = 'A Recipe for The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 29/4/12
    language = 'en_GB'
    encoding = 'UTF-8'

    # Feed limits and general clean-up switches.
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'

    # Stock images used as the cover when the scraped front page is unusable.
    FALLBACK_COVER_URLS = (
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
    )

    extra_css = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''

    # Strip the copyright footer before the page is parsed.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'medium centered'}),
        dict(name='div', attrs={'class': 'text-center'}),
        dict(name='div', attrs={'id': 'bodyText'}),
    ]

    remove_tags = [
        dict(attrs={'class': ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class': 'cf'}),
        dict(attrs={'title': 'download flash'}),
        dict(attrs={'style': 'padding: 5px'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    def _scrape_front_page_image(self):
        """Return the scraped front-page image URL, or None if not found."""
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Sun button and its link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
        if button is None:
            return None

        # NOTE(review): the fixed offsets presumably cut the link target out
        # of the tag's serialised markup; they depend on the exact page
        # layout - confirm against the live page.
        page_url = 'http://www.politicshome.com' + str(button)[9:-133]
        self.log.debug('Front page gallery page:', page_url)

        large = self.index_to_soup(page_url).find(attrs={'id': 'large'})
        if large is None:
            return None

        # NOTE(review): same caveat - offsets presumably isolate the picture
        # URL inside the serialised element.
        image_url = str(large)[27:-18]
        self.log.debug('Front page image:', image_url)
        return image_url or None

    def get_cover_url(self):
        """Return the URL to use for the cover image.

        The scraped front-page picture is used when it can be opened without
        following redirects; otherwise a random entry of
        ``FALLBACK_COVER_URLS`` is returned.
        """
        candidate = self._scrape_front_page_image()
        if candidate:
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            try:
                # Probe only: any failure means the image is not usable.
                br.open_novisit(candidate)
                return candidate
            except Exception:
                self.log.warn('Scraped cover is not reachable:', candidate)
        return random.choice(self.FALLBACK_COVER_URLS)

    def postprocess_html(self, soup, first):
        """Convert every image referenced by ``soup`` to greyscale in place.

        Each ``<img>`` that has a ``src`` attribute is opened from that
        location, switched to greyscale and saved back to the same location
        (presumably the local copy of the downloaded image - confirm).
        Returns the unchanged ``soup``.
        """
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
#http://www.bbc.co.uk/midlandstoday/content/images/2007/11/09/autumnwatch_203_203x152.jpg

The Mirror (RSS feeds also working; auto-cleanup abandoned because it was too aggressive)

Spoiler:

Code:

from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre recipe that builds an e-book of The Daily Mirror from its RSS feeds.

    The cover is scraped from the newspaper front-page gallery on
    politicshome.com.  When that image cannot be located or fetched, a fixed
    stock image is used instead.  Every article image is converted to
    greyscale after download.
    """

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'
    __author__ = 'Dave Asbury'
    # last updated 28/4/12
    language = 'en_GB'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    # Cover used when the scraped front page is unusable.
    FALLBACK_COVER_URL = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'

    # Feed limits and general clean-up switches.
    # auto_cleanup is left off: it proved too aggressive for this site.
    oldest_article = 1
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'lead-text'}),
        dict(name='div', attrs={'class': 'styleGroup clearfix'}),
        dict(name='div', attrs={'class': 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        dict(name='figure', attrs={'class': 'clearfix'}),
        dict(name='div', attrs={'class': 'body '}),
    ]

    remove_tags = [
        dict(attrs={'class': 'comment'}),
        dict(name='title'),
        dict(name='ul', attrs={'class': 'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id': 'login-201109171215'}),
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),
    ]

    # Remove the "- mirror.co.uk" suffix text from the page source.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml'),
        # Example of a disabled feed:
        # (u'Travel', 'http://www.mirror.co.uk/advice/travel/rss.xml'),
    ]

    extra_css = '''
                           h1{ font-size:medium;}
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                           img { display:block}
                	 '''

    def _scrape_front_page_image(self):
        """Return the scraped front-page image URL, or None if not found."""
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Mirror button and its link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
        if button is None:
            return None

        # NOTE(review): the fixed offsets presumably cut the link target out
        # of the tag's serialised markup; they depend on the exact page
        # layout - confirm against the live page.
        page_url = 'http://www.politicshome.com' + str(button)[9:-142]

        large = self.index_to_soup(page_url).find(attrs={'id': 'large'})
        if large is None:
            return None

        # NOTE(review): same caveat - offsets presumably isolate the picture
        # URL inside the serialised element.
        return str(large)[27:-18] or None

    def get_cover_url(self):
        """Return the URL to use for the cover image.

        The scraped front-page picture is used when it can be opened without
        following redirects; otherwise ``FALLBACK_COVER_URL`` is returned.
        """
        candidate = self._scrape_front_page_image()
        if candidate:
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            try:
                # Probe only: any failure means the image is not usable.
                br.open_novisit(candidate)
                return candidate
            except Exception:
                self.log.warn('Scraped cover is not reachable:', candidate)
        return self.FALLBACK_COVER_URL

    def postprocess_html(self, soup, first):
        """Convert every image referenced by ``soup`` to greyscale in place.

        Each ``<img>`` that has a ``src`` attribute is opened from that
        location, switched to greyscale and saved back to the same location
        (presumably the local copy of the downloaded image - confirm).
        Returns the unchanged ``soup``.
        """
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup