View Single Post
Old 04-30-2012, 04:56 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
UK Papers: Sun and Mirror Updates

The Sun (RSS feeds working again)

Spoiler:
Code:
import urllib, re, mechanize, random
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """calibre news recipe for The Sun (UK tabloid), built from its RSS feeds.

    The cover image is scraped from the politicshome.com front-page gallery.
    When that image cannot be located or fetched, one of several stock images
    hosted on img.thesun.co.uk is picked at random instead. Every image in a
    downloaded article is re-saved as greyscale.
    """

    title = u'The Sun UK'
    description = 'A Recipe for The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 29/4/12
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    encoding = 'UTF-8'

    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'

    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''

    # Strip the copyright footer before the page is parsed.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class' : 'medium centered'}),
        dict(name='div', attrs={'class' : 'text-center'}),
        dict(name='div', attrs={'id' : 'bodyText'}),
    ]

    remove_tags = [
        dict(attrs={'class' : ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class' : 'cf'}),
        dict(attrs={'title' : 'download flash'}),
        dict(attrs={'style' : 'padding: 5px'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    # Gallery page listing the day's front pages, and the inline style that
    # identifies The Sun's button on that page.
    _cover_index_url = 'http://www.politicshome.com/uk/latest_frontpage.html'
    _cover_button_style = 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'

    # Stock images used when the real front page is unavailable.
    _fallback_covers = (
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
    )

    def get_cover_url(self):
        """Return the URL of today's front-page image, or a stock fallback.

        Two pages are fetched: the gallery index, from which the link to The
        Sun's own gallery page is cut, and that page, from which the URL of
        the element with id ``large`` is cut. The resulting URL is probed with
        redirects disabled; if the probe fails, or either element is missing,
        a random entry of ``_fallback_covers`` is returned.

        Errors raised by ``index_to_soup`` itself are not caught here.
        """
        index_soup = self.index_to_soup(self._cover_index_url)
        button = index_soup.find(attrs={'style' : self._cover_button_style})
        if button is None:
            return self._pick_fallback_cover()

        # NOTE(review): the fixed offsets depend on the exact serialized
        # markup of the matched element (presumably it starts with the
        # 9-character prefix '<a href="') -- confirm against the live page.
        page_url = 'http://www.politicshome.com' + str(button)[9:-133]
        self.log.debug('Sun cover: gallery page URL is', page_url)

        page_soup = self.index_to_soup(page_url)
        large = page_soup.find(attrs={'id' : 'large'})
        if large is None:
            return self._pick_fallback_cover()

        # NOTE(review): same caveat -- offsets assume a fixed tag layout.
        image_url = str(large)[27:-18]
        self.log.debug('Sun cover: image URL is', image_url)

        # Probe the image without following redirects, so that a redirect
        # (e.g. to an error page) counts as a failure.
        browser = mechanize.Browser()
        browser.set_handle_redirect(False)
        try:
            browser.open_novisit(image_url)
        except Exception:
            return self._pick_fallback_cover()
        return image_url

    def _pick_fallback_cover(self):
        """Return one of the stock fallback cover URLs, chosen at random."""
        fallback = random.choice(self._fallback_covers)
        self.log.debug('Sun cover: using fallback', fallback)
        return fallback

    def postprocess_html(self, soup, first):
        """Re-save every ``<img>`` that has a ``src`` as a greyscale image.

        Each image is opened from, and written back to, the path held in its
        ``src`` attribute (presumably a local file by this stage -- verify
        against calibre's download pipeline). ``first`` is unused. Returns the
        same ``soup`` object.
        """
        for tag in soup.findAll(lambda t: t.name.lower() == 'img' and t.has_key('src')):
            image_path = tag['src']
            img = Image()
            img.open(image_path)
            img.type = "GrayscaleType"
            img.save(image_path)
        return soup
#http://www.bbc.co.uk/midlandstoday/content/images/2007/11/09/autumnwatch_203_203x152.jpg


The Mirror (RSS feeds also working; auto cleanup abandoned because it was too aggressive)
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """calibre news recipe for The Daily Mirror (UK), built from its RSS feeds.

    The cover image is scraped from the politicshome.com front-page gallery,
    with a fixed fallback image when it cannot be located or fetched. Every
    image in a downloaded article is re-saved as greyscale.
    """

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'
    __author__ = 'Dave Asbury'
    # last updated 28/4/12
    language = 'en_GB'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    oldest_article = 1
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class' : 'lead-text'}),
        dict(name='div', attrs={'class' : 'styleGroup clearfix'}),
        dict(name='div', attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        dict(name='figure', attrs={'class' : 'clearfix'}),
        dict(name='div', attrs={'class' :'body '}),
    ]

    remove_tags = [
        dict(attrs={'class' : 'comment'}),
        dict(name='title'),
        dict(name='ul', attrs={'class' :  'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id' : 'login-201109171215'}),
        dict(name='div', attrs={'class' : ['inline-ad span-16 last', 'caption']}),
    ]

    # Drop the site-name suffix wherever it appears in the page source.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    # Further sections can be added as extra (title, url) pairs, e.g.
    # (u'Travel', 'http://www.mirror.co.uk/advice/travel/rss.xml').
    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml'),
    ]

    extra_css  = '''
                           h1{ font-size:medium;}
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                           img { display:block}
                	 '''

    # Gallery page listing the day's front pages, and the inline style that
    # identifies the Mirror's button on that page.
    _cover_index_url = 'http://www.politicshome.com/uk/latest_frontpage.html'
    _cover_button_style = 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'

    # Used when the real front page is unavailable.
    _fallback_cover = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'

    def get_cover_url(self):
        """Return the URL of today's front-page image, or the fallback image.

        Two pages are fetched: the gallery index, from which the link to the
        Mirror's own gallery page is cut, and that page, from which the URL of
        the element with id ``large`` is cut. The resulting URL is probed with
        redirects disabled; if the probe fails, or either element is missing,
        ``_fallback_cover`` is returned.

        Errors raised by ``index_to_soup`` itself are not caught here.
        """
        index_soup = self.index_to_soup(self._cover_index_url)
        button = index_soup.find(attrs={'style' : self._cover_button_style})
        if button is None:
            return self._fallback_cover

        # NOTE(review): the fixed offsets depend on the exact serialized
        # markup of the matched element (presumably it starts with the
        # 9-character prefix '<a href="') -- confirm against the live page.
        page_url = 'http://www.politicshome.com' + str(button)[9:-142]

        page_soup = self.index_to_soup(page_url)
        large = page_soup.find(attrs={'id' : 'large'})
        if large is None:
            return self._fallback_cover

        # NOTE(review): same caveat -- offsets assume a fixed tag layout.
        image_url = str(large)[27:-18]

        # Probe the image without following redirects, so that a redirect
        # (e.g. to an error page) counts as a failure.
        browser = mechanize.Browser()
        browser.set_handle_redirect(False)
        try:
            browser.open_novisit(image_url)
        except Exception:
            return self._fallback_cover
        return image_url

    def postprocess_html(self, soup, first):
        """Re-save every ``<img>`` that has a ``src`` as a greyscale image.

        Each image is opened from, and written back to, the path held in its
        ``src`` attribute (presumably a local file by this stage -- verify
        against calibre's download pipeline). ``first`` is unused. Returns the
        same ``soup`` object.
        """
        for tag in soup.findAll(lambda t: t.name.lower() == 'img' and t.has_key('src')):
            image_path = tag['src']
            img = Image()
            img.open(image_path)
            img.type = "GrayscaleType"
            img.save(image_path)
        return soup
scissors is offline   Reply With Quote