Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 04-12-2012, 04:51 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
UK Papers: Sun, Mirror, Metro Updates

Sun
Spoiler:
Code:
import urllib, re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre news recipe for The Sun (UK tabloid).

    Articles are read from the feed43.com feeds listed in ``feeds``.  The
    cover is scraped from the politicshome.com front-page gallery (see
    ``get_cover_url``) and every article image is switched to greyscale in
    ``postprocess_html``.
    """

    title          = u'The Sun UK'
    description = 'A Recipe for The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 7/4/12
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True

    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    encoding = 'UTF-8'

    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''

    # Strip the copyright footer block from the raw HTML before parsing.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL), lambda match: '')]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2',attrs={'class' : 'medium centered'}),
        dict(name='div',attrs={'class' : 'text-center'}),
        dict(name='div',attrs={'id' : 'bodyText'}),
    ]
    remove_tags=[
        dict(attrs={'class' : ['mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
        dict(name='div',attrs={'class' : 'cf'}),
        dict(attrs={'title' : 'download flash'}),
        dict(attrs={'style' : 'padding: 5px'}),
    ]

    # The site's own RSS (http://www.thesun.co.uk/sol/homepage/news/rss) is
    # not used; all sections are read through feed43.com feeds instead.
    feeds          = [
        (u'News','http://feed43.com/2517447382644748.xml'),
        (u'Sport', u'http://feed43.com/4283846255668687.xml'),
        (u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
        (u'Film',u'http://feed43.com/1307545221226200.xml'),
        (u'Music',u'http://feed43.com/1701513435064132.xml'),
        (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
    ]

    def get_cover_url(self):
        """Return the URL to use as the cover image.

        Looks up the Sun entry on the politicshome.com front-page listing,
        follows it to the page holding the picture, extracts the picture
        URL and probes it with mechanize.  If any step fails (network
        error, missing element, unreachable picture) the Sun masthead logo
        URL is returned instead.
        """
        fallback = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
        try:
            soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
            # look for the block containing the sun button and url
            button = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
            if button is None:
                return fallback
            # NOTE(review): the slice offsets presumably strip the markup
            # surrounding the link in the tag's string form; they depend on
            # the exact page markup - confirm against the live page.
            page_url = 'http://www.politicshome.com' + str(button)[9:-133]

            # page_url now points at the page containing the picture
            soup = self.index_to_soup(page_url)
            picture = soup.find(attrs={'id' : 'large'})
            if picture is None:
                return fallback
            # NOTE(review): same caveat - offsets tied to the tag markup.
            cover_url = str(picture)[27:-18]
            self.log('******** string is !', cover_url, '! ***')

            # Probe the picture URL without following redirects; an error
            # here means the extracted URL is not usable as a cover.
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            br.open_novisit(cover_url)
        except Exception:
            return fallback
        return cover_url

    def postprocess_html(self, soup, first):
        """Switch every image referenced by the article to greyscale.

        For each ``img`` tag that has a ``src`` attribute, the file named
        by ``src`` is opened with calibre's magick ``Image``, its type is
        set to ``GrayscaleType`` and it is saved back to the same location.
        ``first`` is not used.  Returns ``soup``.
        """
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
#http://www.bbc.co.uk/midlandstoday/content/images/2007/11/09/autumnwatch_203_203x152.jpg


Mirror

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre news recipe for The Daily Mirror (UK).

    Articles are read from the feed43.com feeds listed in ``feeds`` with
    ``auto_cleanup`` enabled.  The cover is scraped from the
    politicshome.com front-page gallery (see ``get_cover_url``) and every
    article image is switched to greyscale in ``postprocess_html``.
    """

    title          = u'The Daily Mirror'
    description = 'News as provide by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
    # last updated 7/4/12
    language = 'en_GB'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    oldest_article = 1
    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
    auto_cleanup = True

    remove_tags = [
        dict(name='title'),
        dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),
    ]

    # Both substitutions live in ONE list: assigning preprocess_regexps a
    # second time would replace the first rule instead of adding to it.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Advertisement >>', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    # To drop a feed, comment its line out, e.g.
    # (u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml'),
    feeds          = [
        (u'UK News', u'http://feed43.com/0287771688643868.xml'),
        (u'Tech News', u'http://feed43.com/2455520588350501.xml'),
        (u'Weird World','http://feed43.com/0863800333634654.xml'),
        (u'Sport','http://feed43.com/7713243036546130.xml'),
        (u'Sport : Boxing ','http://feed43.com/0414732220804255.xml'),
        (u'Sport : Rugby Union','http://feed43.com/4710138762362383.xml'),
        (u'Sport : Other','http://feed43.com/4501416886323415.xml'),
        (u'TV and Film','http://feed43.com/5238302853765104.xml'),
        (u'Celebs','http://feed43.com/8770061048844683.xml'),
        (u'Life Style : Family','http://feed43.com/4356170742410338.xml'),
        (u'Travel','http://feed43.com/1436576006476607.xml'),
    ]

    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                    h1{ font-size:18px;}
                    img { display:block}
                	 '''

    def get_cover_url(self):
        """Return the URL to use as the cover image.

        Looks up the Mirror entry on the politicshome.com front-page
        listing, follows it to the page holding the picture, extracts the
        picture URL and probes it with mechanize.  If any step fails
        (network error, missing element, unreachable picture) a static
        mirror.co.uk screenshot URL is returned instead.
        """
        fallback = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
        try:
            soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
            # look for the block containing the mirror button and url
            button = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
            if button is None:
                return fallback
            # NOTE(review): the slice offsets presumably strip the markup
            # surrounding the link in the tag's string form; they depend on
            # the exact page markup - confirm against the live page.
            page_url = 'http://www.politicshome.com' + str(button)[9:-142]

            # page_url now points at the page containing the picture
            soup = self.index_to_soup(page_url)
            picture = soup.find(attrs={'id' : 'large'})
            if picture is None:
                return fallback
            # NOTE(review): same caveat - offsets tied to the tag markup.
            cover_url = str(picture)[27:-18]

            # Probe the picture URL without following redirects; an error
            # here means the extracted URL is not usable as a cover.
            br = mechanize.Browser()
            br.set_handle_redirect(False)
            br.open_novisit(cover_url)
        except Exception:
            return fallback
        return cover_url

    def postprocess_html(self, soup, first):
        """Switch every image referenced by the article to greyscale.

        For each ``img`` tag that has a ``src`` attribute, the file named
        by ``src`` is opened with calibre's magick ``Image``, its type is
        set to ``GrayscaleType`` and it is saved back to the same location.
        ``first`` is not used.  Returns ``soup``.
        """
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup


Metro

Spoiler:
Code:
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Calibre news recipe for the Metro (UK).

    Pure configuration: articles come from the metro.co.uk RSS feeds listed
    in ``feeds`` and are tidied by ``auto_cleanup``; no tag filters are set.
    """

    # --- identity -------------------------------------------------------
    title          = u'Metro_UK'
    description = 'News as provide by The Metro -UK'
    __author__ = 'Dave Asbury'
    language = 'en_GB'

    # --- artwork --------------------------------------------------------
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
    masthead_url        = 'http://e-edition.metro.co.uk/images/metro_logo.gif'

    # --- download / clean-up behaviour -----------------------------------
    oldest_article = 1
    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript     = True
    auto_cleanup = True

    # Explicitly empty: no tag whitelist or blacklist is configured.
    keep_only_tags = []
    remove_tags    = []

    # --- sections, one (title, feed URL) pair per line ---------------------
    feeds          = [
        (u'News', u'http://www.metro.co.uk/rss/news/'),
        (u'Money', u'http://www.metro.co.uk/rss/money/'),
        (u'Sport', u'http://www.metro.co.uk/rss/sport/'),
        (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'),
        (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'),
        (u'TV', u'http://www.metro.co.uk/rss/tv/'),
        (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'),
        (u'Weird News', u'http://www.metro.co.uk/rss/weird/'),
        (u'Travel', u'http://www.metro.co.uk/rss/travel/'),
        (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'),
        (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'),
        (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/'),
    ]

    # --- styling applied to the generated pages ----------------------------
    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''
scissors is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Metro Skin for Calibre Steppa Library Management 12 05-01-2015 10:35 AM
improved metro uk recipe scissors Recipes 0 12-03-2011 05:01 PM
Recipe for Metro UK Bogg Recipes 10 10-07-2011 01:06 PM
Metro News NL drMerry Recipes 1 07-07-2011 07:23 PM
Metro Map Viewer faxi PocketBook 7 07-31-2010 07:50 AM


All times are GMT -4. The time now is 05:25 AM.


MobileRead.com is a privately owned, operated and funded community.