Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 04-30-2012, 04:56 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 203
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
UK Papers: Sun, Mirror Updates

The Sun (RSS working again)

Spoiler:
Code:
import urllib, re, mechanize, random
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from The Sun (UK) RSS feeds.

    Article pages are reduced to the headline and body elements listed in
    ``keep_only_tags``, every image referenced by a page is rewritten with
    the "GrayscaleType" image type, and the cover is looked up on
    politicshome.com with a random picture from a fixed list as fallback.
    """

    title = u'The Sun UK'
    description = 'A Recipe for The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 29/4/12
    language = 'en_GB'
    encoding = 'UTF-8'
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'

    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    # Written with explicit escapes so the tabs that are part of the
    # stylesheet text stay visible; the resulting string is unchanged.
    extra_css = (
        '\n'
        '\tbody{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}\n'
        '                \t '
    )

    # Blank out the copyright footer block in the raw page markup.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'medium centered'}),
        dict(name='div', attrs={'class': 'text-center'}),
        dict(name='div', attrs={'id': 'bodyText'}),
    ]

    remove_tags = [
        dict(attrs={'class': ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class': 'cf'}),
        dict(attrs={'title': 'download flash'}),
        dict(attrs={'style': 'padding: 5px'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    # Pictures used as the cover when the real one cannot be opened.  The
    # order matters: entry N-1 is the picture for random draw N (1..5).
    _FALLBACK_COVERS = (
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
    )

    def get_cover_url(self):
        """Return the URL to use as the cover image.

        The politicshome.com front-page index is fetched, the element
        carrying The Sun's source button is located, the page it points to
        is fetched, and the picture URL is cut out of the element whose id
        is ``large``.  The picture URL is then probed with redirects
        disabled; if the probe raises, one of ``_FALLBACK_COVERS`` is
        returned instead.

        Errors raised while fetching the two politicshome.com pages are not
        caught here and propagate to the caller.
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Sun button and its link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})

        # NOTE(review): the path is cut out of the element's markup by fixed
        # offsets (9 leading, 133 trailing characters); presumably this
        # isolates the link target -- confirm against the live page markup.
        page_url = 'http://www.politicshome.com' + str(button)[9:-133]
        # Diagnostic output; the single-argument form prints the same text
        # on Python 2 and Python 3.
        print('******** string is ! %s ! ***' % page_url)

        soup = self.index_to_soup(page_url)
        large_markup = str(soup.find(attrs={'id': 'large'}))
        print('******** string is ! %s ! ***' % large_markup)
        # NOTE(review): same fixed-offset extraction, presumably of the
        # picture address -- confirm against the live page markup.
        picture_url = large_markup[27:-18]
        print('******** string is ! %s ! ***' % picture_url)

        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(picture_url)
        except Exception:
            # Any failure to open the picture (including an empty URL left
            # over from a failed extraction) selects a random stand-in.
            rand = random.randint(1, 5)
            print('number =  %d' % rand)
            return self._FALLBACK_COVERS[rand - 1]
        return picture_url

    def postprocess_html(self, soup, first):
        """Re-save every image referenced by ``soup`` as greyscale.

        Each ``img`` tag that has a ``src`` attribute is opened through
        ``Image``, switched to the "GrayscaleType" image type and saved back
        to the same location.  ``first`` is accepted but not used.  The soup
        itself is returned unmodified.
        """
        def has_source(tag):
            # ``get`` is used instead of ``has_key`` so the test does not
            # depend on the legacy BeautifulSoup ``has_key`` method.
            return tag.name.lower() == 'img' and tag.get('src') is not None

        for tag in soup.findAll(has_source):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            # NOTE(review): kept from the original; comparing an Image with
            # 0 looks ineffective as an out-of-memory test -- confirm before
            # removing.
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
#http://www.bbc.co.uk/midlandstoday/content/images/2007/11/09/autumnwatch_203_203x152.jpg


The Mirror (RSS feeds also working; auto cleanup abandoned because it was too aggressive)
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the Daily Mirror RSS feeds.

    Article pages are reduced to the elements listed in ``keep_only_tags``,
    every image referenced by a page is rewritten with the "GrayscaleType"
    image type, and the cover is looked up on politicshome.com with a fixed
    picture as fallback.
    """

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'
    __author__ = 'Dave Asbury'
    # last updated 28/4/12
    language = 'en_GB'
    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    oldest_article = 1
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'lead-text'}),
        dict(name='div', attrs={'class': 'styleGroup clearfix'}),
        dict(name='div', attrs={'class': 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        dict(name='figure', attrs={'class': 'clearfix'}),
        dict(name='div', attrs={'class': 'body '}),
    ]

    remove_tags = [
        dict(attrs={'class': 'comment'}),
        dict(name='title'),
        dict(name='ul', attrs={'class': 'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id': 'login-201109171215'}),
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),
    ]

    # Remove the "- mirror.co.uk" suffix from the page markup.  The dots are
    # escaped so only the literal domain text is matched.
    preprocess_regexps = [
        (re.compile(r'- mirror\.co\.uk', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml'),
        # example of a feed that is not needed and therefore left out:
        # (u'Travel', 'http://www.mirror.co.uk/advice/travel/rss.xml')
    ]

    # Written with explicit escapes so the tabs that are part of the
    # stylesheet text stay visible; the resulting string is unchanged.
    extra_css = (
        '\n'
        '                           h1{ font-size:medium;}\n'
        '\tbody{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}\n'
        '                           img { display:block}\n'
        '                \t '
    )

    def get_cover_url(self):
        """Return the URL to use as the cover image.

        The politicshome.com front-page index is fetched, the element
        carrying the Mirror's source button is located, the page it points
        to is fetched, and the picture URL is cut out of the element whose
        id is ``large``.  The picture URL is then probed with redirects
        disabled; if the probe raises, a fixed fallback picture is returned.

        Errors raised while fetching the two politicshome.com pages are not
        caught here and propagate to the caller.
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Mirror button and its link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})

        # NOTE(review): the path is cut out of the element's markup by fixed
        # offsets (9 leading, 142 trailing characters); presumably this
        # isolates the link target -- confirm against the live page markup.
        page_url = 'http://www.politicshome.com' + str(button)[9:-142]

        soup = self.index_to_soup(page_url)
        # NOTE(review): same fixed-offset extraction, presumably of the
        # picture address -- confirm against the live page markup.
        picture_url = str(soup.find(attrs={'id': 'large'}))[27:-18]

        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(picture_url)
        except Exception:
            # Any failure to open the picture (including an empty URL left
            # over from a failed extraction) selects the stand-in cover.
            return 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'
        return picture_url

    def postprocess_html(self, soup, first):
        """Re-save every image referenced by ``soup`` as greyscale.

        Each ``img`` tag that has a ``src`` attribute is opened through
        ``Image``, switched to the "GrayscaleType" image type and saved back
        to the same location.  ``first`` is accepted but not used.  The soup
        itself is returned unmodified.
        """
        def has_source(tag):
            # ``get`` is used instead of ``has_key`` so the test does not
            # depend on the legacy BeautifulSoup ``has_key`` method.
            return tag.name.lower() == 'img' and tag.get('src') is not None

        for tag in soup.findAll(has_source):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            # NOTE(review): kept from the original; comparing an Image with
            # 0 looks ineffective as an out-of-memory test -- confirm before
            # removing.
            if img < 0:
                raise RuntimeError('Out of memory')
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
scissors is offline   Reply With Quote
Old 05-02-2012, 04:41 AM   #2
jeffkey1
Junior Member
jeffkey1 began at the beginning.
 
Posts: 4
Karma: 10
Join Date: Dec 2011
Device: Kindle Keyboard, Kindle 4
Thanks
jeffkey1 is offline   Reply With Quote
Old 07-15-2012, 12:39 PM   #3
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 203
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
UK Sun update 15/7/12

Some sub-headlines were being missed; this is now corrected.

Spoiler:
Code:
import re, random

from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from The Sun (UK) RSS feeds.

    Article pages are reduced to the headline, sub-headline and body
    elements listed in ``keep_only_tags``; the cover is looked up on
    politicshome.com with a random picture from a fixed list as fallback.
    """

    title = u'The Sun UK'
    description = 'Recipe Author D.Asbury. Articles from The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 15/7/12
    language = 'en_GB'
    encoding = 'UTF-8'
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'

    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    # Written as adjacent literals so the surrounding whitespace that is
    # part of the stylesheet text stays visible; the string is unchanged.
    extra_css = (
        '\n'
        '    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}\n'
        '                     '
    )

    # Blank out the copyright footer block in the raw page markup.
    preprocess_regexps = [
        (re.compile(r'<div class="foot-copyright".*?</div>', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': ['large', 'large centered', 'medium centered', 'medium']}),
        dict(name='h3'),
        dict(name='div', attrs={'class': 'text-center'}),
        dict(name='div', attrs={'id': 'bodyText'}),
    ]

    remove_tags = [
        dict(attrs={'class': ['mystery-meat-link', 'ltbx-container', 'ltbx-var ltbx-hbxpn', 'ltbx-var ltbx-nav-loop', 'ltbx-var ltbx-url']}),
        dict(name='div', attrs={'class': 'cf'}),
        dict(attrs={'title': 'download flash'}),
        dict(attrs={'style': 'padding: 5px'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    def get_cover_url(self):
        """Return the URL to use as the cover image.

        The politicshome.com front-page index is fetched, the element
        carrying The Sun's source button is located, the page it points to
        is fetched, and the picture URL is cut out of the element whose id
        is ``large``.  The picture URL is then probed with redirects
        disabled; if the probe raises, a randomly chosen fallback picture
        is returned instead.

        Errors raised while fetching the two politicshome.com pages are not
        caught here and propagate to the caller.
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Sun button and its link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})

        # NOTE(review): the path is cut out of the element's markup by fixed
        # offsets (9 leading, 133 trailing characters); presumably this
        # isolates the link target -- confirm against the live page markup.
        page_url = 'http://www.politicshome.com' + str(button)[9:-133]

        soup = self.index_to_soup(page_url)
        # NOTE(review): same fixed-offset extraction, presumably of the
        # picture address -- confirm against the live page markup.
        picture_url = str(soup.find(attrs={'id': 'large'}))[27:-18]
        # Diagnostic output; the single-argument form prints the same text
        # on Python 2 and Python 3.
        print('**** cov2 = %s ****' % picture_url)

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(picture_url)
        except Exception:
            # Any failure to open the picture (including an empty URL left
            # over from a failed extraction) selects a random stand-in.
            return random.choice([
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
            ])
        return picture_url
scissors is offline   Reply With Quote
Old 07-22-2012, 05:51 AM   #4
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 203
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
The Sun update 22/7/12

REMOVED

Last edited by scissors; 07-22-2012 at 05:52 AM. Reason: removed text opened new message
scissors is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
UK Papers, Sun,Mirror,Metro Updates scissors Recipes 0 04-12-2012 04:51 AM
EntourageEdge.com mirror and Archive Qwill enTourage eDGe 8 02-07-2012 01:50 AM
Mirror a Library leehach Library Management 1 09-01-2011 05:00 AM
mirror pages after convestion chervo Calibre 0 05-07-2010 02:14 PM
Two Way Mirror B.K. Wright Writers' Corner 0 11-01-2009 11:26 AM


All times are GMT -4. The time now is 04:26 PM.


MobileRead.com is a privately owned, operated and funded community.