Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 05-01-2014, 02:16 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
2 recipes update/changes

birmingham evening mail

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re
import mechanize
from calibre.utils.magick import Image

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Calibre news recipe for the Birmingham Evening Mail.

    Articles come from the birminghammail.co.uk RSS feeds listed in
    ``feeds``. Each article page is reduced to the elements named in
    ``keep_only_tags`` minus those in ``remove_tags``. The cover is looked
    up on the site's front page at download time (see ``get_cover_url``).
    """

    title = u'Birmingham Evening Mail'
    description = 'News for Birmingham UK'
    __author__ = 'Dave Asbury'
    # last updated 1/5/14
    masthead_url = 'http://images.icnetwork.co.uk/upl/icbirmingham/apr2004/6/5/0007417F-982A-107F-969980BFB6FA0000.jpg'
    language = 'en_GB'

    # Feed limits (see the calibre recipe API docs for exact semantics).
    oldest_article = 2
    max_articles_per_feed = 10
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}

    # Page clean-up options.
    remove_javascript = True
    no_stylesheets = True
    remove_attributes = ['style']

    # Image handling.
    compress_news_images = True
    compress_news_images_max_size = 30

    # Elements dropped from every article page.
    remove_tags = [
        dict(attrs={'class': 'gallery-data'}),
        dict(attrs={'class': 'ir btn-fullscreen'}),
        dict(attrs={'class': 'tools clearfix'}),
        dict(attrs={'class': 'shareButtons'}),
    ]

    # Only these elements of an article page are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'lead-text'}),
        dict(attrs={'class': 'tmCol article'}),
    ]

    feeds = [
        (u'Local News', u'http://www.birminghammail.co.uk/news/local-news/rss.xml'),
        (u'UK News', u'http://www.birminghammail.co.uk/news/uk-news/rss.xml'),
        (u'Sport', u'http://www.birminghammail.co.uk/sport/rss.xml'),
        (u'Whats On', u'http://www.birminghammail.co.uk/whats-on/rss.xml'),
        (u'Lifestyle', u'http://www.birminghammail.co.uk/lifestyle/rss.xml'),
    ]

    extra_css = '''
        	 h1{font-weight:bold;}
                     h2{font-weight:normal;font-size:75%;}
                     figure {font-size:50%;}
                    #body{font-size:14px;}
                    #.photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
                    #.publish-info {font-size:50%;}
                     img {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:50%;}
                      '''

    # Cover used when no usable image is found on the front page.
    _FALLBACK_COVER = 'http://s.birminghammail.co.uk/skins/birminghammail/gfx/follow-media.jpg'

    # Pattern used to pull absolute http(s) URLs out of serialized markup.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def get_cover_url(self):
        """Return the URL of the image to use as the cover.

        Fetches the Birmingham Mail front page, finds the first element
        whose ``src`` attribute matches the icnetwork image host prefix and
        takes the first URL found in that element's markup. The URL is then
        probed with redirects disabled; if nothing was found or the probe
        fails, the fallback cover URL is returned instead.
        """
        soup = self.index_to_soup('http://www.birminghammail.co.uk')
        tag = soup.find(
            attrs={'src': re.compile('http://images.icnetwork.co.uk/upl/birm')})

        # Take the first URL directly from the match list instead of slicing
        # the list's string representation, which breaks on zero or several
        # matches and on unicode reprs.
        found = self._URL_RE.findall(str(tag)) if tag is not None else []
        if not found:
            self.log('No cover image found on front page, using fallback cover')
            return self._FALLBACK_COVER

        candidate = found[0]
        self.log('Cover candidate:', candidate)

        # Probe the candidate; with redirect handling off, a redirecting URL
        # raises and is treated as unusable.
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(candidate)
        except Exception:
            # Best effort: any fetch failure falls back to the static cover,
            # but interpreter-exit signals are no longer swallowed.
            self.log('Could not open cover candidate, using fallback cover')
            return self._FALLBACK_COVER
        return candidate


Daily Mirror

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re

class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre news recipe for The Daily Mirror (UK).

    Articles come from the mirror.co.uk RSS feeds listed in ``feeds``.
    Each article page is reduced to the elements named in
    ``keep_only_tags`` minus those in ``remove_tags``, and the raw HTML is
    first passed through ``preprocess_regexps``. The cover is looked up on
    politicshome.com at download time (see ``get_cover_url``).
    """

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'
    __author__ = 'Dave Asbury'
    # last updated 1/5/14
    language = 'en_GB'
    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    # Image handling.
    compress_news_images = True
    compress_news_images_max_size = 30

    # Feed limits (see the calibre recipe API docs for exact semantics).
    oldest_article = 1.5
    max_articles_per_feed = 12
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title'}

    # Page clean-up options.
    remove_javascript = True
    no_stylesheets = True

    # Only these elements of an article page are kept. The class strings are
    # matched as written, including their trailing spaces.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'lead-text'}),
        dict(name='figure', attrs={'class': 'inline-image clearfix '}),
        dict(name='div', attrs={'class': 'styleGroup clearfix'}),
        dict(name='div', attrs={'class': 'widget relatedContents pictures widget-editable viziwyg-section-70 inpage-widget-2230659'}),
        dict(name='div', attrs={'class': 'body '}),
    ]

    # Elements dropped from every article page.
    remove_tags = [
        dict(attrs={'class': ['article sa-teaser type-opinion', 'image-gallery', 'gallery-caption']}),
        dict(attrs={'class': 'comment'}),
        dict(name='title'),
        dict(name='ul'),
        dict(name='ul', attrs={'class': 'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id': 'login-201109171215'}),
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),
    ]

    # Strip the site suffix and the Getty image credit from the raw HTML.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'<span class="image-credit">Getty</span>', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    # To add a feed, append another (title, url) tuple, e.g.
    # (u'Travel', 'http://www.mirror.co.uk/advice/travel/rss.xml')
    feeds = [
        (u'UK News', u'http://www.mirror.co.uk/news/uk-news/rss.xml'),
        (u'world News', u'http://www.mirror.co.uk/news/world-news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml'),
    ]

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		'''

    # Cover used when the front-page image cannot be located or fetched.
    _FALLBACK_COVER = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'

    # Pattern used to pull absolute http(s) URLs out of serialized markup.
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    def get_cover_url(self):
        """Return the URL of the image to use as the cover.

        Two-step lookup on politicshome.com: find the element carrying the
        Mirror button background image on the front-pages index, follow the
        link cut out of its markup, then take the first URL found in the
        markup of the element with id ``large`` on that page. The URL is
        probed with redirects disabled; if any step finds nothing or the
        probe fails, the fallback cover URL is returned instead.
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # Look for the block containing the Mirror button and link.
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
        if button is None:
            self.log('Mirror button not found on politicshome, using fallback cover')
            return self._FALLBACK_COVER

        # HACK: the site-relative link is cut out of the serialized tag by
        # fixed offsets. NOTE(review): presumably this skips a leading
        # '<a href="' (9 characters) and the trailing attributes/content;
        # confirm against the live markup before changing it.
        page_url = 'http://www.politicshome.com' + str(button)[9:-142]

        # page_url is the page that contains the front-page picture.
        soup = self.index_to_soup(page_url)
        large = soup.find(attrs={'id': 'large'})

        # Take the first URL directly from the match list instead of slicing
        # the list's string representation, which breaks on zero or several
        # matches and on unicode reprs.
        found = self._URL_RE.findall(str(large)) if large is not None else []
        if not found:
            self.log('No front-page image found, using fallback cover')
            return self._FALLBACK_COVER

        candidate = found[0]
        self.log('Cover candidate:', candidate)

        # Probe the candidate; with redirect handling off, a redirecting URL
        # raises and is treated as unusable.
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(candidate)
        except Exception:
            # Best effort: any fetch failure falls back to the static cover,
            # but interpreter-exit signals are no longer swallowed.
            self.log('Could not open cover candidate, using fallback cover')
            return self._FALLBACK_COVER
        return candidate
scissors is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Free (iTunes) Tasting Chef's Recipes: Summer 2012 Cookbook [Enhanced Recipes] ATDrake Deals and Resources (No Self-Promotion or Affiliate Links) 3 08-15-2012 11:15 PM
Asian Recipes - 50 Tasty & Easy Unique Exotic Recipes (With Images Of Each Dish And C asiafoodguru Self-Promotions by Authors and Publishers 1 08-10-2012 05:01 AM
Recipes need update Mixx Recipes 6 09-15-2011 03:53 PM
update for JBPress and NBOnline recipes. adonishi Recipes 0 07-19-2011 03:14 AM
won't update...after completing the update cycle edge returns to the first screen WeAreBorrg enTourage Archive 2 03-18-2011 06:39 PM


All times are GMT -4. The time now is 11:39 AM.


MobileRead.com is a privately owned, operated and funded community.