Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 08-27-2013, 05:12 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Daily Mirror update 27/8/13

Some tidying up

Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re

class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre news recipe for the UK Daily Mirror (mirror.co.uk).

    Articles are pulled from the site's RSS feeds, reduced to the tags listed
    in ``keep_only_tags`` and cleaned with ``remove_tags``.  The cover is
    looked up at download time by ``get_cover_url``.
    """

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
    # last updated 27/8/13
    language = 'en_GB'
    #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
    #recursions = 10
    compress_news_images = True
    compress_news_images_max_size = 30
    oldest_article = 1.5
    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    #auto_cleanup = True
    #conversion_options = { 'linearize_tables' : True }

    # Tag selectors for the parts of each article page that are kept.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'lead-text'}),
        dict(attrs={'class': 'tools clearfix'}),
        dict(name='div', attrs={'class': 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        # dict(name='figure',attrs={'class' : 'clearfix'}),
        dict(name='div', attrs={'class': 'body '}),
        dict(name='div', attrs={'class': 'thumb'}),
        # NOTE(review): 'img alt' is not a normal attribute name; presumably
        # meant to match the Perishers/Horace cartoon images -- confirm.
        dict(attrs={'img alt': ['Perishers', 'Horace']}),
        #dict(attrs={'class' : 'tmRow span-15-5 col-1 article-page'}),
        #dict(attrs={'class' : ['article-attr','byline append-1','published']}),
        #dict(name='p'),
    ]

    # Tag selectors stripped from the kept content.
    remove_tags = [
        dict(attrs={'class': ['article sa-teaser type-opinion', 'last', 'gallery-caption', 'gallery-data', 'ir btn-fullscreen', 'avatar']}),  # ,'image-gallery'
        dict(attrs={'class': 'comment'}),
        dict(name='title'),
        dict(name='ul', attrs={'class': 'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id': 'login-201109171215'}),
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),  # 'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
    ]

    # Replace every "- mirror.co.uk" occurrence in the page source with ''.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]

    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml')
        # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
    ]
    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:170%;}
                    .article figure figcaption {display: block;margin-left: auto;margin-right: auto;width:100%;font-family:Arial,Helvetica,sans-serif;font-size:40%;}
                    
                    #h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;}
                    p{font-family:Arial,Helvetica,sans-serif;}
                    body{font-family:Helvetica,Arial,sans-serif;}
	    .article figure{display: block;margin-left: auto;margin-right: auto;width:100%;}
                    .lead-text p {font-size:150%}
    	    '''

    def get_cover_url(self):
        """Return the URL of today's front-page image, or a static fallback.

        The politicshome.com front-page index is searched for the element
        carrying the Mirror button as its background image; the page it links
        to is then fetched and the first URL found inside the element with
        id ``large`` is used as the cover.  The fallback image is returned
        when any element is missing, no URL is found, or the image URL
        cannot be opened.
        """
        fallback_cover = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'

        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # look for the block containing the mirror button and url
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
        if button is None:
            return fallback_cover

        # HACK: the relative link is cut out of the element's string form at
        # fixed offsets, so this depends on the site's exact markup --
        # presumably it isolates the href value; verify if covers go missing.
        page_url = 'http://www.politicshome.com' + str(button)[9:-142]

        # page_url is the page containing the picture
        soup = self.index_to_soup(page_url)
        large = soup.find(attrs={'id': 'large'})
        if large is None:
            return fallback_cover

        urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(large))
        if not urls:
            return fallback_cover
        # Use the first match directly; slicing the repr of the whole list
        # produced a corrupt URL whenever more than one URL matched.
        picture_url = urls[0]

        # Check the picture is reachable without following redirects.
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(picture_url)
        except Exception:
            return fallback_cover
        return picture_url
scissors is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
daily mirror uk update 12/10/12 scissors Recipes 2 10-19-2012 09:57 AM
update daily mirror uk scissors Recipes 0 02-11-2012 09:27 AM
Improved Daily Mirror UK recipe scissors Recipes 2 12-27-2011 03:14 AM
UK Daily Mirror Recipe. scissors Recipes 0 06-11-2011 04:44 AM
Daily update June 24 slow? PKFFW OpenInkpot 0 06-27-2010 02:02 AM


All times are GMT -4. The time now is 06:18 PM.


MobileRead.com is a privately owned, operated and funded community.