Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 09-09-2013, 03:06 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Daily Express 9/9/13

Added article authors and removed the quoted text.

Spoiler:
Code:
import os, os.path, urllib
import re

import mechanize

from calibre import browser
from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
    """Calibre news recipe for the Daily Express (express.co.uk).

    Articles are pulled from the RSS feeds listed in ``feeds``; each page is
    reduced to its headline, publish info (which carries the article author)
    and body, and the cover is taken from the "our paper" page.
    """

    title = u'Daily Express'
    __author__ = 'Dave Asbury'
    # 9-9-13: added the article author, and section labels are now stripped
    # with the generic r'>[\w].+? News<' pattern rather than one pattern per
    # section.
    encoding = 'utf-8'
    remove_empty_feeds = True
    no_stylesheets = True
    oldest_article = 1
    max_articles_per_feed = 10
    compress_news_images = True
    compress_news_images_max_size = 30
    ignore_duplicate_articles = {'title', 'url'}
    masthead_url = 'http://cdn.images.dailyexpress.co.uk/img/page/express_logo.png'

    # Text-level clean-up applied to the raw HTML before parsing: each entry
    # is (compiled pattern, replacement function).
    preprocess_regexps = [
        (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        # Drop "More <section>" link text but keep the surrounding tags.
        (re.compile(r'>More [\w].+?<', re.IGNORECASE), lambda match: '><'),
        # Drop "<section> News" labels (Health News, Car News, ...).
        (re.compile(r'>[\w].+? News<', re.IGNORECASE), lambda match: '><'),
    ]

    # Elements removed from every article.
    remove_tags = [
        dict(attrs={'class': 'quote'}),
        dict(name='footer'),
        dict(attrs={'id': 'header_addons'}),
        dict(attrs={'class': 'hoverException'}),
        dict(name='_li'),
        dict(name='li'),
        dict(attrs={'class': 'box related-articles clear'}),
        dict(attrs={'class': 'news-list'}),
        dict(attrs={'class': 'sponsored-section'}),
        dict(attrs={'class': 'pull-quote on-right'}),
        dict(attrs={'class': 'pull-quote on-left'}),
    ]

    # Only these elements are kept: headline, publish info, up to two <h3>
    # sub-headings and the article container.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'publish-info'}),
        dict(name='h3', limit=2),
        dict(attrs={'class': 'clearfix hR new-style'}),
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
        (u'World News', u'http://www.express.co.uk/posts/rss/78/world'),
        (u'Finance', u'http://www.express.co.uk/posts/rss/21/finance'),
        (u'Sport', u'http://www.express.co.uk/posts/rss/65/sport'),
        (u'Entertainment', u'http://www.express.co.uk/posts/rss/18/entertainment'),
        (u'Lifestyle', u'http://www.express.co.uk/posts/rss/8/life&style'),
        (u'Fun', u'http://www.express.co.uk/posts/rss/110/fun'),
    ]

    def get_cover_url(self):
        """Return the URL of the current front-page image.

        The "our paper" page is fetched and searched for the first element
        whose ``src`` points into the covers image directory.

        Returns:
            The cover image URL, or ``None`` when no such element is present
            (so that no bogus URL is handed back to the caller).
        """
        soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
        cover_pattern = re.compile(
            r'https?://images\.dailyexpress\.co\.uk/img/covers/')
        cover_tag = soup.find(attrs={'src': cover_pattern})
        if cover_tag is None:
            self.log('Daily Express: no cover image found')
            return None
        # Read the attribute directly instead of re-parsing the tag's string
        # form, which broke whenever the tag contained more than one URL.
        cover_url = cover_tag['src']
        self.log('Daily Express cover URL:', cover_url)
        return cover_url

    # NOTE: '#p' and '#body' below are CSS id selectors, not comments; they
    # are kept unchanged from the original stylesheet.
    extra_css = '''
                    h1{font-weight:bold;font-size:175%;}
                    h2{font-weight:normal;font-size:75%;}
                    #p{font-size:14px;}
                    #body{font-size:14px;}
                    .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
	    .publish-info {font-size:50%;}
                    .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
      '''
scissors is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
The Daily Express scissors Recipes 1 08-17-2013 08:26 AM
New Musical Express update 9/6/12 scissors Recipes 0 06-09-2012 07:53 AM
Free (Kindle/Nook/Christianbook) Daily Light on the Daily Path [Devotional] ATDrake Deals and Resources (No Self-Promotion or Affiliate Links) 4 04-20-2012 02:48 PM
Fresh daily papers daily with Calibre, on a headless server Fastolfe General Discussions 2 12-20-2010 05:29 AM


All times are GMT -4. The time now is 02:56 PM.


MobileRead.com is a privately owned, operated and funded community.