View Single Post
Old 08-17-2013, 08:26 AM   #2
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Express Update 18th Aug 13

18-8-13 Added Sub Headings

Quote box removed from articles
Font sizes expressed as a %
Images centred and image caption font reduced

Spoiler:

Code:
import os, os.path, urllib
import re

import mechanize
from calibre import browser
from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
    """calibre news recipe for the Daily Express.

    Articles are taken from the express.co.uk RSS feeds listed in ``feeds``.
    Each page is cleaned with ``preprocess_regexps``, ``keep_only_tags`` and
    ``remove_tags`` and styled with ``extra_css``.  The cover image is looked
    up on the site's "ourpaper" page by ``get_cover_url``.
    """

    title = u'Daily Express'
    __author__ = 'Dave Asbury'
    # 18-08-17 remove quoted text from article, add sub heading
    encoding = 'utf-8'
    remove_empty_feeds = True
    # remove_javascript = True
    no_stylesheets = True
    oldest_article = 1
    max_articles_per_feed = 10
    # auto_cleanup = True
    compress_news_images = True
    compress_news_images_max_size = 30
    ignore_duplicate_articles = {'title', 'url'}

    # (pattern, replacement) pairs applied to the raw page source.  The first
    # three delete the matched text outright; the others drop a section label
    # that sits directly in front of a tag, keeping the tag's opening '<'.
    preprocess_regexps = [
        (re.compile(r'<h3>More UK</h3>', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
    ]

    # Elements stripped from each article page.
    remove_tags = [
        dict(attrs={'class': 'quote'}),
        dict(attrs={'class': 'author'}),
        dict(name='footer'),
        dict(attrs={'id': 'header_addons'}),
        dict(attrs={'class': 'hoverException'}),
        dict(name='_li'), dict(name='li'),
        dict(attrs={'class': 'box related-articles clear'}),
        dict(attrs={'class': 'news-list'}),
        dict(attrs={'class': 'sponsored-section'}),
    ]

    # Elements retained from each article page.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'publish-info'}),
        dict(name='h3', limit=2),
        # dict(name='section', attrs={'class': 'photo'}),
        # dict(name='section', attrs={'class': 'text-description'}),
        dict(attrs={'class': 'clearfix hR new-style'}),
    ]

    # (section title, RSS URL) pairs.
    feeds = [
        (u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
        (u'World News', u'http://www.express.co.uk/posts/rss/78/world'),
        (u'Finance', u'http://www.express.co.uk/posts/rss/21/finance'),
        (u'Sport', u'http://www.express.co.uk/posts/rss/65/sport'),
        (u'Entertainment', u'http://www.express.co.uk/posts/rss/18/entertainment'),
        (u'Lifestyle', u'http://www.express.co.uk/posts/rss/8/life&style'),
        (u'Fun', u'http://www.express.co.uk/posts/rss/110/fun'),
    ]

    def get_cover_url(self):
        """Return the cover image URL, or None when no cover is found.

        Fetches http://www.express.co.uk/ourpaper/ and takes the ``src`` of
        the first element whose ``src`` matches the covers image path.
        """
        soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
        cover_tag = soup.find(attrs={'src': re.compile('http://images.dailyexpress.co.uk/img/covers/')})
        if cover_tag is None:
            self.log.warn('Daily Express: no cover image found')
            return None
        # Read the attribute directly rather than re-parsing the tag's text,
        # so extra URLs on the same tag cannot corrupt the result.
        cover_url = cover_tag['src']
        self.log('Daily Express cover URL:', cover_url)
        return cover_url

    # NOTE(review): '#p' and '#body' are id selectors in CSS, not comments;
    # presumably they were meant to disable those rules - confirm.
    extra_css = '''
                    h1{font-weight:bold;font-size:175%;}
                    h2{font-weight:normal;font-size:75%;}
                    #p{font-size:14px;}
                    #body{font-size:14px;}
                    .photo-caption {display: block;margin-left: auto;margin-right: auto;width:100%;font-size:40%;}
	    .publish-info {font-size:50%;}
                    .photo img {display: block;margin-left: auto;margin-right: auto;width:100%;}
      '''

Last edited by scissors; 08-18-2013 at 09:26 AM. Reason: Recipe Edited - added sub headings
scissors is offline   Reply With Quote