Old 08-11-2013, 02:35 PM   #1
scissors
Addict
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
The Daily Express

Recipe for the Daily Express, a UK newspaper.
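To use it, save the code below as a .recipe file (e.g. daily_express.recipe, the name is up to you) and load it through calibre's Fetch news > Add a custom news source dialog; a quick test build should also work from the command line with ebook-convert daily_express.recipe output.epub --test.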

Spoiler:

Code:
import re

from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
    title = u'Daily Express'
    __author__ = 'Dave Asbury'
    encoding = 'utf-8'
    remove_empty_feeds = True
    #remove_javascript = True
    no_stylesheets = True
    oldest_article = 2
    max_articles_per_feed = 10
    #auto_cleanup = True
    compress_news_images = True
    compress_news_images_max_size = 25
    ignore_duplicate_articles = {'title', 'url'}

    remove_tags = [
        dict(name='footer'),
        dict(attrs={'id': 'header_addons'}),
        dict(attrs={'class': 'hoverException'}),
        dict(name='li'),
        dict(attrs={'class': 'box related-articles clear'}),
        dict(attrs={'class': 'news-list'}),
    ]
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'publish-info'}),
        #dict(name='h3'),
        #dict(name='section', attrs={'class': 'photo'}),
        #dict(name='section', attrs={'class': 'text-description'}),
        dict(attrs={'class': 'clearfix hR new-style'}),
    ]
    
    # Both substitutions must live in a single list; assigning
    # preprocess_regexps twice would silently discard the first rule.
    preprocess_regexps = [
        (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'<h3>More UK</h3>', re.IGNORECASE | re.DOTALL), lambda match: ''),
    ]

    
    feeds = [
        (u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
        (u'World News', u'http://www.express.co.uk/posts/rss/78/world'),
        (u'Finance', u'http://www.express.co.uk/posts/rss/21/finance'),
        (u'Sport', u'http://www.express.co.uk/posts/rss/65/sport'),
        (u'Entertainment', u'http://www.express.co.uk/posts/rss/18/entertainment'),
        (u'Lifestyle', u'http://www.express.co.uk/posts/rss/8/life&style'),
        (u'Fun', u'http://www.express.co.uk/posts/rss/110/fun'),
    ]

    def get_cover_url(self):
        # Pull today's front-page image from the paper's 'Our Paper' page.
        soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
        cov = soup.find('img', attrs={'src': re.compile(
            'http://images.dailyexpress.co.uk/img/covers/')})
        if cov is not None:
            self.log('Cover url:', cov['src'])
            return cov['src']
        return None

    extra_css = '''
        h1 { font-weight: bold; font-size: 26px; }
        h2 { font-weight: normal; font-size: small; }
        p { font-size: 14px; }
        body { font-size: 14px; }
    '''
Old 08-17-2013, 08:26 AM   #2
scissors
Addict
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Express Update 18th Aug 13

18-8-13: Added sub-headings.

Quote boxes removed from articles.
Font sizes expressed as percentages.
Images centred and the image-caption font reduced.
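(The percentage values in extra_css are relative to the reader's base font size, so text and captions should now scale with each device's font settings instead of being fixed in pixels.)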

Spoiler:

Code:
import re

from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1376229553(BasicNewsRecipe):
    title = u'Daily Express'
    __author__ = 'Dave Asbury'
    # 18-8-13: remove quoted text from articles, add sub-headings
    encoding = 'utf-8'
    remove_empty_feeds = True
    #remove_javascript = True
    no_stylesheets = True
    oldest_article = 1
    max_articles_per_feed = 10
    #auto_cleanup = True
    compress_news_images = True
    compress_news_images_max_size = 30
    ignore_duplicate_articles = {'title', 'url'}
    
    
    # Strip leftover section headings ('Car News', 'Add Your Comment', etc.)
    # and related-article cruft that survives tag-level cleanup.
    preprocess_regexps = [
        (re.compile(r'<h3>More UK</h3>', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'widget', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Related articles', re.IGNORECASE | re.DOTALL), lambda match: ''),
        (re.compile(r'Car News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'TV & Radio News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Food & Recipe News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'More City & Business<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Travel News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Garden News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Fashion & Beauty News<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'More Personal Finance<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
        (re.compile(r'Add Your Comment<', re.IGNORECASE | re.DOTALL), lambda match: '<'),
    ]


    remove_tags = [
        dict(attrs={'class': 'quote'}),
        dict(attrs={'class': 'author'}),
        dict(name='footer'),
        dict(attrs={'id': 'header_addons'}),
        dict(attrs={'class': 'hoverException'}),
        dict(name='li'),
        dict(attrs={'class': 'box related-articles clear'}),
        dict(attrs={'class': 'news-list'}),
        dict(attrs={'class': 'sponsored-section'}),
    ]
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'publish-info'}),
        dict(name='h3', limit=2),
        #dict(name='section', attrs={'class': 'photo'}),
        #dict(name='section', attrs={'class': 'text-description'}),
        dict(attrs={'class': 'clearfix hR new-style'}),
    ]

    feeds = [
        (u'UK News', u'http://www.express.co.uk/posts/rss/1/uk'),
        (u'World News', u'http://www.express.co.uk/posts/rss/78/world'),
        (u'Finance', u'http://www.express.co.uk/posts/rss/21/finance'),
        (u'Sport', u'http://www.express.co.uk/posts/rss/65/sport'),
        (u'Entertainment', u'http://www.express.co.uk/posts/rss/18/entertainment'),
        (u'Lifestyle', u'http://www.express.co.uk/posts/rss/8/life&style'),
        (u'Fun', u'http://www.express.co.uk/posts/rss/110/fun'),
    ]

    def get_cover_url(self):
        # Pull today's front-page image from the paper's 'Our Paper' page.
        soup = self.index_to_soup('http://www.express.co.uk/ourpaper/')
        cov = soup.find('img', attrs={'src': re.compile(
            'http://images.dailyexpress.co.uk/img/covers/')})
        if cov is not None:
            self.log('Cover url:', cov['src'])
            return cov['src']
        return None

    extra_css = '''
        h1 { font-weight: bold; font-size: 175%; }
        h2 { font-weight: normal; font-size: 75%; }
        /* p { font-size: 14px; } */
        /* body { font-size: 14px; } */
        .photo-caption { display: block; margin-left: auto; margin-right: auto; width: 100%; font-size: 40%; }
        .publish-info { font-size: 50%; }
        .photo img { display: block; margin-left: auto; margin-right: auto; width: 100%; }
    '''

Last edited by scissors; 08-18-2013 at 09:26 AM. Reason: Recipe edited - added sub-headings