Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 03-24-2012, 10:15 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
shortlist.com recipe update

24/3/12

Uses BeautifulSoup to fetch the correct cover image.
Oldest article is set to 7 days, since the site updates weekly.
Max articles per feed is set to 10.
Spoiler:


Code:
import urllib, re, mechanize
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    """Weekly Shortlist.com download built from feed43-generated feeds.

    NOTE(review): `BasicNewsRecipe` is not imported in this snippet; calibre's
    recipe compiler is presumed to inject it — confirm if running standalone.
    """
    title = u'Shortlist'
    description = 'Articles From Shortlist.com using feed43.'
    # The website updates weekly, so a 7-day window covers one full issue.
    oldest_article = 7
    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    __author__ = 'Dave Asbury'
    # last updated 24/03/12
    language = 'en_GB'

    def get_cover_url(self):
        """Return the current issue's cover image URL, scraped from a
        newsstand subscription page (the image src there is site-relative)."""
        soup = self.index_to_soup('http://www.newsstand.co.uk/512-Weekly-Mens-Magazines/13810-Subscribe-to-SHORTLIST-Magazine-Subscription.aspx')
        cov = soup.find(attrs={'id': 'ContentPlaceHolder1_ctl00_imgCoverShot'})
        cover_url = 'http://www.newsstand.co.uk' + cov['src']
        return cover_url

    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    # Strip the trailing "email to your friends" share blurb from articles.
    preprocess_regexps = [
        (re.compile(r'…or.*?email to your friends</a>.', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'title'}),
        # FIX: was `atts=` (a meaningless dict key) — the subheading rule
        # never matched anything; `attrs=` is what calibre's matcher reads.
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
    ]

    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        # FIX: was `atts=` (typo) — related-content boxes were never removed.
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    remove_tags_after = [dict(name='p', attrs={'id': 'tags'})]

    feeds = [
        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
        (u'Cool Stuff', u'http://feed43.com/6253845228768456.xml'),
        (u'Style', u'http://feed43.com/7217107577215678.xml'),
        (u'Films', u'http://feed43.com/3101308515277265.xml'),
        (u'Music', u'http://feed43.com/2416400550560162.xml'),
        (u'TV', u'http://feed43.com/4781172470717123.xml'),
        (u'Sport', u'http://feed43.com/5303151885853308.xml'),
        (u'Gaming', u'http://feed43.com/8883764600355347.xml'),
        (u'Women', u'http://feed43.com/2648221746514241.xml'),
        # (u'Articles', u'http://feed43.com/3428534448355545.xml'),
    ]

Last edited by scissors; 03-24-2012 at 11:27 AM. Reason: Max articles per feed set back to 10 reduce size of file
scissors is offline   Reply With Quote
Old 03-31-2012, 08:24 AM   #2
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Cover taken from shortlist site

Spoiler:
Code:

import urllib, re, mechanize
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    """Weekly Shortlist.com download built from feed43-generated feeds.

    NOTE(review): `BasicNewsRecipe` is not imported in this snippet; calibre's
    recipe compiler is presumed to inject it — confirm if running standalone.
    """
    title = u'Shortlist'
    description = 'Articles From Shortlist.com using feed43.'
    # The website updates weekly, so a 7-day window covers one full issue.
    oldest_article = 7
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    __author__ = 'Dave Asbury'
    # last updated 31/03/12
    language = 'en_GB'

    def get_cover_url(self):
        """Return the cover image URL taken from the Shortlist homepage
        (identified by its fixed 160px width; the src is site-relative)."""
        soup = self.index_to_soup('http://www.shortlist.com')
        cov = soup.find(attrs={'width': '160'})
        cover_url = 'http://www.shortlist.com' + cov['src']
        return cover_url

    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    # Strip the trailing "email to your friends" share blurb from articles.
    preprocess_regexps = [
        (re.compile(r'…or.*?email to your friends</a>.', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        # h1 deliberately excluded: the header was repeated in output.
        dict(name='h2', attrs={'class': 'title'}),
        # FIX: was `atts=` (a meaningless dict key) — the subheading rule
        # never matched anything; `attrs=` is what calibre's matcher reads.
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
    ]

    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        # FIX: was `atts=` (typo) — related-content boxes were never removed.
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    remove_tags_after = [dict(name='p', attrs={'id': 'tags'})]

    feeds = [
        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
        (u'Cool Stuff', u'http://feed43.com/6253845228768456.xml'),
        (u'Style', u'http://feed43.com/7217107577215678.xml'),
        (u'Films', u'http://feed43.com/3101308515277265.xml'),
        (u'Music', u'http://feed43.com/2416400550560162.xml'),
        (u'TV', u'http://feed43.com/4781172470717123.xml'),
        (u'Sport', u'http://feed43.com/5303151885853308.xml'),
        (u'Gaming', u'http://feed43.com/8883764600355347.xml'),
        (u'Women', u'http://feed43.com/2648221746514241.xml'),
        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
        # (u'Articles', u'http://feed43.com/3428534448355545.xml'),
    ]

Last edited by scissors; 03-31-2012 at 09:04 AM. Reason: removed h1 - header repeated
scissors is offline   Reply With Quote
Advert
Old 04-07-2012, 09:46 AM   #3
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
update 7/4/12 Added "carousel off homepage"

Added the carousel from the homepage (the cover article is often found there).

Spoiler:
Code:
import urllib, re, mechanize
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    """Weekly Shortlist.com download built from feed43-generated feeds,
    including the homepage carousel (which often carries the cover article).

    NOTE(review): `BasicNewsRecipe` is not imported in this snippet; calibre's
    recipe compiler is presumed to inject it — confirm if running standalone.
    """
    title = u'Shortlist'
    description = 'Articles From Shortlist.com using feed43.'
    # The website updates weekly, so a 7-day window covers one full issue.
    oldest_article = 7
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    __author__ = 'Dave Asbury'
    # last updated 7/4/12
    language = 'en_GB'

    def get_cover_url(self):
        """Return the cover image URL taken from the Shortlist homepage
        (identified by its fixed 160px width; the src is site-relative)."""
        soup = self.index_to_soup('http://www.shortlist.com')
        cov = soup.find(attrs={'width': '160'})
        cover_url = 'http://www.shortlist.com' + cov['src']
        return cover_url

    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    # Strip the trailing "email to your friends" share blurb from articles.
    preprocess_regexps = [
        (re.compile(r'…or.*?email to your friends</a>.', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        # h1 deliberately excluded: the header was repeated in output.
        dict(name='h2', attrs={'class': 'title'}),
        # FIX: was `atts=` (a meaningless dict key) — the subheading rule
        # never matched anything; `attrs=` is what calibre's matcher reads.
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
    ]

    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        # FIX: was `atts=` (typo) — related-content boxes were never removed.
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    remove_tags_after = [dict(name='p', attrs={'id': 'tags'})]

    feeds = [
        (u'Home carousel', u'http://feed43.com/7106317222455380.xml'),
        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
        (u'Cool Stuff', u'http://feed43.com/6253845228768456.xml'),
        (u'Style', u'http://feed43.com/7217107577215678.xml'),
        (u'Films', u'http://feed43.com/3101308515277265.xml'),
        (u'Music', u'http://feed43.com/2416400550560162.xml'),
        (u'TV', u'http://feed43.com/4781172470717123.xml'),
        (u'Sport', u'http://feed43.com/5303151885853308.xml'),
        (u'Gaming', u'http://feed43.com/8883764600355347.xml'),
        (u'Women', u'http://feed43.com/2648221746514241.xml'),
        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
        # (u'Articles', u'http://feed43.com/3428534448355545.xml'),
    ]
scissors is offline   Reply With Quote
Old 05-19-2012, 01:22 AM   #4
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
19/5/12
The website changed its cover URL — the recipe has been adjusted to suit.

Spoiler:
Code:
import urllib, re, mechanize
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    """Weekly Shortlist.com download built from feed43-generated feeds,
    including the homepage carousel (which often carries the cover article).

    NOTE(review): `BasicNewsRecipe` is not imported in this snippet; calibre's
    recipe compiler is presumed to inject it — confirm if running standalone.
    """
    title = u'Shortlist'
    description = 'Articles From Shortlist.com using feed43.'
    # The website updates weekly, so a 7-day window covers one full issue.
    oldest_article = 7
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    __author__ = 'Dave Asbury'
    # last updated 19/5/12
    language = 'en_GB'

    def get_cover_url(self):
        """Return the cover image URL from the Shortlist homepage.

        The site changed its markup on 19/5/12: the cover is now the image
        with a fixed 121px width, and its src is already an absolute URL.
        """
        soup = self.index_to_soup('http://www.shortlist.com')
        cov = soup.find(attrs={'width': '121'})
        cover_url = cov['src']
        return cover_url

    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    # Strip the trailing "email to your friends" share blurb from articles.
    preprocess_regexps = [
        (re.compile(r'…or.*?email to your friends</a>.', re.IGNORECASE | re.DOTALL),
         lambda match: ''),
    ]

    keep_only_tags = [
        # h1 deliberately excluded: the header was repeated in output.
        dict(name='h2', attrs={'class': 'title'}),
        # FIX: was `atts=` (a meaningless dict key) — the subheading rule
        # never matched anything; `attrs=` is what calibre's matcher reads.
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
    ]

    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        # FIX: was `atts=` (typo) — related-content boxes were never removed.
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    remove_tags_after = [dict(name='p', attrs={'id': 'tags'})]

    feeds = [
        (u'Home carousel', u'http://feed43.com/7106317222455380.xml'),
        (u'This Weeks Issue', u'http://feed43.com/0323588208751786.xml'),
        (u'Cool Stuff', u'http://feed43.com/6253845228768456.xml'),
        (u'Style', u'http://feed43.com/7217107577215678.xml'),
        (u'Films', u'http://feed43.com/3101308515277265.xml'),
        (u'Music', u'http://feed43.com/2416400550560162.xml'),
        (u'TV', u'http://feed43.com/4781172470717123.xml'),
        (u'Sport', u'http://feed43.com/5303151885853308.xml'),
        (u'Gaming', u'http://feed43.com/8883764600355347.xml'),
        (u'Women', u'http://feed43.com/2648221746514241.xml'),
        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
        # (u'Articles', u'http://feed43.com/3428534448355545.xml'),
    ]
scissors is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Tagesschau - Update of recipe a.peter Recipes 3 05-26-2013 03:57 PM
Kurier recipe update clanger9 Recipes 4 02-13-2012 12:52 AM
update recipe cnd.org derekliang Recipes 1 12-14-2011 01:46 AM
Kurier recipe update clanger9 Recipes 0 09-24-2011 09:45 AM
Books disappearing when added to Shortlist after update HarleyB Kobo Reader 25 09-02-2011 07:02 PM


All times are GMT -4. The time now is 05:09 PM.


MobileRead.com is a privately owned, operated and funded community.