MobileRead Forums - View Single Post

scissors · 12-24-2011, 05:53 AM

Shortlist doesn't provide RSS.
I just can't get my head around the use of soup to extract links.
www.feed43.com allowed me to generate RSS feeds for it instead.

Here is the recipe - it produces a 7 MB file covering most of the site.
Personally, I think it's good for us non-Python folks to be able to get feeds from difficult sites.

Anyone agree?

Here's the recipe

Spoiler:

Code:

import re
from calibre import __appname__
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    """Calibre news recipe for Shortlist.

    Articles are discovered through the feed43.com feeds listed in
    ``feeds`` (one per site section).  Page content is then trimmed with
    ``keep_only_tags`` / ``remove_tags`` / ``remove_tags_after``, and a
    regular expression strips the "email to your friends" blurb.
    """

    title = u'Shortlist'
    __author__ = 'Dave Asbury'
    # last updated 24/12/11
    language = 'en_GB'

    oldest_article = 7
    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True

    cover_url = 'http://www.originalpenguin.eu/wp-content/uploads/2010/05/shortlist-cover.jpg'
    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    # Automatic clean-up is left disabled; the explicit tag lists below
    # are used instead.
    #auto_cleanup_keep = '//*[@class="hero-image"]'
    #auto_cleanup_keep = '//*[@class="article "]'
    #auto_cleanup = True

    # Remove everything from "…or" up to and including the character that
    # follows the "email to your friends" link.
    preprocess_regexps = [
        (
            re.compile(
                r'…or.*?email to your friends</a>.',
                re.IGNORECASE | re.DOTALL,
            ),
            lambda match: '',
        ),
    ]

    # Elements to keep.  Every spec uses the ``name`` / ``attrs`` keys; the
    # h3 entry previously had ``atts`` (a typo), so its class filter was
    # never applied as intended.
    keep_only_tags = [
        dict(name='h1'),
        dict(name='h2', attrs={'class': 'title'}),
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
        #dict(name='p')
    ]

    # Elements to drop: section banners, social-sharing widgets and
    # related-content boxes.  The related-content entry previously used
    # ``atts`` instead of ``attrs`` and so did not target those classes.
    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    # Cut-off point: the paragraph with id "tags".
    remove_tags_after = [
        dict(name='p', attrs={'id': 'tags'}),
    ]

    # (section title, feed URL) pairs, all served by feed43.com.
    feeds = [
        (u'Instant Improver', u'http://feed43.com/1236541026275417.xml'),
        (u'Cool Stuff', u'http://feed43.com/6253845228768456.xml'),
        (u'Style', u'http://feed43.com/7217107577215678.xml'),
        (u'Films', u'http://feed43.com/3101308515277265.xml'),
        (u'Music', u'http://feed43.com/2416400550560162.xml'),
        (u'TV', u'http://feed43.com/4781172470717123.xml'),
        (u'Sport', u'http://feed43.com/5303151885853308.xml'),
        (u'Gaming', u'http://feed43.com/8883764600355347.xml'),
        (u'Women', u'http://feed43.com/2648221746514241.xml'),
        #(u'Articles', u'http://feed43.com/3428534448355545.xml')
    ]