View Single Post
Old 01-20-2014, 06:30 PM   #3
alex.x
Junior Member
alex.x began at the beginning.
 
Posts: 1
Karma: 10
Join Date: Jan 2014
Device: kindle
The Guardian

Since 1 Jan 2014 the Guardian has had a new web address. I have managed to put together this recipe; it is not as good as the original, but it works OK:

class AdvancedUserRecipe1388882568(BasicNewsRecipe):
    """Calibre news recipe that builds an e-book from Guardian RSS feeds.

    The class only sets recipe options (feeds, clean-up rules, image
    handling, CSS) and overrides two hooks:

    * ``get_article_url`` -- drops entries whose URL marks them as
      non-article content (video, gallery, quiz, ...).
    * ``populate_article_metadata`` -- uses the first image of an article
      as its table-of-contents thumbnail when the running calibre version
      provides ``add_toc_thumbnail``.

    ``BasicNewsRecipe`` is not imported here; it is expected to be supplied
    by the environment that loads the recipe.
    """

    title = u"Alex's Guardian"

    # NOTE(review): base_url and cover_pic are not referenced anywhere in
    # this class; presumably left over from the recipe this was derived from.
    base_url = "http://www.theguardian.com/theguardian"
    cover_pic = 'Guardian digital edition'
    masthead_url = 'http://static.guim.co.uk/static/3a21a6225712e7df59854c0749abc6cffcf00ef2/common/images/logos/the-guardian/titlepiece.gif'
    oldest_article = 1
    max_articles_per_feed = 100

    auto_cleanup = True
    auto_cleanup_keep = '//div[@id="main-content-picture"]'

    # Removes empty feeds
    remove_empty_feeds = True

    # (section title, feed URL) pairs.
    # NOTE(review): several URLs below do not end in '/rss' (Politics,
    # Comment, Science, Education, Culture, Saturday, Editorials and Reply,
    # Travel) and the 'Saturday' URL contains a hard-coded date
    # (2014/jan/04). They may not resolve to a feed -- confirm against the
    # site before relying on these sections. URLs are kept as posted.
    feeds = [
        (u'Top Stories', u'http://www.theguardian.com/theguardian/mainsection/topstories/rss'),
        (u'UK News', u'http://feeds.theguardian.com/theguardian/uk-news/rss'),
        (u'World', u'http://www.theguardian.com/world/rss'),
        (u'Politics', u'http://www.theguardian.com/politics'),
        (u'Comment', u'http://www.theguardian.com/uk/commentisfree'),
        (u'Science', u'http://www.theguardian.com/science'),
        (u'Education', u'http://www.theguardian.com/education'),
        (u'Culture', u'http://www.theguardian.com/uk/culture'),
        (u'Environment', u'http://www.theguardian.com/environment/rss'),
        (u'Technology', u'http://feeds.theguardian.com/theguardian/technology/rss'),
        (u'Saturday', u'http://www.theguardian.com/theguardian/2014/jan/04/mainsection/saturday'),
        (u'Money', u'http://www.theguardian.com/uk/money/rss'),
        (u'Editorials and Reply', u'http://www.theguardian.com/theguardian/mainsection/editorialsandreply'),
        (u'Obituaries', u'http://www.theguardian.com/tone/obituaries/rss'),
        (u'Reviews', u'http://www.theguardian.com/theguardian/guardianreview/rss'),
        (u'Travel', u'http://www.theguardian.com/travel'),
        (u'G2', u'http://www.theguardian.com/theguardian/g2/rss'),
    ]

    timefmt = ' [%a, %d %b %Y]'

    # Page elements stripped from every downloaded article.
    remove_tags = [
        dict(name='div', attrs={'class': ["video-content", "videos-third-column"]}),
        dict(name='div', attrs={'id': ["article-toolbox", "subscribe-feeds"]}),
        dict(name='div', attrs={'class': ["guardian-tickets promo-component"]}),
        dict(name='ul', attrs={'class': ["pagination"]}),
        dict(name='ul', attrs={'id': ["content-actions"]}),
        # article history link
        dict(name='a', attrs={'class': ["rollover history-link"]}),
        # "a version of this article ..." spiel
        dict(name='div', attrs={'class': ['section']}),
        # "about this article" js dialog
        dict(name='div', attrs={'class': ["share-top"]}),
        # author picture
        dict(name='img', attrs={'class': ["contributor-pic-small"]}),
        # embedded videos/captions
        dict(name='span', attrs={'class': ['inline embed embed-media']}),
        # To strip every image as well, add: dict(name='img')
    ]
    use_embedded_content = False

    # Ignore duplicates of articles that are present in more than one section.
    # A duplicate article is an article that has the same title and/or URL.
    # To ignore articles with the same title, set this to:
    #     ignore_duplicate_articles = {'title'}
    # To use URLs instead, set it to:
    #     ignore_duplicate_articles = {'url'}
    # To match on title or URL, set it to:
    ignore_duplicate_articles = {'title', 'url'}

    # Rescale images to fit in the device screen dimensions set by the output
    # profile. Ignored if no output profile is set.
    scale_news_images_to_device = True

    # Maximum dimensions (w,h) to scale images to. If
    # scale_news_images_to_device is True this is set to the device screen
    # dimensions set by the output profile unless there is no profile set, in
    # which case it is left at whatever value it has been assigned
    # (default None).
    scale_news_images = None

    # The factor used when auto compressing jpeg images. If set to None,
    # auto compression is disabled. Otherwise, the images will be reduced in
    # size to (w * h)/compress_news_images_auto_size bytes if possible by
    # reducing the quality level, where w x h are the image dimensions in
    # pixels. The minimum jpeg quality will be 5/100 so it is possible this
    # constraint will not be met. This parameter can be overridden by the
    # parameter compress_news_images_max_size which provides a fixed maximum
    # size for images. Note that if you enable scale_news_images_to_device
    # then the image will first be scaled and then its quality lowered until
    # its size is less than (w * h)/factor where w and h are now the *scaled*
    # image dimensions. In other words, this compression happens after
    # scaling.
    compress_news_images_auto_size = 16

    no_stylesheets = True
    # The CSS lines are deliberately kept at column 0 so the string value is
    # identical to the originally posted recipe.
    extra_css = '''
.article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;}
.h1{font-size: large ;font-family:georgia,serif; font-weight:bold;}
.stand-first-alone{color:#040404; font-size:small; font-family:Arial,Helvetica,sans-serif;}
.caption{color:#040404; font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
#article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
.main-article-info{font-family:Arial,Helvetica,sans-serif;}
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
'''

    # Substrings that mark a URL as non-article content to be skipped.
    _SKIPPED_URL_MARKERS = (
        '/video/',
        '/flyer/',
        '/quiz/',
        '/gallery/',
        'ivebeenthere',
        'pickthescore',
        'audioslideshow',
    )

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        The URL is taken from the entry's ``guid``. None is returned when the
        entry has no guid, or when the guid contains any of
        ``_SKIPPED_URL_MARKERS``.

        :param article: feed entry; only ``article.get('guid', None)`` is used.
        :return: the guid string, or None.
        """
        url = article.get('guid', None)
        if url is None:
            # Without this guard the substring tests below raise TypeError
            # for entries that carry no guid.
            return None
        if any(marker in url for marker in self._SKIPPED_URL_MARKERS):
            return None
        return url

    def populate_article_metadata(self, article, soup, first):
        """Register the first image in ``soup`` as the article's TOC thumbnail.

        Nothing is done unless ``first`` is truthy and this recipe object has
        an ``add_toc_thumbnail`` attribute (the check keeps the recipe usable
        where that method is absent).

        :param article: article object passed through to ``add_toc_thumbnail``.
        :param soup: parsed page; must provide ``find('img')``.
        :param first: flag supplied by the caller -- presumably true for the
            first page of an article; verify against the calibre API docs.
        """
        if not (first and hasattr(self, 'add_toc_thumbnail')):
            return
        image = soup.find('img')
        if image is None:
            return
        # Use .get so an <img> without a src attribute is skipped instead of
        # raising KeyError.
        src = image.get('src')
        if src:
            self.add_toc_thumbnail(article, src)
alex.x is offline   Reply With Quote