View Single Post
Old 10-06-2012, 11:46 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
The Sun UK - update 6/10/12

Added Starson's code to remove article "Sun Says web porn harms our kids" always being first article

Spoiler:


Code:
import re, random

from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1325006965(BasicNewsRecipe):

    title          = u'The Sun UK'
    description = 'Recipe Author D.Asbury. UK'
    __author__ = 'Dave Asbury'
    # last updated 6/10/12 added starsons remove article code
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    encoding = 'UTF-8'
    remove_javascript     = True
    no_stylesheets = True
    
    

    extra_css  = '''
    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
           '''
    keep_only_tags = [
		dict(name='div',attrs={'class' : 'intro'}), 
                                dict(name='h3'),
		dict(name='div',attrs={'id' : 'articlebody'}),
           #dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
           #                dict(name='div',attrs={'class' : 'cf'}),
          # dict(attrs={'title' : 'download flash'}),
          #                 dict(attrs={'style' : 'padding: 5px'})

           ]
    remove_tags_after = [dict(id='bodyText')]
    remove_tags=[
	              dict(name='li'),
                              dict(attrs={'class' : 'grid-4 right-hand-column'}),
		]

    feeds          = [
    (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
    (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
    (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
    (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]
# starsons code
    def parse_feeds (self): 
      feeds = BasicNewsRecipe.parse_feeds(self) 
      for feed in feeds:
        for article in feed.articles[:]:
          print 'article.title is: ', article.title
          if 'Web porn harms kids' in article.title.upper() or 'The-Sun-says' in article.url:
            feed.articles.remove(article)
      return feeds

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # look for the block containing the sun button and url
        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})

        #cov = soup.find(attrs={'id' : 'large'})
        cov2 = str(cov)

        cov2='http://www.politicshome.com'+cov2[9:-133]
        #cov2 now contains url of the page containing pic
        #cov2 now contains url of the page containing pic
        soup = self.index_to_soup(cov2)
        cov = soup.find(attrs={'id' : 'large'})
        cov2 = str(cov)
        cov2=cov2[27:-18]
        #cov2 now is pic url, now  go back to original function
        print "**** cov2 =",cov2,"****"
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = cov2
        except:
            cover_url = random.choice([
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
                ])

        return cover_url
scissors is offline   Reply With Quote