Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 10-19-2012, 10:17 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 203
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
the sun 19/10/12

Better cover URL fetch.

Spoiler:
Code:
import re, random

from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre news recipe for The Sun (UK tabloid).

    Fetches the News, Sport, Showbiz and Woman RSS feeds, drops a couple of
    known promotional articles, and tries to use the day's front page (as
    listed on politicshome.com) as the cover image.
    """

    title = u'The Sun UK'
    description = 'Recipe Author D.Asbury. Articles from The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 19/10/12 better cover fetch
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True

    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    encoding = 'UTF-8'
    remove_javascript = True
    no_stylesheets = True

    ignore_duplicate_articles = {'title', 'url'}

    extra_css  = '''
    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
           '''

    # Only the intro block, sub-headings and the article body are kept.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'intro'}),
        dict(name='h3'),
        dict(name='div', attrs={'id': 'articlebody'}),
    ]
    remove_tags_after = [dict(id='bodyText')]
    remove_tags = [
        dict(name='li'),
        dict(attrs={'class': 'grid-4 right-hand-column'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    # Articles to drop, as (upper-cased title fragment, URL fragment) pairs.
    # The title fragments are upper case because they are compared against
    # the upper-cased article title.
    _unwanted_articles = (
        ('TRY OUT THE SUN', 'Try-out-The-Suns'),
        ('WEB PORN HARMS KIDS', 'Sun-says-Web-porn'),
    )

    # Page listing the newspaper front pages, and the inline style that
    # identifies The Sun's button on that page.
    _front_pages_index = 'http://www.politicshome.com/uk/latest_frontpage.html'
    _sun_button_style = 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'

    # Matches an absolute http(s) URL inside a chunk of markup.
    _url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Covers used when the real front page cannot be located or fetched.
    _fallback_covers = (
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
        'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
    )

    def _is_unwanted(self, article):
        """Return True when the article's title or URL matches an unwanted entry."""
        title = (article.title or '').upper()
        url = article.url or ''
        for title_part, url_part in self._unwanted_articles:
            if title_part in title or url_part in url:
                return True
        return False

    # Article filtering credited in the original recipe as "starsons code".
    def parse_feeds(self):
        """Parse the feeds as usual, then strip the unwanted promo articles.

        Returns the list of feeds produced by ``BasicNewsRecipe.parse_feeds``
        with matching articles removed in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy because the list is modified in the loop.
            for article in feed.articles[:]:
                # One combined test per article, so it is removed at most once.
                if self._is_unwanted(article):
                    self.log('Skipping unwanted article:', article.title)
                    feed.articles.remove(article)
        return feeds

    def _find_front_page_image(self):
        """Return the URL of today's front page image, or None if not found."""
        index = self.index_to_soup(self._front_pages_index)
        # Look for the block containing The Sun button and its link.
        button = index.find(attrs={'style': self._sun_button_style})
        if button is None:
            return None

        # NOTE(review): the original sliced str(tag)[9:-133], which looks like
        # it extracts the href of an <a> tag - confirm against the live page.
        # The slice is kept as a fallback so behaviour is never worse.
        href = button.get('href')
        if not href:
            href = str(button)[9:-133]

        # This page holds the large front page picture.
        page = self.index_to_soup('http://www.politicshome.com' + href)
        large = page.find(attrs={'id': 'large'})
        if large is None:
            return None

        # Take the first URL found in the element's markup.
        matches = re.findall(self._url_pattern, str(large))
        if not matches:
            return None
        return matches[0]

    def get_cover_url(self):
        """Return the cover image URL.

        The front page image is used when it can be located and opened
        without a redirect; otherwise one of the fallback covers is picked
        at random.
        """
        cover_url = None
        try:
            cover_url = self._find_front_page_image()
            if cover_url:
                self.log('Front page cover candidate:', cover_url)
                br = browser()
                # With redirects disabled, a redirecting URL raises and so
                # counts as unusable.
                br.set_handle_redirect(False)
                br.open_novisit(cover_url)
        except Exception as err:
            self.log.warn('Could not fetch front page cover:', err)
            cover_url = None

        if not cover_url:
            cover_url = random.choice(self._fallback_covers)
        return cover_url
scissors is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
The Sun UK - update 6/10/12 scissors Recipes 0 10-06-2012 11:46 AM
Sun UK Update 20/2/12 scissors Recipes 4 04-07-2012 08:44 AM
The Sun UK scissors Recipes 8 11-03-2011 05:43 AM
Problems with the sun mokel22 enTourage eDGe 2 07-10-2011 04:25 PM
Sun Fading SanAntone Amazon Kindle 23 07-08-2009 06:36 PM


All times are GMT -4. The time now is 05:48 PM.


MobileRead.com is a privately owned, operated and funded community.