10-07-2012, 11:27 AM   #1
scissors
Addict
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
shortlist update

Feed43 corrections, plus removal of duplicate articles: titles already seen are tracked in a feeds.txt file in the calibre config directory, and repeated articles have their URLs blanked so they are not downloaded again.

Code:
import os
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.constants import config_dir

# temp file (in the calibre config directory) used to remember article
# titles already seen, so articles repeated across feeds can be dropped
Feeds_File = os.path.join(config_dir, 'feeds.txt')

class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    title          = u'Shortlist'
    description = 'Articles From Shortlist.com'
    # the website updates weekly, so keep articles up to 8 days old
    oldest_article = 8
    max_articles_per_feed = 20
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
    #global variables required for getting rid of duplicate articles
    article_already_exists = False

    __author__ = 'Dave Asbury'
    # last updated 7/10/12
    language = 'en_GB'
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.shortlist.com')
        # the cover image is the element with width="121" on the home page
        cov = soup.find(attrs={'width': '121'})
        # cover_url = 'http://www.shortlist.com' + cov['src']
        cover_url = cov['src']
        return cover_url

    masthead_url = 'http://www.mediauk.com/logos/100/344096.png'

    preprocess_regexps = [
    (re.compile(r'…or.*?email to your friends</a>.', re.IGNORECASE | re.DOTALL), lambda match: '')]

    keep_only_tags = [
        #dict(name='h1'),
        dict(name='h2', attrs={'class': 'title'}),
        dict(name='h3', attrs={'class': 'subheading'}),
        dict(attrs={'class': ['hero-static', 'stand-first']}),
        dict(attrs={'class': 'hero-image'}),
        dict(name='div', attrs={'id': ['list', 'article', 'article alternate']}),
        dict(name='div', attrs={'class': 'stand-first'}),
    ]

    remove_tags = [
        dict(name='h2', attrs={'class': 'graphic-header'}),
        dict(attrs={'id': ['share', 'twitter', 'facebook', 'digg', 'delicious', 'facebook-like']}),
        dict(attrs={'class': ['related-content', 'related-content-item', 'related-content horizontal', 'more']}),
    ]

    remove_tags_after = [dict(name='p', attrs={'id': 'tags'})]

    feeds = [
        # edit http://feed43.com/feed.html?name=3156308700147005
        # repeatable pattern = <h3>{_}<a href="{%}">{%}</a>{*}</h3>
        (u"This Week's Issue", u'http://feed43.com/5205766657404804.xml'),
        (u'Home Page', u'http://feed43.com/3156308700147005.xml'),
        (u'Cool Stuff', u'http://feed43.com/1557051772026706.xml'),
        (u'Style', u'http://feed43.com/4168836374571502.xml'),
        (u'Entertainment', u'http://feed43.com/4578504030588024.xml'),
    ]

    
    print '@@@@@@@',Feeds_File
    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        print 'create empty file'
        print

        # create/truncate the tracking file - appending later would fail
        # if the file did not already exist
        read_file = open(Feeds_File, 'w+')
        read_file.close()

        # repeat for all feeds
        for feed in feeds:
            print 'Feed file = ', Feeds_File

            # for each section do
            print
            print 'Feed section is ', feed.title
            # for each article in each section, check if it is already in the feeds file
            for article in feed.articles[:]:
                article_already_exists = False

                print
                # open the file and read its lines of text
                read_file = open(Feeds_File)
                while True:
                    line = read_file.readline()
                    print
                    print '****'
                    print 'Value of line:', line
                    print 'article.title is:', article.title
                    if str(line) == str(article.title + '\n'):
                        article_already_exists = True
                        print 'repeated article'
                        break
                    print '*****'
                    print
                    # eof reached
                    if not line: break

                read_file.close()
                if not article_already_exists:
                    # article not found, so record its title in the file
                    read_file = open(Feeds_File, 'a')
                    read_file.write(article.title + '\n')
                    read_file.close()
                else:
                    # duplicate: blank the url so the article won't be downloaded
                    article.url = ''
        return feeds
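
The recipe above removes duplicates by writing every title it has seen to feeds.txt and re-reading that file line by line for each article. Since the file is truncated at the start of every run anyway, the same effect can be had with an in-memory set. Below is a minimal sketch of that alternative, assuming the class name, feeds and tag filters are the ones from the recipe above and only parse_feeds changes:

Code:
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1324663493(BasicNewsRecipe):
    # ... title, feeds, keep_only_tags etc. as in the recipe above ...

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        seen_titles = set()
        for feed in feeds:
            for article in feed.articles[:]:
                if article.title in seen_titles:
                    # duplicate: blank the url so calibre skips the download
                    article.url = ''
                else:
                    seen_titles.add(article.title)
        return feeds
Within a single download run this should behave the same as the file-based version: the first occurrence of a title is kept and later occurrences have their url blanked, without any file I/O.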