View Single Post
Old 10-07-2012, 12:55 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 204
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Countryfile.com recipe update, 7/10/12

Adds duplicate article removal and cover fetching.

Spoiler:
Code:
import os
import os.path
import re
import urllib
from hashlib import md5

import mechanize
from calibre import browser
from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe

# Scratch file, kept in calibre's config directory, for the bookkeeping
# needed to get rid of repeated feed articles.  Built with os.path.join so
# the separator is right on every platform, not only Windows.
Feeds_File = os.path.join(config_dir, 'feeds.txt')

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre news recipe for Countryfile.com.

    Reads the site's RSS feeds, looks up the current cover image on the home
    page, and blanks the URL of any article whose title has already been seen
    in this run so that the same article is not fetched from several feeds.
    """

    title = u'Countryfile.com'
    __author__ = 'Dave Asbury'
    description = 'The official website of Countryfile Magazine'
    # last updated 7/10/12
    language = 'en_GB'
    oldest_article = 30
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True
    auto_cleanup = True
    # Kept from the original recipe; nothing in this class reads them.
    article_already_exists = False
    feed_hash = ''

    remove_tags = []

    feeds = [
        (u'Homepage', u'http://www.countryfile.com/rss/home'),
        (u'Country News', u'http://www.countryfile.com/rss/news'),
        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
    ]

    def get_cover_url(self):
        """Return the URL of the cover image to use.

        The home page is searched for the 160px-wide ``imagecache`` element
        and its ``src`` attribute is probed with one request.  When the
        element is missing, has no ``src``, or the request raises, a known
        cover URL is returned instead.
        """
        fallback = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'

        soup = self.index_to_soup('http://www.countryfile.com/')
        cov = soup.find(attrs={
            'width': '160',
            'class': re.compile('imagecache imagecache-160px_wide'),
        })
        print('********  %s  ***' % (cov,))

        # Read the src attribute directly: slicing str(cov) at fixed offsets
        # only works while the markup and the file name keep an exact length.
        cover_url = cov.get('src') if cov is not None else None
        print('********  %s  ***' % (cover_url,))
        if not cover_url:
            return fallback

        br = mechanize.Browser()
        # NOTE(review): redirect handling is switched off before probing,
        # presumably so a redirected cover URL counts as a failure - confirm.
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cover_url)
        except Exception:
            # Any failure to fetch the scraped URL means: use the known cover.
            return fallback
        return cover_url

    def parse_feeds(self):
        """Parse the feeds, then neutralise articles repeated across them.

        Returns the feed list produced by ``BasicNewsRecipe.parse_feeds``.
        The first article seen with a given title is left untouched; every
        later article with the same title gets an empty ``url``.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Titles seen so far in this run.  An in-memory set replaces the
        # scratch file that used to be truncated and re-read for every
        # article, which was quadratic and failed on non-ASCII titles.
        seen_titles = set()

        for feed in feeds:
            print('Feed section is %s' % (feed.title,))
            for article in feed.articles:
                if article.title in seen_titles:
                    print('repeated article')
                    # Per the original author's note: with the url deleted
                    # the article won't be downloaded.
                    article.url = ''
                else:
                    seen_titles.add(article.title)
        return feeds

Last edited by scissors; 10-07-2012 at 01:01 PM.
scissors is offline   Reply With Quote