Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 10-19-2012, 10:49 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 206
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
countryfile 19/10/12

Now uses Kovid's ignore-duplicate-articles routine.

Spoiler:
Code:
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
import mechanize
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
#from hashlib import md5
#import urlparse


#declare global temp file
#Feeds_File = config_dir+'\\feeds.txt'

# needed for getting rid of repeat feeds

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre news recipe for Countryfile.com.

    Articles are pulled from the RSS feeds listed in ``feeds``.  Repeated
    articles are filtered through the ``ignore_duplicate_articles`` setting
    (matching on title and URL), and the cover image is scraped from the
    site's home page by ``get_cover_url``.
    """

    title = u'Countryfile.com'
    __author__ = 'Dave Asbury'
    description = 'The official website of Countryfile Magazine'
    # last updated 19/10/12
    language = 'en_GB'
    oldest_article = 30
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}

    remove_tags = []

    feeds = [
        (u'Homepage', u'http://www.countryfile.com/rss/home'),
        (u'Country News', u'http://www.countryfile.com/rss/news'),
        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
    ]

    # Page that is scraped for the current cover image.
    _HOME_URL = 'http://www.countryfile.com/'
    # Known cover image, used whenever no usable cover can be scraped.
    _FALLBACK_COVER_URL = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'

    def get_cover_url(self):
        """Return the URL of the cover image to use for this download.

        The home page is searched for an element whose ``class`` matches
        ``imagecache imagecache-160px_wide``; the first URL found in that
        element's markup is probed with a non-redirecting request.

        Returns:
            The scraped cover URL if one was found and could be opened,
            otherwise the fixed fallback cover URL.
        """
        # Local import: ``re`` is not imported at file level, so the method
        # would otherwise fail with a NameError on its first regex call.
        import re

        soup = self.index_to_soup(self._HOME_URL)
        cover_tag = soup.find(
            attrs={'class': re.compile('imagecache imagecache-160px_wide')}
        )
        self.log.debug('Cover element: %s' % (cover_tag,))

        # Pull absolute URLs out of the element's serialized markup.
        found_urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(cover_tag),
        )
        if not found_urls:
            # Nothing to probe (element missing or carrying no URL).
            return self._FALLBACK_COVER_URL

        # Use the first match directly instead of slicing the list's repr,
        # which produced an unusable string when several URLs matched.
        candidate = found_urls[0]
        self.log.debug('Cover candidate: %s' % candidate)

        # Probe the candidate without following redirects; a redirect or any
        # other failure to open it means the known cover is used instead.
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(candidate)
        except Exception:
            # Deliberate best-effort: any fetch problem falls back to the
            # known cover, but KeyboardInterrupt/SystemExit still propagate.
            return self._FALLBACK_COVER_URL
        return candidate

    
#    def parse_feeds(self):
#      feeds = BasicNewsRecipe.parse_feeds(self)
 #       print 'create empty file'
 #       print
       
        #open and close empty file - otherwise crashes as you can't append a file that doesn't exist?

   #     read_file=open(Feeds_File,'w+')
    #    read_file.close()

        # repeat for all feeds
     #   for feed in feeds:
      #      print 'Feed file = ',Feeds_File
            
            # for each section do
     #       print
      #      print 'Feed section is ',feed.title
            # for each article in each section check if it's in the feeds file
      #      for article in feed.articles[:]:
      #           article_already_exists = False
                 
     #            print
                #open the file and reads lines of text
      #           read_file=open(Feeds_File)
      #           while 1:
        #                  line=read_file.readline()
       #                   print
        #                  print'****'
        #                  print 'Value of line:',line
         #                 print 'article.title is:',article.title
        #                  if str(line) == str(article.title+'\n'):
          #                   article_already_exists = True
         #                    print 'repeated article'
        #                     break
          #                print'*****'
          #                print                         
           #               # eof reached   
             #             if not line: break
                          
            #     read_file.close()
                 # couldn't find article so write it to file
         #        if article_already_exists == False:
           #         read_file=open(Feeds_File,'a')
           #         read_file.write(article.title+'\n')
          #          read_file.close()
                    
         #        if article_already_exists == True:
           #         article.url ='' # delete the url so won't download
       # return feeds
scissors is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
countryfile.com update 7/10/12 scissors Recipes 0 10-07-2012 01:55 PM
countryfile.com scissors Recipes 0 09-09-2012 04:07 AM
countryfile.com update scissors Recipes 0 04-15-2012 11:53 AM
BBC countryfile scissors Recipes 0 01-29-2012 04:08 PM


All times are GMT -4. The time now is 08:40 PM.


MobileRead.com is a privately owned, operated and funded community.