View Single Post
Old 10-07-2012, 12:09 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 204
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
FHM update 7/10/12

use sites rss feeds
added duplication removal

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE

   #declare global temp file
Feeds_File = config_dir+'\\feeds.txt'

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    title          = u'FHM UK'
    description = 'Author D.Asbury. Using feed43 Good News for Men.'
    cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
    #   cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
    __author__ = 'Dave Asbury'
    # last updated 7/10/12
    language = 'en_GB'
    oldest_article = 31
    max_articles_per_feed = 15
    remove_empty_feeds = True
    no_stylesheets = True
    #auto_cleanup = True
   # articles_are_obfuscated = True

   #global variables required for getting rid of duplicate articles
    article_already_exists = False


# needed for getting rid of repeat feeds

    keep_only_tags = [
               dict(name='h1'),
               dict(name='img',attrs={'id' : 'ctl00_Body_imgMainImage'}),
               dict(name='div',attrs={'id' : ['profileLeft','articleLeft','profileRight','profileBody']}),
               dict(name='div',attrs={'class' : ['imagesCenterArticle','containerCenterArticle','articleBody',]}),

        ]

    remove_tags    = [
                              dict(attrs={'id' : ['ctl00_Body_divSlideShow' ]}),

    ]
    feeds          = [
                        # repeatable search = </div>{|}<a href="{%}" class="{*}">{%}</a>{|}<p>{*}</p>
    	(u'Homepage',u'http://rss.feedsportal.com/c/375/f/434908/index.rss'),
    	(u'Funny',u'http://rss.feedsportal.com/c/375/f/434910/index.rss'),
    	(u'Girls',u'http://rss.feedsportal.com/c/375/f/434913/index.rss'),
]

    
    print '@@@@@@@',Feeds_File
    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        print 'create empty file'
        print
       
        #open and close empty file - otherwise crashes as you can't append a file that doesn't exist?

        read_file=open(Feeds_File,'w+')
        read_file.close()

        # repeat for all feeds
        for feed in feeds:
            print 'Feed file = ',Feeds_File
            
            # for each section do
            print
            print 'Feed section is ',feed.title
            # for each artcile in each section check if it's in the feeds file
            for article in feed.articles[:]:
                 article_already_exists = False
                 
                 print
                #open the file and reads lines of text
                 read_file=open(Feeds_File)
                 while 1:
                          line=read_file.readline()
                          print
                          print'****'
                          print 'Value of line:',line
                          print 'article.title is:',article.title
                          if str(line) == str(article.title+'\n'):
                             article_already_exists = True
                             print 'repeated article'
                             break
                          print'*****'
                          print                         
                          # eof reached   
                          if not line: break
                          
                 read_file.close()
                 # couldn't find article so write it to file
                 if article_already_exists == False:
                    read_file=open(Feeds_File,'a')
                    read_file.write(article.title+'\n')
                    read_file.close()
                 if article_already_exists == True:
                    article.url ='' # delete the url so won't download
        return feeds


    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		'''
scissors is offline   Reply With Quote