Uses the site's RSS feeds.
Added duplicate-article removal.
Spoiler:
Code:
import os

from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe
# Scratch file (in calibre's config directory) used by parse_feeds() to
# record article titles already seen during a run.  os.path.join is used
# instead of the original hard-coded '\\' separator so the recipe also
# works on non-Windows platforms.
Feeds_File = os.path.join(config_dir, 'feeds.txt')
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    # FHM UK recipe.  Builds the ebook from the site's feedsportal RSS feeds
    # and suppresses duplicate articles: an article whose title has already
    # been seen in an earlier feed gets its URL blanked so it is not
    # downloaded a second time.
    title = u'FHM UK'
    description = 'Author D.Asbury. Using feed43 Good News for Men.'
    cover_url = 'http://www.greatmagazines.co.uk/covers/large/w197/current/fhm.jpg'
    masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
    __author__ = 'Dave Asbury'
    # last updated 7/10/12
    language = 'en_GB'
    oldest_article = 31
    max_articles_per_feed = 15
    remove_empty_feeds = True
    no_stylesheets = True

    # Legacy flag from the original implementation; kept so any external
    # reader of this attribute still finds it.  parse_feeds() no longer
    # uses it.
    article_already_exists = False

    keep_only_tags = [
        dict(name='h1'),
        dict(name='img', attrs={'id': 'ctl00_Body_imgMainImage'}),
        dict(name='div', attrs={'id': ['profileLeft', 'articleLeft', 'profileRight', 'profileBody']}),
        dict(name='div', attrs={'class': ['imagesCenterArticle', 'containerCenterArticle', 'articleBody']}),
    ]
    remove_tags = [
        dict(attrs={'id': ['ctl00_Body_divSlideShow']}),
    ]

    feeds = [
        # repeatable search = </div>{|}<a href="{%}" class="{*}">{%}</a>{|}<p>{*}</p>
        (u'Homepage', u'http://rss.feedsportal.com/c/375/f/434908/index.rss'),
        (u'Funny', u'http://rss.feedsportal.com/c/375/f/434910/index.rss'),
        (u'Girls', u'http://rss.feedsportal.com/c/375/f/434913/index.rss'),
    ]

    def parse_feeds(self):
        """Fetch the feeds, then drop duplicate articles across sections.

        The first occurrence of each article title is kept; any later
        occurrence has its url set to '' so calibre skips the download.
        As in the original code, the scratch file Feeds_File is truncated
        at the start of every run and filled with the unique titles (useful
        for debugging which articles were kept).
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        # Single in-memory set replaces the original per-article linear
        # re-scan of the scratch file (which was O(n^2) file I/O).
        seen_titles = set()
        with open(Feeds_File, 'w') as record:
            for feed in feeds:
                for article in feed.articles[:]:
                    if article.title in seen_titles:
                        article.url = ''  # blank url -> article won't download
                    else:
                        seen_titles.add(article.title)
                        record.write(article.title + '\n')
        return feeds

    extra_css = '''
    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''