The Age Feed - repeat articles

Quasii · 03-09-2011, 12:35 AM

Hi Guys,

I am a massive fan of Calibre, especially the news feeds! The only question I have is in regard to The Age feed. Each day I get the feed delivered to my Kindle, but I have noticed that it is repeating a lot of the articles from previous days. Is there a way to change the recipe so that it only downloads articles from that day?

I have no knowledge of Python so please speak slowly

Starson17 · 03-09-2011, 08:44 AM

Quote:

Originally Posted by Quasii

Hi Guys,

I am a massive fan of Calibre, especially the news feeds! The only question I have is in regard to The Age feed. Each day I get the feed delivered to my Kindle, but I have noticed that it is repeating a lot of the articles from previous days. Is there a way to change the recipe so that it only downloads articles from that day?

I have no knowledge of Python so please speak slowly

Short answer: No
Longer answer: you need to understand Python and recipes well enough to implement this: https://www.mobileread.com/forums/sho...5&postcount=10

Quasii · 03-09-2011, 06:38 PM

Thanks for your quick reply.

Your link was helpful; the only thing holding me back now is my complete lack of Python knowledge.

If there are any bored Python experts out there, I'd be greatly appreciative, but I won't be holding my breath. (And sorry, I don't know how to open one of those funky code boxes like everyone else does.)

original recipe for the Age -
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'
'''
theage.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re
class TheAge(BasicNewsRecipe):
    '''
    Recipe that builds an issue of The Age from the text-mode index page at
    http://www.theage.com.au/text/.

    Every ``<h3>`` on the index page starts a new section and every link that
    follows it becomes an article of that section.
    '''

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0
    remove_tags = [
        dict(name=['table', 'script', 'noscript', 'style']),
        dict(name='a', attrs={'href':'/'}),
        dict(name='a', attrs={'href':'/text/'}),
    ]

    def get_browser(self):
        '''Return the base recipe's browser with refresh handling switched off.'''
        # Resolve the base implementation through super() so the call binds
        # correctly whether the base class declares get_browser as an instance
        # method or as a classmethod.
        br = super(TheAge, self).get_browser()
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        '''
        Scrape the text-mode index page and return a list of
        ``(section_title, [article_dict, ...])`` tuples.

        Sections named in ``feedSort`` come first, in that order; any other
        section found on the page is appended afterwards.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        section = None
        sections = {}

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                sections[section] = []
            # Make sure to skip: <a href="/">TheAge</a>
            elif section and tag.has_key('href') and len(tag['href'].strip()) > 1:
                url = tag['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.theage.com.au' + url
                title = self.tag_to_string(tag)
                sections[section].append({
                    'title': title,
                    'url' : url,
                    'date' : strftime('%a, %d %b'),
                    'description' : '',
                    'content' : '',
                })

        # Insert feeds in specified order, if available
        feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment' ]

        # pop() both emits the section and removes it from the leftovers.  The
        # membership test matters: a preferred section may be missing from the
        # page, and deleting it unconditionally would raise KeyError.
        feeds = [(name, sections.pop(name)) for name in feedSort if name in sections]

        # Append what is left over...
        feeds.extend(sections.items())

        return feeds

    def get_cover_url(self):
        '''
        Return the URL of today's front-page PDF as linked from the
        "todays-paper" page, or None when no such link is found.
        '''
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())

        for anchor in soup.findAll('a'):
            # Anchors without an href attribute (e.g. named anchors) must be
            # skipped instead of raising KeyError.
            href = anchor.get('href')
            if href and re.match(r'http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
                return href

        return None

    def preprocess_html(self, soup):
        '''
        Clean up the paragraphs of a downloaded article.

        Paragraphs that only hold the remains of the text-mode navigation bar
        are removed, and the copyright notice is set in a small font.
        Returns the (modified) soup.
        '''
        for p in soup.findAll('p'):

            # Collapse the paragraph by joining the non-tag contents
            text_nodes = [i for i in p.contents if isinstance(i, unicode)]
            if not text_nodes:
                continue
            contents = ''.join(text_nodes)

            # Filter out what's left of the text-mode navigation stuff:
            # optional whitespace / "&nbsp;" around a bracket pair that holds
            # nothing but "|", "*" and whitespace.
            if re.match(r'((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$', contents):
                p.extract()
                continue

            # Shrink the fine print font
            if contents == 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
                p['style'] = 'font-size:small'

        return soup

Fix on mobile read forum -

Code:

from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5

class OnlyLatestRecipe(BasicNewsRecipe):
    '''
    Recipe base that drops articles already seen on the previous run.

    For every feed a small state file is kept under
    ``<config_dir>/recipes/recipe_storage/<recipe title>/``; it lists one key
    per article present in the feed the last time the recipe ran.
    '''

    title = u'Unknown News Source'
    # Large limits: filtering is done by parse_feeds() below, not by age/count.
    oldest_article = 10000
    max_articles_per_feed = 10000
    feeds = [ ]

    def parse_feeds(self):
        '''
        Parse the feeds with the base class, then remove from each feed every
        article whose key was recorded by the previous run.

        An article's key is the MD5 of its content and summary, prefixed with
        its URL when it has one.  Each feed's state file is rewritten with the
        keys of the articles currently in the feed.  Empty feeds are removed
        when ``self.remove_empty_feeds`` is set.  Returns the list of feeds.
        '''
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        # '/' would be read as a path separator, so it is swapped for ':'.
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            # Percent-encode the whole title (safe='') to get a usable file name.
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Keys recorded by the previous run, one per line.
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            cur_items = set()
            # Iterate over a copy: articles are removed from the live list.
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            # Only the current keys are kept; older history is discarded.
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        if self.remove_empty_feeds:
            # Collect first, then remove, so the list is not mutated mid-scan.
            for empty in [f for f in feeds if len(f) == 0]:
                feeds.remove(empty)

        return feeds

Anyone's help would be appreciated.

03-09-2011, 12:35 AM	#1
Quasii Member Posts: 12 Karma: 10 Join Date: Feb 2011 Device: Kindle3	The Age Feed - repeat articles Hi Guys, I am a massive fan of Calibre! Especially the News Feeds; The only question I have is in regards to The Age feed. Each day I get the feed delivered to my kindle but I have noticed that it is repeating a lot of the articles for previous days. Is there a way to change the recipe so that it only downloads articles from that day? I have no knowledge of Python so please speak slowly

03-09-2011, 06:38 PM	#3
Quasii Member Posts: 12 Karma: 10 Join Date: Feb 2011 Device: Kindle3	Thanks for your quick reply. And your link was helpful, my complete lack of knowledge in regard to python is only holding me back now. if there is any bored python experts out there I'd be greatly appreciative, but won't be holding me breath. (and sorry I don't know how to open one of those funky code boxes like everyone else does) original recipe for the Age - #!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>' __docformat__ = 'restructuredtext en' ''' theage.com.au ''' from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup import re class TheAge(BasicNewsRecipe): title = 'The Age' description = 'Business News, World News and Breaking News in Melbourne, Australia' publication_type = 'newspaper' __author__ = 'Matthew Briggs' language = 'en_AU' max_articles_per_feed = 1000 recursions = 0 remove_tags = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})] def get_browser(self): br = BasicNewsRecipe.get_browser() br.set_handle_refresh(False) return br def parse_index(self): soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read()) section = None sections = {} for tag in soup.findAll(['h3', 'a']): if tag.name == 'h3': section = self.tag_to_string(tag) sections[section] = [] # Make sure to skip: <a href="/">TheAge</a> elif section and tag.has_key('href') and len(tag['href'].strip())>1: url = tag['href'].strip() if url.startswith('/'): url = 'http://www.theage.com.au' + url title = self.tag_to_string(tag) sections[section].append({ 'title': title, 'url' : url, 'date' : strftime('%a, %d %b'), 'description' : '', 'content' : '', }) feeds = [] # Insert feeds in specified order, if available feedSort = [ 'National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 
'Entertainment' ] for i in feedSort: if i in sections: feeds.append((i,sections[i])) # Done with the sorted feeds for i in feedSort: del sections[i] # Append what is left over... for i in sections: feeds.append((i,sections[i])) return feeds def get_cover_url(self): soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read()) for i in soup.findAll('a'): href = i['href'] if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf',href): return href return None def preprocess_html(self,soup): for p in soup.findAll('p'): # Collapse the paragraph by joining the non-tag contents contents = [i for i in p.contents if isinstance(i,unicode)] if len(contents): contents = ''.join(contents) # Filter out what's left of the text-mode navigation stuff if re.match('((\s)\|(\&nbsp\)\[[\\|\s]\]((\s)\|(\&nbsp\)$',contents): p.extract() continue # Shrink the fine print font if contents=='This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.': p['style'] = 'font-size:small' continue return soup Fix on mobile read forum - Code: from calibre.constants import config_dir, CONFIG_DIR_MODE import os, os.path, urllib from hashlib import md5 class OnlyLatestRecipe(BasicNewsRecipe): title = u'Unknown News Source' oldest_article = 10000 max_articles_per_feed = 10000 feeds = [ ] def parse_feeds(self): recipe_dir = os.path.join(config_dir,'recipes') hash_dir = os.path.join(recipe_dir,'recipe_storage') feed_dir = os.path.join(hash_dir,self.title.encode('utf-8').replace('/',':')) if not os.path.isdir(feed_dir): os.makedirs(feed_dir,mode=CONFIG_DIR_MODE) feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: feed_hash = urllib.quote(feed.title.encode('utf-8'),safe='') feed_fn = os.path.join(feed_dir,feed_hash) past_items = set() if os.path.exists(feed_fn): with file(feed_fn) as f: for h in f: past_items.add(h.strip()) cur_items = set() for article in feed.articles[:]: item_hash = md5() if 
article.content: item_hash.update(article.content.encode('utf-8')) if article.summary: item_hash.update(article.summary.encode('utf-8')) item_hash = item_hash.hexdigest() if article.url: item_hash = article.url + ':' + item_hash cur_items.add(item_hash) if item_hash in past_items: feed.articles.remove(article) with file(feed_fn,'w') as f: for h in cur_items: f.write(h+'\n') remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds] for f in remove: feeds.remove(f) return feeds Anyones help would be appreciated.

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Insert Hyperlinks in Feed Articles	Bushwil	Recipes	1	01-21-2011 02:51 PM
Sorting articles of RSS feed	miwie	Recipes	1	11-21-2010 01:02 AM
multiple repeat error converting HTML to MOBI	moog	Calibre	0	02-05-2010 01:03 PM
Lines repeat on page turn	XK143	Sony Reader	4	05-30-2009 04:43 PM
Sony: Will History Repeat?	Kingston	Sony Reader	67	01-18-2008 03:17 PM