Feeds with unicode titles are not identified properly
Hello,
in order to download feeds only once I implemented the solution from the receipes collection.
The solution works fine on most feeds but doesn't identify feeds with unicode characters automatically. According to Google, Unicode handling in Python 2 appears to be problematic.
This is my code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe
import os, os.path, urllib
from hashlib import md5
class OnlyLatestRecipe(BasicNewsRecipe):
title = 'DARC E13'
oldest_article = 10000
max_articles_per_feed = 10000
auto_cleanup = True
remove_empty_feeds = True
feeds = [
('Amateurfunk im Alterstal', 'http://www.amateurfunk-im-alstertal.de/?format=feed&type=atom'),
]
def parse_feeds(self):
recipe_dir = os.path.join(config_dir,'recipes')
hash_dir = os.path.join(recipe_dir,'recipe_storage')
feed_dir = os.path.join(hash_dir,self.title.decode('utf-8', 'replace').replace('/',':'))
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir,mode=CONFIG_DIR_MODE)
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
feed_hash = urllib.quote(feed.title.decode('utf-8', 'replace'),safe='')
feed_fn = os.path.join(feed_dir,feed_hash)
past_items = set()
if os.path.exists(feed_fn):
with file(feed_fn) as f:
for h in f:
past_items.add(h.strip())
cur_items = set()
for article in feed.articles[:]:
item_hash = md5()
if article.content: item_hash.update(article.content.decode('utf-8', 'replace'))
if article.summary: item_hash.update(article.summary.decode('utf-8', 'replace'))
item_hash = item_hash.hexdigest()
if article.url:
item_hash = article.url + ':' + item_hash
cur_items.add(item_hash)
if item_hash in past_items:
feed.articles.remove(article)
with file(feed_fn,'w') as f:
for h in cur_items:
f.write(h+'\n')
remove = [f for f in feeds if len(f) == 0 and
self.remove_empty_feeds]
for f in remove:
feeds.remove(f)
return feeds
It works fine except for 6 entries which are downloaded again, and again, and again...
It appears to me that theses entries do all have unciode characters in the URL and it seems like the hashes are not build/compared properly so these feeds are not identified as old feeds.
I already played around with unicode handling in the phyton code but I couldn't make it work yet.
This feed is open and fully accessible to anyone, it would therefore be nice if someone could help and point me to the bug in my script.
Thank you in advance!
-inte-
|