MobileRead Forums - View Single Post - Feeds with unicode titles are not identified properly

inte · 11-11-2017, 05:39 AM

Hello,

in order to download feeds only once I implemented the solution from the receipes collection.
The solution works fine on most feeds but doesn't identify feeds with unicode characters automatically. According to Google, Unicode handling in Python 2 appears to be problematic.
This is my code:

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe

import os, os.path, urllib
from hashlib import md5

class OnlyLatestRecipe(BasicNewsRecipe):
title = 'DARC E13'
oldest_article = 10000
max_articles_per_feed = 10000
auto_cleanup = True
remove_empty_feeds = True
feeds = [
('Amateurfunk im Alterstal', 'http://www.amateurfunk-im-alstertal.de/?format=feed&type=atom'),
]
def parse_feeds(self):
recipe_dir = os.path.join(config_dir,'recipes')
hash_dir = os.path.join(recipe_dir,'recipe_storage')
feed_dir = os.path.join(hash_dir,self.title.decode('utf-8', 'replace').replace('/',':'))
if not os.path.isdir(feed_dir):
os.makedirs(feed_dir,mode=CONFIG_DIR_MODE)

feeds = BasicNewsRecipe.parse_feeds(self)

for feed in feeds:
feed_hash = urllib.quote(feed.title.decode('utf-8', 'replace'),safe='')
feed_fn = os.path.join(feed_dir,feed_hash)

past_items = set()
if os.path.exists(feed_fn):
with file(feed_fn) as f:
for h in f:
past_items.add(h.strip())

cur_items = set()
for article in feed.articles[:]:
item_hash = md5()
if article.content: item_hash.update(article.content.decode('utf-8', 'replace'))
if article.summary: item_hash.update(article.summary.decode('utf-8', 'replace'))
item_hash = item_hash.hexdigest()
if article.url:
item_hash = article.url + ':' + item_hash
cur_items.add(item_hash)
if item_hash in past_items:
feed.articles.remove(article)
with file(feed_fn,'w') as f:
for h in cur_items:
f.write(h+'\n')

remove = [f for f in feeds if len(f) == 0 and
self.remove_empty_feeds]
for f in remove:
feeds.remove(f)

return feeds

It works fine except for 6 entries which are downloaded again, and again, and again...
It appears to me that theses entries do all have unciode characters in the URL and it seems like the hashes are not build/compared properly so these feeds are not identified as old feeds.
I already played around with unicode handling in the phyton code but I couldn't make it work yet.
This feed is open and fully accessible to anyone, it would therefore be nice if someone could help and point me to the bug in my script.

Thank you in advance!

-inte-

11-11-2017, 05:39 AM	#1
inte Member Posts: 11 Karma: 10 Join Date: Nov 2017 Device: Kindle Oasis	Feeds with unicode titles are not identified properly Hello, in order to download feeds only once I implemented the solution from the receipes collection. The solution works fine on most feeds but doesn't identify feeds with unicode characters automatically. According to Google, Unicode handling in Python 2 appears to be problematic. This is my code: #!/usr/bin/env python2 # vim:fileencoding=utf-8 from __future__ import unicode_literals, division, absolute_import, print_function from calibre.constants import config_dir, CONFIG_DIR_MODE from calibre.web.feeds.news import BasicNewsRecipe import os, os.path, urllib from hashlib import md5 class OnlyLatestRecipe(BasicNewsRecipe): title = 'DARC E13' oldest_article = 10000 max_articles_per_feed = 10000 auto_cleanup = True remove_empty_feeds = True feeds = [ ('Amateurfunk im Alterstal', 'http://www.amateurfunk-im-alstertal.de/?format=feed&type=atom'), ] def parse_feeds(self): recipe_dir = os.path.join(config_dir,'recipes') hash_dir = os.path.join(recipe_dir,'recipe_storage') feed_dir = os.path.join(hash_dir,self.title.decode('utf-8', 'replace').replace('/',':')) if not os.path.isdir(feed_dir): os.makedirs(feed_dir,mode=CONFIG_DIR_MODE) feeds = BasicNewsRecipe.parse_feeds(self) for feed in feeds: feed_hash = urllib.quote(feed.title.decode('utf-8', 'replace'),safe='') feed_fn = os.path.join(feed_dir,feed_hash) past_items = set() if os.path.exists(feed_fn): with file(feed_fn) as f: for h in f: past_items.add(h.strip()) cur_items = set() for article in feed.articles[:]: item_hash = md5() if article.content: item_hash.update(article.content.decode('utf-8', 'replace')) if article.summary: item_hash.update(article.summary.decode('utf-8', 'replace')) item_hash = item_hash.hexdigest() if article.url: item_hash = article.url + ':' + item_hash cur_items.add(item_hash) if item_hash in past_items: feed.articles.remove(article) with file(feed_fn,'w') as f: for h in cur_items: f.write(h+'\n') remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds] for f in remove: feeds.remove(f) return feeds It works fine except for 6 entries which are downloaded again, and again, and again... It appears to me that theses entries do all have unciode characters in the URL and it seems like the hashes are not build/compared properly so these feeds are not identified as old feeds. I already played around with unicode handling in the phyton code but I couldn't make it work yet. This feed is open and fully accessible to anyone, it would therefore be nice if someone could help and point me to the bug in my script. Thank you in advance! -inte-