Old 09-26-2011, 11:34 PM   #4
romualdinho
Junior Member
 
Posts: 4
Karma: 10
Join Date: Sep 2011
Location: Montevideo, Uruguay
Device: Kindle3
I used Pahan's code to get rid of already-downloaded items and also cleaned up the recipe, but I couldn't solve the main problem: avoiding repeated articles that appear in different feeds within the same run. I've spent some time on this without success (one possible approach is sketched after the code below).

Though I've done some things in PHP for websites, I wouldn't call myself a programmer, so I'll keep trying a little longer; if I fail again, I'll just have to skip the repeated articles on the Kindle while reading.

Regards.

PS: this is my code now:
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import re, os, os.path, urllib
from hashlib import md5

class OnlyLatestRecipe(BasicNewsRecipe):
    title                 = u'Mongabay'
    oldest_article        = 30
    max_articles_per_feed = 50
    auto_cleanup          = True
    description           = 'Mongabay.com seeks to raise interest in and appreciation of wild lands and wildlife, while examining the impact of emerging trends in climate, technology, economics, and finance on conservation and development'
    category              = 'Ecología'
    language              = 'en'
    remove_tags           = [dict(name='p', attrs={'class':'hide'})]
    auto_cleanup_keep     = '//div[@class="imageWrap"]'
    remove_javascript     = True
    extra_css             = 'font span.italic { display: block; padding-bottom: 12px; }'
    preprocess_regexps    = [
        (re.compile(r'Mongabay\.com seeks.*and development\.', re.DOTALL|re.IGNORECASE),
        lambda match: ''),
        (re.compile(r'Please consider the environment.*PDF version</a>', re.DOTALL|re.IGNORECASE), # I AM considering the environment
        lambda match: ''),
        (re.compile(r'<A HREF="http://www.mongabay.com/copyright.htm">Copyright mongabay 2009', re.DOTALL|re.IGNORECASE),
        lambda match: ''),
        (re.compile(r' - Print', re.DOTALL|re.IGNORECASE),
        lambda match: ''),
        (re.compile(r'(<br\s*\/?>\s*){3,}', re.DOTALL|re.IGNORECASE),
        lambda match: ' <br /><br /> '),
        (re.compile(r'<table', re.DOTALL|re.IGNORECASE),
        lambda match: '<div> <table'),
        (re.compile(r'</table>', re.DOTALL|re.IGNORECASE),
        lambda match: '</table></div> <br />'),
        (re.compile(r'<div> <table align=right>', re.DOTALL|re.IGNORECASE),
        lambda match: '<div class="imageWrap"> <table align="left">'),
        (re.compile(r'<td width=20></td>', re.DOTALL|re.IGNORECASE),
        lambda match: '')
    ]
    conversion_options = {
         'comments'        : description
        ,'language'        : language
        ,'linearize_tables': True
    }
    feeds   = [
                 (u'Amazon', u'http://news.mongabay.com/xml/amazon1.xml')
                ,(u'Species discovery', u'http://news.mongabay.com/xml/species_discovery1.xml')
                ,(u'Rainforest animals', u'http://news.mongabay.com/xml/rainforest%20animals1.xml')
                ,(u'Cats', u'http://news.mongabay.com/xml/cats1.xml')
                ,(u'Pantanal', u'http://news.mongabay.com/xml/pantanal1.xml')
                ,(u'Boreal forests', u'http://news.mongabay.com/xml/boreal_forests1.xml')
                ,(u'Atlantic Forest', u'http://news.mongabay.com/xml/Atlantic%20Forest1.xml')
                ,(u'Panama', u'http://news.mongabay.com/xml/Panama1.xml')
            ]

    def print_version(self, url):
        return url.replace('http://', 'http://print.')

    def parse_feeds(self):
        # Per-feed lists of article hashes are stored under calibre's config dir
        recipe_dir = os.path.join(config_dir,'recipes')
        hash_dir = os.path.join(recipe_dir,'recipe_storage')
        feed_dir = os.path.join(hash_dir,self.title.encode('utf-8').replace('/',':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir,mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'),safe='')
            feed_fn = os.path.join(feed_dir,feed_hash)

            # Hashes of articles stored by previous runs of this feed
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            # Hash each article in the current run; drop any whose hash was seen before
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content: item_hash.update(article.content.encode('utf-8'))
                if article.summary: item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)
            # Save this run's hashes so the next update can skip these articles
            with open(feed_fn,'w') as f:
                for h in cur_items:
                    f.write(h+'\n')

        remove = [f for f in feeds if len(f) == 0 and
                self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds
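
One idea for the cross-feed duplicates (not tested, just a sketch): since the same story usually keeps the same URL in every feed, parse_feeds could hold one set of URLs shared across all feeds in a run and drop any article whose URL was already accepted from an earlier feed. The seen_urls name below is made up, it assumes duplicates share the same URL, and the logic would still have to be merged with the hash-based filtering already in parse_feeds above.

Code:
    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)

        # URLs already accepted in this run, shared across every feed
        seen_urls = set()
        for feed in feeds:
            # Iterate over a copy so articles can be removed while looping
            for article in feed.articles[:]:
                url = article.url
                if not url:
                    continue
                if url in seen_urls:
                    # Same story already taken from an earlier feed in this run
                    feed.articles.remove(article)
                else:
                    seen_urls.add(url)
        return feeds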