MobileRead Forums - View Single Post

kovidgoyal · 07-25-2011, 11:12 AM

I'm confused; there is no built-in IHT recipe. There used to be one, but it's broken, so it was removed. Here's the old recipe:

Code:

__license__   = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald'
'''
iht.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile


class InternationalHeraldTribune(BasicNewsRecipe):
    """Calibre news recipe for the International Herald Tribune (iht.com).

    The recipe reads the iht.com RSS feeds listed in ``feeds``. Because
    ``articles_are_obfuscated`` is set, article content is supplied by
    ``get_obfuscated_article``, which downloads the print view of each
    article into a temporary file.
    """

    title = u'The International Herald Tribune'
    __author__ = 'Derry FitzGerald'
    language = 'en'

    oldest_article = 1
    max_articles_per_feed = 30
    no_stylesheets = True
    remove_empty_feeds = True

    # Strip page chrome (header/footer divs) and every form element.
    remove_tags = [
        dict(name='div', attrs={'class': ['footer', 'header']}),
        dict(name=['form']),
    ]

    # Cut everything from the "webtrends" HTML comment to the end of the
    # page and close the document in its place.
    preprocess_regexps = [
        (re.compile(r'<!-- webtrends.*', re.DOTALL),
         lambda m: '</body></html>'),
    ]

    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }'

    feeds = [
        (u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
        (u'Business', u'http://www.iht.com/rss/business.xml'),
        (u'Americas', u'http://www.iht.com/rss/america.xml'),
        (u'Europe', u'http://www.iht.com/rss/europe.xml'),
        (u'Asia', u'http://www.iht.com/rss/asia.xml'),
        (u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
        (u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
        (u'Technology', u'http://www.iht.com/rss/technology.xml'),
        (u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
        (u'Sports', u'http://www.iht.com/rss/sports.xml'),
        (u'Culture', u'http://www.iht.com/rss/arts.xml'),
        (u'Style and Design', u'http://www.iht.com/rss/style.xml'),
        (u'Travel', u'http://www.iht.com/rss/travel.xml'),
        (u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
        (u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
        (u'Properties', u'http://www.iht.com/rss/properties.xml'),
    ]

    # References to every temporary file created by get_obfuscated_article.
    # NOTE: this is a class attribute, so the list is shared by all
    # instances of the recipe.
    temp_files = []
    articles_are_obfuscated = True

    masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'

    def get_obfuscated_article(self, url):
        """Save the print view of an article to a temporary file.

        Opens ``url``, follows a link on that page whose URL matches
        ``pagewanted=print`` and writes the response body to a new
        ``PersistentTemporaryFile``.

        :param url: URL of the article page.
        :return: Path (``.name``) of the temporary file holding the HTML.

        Any error raised by the browser (for example when no matching
        print link exists) propagates to the caller.
        """
        br = self.get_browser()
        br.open(url)
        response = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
        html = response.read()

        tmp = PersistentTemporaryFile('_iht.html')
        # Register the file before writing so the recipe keeps a reference
        # to it even if the write below fails.
        self.temp_files.append(tmp)
        try:
            tmp.write(html)
        finally:
            # Always release the handle; previously a failed write left the
            # file open.
            tmp.close()
        return tmp.name