View Single Post
Old 10-26-2010, 11:42 AM   #5
nickredding
onlinenewsreader.net
nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'
 
Posts: 328
Karma: 10143
Join Date: Dec 2009
Location: Phoenix, AZ & Victoria, BC
Device: Kindle 3, Kindle Fire, IPad3, iPhone4, Playbook, HTC Inspire
NYT recipe update

There have been format changes on the NYT web site. Here is an updated recipe:
Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def decode(self, src):
    """Decode a raw fetched page to text, sniffing the declared charset.

    Pages that advertise ``iso-8859-1`` are actually Windows cp1252 (a
    superset of latin-1), so they are decoded as cp1252; everything else
    is treated as utf-8.  Undecodable bytes are silently dropped.

    Accepts ``bytes`` (Python 3 / Python 2 ``str``); the original
    ``'iso-8859-1' in src`` test raised TypeError on Python 3 bytes, so
    the marker is now matched in the input's own type.
    ``self`` is unused: this function is installed as the recipe's
    ``encoding`` callable, which calibre invokes method-style.
    """
    marker = 'iso-8859-1'
    if isinstance(src, bytes):
        enc = 'cp1252' if marker.encode('ascii') in src else 'utf-8'
    else:
        enc = 'cp1252' if marker in src else 'utf-8'
    return src.decode(enc, 'ignore')

class NYTimes(BasicNewsRecipe):
    """Calibre recipe for the New York Times 'Today's Paper' edition.

    Requires an nytimes.com subscription (``needs_subscription``).  The
    section/story CSS class names below match the markup nytimes.com
    switched to in late 2010.
    """

    title       = u'New York Times'
    __author__  = 'Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 6, 36)

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ' [%b %d]'
    needs_subscription = True
    # Keep only the main article container; strip navigation, ads and
    # social/tool chrome around and inside it.
    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
                                        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
                                        'icon enlargeThis','columnGroup  last','relatedSearchesModule']}),
                   dict({'class':re.compile('^subNavigation')}),
                   dict({'class':re.compile('^leaderboard')}),
                   dict({'class':re.compile('^module')}),
                   dict({'class':'metaFootnote'}),
                   dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
                            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
                            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
                            'relatedArticles', 'relatedTopics', 'adxSponLink']),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    # Module-level decode() sniffs the per-page charset (utf-8 vs cp1252).
    encoding = decode
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { font-size: small; }
                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }'''

    def get_browser(self):
        """Return a logged-in browser; raise if the credentials are rejected."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID']   = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            # The login page is returned again (with this message) on failure.
            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    def get_masthead_url(self):
        """Return the masthead image URL, or None if it cannot be fetched."""
        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Best-effort: a missing masthead is not fatal.  (Was a bare
            # except:, which also swallowed SystemExit/KeyboardInterrupt.)
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def get_cover_url(self):
        """Return today's front-page scan URL, or None if it is not up yet."""
        st = time.localtime()
        # Same URL the old concatenation built: .../images/YYYY/MM/DD/...
        cover = ('http://graphics8.nytimes.com/images/%d/%.2d/%.2d'
                 '/nytfrontpage/scan.jpg' % (st.tm_year, st.tm_mon, st.tm_mday))
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best-effort: the scan may not be posted yet.
            self.log("\nCover unavailable")
            cover = None
        return cover

    def short_title(self):
        return 'New York Times'

    def parse_index(self):
        """Scrape the Today's Paper index into [(section, [articles])]."""
        # The index page itself is served as cp1252; restore the sniffing
        # decode() afterwards so individual articles are handled per-page.
        self.encoding = 'cp1252'
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        self.encoding = decode

        def feed_title(div):
            # Flatten all text nodes of a section-heading div.
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}     # section name -> list of article dicts
        key = None        # current section while walking the page
        ans = []          # section names in page order
        url_list = []     # de-duplication of article URLs

        def handle_article(div):
            # Extract one story link; skip non-article and duplicate URLs.
            a = div.find('a', href=True)
            if not a:
                return
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            author = ''
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
            # (The original repeated the identical 'byline' lookup in an
            # else-branch; it could never find anything new and was removed.)
            feed = key if key is not None else 'Uncategorized'
            if feed not in articles:   # was articles.has_key(feed) (py2-only)
                articles[feed] = []
            articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                description=description, author=author,
                                content=''))

        # Sections are headed by class="section-headline"/"sectionHeader";
        # stories appear directly as "story"/"story headline" or grouped as
        # <li> items inside "headlinesOnly multiline flush" lists.
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
                articles[key] = []
                ans.append(key)
            elif div['class'] in ['story', 'story headline']:
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        # Emit sections in page order, pairing each with its article list.
        ans = [(k, articles[k]) for k in ans if k in articles]

        return ans

    def preprocess_html(self, soup):
        """Drop Op-Ed columnist photos and follow meta-refresh redirects."""
        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag:
            tagline = self.tag_to_string(kicker_tag)
            if tagline=='Op-Ed Columnist':
                # Columnist head-shot module adds nothing in the e-book.
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        # Follow a client-side redirect: content is "N;url=/path".
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
nickredding is offline   Reply With Quote