View Single Post
Old 10-26-2010, 11:42 AM   #5
nickredding
onlinenewsreader.net
nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'nickredding knows the difference between 'who' and 'whom'
 
Posts: 328
Karma: 10143
Join Date: Dec 2009
Location: Phoenix, AZ & Victoria, BC
Device: Kindle 3, Kindle Fire, IPad3, iPhone4, Playbook, HTC Inspire
NYT recipe update

There have been format changes on the NYT web site. Here is an updated recipe:
Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
nytimes.com
'''
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

def decode(self, src):
    """Decode a raw fetched page to text, sniffing the declared charset.

    Pages that advertise ``iso-8859-1`` are actually Windows cp1252 (a
    superset of latin-1), so they are decoded as cp1252; everything else
    is treated as utf-8.  Undecodable bytes are silently dropped.

    Accepts ``bytes`` (Python 3 / Python 2 ``str``); the original
    ``'iso-8859-1' in src`` test raised TypeError on Python 3 bytes, so
    the marker is now matched in the input's own type.
    ``self`` is unused: this function is installed as the recipe's
    ``encoding`` callable, which calibre invokes method-style.
    """
    marker = 'iso-8859-1'
    if isinstance(src, bytes):
        enc = 'cp1252' if marker.encode('ascii') in src else 'utf-8'
    else:
        enc = 'cp1252' if marker in src else 'utf-8'
    return src.decode(enc, 'ignore')

class NYTimes(BasicNewsRecipe):
    """Calibre recipe for the New York Times 'Today's Paper' edition.

    Requires an nytimes.com subscription (``needs_subscription``).  The
    section/story CSS class names below match the markup nytimes.com
    switched to in late 2010.
    """

    title       = u'New York Times'
    __author__  = 'Kovid Goyal/Nick Redding'
    language = 'en'
    requires_version = (0, 6, 36)

    description = 'Daily news from the New York Times (subscription version)'
    timefmt = ' [%b %d]'
    needs_subscription = True
    # Keep only the main article container; strip navigation, ads and
    # social/tool chrome around and inside it.
    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
                                        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
                                        'icon enlargeThis','columnGroup  last','relatedSearchesModule']}),
                   dict({'class':re.compile('^subNavigation')}),
                   dict({'class':re.compile('^leaderboard')}),
                   dict({'class':re.compile('^module')}),
                   dict({'class':'metaFootnote'}),
                   dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
                            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
                            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
                            'relatedArticles', 'relatedTopics', 'adxSponLink']),
                   dict(name=['script', 'noscript', 'style','form','hr'])]
    # Module-level decode() sniffs the per-page charset (utf-8 vs cp1252).
    encoding = decode
    no_stylesheets = True
    extra_css = '''
                .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
                .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                .timestamp { font-size: small; }
                .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
                a:link {text-decoration: none; }'''

    def get_browser(self):
        """Return a logged-in browser; raise if the credentials are rejected."""
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            br.open('http://www.nytimes.com/auth/login')
            br.select_form(name='login')
            br['USERID']   = self.username
            br['PASSWORD'] = self.password
            raw = br.submit().read()
            # The login page is returned again (with this message) on failure.
            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
                raise Exception('Your username and password are incorrect')
        return br

    def get_masthead_url(self):
        """Return the masthead image URL, or None if it cannot be fetched."""
        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except Exception:
            # Best-effort: a missing masthead is not fatal.  (Was a bare
            # except:, which also swallowed SystemExit/KeyboardInterrupt.)
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead

    def get_cover_url(self):
        """Return today's front-page scan URL, or None if it is not up yet."""
        st = time.localtime()
        # Same URL the old concatenation built: .../images/YYYY/MM/DD/...
        cover = ('http://graphics8.nytimes.com/images/%d/%.2d/%.2d'
                 '/nytfrontpage/scan.jpg' % (st.tm_year, st.tm_mon, st.tm_mday))
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except Exception:
            # Best-effort: the scan may not be posted yet.
            self.log("\nCover unavailable")
            cover = None
        return cover

    def short_title(self):
        return 'New York Times'

    def parse_index(self):
        """Scrape the Today's Paper index into [(section, [articles])]."""
        # The index page itself is served as cp1252; restore the sniffing
        # decode() afterwards so individual articles are handled per-page.
        self.encoding = 'cp1252'
        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
        self.encoding = decode

        def feed_title(div):
            # Flatten all text nodes of a section-heading div.
            return ''.join(div.findAll(text=True, recursive=True)).strip()

        articles = {}     # section name -> list of article dicts
        key = None        # current section while walking the page
        ans = []          # section names in page order
        url_list = []     # de-duplication of article URLs

        def handle_article(div):
            # Extract one story link; skip non-article and duplicate URLs.
            a = div.find('a', href=True)
            if not a:
                return
            url = re.sub(r'\?.*', '', a['href'])
            if not url.startswith("http"):
                return
            if not url.endswith(".html"):
                return
            if 'podcast' in url:
                return
            url += '?pagewanted=all'
            if url in url_list:
                return
            url_list.append(url)
            title = self.tag_to_string(a, use_alt=True).strip()
            description = ''
            pubdate = strftime('%a, %d %b')
            summary = div.find(True, attrs={'class':'summary'})
            if summary:
                description = self.tag_to_string(summary, use_alt=False)
            author = ''
            authorAttribution = div.find(True, attrs={'class':'byline'})
            if authorAttribution:
                author = self.tag_to_string(authorAttribution, use_alt=False)
            # (The original repeated the identical 'byline' lookup in an
            # else-branch; it could never find anything new and was removed.)
            feed = key if key is not None else 'Uncategorized'
            if feed not in articles:   # was articles.has_key(feed) (py2-only)
                articles[feed] = []
            articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                description=description, author=author,
                                content=''))

        # Sections are headed by class="section-headline"/"sectionHeader";
        # stories appear directly as "story"/"story headline" or grouped as
        # <li> items inside "headlinesOnly multiline flush" lists.
        for div in soup.findAll(True,
            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):

            if div['class'] in ['section-headline','sectionHeader']:
                key = string.capwords(feed_title(div))
                articles[key] = []
                ans.append(key)
            elif div['class'] in ['story', 'story headline']:
                handle_article(div)
            elif div['class'] == 'headlinesOnly multiline flush':
                for lidiv in div.findAll('li'):
                    handle_article(lidiv)

        # Emit sections in page order, pairing each with its article list.
        ans = [(k, articles[k]) for k in ans if k in articles]

        return ans

    def preprocess_html(self, soup):
        """Drop Op-Ed columnist photos and follow meta-refresh redirects."""
        kicker_tag = soup.find(attrs={'class':'kicker'})
        if kicker_tag:
            tagline = self.tag_to_string(kicker_tag)
            if tagline=='Op-Ed Columnist':
                # Columnist head-shot module adds nothing in the e-book.
                img_div = soup.find('div','inlineImage module')
                if img_div:
                    img_div.extract()
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        # Follow a client-side redirect: content is "N;url=/path".
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('cp1252', 'replace'))
nickredding is offline   Reply With Quote