View Single Post
Old 09-23-2009, 01:01 PM   #753
MichaelMSeattle
Enthusiast
MichaelMSeattle began at the beginning.
 
Posts: 30
Karma: 16
Join Date: Sep 2009
Device: sony prs-505/600
help with NYTMagazine

Quote:
Originally Posted by kiklop74 View Post
That will not work since NYT has quite good scraping protection.

This is the recipe that works for NYT magazine, same can be easily modified for other parts of NYT site.
Thanks very much for responding so quickly! I love how you were able to get the cover image.

Your recipe returned the main articles of the magazine but not the sub-sections (which are listed in the TOC). I modified the recipe to add the sub-section feeds, but that only added their entries to the TOC without downloading the actual article content.

For all the sub articles (not those in the main section) I just see:
"This article was downloaded by calibre from http://www.nytimes.com/2009/09/20/magazine/20Letters-t-001.html" (or whatever was the source).

I'm attaching the full recipe below. Thanks again for your help!
-Mike

==============================================
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
'''
nytimes.com/pages/magazine
'''

import time
from calibre.web.feeds.news import BasicNewsRecipe

class NewYorkTimesMagazine(BasicNewsRecipe):
    """Calibre recipe for The New York Times Magazine.

    Downloads the magazine's main RSS feed plus several blog sub-section
    feeds, stitches multi-page articles together by following the
    'Next Page' pager links, and derives the cover image URL from the
    issue date displayed on the magazine index page.
    """

    # NOTE(review): trailing '3' in the title looks like a leftover from
    # testing — confirm whether 'The New York Times Magazine' was intended.
    title                 = 'The New York Times Magazine3'
    __author__            = 'Darko Miletic'
    description           = 'News from New York'
    publisher             = 'The New York Times'
    category              = 'news, politics, US'
    delay                 = 1          # seconds between fetches, to be polite to the server
    language              = 'en_US'
    oldest_article        = 10         # days
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    encoding              = 'cp1252'
    INDEX                 = 'http://www.nytimes.com/pages/magazine/'

    conversion_options = {
        'comments'  : description
        , 'tags'     : category
        , 'language' : language
        , 'publisher': publisher
    }

    # Keep only the main article container; strip navigation, inline
    # sidebars, reader comments, author boxes and embedded objects.
    keep_only_tags = [dict(name='div', attrs={'id': 'article'})]

    remove_tags = [
        dict(name='div', attrs={'class': ['header', 'nextArticleLink clearfix', 'correctionNote']})
        # BUG FIX: the id 'authorId' had been split into 'aut horId' by a
        # line-wrap when the recipe was pasted, so the author box was never
        # removed. Matches the id used in append_page() below.
        , dict(name='div', attrs={'id': ['toolsRight', 'articleInline', 'readerscomment', 'authorId']})
        , dict(name=['object', 'link'])
    ]

    remove_tags_after = dict(name='div', attrs={'id': 'pageLinks'})

    feeds = [
        (u'Articles', u'http://feeds.nytimes.com/nyt/rss/Magazine'),
        (u'The Ethicist', u'http://ethicist.blogs.nytimes.com/feed/'),
        (u'Medium', u'http://themedium.blogs.nytimes.com/feed/'),
        (u'Motherload', u'http://parenting.blogs.nytimes.com/feed/')
    ]

    def append_page(self, soup, appendtag, position):
        """Follow the 'Next Page' link in *soup* (if any), clean the next
        page's article body, and splice it into *appendtag* at *position*.

        Recurses until no further pager link is found, so arbitrarily long
        multi-page articles are flattened into a single document.
        """
        pager = soup.find('div', attrs={'id': 'pageLinks'})
        if pager:
            atag = pager.find('a', attrs={'title': 'Next Page'})
            if atag:
                soup2 = self.index_to_soup('http://www.nytimes.com' + atag['href'])
                # Strip the inline sidebar and the next-article teaser from
                # the continuation page before merging it in.
                st = soup2.find('div', attrs={'id': 'articleInline'})
                if st:
                    st.extract()
                tt = soup2.find('div', attrs={'class': 'nextArticleLink clearfix'})
                if tt:
                    tt.extract()
                texttag = soup2.find('div', attrs={'id': 'articleBody'})
                # ROBUSTNESS FIX: blog/sub-section pages may lack an
                # articleBody div; previously this crashed with an
                # AttributeError on None. Skip such pages instead.
                if texttag is None:
                    return
                for it in texttag.findAll(style=True):
                    del it['style']
                for it in texttag.findAll(attrs={'id': 'authorId'}):
                    it.extract()
                for it in texttag.findAll(attrs={'class': 'correctionNote'}):
                    it.extract()
                newpos = len(texttag.contents)
                # Recurse first so deeper pages are appended inside this
                # page's body before it is moved into the parent document.
                self.append_page(soup2, texttag, newpos)
                pager.extract()
                pager2 = texttag.find('div', attrs={'id': 'pageLinks'})
                if pager2:
                    pager2.extract()
                texttag.extract()
                appendtag.insert(position, texttag)

    def get_article_url(self, article):
        """Use the feed item's <guid> as the article URL (bypasses the
        tracking/redirect links NYT puts in <link>)."""
        return article.get('guid', None)

    def preprocess_html(self, soup):
        """Remove inline styles and pull in the remaining pages of
        multi-page articles before conversion."""
        for item in soup.findAll(style=True):
            del item['style']
        self.append_page(soup, soup.body, 3)
        return soup

    def get_cover_url(self):
        """Build the cover image URL from the issue date (m.d.Y) shown in
        the ABcolumnPromo box on the magazine index page.

        Returns None when the promo box is not found.
        """
        cover = None
        soup = self.index_to_soup(self.INDEX)
        tag = soup.find('div', attrs={'id': 'ABcolumnPromo'})
        if tag:
            st = time.strptime(tag.h3.string, '%m.%d.%Y')
            year = str(st.tm_year)
            month = "%.2d" % st.tm_mon
            day = "%.2d" % st.tm_mday
            cover = ('http://graphics8.nytimes.com/images/' + year + '/' + month
                     + '/' + day + '/magazine/' + day + 'cover-395.jpg')
        return cover
MichaelMSeattle is offline