Also, for those who don't actually live in New York, the "Goings On" (about town) articles are probably pretty useless and only clutter up the download, so here's a version that suppresses those articles:
Spoiler:
Code:
__license__ = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
newyorker.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class NewYorker(BasicNewsRecipe):
    """Calibre news recipe for free articles on newyorker.com.

    Variant of the stock New Yorker recipe that additionally drops the
    NYC-only "Goings on about town" listings from the download
    (see ``parse_feeds`` below).
    """

    title                 = 'The New Yorker'
    __author__            = 'Darko Miletic'
    description           = 'Free Articles'
    oldest_article        = 7    # days
    language              = 'en'
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    publisher             = 'Conde Nast Publications'
    category              = 'news'
    encoding              = 'cp1252'
    publication_type      = 'magazine'
    masthead_url          = 'http://www.newyorker.com/css/i/hed/logo.gif'
    # CSS injected into the converted book (string content kept verbatim).
    extra_css             = """
body {font-family: "Times New Roman",Times,serif}
.articleauthor{color: #9F9F9F;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.rubric,.dd,h6#credit{color: #CD0021;
font-family: Arial, sans-serif;
font-size: small;
text-transform: uppercase}
.descender:first-letter{display: inline; font-size: xx-large; font-weight: bold}
.dd,h6#credit{color: gray}
.c{display: block}
.caption,h2#articleintro{font-style: italic}
.caption{font-size: small}
"""

    conversion_options = {
        'comment':   description,
        'tags':      category,
        'publisher': publisher,
        'language':  language,
    }

    # Only keep the article header and body containers.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'headers'}),
        dict(name='div', attrs={'id': ['articleheads', 'items-container',
                                       'articleRail', 'articletext',
                                       'photocredits']}),
    ]
    # Strip page chrome, social widgets and non-content embeds.
    remove_tags = [
        dict(name=['meta', 'iframe', 'base', 'link', 'embed', 'object']),
        dict(attrs={'class': ['utils', 'socialUtils', 'articleRailLinks', 'icons']}),
        dict(attrs={'id': ['show-header', 'show-footer']}),
    ]
    remove_attributes = ['lang']

    feeds = [
        (u'Reporting', u'http://www.newyorker.com/services/mrss/feeds/reporting.xml'),
        (u'Arts',      u'http://www.newyorker.com/services/mrss/feeds/arts.xml'),
        (u'Humor',     u'http://www.newyorker.com/services/mrss/feeds/humor.xml'),
        (u'Culture',   u'http://www.newyorker.com/online/blogs/culture/rss.xml'),
    ]

    def parse_feeds(self):
        """Parse feeds normally, then remove unwanted articles.

        Any article whose title contains "Goings on" (case-insensitive)
        is dropped — those NYC event listings are useless to most readers.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy so removing from the live list is safe.
            for article in feed.articles[:]:
                if 'GOINGS ON' in article.title.upper():
                    feed.articles.remove(article)
        return feeds

    def print_version(self, url):
        """Return the printer-friendly version of an article URL."""
        return url + '?printable=true'

    def image_url_processor(self, baseurl, url):
        """Trim stray whitespace from image URLs before fetching."""
        return url.strip()

    def get_cover_url(self):
        """Scrape the current issue's cover image URL.

        Falls back to a fixed 1925 cover image if the magazine page
        does not contain the expected cover markup.
        """
        cover_url = "http://www.newyorker.com/images/covers/1925/1925_02_21_p233.jpg"
        soup = self.index_to_soup('http://www.newyorker.com/magazine?intcid=magazine')
        cover_item = soup.find('div', attrs={'id': 'media-count-1'})
        if cover_item:
            cover_url = 'http://www.newyorker.com' + cover_item.div.img['src'].strip()
        return cover_url

    def preprocess_html(self, soup):
        """Strip inline styles and flatten the author link to plain text."""
        for item in soup.findAll(style=True):
            del item['style']
        auth = soup.find(attrs={'id': 'articleauthor'})
        if auth:
            alink = auth.find('a')
            if alink and alink.string is not None:
                txt = alink.string
                alink.replaceWith(txt)
        return soup
To use this, instead of the default recipe, follow these steps:
Click on the arrow next to the "Fetch news" icon
Select "Add a custom news source"
Click the "Customize builtin recipe" button
Select the entry for The New Yorker, and click OK
Select the new entry from the left "user recipe" listbox
Replace the code now shown on the right side with the code posted above (beneath the spoiler tag)
Click the "Add/Update recipe" button
Confirm replacement with "Yes"
Close the custom recipe window, and confirm with "Yes"
The modified recipe will now show up under "Custom" in the news download section, and can be scheduled from there (after which it will also be shown in the "Scheduled" group).