View Single Post
Old 05-15-2015, 11:35 PM   #4
zachlapidus
Junior Member
zachlapidus began at the beginning.
 
Posts: 2
Karma: 10
Join Date: Apr 2015
Device: Kindle Keyboard
Thanks truth1ness! What a great update. I just ran the script and had to make a slight tweak to get it to fetch the "long read" article about the silk road -- had to add something to the keep_only_tags to get the article to actually download, as it is (sigh) in a different format:

Code:
__license__   = 'GPL v3'
__copyright__ = '2014, Darko Miletic <darko.miletic at gmail.com>'
'''
www.wired.com
'''

from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date
import urllib2

class WiredDailyNews(BasicNewsRecipe):
    '''
    calibre recipe that gathers the articles listed under the current month's
    "magazine" tag on www.wired.com into a single 'Articles' feed.
    '''
    title                 = 'Wired Magazine, Monthly Edition'
    __author__            = 'Darko Miletic, update by Zach Lapidus, Michael Marotta'
    description           = ('Wired is a full-color monthly American magazine, published in both print '
                             'and online editions, that reports on how emerging technologies affect culture, '
                             'the economy and politics. Monthly edition, best run at the start of every month.')
    publisher             = 'Conde Nast'
    category              = 'news, IT, computers, technology'
    oldest_article        = 2
    max_articles_per_feed = 200
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    language              = 'en'
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds    = True
    publication_type      = 'newsportal'
    extra_css             = """
                            .entry-header{
                                          text-transform: uppercase;
                                          vertical-align: baseline;
                                          display: inline;
                                         }
                            """

    # Elements stripped from every downloaded article
    remove_tags = [
        dict(name=['meta','link']),
        dict(name='div', attrs={'class':'podcast_storyboard'}),
        dict(id=['sharing', 'social', 'article-tags', 'sidebar']),
                  ]
    # Only these containers are kept; the second entry matches articles that
    # do not use the data-js="post"/"postHeader" markup
    keep_only_tags=[
        dict(attrs={'data-js':['post', 'postHeader']}),
        dict(attrs={'class':'exchange fsb-content relative'})
    ]

    def get_date_url(self):
        '''
        Build the paginated tag url for the current month's magazine issue.

        The result looks like http://www.wired.com/tag/magazine-<YY>-<MM>/page/
        where <MM> is the zero-padded current month and <YY> is the current
        year minus 1992 (presumably the magazine's volume number -- confirm).
        The caller appends the page number.

        :return: url prefix ending in '/page/'
        '''
        baseurl = 'http://www.wired.com/tag/magazine-'
        # Read the clock once so month and year always come from the same day,
        # even when the recipe runs across a month/year rollover
        today = date.today()
        monthurl = '{:02d}'.format(today.month)
        yearurl = str(today.year - 1992)
        return baseurl + yearurl + '-' + monthurl + '/page/'

    def parse_wired_index_page(self, currenturl, seen):
        '''
        Yield one article dict for every new article link on an index page.

        Only links inside the page's <main> element that start with
        'http://www.wired.com/' and end with '/' are considered. Links with an
        empty title, a 'read more' title, or a url already in ``seen`` are
        skipped.

        :param currenturl: url of the index page to scan
        :param seen: set of article urls already collected; updated in place
        :return: generator of dicts with 'title', 'date', 'url', 'description'
        '''
        soup = self.index_to_soup(currenturl)
        main = soup.find('main')
        if main is None:
            # Page without the expected layout: nothing to collect
            self.log('No <main> element found on', currenturl)
            return
        for a in main.findAll('a', href=True):
            url = a['href']
            if not (url.startswith('http://www.wired.com/') and url.endswith('/')):
                continue
            title = self.tag_to_string(a.find('h2'))
            if not title or title.lower() == 'read more' or url in seen:
                continue
            # Named pubdate so the imported datetime.date is not shadowed
            pubdate = self.tag_to_string(a.find('time'))
            seen.add(url)
            self.log('Found article:', title)
            yield {'title':title, 'date':pubdate, 'url':url, 'description':''}

    def parse_index(self):
        '''
        Collect the articles of the current month's issue from all index pages.

        Pages are requested in order starting at 1. Instead of scraping the
        first page for the total page count, the loop simply keeps increasing
        the page number until a request fails with an HTTP error or a page
        contributes no new articles.

        :return: a one-element list holding the ('Articles', articles) feed
        '''
        baseurl = self.get_date_url()
        pagenum = 1
        articles = []
        seen = set()
        while True:
            currenturl = baseurl + str(pagenum)
            try:
                # Probe the page first; close the response right away so the
                # connection is not leaked
                urllib2.urlopen(currenturl).close()
                page_articles = list(self.parse_wired_index_page(currenturl, seen))
            except urllib2.HTTPError as err:
                self.log('Stopping at index page', pagenum, '-', err)
                break
            if not page_articles:
                # Guards against looping forever if the site answers 200 for
                # pages past the end of the listing
                break
            articles.extend(page_articles)
            pagenum += 1
        return [('Articles', articles)]
Only one problem -- the SVG images used in the article appear as black rectangles both in calibre's viewer and on the Kindle. I tried fetching the news source as an ePub instead, thinking that format would not rasterize the SVG, but got the black rectangles again. Any ideas?
zachlapidus is offline   Reply With Quote