MobileRead Forums - View Single Post

Starson17 · 07-16-2011, 09:51 AM

The recipe for Wired Magazine UK (which IMHO is much better than Wired) has been broken for a long time. I finally had time to look at it. With apologies to Darko, who wrote the original: the site had changed so much that I wasn't able to salvage anything from the previous version beyond the name and the categories, so I wrote an entirely new recipe.

It's pulling content from two related sources and is in pretty good shape, although it could be coded more densely. The current cover is being retrieved. I ran out of time, so if anyone wants to find the masthead, or work on the Image gallery that accompanies some articles, feel free.

Here it is: enjoy

Spoiler:

Code:

__license__   = 'GPL v3'
__copyright__ = '2011, Starson17 <Starson17 at gmail.com>'
'''
www.wired.co.uk
'''

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re

class Wired_UK(BasicNewsRecipe):
    """Calibre news recipe for the UK edition of Wired magazine.

    Articles are gathered from two pages: the site front page (latest news
    and the second sidebar link list) and the ``/magazine`` page (the
    "Start" and "Play" sub-sections).  The cover image is looked up on the
    ``/magazine/archive`` page.
    """

    title                 = 'Wired Magazine - UK edition'
    __author__            = 'Starson17'
    __version__           = 'v1.30'
    __date__              = '15 July 2011'
    # NOTE(review): description/category look inherited from the previous
    # version of the recipe -- confirm they are still what is wanted.
    description           = 'Gaming news'
    publisher             = 'Conde Nast Digital'
    category              = 'news, games, IT, gadgets'
    oldest_article        = 40
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    # TODO: masthead not located yet; candidate URL kept for reference.
    #masthead_url          = 'http://www.wired.co.uk/_/media/wired-logo_UK.gif'
    language              = 'en_GB'
    # Site root; every relative article link is resolved against it.
    index                 = 'http://www.wired.co.uk'

    conversion_options = {
        'comment'   : description,
        'tags'      : category,
        'publisher' : publisher,
        'language'  : language,
    }

    keep_only_tags = [dict(name='div', attrs={'class':['layoutColumn1']})]
    remove_tags = [dict(name='div',attrs={'class':['articleSidebar1','commentAddBox linkit','commentCountBox commentCountBoxBig']})]
    remove_tags_after = dict(name='div',attrs={'class':['mainCopy entry-content','mainCopy']})

    # Compiled once at class creation instead of once per <p> tag.
    _TAKEN_FROM_RE = re.compile(r'This article was taken from.*', re.DOTALL|re.IGNORECASE)

    # (feed title, class of the <h3> heading) for each section read from the
    # /magazine page; the heading's parent element holds the <li> links.
    _MAGAZINE_SECTIONS = (
        ('Wired UK Magazine More', 'magSubSectionTitle titleStart'),
        ('Wired UK Magazine Play', 'magSubSectionTitle titlePlay'),
    )

    def _article_from_link(self, a):
        """Return an article dict for the ``<a>`` tag *a*, or None.

        None is returned when *a* is missing or has no ``href``, so that a
        single malformed list item does not abort the whole download.
        """
        if a is None:
            return None
        href = a.get('href')
        if not href:
            return None
        # Only prefix the site root when the link is not already absolute.
        if not href.startswith(('http://', 'https://')):
            href = self.index + href
        return {
            'title'       : self.tag_to_string(a),
            'date'        : strftime(self.timefmt),
            # '?page=all' is appended to every article URL.
            'url'         : href + '?page=all',
            'description' : '',
        }

    def _collect_articles(self, anchors):
        """Return article dicts for every usable tag in *anchors*."""
        articles = []
        for a in anchors:
            article = self._article_from_link(a)
            if article is not None:
                articles.append(article)
        return articles

    def parse_index(self):
        """Build the feed list for this recipe.

        Returns a list of ``(feed title, [article dict, ...])`` tuples.  The
        "Latest News" feed is always present (possibly empty); the other
        feeds are only added when their container is found on the page.
        """
        totalfeeds = []
        soup = self.index_to_soup(self.index)

        # Front page: latest news, one <li><h2><a> per article.
        latest_links = []
        recentcontent = soup.find('ul', attrs={'class':'linkList3'})
        if recentcontent is not None:
            for li in recentcontent.findAll('li'):
                heading = li.h2
                if heading is not None:
                    latest_links.append(heading.a)
        totalfeeds.append(('Wired UK Magazine Latest News',
                           self._collect_articles(latest_links)))

        # Front page: the second sidebar link list holds the features; its
        # <h3> link comes first, followed by the <li> links.
        sidebars = soup.findAll('div', attrs={'class':'sidebarLinkList'})
        if len(sidebars) > 1:
            magcontent = sidebars[1]
            feature_links = []
            if magcontent.h3 is not None:
                feature_links.append(magcontent.h3.a)
            feature_links.extend(li.a for li in magcontent.findAll('li'))
            totalfeeds.append(('Wired UK Magazine Features',
                               self._collect_articles(feature_links)))

        # Magazine page: one feed per sub-section heading.
        magsoup = self.index_to_soup(self.index + '/magazine')
        for feed_title, heading_class in self._MAGAZINE_SECTIONS:
            heading = magsoup.find('h3', attrs={'class':heading_class})
            if heading is None or heading.parent is None:
                continue
            section_links = [li.a for li in heading.parent.findAll('li')]
            totalfeeds.append((feed_title,
                               self._collect_articles(section_links)))

        return totalfeeds

    def get_cover_url(self):
        """Return the ``src`` of the cover image from the archive page.

        Returns an empty string when no cover image can be found.
        """
        cover_url = ''
        soup = self.index_to_soup(self.index + '/magazine/archive')
        cover_item = soup.find('div', attrs={'class':'image linkme'})
        if cover_item is not None and cover_item.img is not None:
            cover_url = cover_item.img.get('src', '')
        return cover_url

    def preprocess_html(self, soup):
        """Remove paragraphs holding a "This article was taken from" span.

        Returns the same *soup* object, modified in place.
        """
        for tag in soup.findAll(name='p'):
            if tag.find(name='span', text=self._TAKEN_FROM_RE):
                tag.extract()
        return soup

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		'''