10-27-2012, 09:45 AM   #1
rainrdx
Connoisseur
Atlantic Recipe 10/27

Some aesthetic changes and bug fixes

Code:
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
theatlantic.com
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

class TheAtlantic(BasicNewsRecipe):

    title      = 'The Atlantic'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Current affairs and politics focussed on the US'
    INDEX = 'http://www.theatlantic.com/magazine/toc/0/'
    language = 'en'

    remove_tags_before = dict(name='div', id='articleHead')
    remove_tags_after  = dict(id='copyright')
    remove_tags        = [dict(id=['header', 'printAds', 'pageControls'])]
    no_stylesheets = True

    # Strip HTML comments from the page source before it is parsed.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]


    # Download the print-friendly version of each article (/print/ instead of /archive/).
    def print_version(self, url):
        return url.replace('/archive/', '/print/')

    # Build the feed list from the magazine table-of-contents page: the main
    # magazine sections first, then the modules in the right-hand column.
    def parse_index(self):
        articles = []

        soup = self.index_to_soup(self.INDEX)
        ts = soup.find(id='magazineTopStories')
        ds = self.tag_to_string(ts.find('h1')).split(':')[-1]
        self.timefmt = ' [%s]'%ds

        cover = soup.find('img', src=True, attrs={'class':'cover'})
        if cover is not None:
            # Trim anything after the .jpg extension and escape spaces in the cover URL.
            self.cover_url = re.sub(r'\s', '%20', re.sub(r'jpg.*', 'jpg', cover['src']))
            self.log(self.cover_url)

        feeds = []
        seen_titles = set()  # avoid listing the same article more than once
        for section in soup.findAll('div', attrs={'class':'magazineSection'}):
            section_title = self.tag_to_string(section.find('h2'))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('h3', attrs={'class':'headline'}):
                a = post.find('a', href=True)
                title = self.tag_to_string(a)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.theatlantic.com'+url
                p = post.parent.find('p', attrs={'class':'dek'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                    'date':''})
            if articles:
                feeds.append((section_title, articles))

        # The modules in the right-hand column are parsed the same way as the
        # main magazine sections.
        rightContent = soup.find('div', attrs={'class':'rightContent'})
        for module in rightContent.findAll('div', attrs={'class':'module'}):
            section_title = self.tag_to_string(module.find('h2'))
            articles = []
            for post in module.findAll('div', attrs={'class':'post'}):
                a = post.find('a', href=True)
                title = self.tag_to_string(a)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.theatlantic.com'+url
                p = post.parent.find('p', attrs={'class':'dek'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                    'date':''})
            if articles:
                feeds.append((section_title, articles))

        return feeds

    # Pull images out of right-aligned layout tables and re-wrap them, with
    # their captions, in a centred div.
    def postprocess_html(self, soup, first):
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                img.extract()
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)

        return soup
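
To test the changes quickly, save the recipe to a file (the name below is just an example) and run it through ebook-convert; --test fetches only a couple of articles from the first couple of feeds and -vv turns on verbose logging:

Code:
ebook-convert TheAtlantic.recipe atlantic.epub --test -vv
If the output looks right, the recipe can also be added as a custom news source through calibre's preferences.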