Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 07-25-2012, 09:32 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Psychology Today/Smithsonian/The New Republic

I originally posted a request for a paid recipe, but in the end I decided to write it myself. I haven't been coding in a long while and have almost forgotten everything. My code is rusty but it works. Let me know if it has any problems.

Psychology Today

Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe


class PsychologyToday(BasicNewsRecipe):
    """Calibre recipe that builds the current issue of Psychology Today.

    The magazine landing page is scraped for the cover image, the issue
    date and three groups of article teasers. Each teaser links to an
    article page, from which the print-version URL is taken.
    """

    title       = 'Psychology Today'
    __author__  = 'Rick Shang'

    description = 'This magazine takes information from the latest research in the field of psychology and makes it useful to people in their everyday lives. Its coverage encompasses self-improvement, relationships, the mind-body connection, health, family, the workplace and culture.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'class':['print-title', 'print-submitted', 'print-content', 'print-footer', 'print-source_url', 'print-links']})]
    no_javascript = True
    no_stylesheets = True

    # Site root; teaser and print links on the site are site-relative.
    BASE_URL = 'http://www.psychologytoday.com'

    def parse_index(self):
        """Scrape the magazine landing page.

        Sets ``self.cover_url`` and ``self.timefmt`` as a side effect.

        Returns:
            A one-element list ``[('Current Issue', articles)]`` where
            ``articles`` is a list of dicts with the keys ``title``,
            ``url``, ``date`` and ``description``.
        """
        soup = self.index_to_soup(self.BASE_URL + '/magazine')

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-content'})

        # Find cover & date (the issue date is carried in the image title)
        cover_item = div.find('div', attrs={'class': 'collections-header-image'})
        cover = cover_item.find('img', src=True)
        self.cover_url = cover['src']
        date = self.tag_to_string(cover['title'])
        self.timefmt = u' [%s]' % date

        articles = []

        # Feature teasers: the byline has its own element.
        for post in div.findAll('div', attrs={'class': 'collections-node-feature-info'}):
            self._add_article(articles, post, self._byline_author(post))

        # Thumbnail teasers: the byline is the node that follows the
        # description element rather than a dedicated byline element.
        for post in div.findAll('div', attrs={'class': 'collections-node-thumbnail-info'}):
            description = post.find('div', attrs={'class': 'collection-node-description'})
            author = self._strip_by(self.tag_to_string(description.nextSibling))
            self._add_article(articles, post, author)

        # Plain list items: same layout as the feature teasers.
        for post in div.findAll('li', attrs={'class': ['collection-item-list-odd', 'collection-item-list-even']}):
            self._add_article(articles, post, self._byline_author(post))

        return [('Current Issue', articles)]

    def _strip_by(self, text):
        """Return ``text`` stripped, with everything up to the last 'by ' removed."""
        return re.sub(r'.*by\s', "", text.strip())

    def _byline_author(self, post):
        """Return the author name taken from the teaser's byline element."""
        author_item = post.find('div', attrs={'class': 'collection-node-byline'})
        return self._strip_by(self.tag_to_string(author_item))

    def _add_article(self, articles, post, author):
        """Resolve one teaser to its print URL and append it to ``articles``.

        The article page is fetched to locate the print link. A teaser
        whose article page has no print link is logged and skipped, so
        one odd page does not abort the whole issue.
        """
        title = self.tag_to_string(post.find('h2')) + u' (%s)' % author
        article_page = self.index_to_soup(self.BASE_URL + post.find('a', href=True)['href'])
        print_page = article_page.find('li', attrs={'class': 'print_html first'})
        if print_page is None:
            self.log('Skipping article without print link:', title)
            return
        url = self.BASE_URL + print_page.find('a', href=True)['href']
        desc = self.tag_to_string(post.find('div', attrs={'class': 'collection-node-description'})).strip()
        self.log('Found article:', title)
        self.log('\t', url)
        self.log('\t', desc)
        articles.append({'title': title, 'url': url, 'date': '', 'description': desc})
Smithsonian

Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class Smithsonian(BasicNewsRecipe):
    """Calibre recipe that builds the current issue of Smithsonian Magazine.

    The issue archive page is used to find the newest issue; that issue's
    page supplies the date, the cover image and the article teasers,
    which are grouped by the most recent section heading seen.
    """

    title       = 'Smithsonian Magazine'
    __author__  = 'Rick Shang'

    description = 'This magazine chronicles the arts, environment, sciences and popular culture of the times. It is edited for modern, well-rounded individuals with diverse, general interests. With your order, you become a National Associate Member of the Smithsonian. Membership benefits include your subscription to Smithsonian magazine, a personalized membership card, discounts from the Smithsonian catalog, and more.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id':['articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text']})]
    remove_tags = [dict(attrs={'class':['related-articles-inpage', 'viewMorePhotos']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Scrape the current issue page.

        Sets ``self.cover_url`` and ``self.timefmt`` as a side effect.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where ``articles`` is a list of dicts with the keys
            ``title``, ``url``, ``description`` and ``date``.
        """
        # Go to the issue: first link in the archive list
        soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
        div = soup0.find('div', attrs={'id': 'archives'})
        issue = div.find('ul', attrs={'class': 'clear-both'})
        current_issue_url = issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-inset'})

        # Find date: keep what follows the last colon in the heading
        date = re.sub(r'.*:\W*', "", self.tag_to_string(div.find('h2')).strip())
        self.timefmt = u' [%s]' % date

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        for post in div.findAll('div', attrs={'class': ['plainModule', 'departments plainModule']}):
            h3 = post.find('h3')
            if h3 is not None:
                # A module with a heading starts a new section; it is not
                # treated as an article itself.
                section_title = self.tag_to_string(h3)
                continue

            subsection = post.find('p', attrs={'class': 'article-cat'})
            link = post.find('a', href=True)
            url = link['href'] + '?c=y&story=fullstory'
            if subsection is not None:
                prefix = self.tag_to_string(subsection) + ': '
                # The first <p> is the category label; the teaser is the second.
                description = self.tag_to_string(post('p', limit=2)[1]).strip()
            else:
                prefix = ''
                description = self.tag_to_string(post.find('p')).strip()

            # flags= must be passed by keyword: the fourth positional
            # argument of re.sub is ``count``, not ``flags``.
            desc = re.sub(r'\sBy\s.*', '', description, flags=re.DOTALL)
            author = re.sub(r'.*By\s', '', description, flags=re.DOTALL)
            title = prefix + self.tag_to_string(link).strip() + u' (%s)' % author
            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': desc, 'date': ''})

        # items() works on both Python 2 and Python 3 (iteritems() does not).
        return list(feeds.items())
The New Republic

Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class TNR(BasicNewsRecipe):
    """Calibre recipe that builds the current issue of The New Republic.

    The magazine-issues page supplies the issue date and a link to the
    current issue; that issue's page supplies the cover image and a list
    of paragraphs that are read as section headings, subsection headings
    or article links.
    """

    title       = 'The New Republic'
    __author__  = 'Rick Shang'

    description = 'The New Republic is a journal of opinion with an emphasis on politics and domestic and international affairs. It carries feature articles by staff and contributing editors. The second half of each issue is devoted to book and the arts, theater, motion pictures, music and art.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class':['print-logo','print-site_name','print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Scrape the current issue page.

        Sets ``self.cover_url`` and ``self.timefmt`` as a side effect.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order,
            where ``articles`` is a list of dicts with the keys
            ``title``, ``url``, ``description`` and ``date``.
        """
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id': 'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class': 'date'})).strip()
        self.timefmt = u' [%s]' % date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class': 'article_detail_body'})

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            em = post.find('em')
            b = post.find('b')
            a = post.find('a', href=True)
            if em is not None:
                # <em> marks a new section and resets the subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
            elif b is not None:
                # <b> marks a subsection inside the current section.
                subsection_title = self.tag_to_string(b).strip()
            elif a is not None:
                prefix = (subsection_title + ': ') if subsection_title else ''
                # Literal replacement: point the link at the print version.
                url = a['href'].replace('www.tnr.com', 'www.tnr.com/print')
                # flags= must be passed by keyword: the fourth positional
                # argument of re.sub is ``count``, not ``flags``.
                author = re.sub(r'.*by\s', '', self.tag_to_string(post), flags=re.DOTALL)
                title = prefix + self.tag_to_string(a).strip() + u' (%s)' % author
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': '', 'date': ''})

        # items() works on both Python 2 and Python 3 (iteritems() does not).
        return list(feeds.items())

Last edited by rainrdx; 07-26-2012 at 03:29 PM. Reason: Recipe Updates
rainrdx is offline   Reply With Quote
Old 07-26-2012, 03:31 PM   #2
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
OK, the recipes are updated. Please enjoy.
rainrdx is offline   Reply With Quote
Advert
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
recipe request chell1948 Recipes 1 06-02-2011 01:23 PM
Paid Hack Request: Photo Slideshow Hack for Kindle 3 chmreader Kindle Developer's Corner 0 05-25-2011 01:24 PM
Request for recipe sumper Recipes 2 10-11-2010 02:25 AM
Recipe Volkskrant paid version prodsaaw Calibre 0 02-18-2010 04:00 PM
Request for Recipe girlperson1 Calibre 2 11-14-2008 10:43 PM


All times are GMT -4. The time now is 01:43 AM.


MobileRead.com is a privately owned, operated and funded community.