07-25-2012, 09:32 PM | #1 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Psychology Today/Smithsonian/The New Republic
I originally posted a request for a paid recipe, but in the end I decided to write it myself. I haven't been coding in a long while and have almost forgotten everything. My code is rusty, but it works. Let me know if it has any problems.
Psychology Today Code:
import re

from calibre.web.feeds.recipes import BasicNewsRecipe


class PsychologyToday(BasicNewsRecipe):
    """Calibre recipe that assembles the current issue of Psychology Today.

    The issue index at ``/magazine`` is scraped for the cover image, the
    issue date and three differently marked-up groups of article teasers.
    Each teaser's article page is fetched to locate its print-view link,
    which is the URL handed back for download.
    """

    title = 'Psychology Today'
    __author__ = 'Rick Shang'

    description = (
        'This magazine takes information from the latest research in the '
        'field of psychology and makes it useful to people in their '
        'everyday lives. Its coverage encompasses self-improvement, '
        'relationships, the mind-body connection, health, family, the '
        'workplace and culture.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'class': [
        'print-title', 'print-submitted', 'print-content',
        'print-footer', 'print-source_url', 'print-links',
    ]})]
    no_javascript = True
    no_stylesheets = True

    # Site root that the relative hrefs found on the index page are appended to.
    BASE_URL = 'http://www.psychologytoday.com'

    def parse_index(self):
        """Scrape the magazine index and return the current issue's articles.

        Side effects: sets ``self.cover_url`` from the cover image's ``src``
        and ``self.timefmt`` from the cover image's ``title`` attribute.

        Returns:
            A one-element list ``[('Current Issue', articles)]`` where each
            article is a dict with the keys ``title``, ``url``, ``date`` and
            ``description``.
        """
        soup = self.index_to_soup(self.BASE_URL + '/magazine')

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-content'})

        # Find cover & date
        cover_item = div.find('div', attrs={'class': 'collections-header-image'})
        cover = cover_item.find('img', src=True)
        self.cover_url = cover['src']
        date = self.tag_to_string(cover['title'])
        self.timefmt = u' [%s]' % date

        # (tag name, CSS class(es), author-follows-description?) for each of
        # the three teaser layouts, in the order they are emitted.
        layouts = (
            ('div', 'collections-node-feature-info', False),
            ('div', 'collections-node-thumbnail-info', True),
            ('li', ['collection-item-list-odd', 'collection-item-list-even'], False),
        )

        articles = []
        for tag_name, css_class, author_after_description in layouts:
            for post in div.findAll(tag_name, attrs={'class': css_class}):
                article = self._parse_post(post, author_after_description)
                if article is not None:
                    articles.append(article)

        return [('Current Issue', articles)]

    def _parse_post(self, post, author_after_description=False):
        """Turn one teaser element from the index into an article dict.

        Args:
            post: The teaser element found on the index page.
            author_after_description: When true, the byline text is read from
                the node following the description element instead of from
                the ``collection-node-byline`` element.

        Returns:
            A dict with ``title``, ``url``, ``date`` and ``description``, or
            ``None`` when the article page exposes no print-view link.
        """
        description = post.find('div', attrs={'class': 'collection-node-description'})
        if author_after_description:
            byline = description.nextSibling
        else:
            byline = post.find('div', attrs={'class': 'collection-node-byline'})

        # Drop everything up to and including the "by " lead-in.
        author = re.sub(r'.*by\s', "", self.tag_to_string(byline).strip())
        title = self.tag_to_string(post.find('h2')) + u' (%s)' % author

        # The print link only exists on the article page, so fetch it first.
        article_page = self.index_to_soup(
            self.BASE_URL + post.find('a', href=True)['href'])
        print_page = article_page.find('li', attrs={'class': 'print_html first'})
        if print_page is None:
            # Skip this one article rather than aborting the whole issue.
            self.log('Skipping article without print link:', title)
            return None

        url = self.BASE_URL + print_page.find('a', href=True)['href']
        desc = self.tag_to_string(description).strip()

        self.log('Found article:', title)
        self.log('\t', url)
        self.log('\t', desc)
        return {'title': title, 'url': url, 'date': '', 'description': desc}


# Code:
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class Smithsonian(BasicNewsRecipe):
    """Calibre recipe that assembles the current issue of Smithsonian Magazine.

    The issue archive page is used to find the newest issue; that issue's
    table of contents is then split into sections (marked by ``<h3>``) and
    articles.
    """

    title = 'Smithsonian Magazine'
    __author__ = 'Rick Shang'

    description = (
        'This magazine chronicles the arts, environment, sciences and '
        'popular culture of the times. It is edited for modern, '
        'well-rounded individuals with diverse, general interests. With '
        'your order, you become a National Associate Member of the '
        'Smithsonian. Membership benefits include your subscription to '
        'Smithsonian magazine, a personalized membership card, discounts '
        'from the Smithsonian catalog, and more.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [dict(attrs={'id': [
        'articleTitle', 'subHead', 'byLine', 'articleImage', 'article-text',
    ]})]
    remove_tags = [dict(attrs={'class': ['related-articles-inpage', 'viewMorePhotos']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Scrape the newest issue and return its articles grouped by section.

        Side effects: sets ``self.timefmt`` from the issue heading and
        ``self.cover_url`` from the first image in the issue body.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order.
            Each article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``.
        """
        # Go to the issue
        soup0 = self.index_to_soup('http://www.smithsonianmag.com/issue/archive/')
        div = soup0.find('div', attrs={'id': 'archives'})
        issue = div.find('ul', attrs={'class': 'clear-both'})
        current_issue_url = issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)

        # Go to the main body
        div = soup.find('div', attrs={'id': 'content-inset'})

        # Find date: keep only what follows the last colon in the heading.
        date = re.sub(r'.*\:\W*', "", self.tag_to_string(div.find('h2')).strip())
        self.timefmt = u' [%s]' % date

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        module_classes = ['plainModule', 'departments plainModule']
        for post in div.findAll('div', attrs={'class': module_classes}):
            h3 = post.find('h3')
            if h3 is not None:
                # A heading module only starts a new section; it has no article.
                section_title = self.tag_to_string(h3)
                continue

            link = post.find('a', href=True)
            if link is None:
                # Nothing to download from a module without a link.
                continue
            url = link['href'] + '?c=y&story=fullstory'

            subsection = post.find('p', attrs={'class': 'article-cat'})
            if subsection is not None:
                prefix = self.tag_to_string(subsection) + ': '
                # The first <p> is the category label, so the teaser text is
                # the second one.
                description = self.tag_to_string(post('p', limit=2)[1]).strip()
            else:
                prefix = ''
                description = self.tag_to_string(post.find('p')).strip()

            # flags= must be passed by keyword: the fourth positional argument
            # of re.sub is `count`, so a positional re.DOTALL is silently
            # treated as a replacement limit and the flag is never applied.
            desc = re.sub(r'\sBy\s.*', '', description, flags=re.DOTALL)
            author = re.sub(r'.*By\s', '', description, flags=re.DOTALL)
            title = prefix + self.tag_to_string(link).strip() + u' (%s)' % author

            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': desc, 'date': ''})

        # items() exists on both Python 2 and 3, unlike iteritems().
        return list(feeds.items())


# Code:
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class TNR(BasicNewsRecipe):
    """Calibre recipe that assembles the current issue of The New Republic.

    The magazine-issues page supplies the issue date and the link to the
    current issue, whose body is a flat run of ``<p>`` elements: ``<em>``
    marks a section, ``<b>`` a subsection and ``<a>`` an article.
    """

    title = 'The New Republic'
    __author__ = 'Rick Shang'

    description = (
        'The New Republic is a journal of opinion with an emphasis on '
        'politics and domestic and international affairs. It carries '
        'feature articles by staff and contributing editors. The second '
        'half of each issue is devoted to book and the arts, theater, '
        'motion pictures, music and art.'
    )
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    remove_tags = [dict(attrs={'class': ['print-logo', 'print-site_name', 'print-hr']})]
    no_javascript = True
    no_stylesheets = True

    def parse_index(self):
        """Scrape the current issue and return its articles grouped by section.

        Side effects: sets ``self.timefmt`` from the issue's date element and
        ``self.cover_url`` from the first image in the issue body.

        Returns:
            A list of ``(section_title, articles)`` tuples in page order.
            Each article is a dict with the keys ``title``, ``url``,
            ``description`` and ``date``.
        """
        # Go to the issue
        soup0 = self.index_to_soup('http://www.tnr.com/magazine-issues')
        issue = soup0.find('div', attrs={'id': 'current_issue'})

        # Find date
        date = self.tag_to_string(issue.find('div', attrs={'class': 'date'})).strip()
        self.timefmt = u' [%s]' % date

        # Go to the main body
        current_issue_url = 'http://www.tnr.com' + issue.find('a', href=True)['href']
        soup = self.index_to_soup(current_issue_url)
        div = soup.find('div', attrs={'class': 'article_detail_body'})

        # Find cover
        self.cover_url = div.find('img', src=True)['src']

        feeds = OrderedDict()
        section_title = ''
        subsection_title = ''
        for post in div.findAll('p'):
            em = post.find('em')
            if em is not None:
                # A new section resets the running subsection.
                section_title = self.tag_to_string(em).strip()
                subsection_title = ''
                continue

            b = post.find('b')
            if b is not None:
                subsection_title = self.tag_to_string(b).strip()
                continue

            a = post.find('a', href=True)
            if a is None:
                continue

            prefix = (subsection_title + ': ') if subsection_title else ''
            # Literal substring swap to reach the print view; a regex here
            # would treat the dots in the host name as wildcards.
            url = a['href'].replace('www.tnr.com', 'www.tnr.com/print')
            # flags= must be passed by keyword: the fourth positional argument
            # of re.sub is `count`, so a positional re.DOTALL is silently
            # treated as a replacement limit and the flag is never applied.
            author = re.sub(r'.*by\s', '', self.tag_to_string(post).strip(),
                            flags=re.DOTALL)
            title = prefix + self.tag_to_string(a).strip() + u' (%s)' % author

            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': '', 'date': ''})

        # items() exists on both Python 2 and 3, unlike iteritems().
        return list(feeds.items())


# Last edited by rainrdx; 07-26-2012 at 03:29 PM. Reason: Recipe Updates |
07-26-2012, 03:31 PM | #2 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
OK, the recipes are updated. Please enjoy.
|
Advert | |
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
recipe request | chell1948 | Recipes | 1 | 06-02-2011 01:23 PM |
Paid Hack Request: Photo Slideshow Hack for Kindle 3 | chmreader | Kindle Developer's Corner | 0 | 05-25-2011 01:24 PM |
Request for recipe | sumper | Recipes | 2 | 10-11-2010 02:25 AM |
Recipe Volkskrant paid version | prodsaaw | Calibre | 0 | 02-18-2010 04:00 PM |
Request for Recipe | girlperson1 | Calibre | 2 | 11-14-2008 10:43 PM |