04-04-2013, 01:05 PM | #1 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Harvard Business Review Update
Fixed cover image and the missing last section.
Code:
import re
from datetime import date, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Calibre news recipe for the current Harvard Business Review issue.

    The recipe needs a subscription (``needs_subscription``) and logs in
    through calibre's JavaScript-capable browser, then builds the feed list
    from the table of contents of the issue linked from ``/magazine``.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    # hbr.org requires JavaScript to log in.  The recipe used to be disabled
    # for that reason; it now logs in via javascript_login() instead.
    use_javascript_to_login = True

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
    INDEX = 'http://hbr.org'

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD',
            'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget',
            'mostWidget', 'keepUpWithHBR', 'mailingListTout', 'partnerCenter',
            'pageFooter', 'superNavHeadContainer', 'hbrDisqus',
            'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    extra_css = '''
        a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
        .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
        h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
        h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }
        #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
        #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
    '''

    def javascript_login(self, br, username, password):
        """Fill in and submit the hbr.org sign-in form.

        :param br: the JavaScript-capable browser object supplied by calibre
        :param username: subscriber user name
        :param password: subscriber password
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberately best-effort: carry on and try the form anyway.
            # NOTE(review): presumably the form is usable before the page
            # finishes loading - confirm.
            self.log('HBR login page timed out, attempting to log in anyway')
        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
        form = br.select_form('#signin-form')
        form['signin-form:username'] = username
        form['signin-form:password'] = password
        br.submit(wait_for_load=False)
        br.run_for_a_time(30)

    def map_url(self, url):
        """Map a first-page article URL to its print-view URL.

        A URL ending in ``/ar/1`` is rewritten to end in ``/ar/pr``.

        :return: the rewritten URL, or None when ``url`` does not end in
                 ``/ar/1`` (callers must skip such articles)
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return None

    def hbr_parse_toc(self, soup):
        """Build the feed list from an issue's table-of-contents page.

        ``h4`` elements start a new section and ``li`` elements contribute
        articles to the current section.

        :param soup: parsed table-of-contents page
        :return: list of ``(section_title, articles)`` tuples, where each
                 article is a dict with the keys ``title``, ``url``,
                 ``description`` and ``date``
        :raises ValueError: if the page has no ``issueFeaturesContent`` element
        """
        container = soup.find(id='issueFeaturesContent')
        if container is None:
            raise ValueError(
                'Could not find the table of contents (issueFeaturesContent)')

        feeds = []
        current_section = None
        articles = []
        for x in container.findAll(['li', 'h4']):
            if x.name == 'h4':
                # NOTE(review): exact comparison assumes the parser returns
                # the class attribute as a plain string - confirm.
                if x.get('class', None) == 'basic':
                    continue
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
                continue

            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a['href']
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = self.INDEX + url
            print_url = self.map_url(url)
            if not print_url:
                # Without this check a None URL would be queued for download.
                self.log('\t\tSkipping article without a print URL:', url)
                continue
            p = x.find('p', attrs={'class': 'author'})
            desc = ''
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', print_url)
            self.log('\t\t\t', desc)
            articles.append({'title': title, 'url': print_url,
                             'description': desc, 'date': ''})

        # Flush the last section; no further h4 follows to trigger it.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Locate the current issue and return its feeds.

        Also sets ``cover_url`` and ``timefmt`` from the last entry of the
        magazine archive carousel when that entry is present.

        :return: list of ``(section_title, articles)`` tuples
        :raises ValueError: if the link to the issue cannot be found
        """
        soup0 = self.index_to_soup(self.INDEX + '/magazine')

        # Cover and issue date are best-effort: a missing carousel must not
        # prevent the articles from being downloaded.
        carousel = soup0.find('ul', attrs={'id': 'magazineArchiveCarousel'})
        entries = carousel.findAll('li') if carousel is not None else []
        img = entries[-1].img if entries else None
        if img is not None:
            cover = img.get('src')
            if cover:
                self.cover_url = cover
            dates = self.tag_to_string(img.get('alt'))
            if dates:
                self.timefmt = u' [%s]' % dates
        else:
            self.log('Could not find the cover image and issue date')

        issue = soup0.find('div', attrs={'class': 'magazine_page'})
        link = issue.a if issue is not None else None
        if link is None or not link.get('href'):
            raise ValueError('Could not find the link to the current issue')
        soup = self.index_to_soup(self.INDEX + link['href'])
        return self.hbr_parse_toc(soup)
04-04-2013, 02:04 PM | #2 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Updated the remove_tags array to clean up the output.
Code:
import re
from datetime import date, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class HBR(BasicNewsRecipe):
    """Calibre news recipe for the current Harvard Business Review issue.

    The recipe needs a subscription (``needs_subscription``) and logs in
    through calibre's JavaScript-capable browser, then builds the feed list
    from the table of contents of the issue linked from ``/magazine``.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True

    # hbr.org requires JavaScript to log in.  The recipe used to be disabled
    # for that reason; it now logs in via javascript_login() instead.
    use_javascript_to_login = True

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'
    INDEX = 'http://hbr.org'

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [
        dict(id=[
            'mastheadContainer', 'magazineHeadline', 'articleToolbarTopRD',
            'pageRightSubColumn', 'pageRightColumn', 'todayOnHBRListWidget',
            'mostWidget', 'keepUpWithHBR', 'mailingListTout', 'partnerCenter',
            'pageFooter', 'superNavHeadContainer', 'hbrDisqus',
            'article-toolbox', 'articleToolbarTop', 'articleToolbarBottom',
            'articleToolbarRD',
        ]),
        dict(name='iframe'),
    ]

    extra_css = '''
        a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
        .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
        h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
        h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }
        #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
        #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
    '''

    def javascript_login(self, br, username, password):
        """Fill in and submit the hbr.org sign-in form.

        :param br: the JavaScript-capable browser object supplied by calibre
        :param username: subscriber user name
        :param password: subscriber password
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberately best-effort: carry on and try the form anyway.
            # NOTE(review): presumably the form is usable before the page
            # finishes loading - confirm.
            self.log('HBR login page timed out, attempting to log in anyway')
        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
        form = br.select_form('#signin-form')
        form['signin-form:username'] = username
        form['signin-form:password'] = password
        br.submit(wait_for_load=False)
        br.run_for_a_time(30)

    def map_url(self, url):
        """Map a first-page article URL to its print-view URL.

        A URL ending in ``/ar/1`` is rewritten to end in ``/ar/pr``.

        :return: the rewritten URL, or None when ``url`` does not end in
                 ``/ar/1`` (callers must skip such articles)
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return None

    def hbr_parse_toc(self, soup):
        """Build the feed list from an issue's table-of-contents page.

        ``h4`` elements start a new section and ``li`` elements contribute
        articles to the current section.

        :param soup: parsed table-of-contents page
        :return: list of ``(section_title, articles)`` tuples, where each
                 article is a dict with the keys ``title``, ``url``,
                 ``description`` and ``date``
        :raises ValueError: if the page has no ``issueFeaturesContent`` element
        """
        container = soup.find(id='issueFeaturesContent')
        if container is None:
            raise ValueError(
                'Could not find the table of contents (issueFeaturesContent)')

        feeds = []
        current_section = None
        articles = []
        for x in container.findAll(['li', 'h4']):
            if x.name == 'h4':
                # NOTE(review): exact comparison assumes the parser returns
                # the class attribute as a plain string - confirm.
                if x.get('class', None) == 'basic':
                    continue
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
                continue

            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a['href']
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = self.INDEX + url
            print_url = self.map_url(url)
            if not print_url:
                # Without this check a None URL would be queued for download.
                self.log('\t\tSkipping article without a print URL:', url)
                continue
            p = x.find('p', attrs={'class': 'author'})
            desc = ''
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', print_url)
            self.log('\t\t\t', desc)
            articles.append({'title': title, 'url': print_url,
                             'description': desc, 'date': ''})

        # Flush the last section; no further h4 follows to trigger it.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Locate the current issue and return its feeds.

        Also sets ``cover_url`` and ``timefmt`` from the last entry of the
        magazine archive carousel when that entry is present.

        :return: list of ``(section_title, articles)`` tuples
        :raises ValueError: if the link to the issue cannot be found
        """
        soup0 = self.index_to_soup(self.INDEX + '/magazine')

        # Cover and issue date are best-effort: a missing carousel must not
        # prevent the articles from being downloaded.
        carousel = soup0.find('ul', attrs={'id': 'magazineArchiveCarousel'})
        entries = carousel.findAll('li') if carousel is not None else []
        img = entries[-1].img if entries else None
        if img is not None:
            cover = img.get('src')
            if cover:
                self.cover_url = cover
            dates = self.tag_to_string(img.get('alt'))
            if dates:
                self.timefmt = u' [%s]' % dates
        else:
            self.log('Could not find the cover image and issue date')

        issue = soup0.find('div', attrs={'class': 'magazine_page'})
        link = issue.a if issue is not None else None
        if link is None or not link.get('href'):
            raise ValueError('Could not find the link to the current issue')
        soup = self.index_to_soup(self.INDEX + link['href'])
        return self.hbr_parse_toc(soup)

# Last edited by rainrdx; 04-04-2013 at 02:30 PM.
Advert | |
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Harvard Business Review DISABLED? | besianm | Recipes | 3 | 09-12-2012 04:28 PM |
harvard business review (hbr) disabled? | oddboy | Recipes | 3 | 09-10-2012 03:24 PM |
Previous Issues HBR, Harvard Business Review | heuristics9 | Recipes | 0 | 12-19-2010 10:38 AM |
iPad Harvard Business Review: "Why I Returned My iPad" | kjk | Apple Devices | 17 | 06-18-2010 01:52 AM |
Harvard Business School:Close of a Chapter in Publishing | Sydney's Mom | News | 1 | 04-19-2010 10:54 PM |