Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 04-04-2013, 01:05 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Harvard Business Review Update

Fixed cover image and the missing last section.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from datetime import date, timedelta

class HBR(BasicNewsRecipe):
    """Calibre news recipe for the Harvard Business Review magazine.

    Logs in through a JavaScript-driven browser session, follows the issue
    link found on http://hbr.org/magazine and builds one feed per section
    of that issue's table of contents.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    # Default date suffix; parse_index() overwrites it with the issue label.
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True
    # Left commented out: this recipe logs in via javascript_login() instead.
    # recipe_disabled = ('hbr.org has started requiring the use of javascript'
    #         ' to log into their website. This is unsupported in calibre, so'
    #         ' this recipe has been disabled. If you would like to see '
    #         ' HBR supported in calibre, contact hbr.org and ask them'
    #         ' to provide a javascript free login method.')

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'

    # Site root, prepended to site-relative links.
    INDEX = 'http://hbr.org'

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
        'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
        'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
        'mailingListTout', 'partnerCenter', 'pageFooter',
        'superNavHeadContainer', 'hbrDisqus',
        'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
        dict(name='iframe')]
    extra_css = '''
                a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
                .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
                h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
                h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;  }
                #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
                #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
                '''
    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Fill in and submit the hbr.org sign-in form using browser *br*.

        :param br: browser object supplied by the caller; it must provide
            ``visit``, ``click``, ``select_form``, ``submit`` and
            ``run_for_a_time``.
        :param username: value written to the ``signin-form:username`` field.
        :param password: value written to the ``signin-form:password`` field.
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberate best effort: carry on with whatever has loaded.
            pass
        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
        f = br.select_form('#signin-form')
        f['signin-form:username'] = username
        f['signin-form:password'] = password
        br.submit(wait_for_load=False)
        # NOTE(review): presumably lets the browser run for 30 seconds so the
        # login can complete -- confirm the unit against the jsbrowser API.
        br.run_for_a_time(30)

    def map_url(self, url):
        """Return the ``/ar/pr`` variant of an ``/ar/1`` article URL.

        URLs that do not end in ``/ar/1`` are returned unchanged; falling
        off the end of the function used to hand ``None`` back to the caller.
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return url

    def hbr_parse_toc(self, soup):
        """Collect the sections and articles listed in an issue page.

        :param soup: parsed issue page.
        :return: list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``url``, ``description`` and
            ``date`` keys. Sections without articles are omitted. An empty
            list is returned when the ``issueFeaturesContent`` element is
            missing.
        """
        container = soup.find(id='issueFeaturesContent')
        if container is None:
            self.log('\tCould not find the issue table of contents')
            return []

        feeds = []
        current_section = None
        articles = []
        for x in container.findAll(['li', 'h4']):
            if x.name == 'h4':
                # NOTE(review): this equality only matches when the parser
                # returns ``class`` as a plain string -- confirm against the
                # BeautifulSoup version calibre ships.
                if x.get('class', None) == 'basic':
                    continue
                # A new heading closes the section collected so far.
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
                continue

            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a['href']
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = self.INDEX + url
            url = self.map_url(url)
            p = x.find('p', attrs={'class': 'author'})
            desc = ''
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            self.log('\t\t\t', desc)

            articles.append({'title': title, 'url': url, 'description': desc,
                'date': ''})

        # Flush the last section; no later heading exists to trigger it.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Locate the issue linked from the magazine page and parse it.

        Sets ``self.cover_url`` and ``self.timefmt`` from the last entry of
        the ``magazineArchiveCarousel`` list when that entry has an image.

        :return: the feeds list produced by :meth:`hbr_parse_toc`.
        :raises ValueError: if the magazine page has no issue link.
        """
        soup0 = self.index_to_soup('http://hbr.org/magazine')

        # Find date & cover. Both are optional extras, so a missing carousel
        # must not abort the download.
        carousel = soup0.find('ul', attrs={'id': 'magazineArchiveCarousel'})
        entries = carousel.findAll('li') if carousel is not None else []
        img = entries[-1].img if entries else None
        if img is not None:
            cover = img.get('src')
            if cover:
                self.cover_url = cover
            dates = img.get('alt')
            if dates:
                self.timefmt = u' [%s]' % self.tag_to_string(dates)

        issue = soup0.find('div', attrs={'class': 'magazine_page'})
        link = issue.find('a', href=True) if issue is not None else None
        if link is None:
            raise ValueError(
                'Could not find the issue link on http://hbr.org/magazine')
        soup = self.index_to_soup(self.INDEX + link['href'])

        return self.hbr_parse_toc(soup)
rainrdx is offline   Reply With Quote
Old 04-04-2013, 02:04 PM   #2
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Updated the remove_tags list to clean up the output.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from datetime import date, timedelta

class HBR(BasicNewsRecipe):
    """Calibre news recipe for the Harvard Business Review magazine.

    Logs in through a JavaScript-driven browser session, follows the issue
    link found on http://hbr.org/magazine and builds one feed per section
    of that issue's table of contents.
    """

    title = 'Harvard Business Review'
    description = 'To subscribe go to http://hbr.harvardbusiness.org'
    needs_subscription = True
    __author__ = 'Kovid Goyal and Sujata Raman'
    # Default date suffix; parse_index() overwrites it with the issue label.
    timefmt = ' [%B %Y]'
    language = 'en'
    no_stylesheets = True
    # Left commented out: this recipe logs in via javascript_login() instead.
    # recipe_disabled = ('hbr.org has started requiring the use of javascript'
    #         ' to log into their website. This is unsupported in calibre, so'
    #         ' this recipe has been disabled. If you would like to see '
    #         ' HBR supported in calibre, contact hbr.org and ask them'
    #         ' to provide a javascript free login method.')

    LOGIN_URL = 'https://hbr.org/login?request_url=/'
    LOGOUT_URL = 'https://hbr.org/logout?request_url=/'

    # Site root, prepended to site-relative links.
    INDEX = 'http://hbr.org'

    keep_only_tags = [dict(name='div', id='pageContainer')]
    remove_tags = [dict(id=['mastheadContainer', 'magazineHeadline',
        'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
        'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
        'mailingListTout', 'partnerCenter', 'pageFooter',
        'superNavHeadContainer', 'hbrDisqus', 'article-toolbox',
        'articleToolbarTop', 'articleToolbarBottom', 'articleToolbarRD']),
        dict(name='iframe')]
    extra_css = '''
                a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
                .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
                h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
                h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small;  }
                #articleAuthors{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
                #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
                '''
    use_javascript_to_login = True

    def javascript_login(self, br, username, password):
        """Fill in and submit the hbr.org sign-in form using browser *br*.

        :param br: browser object supplied by the caller; it must provide
            ``visit``, ``click``, ``select_form``, ``submit`` and
            ``run_for_a_time``.
        :param username: value written to the ``signin-form:username`` field.
        :param password: value written to the ``signin-form:password`` field.
        """
        from calibre.web.jsbrowser.browser import Timeout
        try:
            br.visit(self.LOGIN_URL, timeout=20)
        except Timeout:
            # Deliberate best effort: carry on with whatever has loaded.
            pass
        br.click('#accordion div[tabindex="0"]', wait_for_load=False)
        f = br.select_form('#signin-form')
        f['signin-form:username'] = username
        f['signin-form:password'] = password
        br.submit(wait_for_load=False)
        # NOTE(review): presumably lets the browser run for 30 seconds so the
        # login can complete -- confirm the unit against the jsbrowser API.
        br.run_for_a_time(30)

    def map_url(self, url):
        """Return the ``/ar/pr`` variant of an ``/ar/1`` article URL.

        URLs that do not end in ``/ar/1`` are returned unchanged; falling
        off the end of the function used to hand ``None`` back to the caller.
        """
        if url.endswith('/ar/1'):
            return url[:-1] + 'pr'
        return url

    def hbr_parse_toc(self, soup):
        """Collect the sections and articles listed in an issue page.

        :param soup: parsed issue page.
        :return: list of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``url``, ``description`` and
            ``date`` keys. Sections without articles are omitted. An empty
            list is returned when the ``issueFeaturesContent`` element is
            missing.
        """
        container = soup.find(id='issueFeaturesContent')
        if container is None:
            self.log('\tCould not find the issue table of contents')
            return []

        feeds = []
        current_section = None
        articles = []
        for x in container.findAll(['li', 'h4']):
            if x.name == 'h4':
                # NOTE(review): this equality only matches when the parser
                # returns ``class`` as a plain string -- confirm against the
                # BeautifulSoup version calibre ships.
                if x.get('class', None) == 'basic':
                    continue
                # A new heading closes the section collected so far.
                if current_section is not None and articles:
                    feeds.append((current_section, articles))
                current_section = self.tag_to_string(x).capitalize()
                articles = []
                self.log('\tFound section:', current_section)
                continue

            a = x.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            url = a['href']
            if '/ar/' not in url:
                continue
            if url.startswith('/'):
                url = self.INDEX + url
            url = self.map_url(url)
            p = x.find('p', attrs={'class': 'author'})
            desc = ''
            if p is not None:
                desc = self.tag_to_string(p)
            self.log('\t\tFound article:', title)
            self.log('\t\t\t', url)
            self.log('\t\t\t', desc)

            articles.append({'title': title, 'url': url, 'description': desc,
                'date': ''})

        # Flush the last section; no later heading exists to trigger it.
        if current_section is not None and articles:
            feeds.append((current_section, articles))
        return feeds

    def parse_index(self):
        """Locate the issue linked from the magazine page and parse it.

        Sets ``self.cover_url`` and ``self.timefmt`` from the last entry of
        the ``magazineArchiveCarousel`` list when that entry has an image.

        :return: the feeds list produced by :meth:`hbr_parse_toc`.
        :raises ValueError: if the magazine page has no issue link.
        """
        soup0 = self.index_to_soup('http://hbr.org/magazine')

        # Find date & cover. Both are optional extras, so a missing carousel
        # must not abort the download.
        carousel = soup0.find('ul', attrs={'id': 'magazineArchiveCarousel'})
        entries = carousel.findAll('li') if carousel is not None else []
        img = entries[-1].img if entries else None
        if img is not None:
            cover = img.get('src')
            if cover:
                self.cover_url = cover
            dates = img.get('alt')
            if dates:
                self.timefmt = u' [%s]' % self.tag_to_string(dates)

        issue = soup0.find('div', attrs={'class': 'magazine_page'})
        link = issue.find('a', href=True) if issue is not None else None
        if link is None:
            raise ValueError(
                'Could not find the issue link on http://hbr.org/magazine')
        soup = self.index_to_soup(self.INDEX + link['href'])

        return self.hbr_parse_toc(soup)

Last edited by rainrdx; 04-04-2013 at 02:30 PM.
rainrdx is offline   Reply With Quote
Advert
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Harvard Business Review DISABLED? besianm Recipes 3 09-12-2012 04:28 PM
harvard business review (hbr) disabled? oddboy Recipes 3 09-10-2012 03:24 PM
Previous Issues HBR, Harvard Business Review heuristics9 Recipes 0 12-19-2010 10:38 AM
iPad Harvard Business Review: "Why I Returned My iPad" kjk Apple Devices 17 06-18-2010 01:52 AM
Harvard Business School:Close of a Chapter in Publishing Sydney's Mom News 1 04-19-2010 10:54 PM


All times are GMT -4. The time now is 04:27 AM.


MobileRead.com is a privately owned, operated and funded community.