Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 01-01-2013, 08:10 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Foreign Affairs recipe update

This is my update to the broken Foreign Affairs recipe. It now fully works (and looks nice). I have set the recipe to require a subscription, given that free content on FA is very limited nowadays.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.ptempfile import PersistentTemporaryFile

class ForeignAffairsRecipe(BasicNewsRecipe):
    '''Calibre news recipe for Foreign Affairs magazine (subscription required).

    There are three modifications over the stock recipe:
    1) fetch the issue cover
    2) toggle ignore premium articles
    3) extract proper section names, i.e. "Comments", "Essay"

    by Chen Wei weichen302@gmx.com, 2012-02-05'''

    __license__ = 'GPL v3'
    __author__ = 'Rick Shang, kwetal'
    language = 'en'
    version = 1.01

    title = u'Foreign Affairs (Subscription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'

    # Strip the <base> tag so relative links resolve against the rewritten URLs.
    remove_tags = [dict(name='base')]

    # Keep only the print-view article body: everything between the print
    # title and the print footer.
    remove_tags_before = dict(name='h1', attrs={'class': 'print-title'})
    remove_tags_after = dict(name='div', attrs={'class': 'print-footer'})

    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                div.print-footer {font-size: x-small; color: #696969;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    # Temp files created by get_obfuscated_article; kept alive for the run.
    temp_files = []
    articles_are_obfuscated = True

    def get_cover_url(self):
        """Return the absolute URL of the current issue's cover image."""
        soup = self.index_to_soup(self.FRONTPAGE)
        div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
        img_url = div.find('img')['src']
        return self.INDEX + img_url

    def get_obfuscated_article(self, url):
        """Follow the article's printer-friendly link and cache the HTML in a temp file.

        Returns the temp file's path for calibre to fetch instead of `url`.
        """
        br = self.get_browser()
        br.open(url)

        # The /print/<nid> page carries the full article body in one page.
        response = br.follow_link(url_regex=r'/print/[0-9]+', nr=0)
        html = response.read()

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(html)
        self.temp_files[-1].close()

        return self.temp_files[-1].name

    def _article_from_div(self, div, prefix=''):
        """Build a feed article dict from a 'views-field-title' div.

        `prefix` is the book-review sub-section title; when given, the book's
        original author is appended to the article title. Returns None when
        the div carries no link (not an article entry).
        """
        if div.find('a') is None:
            return None
        title = self.tag_to_string(div.span.a)
        if prefix:
            originalauthor = self.tag_to_string(
                div.findNext('div', attrs={'class': 'views-field-field-article-book-nid'}).div.a)
            title = prefix + ': ' + title + ' by ' + originalauthor
        url = self.INDEX + div.span.a['href']
        atr = div.findNext('div', attrs={'class': 'views-field-field-article-display-authors-value'})
        author = self.tag_to_string(atr.span.a) if atr is not None else ''
        desc = div.findNext('span', attrs={'class': 'views-field-field-article-summary-value'})
        description = self.tag_to_string(desc.div.p) if desc is not None else ''
        return {'title': title, 'date': None, 'url': url,
                'description': description, 'author': author}

    def parse_index(self):
        """Build the [(section, [articles])] index from the magazine front page."""
        answer = []
        soup = self.index_to_soup(self.FRONTPAGE)
        # The issue date is the part of the page <title> before the first " | ".
        date = re.split(r'\s\|\s', self.tag_to_string(soup.head.title.string))[0]
        self.timefmt = u' [%s]' % date

        for sec in soup.findAll('div', attrs={'class': 'panel-pane'}):
            articles = []
            section = self.tag_to_string(sec.find('h2'))
            if 'Books' in section:
                # Book reviews are grouped into sub-sections, each listed on
                # its own page that must be fetched separately.
                reviewsection = sec.find('div', attrs={'class': 'item-list'})
                for subsection in reviewsection.findAll('div'):
                    subsectiontitle = self.tag_to_string(subsection.span.a)
                    subsectionurl = self.INDEX + subsection.span.a['href']
                    subsoup = self.index_to_soup(subsectionurl)
                    for div in subsoup.findAll('div', attrs={'class': 'views-field-title'}):
                        article = self._article_from_div(div, prefix=subsectiontitle)
                        if article is not None:
                            articles.append(article)
            else:
                for div in sec.findAll('div', attrs={'class': 'views-field-title'}):
                    article = self._article_from_div(div)
                    if article is not None:
                        articles.append(article)
            if articles:
                answer.append((section, articles))
        return answer

    def preprocess_html(self, soup):
        """Make every image URL absolute so images survive offline conversion."""
        for img in soup.findAll('img', attrs={'src': True}):
            if not img['src'].startswith('http://'):
                img['src'] = self.INDEX + img['src']
        return soup

    def get_browser(self):
        """Return a browser, logged in when credentials were supplied."""
        # BUGFIX: the unbound call BasicNewsRecipe.get_browser() raised a
        # TypeError; `self` must be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def cleanup(self):
        """Log out of the site when the run finishes."""
        self.browser.open('http://www.foreignaffairs.com/logout?destination=user%3Fop=lo')
rainrdx is offline   Reply With Quote
Old 03-25-2013, 06:06 PM   #2
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy bluerainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Bug Fixes
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.ptempfile import PersistentTemporaryFile

class ForeignAffairsRecipe(BasicNewsRecipe):
    '''Calibre news recipe for Foreign Affairs magazine (subscription required).

    There are three modifications over the stock recipe:
    1) fetch the issue cover
    2) toggle ignore premium articles
    3) extract proper section names, i.e. "Comments", "Essay"

    by Chen Wei weichen302@gmx.com, 2012-02-05'''

    __license__ = 'GPL v3'
    __author__ = 'Rick Shang, kwetal'
    language = 'en'
    version = 1.01

    title = u'Foreign Affairs (Subscription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'

    # Strip the <base> tag so relative links resolve against the rewritten URLs.
    remove_tags = [dict(name='base')]

    # Keep only the print-view article body: everything between the print
    # title and the print footer.
    remove_tags_before = dict(name='h1', attrs={'class': 'print-title'})
    remove_tags_after = dict(name='div', attrs={'class': 'print-footer'})

    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                div.print-footer {font-size: x-small; color: #696969;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    # Unused in this revision (article URLs resolve directly to the print
    # view), kept for compatibility with the earlier obfuscated variant.
    temp_files = []

    def get_cover_url(self):
        """Return the absolute URL of the current issue's cover image."""
        soup = self.index_to_soup(self.FRONTPAGE)
        div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
        img_url = div.find('img')['src']
        return self.INDEX + img_url

    def _article_from_div(self, div, prefix=''):
        """Build a feed article dict from a 'views-field-title' div.

        `prefix` is the book-review sub-section title; when given, the book's
        original author is appended to the article title. Returns None when
        the div carries no link (not an article entry).
        """
        if div.find('a') is None:
            return None
        title = self.tag_to_string(div.span.a)
        if prefix:
            originalauthor = self.tag_to_string(
                div.findNext('div', attrs={'class': 'views-field-field-article-book-nid'}).div.a)
            title = prefix + ': ' + title + ' by ' + originalauthor
        # Resolve the printer-friendly URL up front so calibre downloads the
        # full single-page article instead of the paginated web view.
        url = self.INDEX + self.index_to_soup(
            self.INDEX + div.span.a['href']).find(
                'a', attrs={'class': 'fa_addthis_print'})['href']
        atr = div.findNext('div', attrs={'class': 'views-field-field-article-display-authors-value'})
        author = self.tag_to_string(atr.span) if atr is not None else ''
        desc = div.findNext('span', attrs={'class': 'views-field-field-article-summary-value'})
        description = self.tag_to_string(desc.div.p) if desc is not None else ''
        return {'title': title, 'date': None, 'url': url,
                'description': description, 'author': author}

    def parse_index(self):
        """Build the [(section, [articles])] index from the magazine front page."""
        answer = []
        soup = self.index_to_soup(self.FRONTPAGE)
        # The issue date is the part of the page <title> before the first " | ".
        date = re.split(r'\s\|\s', self.tag_to_string(soup.head.title.string))[0]
        self.timefmt = u' [%s]' % date

        for sec in soup.findAll('div', attrs={'class': 'panel-pane'}):
            articles = []
            section = self.tag_to_string(sec.find('h2'))
            if 'Books' in section:
                # Book reviews are grouped into sub-sections, each listed on
                # its own page that must be fetched separately.
                reviewsection = sec.find('div', attrs={'class': 'item-list'})
                for subsection in reviewsection.findAll('div'):
                    subsectiontitle = self.tag_to_string(subsection.span.a)
                    subsectionurl = self.INDEX + subsection.span.a['href']
                    subsoup = self.index_to_soup(subsectionurl)
                    for div in subsoup.findAll('div', attrs={'class': 'views-field-title'}):
                        article = self._article_from_div(div, prefix=subsectiontitle)
                        if article is not None:
                            articles.append(article)
            else:
                for div in sec.findAll('div', attrs={'class': 'views-field-title'}):
                    article = self._article_from_div(div)
                    if article is not None:
                        articles.append(article)
            if articles:
                answer.append((section, articles))
        return answer

    def preprocess_html(self, soup):
        """Make every image URL absolute so images survive offline conversion."""
        for img in soup.findAll('img', attrs={'src': True}):
            if not img['src'].startswith('http://'):
                img['src'] = self.INDEX + img['src']
        return soup

    def get_browser(self):
        """Return a browser, logged in when credentials were supplied."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def cleanup(self):
        """Log out of the site when the run finishes."""
        self.browser.open('http://www.foreignaffairs.com/logout?destination=user%3Fop=lo')
rainrdx is offline   Reply With Quote
Advert
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Foreign Affairs-Free tdonline Recipes 2 03-11-2012 09:51 PM
Problem: Recipe for Foreign Affairs not fetching premium articles besianm Recipes 1 03-07-2012 04:41 AM
minor modified Foreign Affairs receipe forceps Recipes 3 03-06-2012 10:43 PM
Foreign Affairs subscription - I don't understand the pricing. adriatikfan Amazon Kindle 9 11-08-2009 12:05 AM
Foreign Affairs Replacing Previous Issue Spankypoo Amazon Kindle 6 07-08-2009 12:31 PM


All times are GMT -4. The time now is 11:23 AM.


MobileRead.com is a privately owned, operated and funded community.