View Single Post
Old 01-01-2013, 08:10 PM   #1
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Foreign Affairs recipe update

This is my update to the broken Foreign Affairs recipe. It now works fully (and nicely). I have set the recipe to require a subscription, given that free content on FA is very limited nowadays.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.ptempfile import PersistentTemporaryFile

class ForeignAffairsRecipe(BasicNewsRecipe):
    '''Calibre recipe for the subscriber edition of Foreign Affairs.

    The issue index is scraped from FRONTPAGE; every article is downloaded
    through its "/print/<id>" page using a logged-in browser.

    Builds on the modifications by Chen Wei weichen302@gmx.com, 2012-02-05:
    1) fetch issue cover
    2) toggle ignore premium articles
    3) extract proper section names, ie. "Comments", "Essay"
    '''

    __license__ = 'GPL v3'
    __author__ = 'Rick Shang, kwetal'
    language = 'en'
    version = 1.01

    # NOTE(review): 'Subcription' looks like a typo for 'Subscription'; left
    # unchanged because the title is user-visible -- confirm before renaming.
    title = u'Foreign Affairs (Subcription)'
    publisher = u'Council on Foreign Relations'
    category = u'USA, Foreign Affairs'
    description = u'The leading forum for serious discussion of American foreign policy and international affairs.'

    no_stylesheets = True
    remove_javascript = True
    needs_subscription = True

    # Site root (prefixed to site-relative links) and the current-issue page.
    INDEX = 'http://www.foreignaffairs.com'
    FRONTPAGE = 'http://www.foreignaffairs.com/magazine'

    remove_tags = [dict(name='base')]

    # Keep only what lies between the print title and the print footer.
    remove_tags_before = dict(name='h1', attrs={'class': 'print-title'})
    remove_tags_after = dict(name='div', attrs={'class': 'print-footer'})

    extra_css = '''
                body{font-family:verdana,arial,helvetica,geneva,sans-serif;}
                div.print-footer {font-size: x-small; color: #696969;}
                '''

    conversion_options = {'comments': description, 'tags': category, 'language': 'en',
                          'publisher': publisher}

    # Temporary files created by get_obfuscated_article(). The list lives on
    # the class, so it is shared by every instance of the recipe.
    temp_files = []
    articles_are_obfuscated = True

    def _absolute_url(self, url):
        '''Return url unchanged if it is already an http(s) URL, otherwise
        prefix it with INDEX.'''
        if url.startswith(('http://', 'https://')):
            return url
        return self.INDEX + url

    def get_cover_url(self):
        '''Return the URL of the issue cover shown on FRONTPAGE.

        Returns None when the cover element or its image cannot be found,
        instead of raising AttributeError.
        '''
        soup = self.index_to_soup(self.FRONTPAGE)
        cover_div = soup.find('div', attrs={'class': 'inthemag-issuebuy-cover'})
        img = cover_div.find('img') if cover_div is not None else None
        if img is None or not img.get('src'):
            return None
        return self._absolute_url(img['src'])

    def get_obfuscated_article(self, url):
        '''Download the print version of the article at url.

        Opens the article page, follows its first "/print/<id>" link and
        stores the response in a persistent temporary file.

        Returns the path of that temporary file.
        '''
        # get_browser() logs in again whenever credentials are configured.
        br = self.get_browser()
        br.open(url)

        response = br.follow_link(url_regex=r'/print/[0-9]+', nr=0)
        html = response.read()

        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(html)
        finally:
            tmp.close()

        return tmp.name

    def _parse_articles(self, container, book_section=None):
        '''Collect the article entries found below container.

        container is a soup node that is searched for 'views-field-title'
        divs. When book_section is given, the entries are book reviews and
        each title becomes "<book_section>: <title> by <book author>".

        Returns a list of article dicts with the keys title, date, url,
        description and author.
        '''
        articles = []
        for div in container.findAll('div', attrs={'class': 'views-field-title'}):
            if div.find('a') is None:
                continue

            link = div.span.a
            title = self.tag_to_string(link)
            if book_section is not None:
                book = div.findNext('div', attrs={'class': 'views-field-field-article-book-nid'})
                originalauthor = self.tag_to_string(book.div.a)
                title = book_section + ': ' + title + ' by ' + originalauthor

            atr = div.findNext('div', attrs={'class': 'views-field-field-article-display-authors-value'})
            author = self.tag_to_string(atr.span.a) if atr is not None else ''

            desc = div.findNext('span', attrs={'class': 'views-field-field-article-summary-value'})
            summary = self.tag_to_string(desc.div.p) if desc is not None else ''

            articles.append({'title': title, 'date': None,
                             'url': self._absolute_url(link['href']),
                             'description': summary, 'author': author})
        return articles

    def _parse_book_reviews(self, sec):
        '''Collect the reviews of every subsection linked from a Books pane.

        Each subsection page is fetched separately. Divs that carry no
        span/link are skipped rather than raising AttributeError.
        '''
        articles = []
        reviewsection = sec.find('div', attrs={'class': 'item-list'})
        if reviewsection is None:
            return articles

        for subsection in reviewsection.findAll('div'):
            span = subsection.span
            link = span.a if span is not None else None
            if link is None:
                continue
            subsectiontitle = self.tag_to_string(link)
            subsection_soup = self.index_to_soup(self._absolute_url(link['href']))
            articles.extend(self._parse_articles(subsection_soup, book_section=subsectiontitle))
        return articles

    def parse_index(self):
        '''Build the issue index from FRONTPAGE.

        Also sets self.timefmt from the page title.

        Returns a list of (section title, list of article dicts) tuples;
        sections without articles are left out.
        '''
        soup = self.index_to_soup(self.FRONTPAGE)

        # Whatever precedes the first " | " in the page <title> is used as
        # the issue date.
        date = re.split(r'\s\|\s', self.tag_to_string(soup.head.title.string))[0]
        self.timefmt = u' [%s]' % date

        answer = []
        for sec in soup.findAll('div', attrs={'class': 'panel-pane'}):
            section = self.tag_to_string(sec.find('h2'))
            if 'Books' in section:
                articles = self._parse_book_reviews(sec)
            else:
                articles = self._parse_articles(sec)
            if articles:
                answer.append((section, articles))
        return answer

    def preprocess_html(self, soup):
        '''Rewrite site-relative image sources to absolute URLs.

        Sources that already start with http:// or https:// are left alone.
        '''
        for img in soup.findAll('img', attrs={'src': True}):
            img['src'] = self._absolute_url(img['src'])

        return soup

    def get_browser(self, *args, **kwargs):
        '''Return a browser, logged in when username and password are set.'''
        # get_browser is called on the base class explicitly, so the
        # instance has to be passed along.
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        if self.username is not None and self.password is not None:
            br.open('https://www.foreignaffairs.com/user?destination=user%3Fop%3Dlo')
            # The login form is the second form on the page.
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def cleanup(self):
        '''Log out of the site once the download has finished.'''
        self.browser.open('http://www.foreignaffairs.com/logout?destination=user%3Fop=lo')
rainrdx is offline   Reply With Quote