View Single Post
Old 03-24-2012, 07:43 AM   #5
PatStapleton
Member
PatStapleton began at the beginning.
 
Posts: 22
Karma: 10
Join Date: Nov 2011
Location: Australia
Device: Kindle 4
Lightbulb Updated version

Hi everybody, the MJA site has changed since my last post, so I've rewritten this recipe to make it work again.

Like last time, I've written it to log in through my institution's proxy. For your own use, you should modify or remove the #LOGIN section (the get_browser method), and the line:

"url = 'http://www.mja.com.au.ipacez.nd.edu.au' + section.a.get('href')"

should be changed to:

"url = 'http://www.mja.com.au' + section.a.get('href')"

Spoiler:
Code:
__license__   = 'GPL v3'
__copyright__ = '2012, Pat Stapleton <pat.stapleton at gmail.com>'

from calibre.web.feeds import Feed
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from dateutil import parser
import datetime
import time
import re

class MJA(BasicNewsRecipe):
    """Calibre recipe for the current issue of The Medical Journal of Australia.

    The MJA home page is scraped for the cover image and for the link to the
    current issue's contents page.  That contents page is then split into one
    feed per ``<h3>`` section heading.  Login and article URLs go through an
    institutional proxy: see the ``#LOGIN`` section and the ``url = ...`` line
    in ``parse_index`` if you need to adapt the recipe to another login.
    """

    title = u'MJA'
    description = u'The Medical Journal of Australia'
    category = u'medical, science, health, Australia'
    __author__ = 'Pat Stapleton'
    oldest_article = 14
    max_articles_per_feed = 1000
    auto_cleanup = True
    needs_subscription = True
    language = 'en_AU'
    remove_empty_feeds = True
    publication_type = 'journal'
    publisher = u'Australian Medical Association'
    #masthead_url = 'http://www.mja.com.au/MJAnav.gif'
    issue = None

    # Public (non-proxied) site root, used for the home page, the issue
    # contents page and the cover image.
    SITE_URL = 'http://www.mja.com.au'

    #LOGIN
    def get_browser(self):
        """Return a browser that has logged in through the proxy.

        The first form on the proxy login page is filled in with
        ``self.username`` and ``self.password`` and submitted.  When either
        credential is missing the plain, unauthenticated browser is returned.
        """
        # Current calibre defines get_browser as an instance method, so the
        # recipe has to be passed explicitly when calling the base class.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://ipacez.nd.edu.au/login?url=http://www.mja.com.au')
            br.select_form(nr=0)
            br['user'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def _fetch_soup(self, url):
        """Download ``url`` and parse the raw page with BeautifulSoup."""
        return BeautifulSoup(self.index_to_soup(url, raw=True))

    @staticmethod
    def _class_tokens(tag):
        """Return the ``class`` attribute of ``tag`` as a list of class names.

        Depending on the BeautifulSoup version the attribute is either one
        whitespace-separated string or already a list; both are handled.
        """
        value = tag.get('class') or ''
        if hasattr(value, 'split'):
            return value.split()
        return list(value)

    #Get cover image
    def get_cover_url(self):
        """Return the URL of the cover image found on the home page.

        Every ``<img>`` whose ``src`` contains ``/cover`` is considered and
        the last one wins.  An empty string is returned when none is found.
        """
        cover_url = ''
        soup = self._fetch_soup(self.SITE_URL)
        for img in soup.findAll('img', src=True):
            src = img['src']
            if '/cover' not in src:
                continue
            if src.startswith(('http://', 'https://')):
                # Already absolute: prefixing the site root would corrupt it.
                cover_url = src
            else:
                # Avoid a doubled slash when src is site-relative ("/sites/...").
                cover_url = self.SITE_URL + '/' + src.lstrip('/')
        return cover_url

    def parse_index(self):
        """Build the feed list from the current issue's contents page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where each article
            is a dict with the keys ``title``, ``url``, ``date``,
            ``description`` and ``content``.

        Raises:
            ValueError: if the home page has no current-issue link or the
                contents page has no ``<h3>`` section headings.
        """
        #Get link to contents of current issue
        home = self._fetch_soup(self.SITE_URL)
        issue_box = home.find('div', 'homepage-current-issue-inner')
        issue_link = None
        if issue_box is not None:
            # Search for the anchor rather than trusting contents[0], which
            # is a text node whenever the markup starts with whitespace.
            issue_link = issue_box.find('a', href=True)
        if issue_link is None:
            raise ValueError(
                'Could not find the current issue link on the MJA home page')

        soup = self._fetch_soup(self.SITE_URL + issue_link['href'])

        section_titles = [self.tag_to_string(h) for h in soup.findAll('h3')]
        if not section_titles:
            raise ValueError(
                'Could not find any section headings on the MJA contents page')

        feeds = []
        articles = []
        section_title = section_titles[0]
        next_title = 0  # index of the heading the next new section will use
        for section in soup.findAll(attrs={'class': lambda x: x and 'views-row' in x}):
            # A row carrying the exact class "views-row-1" starts a new
            # section.  Comparing whole class names matters: a substring test
            # would also fire on "views-row-10" .. "views-row-19" and split
            # any section that has ten or more articles.
            if 'views-row-1' in self._class_tokens(section):
                if articles:
                    feeds.append((section_title, articles))
                    articles = []
                if next_title >= len(section_titles):
                    # More sections than headings: nothing left to label.
                    break
                section_title = section_titles[next_title]
                next_title += 1

            if section.a is None or not section.a.get('href'):
                # A row without a link cannot become an article.
                continue

            if section.p is not None:
                art_title = self.tag_to_string(section.p)
            else:
                art_title = self.tag_to_string(section.a)
            # Articles are fetched through the institutional proxy host.
            url = 'http://www.mja.com.au.ipacez.nd.edu.au' + section.a.get('href')
            articles.append({
                'title': art_title,
                'url': url,
                'date': '',
                'description': '',
                'content': '',
            })

        # Flush the section that was still being collected when rows ran out.
        if articles:
            feeds.append((section_title, articles))

        return feeds


Enjoy!

-Pat
PatStapleton is offline   Reply With Quote