View Single Post
Old 11-18-2011, 09:09 PM   #1
PatStapleton
Member
PatStapleton began at the beginning.
 
Posts: 22
Karma: 10
Join Date: Nov 2011
Location: Australia
Device: Kindle 4
Recipe for Medical Journal of Australia

I wasn't sure where to post custom recipes, so I'm posting here.

I wrote a recipe for the Medical Journal of Australia; hopefully it will be useful to somebody.

Note that I wrote it to use my institution's login so you may need to adjust this for your own use. The parts to adjust are the "#LOGIN" section, and the "#fix url to pickup institution login by appending" line.

Spoiler:

Code:
__license__   = 'GPL v3'
__copyright__ = '2011, Pat Stapleton <pat.stapleton at gmail.com>'

from calibre.web.feeds import Feed
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

class MJA(BasicNewsRecipe):
    """Calibre news recipe for the Medical Journal of Australia (MJA).

    Articles are fetched from the journal's FeedBurner feed through an
    institutional proxy login, then regrouped into one feed per journal
    section using the '[SECTION] Article Title' prefix found in each title.
    """

    title          = u'MJA'
    description = u'The Medical Journal of Australia'
    category = u'medical, science, health, Australia'
    __author__            = 'Pat Stapleton'
    oldest_article = 14
    max_articles_per_feed = 100
    auto_cleanup = True
    needs_subscription = True
    language              = 'en_AU'
    remove_empty_feeds    = True
    publication_type      = 'journal'
    publisher            = u'Australian Medical Association'
    #masthead_url = 'http://www.mja.com.au/MJAnav.gif'

    feeds          = [(u'MJA', u'http://feeds.feedburner.com/TheMedicalJournalOfAustralia?format=xml')]

    #LOGIN
    def get_browser(self):
        """Return a browser that has logged in through the institution proxy.

        The first form on the proxy login page is filled with the recipe's
        ``username`` and ``password`` and submitted. The login is skipped
        when either credential is missing.
        """
        # The base-class method is called unbound, so the recipe instance
        # must be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://ipacez.nd.edu.au/login?url=http://www.mja.com.au')
            br.select_form(nr=0)
            br['user'] = self.username
            br['pass'] = self.password
            br.submit()
        return br

    def parse_feeds(self):
        """Parse the configured feeds and regroup the articles by section.

        Returns a list of ``Feed`` objects, one per section, in the order in
        which each section first appears. Every article has its section
        prefix stripped from the title and its URL rewritten to go through
        the institution proxy. Articles whose title carries no
        '[SECTION]' prefix are collected under the recipe's own title.
        """
        # Do the "official" parse_feeds first
        parsed_feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop thru all articles and compile list of sections
        sectioned_articles = {}
        # Remember first-seen order; plain dicts are unordered on older Pythons
        section_order = []
        for curfeed in parsed_feeds:
            for curarticle in curfeed.articles:
                #MJA articles have the title format '[SECTION] Article Title' so lets grab section
                # partition() splits on the first ']' only, so a later ']' in
                # the article title no longer truncates it.
                prefix, bracket, remainder = curarticle.title.partition(']')
                if bracket:
                    section_title = prefix.strip().lstrip('[').strip()
                    article_title = remainder.lstrip()
                else:
                    # No section prefix: keep the title as it is instead of
                    # failing on a missing second element.
                    section_title = self.title
                    article_title = curarticle.title

                if section_title not in sectioned_articles:
                    sectioned_articles[section_title] = []
                    section_order.append(section_title)

                #cleanup article's title (remove ugly section prefix)
                curarticle.title = article_title

                #fix url to pickup institution login by appending
                curarticle.url = curarticle.url.replace('http://www.mja.com.au', 'http://www.mja.com.au.ipacez.nd.edu.au')

                sectioned_articles[section_title].append(curarticle)

        #Create our nice list of sectioned feeds to return
        ret_feeds = []
        for section in section_order:
            new_section = Feed()
            new_section.title = section
            new_section.description = self.description
            new_section.articles = sectioned_articles[section]
            new_section.image_url = None
            ret_feeds.append(new_section)

        return ret_feeds


Enjoy!

-Pat
PatStapleton is offline   Reply With Quote