Hi everybody, the MJA site has changed since the last version, so I've rewritten this recipe to make it work again.
Like last time, I've written it to use my institution's login, so for your own use you should modify or remove the #LOGIN section, and the line:
"url = 'http://www.mja.com.au.ipacez.nd.edu.au' + section.a.get('href')"
should be changed to:
"url = 'http://www.mja.com.au' + section.a.get('href')"
Spoiler:
Code:
__license__ = 'GPL v3'
__copyright__ = '2012, Pat Stapleton <pat.stapleton at gmail.com>'
from calibre.web.feeds import Feed
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from dateutil import parser
import datetime
import time
import re
class MJA(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the current MJA issue.

    The home page is scraped for the cover image and for the link to the
    current issue's contents page; that contents page is then split into
    one feed per section heading (``<h3>``).

    As written, the recipe signs in through the login form at
    ipacez.nd.edu.au and fetches articles through that host, using the
    username/password supplied to the recipe.
    """

    title = u'MJA'
    description = u'The Medical Journal of Australia'
    category = u'medical, science, health, Australia'
    __author__ = 'Pat Stapleton'
    oldest_article = 14
    max_articles_per_feed = 1000
    auto_cleanup = True
    needs_subscription = True
    language = 'en_AU'
    remove_empty_feeds = True
    publication_type = 'journal'
    publisher = u'Australian Medical Association'
    #masthead_url = 'http://www.mja.com.au/MJAnav.gif'
    issue = None

    #LOGIN
    def get_browser(self):
        """Return a browser that has submitted the institutional login form.

        The first form on the login page is filled with ``self.username``
        and ``self.password`` and submitted before the browser is returned.
        """
        # get_browser is an instance method: the base-class call must be
        # given ``self`` explicitly, otherwise it fails with a TypeError.
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://ipacez.nd.edu.au/login?url=http://www.mja.com.au')
        br.select_form(nr=0)
        br['user'] = self.username
        br['pass'] = self.password
        br.submit()
        return br

    #Get cover image
    def get_cover_url(self):
        """Return the URL of the cover image found on the MJA home page.

        Every ``<img>`` whose ``src`` contains ``/cover`` is considered; the
        last match wins. An empty string is returned when none matches.
        """
        retUrl = ''
        rawc = self.index_to_soup('http://www.mja.com.au', True)
        soup = BeautifulSoup(rawc)
        for itimg in soup.findAll('img', src=True):
            if '/cover' in itimg['src']:
                retUrl = 'http://www.mja.com.au/' + itimg['src']
        return retUrl

    def parse_index(self):
        """Build the list of feeds from the current issue's contents page.

        Returns a list of ``(section title, articles)`` tuples. Each article
        is a dict with the keys ``title``, ``url``, ``date``, ``description``
        and ``content``; only ``title`` and ``url`` are filled in.
        """
        #Get link to contents of current issue
        rawc = self.index_to_soup('http://www.mja.com.au', True)
        soup = BeautifulSoup(rawc)
        linkObject = soup.find('div', 'homepage-current-issue-inner')
        link = 'http://www.mja.com.au' + linkObject.contents[0]['href']

        rawc = self.index_to_soup(link, True)
        soup = BeautifulSoup(rawc)

        feeds = []
        articles = []
        sectionCount = 0
        sectionTitles = soup.findAll('h3')
        # Fall back to the recipe title instead of raising IndexError when
        # the contents page has no <h3> headings at all.
        if sectionTitles:
            sectionTitle = self.tag_to_string(sectionTitles[0])
        else:
            sectionTitle = self.title

        for section in soup.findAll(attrs={'class': lambda x: x and 'views-row' in x}):
            classField = section.get('class')
            # Compare whole class tokens: a substring search for
            # 'views-row-1' would also match 'views-row-10' .. 'views-row-19'
            # and wrongly start a new section in the middle of a long one.
            # The attribute may be a space-separated string or a list of
            # tokens depending on the BeautifulSoup version in use.
            if hasattr(classField, 'split'):
                classTokens = classField.split()
            else:
                classTokens = list(classField)

            if 'views-row-1' in classTokens:  # this is first of new section
                if articles:
                    feeds.append((sectionTitle, articles))
                    articles = []
                if sectionCount < len(sectionTitles):
                    sectionTitle = self.tag_to_string(sectionTitles[sectionCount])
                    sectionCount = sectionCount + 1
                else:
                    # More row groups than headings: stop collecting.
                    break

            # A row without a usable link cannot become an article.
            if section.a is None or not section.a.get('href'):
                continue

            if section.p:
                artTitle = self.tag_to_string(section.p)
            else:
                artTitle = self.tag_to_string(section.a)
            url = 'http://www.mja.com.au.ipacez.nd.edu.au' + section.a.get('href')
            articles.append({
                'title': artTitle,
                'url': url,
                'date': '',
                'description': '',
                'content': '',
            })

        # Flush the articles of the final section.
        if articles:
            feeds.append((sectionTitle, articles))
        return feeds
Enjoy!
-Pat