View Single Post
Old 10-02-2020, 11:59 AM   #2
lui1
Enthusiast
lui1 began at the beginning.
 
Posts: 36
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
iteritems -> items

Oops... dict has no "iteritems" method in Python 3, so these calls need to be changed to "items". Since the latest versions of calibre will be using Python 3, this recipe needs to be amended.

Update for Journal of Accountancy

Code:
#!/usr/bin/python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Jose Ortiz <jlortiz84 at gmail.com>

from __future__ import absolute_import, division, print_function, unicode_literals

import json
from time import sleep
from mechanize import Request
from contextlib import closing
from calibre.web.feeds.news import BasicNewsRecipe


def absolutize(url):
    """Turn a site-relative path into a full journalofaccountancy.com URL.

    A value beginning with '/' gets the site origin prepended and anything
    from the first '#' onwards dropped. Any other value is returned exactly
    as given (its fragment, if any, is kept).
    """
    if not url.startswith('/'):
        return url
    without_fragment, _, _ = (
        'https://www.journalofaccountancy.com' + url
    ).partition('#')
    return without_fragment


def classes(classes):
    """Build an ``attrs`` matcher for elements having any of the given classes.

    ``classes`` is a string of class names separated by single spaces. The
    result is a dict of the form ``{'attrs': {'class': predicate}}``. The
    predicate receives an element's class attribute value: a falsy value is
    handed back unchanged, otherwise the set of names shared with the wanted
    classes is returned (empty, and therefore falsy, when none match).
    """
    wanted = frozenset(classes.split(' '))

    def shares_class(value):
        if not value:
            return value
        present = frozenset(value.split())
        return present.intersection(wanted)

    return {'attrs': {'class': shares_class}}


class JournalOfAccountancy(BasicNewsRecipe):
    """Calibre recipe for the Journal of Accountancy.

    The article index is assembled from the JSON services that back the
    site's issue pages rather than from scraped HTML; see ``parse_index``.
    """

    __author__ = 'Jose Ortiz'
    language = 'en_US'
    title = u'Journal of Accountancy'
    description = (
        'A monthly journal of tax, financial reporting, auditing and other'
        ' topics of accountancy from American Institute of Certified Public'
        ' Accountants (AICPA).'
    )
    publication_type = 'magazine'
    masthead_url = 'http://developmentprofits.com/images/JournalOfAccountancy.jpg'
    no_stylesheets = True
    remove_javascript = True

    conversion_options = {
        'comments': description,
        'tags': 'News, Accountancy',
        'publisher': 'American Institute of Certified Public Accountants (AICPA)'
    }

    # Match elements that carry the article-page content class.
    keep_only_tags = [classes('contentSectionArticlePage')]

    def parse_index(self):
        """Build the list of sections and articles for one issue.

        The issue used is the first entry returned by the ``init`` service
        (NOTE(review): presumably the most recent issue -- confirm).

        Side effects: sets ``self.cover_url`` from the issue's cover image
        and ``self.timefmt`` from the issue's name.

        Returns:
            A list of ``(section_title, articles)`` tuples where ``articles``
            is a list of dicts with the keys ``'title'``, ``'url'`` and
            ``'description'``. The first two sections are always
            ``'SPOTLIGHT'`` and ``'FEATURES'``; the rest come from the
            ``articletypelist`` service, one section per key.
        """
        # ISSUES ######################
        # The page is opened and closed without reading the body.
        # NOTE(review): presumably this primes the browser session before
        # the XHR-style requests below -- confirm.
        issues_url = 'https://www.journalofaccountancy.com/issues.html'
        with closing(self.browser.open(issues_url)):
            pass
        ###############################

        # Headers shared by every JSON service request.
        common_headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache'
        }

        URL_TEMPLATE = 'https://www.journalofaccountancy.com/content/jofa-home/issues/jcr:content/main-content-section/issuelibrary.%s.service'

        # INIT #################################################
        init_url = URL_TEMPLATE % 'init'
        init_headers = {'Referer': issues_url}
        init_headers.update(common_headers)

        self.log('\nINIT URL at ', init_url)
        with closing(self.browser.open(Request(init_url, None, init_headers))) as r:
            # Only the first entry of the returned list is used.
            issue_path = json.loads(r.read())[0]['page']['path']
        ########################################################

        # FILTER ###############################
        # The selector is the second-to-last segment of the issue path.
        filter_url = URL_TEMPLATE % ('filter.' + issue_path.split('/')[-2])
        filter_headers = {'issues': issue_path}
        filter_headers.update(init_headers)

        self.log('\nFILTER URL at ', filter_url)
        with closing(
            self.browser.open(Request(filter_url, None, filter_headers))
        ) as r:
            issue_data = json.loads(r.read())[0]
        ########################################

        self.cover_url = absolutize(issue_data['issueCover']['src'])
        self.log('cover_url at ', self.cover_url)
        self.timefmt = ' ' + issue_data['issueName']

        # INDEX ####################################
        index_url = absolutize(issue_path + '.html')
        self.log('INDEX URL at ', index_url)
        self.log('3 second pause')
        sleep(3)  # mimicking human user behavior
        # As with the issues page, the response body is discarded.
        with closing(self.browser.open(index_url)):
            pass
        ############################################

        # Per-issue services are requested with the issue page as referer.
        service_headers = {'Referer': index_url}
        service_headers.update(common_headers)

        def get_data(service):
            """Request one of the issue's JSON services and decode the reply."""
            service_url = (
                'https://www.journalofaccountancy.com' + issue_path +
                '/jcr:content/main-content-section/' + service + '.en.service'
            )
            self.log('\nSERVICE URL at ', service_url)
            req = Request(service_url, None, service_headers)
            with closing(self.browser.open(req)) as r:
                return json.loads(r.read())

        def make_topic(category, articles):
            """Convert service article records into a ``(category, [dict])`` pair."""
            topic = (category, [])
            self.log(topic[0])
            for article in articles:
                title = article['articleTitle']
                url = absolutize(article['page']['path'] + '.html')
                # The abstract is optional; a missing one yields None.
                desc = article.get('articleAbstract')
                self.log('\t', title, ' at ', url)
                topic[1].append({'title': title, 'url': url, 'description': desc})
            return topic

        ans = [
            make_topic('SPOTLIGHT', get_data('issuelanding/articles1')),
            make_topic('FEATURES', get_data('issuelanding/articles2'))
        ]

        # dict.items() works on both Python 2 and 3 (iteritems() does not).
        for category, articles in get_data('articletypelist').items():
            ans.append(make_topic(category, articles))

        return ans
lui1 is offline   Reply With Quote