View Single Post
Old 12-07-2016, 07:24 AM   #10
leo738
Enthusiast
leo738 began at the beginning.
 
Posts: 39
Karma: 10
Join Date: Jul 2011
Device: Kindle 3
Managed to get something going:

Code:
__license__  = 'GPL v3'
__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl"
'''
irishtimes.com
'''
import urlparse, re
import json
from mechanize import Request

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0'

class IrishTimes(BasicNewsRecipe):
    """Calibre recipe that fetches The Irish Times using a subscriber login.

    The recipe signs in through the site's paywall login endpoint and, for
    each article, downloads the print layout of the page, optionally
    prefixed with the lead image taken from the regular article page.
    """

    title = u'The Irish Times'
    __author__ = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
    description = 'Daily news from The Irish Times'
    needs_subscription = True

    language = 'en_IE'

    masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'

    encoding = 'utf-8'
    oldest_article = 1.0
    max_articles_per_feed = 100
    simultaneous_downloads = 5
    remove_empty_feeds = True
    no_stylesheets = True
    # Temporary files created by get_obfuscated_article(); references are
    # kept here so the files stay reachable for the rest of the download.
    temp_files = []
    articles_are_obfuscated = True

    # Value sent as 'deviceid' in the login request. It was copied from a
    # captured sign-in request; what the server does with it is not
    # established here -- TODO confirm whether any fixed value is accepted.
    login_device_id = '53c835787f4d2406131985553c1842d0'

    feeds = [
        ('News', 'https://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
        ('World', 'https://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
        ('Politics', 'https://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
        ('Business', 'https://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
        ('Culture', 'https://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
        # The sport feed is disabled by choice; uncomment the next line to
        # include it.
        # ('Sport', 'https://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
        ('Debate', 'https://www.irishtimes.com/cmlink/debate-1.1319211'),
        ('Life & Style', 'https://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
    ]

    def get_browser(self):
        """Return a browser that has submitted the subscriber's credentials.

        The sign-in page is opened first, then the username and password
        are posted to the paywall login endpoint the same way the sign-in
        page's JavaScript does (see the submit handler on
        https://www.irishtimes.com/signin).

        Raises:
            ValueError: if the login endpoint does not answer with HTTP 200
                or its response body is not valid JSON.
        """
        br = BasicNewsRecipe.get_browser(self, user_agent=USER_AGENT)

        signin_url = 'https://www.irishtimes.com/signin'
        # Load the sign-in page before posting the credentials; the body
        # itself is not needed.
        br.open(signin_url).read()

        # NOTE(review): the Content-Type header announces a form-urlencoded
        # body while the payload is JSON. Kept exactly as posted because the
        # format the server expects is not visible here -- confirm against
        # the sign-in JavaScript.
        payload = json.dumps({
            'username': self.username,
            'password': self.password,
            'deviceid': self.login_device_id,
            'persistent': 'on',
        })
        login_request = Request(
            'https://www.irishtimes.com/auth-rest-api/v1/paywall/login',
            headers={
                'Accept': '*/*',
                'Accept-Language': 'en-US,en;q=0.5',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Referer': signin_url,
                'X-Requested-With': 'XMLHttpRequest',
            },
            data=payload,
        )

        response = br.open(login_request)
        if response.code != 200:
            raise ValueError('Failed to login, check username and password')

        # The response is parsed only to verify it is well-formed JSON. It is
        # deliberately not printed: it is an authentication response and
        # should not end up in the job log.
        try:
            json.loads(response.read())
        except ValueError:
            raise ValueError(
                'Failed to login (unexpected response), check username and password')
        return br

    def get_obfuscated_article(self, url):
        """Save the print layout of the article at *url* to a temporary file.

        The first image found inside the article page's image carousel, if
        any, is inserted at the start of the print content.

        Args:
            url: address of the regular (non-print) article page.

        Returns:
            Path of the temporary HTML file holding the article.
        """
        # Take the lead picture from the original page; everything else
        # comes from the print URL.
        pic = None
        article_page = self.index_to_soup(url)
        carousel = article_page.find('div', {'class': re.compile('image-carousel')})
        if carousel is not None:
            pic = carousel.img
            if pic is not None:
                try:
                    pic['src'] = urlparse.urljoin(url, pic['src'])
                    pic.extract()
                except Exception:
                    # The picture is optional: any problem resolving it just
                    # means the article is saved without one.
                    pic = None

        content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
        if pic is not None:
            # Prefer the first paragraph; fall back to <body> so that a print
            # page without any <p> does not abort the article.
            anchor = content.p if content.p is not None else content.body
            if anchor is not None:
                anchor.insert(0, pic)

        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(content.prettify())
        finally:
            # Close even when the write fails so the handle is not leaked.
            tmp.close()
        return tmp.name
However, the JSON payload contains a 'deviceid' field that I can't find much information about.

Any pointers on what it is?

Thanks,

Leo
leo738 is offline   Reply With Quote