View Single Post
Old 07-16-2011, 11:23 PM   #1
adonishi
Junior Member
adonishi began at the beginning.
 
Posts: 7
Karma: 10
Join Date: May 2011
Device: kindle, SONY T1
Nikkei News (paper section)

Hi, I made a recipe for the paper section of the Japanese Nikkei News. Some Nikkei News recipes are already included in Calibre, but this recipe fetches the paper section (that is, the same contents as the printed newspaper). I hope this will be useful and can be included in Calibre.

Thank you
Ado Nishimura


Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)

class NikkeiNet_paper_subscription(BasicNewsRecipe):
    """Calibre recipe for the paper section of nikkei.com.

    The recipe logs in with the user's credentials, reads the section
    index at http://www.nikkei.com/paper/ and downloads the print version
    of every article listed there.  It logs out again in ``cleanup``.
    """

    # Title reads "Nihon Keizai Shimbun (morning / evening edition)".
    title           = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__      = 'Ado Nishimura'
    # Description (Japanese): the paper as delivered by the Nikkei electronic
    # edition; morning/evening edition depends on the fetch time;
    # subscription required.
    description     = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article  = 1
    max_articles_per_feed = 30
    language        = 'ja'
    no_stylesheets  = True
    cover_url       = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url    = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'

    # Keep only the article body, which is marked with class "cmn-indent".
    remove_tags_before = {'class': "cmn-indent"}
    remove_tags = [
        {'class': "cmn-hide"},
        {'name': 'form'},
    ]
    remove_tags_after = {'class': "cmn-indent"}

    def get_browser(self):
        """Return a browser that is logged in to nikkei.com.

        The login is a chain of form submissions: the login link on the
        top page leads to an auto-posting form, then to the real
        e-mail/password form, then to one more form that has to be
        submitted to complete the redirect.  The login is skipped when no
        credentials are configured.
        """
        # get_browser is an instance method, so the base implementation
        # must be given ``self`` explicitly when called through the class.
        br = BasicNewsRecipe.get_browser(self)

        if self.username is not None and self.password is not None:
            print("----------------------------open top page----------------------------------------")
            br.open('http://www.nikkei.com/')
            print("----------------------------open first login form--------------------------------")
            # next() works on both Python 2.6+ and Python 3 iterators.
            link = next(br.links(url_regex="www.nikkei.com/etc/accounts/login"))
            br.follow_link(link)
            print("----------------------------JS redirect(send autoPostForm)-----------------------")
            # The site relies on JavaScript to post this form; do it by hand.
            br.select_form(name='autoPostForm')
            br.submit()
            print("----------------------------got login form---------------------------------------")
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email']    = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            print("----------------------------JS redirect------------------------------------------")
            # Second JavaScript-driven redirect: submit the first form on the page.
            br.select_form(nr=0)
            br.submit()
        return br

    def cleanup(self):
        """Log out of the Nikkei account once the download has finished."""
        print("----------------------------logout-----------------------------------------------")
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        """Build the feed list from the paper index page.

        Returns a list of ``[section_title, articles]`` pairs, where each
        article is a dict with ``title``, ``url``, ``date``,
        ``description`` and ``content`` keys.  Index entries that lack the
        expected markup are skipped instead of aborting the download.
        """
        print("----------------------------get index of paper-----------------------------------")
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'):
            heading = sect.find('h3', 'cmnc-title')
            if heading is None:
                # No section heading: nothing to name the feed with.
                continue
            sect_title = heading.string
            sect_result = []
            for elem in sect.findAll(attrs={'class': ['cmn-article_title']}):
                outer = elem.span
                anchor = outer.a if outer is not None else None
                if anchor is None:
                    continue
                href = anchor.get('href')
                span = anchor.span
                # The title is the second child of the inner span; entries
                # without it are not usable articles.
                if href is None or span is None or len(span.contents) <= 1:
                    continue
                url = 'http://www.nikkei.com' + href
                url = re.sub("/article/", "/print-article/", url)  # print version.
                title = span.contents[1].string
                sect_result.append(dict(title=title, url=url, date='',
                                        description='', content=''))
            result.append([sect_title, sect_result])
        return result
adonishi is offline   Reply With Quote