#1
Junior Member
Posts: 7
Karma: 10
Join Date: May 2011
Device: kindle, SONY T1
Nikkei News (paper section)
Hi, I made a recipe for the paper section of the Japanese Nikkei News. There are already some Nikkei News recipes included in Calibre, but this one fetches the paper edition (i.e. the same contents as the printed newspaper). I hope this is useful and can be included in Calibre.
Thank you, Ado Nishimura. Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        # get_browser() is an instance method, so pass self explicitly
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "----------------------------open top page----------------------------------------"
            br.open('http://www.nikkei.com/')
            print "----------------------------open first login form--------------------------------"
            link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next()
            br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "----------------------------JS redirect(send autoPostForm)-----------------------"
            # the login page normally auto-submits this form via JavaScript
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "----------------------------got login form---------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "----------------------------JS redirect------------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "----------------------------logout-----------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "----------------------------get index of paper-----------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'):
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        #pp.pprint(result)
        return result
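To test the recipe from the command line before adding it to Calibre's GUI, something like this should work (the recipe file name here is just an example): Code:
ebook-convert nikkei_paper.recipe nikkei.epub --test --username=YOUR_ID --password=YOUR_PASSWORD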
#2
Junior Member
Posts: 1
Karma: 10
Join Date: Nov 2014
Device: kindle paperwhite
Update Nikkei recipe
I wrote an update for the Nikkei recipe, adding summary text. Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
        {'class':'cmn-print_headline cmn-clearfix'},
        {'id':'ABOUT_NIKKEI'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try:
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                # fall back to the known login URL if the link is not on the top page
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)  #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                # skip items without a real link (e.g. javascript: placeholders)
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        elm = soup.find('div', {"class":"cmn-article_text JSID_key_fonttxt"})
        elm_text = ''.join([s.string for s in elm])
        article.summary = elm_text
        article.text_summary = elm_text
#3
Junior Member
Posts: 1
Karma: 10
Join Date: Dec 2014
Location: Japan
Device: Kindle Voyage, Kindle Paperwhite 2012
Fixed some problems.
Thanks for the great update, but the populate_article_metadata part seems to cause a problem where some articles are skipped completely.
The reason is that s.string returns None when the HTML inside a <p> tag contains an <a> tag (or any other nested tag), so the ''.join() call raises an error; a short standalone demonstration follows the snippet below. My solution for this problem is as follows: Code:
def populate_article_metadata(self, article, soup, first):
    try:
        elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
        elm_list = [self.tag_to_string(elm).strip() for elm in elms]
        while elm_list.count('') > 0:
            elm_list.remove('')
        elm_text = u'◆'.join(elm_list)  # u'' prefix so the non-ASCII separator joins unicode cleanly
        elm_text = unicodedata.normalize('NFKC', elm_text)
        article.summary = article.text_summary = elm_text
    except:
        self.log("Error: Failed to get article summary.")
        return
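For reference, here is a minimal standalone illustration of the failure mode (the HTML is a made-up example, using the BeautifulSoup bundled with calibre): Code:
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = u'<div><p>plain paragraph</p><p>paragraph with <a href="#">a link</a></p></div>'
div = BeautifulSoup(html).find('div')
print div.findAll('p')[0].string  # u'plain paragraph'
print div.findAll('p')[1].string  # None: .string is None once a tag has nested children
# ''.join([s.string for s in div]) therefore hits a None and raises TypeError,
# while self.tag_to_string(elm) flattens nested tags into plain text safely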
For readability, I made an additional change using unicodedata.normalize: it converts full-width alphanumerics to half-width ones in the article summary. This requires an "import unicodedata" line at the beginning of the code (a quick illustration of the normalization follows the full recipe below). The entire code is as follows: Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
import unicodedata
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
        {'class':'cmn-print_headline cmn-clearfix'},
        {'id':'ABOUT_NIKKEI'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try:
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                # fall back to the known login URL if the link is not on the top page
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)  #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                # skip items without a real link (e.g. javascript: placeholders)
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        try:
            elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
            elm_list = [self.tag_to_string(elm).strip() for elm in elms]
            while elm_list.count('') > 0:
                elm_list.remove('')
            elm_text = u'◆'.join(elm_list)  # u'' prefix so the non-ASCII separator joins unicode cleanly
            elm_text = unicodedata.normalize('NFKC', elm_text)
            article.summary = article.text_summary = elm_text
        except:
            self.log("Error: Failed to get article summary.")
            return
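As a quick illustration of what the NFKC normalization does to full-width characters (a standalone example, not part of the recipe): Code:
import unicodedata

# full-width 'NIKKEI 225' (full-width letters and digits plus an ideographic space)
s = u'\uFF2E\uFF29\uFF2B\uFF2B\uFF25\uFF29\u3000\uFF12\uFF12\uFF15'
print unicodedata.normalize('NFKC', s)  # -> u'NIKKEI 225' in plain half-width ASCII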
Screenshots: http://i.imgur.com/id45HwQ.png http://i.imgur.com/RcPkSVu.png
Last edited by szk2005; 12-23-2014 at 03:14 AM.