07-16-2011, 11:23 PM   #1
adonishi (Junior Member)

Nikkei News (paper section)

Hi, I made a recipe for the paper sections of the Japanese Nikkei News. Calibre already includes some Nikkei News recipes, but this one fetches the paper edition, so its contents match the printed newspaper. I hope this is useful and can be included in Calibre.

Thank you
Ado Nishimura


Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)

class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title           = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__      = 'Ado Nishimura'
    description     = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article  = 1
    max_articles_per_feed = 30
    language        = 'ja'
    no_stylesheets  = True
    cover_url       = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url    = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
#                       {'class':"cmn-article_move"},
#                       {'class':"cmn-pr_list"},
#                       {'class':"cmnc-zoom"},
                       {'class':"cmn-hide"},
                       {'name':'form'},
                  ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)

        #pp.pprint(self.parse_index())
        #exit(1)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            print "----------------------------open top page----------------------------------------"
            br.open('http://www.nikkei.com/')
            print "----------------------------open first login form--------------------------------"
            link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next()
            br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "----------------------------JS redirect(send autoPostForm)-----------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "----------------------------got login form---------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email']    = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "----------------------------JS redirect------------------------------------------"
            br.select_form(nr=0)
            br.submit()

            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "----------------------------logout-----------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "----------------------------get index of paper-----------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'):
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url) # print version.
                span = elem.span.a.span
                if ((span is not None) and (len(span.contents) > 1)):
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        #pp.pprint(result)
        return result
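
If you want to test the recipe before adding it through the GUI, you can run it directly with calibre's ebook-convert tool (the file name below is just an example of whatever you saved the code as):

Code:
# Example invocation; --test fetches only a couple of articles per section.
ebook-convert nikkei_paper.recipe nikkei_paper.epub \
    --test -vv --username YOUR_ID --password YOUR_PASSWORD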
11-18-2014, 03:51 AM   #2
jetbee (Junior Member)

Update Nikkei recipe

I wrote an update for the Nikkei recipe that adds summary text to each article. Calibre calls populate_article_metadata for every article it downloads, so the recipe uses that hook to fill in the summary shown in the article list.

Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)

class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title           = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__      = 'Ado Nishimura'
    description     = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article  = 1
    max_articles_per_feed = 30
    language        = 'ja'
    no_stylesheets  = True
    #cover_url       = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url       = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url    = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url    = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins   = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
#                       {'class':"cmn-article_move"},
#                       {'class':"cmn-pr_list"},
#                       {'class':"cmnc-zoom"},
                       {'class':"cmn-hide"},
                       {'name':'form'},
                       {'class':'cmn-print_headline cmn-clearfix'},
                       {'id':'ABOUT_NIKKEI'},
                  ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)

        #pp.pprint(self.parse_index())
        #exit(1)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try: 
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)     #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email']    = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()

            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url) # print version.
                span = elem.span.a.span
                if ((span is not None) and (len(span.contents) > 1)):
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        elm = soup.find('div', {"class":"cmn-article_text JSID_key_fonttxt"})
        elm_text = ''.join([ s.string for s in elm ])
        article.summary = elm_text
        article.text_summary = elm_text
12-22-2014, 08:31 AM   #3
szk2005 (Junior Member)

Fixed some problems.

Thanks for the great update, but the populate_article_metadata part seems to cause a problem: some articles are skipped completely.
Quote:
Originally Posted by jetbee
Code:
elm_text = ''.join([ s.string for s in elm ])
At the line above, s.string is None whenever the HTML inside a <p> tag contains an <a> tag (BeautifulSoup's .string only holds text for tags with a single string child), so the ''.join() raises an error.
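
A minimal sketch of the failure mode outside the recipe (the HTML snippet is invented, and I am assuming calibre's bundled BeautifulSoup import path):

Code:
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = '<div><p>Plain text.</p><p>Text with a <a href="#">link</a>.</p></div>'
div = BeautifulSoup(html).find('div')

# .string holds a tag's text only when the tag has a single string child;
# the second <p> mixes text and an <a> tag, so its .string is None.
print [p.string for p in div.findAll('p')]
# -> [u'Plain text.', None]
# ''.join() then hits the None entry and raises TypeError, and the whole
# article ends up skipped.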

My solution for this problem is as follows:
Code:
    def populate_article_metadata(self, article, soup, first):
        try:
            elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
            elm_list = [self.tag_to_string(elm).strip() for elm in elms]
            while elm_list.count('') > 0:
                elm_list.remove('')
            elm_text = '◆'.join(elm_list)
            elm_text = unicodedata.normalize('NFKC', elm_text)
            article.summary = article.text_summary = elm_text
        except:
            self.log("Error: Failed to get article summary.")
            return
The self.tag_to_string function solves the problem, and the try-except clause prevents calibre from skipping the article when an error occurs.
For readability, I made an additional change at the unicodedata.normalize part: NFKC normalization converts full-width alphanumerics to half-width ones in the article summary. It needs an "import unicodedata" line at the beginning of the code.
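
A small example of the effect, from a Python 2 session (the sample string here is invented):

Code:
import unicodedata

# NFKC folds full-width (zenkaku) alphanumerics and punctuation into their
# half-width ASCII forms; kana and kanji are left unchanged.
print unicodedata.normalize('NFKC', u'ＧＤＰは２．４％増')
# -> GDPは2.4%増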

Entire code is as follows:
Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re

import unicodedata

#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)

class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title           = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__      = 'Ado Nishimura'
    description     = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article  = 1
    max_articles_per_feed = 30
    language        = 'ja'
    no_stylesheets  = True
    #cover_url       = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url       = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url    = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url    = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins   = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
#                       {'class':"cmn-article_move"},
#                       {'class':"cmn-pr_list"},
#                       {'class':"cmnc-zoom"},
                       {'class':"cmn-hide"},
                       {'name':'form'},
                       {'class':'cmn-print_headline cmn-clearfix'},
                       {'id':'ABOUT_NIKKEI'},
                  ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)

        #pp.pprint(self.parse_index())
        #exit(1)

        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try: 
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)     #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email']    = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()

            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url) # print version.
                span = elem.span.a.span
                if ((span is not None) and (len(span.contents) > 1)):
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        try:
            elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
            elm_list = [self.tag_to_string(elm).strip() for elm in elms]
            while elm_list.count('') > 0:
                elm_list.remove('')
            elm_text = '◆'.join(elm_list)
            elm_text = unicodedata.normalize('NFKC', elm_text)
            article.summary = article.text_summary = elm_text
        except:
            self.log("Error: Failed to get article summary.")
            return
Screenshots:
http://i.imgur.com/id45HwQ.png
http://i.imgur.com/RcPkSVu.png
