#1
Junior Member
Posts: 7
Karma: 10
Join Date: May 2011
Device: kindle, SONY T1
Nikkei News (paper section)
Hi, I made a recipe for the paper section of the Japanese Nikkei News. There are already some Nikkei News recipes included in Calibre, but this one fetches the paper edition (i.e. the same contents as the printed newspaper). I hope this is useful and can be included in Calibre.
Thank you, Ado Nishimura. Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        # get_browser() is an instance method, so pass self explicitly
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "----------------------------open top page----------------------------------------"
            br.open('http://www.nikkei.com/')
            print "----------------------------open first login form--------------------------------"
            link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next()
            br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "----------------------------JS redirect(send autoPostForm)-----------------------"
            # the login page normally auto-submits this form via JavaScript
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "----------------------------got login form---------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "----------------------------JS redirect------------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "----------------------------logout-----------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "----------------------------get index of paper-----------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'):
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        #pp.pprint(result)
        return result
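To test the recipe from the command line before adding it to Calibre's GUI, something like this should work (the recipe file name here is just an example): Code:
ebook-convert nikkei_paper.recipe nikkei.epub --test --username=YOUR_ID --password=YOUR_PASSWORD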
#2
Junior Member
Posts: 1
Karma: 10
Join Date: Nov 2014
Device: kindle paperwhite
Update Nikkei recipe
I wrote an update for the Nikkei recipe, adding summary text. Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
        {'class':'cmn-print_headline cmn-clearfix'},
        {'id':'ABOUT_NIKKEI'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try:
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                # fall back to the known login URL if the link is not on the top page
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)  #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                # skip items without a real link (e.g. javascript: placeholders)
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        elm = soup.find('div', {"class":"cmn-article_text JSID_key_fonttxt"})
        elm_text = ''.join([s.string for s in elm])
        article.summary = elm_text
        article.text_summary = elm_text
#3
Junior Member
Posts: 1
Karma: 10
Join Date: Dec 2014
Location: Japan
Device: Kindle Voyage, Kindle Paperwhite 2012
Fixed some problems.
Thanks for the great update, but the populate_article_metadata part seems to cause a problem where some articles are skipped completely.
The reason is that s.string returns None when the HTML inside a <p> tag contains an <a> tag (or any other nested tag), so the ''.join() call raises an error; a short standalone demonstration follows the snippet below. My solution for this problem is as follows: Code:
def populate_article_metadata(self, article, soup, first):
    try:
        elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
        elm_list = [self.tag_to_string(elm).strip() for elm in elms]
        while elm_list.count('') > 0:
            elm_list.remove('')
        elm_text = u'◆'.join(elm_list)  # u'' prefix so the non-ASCII separator joins unicode cleanly
        elm_text = unicodedata.normalize('NFKC', elm_text)
        article.summary = article.text_summary = elm_text
    except:
        self.log("Error: Failed to get article summary.")
        return
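For reference, here is a minimal standalone illustration of the failure mode (the HTML is a made-up example, using the BeautifulSoup bundled with calibre): Code:
from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = u'<div><p>plain paragraph</p><p>paragraph with <a href="#">a link</a></p></div>'
div = BeautifulSoup(html).find('div')
print div.findAll('p')[0].string  # u'plain paragraph'
print div.findAll('p')[1].string  # None: .string is None once a tag has nested children
# ''.join([s.string for s in div]) therefore hits a None and raises TypeError,
# while self.tag_to_string(elm) flattens nested tags into plain text safely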
For readability, I made an additional change using unicodedata.normalize: it converts full-width alphanumerics to half-width ones in the article summary. This requires an "import unicodedata" line at the beginning of the code (a quick illustration of the normalization follows the full recipe below). The entire code is as follows: Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
import unicodedata
#import pprint, sys
#pp = pprint.PrettyPrinter(indent=4)


class NikkeiNet_paper_subscription(BasicNewsRecipe):
    title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09'
    __author__ = 'Ado Nishimura'
    description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD'
    needs_subscription = True
    oldest_article = 1
    max_articles_per_feed = 30
    language = 'ja'
    no_stylesheets = True
    #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg'
    masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif'
    cover_margins = (10, 188, '#ffffff')

    remove_tags_before = {'class':"cmn-indent"}
    remove_tags = [
        #{'class':"cmn-article_move"},
        #{'class':"cmn-pr_list"},
        #{'class':"cmnc-zoom"},
        {'class':"cmn-hide"},
        {'name':'form'},
        {'class':'cmn-print_headline cmn-clearfix'},
        {'id':'ABOUT_NIKKEI'},
    ]
    remove_tags_after = {'class':"cmn-indent"}

    def get_browser(self):
        br = BasicNewsRecipe.get_browser(self)
        #pp.pprint(self.parse_index())
        #exit(1)
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)
        if self.username is not None and self.password is not None:
            print "-------------------------open top page-------------------------------------"
            br.open('http://www.nikkei.com/')
            print "-------------------------open first login form-----------------------------"
            try:
                url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url
            except StopIteration:
                # fall back to the known login URL if the link is not on the top page
                url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F'
            br.open(url)  #br.follow_link(link)
            #response = br.response()
            #print response.get_data()
            print "-------------------------JS redirect(send autoPostForm)--------------------"
            br.select_form(name='autoPostForm')
            br.submit()
            #response = br.response()
            print "-------------------------got login form------------------------------------"
            br.select_form(name='LA0210Form01')
            br['LA0210Form01:LA0210Email'] = self.username
            br['LA0210Form01:LA0210Password'] = self.password
            br.submit()
            #response = br.response()
            print "-------------------------JS redirect---------------------------------------"
            br.select_form(nr=0)
            br.submit()
            #br.set_debug_http(False)
            #br.set_debug_redirects(False)
            #br.set_debug_responses(False)
        return br

    def cleanup(self):
        print "-------------------------logout--------------------------------------------"
        self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout')

    def parse_index(self):
        print "-------------------------get index of paper--------------------------------"
        result = []
        soup = self.index_to_soup('http://www.nikkei.com/paper/')
        #soup = self.index_to_soup(self.test_data())
        sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection')
        if len(sections) == 0:
            sections = soup.findAll('div', 'cmn-section kn-special')
        for sect in sections:
            sect_title = sect.find('h3', 'cmnc-title').string
            sect_result = []
            for elem in sect.findAll(attrs={'class':['cmn-article_title']}):
                # skip items without a real link (e.g. javascript: placeholders)
                if elem.span.a is None or elem.span.a['href'].startswith('javascript'):
                    continue
                url = 'http://www.nikkei.com' + elem.span.a['href']
                url = re.sub("/article/", "/print-article/", url)  # fetch the print version
                span = elem.span.a.span
                if span is not None and len(span.contents) > 1:
                    title = span.contents[1].string
                    sect_result.append(dict(title=title, url=url, date='',
                                            description='', content=''))
            result.append([sect_title, sect_result])
        return result

    def populate_article_metadata(self, article, soup, first):
        try:
            elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"})
            elm_list = [self.tag_to_string(elm).strip() for elm in elms]
            while elm_list.count('') > 0:
                elm_list.remove('')
            elm_text = u'◆'.join(elm_list)  # u'' prefix so the non-ASCII separator joins unicode cleanly
            elm_text = unicodedata.normalize('NFKC', elm_text)
            article.summary = article.text_summary = elm_text
        except:
            self.log("Error: Failed to get article summary.")
            return
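As a quick illustration of what the NFKC normalization does to full-width characters (a standalone example, not part of the recipe): Code:
import unicodedata

# full-width 'NIKKEI 225' (full-width letters and digits plus an ideographic space)
s = u'\uFF2E\uFF29\uFF2B\uFF2B\uFF25\uFF29\u3000\uFF12\uFF12\uFF15'
print unicodedata.normalize('NFKC', s)  # -> u'NIKKEI 225' in plain half-width ASCII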
Screenshots: http://i.imgur.com/id45HwQ.png http://i.imgur.com/RcPkSVu.png
Last edited by szk2005; 12-23-2014 at 03:14 AM.