07-16-2011, 11:23 PM | #1 |
Junior Member
Posts: 7
Karma: 10
Join Date: May 2011
Device: kindle, SONY T1
|
Nikkei News (paper section)
Hi, I made a Japanese Nikkei News paper-section recipe. There are already some Nikkei News recipes included in Calibre, but this recipe fetches the paper sections (which means the same contents as the printed newspaper). I hope this will be useful and can perhaps be included in Calibre.
Thank you Ado Nishimura Code:
from calibre.web.feeds.recipes import BasicNewsRecipe import re #import pprint, sys #pp = pprint.PrettyPrinter(indent=4) class NikkeiNet_paper_subscription(BasicNewsRecipe): title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09' __author__ = 'Ado Nishimura' description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD' needs_subscription = True oldest_article = 1 max_articles_per_feed = 30 language = 'ja' no_stylesheets = True cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' remove_tags_before = {'class':"cmn-indent"} remove_tags = [ # {'class':"cmn-article_move"}, # {'class':"cmn-pr_list"}, # {'class':"cmnc-zoom"}, {'class':"cmn-hide"}, {'name':'form'}, ] remove_tags_after = {'class':"cmn-indent"} def get_browser(self): br = BasicNewsRecipe.get_browser() #pp.pprint(self.parse_index()) #exit(1) #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) if self.username is not None and self.password is not None: print "----------------------------open top page----------------------------------------" br.open('http://www.nikkei.com/') print "----------------------------open first login form--------------------------------" link = br.links(url_regex="www.nikkei.com/etc/accounts/login").next() br.follow_link(link) #response = br.response() #print response.get_data() print "----------------------------JS redirect(send autoPostForm)-----------------------" br.select_form(name='autoPostForm') br.submit() #response = br.response() print "----------------------------got login form---------------------------------------" br.select_form(name='LA0210Form01') br['LA0210Form01:LA0210Email'] = self.username 
br['LA0210Form01:LA0210Password'] = self.password br.submit() #response = br.response() print "----------------------------JS redirect------------------------------------------" br.select_form(nr=0) br.submit() #br.set_debug_http(False) #br.set_debug_redirects(False) #br.set_debug_responses(False) return br def cleanup(self): print "----------------------------logout-----------------------------------------------" self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout') def parse_index(self): print "----------------------------get index of paper-----------------------------------" result = [] soup = self.index_to_soup('http://www.nikkei.com/paper/') #soup = self.index_to_soup(self.test_data()) for sect in soup.findAll('div', 'cmn-section kn-special JSID_baseSection'): sect_title = sect.find('h3', 'cmnc-title').string sect_result = [] for elem in sect.findAll(attrs={'class':['cmn-article_title']}): url = 'http://www.nikkei.com' + elem.span.a['href'] url = re.sub("/article/", "/print-article/", url) # print version. span = elem.span.a.span if ((span is not None) and (len(span.contents) > 1)): title = span.contents[1].string sect_result.append(dict(title=title, url=url, date='', description='', content='')) result.append([sect_title, sect_result]) #pp.pprint(result) return result |
11-18-2014, 03:51 AM | #2 |
Junior Member
Posts: 1
Karma: 10
Join Date: Nov 2014
Device: kindle paperwhite
|
Update Nikkei recipe
I wrote an update for the Nikkei recipe.
Adding summary text. Code:
from calibre.web.feeds.recipes import BasicNewsRecipe import re #import pprint, sys #pp = pprint.PrettyPrinter(indent=4) class NikkeiNet_paper_subscription(BasicNewsRecipe): title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09' __author__ = 'Ado Nishimura' description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD' needs_subscription = True oldest_article = 1 max_articles_per_feed = 30 language = 'ja' no_stylesheets = True #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' cover_margins = (10, 188, '#ffffff') remove_tags_before = {'class':"cmn-indent"} remove_tags = [ # {'class':"cmn-article_move"}, # {'class':"cmn-pr_list"}, # {'class':"cmnc-zoom"}, {'class':"cmn-hide"}, {'name':'form'}, {'class':'cmn-print_headline cmn-clearfix'}, {'id':'ABOUT_NIKKEI'}, ] remove_tags_after = {'class':"cmn-indent"} def get_browser(self): br = BasicNewsRecipe.get_browser(self) #pp.pprint(self.parse_index()) #exit(1) #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) if self.username is not None and self.password is not None: print "-------------------------open top page-------------------------------------" br.open('http://www.nikkei.com/') print "-------------------------open first login form-----------------------------" try: url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url except StopIteration: url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F' br.open(url) 
#br.follow_link(link) #response = br.response() #print response.get_data() print "-------------------------JS redirect(send autoPostForm)--------------------" br.select_form(name='autoPostForm') br.submit() #response = br.response() print "-------------------------got login form------------------------------------" br.select_form(name='LA0210Form01') br['LA0210Form01:LA0210Email'] = self.username br['LA0210Form01:LA0210Password'] = self.password br.submit() #response = br.response() print "-------------------------JS redirect---------------------------------------" br.select_form(nr=0) br.submit() #br.set_debug_http(False) #br.set_debug_redirects(False) #br.set_debug_responses(False) return br def cleanup(self): print "-------------------------logout--------------------------------------------" self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout') def parse_index(self): print "-------------------------get index of paper--------------------------------" result = [] soup = self.index_to_soup('http://www.nikkei.com/paper/') #soup = self.index_to_soup(self.test_data()) sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection') if len(sections) == 0: sections = soup.findAll('div', 'cmn-section kn-special') for sect in sections: sect_title = sect.find('h3', 'cmnc-title').string sect_result = [] for elem in sect.findAll(attrs={'class':['cmn-article_title']}): if elem.span.a == None or elem.span.a['href'].startswith('javascript') : continue url = 'http://www.nikkei.com' + elem.span.a['href'] url = re.sub("/article/", "/print-article/", url) # print version. 
span = elem.span.a.span if ((span is not None) and (len(span.contents) > 1)): title = span.contents[1].string sect_result.append(dict(title=title, url=url, date='', description='', content='')) result.append([sect_title, sect_result]) return result def populate_article_metadata(self, article, soup, first): elm = soup.find('div', {"class":"cmn-article_text JSID_key_fonttxt"}) elm_text = ''.join([ s.string for s in elm ]) article.summary = elm_text article.text_summary = elm_text |
Advert | |
|
12-22-2014, 08:31 AM | #3 |
Junior Member
Posts: 1
Karma: 10
Join Date: Dec 2014
Location: Japan
Device: Kindle Voyage, Kindoe Paperwhite 2012
|
Fixed some problems.
Thanks for the great update, but the populate_article_metadata part seems to cause a problem that makes it skip some articles entirely.
At the line above, s.string raises an error when the HTML source inside a <p> tag contains an <a> tag. My solution for this problem is as follows: Code:
def populate_article_metadata(self, article, soup, first): try: elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"}) elm_list = [self.tag_to_string(elm).strip() for elm in elms] while elm_list.count('') > 0: elm_list.remove('') elm_text = '◆'.join(elm_list) elm_text = unicodedata.normalize('NFKC', elm_text) article.summary = article.text_summary = elm_text except: self.log("Error: Failed to get article summary.") return For readability, I made an additional change at the unicodedata.normalize part. It converts full-width alphanumeric to half-width one for the article summary mode. It needs "import unicodedata" line at the beginning of the code. Entire code is as follows: Code:
from calibre.web.feeds.recipes import BasicNewsRecipe import re import unicodedata #import pprint, sys #pp = pprint.PrettyPrinter(indent=4) class NikkeiNet_paper_subscription(BasicNewsRecipe): title = u'\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\uFF08\u671D\u520A\u30FB\u5915\u520A\uFF09' __author__ = 'Ado Nishimura' description = u'\u65E5\u7D4C\u96FB\u5B50\u7248\u306B\u3088\u308B\u65E5\u672C\u7D4C\u6E08\u65B0\u805E\u3002\u671D\u520A\u30FB\u5915\u520A\u306F\u53D6\u5F97\u6642\u9593\u306B\u3088\u308A\u5207\u308A\u66FF\u308F\u308A\u307E\u3059\u3002\u8981\u8CFC\u8AAD' needs_subscription = True oldest_article = 1 max_articles_per_feed = 30 language = 'ja' no_stylesheets = True #cover_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' cover_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' #masthead_url = 'http://parts.nikkei.com/parts/ds/images/common/logo_r1.svg' masthead_url = 'http://cdn.nikkei.co.jp/parts/ds/images/common/st_nikkei_r1_20101003_1.gif' cover_margins = (10, 188, '#ffffff') remove_tags_before = {'class':"cmn-indent"} remove_tags = [ # {'class':"cmn-article_move"}, # {'class':"cmn-pr_list"}, # {'class':"cmnc-zoom"}, {'class':"cmn-hide"}, {'name':'form'}, {'class':'cmn-print_headline cmn-clearfix'}, {'id':'ABOUT_NIKKEI'}, ] remove_tags_after = {'class':"cmn-indent"} def get_browser(self): br = BasicNewsRecipe.get_browser(self) #pp.pprint(self.parse_index()) #exit(1) #br.set_debug_http(True) #br.set_debug_redirects(True) #br.set_debug_responses(True) if self.username is not None and self.password is not None: print "-------------------------open top page-------------------------------------" br.open('http://www.nikkei.com/') print "-------------------------open first login form-----------------------------" try: url = br.links(url_regex="www.nikkei.com/etc/accounts/login").next().url except StopIteration: url = 'http://www.nikkei.com/etc/accounts/login?dps=3&pageflag=top&url=http%3A%2F%2Fwww.nikkei.com%2F' 
br.open(url) #br.follow_link(link) #response = br.response() #print response.get_data() print "-------------------------JS redirect(send autoPostForm)--------------------" br.select_form(name='autoPostForm') br.submit() #response = br.response() print "-------------------------got login form------------------------------------" br.select_form(name='LA0210Form01') br['LA0210Form01:LA0210Email'] = self.username br['LA0210Form01:LA0210Password'] = self.password br.submit() #response = br.response() print "-------------------------JS redirect---------------------------------------" br.select_form(nr=0) br.submit() #br.set_debug_http(False) #br.set_debug_redirects(False) #br.set_debug_responses(False) return br def cleanup(self): print "-------------------------logout--------------------------------------------" self.browser.open('https://regist.nikkei.com/ds/etc/accounts/logout') def parse_index(self): print "-------------------------get index of paper--------------------------------" result = [] soup = self.index_to_soup('http://www.nikkei.com/paper/') #soup = self.index_to_soup(self.test_data()) sections = soup.findAll('div', 'cmn-section kn-special JSID_baseSection') if len(sections) == 0: sections = soup.findAll('div', 'cmn-section kn-special') for sect in sections: sect_title = sect.find('h3', 'cmnc-title').string sect_result = [] for elem in sect.findAll(attrs={'class':['cmn-article_title']}): if elem.span.a == None or elem.span.a['href'].startswith('javascript') : continue url = 'http://www.nikkei.com' + elem.span.a['href'] url = re.sub("/article/", "/print-article/", url) # print version. 
span = elem.span.a.span if ((span is not None) and (len(span.contents) > 1)): title = span.contents[1].string sect_result.append(dict(title=title, url=url, date='', description='', content='')) result.append([sect_title, sect_result]) return result def populate_article_metadata(self, article, soup, first): try: elms = soup.findAll('div', {"class":"cmn-article_text JSID_key_fonttxt"}) elm_list = [self.tag_to_string(elm).strip() for elm in elms] while elm_list.count('') > 0: elm_list.remove('') elm_text = '◆'.join(elm_list) elm_text = unicodedata.normalize('NFKC', elm_text) article.summary = article.text_summary = elm_text except: self.log("Error: Failed to get article summary.") return http://i.imgur.com/id45HwQ.png http://i.imgur.com/RcPkSVu.png Last edited by szk2005; 12-23-2014 at 02:14 AM. |
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Suppress next | prev | section | main menu for news? | Barty | Calibre | 2 | 02-13-2011 08:47 PM |
Nikkei/Problematic site that need form-post before processing | miurahr | Recipes | 6 | 11-21-2010 01:27 PM |
Split the "News and Commentary" section | Phogg | Feedback | 15 | 11-23-2009 05:06 PM |
MR News: Team members / Writers' section | Alexander Turcic | Announcements | 21 | 02-27-2009 10:55 AM |