10-25-2010, 08:29 PM | #1 |
Connoisseur
Posts: 82
Karma: 10
Join Date: Oct 2010
Device: Kindle
|
Ming Pao (明報) - Hong Kong
Code:
__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
'''
modified from Singtao Toronto calibre recipe by rty
'''

import datetime
import time
from calibre.web.feeds.recipes import BasicNewsRecipe


class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Calibre news recipe that builds the Ming Pao (Hong Kong) daily issue.

    The section index pages live under a per-day directory
    (``http://news.mingpao.com/<YYYYMMDD>/``); every section page is scraped
    for its article links.
    """

    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau, modified from Singtao Toronto template from rty'
    description = 'Hong Kong Chinese Newspaper'
    publisher = 'news.mingpao.com'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    # Keep only the headline and the two article-body containers.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'id': ['newscontent01', 'newscontent02']}),
    ]

    def get_fetchdate(self):
        """Return the current Hong Kong date (UTC+8) as a ``YYYYMMDD`` string."""
        dt_utc = datetime.datetime.utcnow()
        # convert UTC to local hk time
        dt_local = dt_utc + datetime.timedelta(hours=8)
        return dt_local.strftime("%Y%m%d")

    def parse_index(self):
        """Build the feed list for today's issue.

        Returns:
            A list of ``(section title, articles)`` tuples, where ``articles``
            is the list produced by ``parse_section``. Sections that yield no
            articles are left out.
        """
        base_url = 'http://news.mingpao.com/' + self.get_fetchdate() + '/'
        sections = [
            (u'\u8981\u805e Headline', 'gaindex.htm'),
            (u'\u6559\u80b2 Education', 'gfindex.htm'),
            (u'\u6e2f\u805e Local', 'gbindex.htm'),
            (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'mrindex.htm'),
            (u'\u8ad6\u58c7 Forum', 'faindex.htm'),
            (u'\u4e2d\u570b China', 'caindex.htm'),
            (u'\u570b\u969b World', 'taindex.htm'),
            ('Tech News', 'naindex.htm'),
            (u'\u9ad4\u80b2 Sport', 'spindex.htm'),
            (u'\u526f\u520a Supplement', 'jaindex.htm'),
        ]
        feeds = []
        for title, page in sections:
            articles = self.parse_section(base_url + page)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the articles linked from one section index page.

        Args:
            url: Address of the section index page.

        Returns:
            A list of dicts with ``title``, ``url`` and (empty)
            ``description`` keys, one per ``bullet`` element that contains a
            link.
        """
        base_url = 'http://news.mingpao.com/' + self.get_fetchdate() + '/'
        soup = self.index_to_soup(url)
        current_articles = []
        for bullet in soup.findAll(attrs={'class': ['bullet']}):
            link = bullet.find('a', href=True)
            if link is None:
                # A bullet without a link has nothing to download; skipping
                # it avoids an AttributeError on None.
                continue
            current_articles.append({
                'title': self.tag_to_string(link),
                'url': base_url + link['href'],
                'description': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Strip inline ``style`` and ``width`` attributes from every tag."""
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        return soup
10-31-2010, 06:08 PM | #2 |
Connoisseur
Posts: 82
Karma: 10
Join Date: Oct 2010
Device: Kindle
|
Updated Ming Pao - Hong Kong recipe (2010/10/31)
Code:
__license__ = 'GPL v3'
__copyright__ = '2010, Eddie Lau'
'''
modified from Singtao Toronto calibre recipe by rty

Change Log:
2010/10/31: skip repeated articles in section pages
'''

import datetime
import time
from calibre.web.feeds.recipes import BasicNewsRecipe


class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Calibre news recipe that builds the Ming Pao (Hong Kong) daily issue.

    The section index pages live under a per-day directory
    (``http://news.mingpao.com/<YYYYMMDD>/``); every section page is scraped
    for its article links, with repeated links dropped.
    """

    title = 'Ming Pao - Hong Kong'
    oldest_article = 1
    max_articles_per_feed = 100
    __author__ = 'Eddie Lau'
    description = 'Hong Kong Chinese Newspaper'
    publisher = 'news.mingpao.com'
    category = 'Chinese, News, Hong Kong'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    language = 'zh'
    encoding = 'Big5-HKSCS'
    recursions = 0
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.mingpao.com/image/portals_top_logo_news.gif'
    # Keep only the headline and the two article-body containers.
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'id': ['newscontent01', 'newscontent02']}),
    ]

    def get_fetchdate(self):
        """Return the date of the issue to fetch as a ``YYYYMMDD`` string.

        The offset applied to UTC is +2.5 hours rather than Hong Kong's +8:
        that is HKT minus 5.5 hours, so the date only rolls over at about
        05:30 HKT, when all news are available.
        """
        dt_utc = datetime.datetime.utcnow()
        dt_local = dt_utc + datetime.timedelta(hours=2.5)
        return dt_local.strftime("%Y%m%d")

    def parse_index(self):
        """Build the feed list for the issue being fetched.

        Returns:
            A list of ``(section title, articles)`` tuples, where ``articles``
            is the list produced by ``parse_section``. Sections that yield no
            articles are left out.
        """
        base_url = 'http://news.mingpao.com/' + self.get_fetchdate() + '/'
        sections = [
            (u'\u8981\u805e Headline', 'gaindex.htm'),
            (u'\u6559\u80b2 Education', 'gfindex.htm'),
            (u'\u6e2f\u805e Local', 'gbindex.htm'),
            (u'\u793e\u8a55\u2027\u7b46\u9663 Editorial', 'mrindex.htm'),
            (u'\u8ad6\u58c7 Forum', 'faindex.htm'),
            (u'\u4e2d\u570b China', 'caindex.htm'),
            (u'\u570b\u969b World', 'taindex.htm'),
            ('Tech News', 'naindex.htm'),
            (u'\u9ad4\u80b2 Sport', 'spindex.htm'),
            (u'\u526f\u520a Supplement', 'jaindex.htm'),
        ]
        feeds = []
        for title, page in sections:
            articles = self.parse_section(base_url + page)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Collect the distinct articles linked from one section index page.

        Args:
            url: Address of the section index page.

        Returns:
            A list of dicts with ``title``, ``url`` and (empty)
            ``description`` keys, in page order; only the first occurrence of
            each article URL is kept.
        """
        base_url = 'http://news.mingpao.com/' + self.get_fetchdate() + '/'
        soup = self.index_to_soup(url)
        current_articles = []
        included_urls = set()
        for bullet in soup.findAll(attrs={'class': ['bullet']}):
            link = bullet.find('a', href=True)
            if link is None:
                # A bullet without a link has nothing to download; skipping
                # it avoids an AttributeError on None.
                continue
            article_url = base_url + link['href']
            if article_url in included_urls:
                continue
            included_urls.add(article_url)
            current_articles.append({
                'title': self.tag_to_string(link),
                'url': article_url,
                'description': '',
            })
        return current_articles
Advert | |
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Hello from Hong Kong | manou | Introduce Yourself | 11 | 09-12-2010 11:25 AM |
Hi from Hong Kong | dr_garfield | Introduce Yourself | 6 | 07-12-2010 07:46 AM |
Hello from Hong Kong | anniebh | Introduce Yourself | 10 | 08-07-2009 05:25 AM |
hello from hong kong! | carpetfish | Introduce Yourself | 4 | 03-17-2008 06:23 AM |
Hello from Hong Kong | tsuria | Introduce Yourself | 2 | 03-21-2007 05:00 PM |