It looks like the links were broken and the structure of the markup had changed, so the recipe needed to be redone.
Update for Global Times:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
def check_words(words):
    """Build a matcher that looks for words shared with ``words``.

    ``words`` is a whitespace-separated string of words. The returned
    callable takes a single value ``x`` and returns:

    * ``x`` itself when ``x`` is falsy (e.g. ``None`` or ``''``);
    * otherwise the frozenset of words that occur both in ``words`` and
      in ``x`` (split on whitespace), which is empty -- and therefore
      falsy -- when nothing is shared.
    """
    # Split once here rather than on every call of the returned matcher.
    wanted = frozenset(words.split())
    return lambda x: x and wanted.intersection(x.split())
class GlobalTimes(BasicNewsRecipe):
    """Recipe that collects Global Times articles from category index pages.

    ``parse_index`` fetches each category page, gathers the links that
    look like article URLs, and returns one feed per category that had at
    least one article.
    """

    title = u'Global Times'
    __author__ = 'Jose Ortiz'  # lui1 at mobileread.com
    language = 'en_CN'
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    # Keep only elements whose class attribute shares a word with this list.
    keep_only_tags = [
        {'class': check_words('article-title article-source row-content')},
    ]
    extra_css = '''
.article-title {
font-family:Arial,Helvetica,sans-serif;
font-weight:bold;font-size:large;
}
.article-source, .row-content {
font-family:Arial,Helvetica,sans-serif;
font-size:small;
}
'''

    def parse_index(self):
        """Return the feeds as a list of ``(category name, articles)`` tuples.

        Each article is a dict with the keys ``title``, ``url``,
        ``description`` and ``date``; the last two are always empty
        strings. Categories that yield no articles are left out.
        """
        # A tuple of pairs (rather than a dict) keeps the feed order fixed.
        sections = (
            ('http://www.globaltimes.cn/china/politics/', 'China Politics'),
            ('http://www.globaltimes.cn/china/diplomacy/', 'China Diplomacy'),
            ('http://www.globaltimes.cn/china/military/', 'China Military'),
            ('http://www.globaltimes.cn/business/economy/', 'China Economy'),
            ('http://www.globaltimes.cn/world/asia-pacific/', 'Asia Pacific'),
        )
        # Typical url http://www.globaltimes.cn/content/5555555.shtml
        # The dots in the host name are escaped so they only match a
        # literal '.'; compiled once because it is the same for every page.
        article_link = re.compile(
            r'https?://www\.globaltimes\.cn/content/[0-9]{4,10}[.]shtml')

        feeds = []
        for index_url, section_name in sections:
            articles = []
            soup = self.index_to_soup(index_url)
            for a in soup.findAll('a', attrs={'href': article_link}):
                url = a['href'].strip()
                title = self.tag_to_string(a).strip()
                if not title:
                    # Links without any text cannot be given a title; skip.
                    continue
                self.log("found '%s'" % title)
                articles.append({
                    'title': title,
                    'url': url,
                    'description': '',
                    'date': '',
                })
                self.log("Adding URL %s\n" % url)
            if articles:
                feeds.append((section_name, articles))
        return feeds