Hi,
I'm a newbie at this, so please pardon me if I don't know the right terms.
I'm trying to modify an existing recipe to download news from thestar.com.my. I couldn't use the simple RSS method because the website adds an extra advertisement page with a 'please click here' link between the RSS feed and the article, so all I got were blank advertisement pages. (If you'd like to see what I mean, try this link:
http://thestar.com.my.feedsportal.co...4555/index.rss)
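(As an aside, I did come across the skip_ad_pages() hook in BasicNewsRecipe, which is apparently meant for exactly this kind of interstitial page, but I couldn't get it to work, so I gave up on the RSS route. My rough attempt is below in case it is relevant; the 'please click here' text match is only a guess at how the feedsportal ad page is structured, and the method would sit inside the recipe class.)
Code:
    def skip_ad_pages(self, soup):
        # Called for each downloaded page before keep_only_tags/remove_tags
        # are applied.  Returning None keeps the page as-is; returning the
        # HTML of another page makes calibre use that page instead.
        # The 'please click here' match below is only a guess at the ad
        # page markup and would need adjusting.
        for a in soup.findAll('a', href=True):
            if 'click here' in self.tag_to_string(a).lower():
                return self.index_to_soup(a['href'], raw=True)
        return None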
So I am trying the other method of using BeautifulSoup, by modifying an existing recipe.
Unfortunately, all I seem to get is the message "This article was downloaded by calibre from http:...." for every article in every feed.
The recipe finds the articles, but it seems unable to download the article content.
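By the way, to see the print output from the recipe I have been testing it from the command line, roughly like this (TheStarMalaysia.recipe is just whatever file name the recipe is saved under; --test only fetches a couple of articles and -vv makes the output verbose):
Code:
ebook-convert TheStarMalaysia.recipe output.epub --test -vv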
Here's the code. I'd appreciate it if someone could guide me on why the content is not being downloaded.
Code:
#!/usr/bin/env python

__license__   = 'GPL v3'
__author__    = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__   = '1'
__date__      = '16, October 2010'
__docformat__ = 'English'

from calibre.web.feeds.news import BasicNewsRecipe


class TheStarMalaysia(BasicNewsRecipe):
    title              = 'TheStarMalaysia'
    __author__         = 'Calibre'
    description        = 'The Star Newspaper Malaysia'
    recursions         = 6
    language           = 'en'
    publisher          = 'Calibre'
    category           = 'news'
    use_embedded_content = False
    no_stylesheets     = True
    oldest_article     = 24
    remove_javascript  = True
    remove_empty_feeds = True
    conversion_options = {'linearize_tables': True}

    extra_css = '''
                #content_heading{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                td{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                #content_body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                '''

    # Keep only the article table and drop the button row
    keep_only_tags = [dict(name='table', attrs={'class': ['contentpaneopen']})]
    remove_tags    = [dict(name='table', attrs={'class': ['buttonheading']})]

    #######################################################################################################################
    max_articles_per_feed = 30

    '''
    Make a variable that will hold the url for the main site because our links do not include the index
    '''
    INDEX = 'http://thestar.com.my'

    def parse_index(self):
        feeds = []
        for title, url in [
                           (u"Nation", u"http://thestar.com.my/news/nation/"),
                          ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        soup = self.index_to_soup(url)
        print 'The soup is: ', soup
        for item in soup.findAll('div', attrs={'class': 'news_container'}):
            print 'item is: ', item
            link = item.find('a')
            print 'the link is: ', link
            if link:
                url   = self.INDEX + link['href']
                title = self.tag_to_string(link)
                print 'the title is: ', title
                print 'the url is: ', url
                current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})  # append all this
        return current_articles

    def preprocess_html(self, soup):
        # Strip inline style attributes from the downloaded pages
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup
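In case it helps, this is the kind of standalone check I was going to run to confirm that the article pages really contain a table with class 'contentpaneopen', since keep_only_tags would discard everything if that class never matches. The article URL below is only a placeholder, and the script assumes the old BeautifulSoup 3 module is installed:
Code:
#!/usr/bin/env python
# Standalone sanity check: does an article page contain
# <table class="contentpaneopen">?
import urllib2
from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3

# Placeholder URL: replace with a real link printed by make_links()
url = 'http://thestar.com.my/news/nation/some-article.html'
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)

tables = soup.findAll('table', attrs={'class': 'contentpaneopen'})
print 'found %d matching table(s)' % len(tables)
for t in tables:
    # print only the first bit of text from each match to keep output short
    text = ''.join(t.findAll(text=True))
    print text[:200].encode('utf-8', 'replace')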
regards,
elteebee