Regarding the use of RSS feeds, you could always override the skip_ad_pages method, e.g.:
Code:
#!/usr/bin/env python
# Module-level metadata for this recipe file: license, author, copyright,
# version and date. These are plain string constants; nothing in the visible
# code reads them back.
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1'
__date__ = '16, October 2010'
# NOTE(review): __docformat__ conventionally names a docstring markup format
# (see PEP 258, e.g. 'restructuredtext en'); 'English' is kept exactly as
# written -- confirm whether this was intended.
__docformat__ = 'English'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString
class TheStarMalaysia(BasicNewsRecipe):
    """Calibre news recipe for The Star (Malaysia), driven by its RSS feed.

    Feed links can land on an interstitial advertisement page rather than on
    the article itself. ``skip_ad_pages`` detects that page by the text of
    its "continue" link and fetches the real article in its place.
    """

    # --- Recipe identity -------------------------------------------------
    title = 'TheStarMalaysia'
    __author__ = 'Calibre'
    description = 'The Star Newspaper Malaysia'
    publisher = 'Calibre'
    category = 'news'
    language = 'en'

    # --- Download behaviour ----------------------------------------------
    # NOTE(review): calibre documents oldest_article as a number of days, so
    # 24 means "up to 24 days old" -- confirm this was not meant as hours.
    oldest_article = 24
    max_articles_per_feed = 30
    recursions = 0
    use_embedded_content = False
    remove_empty_feeds = True

    # --- Clean-up / conversion -------------------------------------------
    no_stylesheets = True
    remove_javascript = True
    conversion_options = {'linearize_tables': True}

    # Keep only the main story container and drop everything that follows
    # the story content <div>.
    keep_only_tags = [dict(name='div', attrs={'id': ['story_main']})]
    remove_tags_after = [dict(name='div', attrs={'id': ['story_content']})]

    feeds = [
        (u'News - Nation',
         u'http://thestar.com.my.feedsportal.com/c/33048/f/534555/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        """Return the real article's raw HTML if ``soup`` is an ad page.

        The page is treated as an interstitial when it contains an ``<a>``
        with an ``href`` whose direct text child is exactly
        ``'click here to continue to article'``.

        :param soup: parsed HTML of the page that was just downloaded.
        :return: raw HTML fetched from the link target when the page is an
            interstitial, otherwise ``None`` so that calibre keeps
            processing the page it already has.
        """
        for item in soup.findAll(name='a', attrs={'href': True}):
            for content in item:
                # Compare the text node directly: wrapping it in str() can
                # raise UnicodeEncodeError on Python 2 for non-ASCII text.
                if (isinstance(content, NavigableString)
                        and content == 'click here to continue to article'):
                    return self.index_to_soup(item.get('href'), raw=True)
        # Not an ad page. Returning the soup here would make calibre treat
        # every ordinary page as replacement HTML.
        return None

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute from ``soup``.

        :param soup: parsed HTML of an article page.
        :return: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup