MobileRead Forums - View Single Post - Problem trying to download news from thestar.com.my

NotTaken · 11-26-2011, 09:26 AM

Regarding the use of RSS feeds, you could always override the `skip_ad_pages` method, e.g.:

Code:

#!/usr/bin/env  python
# Module-level metadata: licence, authorship, version and date of this recipe file.
__license__   = 'GPL v3'
__author__    = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__   = '1'
__date__      = '16, October 2010'
__docformat__ = 'English'



from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString

class TheStarMalaysia(BasicNewsRecipe):
    """Calibre news recipe for The Star (Malaysia), driven by an RSS feed.

    Articles are taken from the feed listed in ``feeds``.  Each downloaded
    page is first offered to ``skip_ad_pages`` so that an interstitial page
    containing a "click here to continue to article" link can be replaced
    by the page that link points to, and inline ``style`` attributes are
    stripped in ``preprocess_html``.
    """

    # --- Identification -------------------------------------------------
    title = 'TheStarMalaysia'
    __author__ = 'Calibre'
    description = 'The Star Newspaper Malaysia'
    publisher = 'Calibre'
    category = 'news'
    language = 'en'

    # --- Download / clean-up behaviour ----------------------------------
    recursions = 0
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    # NOTE(review): calibre documents oldest_article as a number of days;
    # 24 may have been intended as hours -- confirm before changing.
    oldest_article = 24
    max_articles_per_feed = 30
    conversion_options = {'linearize_tables': True}

    # --- Content selection ----------------------------------------------
    # Keep only the story container, and drop everything that follows the
    # story content div.
    keep_only_tags = [dict(name='div', attrs={'id': ['story_main']})]
    remove_tags_after = [dict(name='div', attrs={'id': ['story_content']})]

    feeds = [
        (u'News - Nation',
         u'http://thestar.com.my.feedsportal.com/c/33048/f/534555/index.rss'),
    ]

    def skip_ad_pages(self, soup):
        """Replace an interstitial page with the article it links to.

        Scans every ``<a href=...>`` in ``soup`` for a direct text child
        that is exactly ``'click here to continue to article'``.

        :param soup: parsed source of the downloaded page.
        :return: the raw source of the linked page (fetched through
            ``index_to_soup(..., raw=True)``) when such a link is found,
            otherwise ``None`` so that the page is used unchanged.
        """
        for link in soup.findAll(name='a', attrs={'href': True}):
            for child in link:
                # Only plain text children can carry the marker phrase.
                if not isinstance(child, NavigableString):
                    continue
                if str(child) == 'click here to continue to article':
                    return self.index_to_soup(link.get('href'), raw=True)
        # Not an interstitial page: the documented contract is to return
        # None (not the soup) so the original page is processed as-is.
        return None

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute and return the soup."""
        for styled in soup.findAll(attrs={'style': True}):
            del styled['style']
        return soup