MobileRead Forums - View Single Post - Problem trying to download news from thestar.com.my

NotTaken · 11-26-2011, 09:52 AM

I just tried your code; I think your tag-removal settings were cutting out all of the page content.

Code:

#!/usr/bin/env  python
# Module-level metadata for this recipe file: licence, authorship,
# copyright notice, version and date of the original recipe.
__license__   = 'GPL v3'
__author__    = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__   = '1'
__date__      = '16, October 2010'
__docformat__ = 'English'



from calibre.web.feeds.news import BasicNewsRecipe

class TheStarMalaysia(BasicNewsRecipe):
    """Calibre news recipe for The Star (Malaysia), http://thestar.com.my.

    The recipe does not use RSS feeds.  ``parse_index`` walks a fixed list
    of section index pages and ``make_links`` scrapes each of them for
    article links found inside ``<div class="news_container">`` elements.
    """

    # --- Recipe metadata ---------------------------------------------------
    title       = 'TheStarMalaysia'
    __author__  = 'Calibre'
    description = 'The Star Newspaper Malaysia'
    publisher   = 'Calibre'
    category    = 'news'
    language    = 'en'

    # --- Download / conversion behaviour -----------------------------------
    recursions            = 0
    use_embedded_content  = False
    no_stylesheets        = True
    oldest_article        = 24
    max_articles_per_feed = 30
    remove_javascript     = True
    remove_empty_feeds    = True
    conversion_options    = {'linearize_tables': True}

    # --- Article clean-up ----------------------------------------------------
    # Keep only the main story container and drop whatever follows the
    # story body.
    keep_only_tags    = [dict(name='div', attrs={'id': ['story_main']})]
    remove_tags_after = [dict(name='div', attrs={'id': ['story_content']})]

    extra_css = '''
                    #story_content {font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                    h3 {font-size: x-small; color:#888888;}
                '''    

    # Base URL of the site.  Links on the index pages do not include it, so
    # it is prepended to every site-relative href.
    INDEX = 'http://thestar.com.my'

    def parse_index(self):
        """Build the list of feeds from the hard-coded section pages.

        Returns:
            A list of ``(section_title, articles)`` tuples, where
            ``articles`` is the list returned by ``make_links``.  Sections
            for which no article was found are left out.
        """
        feeds = []
        for title, url in [
            (u"Nation", u"http://thestar.com.my/news/nation/"),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the articles linked from one section index page.

        Args:
            url: Address of the section index page to scrape.

        Returns:
            A list of dicts with the keys ``title``, ``url``,
            ``description`` and ``date`` (the last two are always empty
            strings), one per ``news_container`` div that holds a link.
        """
        current_articles = []
        soup = self.index_to_soup(url)
        for item in soup.findAll('div', attrs={'class': 'news_container'}):
            # Only anchors that actually carry an href are usable; this
            # avoids a KeyError on bare <a name="..."> anchors.
            link = item.find('a', href=True)
            if link is None:
                continue

            href = link['href']
            # Leave absolute links untouched; only site-relative links need
            # the base URL in front of them.
            if href.startswith(('http://', 'https://')):
                article_url = href
            else:
                article_url = self.INDEX + href
            article_title = self.tag_to_string(link)

            # Use the recipe logger rather than print statements so the
            # recipe also loads under Python 3.
            self.log.debug('Found article: %s (%s)' % (article_title, article_url))
            current_articles.append({
                'title': article_title,
                'url': article_url,
                'description': '',
                'date': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Remove every inline ``style`` attribute from the article.

        Args:
            soup: Parsed article document.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup