View Single Post
Old 11-26-2011, 09:52 AM   #3
NotTaken
Connoisseur
NotTaken is fluent in JavaScript as well as Klingon.
 
Posts: 65
Karma: 4640
Join Date: Aug 2011
Device: kindle
I just tried your code; I think your tag-removal settings were cutting out all of the page content.

Code:
#!/usr/bin/env  python
# Module-level metadata for this recipe file (licence, authorship, version).
__license__ = 'GPL v3'
__author__ = 'Tony Stegall'
__copyright__ = '2010, Tony Stegall or Tonythebookworm on mobiread.com'
__version__ = '1'
__date__ = '16, October 2010'
__docformat__ = 'English'



from calibre.web.feeds.news import BasicNewsRecipe

class TheStarMalaysia(BasicNewsRecipe):
    """Calibre news recipe for The Star (Malaysia).

    The feed list is built by scraping section index pages (currently only
    "Nation") rather than from RSS, so ``parse_index`` is overridden.  For
    each article only the ``story_main`` container is kept.
    """

    title = 'TheStarMalaysia'
    __author__ = 'Calibre'
    description = 'The Star Newspaper Malaysia'
    publisher = 'Calibre'
    category = 'news'
    language = 'en'
    recursions = 0
    # NOTE(review): value kept from the original; confirm the intended unit,
    # and whether it has any effect given that parse_index is overridden.
    oldest_article = 24
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    conversion_options = {'linearize_tables': True}

    # Keep only the main story container, and drop everything that follows
    # the story body inside it.
    keep_only_tags = [dict(name='div', attrs={'id': ['story_main']})]
    remove_tags_after = [dict(name='div', attrs={'id': ['story_content']})]

    extra_css = '''
                    #story_content {font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
                    h3 {font-size: x-small; color:#888888;}
                '''

    # Base URL of the site; article links on the index pages are relative,
    # so this is prefixed to them in make_links().
    INDEX = 'http://thestar.com.my'

    def parse_index(self):
        """Build the feed list from the section index pages.

        Returns a list of ``(section title, articles)`` tuples, where
        ``articles`` is the list produced by :meth:`make_links`.  Sections
        that yield no articles are omitted.
        """
        sections = [
            (u"Nation", u"http://thestar.com.my/news/nation/"),
        ]
        feeds = []
        for title, url in sections:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the articles linked from one section index page.

        ``url`` is the address of the section index.  The first ``<a>`` in
        every ``div.news_container`` is treated as an article link.

        Returns a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date`` (the last two are always empty).
        """
        current_articles = []
        soup = self.index_to_soup(url)
        for item in soup.findAll('div', attrs={'class': 'news_container'}):
            link = item.find('a')
            self.log('the link is: ', link)
            if link is None:
                continue
            href = link.get('href')
            if not href:
                # An anchor without a target cannot be fetched; skip it
                # instead of raising KeyError.
                continue
            if href.startswith(('http://', 'https://')):
                # Already absolute: prefixing INDEX would corrupt it.
                article_url = href
            else:
                article_url = self.INDEX + href
            title = self.tag_to_string(link)
            self.log('the title is: ', title)
            self.log('the url is: ', article_url)
            current_articles.append({
                'title': title,
                'url': article_url,
                'description': '',
                'date': '',
            })
        return current_articles

    def preprocess_html(self, soup):
        """Strip inline ``style`` attributes from every tag and return the soup."""
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup
NotTaken is offline   Reply With Quote