View Single Post
Old 11-26-2011, 09:26 AM   #2
NotTaken
Connoisseur
NotTaken is fluent in JavaScript as well as Klingon.
 
Posts: 65
Karma: 4640
Join Date: Aug 2011
Device: kindle
Regarding the use of RSS feeds, you could always override the skip_ad_pages method, e.g.:

Code:
#!/usr/bin/env  python
# Module-level metadata describing the licence, authorship and revision of
# this recipe file. The values are plain strings assigned at import time.
__license__ = "GPL v3"
__author__ = "Tony Stegall"
__copyright__ = "2010, Tony Stegall or Tonythebookworm on mobiread.com"
__version__ = "1"
__date__ = "16, October 2010"
__docformat__ = "English"



from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString

class TheStarMalaysia(BasicNewsRecipe):
    """calibre news recipe for The Star (Malaysia), fetched via its RSS feed.

    The feed links pass through feedsportal.com, which may serve an
    interstitial page before the article. ``skip_ad_pages`` looks for the
    "continue to article" link on such pages and fetches the real article.
    """

    # --- Descriptive metadata -------------------------------------------
    title = 'TheStarMalaysia'
    __author__ = 'Calibre'
    description = 'The Star Newspaper Malaysia'
    publisher = 'Calibre'
    category = 'news'
    language = 'en'

    # --- Fetch behaviour --------------------------------------------------
    recursions = 0
    use_embedded_content = False
    # NOTE(review): calibre documents oldest_article as a number of days;
    # confirm that 24 days (rather than 24 hours) is what is wanted here.
    oldest_article = 24
    max_articles_per_feed = 30
    remove_empty_feeds = True

    # --- Clean-up / conversion -------------------------------------------
    no_stylesheets = True
    remove_javascript = True
    conversion_options = {'linearize_tables': True}

    # Keep only the main story container and drop whatever follows the
    # story body inside it.
    keep_only_tags = [dict(name='div', attrs={'id': ['story_main']})]
    remove_tags_after = [dict(name='div', attrs={'id': ['story_content']})]

    def skip_ad_pages(self, soup):
        """Return the real article's HTML if ``soup`` is an interstitial page.

        Scans every ``<a href=...>`` element for a direct text child reading
        "click here to continue to article". When one is found, the link
        target is downloaded and its raw HTML is returned.

        Returns ``None`` when no such link exists, which is the value
        calibre's ``skip_ad_pages`` contract uses to signal "not an ad page,
        keep the page already downloaded". Returning the soup itself would be
        treated as replacement markup.
        """
        continue_text = 'click here to continue to article'
        for link in soup.findAll(name='a', attrs={'href': True}):
            for child in link:
                # Only plain text nodes can carry the link caption; nested
                # tags are skipped.
                if not isinstance(child, NavigableString):
                    continue
                # Compare the text node directly (no str() coercion, which
                # can fail on non-ASCII text under Python 2) and ignore
                # surrounding whitespace.
                if child.strip() == continue_text:
                    return self.index_to_soup(link.get('href'), raw=True)
        return None

    def preprocess_html(self, soup):
        """Strip every inline ``style`` attribute from the page and return it."""
        for styled in soup.findAll(attrs={'style': True}):
            del styled['style']
        return soup

    # (title, url) pairs of the RSS feeds to download.
    feeds = [
        (u'News - Nation',
         u'http://thestar.com.my.feedsportal.com/c/33048/f/534555/index.rss'),
    ]
NotTaken is offline   Reply With Quote