MobileRead Forums - View Single Post - Beautiful soup findAll doesn't seem to work

NotTaken · 07-27-2012, 09:44 PM

The unordered list is being moved from within the section tags to after them (possibly due to some non-compliant nesting). You could try a workaround like this:

Code:

    def find_articles(self, section):
        """Yield one article dict for every ``package-link`` element in *section*.

        *section* is any parsed node exposing ``findAll``. Each yielded dict
        has the keys ``title``, ``url``, ``description`` and ``date``; the
        last two are always empty strings. Site-relative links (those that
        start with ``/``) are prefixed with the Economist host and suffixed
        with ``/print``; every other link is passed through unchanged.

        Elements that carry the class but have no ``href`` are skipped, so a
        single link-less element no longer aborts the whole scan with a
        ``KeyError``.
        """
        for post in section.findAll(attrs={'class':'package-link'}):
            url = post.get('href')
            if not url:
                # Nothing to download for this element; move on to the next.
                continue
            title = self.tag_to_string(post)
            if url.startswith('/'):
                url = 'http://www.economist.com' + url + '/print'
            self.log('\tFound article:', title, 'at', url)
            yield {'title':title, 'url':url, 'description':'',
                   'date':''}
            

    def parse_index(self):
        """Build the feed list from the index page at ``self.INDEX``.

        Every ``<section>`` that has an ``<h1>`` becomes a candidate feed
        named after the stripped heading text. Its articles are gathered via
        ``self.find_articles`` from the section itself and then from the
        first ``<ul>`` that follows it as a sibling, if there is one.
        Sections that end up with no articles are left out.

        Returns a list of ``(section_title, articles)`` tuples.
        """
        index_soup = self.index_to_soup(self.INDEX)
        result = []
        for sec in index_soup.findAll('section'):
            heading = sec.find('h1')
            if heading is not None:
                name = self.tag_to_string(heading).strip()
                self.log('Found section: %s' % name)

                # Links still nested inside the section come first ...
                found = list(self.find_articles(sec))

                # ... followed by those in the list sitting after the section.
                trailing_list = sec.findNextSibling('ul')
                if trailing_list:
                    found += self.find_articles(trailing_list)

                if found:
                    result.append((name, found))
        return result