MobileRead Forums - View Single Post

dkfurrow · 06-13-2013, 05:25 PM

Thanks for the prompt reply. Okay, I think I've got it: the problem was the descendants attribute. Once I used findChildren instead, it all worked. I've posted the recipe below; I will test it for a while, then submit it for inclusion in calibre.

There's no description on the sites from which I'm obtaining the feeds, but there is a description on the feed destination (the article page itself). Is there any established way to handle this, other than by grabbing the text from a call within parse_index?

Code:

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
__license__   = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import urllib2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag


class HoustonChronicle(BasicNewsRecipe):
    """Recipe that builds its feeds by scraping chron.com section pages.

    The section pages are fetched directly and every sufficiently long
    link found inside the matching ``div`` elements is treated as an
    article.
    """

    title      =  u'The Houston Chronicle'
    description    = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style']
    auto_cleanup = True

    def parse_index(self):
        """Scrape each section page and return the list of feeds.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date``. Sections that yield no articles are
        omitted.
        """
        self.timefmt = ' [%a, %d %b, %Y]'
        baseUrl = 'http://www.chron.com'
        pages = [
            ('news', '/news/houston-texas/'),
            ('business', '/business/'),
            ('opinion', '/opinion/'),
            ('sports', '/sports/'),
        ]
        # Compiled once here rather than once per section page.
        div_class = re.compile('scp-feature|simplelist')
        feeds = []
        totalLinks = 0
        for section, path in pages:
            articles = []
            # hrefs already collected for this section, to skip duplicates.
            section_links = set()
            response = urllib2.urlopen(baseUrl + path)
            try:
                content = response.read()
            finally:
                # Always release the connection, even if the read fails.
                response.close()
            soup = BeautifulSoup(content)
            for div in soup.findAll('div', attrs={'class': div_class}):
                children = div.findChildren()
                self.log( 'Page: ', section, ' div: ', div['class'], ' Number of Children: ', len(children) )
                for child in children:
                    if not isinstance(child, Tag) or child.name != u'a':
                        continue
                    # Anchors without an href (e.g. named anchors) used to
                    # raise KeyError; short hrefs are skipped as before.
                    href = child.get('href')
                    if not href or len(href) <= 10:
                        continue
                    # The title is the anchor's leading text node. Skip
                    # empty anchors (IndexError before) and anchors whose
                    # first child is a nested tag rather than text.
                    if not child.contents or isinstance(child.contents[0], Tag):
                        continue
                    title = child.contents[0]
                    if len(title) <= 10 or href in section_links:
                        continue
                    section_links.add(href)
                    # Only a leading 'http' marks an absolute URL; a
                    # site-relative path that merely contains 'http'
                    # still needs the base URL prepended.
                    if href.startswith('http'):
                        link = href
                    else:
                        link = baseUrl + href
                    totalLinks += 1
                    self.log('\tFound article ', totalLinks, " at " ,title, 'at', link)
                    articles.append({'title':title, 'url':link, 'description':'', 'date':''})
            if articles:
                feeds.append((section, articles))
        self.log('Found ', totalLinks, ' articles --returning feeds')
        return feeds