Old 06-19-2013, 05:50 PM   #5
dkfurrow
Member
Posts: 13
Karma: 10
Join Date: Jun 2013
Device: LG G-Pad 8.3
Yes, that works; I was able to populate the summary from the article body. I'd like to populate the date as well, but it appears the soup passed to populate_article_metadata has already been stripped down to the basic article body, which removes the tags I'm interested in. I tried using the keep_only_tags feature to add the appropriate tags back to the article body, but that didn't work. I see another poster has the same issue with that feature, so I'll just watch that thread.
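For reference, the keep_only_tags attempt looked roughly like this (the div classes here are placeholders, not necessarily chron.com's actual markup):
Code:
keep_only_tags = [dict(name='div', attrs={'class': re.compile('article-content|article-title')})]  # placeholder classes

Full recipe posted below: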
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import urllib2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class HoustonChronicle(BasicNewsRecipe):

    title      =  u'The Houston Chronicle'
    description    = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    #use_embedded_content = False
    remove_attributes = ['style']
    auto_cleanup = True
    

    def parse_index(self):
        
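        # build the feed list by scraping each section's landing page directly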
        self.timefmt = ' [%a, %d %b, %Y]'
        baseUrl = 'http://www.chron.com'
        pages = [('news', '/news/houston-texas/'),
                 ('business', '/business/'),
                 ('opinion', '/opinion/'),
                 ('sports', '/sports/')]
        feeds = []
        totalLinks = 0
        for page in pages:
            articles = []
            section_links = set()
            url = urllib2.urlopen(baseUrl + page[1])
            content = url.read()
            soup = BeautifulSoup(content)
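            # article links on the section pages live inside these div classes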
            divs = soup.findAll('div', attrs={'class': re.compile('scp-feature|simplelist|scp-item')})
            for div in divs:
                self.log('Page: ', page[0], ' div: ', div['class'], ' Number of Children: ', len(div.findChildren()))
                for child in div.findChildren():
                    if isinstance(child, Tag) and child.name == u'a' and len(child.get('href', '')) > 10:
                        if child.contents and len(child.contents[0]) > 10 and child['href'] not in section_links:
                            section_links.add(child['href'])
                            if child['href'].find('http') == -1:
                                link = baseUrl + child['href']
                            else:
                                link = child['href']
                            title = child.contents[0]
                            totalLinks += 1
                            self.log('\tFound article ', totalLinks, ': ', title, ' at ', link)
                            articles.append({'title':title, 'url':link, 'description':'', 'date':''})
            if articles:
                feeds.append((page[0], articles))
        self.log('Found ', totalLinks, ' articles --returning feeds')
        return feeds
        
    def populate_article_metadata(self, article, soup, first):
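        # only build the description the first time the article is encountered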
        if not first:
            return
        outputParagraph = ""
        max_length = 210  # approximately three lines of text
        try:
            if len(article.text_summary.strip()) == 0:
                articlebody = soup.find('body')
                if articlebody:
                    paras = articlebody.findAll('p')
                    for p in paras:
                        refparagraph = self.tag_to_string(p, use_alt=False).strip()
                        # account for blank and short paragraphs by appending them to longer ones
                        outputParagraph += (" " + refparagraph)
                        if len(outputParagraph) > max_length:
                            article.summary = article.text_summary = outputParagraph.strip()[0:max_length]
                            return
            else:
                # text_summary is already populated; mirror it into summary
                article.summary = article.text_summary
        except Exception:
            self.log("Error creating article descriptions")
            return
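
A possible workaround I may try (untested sketch): since the soup handed to populate_article_metadata is already cleaned, re-fetch the full page via article.url and pull the date out of the raw HTML. It may also be that auto_cleanup = True is what is overriding keep_only_tags here. The 'dateline' span class below is a placeholder; the real tag would have to be read off chron.com's page source:
Code:
    # untested sketch; relies on the recipe's existing urllib2/BeautifulSoup imports
    def populate_article_metadata(self, article, soup, first):
        if not first:
            return
        try:
            # the soup argument is already stripped, so fetch the full page again
            raw = urllib2.urlopen(article.url).read()
            fullsoup = BeautifulSoup(raw)
            datetag = fullsoup.find('span', attrs={'class': 'dateline'})  # placeholder class
            if datetag:
                # prepend the date string to the summary; article.date itself
                # may expect a parsed time rather than a raw string
                article.summary = self.tag_to_string(datetag).strip() + ' ' + (article.summary or '')
        except Exception:
            self.log('Could not fetch date for ', article.url)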