Yes, that works — I was able to populate the summary from the article body. I'd like to populate the date as well, but it appears that the soup passed to populate_article_metadata has already been stripped down to the basic article body, removing the tags I'm interested in. I tried the keep_only_tags feature to retain the appropriate tags in the article body, but it didn't work. I see another poster has the same issue with that feature, so I'll just watch that thread. Recipe posted below:
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re, string
import urllib2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class HoustonChronicle(BasicNewsRecipe):
    """Calibre recipe for chron.com.

    The site offers no usable RSS, so parse_index scrapes each section's
    index page for article links, and populate_article_metadata builds a
    short description from the article body when the feed entry has none.
    """

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    # use_embedded_content = False
    remove_attributes = ['style']
    auto_cleanup = True

    def parse_index(self):
        """Scrape the section index pages for article links.

        Returns the list of ``(section_title, articles)`` tuples expected
        by ``BasicNewsRecipe.parse_index``; each article dict carries
        ``title``, ``url``, and empty ``description``/``date`` fields.
        """
        self.timefmt = ' [%a, %d %b, %Y]'
        baseUrl = 'http://www.chron.com'
        pages = [('news', '/news/houston-texas/'),
                 ('business', '/business/'),
                 ('opinion', '/opinion/'),
                 ('sports', '/sports/')]
        feeds = []
        totalLinks = 0
        for page in pages:
            articles = []
            section_links = set()  # dedupe: the same story appears in several divs
            url = urllib2.urlopen(baseUrl + page[1])
            try:
                content = url.read()
            finally:
                url.close()  # don't leak the socket across sections
            soup = BeautifulSoup(content)
            divs = soup.findAll('div', attrs={'class': re.compile('scp-feature|simplelist|scp-item')})
            for div in divs:
                self.log('Page: ', page[0], ' div: ', div['class'],
                         ' Number of Children: ', len(div.findChildren()))
                for child in div.findChildren():
                    if not (isinstance(child, Tag) and child.name == u'a'):
                        continue
                    # .get() avoids a KeyError on <a name=...> anchors
                    # that carry no href at all.
                    href = child.get('href')
                    if not href or len(href) <= 10 or href in section_links:
                        continue
                    # Anchors with no text node, or text too short to be
                    # a headline, are navigation chrome -- skip them.
                    if not child.contents or len(child.contents[0]) <= 10:
                        continue
                    section_links.add(href)
                    # startswith (not substring search) so a relative URL
                    # that merely contains "http" in a query string is
                    # still resolved against the site root.
                    if href.startswith('http'):
                        link = href
                    else:
                        link = baseUrl + href
                    title = child.contents[0]
                    totalLinks += 1
                    self.log('\tFound article ', totalLinks, " at ", title, 'at', link)
                    articles.append({'title': title, 'url': link,
                                     'description': '', 'date': ''})
            if articles:
                feeds.append((page[0], articles))
        self.log('Found ', totalLinks, ' articles --returning feeds')
        return feeds

    def populate_article_metadata(self, article, soup, first):
        """Fill in article.summary from the article body when empty.

        Only runs on the first page of a multi-page article.  Paragraph
        text is accumulated (blank/short paragraphs are appended to
        longer ones) until roughly three lines of text are collected,
        then truncated to ``max_length`` characters.
        """
        if not first:
            return
        max_length = 210  # approximately three lines of text
        try:
            if len(article.text_summary.strip()) == 0:
                articlebody = soup.find('body')
                if articlebody:
                    pieces = []
                    length = 0
                    for p in articlebody.findAll('p'):
                        refparagraph = self.tag_to_string(p, use_alt=False).strip()
                        pieces.append(refparagraph)
                        length += len(refparagraph) + 1
                        if length > max_length:
                            break
                    summary = " ".join(pieces).strip()
                    # Assign even when the whole body is shorter than
                    # max_length; the original only assigned inside the
                    # "> max_length" branch, leaving short articles
                    # with no description at all.
                    if summary:
                        article.summary = article.text_summary = summary[0:max_length]
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit still
            # propagate; a bad description must not abort the download.
            self.log("Error creating article descriptions")
            return