Quote:
Originally Posted by bcollier
Ok, thanks for the quick response. Where is the documentation for the article object being passed in? I'm just looking for the main article text and can't seem to get it in the populate_article_metadata. I have
if len(article.text_summary) == 0:
article.text_summary = "the first two sentences of the article"
Should I somehow pull the main article text from soup, or is it already parsed in the article object?
|
This populate_article_metadata() function was once in the NYTimes recipe, but was removed at some point. You can use it as a point of reference:
Spoiler:
Code:
def populate_article_metadata(self, article, soup, first):
    '''
    Extract author and description from article, add to article metadata.

    article -- object whose author, summary and text_summary attributes
               are filled in.
    soup    -- parsed article page; it is queried with find()/findAll().
    first   -- not used by this implementation; kept because it is part
               of the signature.

    A value is only written when something was actually extracted, so an
    author or summary already present on the article is not replaced by
    None when the page lacks the expected markup.
    '''
    def extract_author(soup):
        # Preferred source: <meta name="byl"> / <meta name="CLMST">.
        byline = soup.find('meta', attrs={'name': ['byl', 'CLMST']})
        if byline:
            return byline['content']

        # Try for <div class="byline">
        byline = soup.find('div', attrs={'class': 'byline'})
        if byline:
            # Reduce the tag to plain text, the same way the description
            # path does, so nested markup does not end up in the author.
            return self.tag_to_string(byline, use_alt=False)

        # Nothing usable: dump the page to help debug the recipe.
        print(soup.prettify())
        return None

    def extract_description(soup):
        description = soup.find(
            'meta', attrs={'name': ['description', 'description ']})
        if description:
            return self.massageNCXText(description['content'])

        # Take first paragraph of article
        articlebody = soup.find('div', attrs={'id': 'articlebody'})
        if not articlebody:
            # Try again with class instead of id
            articlebody = soup.find('div', attrs={'class': 'articlebody'})
            if not articlebody:
                print('postprocess_book.extract_description(): Did not find <div id="articlebody">:')
                print(soup.prettify())
                return None

        for p in articlebody.findAll('p'):
            # Skip empty and whitespace-only paragraphs.
            if p.renderContents().strip():
                return self.massageNCXText(
                    self.tag_to_string(p, use_alt=False))
        return None

    author = extract_author(soup)
    if author:
        article.author = author

    description = extract_description(soup)
    if description:
        article.summary = article.text_summary = description
G