I've got the tags cleaned up tolerably well. The only thing I haven't managed to do is delete specific articles, after parsing, based on their date (at line 148 I'd like to drop articles older than 2 days). I've attached the correct date to the article in populate_article_metadata — is it possible to delete the article there? If so, what's the correct syntax? I couldn't get anything to work.
Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re, string, time
import urllib2
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.utils.date import dt_factory, utcnow, local_tz
from datetime import datetime, timedelta
def getRegularTimestamp(dateString):
    """Parse an ISO-8601 UTC timestamp like '2013-01-02T03:04:05Z'.

    Returns a time.struct_time, or None if dateString does not match
    the expected format (or is not a string at all).
    """
    try:
        return time.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        # ValueError: wrong format; TypeError: non-string input.
        return None
# Matches dates such as "January 5, 2013" or "Jan 5, 2013" (years 2000-2019).
regextest = (
    r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
    r'Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
    r' [0-9]{1,2}, 20[01][0-9]'
)
def GetDateFromString(inText):
    """Extract the first "Month DD, YYYY" date found in inText.

    Returns a time.struct_time, or None if no date is present or the
    matched text cannot be parsed.
    """
    match = re.search(regextest, inText)
    if match is None:
        return None
    # The regex matches both full ("January") and abbreviated ("Jan")
    # month names, so try both strptime directives; the original code
    # only tried %B and silently dropped abbreviated dates.
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        try:
            return time.strptime(match.group(0), fmt)
        except ValueError:
            continue
    return None
def isWithinDays(inTT, daysAgo):
    """Return True if the time tuple inTT is newer than daysAgo days ago.

    inTT: a time.struct_time (or any sequence whose first six items are
    year, month, day, hour, minute, second) in local time.
    """
    # Original code used the near-identical names daysAgoDateTime /
    # DaysAgoDateTime for two *different* values; renamed for clarity.
    cutoff = datetime.now() - timedelta(days=daysAgo)
    articleTime = datetime(*inTT[:6])
    return articleTime > cutoff
def getTimestampFromSoup(soup):
    """Locate an article timestamp in a parsed page.

    Looks first for an <h5 class="...timestamp..."> element (ISO timestamp
    in its title attribute), then for a <span class="post-date"/"entry-date">
    element (human-readable date in its text). Returns a time.struct_time,
    or None if neither is found or parsing fails.
    """
    timestampEle = soup.find('h5', attrs={'class': re.compile('timestamp')})
    if timestampEle is not None:
        try:
            return getRegularTimestamp(timestampEle['title'])
        except (KeyError, TypeError):
            # Element present but has no usable 'title' attribute.
            return None
    timestampEle = soup.find('span', attrs={'class': re.compile('post-date|entry-date')})
    if timestampEle is not None:
        try:
            return GetDateFromString(timestampEle.string)
        except TypeError:
            # .string is None when the span has mixed content.
            return None
    return None
class HoustonChronicle(BasicNewsRecipe):
    """Calibre recipe scraping business-section articles from chron.com."""

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style']
    remove_empty_feeds = True

    keep_only_tags = [dict(name='div', attrs={'class': re.compile('hentry')}),
                      dict(name='span', attrs={'class': re.compile('post-date|entry-date')}),
                      dict(name='h5', attrs={'class': re.compile('timestamp')}),
                      dict(name='div', attrs={'id': re.compile('post-')})]
    remove_tags = [dict(name='div', attrs={'class': 'socialBar'}),
                   dict(name='div', attrs={'class': re.compile('post-commentmeta')}),
                   dict(name='div', attrs={'class': re.compile('slideshow_wrapper')})]

    def parse_index(self):
        """Scrape section index pages and return the feed list."""
        self.timefmt = ' [%a, %d %b, %Y]'
        baseUrl = 'http://www.chron.com'
        pages = [('business', '/business/')]
        feeds = []
        totalLinks = 0
        for page in pages:
            articles = []
            section_links = set()  # de-duplicate hrefs within a section
            soup = self.index_to_soup(baseUrl + page[1])
            divs = soup.findAll('div', attrs={'class': re.compile('scp-feature|simplelist|scp-item')})
            for div in divs:
                for child in div.findChildren():
                    # Only anchors with a substantial href and link text
                    # are treated as article links.
                    if isinstance(child, Tag) and child.name == u'a' and len(child['href']) > 10:
                        if len(child.contents[0]) > 10 and child['href'] not in section_links:
                            section_links.add(child['href'])
                            # Fix: the original used href.find('http') == -1,
                            # which mis-handled relative links that merely
                            # *contain* "http" somewhere in the query string.
                            if child['href'].startswith('http'):
                                link = child['href']
                            else:
                                link = baseUrl + child['href']
                            title = child.contents[0]
                            totalLinks += 1
                            self.log('\tFound article ', totalLinks, " at ", title, 'at', link)
                            articles.append({'title': title, 'url': link,
                                             'description': '', 'date': ''})
            if articles:
                feeds.append((page[0], articles))
        self.log('Found ', totalLinks, ' articles --returning feeds')
        return feeds

    def populate_article_metadata(self, article, soup, first):
        """Attach the parsed timestamp and a short text summary to article."""
        if not first:
            return
        outputParagraph = ""
        max_length = 210  # approximately three lines of text
        # Fix: articleDate was unbound (NameError) when the extraction
        # below raised; initialize it so the check further down is safe.
        articleDate = None
        try:
            articleDate = getTimestampFromSoup(soup)
        except Exception as inst:
            self.log('Exception: ', article.title)
            self.log(type(inst))
            self.log(inst)
        if articleDate is not None:
            try:
                article.date = articleDate
                article.utctime = dt_factory(articleDate, assume_utc=True, as_utc=True)
                article.localtime = article.utctime.astimezone(local_tz)
                if not isWithinDays(articleDate, 2):
                    # Fix: was a Python-2-only print statement; log instead.
                    self.log('Article: ', article.title, ' is more than 2 days old')
            except Exception as inst:
                self.log('Exception: ', article.title)
                self.log(type(inst))
                self.log(inst)
        else:
            self.log(article.title, ' has no timestamp')
        try:
            if len(article.text_summary.strip()) == 0:
                articlebody = soup.find('body')
                if articlebody:
                    paras = articlebody.findAll('p')
                    for p in paras:
                        refparagraph = self.tag_to_string(p, use_alt=False).strip()
                        # Account for blank and short paragraphs by
                        # appending them to longer ones until we have
                        # roughly three lines of summary text.
                        outputParagraph += (" " + refparagraph)
                        if len(outputParagraph) > max_length:
                            article.summary = article.text_summary = outputParagraph.strip()[0:max_length]
                            return
            else:
                article.summary = article.text_summary = article.text_summary
        except Exception:
            self.log("Error creating article descriptions")
            return