dkfurrow (06-27-2013, 11:13 PM)
Got the tags cleaned up tolerably well. The only thing I haven't managed is deleting specific articles after parsing, based on date (the 2-day check near line 148, where I'd like to drop any article more than 2 days old). I've attached the correct date to the article in populate_article_metadata. Is it possible to delete the article there, and if so, what's the correct syntax? I couldn't make anything work.
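The closest lead I've turned up so far is BasicNewsRecipe.abort_article(), which, if I'm reading the API docs right, can be called from inside any of the preprocess methods to skip the current article. As far as I can tell it can't be called from populate_article_metadata, but a sketch along these lines (untested, I haven't gotten it working yet), reusing the date helpers from the recipe below, is what I have in mind:

Code:
    def preprocess_html(self, soup):
        # untested sketch: abort_article() raises a special exception
        # that tells the downloader to drop the current article, so
        # anything older than 2 days never makes it into the book
        articleDate = getTimestampFromSoup(soup)
        if articleDate is not None and not isWithinDays(articleDate, 2):
            self.abort_article('Article is more than 2 days old')
        return soup
The brute-force alternative would be to fetch each article page inside parse_index (one extra self.index_to_soup call per link) and skip old links before they are ever appended, at the cost of downloading every page twice.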


Code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__   = 'GPL v3'
__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re, time
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.utils.date import dt_factory, utcnow, local_tz
from datetime import datetime, timedelta


def getRegularTimestamp(dateString):
    # parse ISO-8601 timestamps like "2013-06-27T18:13:00Z"
    try:
        return time.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        return None

# matches long-form dates like "June 27, 2013" (month names may be abbreviated)
regextest = '(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|\
Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?) \
[0-9]{1,2}, 20[01][0-9]'

def GetDateFromString(inText):
    # pull the first long-form date out of arbitrary text; try the full
    # month-name format first, then the abbreviated one ("%B" alone
    # would miss matches like "Jun 27, 2013")
    match = re.findall(regextest, inText)
    if match:
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return time.strptime(match[0], fmt)
            except ValueError:
                pass
    return None

def isWithinDays(inTT, daysAgo):
    # True if the struct_time inTT is more recent than daysAgo days ago
    cutoff = datetime.now() - timedelta(days=daysAgo)
    articleDateTime = datetime(inTT[0], inTT[1], inTT[2],
                               inTT[3], inTT[4], inTT[5])
    return articleDateTime > cutoff
    

def getTimestampFromSoup(soup):
    # prefer the h5 timestamp element (ISO date in its title attribute);
    # fall back to the span post-date/entry-date text
    timestampEle = soup.find('h5', attrs={'class': re.compile('timestamp')})
    if timestampEle is not None:
        try:
            return getRegularTimestamp(timestampEle['title'])
        except KeyError:
            return None
    timestampEle = soup.find('span', attrs={'class': re.compile('post-date|entry-date')})
    if timestampEle is not None:
        try:
            return GetDateFromString(timestampEle.string)
        except TypeError:
            return None
    return None


class HoustonChronicle(BasicNewsRecipe):

    title      =  u'The Houston Chronicle'
    description    = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    #use_embedded_content = False
    remove_attributes = ['style']
    remove_empty_feeds = True
    
    
    keep_only_tags = [dict(name='div', attrs={'class':re.compile('hentry')}), 
                      dict(name='span', attrs={'class':re.compile('post-date|entry-date')}), 
                      dict(name='h5', attrs={'class':re.compile('timestamp')}), 
                      dict(name='div', attrs={'id':re.compile('post-')}) ]
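    # keep the date elements above so getTimestampFromSoup can still
    # find them in the cleaned soup during populate_article_metadata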
    
    
    remove_tags = [dict(name='div', attrs={'class':'socialBar'}), 
                   dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
                   dict(name='div', attrs={'class':re.compile('slideshow_wrapper')})]
        
    

    def parse_index(self):
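        # no RSS: build the feed list by scraping the section landing
        # pages and collecting unique article links from known div classes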
        
        self.timefmt = ' [%a, %d %b, %Y]'
        baseUrl = 'http://www.chron.com'
        pages = [('business' , '/business/')]
        feeds = []
        totalLinks = 0
        for page in pages:
            articles = []
            section_links = set()
            soup = self.index_to_soup(baseUrl + page[1]) 
            divs = soup.findAll('div', attrs={'class': re.compile('scp-feature|simplelist|scp-item')})
            for div in divs:
                #self.log( 'Page: ', page[0], ' div: ', div['class'], ' Number of Children: ', len(div.findChildren()) )
                for child in div.findChildren():
                    if isinstance(child, Tag) and child.name == u'a' \
                            and child.get('href') and len(child['href']) > 10:
                        if len(child.contents[0]) > 10 and child['href'] not in section_links:
                            section_links.add(child['href'])
                            # prefix relative links with the site root
                            if child['href'].find('http') == -1:
                                link = baseUrl + child['href']
                            else:
                                link = child['href']
                            title = child.contents[0]
                            totalLinks += 1
                            self.log('\tFound article ', totalLinks, ': ', title, ' at ', link)
                            articles.append({'title': title, 'url': link, 'description': '', 'date': ''})
            if articles:
                feeds.append((page[0], articles))
        self.log('Found ', totalLinks, ' articles -- returning feeds')
        return feeds
    
    
    
        
    def populate_article_metadata(self, article, soup, first):
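        # calibre calls this once per downloaded page of each article;
        # 'first' is True only for the first page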
        if not first:
            return
        outputParagraph = ""
        max_length = 210 #approximately three line of text
        #self.log('printing article: ', article.title) # remove after debug
        #self.log(soup.prettify()) # remove after debug
        try:
            articleDate = getTimestampFromSoup(soup) # remove after debug
        except Exception as inst: # remove after debug
            self.log('Exception: ', article.title) # remove after debug
            self.log(type(inst)) # remove after debug
            self.log(inst) # remove after debug
        if articleDate is not None:
            dateText = time.strftime('%Y-%m-%d', articleDate)
            #self.log(article.title, ' has timestamp of ', dateText)
            #self.log('Article Date is of type: ', type(article.date)) # remove after debug
            #self.log('Derived time is of type: ', type(articleDate)) # remove after debug
            try:
                article.date = articleDate
                article.utctime = dt_factory(articleDate, assume_utc=True, as_utc=True)
                article.localtime = article.utctime.astimezone(local_tz)
                if not isWithinDays(articleDate, 2):
                    self.log('Article: ', article.title, ' is more than 2 days old')
            except Exception as inst: # remove after debug
                self.log('Exception: ', article.title) # remove after debug
                self.log(type(inst)) # remove after debug
                self.log(inst) # remove after debug
        else:
            dateText = time.strftime('%Y-%m-%d', time.gmtime())
            self.log(article.title, ' has no timestamp')
            #article.date = strftime('%a, %d %b') # remove after debug
        try:
            if len(article.text_summary.strip()) == 0:
                articlebody = soup.find('body')
                if articlebody:
                    paras = articlebody.findAll('p')
                    for p in paras:
                        refparagraph = self.tag_to_string(p, use_alt=False).strip()
                        # account for blank and short paragraphs by appending
                        # them to longer ones until the summary is long enough
                        outputParagraph += (" " + refparagraph)
                        if len(outputParagraph) > max_length:
                            article.summary = article.text_summary = outputParagraph.strip()[0:max_length]
                            return
            else:
                # the feed already supplied a summary; mirror it into both fields
                article.summary = article.text_summary = article.text_summary
        except Exception:
            self.log("Error creating article descriptions")
            return