View Single Post
Old 09-12-2010, 12:47 AM   #2697
TonytheBookworm
Addict
TonytheBookworm is on a distinguished road
 
TonytheBookworm's Avatar
 
Posts: 264
Karma: 62
Join Date: May 2010
Device: kindle 2, kindle 3, Kindle fire
Quote:
Originally Posted by cynvision View Post
I took a look at this one and if there was a way to get the RSS of the weekly archive... but I don't see one. Maybe it's members only?
I didn't see an RSS feed for it, so I just parsed the links.
I'm not certain whether that page will list the same links each time, because I have no way of testing it. But the following code uses the link provided by the original poster, parses the links on that page, and looks for "Alison Berkley" in the link text. If a match is found, that link is used and converted by print_version to the printer-friendly version...
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
class FIELDSTREAM(BasicNewsRecipe):
    """Recipe for Alison Berkley's column in the Aspen Times.

    The site exposes no RSS feed for the column, so parse_index() scrapes
    the section index page and keeps only the links whose text mentions
    'Alison Berkley'; print_version() then rewrites each article URL into
    the site's printer-friendly form.
    """
    title      = 'Alison Berkley Column'
    __author__ = 'Tonythebookworm'
    description = 'Some dudes column'
    language = 'en'
    publisher           = 'Tonythebookworm'
    category            = 'column'
    use_embedded_content= False
    no_stylesheets      = True  # was declared twice in the original; once suffices
    oldest_article      = 24
    remove_javascript   = True
    remove_empty_feeds  = True

    max_articles_per_feed = 10
    # Site root: relative hrefs scraped from the index page are joined to it.
    INDEX = 'http://www.aspentimes.com'

    def parse_index(self):
        """Return the feed list: one (title, articles) pair per section URL.

        Sections that yield no matching articles are dropped so calibre
        does not build an empty feed.
        """
        feeds = []
        for feed_title, section_url in [
                (u"Alison Berkley", u"http://www.aspentimes.com/SECTION/&Profile=1021&ParentProfile=1061"),
                ]:
            articles = self.make_links(section_url)
            if articles:
                feeds.append((feed_title, articles))
        return feeds

    def make_links(self, url):
        """Scrape *url* and return article dicts for links naming the columnist.

        Each dict carries the keys calibre expects: title, url,
        description and date (the latter two are left blank because the
        index page does not provide them).
        """
        current_articles = []
        soup = self.index_to_soup(url)
        for item in soup.findAll('div', attrs={'class': 'title'}):
            link = item.find('a')
            if link is None:
                # A title div without an anchor has nothing to collect;
                # the original code would have crashed here.
                continue
            # Keep only links whose text mentions the columnist.
            if link.find(text=re.compile('Alison Berkley')):
                current_articles.append({
                    'title': self.tag_to_string(link),
                    'url': self.INDEX + link['href'],
                    'description': '',
                    'date': '',
                })
        return current_articles

    def print_version(self, url):
        """Rewrite an article URL to its printer-friendly version.

        e.g. http://www.aspentimes.com/article/20100909/COLUMN/100909869/1021&parentprofile=1061
        ->   http://www.aspentimes.com/apps/pbcs.dll/article?AID=/20100909/COLUMN/100909869/1021&parentprofile=1061&template=printart
        """
        parts = url.split('article')
        if len(parts) < 2:
            # Unexpected URL shape: fall back to the original page rather
            # than raising IndexError as the original code did.
            return url
        return ('http://www.aspentimes.com/apps/pbcs.dll/article?AID='
                + parts[1] + '&template=printart')
TonytheBookworm is offline