View Single Post
Old 09-12-2010, 12:19 PM   #2704
TonytheBookworm
Addict
TonytheBookworm is on a distinguished road
 
TonytheBookworm's Avatar
 
Posts: 264
Karma: 62
Join Date: May 2010
Device: kindle 2, kindle 3, Kindle fire
Quote:
Originally Posted by cynvision View Post
Ah yes. I'm still not comfortable with how the multiple page link following works. You'd have to follow the 'more articles' link at least once to get more than one article from that author.
Okay, I'm sure there is another way to do this that reduces the redundancy, but I'm not certain how to do that yet. Anyway, this will work. The only issue I see is that the title is the same for all the articles (but I'll leave that one to you).
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
class AlisonB(BasicNewsRecipe):
    """Calibre recipe that gathers Alison Berkley's columns from aspentimes.com.

    The section index page is scraped for ``<div class="title">`` entries
    whose link text mentions the author, then the "More Articles" pager link
    is followed a bounded number of times to pick up older columns.  Each
    article is downloaded through the site's print template (see
    ``print_version``).
    """

    title = 'Alison Berkley Column'
    __author__ = 'Tonythebookworm'
    description = 'Some dudes column'
    language = 'en'
    publisher = 'Tonythebookworm'
    category = 'column'
    use_embedded_content = False
    no_stylesheets = True
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True

    max_articles_per_feed = 10
    INDEX = 'http://www.aspentimes.com'

    # Upper bound on how many "More Articles" pages are followed after the
    # first index page (the original loop ran for counter 0..5, i.e. 6 passes).
    MAX_FOLLOW_PAGES = 6

    # Compiled once here instead of on every loop iteration.
    _AUTHOR_RE = re.compile('Alison Berkley')
    _MORE_RE = re.compile('More Articles')

    def parse_index(self):
        """Build the feed list calibre expects.

        Returns a list of ``(feed_title, articles)`` tuples; sections that
        yield no articles are left out.
        """
        sections = [
            (u"Alison Berkley",
             u"http://www.aspentimes.com/SECTION/&Profile=1021&ParentProfile=1061"),
        ]
        feeds = []
        for title, url in sections:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect the author's articles from ``url`` and its follow-up pages.

        ``url`` is the section index page.  Returns a list of article dicts
        with the keys ``title``, ``url``, ``description`` and ``date``, in
        the order they were found, without duplicate URLs.
        """
        seen_urls = set()
        soup = self.index_to_soup(url)
        articles = self._collect_articles(soup, seen_urls)

        visited_pages = set([url])
        for _ in range(self.MAX_FOLLOW_PAGES):
            next_url = self._next_page_url(soup)
            # Stop as soon as there is no further page (or the pager points
            # back at a page already read); re-parsing the same soup would
            # only append the same articles again.
            if next_url is None or next_url in visited_pages:
                break
            visited_pages.add(next_url)
            self.log.debug('Following next page:', next_url)
            soup = self.index_to_soup(next_url)
            articles.extend(self._collect_articles(soup, seen_urls))

        return articles

    def _absolute_url(self, href):
        """Return ``href`` as an absolute URL, prefixing INDEX when relative."""
        if href.startswith(('http://', 'https://')):
            return href
        return self.INDEX + href

    def _collect_articles(self, soup, seen_urls):
        """Return article dicts for the author's links found in ``soup``.

        ``seen_urls`` is a set of article URLs already collected; it is
        updated in place so repeated listings are skipped across pages.
        """
        found = []
        for item in soup.findAll('div', attrs={'class': 'title'}):
            link = item.find('a')
            # A title block without an anchor cannot be an article link.
            if link is None:
                continue
            # Only keep links whose text mentions the author.
            if not link.find(text=self._AUTHOR_RE):
                continue
            href = link.get('href')
            if not href:
                continue
            article_url = self._absolute_url(href)
            if article_url in seen_urls:
                continue
            seen_urls.add(article_url)
            article_title = self.tag_to_string(link)
            self.log.debug('Found article:', article_title, article_url)
            found.append({'title': article_title, 'url': article_url,
                          'description': '', 'date': ''})
        return found

    def _next_page_url(self, soup):
        """Return the absolute URL of the "More Articles" link, or None."""
        next_url = None
        for item in soup.findAll('span', attrs={'class': 'links'}):
            link = item.find('a')
            if link is None or not link.find(text=self._MORE_RE):
                continue
            href = link.get('href')
            if href:
                # If the pager appears more than once on the page, keep the
                # last match; the caller fetches it a single time.
                next_url = self._absolute_url(href)
        return next_url

    def print_version(self, url):
        """Map an article URL to the site's print-template URL.

        original is:
            http://www.aspentimes.com/article/20100909/COLUMN/100909869/1021&parentprofile=1061
        print url is:
            http://www.aspentimes.com/apps/pbcs.dll/article?AID=/20100909/COLUMN/100909869/1021&parentprofile=1061&template=printart

        If ``url`` does not contain ``article`` it is returned unchanged.
        """
        # Split on the first "article" only, so a later occurrence of the
        # word in the path or query string does not truncate the result.
        _, marker, article_path = url.partition('article')
        if not marker:
            self.log.warn('No print version pattern in URL, using as is:', url)
            return url
        print_url = (self.INDEX + '/apps/pbcs.dll/article?AID=' +
                     article_path + '&template=printart')
        self.log.debug('Print URL:', print_url)
        return print_url

Last edited by TonytheBookworm; 09-12-2010 at 04:45 PM. Reason: updated code to run 5 times
TonytheBookworm is offline