Quote:
Originally Posted by cynvision
Ah yes. I'm still not comfortable with how the multiple page link following works. You'd have to follow the 'more articles' link at least once to get more than one article from that author.
Okay, I'm sure there might be another way to do this and reduce the redundancy, but I'm not certain how to do that yet. Anyway, this will work. The only issue I see is that the title stays the same for all the articles (but I'll leave that one to you).
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re

class AlisonB(BasicNewsRecipe):
    title                 = 'Alison Berkley Column'
    __author__            = 'Tonythebookworm'
    description           = 'Some dudes column'
    language              = 'en'
    no_stylesheets        = True
    publisher             = 'Tonythebookworm'
    category              = 'column'
    use_embedded_content  = False
    oldest_article        = 24
    remove_javascript     = True
    remove_empty_feeds    = True
    max_articles_per_feed = 10
    INDEX                 = 'http://www.aspentimes.com'

    def parse_index(self):
        feeds = []
        for title, url in [
                           (u"Alison Berkley", u"http://www.aspentimes.com/SECTION/&Profile=1021&ParentProfile=1061"),
                          ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        soup = self.index_to_soup(url)
        print 'The soup is: ', soup
        # first pass: collect the author's articles from the section page
        for item in soup.findAll('div', attrs={'class':'title'}):
            print 'item is: ', item
            link = item.find('a')
            print 'the link is: ', link
            titlecheck = self.tag_to_string(link)
            # once we get a link we need to check to see if it contains Alison Berkley and if it does use it
            if link.find(text=re.compile('Alison Berkley')):
                print 'FOUND TITLE AND IT IS : ', titlecheck
                url = self.INDEX + link['href']
                title = self.tag_to_string(link)
                print 'the title is: ', title
                print 'the url is: ', url
                current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this

        # FIND MORE LINKS HERE: follow the 'More Articles' link up to 5 more times
        counter = 0
        while counter <= 5:
            for item in soup.findAll('span', attrs={'class':'links'}):
                # print 'item is: ', item
                link = item.find('a')
                if link.find(text=re.compile('More Articles')):
                    print 'counter is : ', counter
                    url = self.INDEX + link['href']
                    print 'THE NEXT URL IS: ', url
                    soup = self.index_to_soup(url)
                    for item in soup.findAll('div', attrs={'class':'title'}):
                        link = item.find('a')
                        titlecheck = self.tag_to_string(link)
                        # once we get a link we need to check to see if it contains Alison Berkley and if it does use it
                        if link.find(text=re.compile('Alison Berkley')):
                            print 'FOUND NEW TITLES AND IT IS : ', titlecheck
                            url = self.INDEX + link['href']
                            title = self.tag_to_string(link)
                            print 'the title is: ', title
                            print 'the url is: ', url
                            current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
            counter += 1
        return current_articles

    def print_version(self, url):
        split1 = url.split("article")
        print 'THE SPLIT IS: ', split1
        # original is:  http://www.aspentimes.com/article/20100909/COLUMN/100909869/1021&parentprofile=1061
        # need this to be the print url:
        # http://www.aspentimes.com/apps/pbcs.dll/article?AID=/20100909/COLUMN/100909869/1021&parentprofile=1061&template=printart
        print_url = 'http://www.aspentimes.com/apps/pbcs.dll/article?AID=' + split1[1] + '&template=printart'
        print 'THIS URL WILL PRINT: ', print_url # this is a test string to see what the url is it will return
        return print_url
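
On the redundancy point above: one way it might be cut down is to pull the per-page scraping out into small helpers and loop over pages, so the same findAll/regex block isn't repeated inside the 'More Articles' loop. This is just a rough, untested sketch (collect_articles, find_next_page and MAX_PAGES are names I made up, and the class attributes would be the same as in the recipe above):

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import re

class AlisonB(BasicNewsRecipe):
    # ... same attributes as the recipe above ...
    INDEX     = 'http://www.aspentimes.com'
    MAX_PAGES = 6  # assumed cap: first page plus up to 5 'More Articles' pages

    def collect_articles(self, soup):
        # pull every 'Alison Berkley' link off one section page
        articles = []
        for item in soup.findAll('div', attrs={'class':'title'}):
            link = item.find('a')
            if link and link.find(text=re.compile('Alison Berkley')):
                articles.append({'title': self.tag_to_string(link),
                                 'url': self.INDEX + link['href'],
                                 'description': '', 'date': ''})
        return articles

    def find_next_page(self, soup):
        # return the 'More Articles' URL on this page, or None if there isn't one
        for span in soup.findAll('span', attrs={'class':'links'}):
            link = span.find('a')
            if link and link.find(text=re.compile('More Articles')):
                return self.INDEX + link['href']
        return None

    def make_links(self, url):
        current_articles = []
        for page in range(self.MAX_PAGES):
            soup = self.index_to_soup(url)
            current_articles.extend(self.collect_articles(soup))
            url = self.find_next_page(soup)
            if url is None:
                break
        return current_articles

parse_index and print_version would stay as they are. This only tidies the duplication; each article's title is still just the link text, so the "same title" issue you mention would still need a separate fix.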