so thanks starson17. it now downloads my articles (there is still a lot of work, but i get news at the end, and not an error).
i didn't think of this when i started, but can calibre deal with pdf files?
some of the reports come in pdf form. i get gibberish where the pdf used to be. can i do anything about it? does it matter if my output format is pdf?
this is the code:
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
class AlisonB(BasicNewsRecipe):
    """Calibre recipe that builds a feed from a maya.tase.co.il search page.

    The search-result page lists reports as ``<a class="A3">`` links.  Each
    report page embeds its content in one or more ``<iframe>`` elements, so
    the iframe ``src`` is what gets registered as the article URL.
    """

    title = 'blah'
    __author__ = 'Tonythebookworm'
    description = 'blah'
    language = 'en'
    no_stylesheets = True
    publisher = 'Tonythebookworm'
    category = 'column'
    use_embedded_content = False
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True
    max_articles_per_feed = 10
    # Site root; report links on the search page are resolved against
    # INDEX + 'bursa/'.
    INDEX = 'http://maya.tase.co.il/'

    def parse_index(self):
        """Return the feed list as ``[(feed_title, [article_dict, ...]), ...]``.

        Feeds for which no articles were found are left out.
        """
        feeds = []
        for title, url in [
            (u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
        ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect article entries reachable from the search page at *url*.

        Returns a list of dicts with the keys ``title``, ``url``,
        ``description`` and ``date``; the last two are always empty strings.
        """
        current_articles = []
        soup = self.index_to_soup(url)
        self.log('url is', url)
        for item in soup.findAll('a', attrs={'class': 'A3'}):
            self.log('item is: ', item)
            href = item.get('href')
            # Anchors without a target, or with a javascript: pseudo-link,
            # do not lead to a report page.
            if not href or 'javascript' in href:
                continue
            report_url = self.INDEX + 'bursa/' + href
            self.log('url1 is', report_url)
            report_soup = self.index_to_soup(report_url)
            # The link text on the search page serves as the article title.
            title = self.tag_to_string(item)
            for frame in report_soup.findAll('iframe'):
                self.log('item1 is:', frame)
                article_url = frame.get('src')
                if not article_url:
                    # An iframe without a src has nothing to download.
                    continue
                # NOTE(review): some iframe targets are reportedly PDF files,
                # which come out as gibberish -- consider filtering them here.
                self.log('url is: ', article_url)
                self.log('title is: ', title)
                current_articles.append({
                    'title': title,
                    'url': article_url,
                    'description': '',
                    'date': '',
                })
        return current_articles
the 2nd article is a pdf file. (i am working with a feed that is very rarely updated, so i know the page format very well.)
can i import a library that deals with pdf?
thanks for the help.
ps
i also wanted to know if you can add an output file type to the recipe itself that will override the default for calibre (if the default is pdf, but i want one self-built recipe to come out as epub?)