At first glance, why not do something along these lines:
You said you wanted the second link
for example:
http://maya.tase.co.il/bursa/report....port_cd=570152
It always has report_cd in it, so why not just pick it out with a regex match?
or maybe use something like this:
Spoiler:
Code:
def parse_index(self):
    """Build the feed index from the hard-coded list of search pages.

    Each (title, url) pair is handed to ``self.make_links``; a feed is
    included only when that call returns a non-empty article list.

    Returns:
        A list of ``(feed_title, articles)`` tuples.
    """
    sources = (
        (u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
    )
    index = []
    for feed_title, feed_url in sources:
        found = self.make_links(feed_url)
        if not found:
            # Nothing scraped for this page: leave the feed out entirely.
            continue
        index.append((feed_title, found))
    return index
def make_links(self, url):
    """Collect article entries from the ``<a class="A3">`` anchors at *url*.

    The page is fetched with ``self.index_to_soup``. Every anchor of class
    ``A3`` whose ``href`` is present and does not contain the text
    ``javascript`` becomes one entry; its link is ``self.INDEX`` followed by
    the ``href`` and its title comes from ``self.tag_to_string``.

    Progress is traced to stdout with ``print``.

    Args:
        url: Address of the index page to scan.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``description``
        and ``date`` (the last two are always empty strings).
    """
    articles = []
    soup = self.index_to_soup(url)
    print('The soup is: %s' % (soup,))
    for anchor in soup.findAll('a', attrs={'class': 'A3'}):
        print('item is: %s' % (anchor,))
        # .get avoids a KeyError on anchors that carry no href at all.
        href = anchor.get('href')
        if href is None or 'javascript' in href:
            continue
        print('FOUND GOOD URL')
        # Kept separate from the ``url`` parameter so the argument is not
        # clobbered while iterating.
        article_url = self.INDEX + href
        print('url is: %s' % (article_url,))
        article_title = self.tag_to_string(anchor)
        print('title is: %s' % (article_title,))
        articles.append({
            'title': article_title,
            'url': article_url,
            'description': '',
            'date': '',
        })
    return articles