Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
import urllib
class AlisonB(BasicNewsRecipe):
    title = 'Maya'
    __author__ = 'marbs'
    description = 'Company announcements from the TASE Maya disclosure site'
    language = 'en'
    publisher = 'marbs'
    category = 'column'
    # simultaneous_downloads = 1
    # delay = 25
    # The site is in Hebrew, so lay everything out right-to-left.
    extra_css = 'body{direction: rtl;} .article_description{direction: rtl;} a.article{direction: rtl;} .calibre_feed_description{direction: rtl;}'
    use_embedded_content = False
    remove_attributes = ['width', 'height']
    no_stylesheets = True
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True
    rec_index = 0
    max_articles_per_feed = 5000
INDEX = 'http://maya.tase.co.il/'
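    # Pull in additional pages of search results by replaying the site's
    # search form as a POST (the results are paginated server-side). The
    # commented-out block at the end of this method is an unfinished attempt
    # to splice each extra page into the article body.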
    def append_page(self, url, soup, appendtag, articles_left, position):
        # The results list pages by 30 announcements (repTotal=30); track how
        # many are left and the running index of the current article.
        articles_left = articles_left - 30
        articlenum = articles_left + (30 * position)
        print articles_left, articlenum
        br = BasicNewsRecipe.get_browser(self)
        if articles_left < 0:  # bug fix: the original compared against the string '0', which is always True in Python 2
            print 'no more result pages left'
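        # Re-submit the search form to request the given results page. The
        # _method / _BTNNEXT_state / _BTNPREV_state values are URL-encoded ASP
        # postback state copied from the live page; they emulate a click on
        # the "next page" button (the kadima.gif alt text decodes to the
        # Hebrew for "to the next page").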
request = urllib.urlencode([
('view','search'),
('arg_comp',''),
('srh_company_group','3000'),
('srh_company_press',''),
('srh_comp_lb',''),
('srh_free_text_opt','1'),
('cmbHavarottext',''),
('cmbHavarothidden',''),
('srh_txt',''),
('optionFTSearch','1'),
('srh_from','2010-01-01'),
('srh_from_yr','2010'),
('srh_from_mon','1'),
('srh_from_day','1'),
('srh_until','2010-10-28'),
('srh_until_yr','2010'),
('srh_until_mon','10'),
('srh_until_day','28'),
('srh_event','9999'),
('srh_min_day','2010-01-01'),
('srh_max_day','2010-10-28'),
('rsSearchRes_pgNo',position),
('rsSearchRes_Count',articlenum),
('repTotal','30'),
('ToPage',position),
('_method','%2Fbursa%2Findex.asp%3F_method%3D_EM__onclientevent%26pcount%3D2%26p0%3DBTNNEXT%26p1%3Donclick'),
('_BTNNEXT_state','_nStyle%3D1%26value%3D%26src%3Dimg%2Fkadima.gif%26alt%3D%25u05DC%25u05D3%25u05E3%2520%25u05D4%25u05D1%25u05D0'),
('_BTNPREV_state','_nStyle%3D1%26value%3D%26src%3Dimg%2Fahora.gif%26alt%3D%25u05DC%25u05D3%25u05E3%2520%25u05D4%25u05E7%25u05D5%25u05D3%25u05DD'),
('_thisPage_state','pb_rsSearchRes%3D0%26pb_rsComByNm%3D0')
])
        # Print HTTP headers while debugging.
        br.set_debug_http(True)
        response = br.open(url, request)  # POST the form data
        raw = response.read()
        # index_to_soup() accepts raw HTML as well as a URL; the original
        # passed the response object itself, which it cannot handle.
        soup2 = self.index_to_soup(raw)
        print 'next results page:', soup2
# texttag = soup2.find('div', attrs={'class':'bodytext'})
# for it in texttag.findAll(style=True):
# del it['style']
# newpos = len(texttag.contents)
# self.append_page(soup2,texttag,newpos)
# texttag.extract()
# appendtag.insert(position,texttag)
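    # Build the feed list: each (title, url) pair below is a canned Maya
    # search, and make_links() turns its results page into article entries.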
def parse_index(self):
feeds = []
for title, url in [
(u"too long",u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=&srh_from=2010-01-01&srh_until=2010-10-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
#(u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
(u"הודעות מאתמול", u"http://maya.tase.co.il/bursa/index.asp?view=yesterday"),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
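    # Scrape a single search-results page: read the total hit count for
    # pagination, then convert every announcement link into an article dict.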
def make_links(self, url):
title = 'Temp'
current_articles = []
soup = self.index_to_soup(url)
print 'url is', url
print 'The soup is: ', soup
        # The table cell with height="19" holds the total number of results;
        # extract the count and hand it to append_page() for pagination.
        stop = soup.find('td', attrs={'height': '19'})
        print 'the stop is', stop
        report = stop.contents[1].contents[0]
        total_articles = int(report)
        print 'total articles:', total_articles
        self.append_page(url, soup, soup.body, total_articles, 1)
        # Every announcement link carries class="A3"; the issuing company's
        # name is the nearest preceding link with id="CompNmHref".
        for item in soup.findAll('a', attrs={'class': 'A3'}):
            itemcomp = item.findPrevious('a', attrs={'id': 'CompNmHref'})
            # Skip javascript: pseudo-links.
            if not re.search('javascript', item['href']):
                temp2 = self.INDEX + 'bursa/' + item['href']
                print 'announcement page is', temp2
                soup1 = self.index_to_soup(temp2)
                # The announcement document itself sits in an iframe.
                for item1 in soup1.findAll('iframe'):
                    txt = item1['src']
                    # If the iframe source looks like an HTML file hosted
                    # under mayafiles, link to it directly; anything else is
                    # assumed to be a PDF and is routed through the
                    # pdf2html.php converter. (Pattern condensed from the
                    # txt2re-generated snippets, identical behaviour.)
                    rg = re.compile(r'.*?(mayafiles)(.).*?htm.*?(htm)', re.IGNORECASE | re.DOTALL)
                    m = rg.search(txt)
                    if m:
                        url = item1['src']
                    else:
                        url = 'http://www.pdfdownload.org/pdf2html/pdf2html.php?url=' + item1['src'] + '&images=yes'
print 'url is: ', url
                    # Article title is "company name - announcement title";
                    # Maya provides no usable description or date here.
                    title = self.tag_to_string(itemcomp) + ' - ' + self.tag_to_string(item)
                    print 'title is: ', title
                    current_articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
return current_articles
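A tip if you want to watch what the recipe is doing: run it straight from the command line with something like ebook-convert maya.recipe maya.epub --test -vv (maya.recipe being whatever name you saved the file under). The --test switch limits the fetch to a couple of articles per feed, and -vv makes calibre's own debug output show up alongside the print statements above.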