Thread: maya recipe
10-28-2010, 05:21 AM   #57
marbs
I recreated the POST request exactly.

It still does not work.

I also do not know how to handle the difference between a request on this page and this page, or how to deal with the dates. I think I will wait until you have the time and energy to lead the way. I have been dreaming in POST and GET on Tamper Data, and it is time to step it down a notch, at least for a day or two. My best guess for the dates (and for the GET/POST difference) is sketched below.
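
Sketch only, untested: the field names are the ones from my Tamper Data capture, and I am assuming the search always starts on Jan 1. The idea is to build the date fields from today's date instead of hardcoding them. As far as I can tell, the only real difference between the two requests is that GET carries the fields in the URL while POST carries the exact same encoded string in the request body.
Code:
import urllib, datetime

# sketch only: build the search dates from today's date instead of hardcoding them
today = datetime.date.today()
start = datetime.date(today.year, 1, 1)   # assuming the search always starts on Jan 1

date_fields = [('srh_from',      start.isoformat()),
               ('srh_from_yr',   str(start.year)),
               ('srh_from_mon',  str(start.month)),
               ('srh_from_day',  str(start.day)),
               ('srh_until',     today.isoformat()),
               ('srh_until_yr',  str(today.year)),
               ('srh_until_mon', str(today.month)),
               ('srh_until_day', str(today.day))]

data = urllib.urlencode(date_fields)

# GET: the encoded fields ride on the URL itself
print 'http://maya.tase.co.il/bursa/index.asp?' + data

# POST: the very same encoded string goes in the request body instead;
# mechanize (and urllib2) switch to POST as soon as you pass a data argument:
# br = mechanize.Browser(); response = br.open('http://maya.tase.co.il/bursa/index.asp', data)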

Here is the code:
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, re
import urllib, mechanize
from calibre import __appname__

class AlisonB(BasicNewsRecipe):
    title      ='Maya'
    __author__ = 'marbs'
    description = 'blah'
    language = 'en'
    no_stylesheets = True
    publisher           = 'marbs'
#    simultaneous_downloads = 1
#    delay                  = 25   
    category            = 'column'
    extra_css='body{direction: rtl;} .article_description{direction: rtl; } a.article{direction: rtl; } .calibre_feed_description{direction: rtl; }'
    use_embedded_content= False
    remove_attributes = ['width','height']
    oldest_article      = 24
    remove_javascript   = True
    remove_empty_feeds  = True
    rec_index = 0
    max_articles_per_feed = 5000
    INDEX = 'http://maya.tase.co.il/'

    def append_page(self, url, soup, appendtag, articles_left, position):
   #     print url, 'the soup is:' ,soup, 'appendtag is',appendtag,  articles_left, position
        articles_left = articles_left - 30   # the search page shows 30 results at a time
        articlenum = articles_left + (30*position)
        print articles_left, articlenum
        br = BasicNewsRecipe.get_browser(self)
        if articles_left > 0:   # more results remain (the old test compared an int to the string '0', which is always true in Python 2)
           print 'do i get this far?'
           # Print HTTP headers.
           br.set_debug_http(True)
           # Form data captured with Tamper Data. The dates are hardcoded here;
           # they could be built with datetime.date.today() instead (see the
           # sketch earlier in this post).
           request = urllib.urlencode([
                       ('view','search'),
                       ('arg_comp',''),
                       ('srh_company_group','3000'),
                       ('srh_company_press',''),
                       ('srh_comp_lb',''),
                       ('srh_free_text_opt','1'),
                       ('cmbHavarottext',''),
                       ('cmbHavarothidden',''),
                       ('srh_txt',''),
                       ('optionFTSearch','1'),
                       ('srh_from','2010-01-01'),
                       ('srh_from_yr','2010'),
                       ('srh_from_mon','1'),
                       ('srh_from_day','1'),
                       ('srh_until','2010-10-28'),
                       ('srh_until_yr','2010'),
                       ('srh_until_mon','10'),
                       ('srh_until_day','28'),
                       ('srh_event','9999'),
                       ('srh_min_day','2010-01-01'),
                       ('srh_max_day','2010-10-28'),
                       ('rsSearchRes_pgNo',position),
                       ('rsSearchRes_Count',articlenum),
                       ('repTotal','30'),
                       ('ToPage',position),
                       # NOTE: the four values below were captured already
                       # percent-encoded; urlencode will encode the '%' signs
                       # again, so they probably need urllib.unquote() first.
                       ('_method','%2Fbursa%2Findex.asp%3F_method%3D_EM__onclientevent%26pcount%3D2%26p0%3DBTNNEXT%26p1%3Donclick'),
                       ('_BTNNEXT_state','_nStyle%3D1%26value%3D%26src%3Dimg%2Fkadima.gif%26alt%3D%25u05DC%25u05D3%25u05E3%2520%25u05D4%25u05D1%25u05D0'),
                       ('_BTNPREV_state','_nStyle%3D1%26value%3D%26src%3Dimg%2Fahora.gif%26alt%3D%25u05DC%25u05D3%25u05E3%2520%25u05D4%25u05E7%25u05D5%25u05D3%25u05DD'),
                       ('_thisPage_state','pb_rsSearchRes%3D0%26pb_rsComByNm%3D0')
                       ])
        #   print 'lalala', request
           response = br.open(url, request)   # passing data makes this a POST
           print response.geturl(), 'this is ok'
           soup2 = BeautifulSoup(response.read())   # index_to_soup() wants a URL, not a response object
           print 'this is my real test', soup2

#           texttag = soup2.find('div', attrs={'class':'bodytext'})
#           for it in texttag.findAll(style=True):
#               del it['style']
#           newpos = len(texttag.contents)
#           self.append_page(soup2,texttag,newpos)
#           texttag.extract()
#           appendtag.insert(position,texttag)

    

    def parse_index(self):
        feeds = []
        for title, url in [
                            (u"too long",u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=&srh_from=2010-01-01&srh_until=2010-10-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
                        #(u"Feed", u"http://maya.tase.co.il/bursa/index.asp?view=search&company_group=3000&arg_comp=&srh_comp_lb=1007&srh_from=2010-01-01&srh_until=2010-09-28&srh_anaf=-1&srh_event=9999&is_urgent=0&srh_company_press="),
                            (u"הודעות מאתמול", u"http://maya.tase.co.il/bursa/index.asp?view=yesterday"),                            
                            
                             ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds
        
    def make_links(self, url):
        title = 'Temp'
        current_articles = []
        soup = self.index_to_soup(url)
        print 'url is', url

        print 'The soup is: ', soup
        # the total result count sits in a <td height="19"> cell
        stop = soup.find('td',attrs={'height':'19'})
        print 'the stop is', stop
        report1 = stop.contents[1].contents
        print report1
        report2 = report1[0]
        print report2
        report3 = int(report2.encode('ascii'))   # total number of search results
        print report3
        self.append_page(url, soup, soup.body, report3, 1)
        for item in soup.findAll('a',attrs={'class':'A3'}):
            print 'item is: ',item
            itemcomp= item.findPrevious('a',attrs={'id':'CompNmHref'})
            #link = item.find('a')
            #titlecheck = self.tag_to_string(link)
            #url_test = re.search('javascript', item['href'])
           
            if not re.search('javascript', item['href']):   # skip the javascript: pseudo-links
              temp2= self.INDEX + 'bursa/' + item['href']
        #      temp2=[temp3]
              print 'url1 is', temp2
              soup1 = self.index_to_soup(temp2)
  #            print 'the new soup is', soup1
              print '6714' 
              for item1 in soup1.findAll('iframe'):   # each report body is embedded in an iframe
                 print 'item1 is:', item1
 #                print soup.item.previous.previousSibling
                 txt = item1['src']
                 print 'FOUND GOOD URL'
                 re1='.*?'	# Non-greedy match on filler
                 re2='(mayafiles)'	# Variable Name 1
                 re3='(.)'	# Any Single Character 1
                 re4='.*?'	# Non-greedy match on filler
                 re5='htm'	# Uninteresting: word
                 re6='.*?'	# Non-greedy match on filler
                 re7='(htm)'	# Word 1
                 
                 rg = re.compile(re1+re2+re3+re4+re5+re6+re7,re.IGNORECASE|re.DOTALL)
                 m = rg.search(txt)
                 if m:
                     var1=m.group(1)
                     c1=m.group(2)
                     word1=m.group(3)
                     print "("+var1+")"+"("+c1+")"+"("+word1+")"+"\n"
                     url = item1['src']
                 else:
                     url = 'http://www.pdfdownload.org/pdf2html/pdf2html.php?url=' + item1['src'] + '&images=yes'

                 print 'url is: ', url

                 title       = self.tag_to_string(itemcomp)+ ' - ' + self.tag_to_string(item)
                 print 'title is: ', title
                 current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
            
           
        return current_articles
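If anyone wants to poke at the form outside of calibre, a bare mechanize session like this shows exactly what the server sends back (trimmed down to two fields just for illustration; the full list is in append_page above):
Code:
import urllib, mechanize

br = mechanize.Browser()
br.set_debug_http(True)                     # dump the raw HTTP headers to the console
data = urllib.urlencode([('view', 'search'),
                         ('ToPage', '2')])  # trimmed-down form data, just for the test
response = br.open('http://maya.tase.co.il/bursa/index.asp', data)
print response.read()[:500]                 # first 500 bytes of whatever comes back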