View Single Post
Old 01-07-2014, 10:07 AM   #1
skyland_rw
Junior Member
skyland_rw began at the beginning.
 
Posts: 1
Karma: 10
Join Date: Jan 2014
Device: kindle kpw2
Need help in parse_index with irregular css

Hi guys, i was confused when i use parse_index to get a section_title and ariticles[] in different soup tag. if using recipe-1 ,there is only section-1 with all 4 aritcles in it. else using recipe-2, the 4 articles was good sectioned, but the parse_index could not catch section_title(output showed "unknow section")

is there any good ideas ,thank u.


Code:
<div class="clearfix bgline">
 <div class="left">
	 <h2>section-1</h2>
	  <dl>
      <dd>
	      <ul class="relnews">
	         <li><a href="" title="ariticle-1">ariticle-1</a></li>
           <li><a href="" title="ariticle-2">ariticle-2</a></li>
      	</ul>
      </dd>
	   </dl>
	 <h2>section-2</h2>
	 <dl>
      <dd>
	      <ul class="relnews">
	       <li><a href="" title="ariticle-3">ariticle-3</a></li>
         <li><a href="" title="ariticle-4">ariticle-4</a></li>
      	</ul>
      </dd>
	</dl>
 </div>
</div>

RECIPE-1
Code:
def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        #self.timefmt = ' [%s]'%ds
        
        leftContent=soup.find('div', attrs = {'class':'clearfix bgline'})
        for moduleleft in leftContent.findAll('div', attrs = {'class':'left'}):  
            section_title = self.tag_to_string(moduleleft.find('h2'))
            articles = []
            for post in moduleleft.findAll('li'):
                a = post.find('a', href=True)
                title = self.tag_to_string(a)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    url = ''+url
                p = post.parent.find('p', attrs={'class':'summary'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
            if articles:
                feeds.append((section_title, articles))
        return feeds

RECIPE-2
Code:
def parse_index(self):
        articles = []
        soup = self.index_to_soup(self.INDEX)
        #self.timefmt = ' [%s]'%ds
        
        leftContent=soup.find('div', attrs = {'class':'left'})
        for moduleleft in leftContent.findAll('dd'):  
            section_title = self.tag_to_string(moduleleft.find('h2'))
            articles = []
            for post in moduleleft.findAll('li'):
                a = post.find('a', href=True)
                title = self.tag_to_string(a)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    url = ''+url
                p = post.parent.find('p', attrs={'class':'summary'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
            if articles:
                feeds.append((section_title, articles))
        return feeds
skyland_rw is offline   Reply With Quote