Hi guys, i was confused when i use parse_index to get a section_title and ariticles[] in different soup tag. if using recipe-1 ,there is only section-1 with all 4 aritcles in it. else using recipe-2, the 4 articles was good sectioned, but the parse_index could not catch section_title(output showed "unknow section")
is there any good ideas ,thank u.
Code:
<div class="clearfix bgline">
<div class="left">
<h2>section-1</h2>
<dl>
<dd>
<ul class="relnews">
<li><a href="" title="ariticle-1">ariticle-1</a></li>
<li><a href="" title="ariticle-2">ariticle-2</a></li>
</ul>
</dd>
</dl>
<h2>section-2</h2>
<dl>
<dd>
<ul class="relnews">
<li><a href="" title="ariticle-3">ariticle-3</a></li>
<li><a href="" title="ariticle-4">ariticle-4</a></li>
</ul>
</dd>
</dl>
</div>
</div>
RECIPE-1
Code:
def parse_index(self):
articles = []
soup = self.index_to_soup(self.INDEX)
#self.timefmt = ' [%s]'%ds
leftContent=soup.find('div', attrs = {'class':'clearfix bgline'})
for moduleleft in leftContent.findAll('div', attrs = {'class':'left'}):
section_title = self.tag_to_string(moduleleft.find('h2'))
articles = []
for post in moduleleft.findAll('li'):
a = post.find('a', href=True)
title = self.tag_to_string(a)
if title in seen_titles:
continue
seen_titles.add(title)
url = a['href']
if url.startswith('/'):
url = ''+url
p = post.parent.find('p', attrs={'class':'summary'})
desc = None
self.log('\tFound article:', title, 'at', url)
if p is not None:
desc = self.tag_to_string(p)
self.log('\t\t', desc)
articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
if articles:
feeds.append((section_title, articles))
return feeds
RECIPE-2
Code:
def parse_index(self):
articles = []
soup = self.index_to_soup(self.INDEX)
#self.timefmt = ' [%s]'%ds
leftContent=soup.find('div', attrs = {'class':'left'})
for moduleleft in leftContent.findAll('dd'):
section_title = self.tag_to_string(moduleleft.find('h2'))
articles = []
for post in moduleleft.findAll('li'):
a = post.find('a', href=True)
title = self.tag_to_string(a)
if title in seen_titles:
continue
seen_titles.add(title)
url = a['href']
if url.startswith('/'):
url = ''+url
p = post.parent.find('p', attrs={'class':'summary'})
desc = None
self.log('\tFound article:', title, 'at', url)
if p is not None:
desc = self.tag_to_string(p)
self.log('\t\t', desc)
articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
if articles:
feeds.append((section_title, articles))
return feeds