Since this is the first time I'm dealing with multi-page articles, I figured I'd post my changes and see whether you think I've got it right. I'm already comfortable with feeds that have single-page articles, but these multi-page ones are new to me.
So I have it finding the <p> with align="right" on the web page. Unfortunately, tonight's download consisted entirely of one-page documents, so I can't tell whether it worked or not. Thanks for your help with this recipe.
Code:
#another attempt at pulling more than one page
def append_page(self, soup, appendtag, position, surl):
    """Fetch the next page of a multi-page article and splice it into the current one.

    The pager and the "next page" link are both located as right-aligned
    ``<p>`` elements: the first match is treated as the pager to remove and
    the second as the element carrying the next-page URL.  The method calls
    itself on the fetched page so that any further pages are chained in
    before the fetched content is inserted.

    Parameters:
        soup      -- parsed document of the page currently being processed.
        appendtag -- tag that receives the content of the following page.
        position  -- index inside ``appendtag`` at which to insert it.
        surl      -- URL of the page ``soup`` was loaded from ('' for the
                     first page); used to avoid re-fetching the same page.

    Returns None.  The method is a no-op when no usable next-page link is
    found, which is the normal situation for single-page articles.
    """
    pagers = soup.findAll('p', attrs={'align': 'right'})
    if len(pagers) < 2:
        # Zero or one right-aligned paragraph: there is no second element to
        # read a next-page link from.  Previously this raised IndexError.
        return
    pager, nextpage = pagers[0], pagers[1]

    # A <p> normally has no href of its own, so indexing it raised KeyError.
    # Accept an href on the tag itself, otherwise use the first link inside.
    nexturl = nextpage.get('href')
    if not nexturl:
        link = nextpage.find('a', href=True)
        nexturl = link['href'] if link else None
    if not nexturl or nexturl == surl:
        # No link at all, or the link points back at the page we are on.
        return

    soup2 = self.index_to_soup(nexturl)
    # NOTE(review): this selects the same right-aligned <p> that is used as
    # the pager above, not an obvious article-body container -- confirm the
    # correct selector against the site's markup.
    texttag = soup2.find('p', attrs={'align': 'right'})
    if not texttag:
        # Fetched page lacks the expected element; leave this page untouched.
        return

    # Drop inline styles from the imported content.
    for it in texttag.findAll(style=True):
        del it['style']

    # Chain any further pages onto the end of the fetched content first.
    newpos = len(texttag.contents)
    self.append_page(soup2, texttag, newpos, nexturl)

    texttag.extract()
    pager.extract()
    appendtag.insert(position, texttag)
def preprocess_html(self, soup):
    """Merge follow-up pages into ``soup`` and strip the page navigation.

    Calls ``self.append_page`` with the document body as the insertion
    target (index 3, empty starting URL), removes the ``<div id="pages">``
    element if one is present, and returns whatever
    ``self.adeify_images(soup)`` returns.
    """
    insert_at = 3
    self.append_page(soup, soup.body, insert_at, '')

    # Remove the leftover page-navigation block, when the page has one.
    pages_div = soup.find('div', attrs={'id': 'pages'})
    if pages_div:
        pages_div.extract()

    cleaned = self.adeify_images(soup)
    return cleaned