View Single Post
Old 06-18-2010, 06:03 AM   #2133
rty
Zealot
rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.rty got an A in P-Chem.
 
Posts: 108
Karma: 6066
Join Date: Apr 2010
Location: Singapore
Device: iPad Air, Kindle DXG, Kindle Paperwhite
Can anybody help why the Multipage part doesn't work on following recipe:

Spoiler:
Code:
 
class AdvancedUserRecipe1275708473(BasicNewsRecipe):
    title          = u'My Psychology Today'
    # oldest_article = 7
    max_articles_per_feed = 100
    remove_javascript = True
    use_embedded_content   = False
    no_stylesheets = True
    language = 'en'

    keep_only_tags = [dict(name='div', attrs={'id':['contentColumn','content-content']})]
    remove_tags = [
                    dict(name='div', attrs={'id':'advertisement advertisement-zone-51'}),
                    dict(name='div', attrs={'id':'block-td_search_160'}),
                    dict(name='div', attrs={'id':'block-cam_search_160'}),
                    dict(name='div', attrs={'class':'article-sub-meta'}),
   	dict(name='div', attrs={'class':'article-terms meta'}),
                         ] 
    # remove_tags_after  = dict(id=['rightColumn'])
    feeds          = [(u'Contents', u'http://www.psychologytoday.com/articles/index.rss')]

   
    def append_page(self, soup, appendtag, position):
        pager = soup.find('div',attrs={'class':'pager-next'})
        if pager:
           nexturl = self.INDEX + pager.a['href']
           soup2 = self.index_to_soup(nexturl)
           texttag = soup2.find('div', attrs={'id':'contentColumn'})
           for it in texttag.findAll(style=True):
               del it['style']
           newpos = len(texttag.contents)          
           self.append_page(soup2,texttag,newpos)
           texttag.extract()
           appendtag.insert(position,texttag)

    def postprocess_html(self, soup, first):
               for tag in soup.findAll(name=['ul', 'li']):
                    tag.name = 'div'
               return soup


Thank you in advance.

Last edited by rty; 06-19-2010 at 08:48 AM.
rty is offline