MobileRead Forums - View Single Post

Camper65 · 04-01-2014, 11:30 PM

Still having trouble getting page 2+ of articles from InformationWeek.

Here is the recipe I'm trying (with two different ways I think of getting more than one page)

Spoiler:

Code:

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed

class InformationWeek(BasicNewsRecipe):
    title          = u'InformationWeek'
    oldest_article = 3
    max_articles_per_feed = 150
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    remove_javascript = True
    use_embedded_content   = False
    recursions = 0

    feeds          = [
                          (u'InformationWeek - Stories', u'http://www.informationweek.com/rss_simple.asp'),
                          (u'InformationWeek - Software', u'http://www.informationweek.com/rss_simple.asp?f_n=476&f_ln=Software'),
                          (u'InformationWeek - Mobile', u'http://www.informationweek.com/rss_simple.asp?f_n=457&f_ln=Mobile')
                     ]

    def parse_feeds (self): 
      feeds = BasicNewsRecipe.parse_feeds(self) 
      for feed in feeds:
        for article in feed.articles[:]:
          print 'article.title is: ', article.title
          if 'healthcare' in article.title or 'healthcare' in article.url:
            feed.articles.remove(article)
      return feeds

#    def is_link_wanted(self, url, tag):
#            ans = re.match(r'href://.*/[2-9]/', url) is not None
#            if ans:
#                self.log('Following multipage link: %s'%url)
#            return ans
    
#    def postprocess_html(self, soup, first_fetch):
#            for pag in soup.findAll(True, 'pagination'):
#                pag.extract()
#            if not first_fetch:
#                h1 = soup.find('h1')
#                if h1 is not None:
#                    h1.extract()
#            return soup

#another attempt at pulling more than one page

    def append_page(self, soup, appendtag, position, surl):
        pager = soup.find('div', attrs={'class':'pages'})
        if pager:
          nextpages = soup.findAll('a', attrs={'class':'a1'})
          nextpage = nextpages[1]
          if nextpage and (nextpage['href'] != surl):
              nexturl = nextpage['href']
              soup2 = self.index_to_soup(nexturl)
              texttag = soup2.find('div', attrs={'class':'content_left_5'})
              for it in texttag.findAll(style=True):
                  del it['style']
              newpos = len(texttag.contents)
              self.append_page(soup2,texttag,newpos,nexturl)
              texttag.extract()
              pager.extract()
              appendtag.insert(position,texttag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3, '')
        pager = soup.find('div', attrs={'id':'pages'})
        if pager:
          pager.extract()
        return self.adeify_images(soup)

here is a link to an article with more than one page
http://www.informationweek.com/softw...d/d-id/1141628

and the text for the next page area of the first page

Spoiler:

what am I doing wrong to get the next page (and more if more than 2 pages)?