View Single Post
Old 04-01-2014, 11:30 PM   #4
Camper65
Enthusiast
Camper65 began at the beginning.
 
Posts: 32
Karma: 10
Join Date: Apr 2011
Device: Kindle wifi; Dell 2in1
Still trying to fix this recipe

Still having trouble getting page 2+ of articles from InformationWeek.

Here is the recipe I'm trying (including two different approaches I've attempted for fetching more than one page):

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed

class InformationWeek(BasicNewsRecipe):
    """Fetch recent InformationWeek articles, drop healthcare stories,
    and stitch multi-page articles into a single document."""

    title          = u'InformationWeek'
    oldest_article = 3
    max_articles_per_feed = 150
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    remove_javascript = True
    use_embedded_content   = False
    recursions = 0

    feeds          = [
                          (u'InformationWeek - Stories', u'http://www.informationweek.com/rss_simple.asp'),
                          (u'InformationWeek - Software', u'http://www.informationweek.com/rss_simple.asp?f_n=476&f_ln=Software'),
                          (u'InformationWeek - Mobile', u'http://www.informationweek.com/rss_simple.asp?f_n=457&f_ln=Mobile')
                     ]

    def parse_feeds(self):
        """Remove healthcare-related articles from every feed.

        Returns the Feed list produced by the base class, minus any
        article whose title or URL mentions 'healthcare'.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy so removing from feed.articles is safe.
            for article in feed.articles[:]:
                # Was a Python 2 `print` statement, which is a syntax error
                # under Python 3 calibre; self.log is the recipe idiom.
                self.log('article.title is:', article.title)
                # Case-insensitive test: titles are typically capitalized
                # ('Healthcare'), which the original exact-case test missed.
                if ('healthcare' in article.title.lower() or
                        'healthcare' in article.url.lower()):
                    feed.articles.remove(article)
        return feeds

    def append_page(self, soup, appendtag, position, surl):
        """Recursively fetch the next page of the article in *soup* and
        splice its content into *appendtag* at index *position*.

        *surl* is the URL of the page already fetched; recursion stops
        when the 'next' link points back at it.
        """
        pager = soup.find('div', attrs={'class': 'pages'})
        if pager is None:
            return
        nextpages = soup.findAll('a', attrs={'class': 'a1'})
        # The original indexed nextpages[1] unconditionally, raising
        # IndexError whenever fewer than two 'a1' links were present.
        if len(nextpages) < 2:
            return
        nextpage = nextpages[1]
        nexturl = nextpage.get('href')
        if not nexturl or nexturl == surl:
            return
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'content_left_5'})
        if texttag is None:
            # Follow-up page lacks the expected content container;
            # the original would have crashed on texttag.findAll here.
            return
        for it in texttag.findAll(style=True):
            del it['style']
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos, nexturl)
        texttag.extract()
        pager.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Pull in continuation pages, strip the pager, and fix images."""
        self.append_page(soup, soup.body, 3, '')
        # NOTE(review): append_page matches class='pages' but this looks
        # for id='pages' -- confirm which attribute the site really uses;
        # the pasted page HTML shows neither, so the pager div that
        # append_page requires may never be found at all.
        pager = soup.find('div', attrs={'id': 'pages'})
        if pager:
            pager.extract()
        return self.adeify_images(soup)


here is a link to an article with more than one page
http://www.informationweek.com/softw...d/d-id/1141628

and here is the HTML for the next-page navigation area of the first page:

Spoiler:
<div class="divsplitter" style="height: 1.25em;"></div><div style="height: 1.666em;"><div style="float: right;"><span class="smaller blue"><img src="http://img.deusm.com/informationweek/slideshow-arrow-gray-left.png" alt="Previous" style="width: 1.666em; height: 1.666em; border: 0; float: left; margin-right: 0.666em;" /><div style="float: left; height: 1.416666em; padding-top: .25em;">1 of 2</div><a href="http://www.informationweek.com/software/productivity-collaboration-apps/6-new-google-apps-tips-and-tricks/d/d-id/1141627?page_number=2" title="Next" ><img src="http://img.deusm.com/informationweek/slideshow-arrow-black-right.png" alt="Next" style="width: 1.666em; height: 1.666em; border: 0; float: right; margin-left: 0.666em;" /></a></span></div></div>


What am I doing wrong in fetching the next page (and any further pages when an article has more than 2)?
Camper65 is offline   Reply With Quote