View Single Post
Old 09-12-2013, 11:35 PM   #7
Camper65
Enthusiast
Camper65 began at the beginning.
 
Posts: 32
Karma: 10
Join Date: Apr 2011
Device: Kindle wifi; Dell 2in1
Okay I'm finally getting back to fixing this (had to build a new tower, one of my notebooks started to constantly crash on me). I'm trying to have it pull multipage articles by searching and using the pgno=# area (just part of a long article page)

Code:
<div class="article-pagination">
<strong>
	<a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2">
		Page 2:&nbsp;BlackBerry Isn't Immune	</a>
</strong>
<img hspace="0" height="5" border="0" width="10" vspace="0" src="http://i.cmpnet.com/infoweek/spacer.gif">
<br/>
<div class="controls">
	<strong>&nbsp;1&nbsp;|&nbsp;</strong><a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2">2</a>&nbsp;&nbsp;|<a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2"> Next Page »</a>&nbsp;</div>
</div>
</article>
this is what I've come up with but it still only gets page one of multi-page articles.

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed

class InformationWeek(BasicNewsRecipe):
    title          = u'InformationWeek'
    oldest_article = 3
    max_articles_per_feed = 150
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    remove_javascript = True
    use_embedded_content   = False
    recursions = 1
    match_regexps = [r'\?pgno=\d+$']

    preprocess_regexps = [
        (re.compile(r'<!-- End SiteCatalyst code version: H.16 -->.*</body>', re.DOTALL), lambda match: '</body>')
    ]

    feeds          = [
                          (u'InformationWeek - Stories', u'http://www.informationweek.com/rss/pheedo/all_story_blog.xml?cid=RSSfeed_IWK_ALL'),
                          (u'InformationWeek - News', u'http://www.informationweek.com/rss/pheedo/news.xml?cid=RSSfeed_IWK_News'),
                          (u'InformationWeek - Personal Tech', u'http://www.informationweek.com/rss/pheedo/personaltech.xml?cid=RSSfeed_IWK_Personal_Tech'),
                          (u'InformationWeek - Software', u'http://www.informationweek.com/rss/pheedo/software.xml?cid=RSSfeed_IWK_Software'),
              (u'InforamtionWeek - Hardware', u'http://www.informationweek.com/rss/pheedo/hardware.xml?cid=RSSfeed_IWK_Hardware')
                     ]

    def parse_feeds (self): 
      feeds = BasicNewsRecipe.parse_feeds(self) 
      for feed in feeds:
        for article in feed.articles[:]:
          print 'article.title is: ', article.title
          if 'healthcare' in article.title or 'healthcare' in article.url:
            feed.articles.remove(article)
      return feeds

    def append_page(self, soup, appendtag, position):
        pager = soup.find('div', attrs={'class':'article-pagination'})
        if pager:
          nextpage = soup.find('a', attrs={'class':'contentgating_article'})
          if nextpage:
              nexturl = nextpage['href']
              soup2 = self.index_to_soup(nexturl)
              texttag = soup2.find('div', attrs={'class':'article-v2'})
              for it in texttag.findAll(style=True):
                  del it['style']
              newpos = len(texttag.contents)
              self.append_page(soup2,texttag,newpos)
              texttag.extract()
              pager.extract()
              appendtag.insert(position,texttag)
			  
			  
			  
	remove_tags_before = dict(name='article', id=lambda x:not x)
Can someone help me fix this up, thanks.
Camper65 is offline   Reply With Quote