Okay I'm finally getting back to fixing this (had to build a new tower, one of my notebooks started to constantly crash on me). I'm trying to have it pull multipage articles by searching and using the pgno=# area (just part of a long article page)
Code:
<div class="article-pagination">
<strong>
<a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2">
Page 2: BlackBerry Isn't Immune </a>
</strong>
<img hspace="0" height="5" border="0" width="10" vspace="0" src="http://i.cmpnet.com/infoweek/spacer.gif">
<br/>
<div class="controls">
<strong> 1 | </strong><a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2">2</a> |<a class="contentgating_article" href="/security/privacy/nsa-vs-your-smartphone-5-facts/240161133?pgno=2"> Next Page »</a> </div>
</div>
</article>
this is what I've come up with but it still only gets page one of multi-page articles.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed
class InformationWeek(BasicNewsRecipe):
title = u'InformationWeek'
oldest_article = 3
max_articles_per_feed = 150
auto_cleanup = True
ignore_duplicate_articles = {'title', 'url'}
remove_empty_feeds = True
remove_javascript = True
use_embedded_content = False
recursions = 1
match_regexps = [r'\?pgno=\d+$']
preprocess_regexps = [
(re.compile(r'<!-- End SiteCatalyst code version: H.16 -->.*</body>', re.DOTALL), lambda match: '</body>')
]
feeds = [
(u'InformationWeek - Stories', u'http://www.informationweek.com/rss/pheedo/all_story_blog.xml?cid=RSSfeed_IWK_ALL'),
(u'InformationWeek - News', u'http://www.informationweek.com/rss/pheedo/news.xml?cid=RSSfeed_IWK_News'),
(u'InformationWeek - Personal Tech', u'http://www.informationweek.com/rss/pheedo/personaltech.xml?cid=RSSfeed_IWK_Personal_Tech'),
(u'InformationWeek - Software', u'http://www.informationweek.com/rss/pheedo/software.xml?cid=RSSfeed_IWK_Software'),
(u'InforamtionWeek - Hardware', u'http://www.informationweek.com/rss/pheedo/hardware.xml?cid=RSSfeed_IWK_Hardware')
]
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
print 'article.title is: ', article.title
if 'healthcare' in article.title or 'healthcare' in article.url:
feed.articles.remove(article)
return feeds
def append_page(self, soup, appendtag, position):
pager = soup.find('div', attrs={'class':'article-pagination'})
if pager:
nextpage = soup.find('a', attrs={'class':'contentgating_article'})
if nextpage:
nexturl = nextpage['href']
soup2 = self.index_to_soup(nexturl)
texttag = soup2.find('div', attrs={'class':'article-v2'})
for it in texttag.findAll(style=True):
del it['style']
newpos = len(texttag.contents)
self.append_page(soup2,texttag,newpos)
texttag.extract()
pager.extract()
appendtag.insert(position,texttag)
remove_tags_before = dict(name='article', id=lambda x:not x)
Can someone help me fix this up, thanks.