Still having trouble getting page 2+ of articles from InformationWeek.
Here is the recipe I'm trying (with two different ways I can think of to get more than one page):
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import Feed
class InformationWeek(BasicNewsRecipe):
    """Calibre recipe for InformationWeek.

    Downloads the three RSS feeds below, drops any article mentioning
    'healthcare' in its title or URL, and stitches multi-page articles
    into a single document by recursively following the pager links.
    """

    title = u'InformationWeek'
    oldest_article = 3
    max_articles_per_feed = 150
    auto_cleanup = True
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    remove_javascript = True
    use_embedded_content = False
    recursions = 0

    feeds = [
        (u'InformationWeek - Stories', u'http://www.informationweek.com/rss_simple.asp'),
        (u'InformationWeek - Software', u'http://www.informationweek.com/rss_simple.asp?f_n=476&f_ln=Software'),
        (u'InformationWeek - Mobile', u'http://www.informationweek.com/rss_simple.asp?f_n=457&f_ln=Mobile'),
    ]

    def parse_feeds(self):
        """Filter the parsed feeds, removing healthcare-related articles."""
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy: removing from the list while iterating
            # it directly would skip entries.
            for article in feed.articles[:]:
                # self.log instead of a Python-2 print statement (which is a
                # syntax error on Python 3 and bypasses calibre's logging).
                self.log('article.title is: ', article.title)
                if 'healthcare' in article.title or 'healthcare' in article.url:
                    feed.articles.remove(article)
        return feeds

    def append_page(self, soup, appendtag, position, surl):
        """Recursively fetch follow-on pages and splice their content.

        soup      -- parsed page to inspect for a pager
        appendtag -- tag into which fetched content is inserted
        position  -- child index within appendtag for the insertion
        surl      -- URL of the page already fetched (loop guard)
        """
        pager = soup.find('div', attrs={'class': 'pages'})
        if pager is None:
            return
        nextpages = soup.findAll('a', attrs={'class': 'a1'})
        # Guard: a single-page article may have fewer than two pager links;
        # the unguarded nextpages[1] would raise IndexError.
        if len(nextpages) < 2:
            return
        nextpage = nextpages[1]
        # .get() avoids a KeyError on an anchor without an href attribute.
        nexturl = nextpage.get('href')
        if not nexturl or nexturl == surl:
            return
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'content_left_5'})
        if texttag is None:
            # NOTE(review): the selector may be stale for the current site
            # markup -- log and bail rather than crash on AttributeError.
            self.log('No content container found on %s' % nexturl)
            return
        # Strip inline styles so calibre's own styling applies.
        for it in texttag.findAll(style=True):
            del it['style']
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos, nexturl)
        texttag.extract()
        pager.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge all pages of the article, then drop the pager widget."""
        self.append_page(soup, soup.body, 3, '')
        pager = soup.find('div', attrs={'id': 'pages'})
        if pager:
            pager.extract()
        return self.adeify_images(soup)
Here is a link to an article with more than one page:
http://www.informationweek.com/softw...d/d-id/1141628
Below is the text from the next-page area of the first page.
What am I doing wrong in fetching the next page (and additional pages, if there are more than two)?