I've been looking at the AdventureGamers recipe code and I have a few questions.
Spoiler:
Code:
def preprocess_html(self, soup):
mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
soup.head.insert(0,mtag)
What is the reason for inserting the meta tag?
Code:
for item in soup.findAll(style=True):
del item['style']
Why is the above used? It appears to remove every instance of the style attribute, but why is that needed?
Code:
self.append_page(soup, soup.body, 3)
I'm not really clear on this. It appears to me that you are taking the whole soup and appending to the body of the soup at position 3?
Code:
pager = soup.find('div',attrs={'class':'toolbar_fat'})
if pager:
pager.extract()
I looked through the code and didn't see why this extraction is needed, because the navigation appears to be inside toolbar_fat_next.
And here is my painful attempt:
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Calibre news recipe for the How Stuff Works 'AutoStuff' feed.

    Articles may be split over several pages.  ``preprocess_html`` strips
    inline styles and calls ``append_page`` to pull the content of the
    follow-up pages into the first page.
    """

    title = 'How Stuff Works'
    language = 'en'
    __author__ = 'TonytheBookworm'
    description = 'How stuff works'
    publisher = 'Tony'
    category = 'information'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True

    feeds = [
        ('AutoStuff', 'http://feeds.feedburner.com/HowstuffworksAutostuffDailyRssFeed'),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the page linked from *soup*'s pagination and splice it in.

        Looks for ``<div class="pagination">`` in *soup*, follows the first
        link inside it, takes ``<div class="content">`` from the fetched
        page, strips its inline ``style`` attributes, recurses to collect
        any further pages, and finally inserts that div into *appendtag*
        at index *position*.

        Does nothing when there is no pagination div, no usable link in it,
        or no content div on the fetched page.

        :param soup: parsed page whose pagination is inspected.
        :param appendtag: tag that receives the next page's content div.
        :param position: index in *appendtag*'s children for the insert.
        """
        pager = soup.find('div', attrs={'class': 'pagination'})
        if pager is None or pager.a is None:
            return

        # NOTE(review): this follows the *first* link in the pagination div;
        # confirm against the site that it is always the "next page" link,
        # otherwise the recursion below may revisit pages.
        nexturl = pager.a.get('href')
        if not nexturl:
            return

        # index_to_soup is not defined in this class; presumably inherited
        # from BasicNewsRecipe.
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'content'})
        if texttag is None:
            # Follow-up page has a different layout; keep what we have
            # instead of failing the whole article.
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Append any further pages at the end of this page's content
        # before moving the content into the caller's tree.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean one downloaded article page and merge its follow-up pages.

        Inserts language/charset meta markup at the start of ``<head>``,
        removes every inline ``style`` attribute, appends paginated content
        via ``append_page`` and removes the ``toolbar_fat`` div if present.

        :param soup: parsed article page.
        :returns: the same *soup* object, modified in place.
        """
        mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        soup.head.insert(0, mtag)

        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        # NOTE(review): append_page looks for a 'pagination' div, while this
        # removes 'toolbar_fat'; confirm which navigation element this site
        # actually uses.
        pager = soup.find('div', attrs={'class': 'toolbar_fat'})
        if pager:
            pager.extract()
        return soup