I tried to put a recipe together for Instructables.
So far so good, but I cannot tidy up the final output nicely (some ads and some bad CSS remain).
Spoiler:
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, Brendan Sleight <bms.calibre at barwap.com>'
'''
www.instructables.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import string
class Instructables(BasicNewsRecipe):
    """Calibre news recipe for the featured projects on www.instructables.com.

    Each article is spread over several "step" pages.  ``preprocess_html``
    follows the step links and stitches the extra pages into the first one;
    ``postprocess_html`` then reduces the result to the step descriptions.
    """

    title = u'Instructables'
    __author__ = 'Bmsleight'
    description = 'Make, How To, and DIY'
    oldest_article = 100
    max_articles_per_feed = 5
    no_stylesheets = True
    language = 'en'
    # Site root; hrefs of step links are appended to this to build the URL.
    index = 'http://www.instructables.com'
    remove_tags = [
        dict(name='div', attrs={'class': 'remove-ads'}),
    ]
    extra_css = (
        '\n'
        '.steplabel{font-size:xx-small;}\n'
        '.txt{font-size:xx-small;}\n'
        '#txt{font-size:xx-small;}\n'
    )
    feeds = [
        (u'Instructables Featured',
         u'http://www.instructables.com/tag/type-id/featured-true/rss.xml'),
    ]

    def append_page(self, soup, appendtag, position, pre=None):
        """Fetch every step page linked from *soup* and insert it into *appendtag*.

        soup      -- parsed page whose ``<a href>`` links are scanned.
        appendtag -- tag that receives the ``<body>`` of each fetched page.
        position  -- index in ``appendtag`` at which each body is inserted.
        pre       -- list of hrefs already followed; it is extended in place
                     and passed down the recursion so no page is fetched twice.
                     A fresh list is created when omitted.
        """
        # A ``pre=[]`` default would be created once and shared by every
        # article (and every download thread), so build the list per call.
        if pre is None:
            pre = []

        for link in soup.findAll('a', href=True):
            href = link['href']
            if href in pre:
                continue
            # Only follow links that look like step pages of an article.
            if '/step' not in href or 'id' not in href or '/account/' in href:
                continue

            # Record the link before fetching so the recursive call below
            # does not come back to it.
            pre.append(href)
            step_soup = self.index_to_soup(self.index + href)
            step_body = step_soup.find('body')
            if step_body is None:
                # Nothing usable came back for this step; skip it.
                continue

            # Pull in the pages linked from this step first, then move the
            # completed body into the parent document.
            self.append_page(step_soup, step_body, len(step_body.contents), pre)
            step_body.extract()
            appendtag.insert(position, step_body)

    def preprocess_html(self, soup):
        """Merge all step pages of the article into the first page's body."""
        self.append_page(soup, soup.body, 99)
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Return a document holding only the ``stepdescription`` divs.

        If the page has no such div, *soup* is returned unchanged instead of
        an empty document.
        """
        steps = soup.findAll('div', attrs={'class': 'stepdescription'})
        if not steps:
            return soup

        # Serialise each div on its own: str() of the whole result list would
        # also emit the list's "[", ", " and "]" into the article text.
        markup = ''.join(str(step) for step in steps)
        # Remove bad nested H2s.
        markup = markup.replace('<h2>', '').replace('</h2>', '')
        return BeautifulSoup(markup)
Any pointers to where I am going wrong?