View Single Post
Old 08-30-2010, 05:56 PM   #2568
bmsleight
Member
bmsleight will become famous soon enoughbmsleight will become famous soon enoughbmsleight will become famous soon enoughbmsleight will become famous soon enoughbmsleight will become famous soon enoughbmsleight will become famous soon enough
 
Posts: 24
Karma: 540
Join Date: Aug 2010
Device: Kindle 3
instructables

Tried to put a recipe together for instructables.

So far so good, but I cannot tidy up the final output nicely. (Some ads and some bad CSS remain.)

Spoiler:
Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2010, Brendan Sleight <bms.calibre at barwap.com>'
'''
www.instructables.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

import string

class Instructables(BasicNewsRecipe):
    """Calibre news recipe for the featured projects feed of instructables.com.

    Each article is spread over several "step" pages.  ``preprocess_html``
    follows the step links found on a page and appends the body of every step
    page to the article; ``postprocess_html`` then keeps only the
    ``stepdescription`` blocks of the merged document.
    """

    title                 = u'Instructables'
    __author__            = 'Bmsleight'
    description           = 'Make, How To, and DIY'
    oldest_article        = 100
    max_articles_per_feed = 5
    no_stylesheets        = True
    language = 'en'
    # Site root; step links are concatenated onto it in append_page.
    index                 = 'http://www.instructables.com'
    remove_tags = [
                    dict(name='div', attrs={'class':'remove-ads'})
                  ]
    extra_css      = '''
                        .steplabel{font-size:xx-small;}
                        .txt{font-size:xx-small;}
                        #txt{font-size:xx-small;}
                     '''

    feeds               = [
                         (u'Instructables Featured'        , u'http://www.instructables.com/tag/type-id/featured-true/rss.xml'                                      )
                         ]


    def append_page(self, soup, appendtag, position, pre=None):
        """Fetch every step page linked from ``soup`` and graft it in.

        :param soup: parsed page whose ``<a href>`` links are scanned.
        :param appendtag: tag that receives the fetched ``<body>`` elements.
        :param position: index passed to ``appendtag.insert``.
        :param pre: list of hrefs already followed.  Callers normally omit it;
                    it is threaded through the recursion so a step page is
                    fetched only once per article.
        """
        # A fresh list per top-level call: a mutable default would be shared
        # by every article (and by every download thread).
        if pre is None:
            pre = []
        for link in soup.findAll('a', href=True):
            href = str(link['href'])
            # Only follow links that look like step pages ("/step" and "id"
            # in the href), skip account links and anything already visited.
            if ("/step" not in href or "id" not in href
                    or "/account/" in href or href in pre):
                continue
            pre.append(href)
            # NOTE(review): assumes step hrefs are site-relative -- confirm.
            nurl = self.index + link['href']
            soup2 = self.index_to_soup(nurl)
            texttag = soup2.find('body')
            if texttag is None:
                # Nothing usable came back for this step; skip it rather
                # than crash the whole article.
                continue
            newpos = len(texttag.contents)
            # Pull in the steps linked from the step page before moving its
            # body into the parent document.
            self.append_page(soup2, texttag, newpos, pre)
            texttag.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Merge all step pages of the article into ``soup`` and return it."""
        self.append_page(soup, soup.body, 99)
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Return a new soup containing only the ``stepdescription`` divs.

        :param soup: the downloaded (and merged) article.
        :param first_fetch: unused here.
        """
        steps = soup.findAll('div', attrs={'class':'stepdescription'})
        # Serialize each tag on its own: str() of the whole result list would
        # leak the list's "[", ", " and "]" into the generated HTML.
        markup = ''.join([str(step) for step in steps])
        # Remove bad nested H2s (the tags only, their text is kept).
        markup = markup.replace("<h2>", "").replace("</h2>", "")
        return BeautifulSoup(markup)


Any pointers to where I am going wrong?
bmsleight is offline