MobileRead Forums - View Single Post - Custom recipes (archive, read-only)

bmsleight · 08-30-2010, 05:56 PM

I tried to put a recipe together for Instructables.

So far so good, but I cannot tidy up the final output nicely (some ads and some bad CSS remain).

Spoiler:

Code:

#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2010, Brendan Sleight <bms.calibre at barwap.com>'
'''
www.instructables.com
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

import string

class Instructables(BasicNewsRecipe):
    '''
    Recipe for the "featured" RSS feed of www.instructables.com.

    An article is spread over several "step" pages on the site.  The recipe
    follows the step links found on a page, grafts the <body> of each step
    page into the first page, and finally keeps only the
    ``div.stepdescription`` elements of the combined document.
    '''

    title                 = u'Instructables'
    __author__            = 'Bmsleight'
    description           = 'Make, How To, and DIY'
    oldest_article        = 100
    max_articles_per_feed = 5
    no_stylesheets        = True
    language = 'en'
    # Site root; prefixed to the relative step links in append_page().
    index                 = 'http://www.instructables.com'
    remove_tags = [
                    dict(name='div', attrs={'class':'remove-ads'})
                  ]
    extra_css      = '''
                        .steplabel{font-size:xx-small;}
                        .txt{font-size:xx-small;}
                        #txt{font-size:xx-small;}
                     '''

    feeds               = [
                         (u'Instructables Featured'        , u'http://www.instructables.com/tag/type-id/featured-true/rss.xml'                                      )
                         ]


    def append_page(self, soup, appendtag, position, pre=None):
        '''
        Fetch every step page linked from ``soup`` and insert it into
        ``appendtag``.

        soup      -- parsed page whose <a href> links are scanned
        appendtag -- tag that receives the <body> of each fetched page
        position  -- index handed to ``appendtag.insert``
        pre       -- list of hrefs already fetched; it is passed down the
                     recursion so a step is never fetched twice.  A fresh
                     list is created when the argument is omitted.
        '''
        # Do not use a mutable default here: a ``pre=[]`` default is a single
        # list shared by every call that omits the argument, so it would keep
        # growing and leak visited links from one article into the next.
        if pre is None:
            pre = []
        for link in soup.findAll('a', href=True):
            href = str(link['href'])
            is_step = "/step" in href and "id" in href and "/account/" not in href
            if not is_step or href in pre:
                continue
            pre.append(href)
            # Only site-relative links need the site root in front of them;
            # prefixing an absolute URL would produce an invalid address.
            if href.startswith(('http://', 'https://')):
                nurl = href
            else:
                nurl = self.index + href
            soup2 = self.index_to_soup(nurl)
            texttag = soup2.find('body')
            if texttag is None:
                # Nothing to graft from a page without a <body>.
                continue
            newpos = len(texttag.contents)
            # Pull in the steps linked from the fetched page first, so they
            # are already inside ``texttag`` when it is moved below.
            self.append_page(soup2, texttag, newpos, pre)
            texttag.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        '''Merge the linked step pages into the article body; return ``soup``.'''
        # NOTE(review): 99 is presumably meant as "past the last child", i.e.
        # append at the end -- confirm against BeautifulSoup's insert().
        self.append_page(soup, soup.body, 99)
        return soup

    def postprocess_html(self, soup, first_fetch):
        '''
        Reduce the document to its ``div.stepdescription`` elements.

        Returns a new soup built from those divs with bare <h2>/</h2> tags
        stripped, or ``soup`` unchanged when no such div is present.
        '''
        steps = soup.findAll('div', attrs={'class':'stepdescription'})
        if not steps:
            # Keep the page as it is rather than replacing it with nothing.
            return soup
        # Serialise each div on its own: str() of the whole result list would
        # also emit the list's "[", ", " and "]" into the markup.
        markup = ''.join(str(step) for step in steps)
        # Remove bad nested H2s (only attribute-less tags are matched).
        markup = markup.replace("<h2>", "").replace("</h2>", "")
        return BeautifulSoup(markup)

Any pointers to where I am going wrong?