MobileRead Forums - View Single Post - Custom recipes (archive, read-only)

Starson17 · 08-11-2010, 05:27 PM

Spoiler:

Code:

from calibre.web.feeds.news import BasicNewsRecipe
import re

class BigOven(BasicNewsRecipe):
    """Calibre news recipe that downloads rated recipes from BigOven.

    Articles come from the single FeedBurner feed listed in ``feeds``.
    ``get_browser`` logs in through the site's login form when a username
    and password are available, and ``preprocess_html`` strips links and
    containers that are not part of the recipe text.
    """

    title = 'BigOven'
    __author__ = 'Starson17'
    description = 'Recipes for the Foodie in us all. Registration is free. A fake username and password just gives smaller photos.'
    language = 'en'
    category = 'news, food, recipes, gourmet'
    publisher = 'Starson17'
    use_embedded_content = False
    no_stylesheets = True
    oldest_article = 24
    remove_javascript = True
    remove_empty_feeds = True
    cover_url = 'http://www.software.com/images/products/BigOven%20Logo_177_216.JPG'
    max_articles_per_feed = 30
    # The site wants a login; per the description above, made-up credentials
    # still work but yield smaller photos.
    needs_subscription = True

    # Metadata handed to the conversion step, reusing the values above.
    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    def get_browser(self):
        """Return the browser used for downloads, logged in when possible.

        The login form is submitted only when both ``self.username`` and
        ``self.password`` are set; otherwise the browser is returned as-is.
        """
        # The base method is called through the class, so ``self`` has to be
        # passed explicitly; current calibre defines it as an instance method.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.bigoven.com/account/login?ReturnUrl=/')
            # The form is picked by position, not by name -- presumably the
            # login form is the second one on the page; verify if login breaks.
            br.select_form(nr=1)
            br['Email'] = self.username
            br['Password'] = self.password
            br.submit()
        return br

    remove_attributes = ['style', 'font']

    remove_tags = [
        dict(name='div', attrs={'class': ['ppy-caption']}),
        dict(name='div', attrs={'id': ['float_corner']}),
    ]

    def preprocess_html(self, soup):
        """Strip unwanted links and containers from a downloaded page.

        :param soup: parsed page tree; it is modified in place.
        :return: the same ``soup`` object.
        """
        # Replace 'deflink' anchors with their plain text.
        for tag in soup.findAll(name='a', attrs={'class': ['deflink']}):
            # ``tag.string`` may be None (e.g. an anchor without a single
            # text child); leave such anchors untouched instead of failing.
            if tag.string is not None:
                tag.replaceWith(tag.string)

        # The page offers no usable class/id for these sections, so they are
        # located by their link text and the grandparent of each match is
        # removed from the tree.
        for pattern in (r'.*View Metric.*', r'.*Add my own photo.*'):
            for tag in soup.findAll(name='a', text=re.compile(pattern, re.DOTALL)):
                tag.parent.parent.extract()

        # Keep only the 'container' divs that hold the title (h1), the
        # ingredients heading or the preparation heading; drop the rest.
        for tag in soup.findAll(name='div', attrs={'class': ['container']}):
            if tag.find(name='h1'):
                continue
            if tag.find(name='h2', text=re.compile(r'.*Ingredients.*', re.DOTALL)):
                # Debug trace kept from the original; the call form prints
                # the same text on Python 2 and Python 3.
                print('tag found Ingred h2')
                continue
            if tag.find(name='h2', text=re.compile(r'Preparation.*', re.DOTALL)):
                print('tag found Prep h2')
                continue
            tag.extract()
        return soup

    feeds = [(u'4 & 5 Star Rated Recipes', u'http://feeds.feedburner.com/Bigovencom-RecipeRaves?format=xml')]

Since I wrote this recipe a few weeks ago, this site has been totally redesigned, and is currently in beta.

In the redesign, they've removed a lot of the class and id attributes previously found on this site's tags, so the recipe uses some interesting (to me) methods of removing tags: it identifies a sub-tag by the text it contains and then removes the parent tag.