View Single Post
Old 09-02-2010, 01:28 PM   #2593
TonytheBookworm
Addict
TonytheBookworm is on a distinguished road
 
TonytheBookworm's Avatar
 
Posts: 264
Karma: 62
Join Date: May 2010
Device: kindle 2, kindle 3, Kindle fire
I've been looking at the AdventureGamers code and I have a few questions.

Spoiler:

Code:
def preprocess_html(self, soup):
       mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
       soup.head.insert(0,mtag)
what is the reason for inserting the meta tag ?
Code:
       for item in soup.findAll(style=True):
           del item['style']
Why is the above used? It appears to remove every instance of the style attribute, but why is that needed?
Code:
       self.append_page(soup, soup.body, 3)
I'm not really clear on this. It appears to me that you are taking the whole soup and appending to the body of the soup at position 3?

Code:
       pager = soup.find('div',attrs={'class':'toolbar_fat'})
       if pager:
          pager.extract()
I looked in the code and didn't see why the extraction of this is needed, because the navigation appears to be inside toolbar_fat_next.


and here is my painful attempt
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Calibre news recipe for the How Stuff Works 'AutoStuff' feed.

    Multi-page articles are stitched together: ``preprocess_html`` calls
    ``append_page``, which follows the link found in the ``pagination``
    div and splices each following page's ``content`` div into the first
    page's body.
    """

    title = 'How Stuff Works'
    language = 'en'
    __author__ = 'TonytheBookworm'
    description = 'How stuff works'
    publisher = 'Tony'
    category = 'information'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    #INDEX                 = u'http://www.adventuregamers.com'
    #extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
    #masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
    #keep_only_tags    = [
    #                    dict(name='div', attrs={'class':['blogEntryHeader','blogEntryContent']})
    #                 ,dict(attrs={'id':['cxArticleText','cxArticleBodyText']})
    #                  ]
    feeds = [
        ('AutoStuff', 'http://feeds.feedburner.com/HowstuffworksAutostuffDailyRssFeed'),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the page linked from ``soup``'s pager and splice it in.

        Looks for ``<div class="pagination">`` in ``soup``, loads the URL
        of the first ``<a>`` inside it with ``self.index_to_soup``, strips
        inline ``style`` attributes from that page's
        ``<div class="content">``, recurses so further pages are appended
        to that div, and finally inserts the div into ``appendtag`` at
        index ``position``.

        Does nothing when there is no pager, no usable link, or no content
        div on the linked page.
        """
        pager = soup.find('div', attrs={'class': 'pagination'})
        if not pager:
            return

        # The pager may contain no anchor, or an anchor without an href;
        # either case previously raised instead of ending the chain.
        link = pager.a
        if link is None:
            return
        nexturl = link.get('href')
        if not nexturl:
            return

        # NOTE(review): this always follows the FIRST link in the pager. If
        # that link can point back to an earlier page the recursion will not
        # terminate - confirm against the site's markup.
        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'class': 'content'})
        if texttag is None:
            # Linked page has no content div; nothing to append.
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Append any further pages to the end of this page's content before
        # moving the content into the caller's tag.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean up a downloaded article and merge its follow-on pages.

        Inserts Content-Language / Content-Type meta markup at the start of
        ``<head>``, removes every inline ``style`` attribute, appends
        continuation pages into ``<body>`` via ``append_page``, and removes
        the ``toolbar_fat`` div if present.

        Returns the modified ``soup``.
        """
        mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        if soup.head is not None:
            soup.head.insert(0, mtag)

        for item in soup.findAll(style=True):
            del item['style']

        self.append_page(soup, soup.body, 3)

        pager = soup.find('div', attrs={'class': 'toolbar_fat'})
        if pager:
            pager.extract()
        return soup
TonytheBookworm is offline