View Single Post
Old 09-26-2010, 12:37 AM   #1
TonytheBookworm
Addict
TonytheBookworm is on a distinguished road
 
TonytheBookworm's Avatar
 
Posts: 264
Karma: 62
Join Date: May 2010
Device: kindle 2, kindle 3, Kindle fire
NFL Recipe -- Almost done, need a little help

Starson,
If you get a few minutes, could you look at this code and maybe explain to me why I never get the pcard content (the photo with the player's stats)? I don't see where I'm removing it anywhere, and I'm parsing the */printable/* link, and that page has the pcard.
Thanks.
Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Calibre news recipe that fetches NFL articles from nfl.com RSS feeds.

    Articles are declared obfuscated, so for every feed entry
    ``get_obfuscated_article`` opens the article page, follows its
    ``/printable/`` link and hands the saved copy of that page back as a
    temporary file.
    """

    title = 'NFL'
    language = 'en'
    __author__ = 'TonytheBookworm'
    description = 'National FootBall League Coverage'
    publisher = 'Tonythebookworm'
    category = 'sports, football, USA'
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True

    # NOTE(review): none of the ``article-hdr*`` selectors below carries a
    # leading '.' or '#', so as written they select elements *named*
    # ``article-hdr`` etc. rather than a class or id -- confirm against the
    # page markup before relying on them.
    extra_css = '''
                    article-hdr{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    article-hdr-meta{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                    
                    article-hdr-meta-pub{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                    article-hdr-meta-updated{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
                    
                    
                    
                    p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		        '''

    # Disabled experiments, kept for reference:
    # masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
    # keep_only_tags = [
    #     dict(name='div', attrs={'id': ['col1', 'article-hdr']}),
    #     dict(attrs={'class': ['articleText']}),
    # ]
    # remove_tags_after = dict(name='div', attrs={'style': ['margin']})

    # Only the element with id "print-ribbon" is stripped from the page.
    remove_tags = [
        {'id': ['print-ribbon']},
    ]

    feeds = [
        ('NFL NEWS', 'http://www.nfl.com/rss/rsslanding?searchString=home'),
        # ('ARZ Cardinals', 'http://www.nfl.com/rss/rsslanding?searchString=team&abbr=ARZ'),
        ('ATL Falcons', 'http://www.nfl.com/rss/rsslanding?searchString=team&abbr=ATL'),
    ]

    # Class-level list (shared by all instances) that keeps a reference to
    # every temporary file created by get_obfuscated_article.
    temp_files = []
    articles_are_obfuscated = True

    def get_article_url(self, article):
        """Return the ``link`` value of a feed entry, or ``None`` if absent."""
        return article.get('link', None)

    def get_obfuscated_article(self, url):
        """Save the printable version of an article and return its file path.

        Opens ``url``, follows the first (``nr=0``) link on that page whose
        URL matches ``/printable/``, and writes the response body to a new
        ``PersistentTemporaryFile`` with the suffix ``_fa.html``.

        Any error raised while opening the page or following the link
        propagates to the caller unchanged.
        """
        br = self.get_browser()
        br.open(url)
        response = br.follow_link(url_regex=r'/printable/', nr=0)
        html = response.read()

        tmp = PersistentTemporaryFile('_fa.html')
        self.temp_files.append(tmp)
        try:
            tmp.write(html)
        finally:
            # Close the handle even when the write fails so it is not leaked.
            tmp.close()
        return tmp.name

    def postprocess_html(self, soup, first):
        """Rename every ``<li>`` element in ``soup`` to ``<div>``; return ``soup``."""
        for tag in soup.findAll(name=['li']):
            tag.name = 'div'
        return soup

    def preprocess_html(self, soup):
        """Delete the ``style`` attribute from every element; return ``soup``."""
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup

I see iframe is turned off by default. How do I turn it back on?

Last edited by TonytheBookworm; 09-26-2010 at 12:48 AM.
TonytheBookworm is offline   Reply With Quote