Starson,
If you get a few minutes, could you look at this code and maybe explain to me why I never get the pcard content (the photo with the player's stats)? I don't see where I'm removing it anywhere, and I'm parsing the */printable/* link, and that page has the pcard.
Thanks.
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Calibre recipe that builds an NFL news e-book from nfl.com RSS feeds.

    Each feed entry's link is opened, the first link on that page whose URL
    matches ``/printable/`` is followed, and the printable page is what gets
    converted (see ``get_obfuscated_article``).
    """

    # --- Book metadata ---------------------------------------------------
    title = 'NFL'
    language = 'en'
    __author__ = 'TonytheBookworm'
    description = 'National FootBall League Coverage'
    publisher = 'Tonythebookworm'
    category = 'sports, football, USA'

    # --- Fetch limits and clean-up switches ------------------------------
    oldest_article = 10
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True

    # NOTE(review): these selectors have no '.' or '#' prefix, so as written
    # they match *elements* named article-hdr etc. -- confirm whether class or
    # id selectors were intended.
    extra_css = '''
article-hdr{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
article-hdr-meta{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
article-hdr-meta-pub{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
article-hdr-meta-updated{text-align:right; font-size:small;margin-top:0px;margin-bottom: 0px;}
p{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

    # Disabled experiments, kept for reference:
    # masthead_url = 'http://gawand.org/wp-content/uploads/2010/06/ajc-logo.gif'
    # keep_only_tags = [
    #     dict(name='div', attrs={'id':['col1','article-hdr']})
    #     ,dict(attrs={'class':['articleText']})
    # ]
    # remove_tags_after = dict(name='div', attrs={'style':['margin']})

    # Strip the element whose id is 'print-ribbon' from every page.
    remove_tags = [
        {'id': ['print-ribbon']},
    ]

    # (feed title, feed URL) pairs.
    feeds = [
        ('NFL NEWS', 'http://www.nfl.com/rss/rsslanding?searchString=home'),
        # ('ARZ Cardinals', 'http://www.nfl.com/rss/rsslanding?searchString=team&abbr=ARZ'),
        ('ATL Falcons', 'http://www.nfl.com/rss/rsslanding?searchString=team&abbr=ATL'),
    ]

    # References to the temporary files created in get_obfuscated_article().
    temp_files = []

    # Ask calibre to obtain article HTML through get_obfuscated_article()
    # instead of downloading the feed link directly.
    articles_are_obfuscated = True

    def get_article_url(self, article):
        """Return the feed entry's ``link`` value, or ``None`` if it has none."""
        return article.get('link', None)

    def get_obfuscated_article(self, url):
        """Download the printable version of an article to a temporary file.

        Opens ``url``, follows the first link on that page whose URL matches
        ``/printable/``, and writes the response body to a persistent
        temporary file with the suffix ``_fa.html``.

        :param url: Address of the regular article page.
        :return: Filesystem path of the temporary file holding the HTML.
        """
        browser = self.get_browser()
        browser.open(url)
        # nr=0 selects the first matching link on the page just opened.
        printable = browser.follow_link(url_regex=r'/printable/', nr=0)
        html = printable.read()

        tmp = PersistentTemporaryFile('_fa.html')
        # Keep a reference on the recipe so the file object stays reachable
        # after this method returns.
        self.temp_files.append(tmp)
        tmp.write(html)
        tmp.close()
        return tmp.name

    def postprocess_html(self, soup, first):
        """Rename every ``<li>`` element to ``<div>`` and return the soup.

        :param soup: Parsed article document; modified in place.
        :param first: Supplied by calibre; not used here.
        """
        for tag in soup.findAll(name=['li']):
            tag.name = 'div'
        return soup

    def preprocess_html(self, soup):
        """Remove the inline ``style`` attribute from every element that has one.

        :param soup: Parsed article document; modified in place.
        :return: The same soup object.
        """
        for item in soup.findAll(attrs={'style': True}):
            del item['style']
        return soup
I see that iframes are turned off by default. How do I turn them back on?