OK, I wrote my own recipe for Macleans, and this works.
Code:
class AdvancedUserRecipe1289709253(BasicNewsRecipe):
    """Recipe for Macleans Magazine, built from the print version of each article."""

    title = u'Macleans Magazine'

    # Feed sources as (section title, feed URL) pairs.
    feeds = [
        (u'Canada', u'http://www2.macleans.ca/category/canada/feed/'),
    ]

    # Download limits.
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    # Styling: drop the site's stylesheets and scripts, apply our own CSS.
    no_stylesheets = True
    remove_javascript = True
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # Clean-up filters. The entries with empty strings are unfilled
    # placeholders and are kept exactly as they were.
    remove_tags_before = dict(id='')
    remove_tags = [
        dict(name='div', attrs={'class': ['wp-caption', '']}),
        dict(name='div', attrs={'id': ['headerimg', 'footer']}),
        dict(name='ul', attrs={'class': ['']}),
        dict(name='ul', attrs={'id': ['']}),
        dict(name='ol', attrs={'id': ['']}),
        dict(name='span', attrs={'class': ['']}),
        dict(name='p', attrs={'class': 'postmetadata'}),
        dict(name='img'),
    ]

    def print_version(self, url):
        """Return the print-page URL: the article URL with "print/" appended."""
        return url + "print/"

    def preprocess_html(self, soup):
        """Swap each <a> tag whose ``.string`` is not None for that string.

        Anchors whose ``.string`` is None are left in place. Returns the
        same soup object after modification.
        """
        for anchor in soup.findAll('a'):
            link_text = anchor.string
            if link_text is None:
                continue
            anchor.replaceWith(link_text)
        return soup
But the recipe for Men's Fitness would not work:
Code:
class AdvancedUserRecipe1289709253(BasicNewsRecipe):
    """Test recipe for Men's Fitness that fetches the print version of each article."""

    title = u'test'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # feeds must be a *list* of (title, url) pairs. A bare (title, url) tuple
    # is iterated element by element, so the title u'News' would be handled
    # as a feed entry of its own instead of labelling the URL.
    feeds = [
        (u'News', u'http://www.mensfitness.com/rss_global/'),
    ]

    def preprocess_html(self, soup):
        """Swap each <a> tag whose ``.string`` is not None for that string.

        Anchors whose ``.string`` is None are left in place. Returns the
        same soup object after modification.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup

    def print_version(self, url):
        """Return the print-page URL: the article URL with "?print=1" appended."""
        return url + "?print=1"
I have since redone Men's Fitness using remove_tags etc., but being OCD,
I want to get the print option working correctly.
Code:
class AdvancedUserRecipe1289709253(BasicNewsRecipe):
    """Recipe for Men's Fitness that cleans the regular article page with tag filters."""

    title = u'Mens Fitness'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'

    # Keep only the main content container of the page.
    keep_only_tags = [dict(name='div', attrs={'id': 'content_items'})]

    # Clean-up filters. The entries with empty strings are unfilled
    # placeholders and are kept exactly as they were.
    remove_attributes = ['style', '']
    remove_tags_before = dict(id='')
    remove_tags = [
        dict(name='div', attrs={'class': ['', '']}),
        dict(name='div', attrs={'id': ['', '']}),
        dict(name='ul', attrs={'class': ['']}),
        dict(name='ul', attrs={'id': ['']}),
        dict(name='ol', attrs={'id': ['']}),
        dict(name='span', attrs={'class': ['']}),
        dict(name='p', attrs={'id': ''}),
        dict(name='img'),
    ]

    # feeds must be a *list* of (title, url) pairs. A bare (title, url) tuple
    # is iterated element by element, so the title u'News' would be handled
    # as a feed entry of its own instead of labelling the URL.
    feeds = [
        (u'News', u'http://www.mensfitness.com/rss_global/'),
    ]

    def preprocess_html(self, soup):
        """Swap each <a> tag whose ``.string`` is not None for that string.

        Anchors whose ``.string`` is None are left in place. Returns the
        same soup object after modification.
        """
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        return soup