I modified Darko's BBC script to get the full-story version of The Oakland Press (Oakland County, Michigan).
In the debug output, it appears to be fetching the full-story version, but the resulting HTML is that of the paged version.
Can anyone tell me how to get and keep only the full version so that I don't have any duplicate content?
Code:
'''
theoaklandpress.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class Oakland_Press(BasicNewsRecipe):
    """calibre recipe for The Oakland Press (theoaklandpress.com).

    Articles are taken from the site's "news" RSS feed and each article URL
    is rewritten by :meth:`print_version` to request the ``fullstory`` view
    mode instead of the default view.
    """

    # --- Publication metadata -------------------------------------------
    title = 'The Oakland Press'
    __author__ = 'Roger Easlick'
    description = 'Oakland County News '
    publisher = 'The Oakland Press'
    category = 'news'
    language = 'en_US'
    publication_type = 'newsportal'

    # --- Fetch behaviour ------------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    #delay = 1
    use_embedded_content = False
    encoding = 'utf8'

    # Stylesheet applied to the generated book (value is one single string;
    # it is split across literals only for readability).
    extra_css = (
        ' body{ font-family: Verdana,Helvetica,Arial,sans-serif }'
        ' .introduction{font-weight: bold}'
        ' .story-feature{display: block; padding: 0; border: 1px solid;'
        ' width: 40%; font-size: small}'
        ' .story-feature h2{text-align: center; text-transform: uppercase} '
    )

    # Strip HTML comments from the downloaded page; DOTALL lets a comment
    # span several lines.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
        'linearize_tables': True,
    }

    # Only these page elements are kept in the output.
    keep_only_tags = [
        dict(name='div', attrs={'class': ['story_headline']}),
        dict(name='div', attrs={'class': ['story_timestamp']}),
        dict(name='p', attrs={'class': ['byline']}),
        dict(name='div', attrs={'class': ['story_body clear']}),
    ]

    # Elements dropped from the output: the comments link block and the
    # pagination list.
    remove_tags = [
        dict(name='div', attrs={'class': ['comments-link-block']}),
        dict(name='ul', attrs={'id': ['paging']}),
    ]

    remove_attributes = ['width', 'height']

    feeds = [
        ('News', 'http://www.theoaklandpress.com/?rss=news'),
    ]

    def print_version(self, url):
        """Return ``url`` with the ``viewmode=fullstory`` parameter added.

        The parameter is appended with ``?`` when the URL has no query
        string yet and with ``&`` when it already has one, and it is placed
        before any ``#fragment`` so that it remains part of the query. A URL
        that already carries ``viewmode=fullstory`` is returned unchanged.

        :param url: article URL taken from the feed.
        :return: URL requesting the full-story view.
        """
        marker = 'viewmode=fullstory'
        # Keep any fragment aside: text after '#' is not part of the query.
        base, hash_sign, fragment = url.partition('#')
        if marker in base:
            return url
        joiner = '&' if '?' in base else '?'
        return base + joiner + marker + hash_sign + fragment
Any help would be greatly appreciated!