And next question! Is there a way to get rid of the top image in this feed? (I've cut out the majority of feeds for this example, but each article is preceded by the ad images, starting with "Share" and "Larger Text".) Whatever I've tried hasn't worked so far.
Here's the recipe:
import string, re
class AdvancedUserRecipe1252944207(BasicNewsRecipe):
    """Calibre news recipe for the Worcester Telegram (test version).

    Downloads the "Local News" feed, keeps only the article containers
    listed in ``keep_only_tags`` and cleans the HTML with the regular
    expression substitutions in ``preprocess_regexps``.
    """

    title = u'Worcester Telegram test'
    oldest_article = 1
    max_articles_per_feed = 50
    timefmt = ''
    no_stylesheets = True
    encoding = 'cp1252'

    # All substitutions live in ONE list. Assigning ``preprocess_regexps``
    # several times in a row would leave only the last assignment in effect,
    # silently dropping every earlier rule.
    preprocess_regexps = [
        # Normalise <strong> markup to plain <b>.
        (re.compile(r'<strong.?>', re.DOTALL | re.IGNORECASE),
         lambda match: '<b>'),
        (re.compile(r'</strong.?>', re.DOTALL | re.IGNORECASE),
         lambda match: '</b>'),
        # NOTE(review): these two patterns look like '&#8217;' / '&#8216;'
        # entities mangled by a copy/paste -- confirm against the page source.
        (re.compile(r'and #8217.?;', re.DOTALL | re.IGNORECASE),
         lambda match: '"'),
        (re.compile(r'and #8216.?;', re.DOTALL | re.IGNORECASE),
         lambda match: '"'),
        # Drop everything from the columnist-headshot marker to the next <p>.
        (re.compile(r'<!-- This code displays columnist headshots: -->.*?<p>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        # Drop the block running up to the end of the article comments.
        (re.compile(r'<div class="verdana11">.*?<!-- END ARTICLE COMMENTS -->',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    # Element ids that make up the article proper; everything else is dropped.
    keep_only_tags = [
        dict(id=[
            'frontpage_section',
            'articleWell',
            'headline',
            'subheadline',
            'SuperHeading',
            'byline',
            'articleBody',
            'zoom1',
        ]),
    ]
    remove_tags = [dict(id=['factBoxes'])]
    remove_tags_after = [dict(id='leaderboardBot')]

    # TODO: the URL below is abbreviated ("...") exactly as it was pasted;
    # replace it with the complete feed address before running the recipe.
    feeds = [
        (u'Local News', u'http://www.telegram.com/apps/pbcs.dl...le=1101'),
    ]