MobileRead Forums - View Single Post - Custom recipes (archive, read-only)

olaf · 09-26-2009, 12:06 PM

And next question! Is there a way to get rid of the top image in this feed? (I've cut out the majority of feeds for this example, but each article is preceded by the ad images, starting with "Share" and "Larger Text".) Whatever I try hasn't worked so far.

Here's the recipe:

import string, re

class AdvancedUserRecipe1252944207(BasicNewsRecipe):
    """Test recipe that fetches the Worcester Telegram 'Local News' feed.

    The recipe is configured purely through class attributes that the
    ``BasicNewsRecipe`` base class reads; it defines no methods of its own.
    """

    title = u'Worcester Telegram test'
    oldest_article = 1
    max_articles_per_feed = 50
    timefmt = ''
    no_stylesheets = True
    encoding = 'cp1252'

    # All substitutions must live in ONE list.  Assigning
    # ``preprocess_regexps`` several times in a class body rebinds the same
    # attribute, so only the final assignment would survive and every
    # earlier rule would be silently dropped.  Order is kept as written.
    preprocess_regexps = [
        # Normalise <strong> markup to <b>.
        (re.compile(r'<strong.?>', re.DOTALL | re.IGNORECASE),
         lambda match: '<b>'),
        (re.compile(r'</strong.?>', re.DOTALL | re.IGNORECASE),
         lambda match: '</b>'),
        # NOTE(review): these two look like '&#8217;' / '&#8216;' whose
        # ampersand was mangled to 'and ' when the post was archived --
        # confirm against the page source before relying on them.
        (re.compile(r'and #8217.?;', re.DOTALL | re.IGNORECASE),
         lambda match: '"'),
        (re.compile(r'and #8216.?;', re.DOTALL | re.IGNORECASE),
         lambda match: '"'),
        # Both clean-up rules are anchored on the page's HTML comments.
        # Unanchored, '.*?<p>' under DOTALL would repeatedly match from the
        # current position to the next <p> and erase everything before the
        # last paragraph.
        (re.compile(r'<!-- This code displays columnist headshots: -->.*?<p>',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
        (re.compile(r'<div class="verdana11">.*?<!-- END ARTICLE COMMENTS -->',
                    re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    keep_only_tags = [
        dict(id=[
            'frontpage_section',
            'articleWell',
            'headline',
            'subheadline',
            'SuperHeading',
            'byline',
            'articleBody',
            'zoom1',
        ]),
    ]
    remove_tags = [dict(id=['factBoxes'])]
    remove_tags_after = [dict(id='leaderboardBot')]

    # NOTE(review): the URL below was abbreviated with '...' (and gained a
    # leading space) when the post was rendered; restore the full feed URL
    # before using this recipe.
    feeds = [(u'Local News', u' http://www.telegram.com/apps/pbcs.dl...le=1101')]

09-26-2009, 12:06 PM	#777
olaf Enthusiast Posts: 43 Karma: 50 Join Date: May 2009 Device: Kindle3	And next question! Is there a way to get rid of the top image in this feed (i've cut out the majority of feeds for this example, but each article is preceded by the ad images, starting with "Share" and "Larger Text" . . . Whatever I try hasn't worked so far. Here's the recipe: import string, re class AdvancedUserRecipe1252944207(BasicNewsRecipe): title = u'Worcester Telegram test' oldest_article = 1 max_articles_per_feed = 50 timefmt = '' no_stylesheets = True preprocess_regexps = [(re.compile(r'<strong.?>', re.DOTALL\|re.IGNORECASE), lambda match: '<b>')] preprocess_regexps = [(re.compile(r'</strong.?>', re.DOTALL\|re.IGNORECASE), lambda match: '</b>')] preprocess_regexps = [(re.compile(r'and #8217.?;', re.DOTALL\|re.IGNORECASE), lambda match: '"')] preprocess_regexps = [(re.compile(r'and #8216.?;', re.DOTALL\|re.IGNORECASE), lambda match: '"')] keep_only_tags = [dict(id=['frontpage_section', 'articleWell', 'headline', 'subheadline', 'SuperHeading', 'byline', 'articleBody', 'zoom1'])] remove_tags = [dict(id=['factBoxes'])] preprocess_regexps = [(re.compile(r'<!-- This code displays columnist headshots: -->.?<p>', re.DOTALL\|re.IGNORECASE), lambda match: '')] preprocess_regexps = [(re.compile(r'<div class="verdana11">.?<!-- END ARTICLE COMMENTS -->', re.DOTALL\|re.IGNORECASE), lambda match: '')] encoding = 'cp1252' remove_tags_after = [dict(id='leaderboardBot')] feeds = [(u'Local News', u' http://www.telegram.com/apps/pbcs.dl...le=1101')]