# Dict with a single 'attrs' entry that maps 'class' to the exact string
# 'asset story clearfix'.
# NOTE(review): the name matches calibre's BasicNewsRecipe.keep_only_tags
# attribute, so this is presumably part of a recipe class body -- confirm
# against the enclosing file.
keep_only_tags = dict(attrs={'class': 'asset story clearfix'})


def preprocess_raw_html(self, html, url):
    """Return *html* unchanged, calling ``self.abort_article()`` on a marker.

    The marker is the literal substring ``<article class="asset clearfix">``;
    the check is a plain substring test, not an HTML parse, so it only
    matches that exact spelling of the tag.

    Args:
        html: Raw page source to inspect. Must support ``in`` with a ``str``
            operand.
        url: Not used by this implementation.

    Returns:
        The ``html`` argument, unmodified.
    """
    if '<article class="asset clearfix">' in html:
        # NOTE(review): abort_article is defined outside this fragment; if it
        # raises (as calibre's presumably does), the return below is not
        # reached for marked pages -- confirm.
        self.abort_article()
    return html