Hello, I modified the Cracked.com recipe to customise it but now the heading for each article appears at the end. I've tried editing it but can't get it to work. Any advice?
Thanks.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class Cracked(BasicNewsRecipe):
    """calibre news recipe that downloads recent Cracked.com articles.

    Articles are taken from the FeedBurner RSS feed, reduced to the
    header/content containers listed in ``keep_only_tags``, and multi-page
    articles are stitched together by following the pagination "next" link.
    """

    title = u'Cracked.com Weekly download'
    __author__ = 'Update Sept 2017'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 9  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; needed so the pagination links accepted by
    # is_link_wanted() are fetched for multi-page articles.
    recursions = 11
    remove_attributes = ['size', 'style']

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # NOTE: order matters. calibre rebuilds the page body by appending the
    # matches of each spec in the order the specs are listed, so the header
    # (which holds the article title) must be listed before the content or
    # the heading ends up at the end of the article.
    keep_only_tags = [
        dict(name='article', attrs={'class': 'module article dropShadowBottomCurved'}),
        dict(name='article', attrs={'class': 'module blog dropShadowBottomCurved'}),
        dict(name='div', attrs={'class': 'content-header'}),
        dict(name='div', attrs={'class': 'content-content'}),
    ]

    remove_tags = [
        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule', 'continue-reading']}),
        dict(attrs={'class': ['socialShareAfterContent', 'socialShareModule', 'continue-reading', 'social-share-bottom list-inline']}),
        dict(name='div', attrs={'id': ['relatedArticle']}),
        dict(name='div', attrs={'class': ['bannerAd hidden-sm hidden-md hidden-lg introAd']}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for the pagination "next" link of an article.

        :param url: The URL the link points to (unused here).
        :param a: The anchor tag being considered.
        :return: True when ``a`` carries the ``next`` class and sits inside
            a ``<nav class="PaginationContent">`` element, False otherwise.
        """
        # The class attribute may be missing, a plain string, or a list of
        # tokens depending on the parser in use; normalise to a token list
        # instead of indexing with a['class'] (KeyError when absent).
        classes = a.get('class') or []
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        if 'next' not in classes:
            return False
        return a.findParent('nav', attrs={'class': 'PaginationContent'}) is not None

    def preprocess_html(self, soup):
        """Point lazily-loaded images at their real source.

        Copies ``data-img`` and then ``data-original`` into ``src``; when an
        image has both attributes, ``data-original`` wins because it is
        applied last.

        :param soup: Parsed page.
        :return: The same ``soup`` object, modified in place.
        """
        for lazy_attr in ('data-img', 'data-original'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip pagination and, on follow-up pages, repeated headings.

        :param soup: Parsed page.
        :param first_fetch: True for the first page of an article.
        :return: The same ``soup`` object, modified in place.
        """
        for nav in soup.findAll(attrs={'class': 'PaginationContent'}):
            nav.extract()
        if not first_fetch:
            # Continuation pages repeat the meta block and the headline;
            # keep them only on the first page.
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()
            for heading in soup.findAll('h1'):
                heading.extract()
            for page_title in soup.findAll('title'):
                page_title.extract()
        return soup