View Single Post
Old 09-04-2017, 08:33 AM   #1
Phoebus
Member
Phoebus began at the beginning.
 
Posts: 22
Karma: 10
Join Date: Aug 2015
Device: Kobo Aura H2O
Article title appears at end instead of at head

Hello, I modified the Cracked.com recipe to customise it, but now the heading for each article appears at the end of the article instead of at the top. I've tried editing the recipe but can't get it to work. Any advice?

Thanks.

Code:
from calibre.web.feeds.news import BasicNewsRecipe


class Cracked(BasicNewsRecipe):
    """Calibre news recipe for the Cracked.com article feed.

    Articles are taken from the FeedBurner RSS feed, trimmed down to the
    header and body containers, and multi-page articles are stitched
    together by following the "next" pagination link.
    """

    title = u'Cracked.com Weekly download'
    __author__ = 'Update Sept 2017'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 9  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; only pagination links pass is_link_wanted().
    recursions = 11
    remove_attributes = ['size', 'style']

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # NOTE: calibre rebuilds the page body by appending the matches for each
    # spec in list order, so the order here is the order in the output.
    # 'content-header' must therefore come before 'content-content',
    # otherwise the article heading is emitted after the article text.
    keep_only_tags = [
        dict(name='article', attrs={'class': 'module article dropShadowBottomCurved'}),
        dict(name='article', attrs={'class': 'module blog dropShadowBottomCurved'}),
        dict(name='div', attrs={'class': 'content-header'}),
        dict(name='div', attrs={'class': 'content-content'}),
    ]

    remove_tags = [
        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule', 'continue-reading']}),
        dict(attrs={'class': ['socialShareAfterContent', 'socialShareModule', 'continue-reading', 'social-share-bottom list-inline']}),
        dict(name='div', attrs={'id': ['relatedArticle']}),
        dict(name='div', attrs={'class': ['bannerAd hidden-sm hidden-md hidden-lg introAd']}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for the "next page" link of a paginated article.

        ``a`` is the anchor tag being considered. A link is followed when the
        anchor carries the ``next`` class and sits inside a
        ``<nav class="PaginationContent">`` element.
        """
        # Anchors without a class attribute must not raise; depending on the
        # parser version the attribute is either a string or a list of names.
        classes = a.get('class') or []
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        if 'next' not in classes:
            return False
        pagination = a.findParent('nav', attrs={'class': 'PaginationContent'})
        return pagination is not None

    def preprocess_html(self, soup):
        """Point lazily-loaded images at their real source URL.

        Copies ``data-img`` and then ``data-original`` into ``src``; when an
        image has both attributes, ``data-original`` wins.
        """
        for lazy_attr in ('data-img', 'data-original'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip pagination controls and repeated page furniture.

        ``first_fetch`` is falsy for follow-on pages of a multi-page article;
        on those pages the ``meta`` blocks are dropped as well. Every ``h1``
        and ``title`` element is removed from every page.
        """
        for nav in soup.findAll(attrs={'class': 'PaginationContent'}):
            nav.extract()
        if not first_fetch:
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()
        # NOTE(review): this removes *all* h1 elements, including on the first
        # page. If the site renders the article heading as an h1, this would
        # hide it entirely - confirm against the live markup.
        for h1 in soup.findAll('h1'):
            h1.extract()
        for title in soup.findAll('title'):
            title.extract()
        return soup
Phoebus is offline   Reply With Quote