View Single Post
Old 06-29-2018, 07:31 AM   #3
Phoebus
Member
Phoebus began at the beginning.
 
Posts: 22
Karma: 10
Join Date: Aug 2015
Device: Kobo Aura H2O
Updated slightly following a site update that added unnecessary social share links and recommendations.

Code:
from calibre.web.feeds.news import BasicNewsRecipe


class Cracked(BasicNewsRecipe):
    """Calibre news recipe that downloads recent Cracked.com articles.

    Articles come from the site's FeedBurner RSS feed.  Paginated articles
    are followed through their "next" links, and social-share widgets,
    recommendation panels, ads and navigation chrome are removed.
    """

    title = u'Cracked.com Weekly download'
    __author__ = 'Update June 2018'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 15  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; which links are followed is decided by
    # is_link_wanted() below (only pagination "next" links).
    recursions = 11
    remove_attributes = ['size', 'style']

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Containers that hold the actual article content.
    keep_only_tags = [
        dict(name='div', attrs={'class': [
            'content-content',
            'contentWrapper',
            'content-header',
        ]}),
        dict(name='article', attrs={'class': [
            'module article dropShadowBottomCurved',
            'module blog dropShadowBottomCurved',
        ]}),
    ]

    # Social sharing, recommendations, comments, breadcrumbs and ads.
    remove_tags = [
        dict(name='section', attrs={'class': [
            'socialTools',
            'quickFixModule',
            'continue-reading',
        ]}),
        dict(attrs={'class': [
            'socialShareAfterContent',
            'socialShareModule',
            'continue-reading',
            'social-share-bottom list-inline',
        ]}),
        dict(name='div', attrs={'id': [
            'relatedArticle',
            'content-card-top',
            'recommendedForYourPleasure',
            'navbar',
            'flashbackModuleWrap',
            'moreRecommendedArticles',
        ]}),
        dict(name='div', attrs={'class': [
            'comments-wrap',
            'container continue-reading',
            'row breadcrumbs-wrapper',
            'btn-social-favorites col',
            'hidden-social col',
            'ajax-loader comments-loader-bottom',
            'flashback-module-new',
            'card-md-list card-sm-list card-xs-list',
            'popular-module card-md-list card-sm-list card-xs-list',
            'col-md-12 list-title',
            'content-cards d-flex flex-wrap',
            'google-plus btn btn-social',
            'twitter btn btn-social',
            'facebook btn btn-social',
            'row social-share-top-wrapper',
        ]}),
        dict(name='h4', attrs={'class': ['mobile-ad-label']}),
        dict(name='ul', attrs={'id': [
            'breadcrumbs',
            'socialShare',
        ]}),
        dict(name='ul', attrs={'class': ['list-unstyled offcanvas-sections']}),
        dict(name='div', attrs={'class': [
            'bannerAd hidden-sm hidden-md hidden-lg introAd',
        ]}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for the "next page" link of a paginated article.

        ``a`` is the parsed anchor tag for ``url``.  A link is wanted when
        the anchor carries the ``next`` class and sits inside a
        ``<nav class="PaginationContent">`` element.
        """
        # Use .get() so anchors without a class attribute are rejected
        # instead of raising KeyError.
        css_class = a.get('class')
        # Depending on the parser in use, ``class`` may be a plain string or
        # a list of class tokens; accept both forms.
        if isinstance(css_class, (list, tuple)):
            is_next = 'next' in css_class
        else:
            is_next = css_class == 'next'
        if not is_next:
            return False
        pagination = a.findParent('nav', attrs={'class': 'PaginationContent'})
        return pagination is not None

    def preprocess_html(self, soup):
        """Copy lazy-load image URLs into ``src`` and return the soup.

        The attributes are applied in order, so when an image carries more
        than one of them the later one wins (``data-src`` has the highest
        priority, then ``data-original``, then ``data-img``).
        """
        for lazy_attr in ('data-img', 'data-original', 'data-src'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Strip pagination controls, and metadata on follow-up pages.

        Elements with class ``PaginationContent`` are always removed.  When
        ``first_fetch`` is false, elements with class ``meta`` are removed
        as well, so they remain only on the first fetched page.
        """
        for pager in soup.findAll(attrs={'class': 'PaginationContent'}):
            pager.extract()
        if not first_fetch:
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()

        return soup
Phoebus is offline   Reply With Quote