|
|
#1 |
|
Member
![]() Posts: 22
Karma: 10
Join Date: Aug 2015
Device: Kobo Aura H2O
|
Cracked.com - May 2018 update
Cracked.com has changed its code again and broken the feed. This is the code I use:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class Cracked(BasicNewsRecipe):
    """Calibre news recipe that downloads recent Cracked.com articles.

    Articles are discovered through the site's FeedBurner RSS feed. Each
    article page is reduced to its content containers, stripped of social
    and advertising elements, and multi-page articles are followed through
    their "next" pagination links.
    """

    title = u'Cracked.com Weekly download'
    __author__ = 'Update June 2018'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 9  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; is_link_wanted() below limits which links are
    # actually followed to the article's own "next page" links.
    recursions = 11
    remove_attributes = ['size', 'style']
    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    # Metadata handed to the conversion pipeline, reusing the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only these containers are kept from each downloaded page.
    keep_only_tags = [
        dict(name='div', attrs={'class': [
            'content-content',
            'contentWrapper',
            'content-header',
        ]}),
        dict(name='article', attrs={'class': [
            'module article dropShadowBottomCurved',
            'module blog dropShadowBottomCurved',
        ]}),
    ]

    # Social widgets, related-article boxes, comments, breadcrumbs and ads
    # that are dropped from the kept content.
    remove_tags = [
        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule', 'continue-reading']}),
        dict(attrs={'class': ['socialShareAfterContent', 'socialShareModule', 'continue-reading', 'social-share-bottom list-inline']}),
        dict(name='div', attrs={'id': ['relatedArticle', 'content-card-top', 'recommendedForYourPleasure', 'navbar']}),
        dict(name='div', attrs={'class': ['comments-wrap', 'container continue-reading', 'row breadcrumbs-wrapper']}),
        dict(name='h4', attrs={'class': ['mobile-ad-label']}),
        dict(name='ul', attrs={'id': [
            'breadcrumbs',
            'socialShare',
        ]}),
        dict(name='div', attrs={'class': ['bannerAd hidden-sm hidden-md hidden-lg introAd']}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for an article's "next page" pagination link.

        :param url: The link target (unused; the decision is made from the
            anchor tag alone).
        :param a: The anchor tag being considered.
        :return: True when the anchor's class is exactly ``next`` and it
            sits inside a ``<nav class="PaginationContent">``; otherwise
            False.
        """
        # Use .get() so anchors without a class attribute are rejected
        # instead of raising KeyError.
        css_class = a.get('class')
        if not css_class:
            return False
        # Some parser versions return the class attribute as a list of
        # tokens rather than a string; normalise so the comparison works
        # for both representations.
        if isinstance(css_class, (list, tuple)):
            css_class = ' '.join(css_class)
        if css_class != 'next':
            return False
        return a.findParent('nav', attrs={'class': 'PaginationContent'}) is not None

    def preprocess_html(self, soup):
        """Point lazily loaded images at their real source.

        Copies the URL stored in a ``data-*`` attribute into ``src``.

        :param soup: Parsed page to modify in place.
        :return: The same ``soup`` object.
        """
        # Order matters: when an image carries several of these attributes,
        # the last one listed wins.
        for lazy_attr in ('data-img', 'data-original', 'data-src'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Remove pagination controls and, on follow-up pages, meta blocks.

        :param soup: Parsed page to modify in place.
        :param first_fetch: True for the first page of an article; False
            for pages reached by following links.
        :return: The same ``soup`` object.
        """
        for pager in soup.findAll(attrs={'class': 'PaginationContent'}):
            pager.extract()
        if not first_fetch:
            # Elements with class "meta" are kept on the first page only.
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()
        return soup
|
|
|
|
|
|
#2 |
|
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 45,598
Karma: 28548962
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
thanks, updated.
|
|
|
|
| Advert | |
|
|
|
|
#3 |
|
Member
![]() Posts: 22
Karma: 10
Join Date: Aug 2015
Device: Kobo Aura H2O
|
Updated slightly following a site update that added unnecessary social share links and recommendations.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
class Cracked(BasicNewsRecipe):
    """Calibre news recipe that downloads recent Cracked.com articles.

    Articles are discovered through the site's FeedBurner RSS feed. Each
    article page is reduced to its content containers, stripped of social
    sharing, recommendation and advertising elements, and multi-page
    articles are followed through their "next" pagination links.
    """

    title = u'Cracked.com Weekly download'
    __author__ = 'Update June 2018'
    language = 'en'
    description = "America's Only HumorSite since 1958"
    publisher = 'Cracked'
    category = 'comedy, lists'
    oldest_article = 15  # days
    max_articles_per_feed = 100
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Link-following depth; is_link_wanted() below limits which links are
    # actually followed to the article's own "next page" links.
    recursions = 11
    remove_attributes = ['size', 'style']
    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS/')]

    # Metadata handed to the conversion pipeline, reusing the values above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only these containers are kept from each downloaded page.
    keep_only_tags = [
        dict(name='div', attrs={'class': [
            'content-content',
            'contentWrapper',
            'content-header',
        ]}),
        dict(name='article', attrs={'class': [
            'module article dropShadowBottomCurved',
            'module blog dropShadowBottomCurved',
        ]}),
    ]

    # Social widgets, recommendation modules, comments, breadcrumbs and ads
    # that are dropped from the kept content.
    remove_tags = [
        dict(name='section', attrs={'class': ['socialTools', 'quickFixModule', 'continue-reading']}),
        dict(attrs={'class': ['socialShareAfterContent', 'socialShareModule', 'continue-reading', 'social-share-bottom list-inline']}),
        dict(name='div', attrs={'id': [
            'relatedArticle',
            'content-card-top',
            'recommendedForYourPleasure',
            'navbar',
            'flashbackModuleWrap',
            'moreRecommendedArticles',
        ]}),
        dict(name='div', attrs={'class': [
            'comments-wrap',
            'container continue-reading',
            'row breadcrumbs-wrapper',
            'btn-social-favorites col',
            'hidden-social col',
            'ajax-loader comments-loader-bottom',
            'flashback-module-new',
            'card-md-list card-sm-list card-xs-list',
            'popular-module card-md-list card-sm-list card-xs-list',
            'col-md-12 list-title',
            'content-cards d-flex flex-wrap',
            'google-plus btn btn-social',
            # Was 'twitter btn btn-socia' (truncated), which could never
            # match; aligned with the google-plus and facebook entries.
            'twitter btn btn-social',
            'facebook btn btn-social',
            'row social-share-top-wrapper',
        ]}),
        dict(name='h4', attrs={'class': ['mobile-ad-label']}),
        dict(name='ul', attrs={'id': [
            'breadcrumbs',
            'socialShare',
        ]}),
        dict(name='ul', attrs={'class': ['list-unstyled offcanvas-sections']}),
        dict(name='div', attrs={'class': ['bannerAd hidden-sm hidden-md hidden-lg introAd']}),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for an article's "next page" pagination link.

        :param url: The link target (unused; the decision is made from the
            anchor tag alone).
        :param a: The anchor tag being considered.
        :return: True when the anchor's class is exactly ``next`` and it
            sits inside a ``<nav class="PaginationContent">``; otherwise
            False.
        """
        # Use .get() so anchors without a class attribute are rejected
        # instead of raising KeyError.
        css_class = a.get('class')
        if not css_class:
            return False
        # Some parser versions return the class attribute as a list of
        # tokens rather than a string; normalise so the comparison works
        # for both representations.
        if isinstance(css_class, (list, tuple)):
            css_class = ' '.join(css_class)
        if css_class != 'next':
            return False
        return a.findParent('nav', attrs={'class': 'PaginationContent'}) is not None

    def preprocess_html(self, soup):
        """Point lazily loaded images at their real source.

        Copies the URL stored in a ``data-*`` attribute into ``src``.

        :param soup: Parsed page to modify in place.
        :return: The same ``soup`` object.
        """
        # Order matters: when an image carries several of these attributes,
        # the last one listed wins.
        for lazy_attr in ('data-img', 'data-original', 'data-src'):
            for img in soup.findAll('img', attrs={lazy_attr: True}):
                img['src'] = img[lazy_attr]
        return soup

    def postprocess_html(self, soup, first_fetch):
        """Remove pagination controls and, on follow-up pages, meta blocks.

        :param soup: Parsed page to modify in place.
        :param first_fetch: True for the first page of an article; False
            for pages reached by following links.
        :return: The same ``soup`` object.
        """
        for pager in soup.findAll(attrs={'class': 'PaginationContent'}):
            pager.extract()
        if not first_fetch:
            # Elements with class "meta" are kept on the first page only.
            for meta in soup.findAll(attrs={'class': 'meta'}):
                meta.extract()
        return soup
|
|
|
|
|
|
#4 |
|
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 45,598
Karma: 28548962
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
thanks, updated
|
|
|
|
![]() |
|
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Collected Works Joyce, James: Complete Works | v.12.0 | Update 8 Apr 2018 | pynch | ePub Books | 128 | 05-30-2023 02:19 PM |
| FTI Max2 update available, 2018-03-19_13_47_1.8.3_9269185/1043:user/release-keys | everalm | Onyx Boox | 7 | 04-06-2018 03:33 PM |
| Cracked.com | Calia | Recipes | 0 | 08-28-2014 11:48 PM |
| Screen Cracked | omro | Astak EZReader | 13 | 05-07-2010 11:39 AM |
| DH cracked my K2 | lala | Amazon Kindle | 6 | 02-22-2010 04:43 PM |