Hi, I've created a recipe (follow up to an earlier post where I have since found a different feed with nicer HTML without infinite scroll) but I cannot for the life of me remove a specific tag.
I'm trying to remove <div class="side"> and/or <div class="spacer">. I do want the tag <div class="md">, just not when it is nested within a "side" or "spacer" div.
As shown by the commented out code I have tried a few things (both using Beautiful Soup and without it) but nothing seems to work. Any suggestions?
The other problem is that some pages ask me to click a button to confirm that I want to view the page. Inspecting the page source, I can't see any <a> link that the button goes to. I've tried
return button['value'] == 'yes'
But to no avail. But that's secondary to removing the tags.
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class AdvancedUserRecipe1542030622(BasicNewsRecipe):
    """calibre news recipe for the 'CreepiestReddit-Month' FeedBurner feed.

    The class attributes configure the download (feed URL, article age and
    count limits, tag filters); the methods below customise link following
    and per-page HTML clean-up.
    """

    title = 'Strange Reddit'
    auto_cleanup = False
    __author__ = 'Phoebus'
    language = 'en'
    description = "Strange tales"
    publisher = 'Reddit users'
    category = 'horror'
    oldest_article = 40  # days
    max_articles_per_feed = 50
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    recursions = 11
    remove_attributes = ['size', 'style']
    feeds = [
        (u'Articles', u'http://feeds.feedburner.com/CreepiestReddit-Month'),
    ]
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }
    remove_tags_before = dict(id='top-matter')
    remove_tags = [
        # dict(name='span', attrs={'class': [
        #     'flair',
        #     'flair ',
        #     'user',
        #
        # ]}),
        dict(name='div', attrs={'data-author': [
            'AutoModerator',
        ]}),
        dict(name='a', attrs={'class': [
            'expand',
        ]}),
        dict(name='div', attrs={'class': [
            'titlebox',
            'spacer',
            # 'side',
        ]}),
        # NOTE(review): this matches id="side"; the sidebar is described as
        # <div class="side">, so this entry may never match -- confirm.
        dict(id='side'),
        dict(attrs={'class': 'spacer'}),
    ]
    keep_only_tags = [
        dict(name='title'),
        dict(name='div', attrs={'class': [
            'entry unvoted',
            'md',
        ]}),
        # dict(id='md'),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for a link tag whose ``value`` attribute is 'yes'.

        The previous body compared ``button['value']``, but no name
        ``button`` is defined anywhere in this recipe, so every call raised
        NameError. The check now uses the ``a`` tag that is passed in, and
        reads the attribute with ``.get`` so a tag without ``value`` yields
        False instead of raising.

        :param url: the URL of the candidate link (unused here).
        :param a: the tag the link was found in; may lack a ``value``
            attribute.
        :return: True if ``a`` carries ``value="yes"``, otherwise False.
        """
        return a is not None and a.get('value') == 'yes'

    def preprocess_html(self, soup):
        """Remove every ``<div class="side">`` from the parsed page.

        NOTE(review): the calibre recipe docs appear to say this hook runs
        after the keep_only_tags / remove_tags clean-up -- confirm. If so,
        any 'md' div that keep_only_tags matched inside a 'side' div has
        presumably already been lifted out of its parent by then, and
        removing 'side' here cannot drop it; the sidebar may need to be
        stripped earlier (e.g. in preprocess_raw_html).

        :param soup: the parsed page.
        :return: the same ``soup`` object, modified in place.
        """
        # Earlier attempts, kept for reference:
        # for div in soup.findAll('div', attrs={'class':'side'}):
        #     div.decompose()
        # soup.find('div', id='side').decompose()
        # for div in soup.find_all("div", {'class':'spacer'}):
        #     div.decompose()
        for div in soup('div', {'class': 'side'}):
            div.decompose()
        return soup
# def postprocess_html(self, soup, first_fetch):
# for div in soup.findAll(attrs={'class':'side'}):
# div.decompose()
# soup.find('div', id='side').decompose()
# for div in soup.find_all("div", {'class':'spacer'}):
# div.decompose()
# for div in soup('div', {'class':'side'}):
# div.decompose()
# return soup