View Single Post
Old 02-27-2019, 10:42 AM   #1
Phoebus
Member
Phoebus began at the beginning.
 
Posts: 22
Karma: 10
Join Date: Aug 2015
Device: Kobo Aura H2O
Recipe not removing tags

Hi, I've created a recipe (follow up to an earlier post where I have since found a different feed with nicer HTML without infinite scroll) but I cannot for the life of me remove a specific tag.

I'm trying to remove <div class="side"> and/or <div class="spacer". I do want the tag <div class="md">, just not when it is nested within a "side" or "spacer" div.

As shown by the commented out code I have tried a few things (both using Beautiful Soup and without it) but nothing seems to work. Any suggestions?

The other problem is that some pages ask for me to click a button to confirm I want to view the page. Inspecting the code I can't see any <a> link it goes to. I've tried

return button['value'] == 'yes'

But to no avail. But that's secondary to removing the tags.

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class AdvancedUserRecipe1542030622(BasicNewsRecipe):
    title          = 'Strange Reddit'
    auto_cleanup   = False
    __author__ = 'Phoebus'
    language = 'en'
    description = "Strange tales"
    publisher = 'Reddit users'
    category = 'horror'
    oldest_article =40  # days
    max_articles_per_feed = 50
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    recursions = 11
    remove_attributes = ['size', 'style']


    feeds          = [
        (u'Articles', u'http://feeds.feedburner.com/CreepiestReddit-Month'),
    ]
    
    
    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    remove_tags_before = dict(id='top-matter')

    remove_tags = [

#		        dict(name='span', attrs={'class': [
#        									'flair',
#        									'flair ',
#        									'user',
#        									
#        													]}),

		        dict(name='div', attrs={'data-author': [
        									'AutoModerator',
        													]}), 
        													 
		        dict(name='a', attrs={'class': [
        									'expand',
        													]}),  
       													 
		        dict(name='div', attrs={'class': [
        									'titlebox',
        									'spacer',
#        									'side',
        													]}),          													
				dict(id='side'),
			
                dict(attrs={'class':'spacer'}),
				
    				]


    
    keep_only_tags = [  

		 	        dict(name='title'),
		 	   


                    dict(name='div', attrs={'class': [
        									'entry unvoted',
        									'md',
        													]}),
#        			dict(id='md'),										                     
 
                                                ]


  
    

    def is_link_wanted(self, url, a):
        return button['value'] == 'yes'


    def preprocess_html(self, soup):
#       for div in soup.findAll('div', attrs={'class':'side'}):
#            div.decompose()
#        soup.find('div', id='side').decompose()
#       for div in soup.find_all("div", {'class':'spacer'}): 
#            div.decompose()
		for div in soup('div', {'class':'side'}):
			div.decompose()
                return soup 

 
        
#    def postprocess_html(self, soup, first_fetch):
#        for div in soup.findAll(attrs={'class':'side'}):
#            div.decompose()
#        soup.find('div', id='side').decompose()
#       for div in soup.find_all("div", {'class':'spacer'}): 
#            div.decompose()
#		for div in soup('div', {'class':'side'}):
#			div.decompose()
 
#                return soup
Phoebus is offline   Reply With Quote