Phoebus - 02-22-2019, 08:34 AM

Stumped by infinite scroll

Hi,

I've created a recipe that scrapes a Reddit search once a month. However, I'm only getting a few replies per thread; I think that's partly because Reddit loads comments with infinite scroll (if that's the right term).

I can't follow the 'more replies' links either.

I've searched this forum, and it sounds like I should find the Ajax request the page makes and fetch that directly, but I can't work out how. Any tips?
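
One thing I did find: appending .json to a thread URL makes Reddit return the whole comment tree as JSON, with no scrolling involved. Below is a rough standalone test (Python 3; the thread URL is just a placeholder) that walks the tree. The 'more' stubs in the JSON seem to be exactly the 'more replies' links I can't follow, but I don't know how to hook any of this into a recipe:

Code:
#!/usr/bin/env python3
# Rough standalone test: fetch a thread's full comment tree via Reddit's
# .json view, which sidesteps the infinite-scroll page entirely.
# The thread URL below is only a placeholder.
import json
import urllib.request

URL = 'https://old.reddit.com/r/nosleep/comments/abc123/.json'

# Reddit rejects the default Python user agent, so send a custom one
req = urllib.request.Request(URL, headers={'User-Agent': 'recipe-test/0.1'})
with urllib.request.urlopen(req) as resp:
    post_listing, comment_listing = json.load(resp)

def walk(children, depth=0):
    # Print each comment body indented by reply depth; 'more' stubs are
    # the collapsed replies that the page shows as 'more replies'.
    for child in children:
        kind, data = child['kind'], child['data']
        if kind == 't1':  # an actual comment
            print('  ' * depth + data['body'].replace('\n', ' ')[:80])
            if data.get('replies'):  # empty string when there are none
                walk(data['replies']['data']['children'], depth + 1)
        elif kind == 'more':
            print('  ' * depth + '[+%s more replies]' % data.get('count', '?'))

walk(comment_listing['data']['children'])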

Thanks

Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1542030622(BasicNewsRecipe):
    title          = 'Monthly Reddit scrape'
    auto_cleanup   = False
    __author__ = 'Phoebus'
    language = 'en'
    description = "Creepiest tales on the internet"
    publisher = 'Reddit users'
    category = 'horror'
    oldest_article = 40  # days
    max_articles_per_feed = 50
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    recursions = 11  # follow links (e.g. 'next' pages) up to 11 levels deep
    remove_attributes = ['size', 'style']


    feeds          = [
        (u'Articles', u'http://feeds.feedburner.com/CreepiestReddit-Month'),
    ]
    
    
    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    keep_only_tags = [
        dict(name='p', attrs={'class': ['title']}),
        dict(name='span', attrs={'class': ['domain']}),
        dict(name='div', attrs={'tabindex': ['-1']}),
        dict(name='div', attrs={'data-test-id': ['post-content']}),
        dict(name='span'),
    ]

    remove_tags = [
        dict(name='button'),
        dict(name='span', attrs={'class': [
            'flair',
            'flair ',
            's6wlmco-0 jecSt',
            's7pq5uy-2 iCbvoa',
            'cu1hzx-0 iogJLn',
            's6wlmco-3 bsaIpo',
        ]}),
        dict(name='div', attrs={'data-author': ['AutoModerator']}),
        dict(name='div', attrs={'data-redditstyle': ['false']}),
        dict(name='div', attrs={'class': [
            's6wlmco-0 jecSt',
            's7pq5uy-2 iCbvoa',
            's1muqojl-0 jMnEuz',
        ]}),
        dict(name='ul', attrs={'class': ['flat-list buttons']}),
        dict(name='input', attrs={'type': ['hidden']}),
        dict(name='svg'),
        dict(name='i'),
        dict(name='img', attrs={'role': ['presentation']}),
    ]

    def is_link_wanted(self, url, a):
        # Only follow the 'next' link inside the pagination nav, so the
        # recursion doesn't wander off into unrelated pages.
        cls = a.get('class') or ''
        if isinstance(cls, list):  # BeautifulSoup 4 returns classes as a list
            cls = ' '.join(cls)
        return 'next' in cls.split() and \
            a.findParent('nav', attrs={'class': 'PaginationContent'}) is not None

    def postprocess_html(self, soup, first_fetch):
        # Belt and braces: drop any AutoModerator posts that slip past remove_tags
        for div in soup.findAll(attrs={'data-author': 'AutoModerator'}):
            div.extract()
        return soup
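
If the .json view is the right approach, could an override along these lines slot into the recipe, rebuilding each article from the JSON comment tree instead of the scrolled page? Completely untested, and it assumes every feed item links to a normal Reddit thread (preprocess_raw_html being the hook that receives each downloaded page). The result is wrapped in a post-content div so that my keep_only_tags above would keep it:

Code:
    def preprocess_raw_html(self, raw_html, url):
        # Untested idea: re-fetch the thread as JSON and rebuild the page
        # from the full comment tree so nothing hides behind the scroll.
        import json  # or move this up with the other imports

        try:
            raw = self.browser.open(url.rstrip('/') + '/.json').read()
            post_listing, comment_listing = json.loads(raw)
        except Exception:
            return raw_html  # fall back to the normally downloaded page
        post = post_listing['data']['children'][0]['data']
        parts = ['<h2>%s</h2>' % post['title'],
                 '<p>%s</p>' % post.get('selftext', '')]

        def walk(children):
            for child in children:
                if child['kind'] != 't1':
                    continue  # skip 'more' stubs in this rough version
                data = child['data']
                parts.append('<blockquote><p>%s</p>' % data['body'])
                if data.get('replies'):  # empty string when there are none
                    walk(data['replies']['data']['children'])
                parts.append('</blockquote>')

        walk(comment_listing['data']['children'])
        # wrapped in the post-content div so keep_only_tags keeps it
        return ('<html><body><div data-test-id="post-content">%s</div>'
                '</body></html>') % ''.join(parts)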