Hi,
I've created a recipe to scrape Reddit searches each month. However, I am only getting a few replies — partly, I think, because Reddit uses infinite scrolling (though that may not be the right term).
I can't follow up with 'more replies' either.
I've searched this forum and it looks like I should locate the AJAX request that loads the additional content, but I can't seem to find it. Any tips?
Thanks
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1542030622(BasicNewsRecipe):
    """Monthly calibre recipe that fetches a FeedBurner feed of Reddit
    posts and cleans each fetched page down to the post title, source
    domain and post content, following pagination links for comments.

    NOTE(review): the original paste had lost all indentation and was
    not valid Python as shown; this is the same recipe reformatted,
    plus a typo fix in the title ('Monthy' -> 'Monthly').
    """

    title = 'Monthly Reddit scrape'
    auto_cleanup = False
    # NOTE(review): this looks like a date, not an author name — confirm.
    __author__ = '2019-02-22'
    language = 'en'
    description = "Creepiest tales on the internet"
    publisher = 'Reddit users'
    category = 'horror'
    oldest_article = 40  # days
    max_articles_per_feed = 50
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    # Follow links up to 11 levels deep; which links are followed is
    # controlled by is_link_wanted() below.
    recursions = 11
    remove_attributes = ['size', 'style']

    feeds = [
        (u'Articles', u'http://feeds.feedburner.com/CreepiestReddit-Month'),
    ]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Keep only the post title, source domain, and the post body.
    keep_only_tags = [
        dict(name='p', attrs={'class': ['title']}),
        dict(name='span', attrs={'class': ['domain']}),
        dict(name='div', attrs={'tabindex': ['-1']}),
        dict(name='div', attrs={'data-test-id': ['post-content']}),
        dict(name='span'),
    ]

    # Strip Reddit chrome: buttons, flair, AutoModerator content, hidden
    # inputs, inline icons and presentational images.
    # NOTE(review): the hashed class names (e.g. 's6wlmco-0 jecSt') are
    # generated by Reddit's build and are likely to change over time —
    # verify they still match the live markup.
    remove_tags = [
        dict(name='button'),
        dict(name='span', attrs={'class': [
            'flair',
            'flair ',
            's6wlmco-0 jecSt',
            's7pq5uy-2 iCbvoa',
            'cu1hzx-0 iogJLn',
            's6wlmco-3 bsaIpo',
        ]}),
        dict(name='div', attrs={'data-author': ['AutoModerator']}),
        dict(name='div', attrs={'data-redditstyle': ['false']}),
        dict(name='div', attrs={'class': [
            's6wlmco-0 jecSt',
            's7pq5uy-2 iCbvoa',
            's1muqojl-0 jMnEuz',
        ]}),
        dict(name='ul', attrs={'class': ['flat-list buttons']}),
        dict(name='input', attrs={'type': ['hidden']}),
        dict(name='svg'),
        dict(name='i'),
        dict(name='img', attrs={'role': ['presentation']}),
    ]

    def is_link_wanted(self, url, a):
        """Recurse only into pagination 'next' links inside the
        PaginationContent nav element.

        NOTE(review): under BeautifulSoup 4 the 'class' attribute is a
        *list* of class names, so ``a['class'] == 'next'`` would never
        be true; ``'next' in a.get('class', [])`` may be needed —
        confirm which BeautifulSoup version calibre provides here.
        """
        return (a['class'] == 'next'
                and a.findParent('nav', attrs={'class': 'PaginationContent'}) is not None)

    def postprocess_html(self, soup, first_fetch):
        """Remove any AutoModerator blocks that survived remove_tags
        (e.g. ones injected after the initial cleanup pass)."""
        for div in soup.findAll(attrs={'data-author': 'AutoModerator'}):
            div.extract()
        return soup