Thanks again for your help. Here is an Alpha version of the code. Bugs:
- a subreddit's automoderator rules will appear at the start of each post
- in-page links to images are not pulled in (though that may be for the best), e.g. those to imgur, i.reddit
- some of the code is junk, as I've cannibalised it from other recipes, and may not need to be there
- subreddit name is not displayed in title
Usage: you must get your links as per these guides
https://www.reddit.com/wiki/rss or
https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/
For example I use it as a search to get results for horror stories, but you can use it for any search, subreddit, post, comments or users as per the links above.
I've set it for a weekly search but obviously you can change this.
Code:
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1542030622(BasicNewsRecipe):
    """Calibre recipe that builds a periodical from a Reddit RSS feed.

    Replace the placeholder URL in ``feeds`` with your own Reddit RSS link
    (a search, subreddit, post, comment thread or user feed). See
    https://www.reddit.com/wiki/rss or
    https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/
    for how to build such a link.
    """

    title = 'Reddit weekly - alpha'
    __author__ = 'phoebus'
    language = 'en'
    description = "Tales from the internet"
    publisher = 'Reddit users'
    # Used as the 'tags' metadata entry in conversion_options below.
    category = 'reddit'

    oldest_article = 7  # days - change as required
    max_articles_per_feed = 50  # change as required
    recursions = 11

    auto_cleanup = False
    no_stylesheets = True
    encoding = 'utf-8'
    remove_javascript = True
    use_embedded_content = False
    remove_attributes = ['size', 'style']

    feeds = [
        (u'Articles', u'INSERT YOUR RSS LINK'),
    ]

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Elements to keep from each fetched page; everything else is dropped.
    keep_only_tags = [
        dict(name='p', attrs={'class': ['title']}),
        dict(name='span', attrs={'class': ['domain']}),
        dict(name='div', attrs={'class': ['expando']}),
        dict(name='h1', attrs={'class': ['hover redditname']}),
        dict(name='meta', attrs={'property': ['og:title']}),
        # NOTE(review): the original spec here was the set literal {'title'},
        # which is not an attribute -> value mapping. Presumably the intent
        # was <meta name="title">; confirm against the fetched pages.
        dict(name='meta', attrs={'name': 'title'}),
        dict(name='div', attrs={'class': [
            'entry unvoted',
            'usertext-body may-blank-within md-container ',
            'usertext-body may-blank-within md-container',
            'md',
        ]}),
        dict(name='div', attrs={'data-test-id': ['post-content']}),
        dict(name='div', attrs={'class': ['s10usnt7-0 gxtxxZ']}),
    ]

    # Elements stripped from whatever keep_only_tags retained.
    remove_tags = [
        dict(name='button'),
        dict(name='span', attrs={'class': ['flair', 'flair ']}),
        dict(name='div', attrs={'data-author': ['AutoModerator']}),
        dict(name='ul', attrs={'class': ['flat-list buttons']}),
        dict(name='input', attrs={'type': ['hidden']}),
        dict(name='svg'),
    ]

    def is_link_wanted(self, url, a):
        """Return True only for pagination "next" links.

        A link qualifies when the anchor ``a`` carries the class ``next`` and
        sits inside a ``<nav class="PaginationContent">`` element. ``url`` is
        not consulted.
        """
        # Anchors without a class attribute are simply not wanted; avoid
        # relying on a KeyError from a['class'] to reject them.
        classes = a.get('class') or []
        # The class attribute may be a single string or a list of tokens
        # depending on the parser version; normalise to a list of tokens.
        if not isinstance(classes, (list, tuple)):
            classes = classes.split()
        if 'next' not in classes:
            return False
        pagination = a.findParent('nav', attrs={'class': 'PaginationContent'})
        return pagination is not None

    def postprocess_html(self, soup, first_fetch):
        """Remove every element authored by AutoModerator and return ``soup``.

        Any element whose ``data-author`` attribute equals ``AutoModerator``
        is detached from the tree. ``first_fetch`` is not used.
        """
        for node in soup.findAll(attrs={'data-author': 'AutoModerator'}):
            node.extract()
        return soup