Unless I'm mistaken, I can't edit my post above anymore. I improved the recipe yesterday evening. The most annoying remaining problem is that there are now empty pages between articles. The output also contains pictures now, but some unwanted ones are still included, and I don't know how to exclude them without excluding all pictures. Also, the cover picture isn't ideal, and the titles of the "chapters" don't match the actual content. But yeah ... here's the new code, which still needs work. At least the content is there again.
Code:
#!/usr/bin/env python2
__license__ = 'GPL v3'
__copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
'''
demorgen.be
'''
from calibre.web.feeds.news import BasicNewsRecipe
class DeMorganBe(BasicNewsRecipe):
    """Calibre news recipe for the Belgian newspaper De Morgen (demorgen.be).

    Articles are taken from the RSS feeds listed in ``feeds``. The
    ``keep_only_tags`` and ``remove_tags`` lists select which parts of each
    article page end up in the generated e-book.
    """

    # --- Recipe metadata ---------------------------------------------------
    title = u'De Morgen'
    __author__ = u'Darko Miletic'
    description = u'News from Belgium in Dutch'
    language = 'nl_BE'

    # --- Download settings -------------------------------------------------
    # Per the calibre recipe API, ``oldest_article`` is expressed in days.
    oldest_article = 1
    max_articles_per_feed = 100
    no_stylesheets = False
    # The feeds only link to the articles; the full text is fetched from the
    # article pages themselves.
    use_embedded_content = False

    # --- Content selection -------------------------------------------------
    # Elements to keep from each article page.
    # NOTE(review): the bare 'h1', 'img' and 'p' matchers are very broad and
    # may be why unwanted pictures still show up -- confirm against the
    # site's current markup.
    keep_only_tags = [
        dict(name='div', attrs={'class': 'reader-title'}),
        dict(name='h1'),
        dict(name='div', attrs={'class': 'credits'}),
        dict(name='div', attrs={'class': 'meta-data'}),
        dict(name='div', attrs={'class': 'moz-reader-block-img'}),
        dict(name='img'),
        dict(name='div', attrs={'class': 'header-intro'}),
        dict(name='p'),
    ]

    # Elements to strip out again.
    # NOTE(review): class names such as 'rtlowr1' or '_1ubw0re1 _3ej1u36'
    # look auto-generated and may change when the site is rebuilt -- verify
    # periodically.
    remove_tags = [
        # dict(name='script'),
        dict(name='p', attrs={'class': 'rtlowr1'}),
        dict(name='p', attrs={'class': 'qmn3qt1'}),
        dict(name='img', attrs={'class': '_1ubw0re1 _3ej1u36'}),
        dict(name='img', attrs={'class': '_15tatjw0'}),
        # dict(name='ul', attrs={'class': 'bulletSeparatedList'}),
        # dict(name='a', attrs={'class': 'shareImage'}),
        dict(name='h2'),
    ]

    # --- Feeds -------------------------------------------------------------
    # (section title, RSS URL) pairs. Commented-out entries are optional
    # sections that are currently disabled.
    # NOTE(review): some URLs use http:// and others https:// -- confirm
    # whether all of them should be https.
    feeds = [
        (u'Nieuws', u'http://www.demorgen.be/nieuws/rss.xml'),
        (u'In het nieuws', u'https://www.demorgen.be/in-het-nieuws/rss.xml'),
        (u'Niet te missen', u'https://www.demorgen.be/niet-te-missen/rss.xml'),
        (u'Beter leven', u'http://www.demorgen.be/beter-leven/rss.xml'),
        (u'Crisis Midden-Oosten', u'http://www.demorgen.be/aanval-op-israel/rss.xml'),
        # (u'Koken met de Morgen', u'http://www.demorgen.be/koken-met-de-morgen/rss.xml'),
        (u'Meningen', u'http://www.demorgen.be/meningen/rss.xml'),
        (u'Politiek', u'http://www.demorgen.be/politiek/rss.xml'),
        (u'TV & Cultuur', u'http://www.demorgen.be/tv-cultuur/rss.xml'),
        (u'Oorlog in Oekraine', u'http://www.demorgen.be/oorlog-in-oekraine/rss.xml'),
        (u'Tech & Wetenschap', u'http://www.demorgen.be/tech-wetenschap/rss.xml'),
        # (u'Sport', u'http://www.demorgen.be/sport/rss.xml'),
        # (u'Podcasts', u'http://www.demorgen.be/podcasts/rss.xml'),
        # (u'Puzzels', u'http://www.demorgen.be/puzzels/rss.xml'),
        # (u'Cartoons', u'http://www.demorgen.be/puzzels-cartoons/rss.xml'),
        # (u'Achter de schermen', u'http://www.demorgen.be/achter-de-schermen/rss.xml'),
        # (u'Best gelezen', u'http://www.demorgen.be/popular/rss.xml')
    ]

    def get_cover_url(self):
        """Return the URL of the fixed image used as the e-book cover."""
        return "https://usercontent.one/wp/www.insidejazz.be/wp-content/uploads/2018/11/pic0143.png"