Hello! There is this blog on Medium that I really like, and the other day I decided to try to make a calibre recipe so I can read it on my Kobo. Every Medium blog has a hidden RSS feed, reachable by appending "/feed" to its URL, so I could easily put a recipe together:
Feed recipe
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1688220087(BasicNewsRecipe):
    title = 'Plane Crash Series (extracted)'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = False

    feeds = [
        ('Admiral Cloudberg', 'https://admiralcloudberg.medium.com/feed'),
    ]

    # Cut everything after the post body and drop Medium's 1x1 tracking pixels
    remove_tags_after = [dict(name='div', class_='ob oc od oe of l bw')]
    remove_tags = [dict(name='img', width='1', height='1')]

    extra_css = 'div > div > img {max-width: 100%} \n div > div {margin-left: 10%; margin-right: 10%} \n div > div > div {font-size: 0.8rem; font-style: italic; margin-left: 0; margin-right: 0} \n p { margin: 2em 0} \n p.separator {text-align: center} \n hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}'

    preprocess_regexps = [
        # Center the ◊◊◊ section separators
        (re.compile(r'<p>◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        # Turn long runs of underscores into a dashed horizontal rule
        (re.compile(r'<p>_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
This recipe gets the full text from the feed itself and works perfectly. The only problem with it is that the feed lists only the 10 most recent articles, and ideally I'd like a much bigger number.
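As a quick check outside calibre (this assumes the standalone feedparser package and isn't part of the recipe), the feed really does stop at 10 entries:
Code:
import feedparser

# Sanity check of the hidden feed's length; assumes `feedparser` is
# installed separately and is not part of the recipe itself.
d = feedparser.parse('https://admiralcloudberg.medium.com/feed')
print(len(d.entries))  # prints 10: only the newest posts are exposed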
So I read the manual and embarked on a much more complicated recipe, based on the complete list of articles found here on Reddit. After some hours of frustration, print statements, and regex hell, I got this more than acceptable result:
New recipe
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1688290526(BasicNewsRecipe):
    title = 'Reddit-PLANE'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60
    max_articles_per_feed = 3
    no_stylesheets = True
    auto_cleanup = True

    # keep_only_tags = [dict(name='article')]
    # remove_tags_after = [dict(name='div', class_='ds dt du dv dw l')]
    remove_tags = [dict(name='img', width='1', height='1'),
                   dict(name='div', class_='speechify-ignore ab co')]

    extra_css = 'img {max-width: 100%} \n div.paragraph-image {width:90%; margin-left: auto; margin-right: auto} \n div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic} \n p.separator {text-align: center} \n hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}'

    preprocess_regexps = [
        # Medium's obfuscated class names, matched verbatim from the page source
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]

    def parse_index(self):
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        # The archive post lists one <p> per article after "Reddit archives:",
        # with an <hr> closing the list
        header = soup.find('p', string='Reddit archives:')
        node = header.find_next_sibling('p')
        while node in soup.find_all('p'):
            p = str(node.contents)
            # Extract the article number and the link text from the raw contents
            title_num = re.sub(r'\[\'(\d{1,3}).+?\]', r'\1', p, count=1)
            title_desc = re.sub(r'\[\'.+?>(.+?)</a>.+?</a>]', r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.+(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            # Each entry has a "Medium" link pointing at the full article
            for link in node.find_all('a', string='Medium'):
                url = link.get('href')
                articles[feed_title].append(dict(title=title, url=url, date=date))
            if node in soup.find_all('hr'):
                break
            node = node.find_next_sibling()
        return [(feed_title, articles[feed_title])]
The new recipe does everything I want it to, except for one big problem: no images are downloaded. The two relevant lines of debug output are:
Code:
Processing images...
Recursion limit reached. Skipping links in https://admiralcloudberg.medium.com/a-sickness-and-its-cure-the-crash-of-trans-colorado-airlines-flight-2286-fab2e7b2fcc3
I tried lots of things: disabling the auto-cleanup, changing the recursion limit, adding a timeout, etc., with no luck. It seems that calibre can't see the images inside the article, even though it does manage to download some decorative ones.
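One idea I haven't been able to verify: Medium may be lazy-loading its images, with the real URL sitting in data-src or srcset rather than src, so the downloader never sees it. If so, a preprocess_html hook along these lines might expose them (an untested sketch; the attribute names are my guess):
Code:
    def preprocess_html(self, soup):
        # Untested guess: if the real image URL is in data-src or srcset,
        # copy it back into src so calibre's image downloader can see it.
        for img in soup.find_all('img'):
            if not img.get('src'):
                cand = img.get('data-src') or (img.get('srcset') or '').split(' ')[0]
                if cand:
                    img['src'] = cand
        return soup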
I don't really know Python beyond the basics, so any help with debugging or with the parameters would be really appreciated.