That was the problem, thanks! The new recipe is now almost perfect; I'll post it here for anyone interested in working with Medium.
New recipe
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1688290526(BasicNewsRecipe):
    title = 'Reddit-PLANE'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60
    max_articles_per_feed = 20
    no_stylesheets = True
    auto_cleanup = False

    keep_only_tags = [dict(name='article')]
    remove_tags = [
        dict(name='img', width='1', height='1'),
        dict(name='div', class_='speechify-ignore ab co'),
        dict(name='article', class_='sf'),
    ]
    extra_css = ('img {max-width: 100%}\n'
                 'div.paragraph-image {width: 90%; margin-left: auto; margin-right: auto}\n'
                 'div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic}\n'
                 'p.separator {text-align: center}\n'
                 'hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}')
    preprocess_regexps = [
        # Medium marks section breaks with a bare '◊◊◊' paragraph; give it a class the CSS above can target
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        # A paragraph made of underscores becomes a horizontal rule
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
    def parse_index(self):
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        # The archive post lists one <p> per article, right after the "Reddit archives:" paragraph
        header = soup.find("p", string="Reddit archives:")
        next = header.find_next_sibling("p")
        while next is not None and next.name == 'p':
            p = str(next.contents)
            title_num = re.sub(r'\[\'(\d{1,3}).+?\]', r'\1', p, count=1)
            title_desc = re.sub(r'\[\'.+?>(.+?)</a>.+?</a>]', r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.+(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            for link in next.find_all("a", string="Medium"):
                url = link.get('href')
                articles[feed_title].append(dict(title=title, url=url, date=date))
            if next.name == 'hr':  # the <hr> after the list marks the end of the archive
                break
            next = next.find_next_sibling()
        return [(feed_title, articles[feed_title])]
    def preprocess_html(self, soup):
        # with open('your\\debug\\path\\soup.html', 'a') as f:  # Useful for debugging auto_cleanup, but it's now disabled
        #     print(soup, file=f)
        # Point every <img> at the full-size file instead of Medium's resizing proxy
        for picture in soup.find_all('picture'):
            for source in picture.findAll('source', attrs={'type': 'image/webp'}):  # there is another <source> tag with the non-webp pic links, but they may be bigger
                img_links = source['srcset']
                img_link = re.sub(
                    r'https://miro\.medium\.com/v2/resize:fit:\d{2,4}/(format:webp/)?(.+?)(\.png|\.jpg|\.gif)?\s.+\d{2,4}w',
                    r'https://miro.medium.com/v2/\1\2\3',
                    img_links, count=1)
                for img in picture.findAll('img'):
                    img['src'] = img_link
        return soup
    def postprocess_html(self, soup, first_fetch):
        for source in soup.find_all('source'):  # removes <source> tags from inside <picture>
            source.decompose()
        for x in soup.find_all('span'):  # removes empty spans
            if len(x.get_text()) == 0:
                x.unwrap()
        return soup
There are still two minor things I'm having a hard time with:
- I want to clean up the 7/8 nested divs that serve no purpose, but I can't use auto_cleanup. What would be the best way to remove nested tags with BeautifulSoup? I've made a few attempts but none of them worked (a rough sketch of what I have in mind is at the end of this post).
- Downloading a 20-article EPUB I noticed some problems caused by subtle differences in the source HTML. I don't want to download 20 pages every time I change the script to test it; is there a simple way to tell calibre to download only the nth article of the feed?
Otherwise I'd have to modify the parse_index function, which I'd really rather not do (the least invasive hack I can think of is the two-line slice sketched below)...
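For the first point, this is roughly the kind of thing I have in mind: a standalone, untested sketch with made-up markup, where collapse_nested_divs is just a throwaway name and the loop body is what I'd paste into preprocess_html. It may well fall over on Medium's real markup (the wrapper classes change between articles), which is why I'm asking what people normally do.
Code:
from bs4 import BeautifulSoup

def collapse_nested_divs(soup):
    # Unwrap any <div> whose only meaningful child is another <div>,
    # so chains of useless wrappers collapse down to the innermost one.
    for div in soup.find_all('div'):
        children = [c for c in div.contents
                    if getattr(c, 'name', None) is not None or str(c).strip()]
        if len(children) == 1 and getattr(children[0], 'name', None) == 'div':
            div.unwrap()  # keep the contents, drop the wrapper itself
    return soup

html = '<article><div class="a"><div class="b"><div class="c"><p>text</p><p>more</p></div></div></div></article>'
print(collapse_nested_divs(BeautifulSoup(html, 'html.parser')))
# -> <article><div class="c"><p>text</p><p>more</p></div></article>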
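For the second point, assuming there is no built-in option for it (if I remember right, ebook-convert's --test switch only caps each feed at the first couple of articles, it doesn't let you pick a specific one), the least invasive workaround I can think of is a temporary two-liner at the very end of parse_index, with n hardcoded while testing:
Code:
    def parse_index(self):
        # ... everything above stays as posted ...
        # Temporary hack while testing: keep only the nth article (0-based),
        # then delete these two lines once the recipe looks right.
        n = 7  # e.g. 7 -> only the eighth article gets downloaded
        articles[feed_title] = articles[feed_title][n:n + 1]
        return [(feed_title, articles[feed_title])]
But that's exactly the kind of edit I was hoping to avoid, so if calibre has a proper switch for this I'd love to hear about it.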