That was the problem, thanks! The new recipe is now almost perfect; I'll post it here for anyone interested in working with Medium.
New recipe
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1688290526(BasicNewsRecipe):
    title = 'Reddit-PLANE'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60
    max_articles_per_feed = 20
    no_stylesheets = True
    auto_cleanup = False

    keep_only_tags = [dict(name='article')]
    remove_tags = [
        dict(name='img', width='1', height='1'),
        dict(name='div', class_='speechify-ignore ab co'),
        dict(name='article', class_='sf'),
    ]
    extra_css = ('img {max-width: 100%}\n'
                 'div.paragraph-image {width: 90%; margin-left: auto; margin-right: auto}\n'
                 'div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic}\n'
                 'p.separator {text-align: center}\n'
                 'hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}')
    preprocess_regexps = [
        # Medium marks section breaks with a bare '◊◊◊' paragraph; give it a class the CSS above can target
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        # A paragraph made of underscores becomes a horizontal rule
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
    def parse_index(self):
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        # The archive post lists one <p> per article, right after the "Reddit archives:" paragraph
        header = soup.find("p", string="Reddit archives:")
        next = header.find_next_sibling("p")
        while next is not None and next.name == 'p':
            p = str(next.contents)
            title_num = re.sub(r'\[\'(\d{1,3}).+?\]', r'\1', p, count=1)
            title_desc = re.sub(r'\[\'.+?>(.+?)</a>.+?</a>]', r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.+(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            for link in next.find_all("a", string="Medium"):
                url = link.get('href')
                articles[feed_title].append(dict(title=title, url=url, date=date))
            if next.name == 'hr':  # the <hr> after the list marks the end of the archive
                break
            next = next.find_next_sibling()
        return [(feed_title, articles[feed_title])]
    def preprocess_html(self, soup):
        # with open('your\\debug\\path\\soup.html', 'a') as f:  # Useful for debugging auto_cleanup, but it's now disabled
        #     print(soup, file=f)
        # Point every <img> at the full-size file instead of Medium's resizing proxy
        for picture in soup.find_all('picture'):
            for source in picture.findAll('source', attrs={'type': 'image/webp'}):  # there is another <source> tag with the non-webp pic links, but they may be bigger
                img_links = source['srcset']
                img_link = re.sub(
                    r'https://miro\.medium\.com/v2/resize:fit:\d{2,4}/(format:webp/)?(.+?)(\.png|\.jpg|\.gif)?\s.+\d{2,4}w',
                    r'https://miro.medium.com/v2/\1\2\3',
                    img_links, count=1)
                for img in picture.findAll('img'):
                    img['src'] = img_link
        return soup
    def postprocess_html(self, soup, first_fetch):
        for source in soup.find_all('source'):  # removes <source> tags from inside <picture>
            source.decompose()
        for x in soup.find_all('span'):  # removes empty spans
            if len(x.get_text()) == 0:
                x.unwrap()
        return soup
There are still two minor things I'm having a hard time with:
- I want to clean up the 7/8 nested divs that serve no purpose, but I can't use auto_cleanup. What would be the best way to remove nested tags with BeautifulSoup? I've made a few attempts but none of them worked (a rough sketch of what I have in mind is at the end of this post).
- Downloading a 20-article EPUB I noticed some problems caused by subtle differences in the source HTML. I don't want to download 20 pages every time I change the script to test it; is there a simple way to tell calibre to download only the nth article of the feed?
Otherwise I'd have to modify the parse_index function, which I'd really rather not do (the least invasive hack I can think of is the two-line slice sketched below)...
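For the first point, this is roughly the kind of thing I have in mind: a standalone, untested sketch with made-up markup, where collapse_nested_divs is just a throwaway name and the loop body is what I'd paste into preprocess_html. It may well fall over on Medium's real markup (the wrapper classes change between articles), which is why I'm asking what people normally do.
Code:
from bs4 import BeautifulSoup

def collapse_nested_divs(soup):
    # Unwrap any <div> whose only meaningful child is another <div>,
    # so chains of useless wrappers collapse down to the innermost one.
    for div in soup.find_all('div'):
        children = [c for c in div.contents
                    if getattr(c, 'name', None) is not None or str(c).strip()]
        if len(children) == 1 and getattr(children[0], 'name', None) == 'div':
            div.unwrap()  # keep the contents, drop the wrapper itself
    return soup

html = '<article><div class="a"><div class="b"><div class="c"><p>text</p><p>more</p></div></div></div></article>'
print(collapse_nested_divs(BeautifulSoup(html, 'html.parser')))
# -> <article><div class="c"><p>text</p><p>more</p></div></article>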
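For the second point, assuming there is no built-in option for it (if I remember right, ebook-convert's --test switch only caps each feed at the first couple of articles, it doesn't let you pick a specific one), the least invasive workaround I can think of is a temporary two-liner at the very end of parse_index, with n hardcoded while testing:
Code:
    def parse_index(self):
        # ... everything above stays as posted ...
        # Temporary hack while testing: keep only the nth article (0-based),
        # then delete these two lines once the recipe looks right.
        n = 7  # e.g. 7 -> only the eighth article gets downloaded
        articles[feed_title] = articles[feed_title][n:n + 1]
        return [(feed_title, articles[feed_title])]
But that's exactly the kind of edit I was hoping to avoid, so if calibre has a proper switch for this I'd love to hear about it.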