Old 07-04-2023, 11:42 AM   #3
413Michele
That was the problem, thanks! The new recipe is now almost perfect, so I'll post it here for anyone interested in working with Medium.

New recipe

Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class AdvancedUserRecipe1688290526(BasicNewsRecipe):
    title          = 'Reddit-PLANE'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60
    max_articles_per_feed = 20
    no_stylesheets = True
    auto_cleanup   = False
    keep_only_tags = [dict(name='article')]
    remove_tags = [
        dict(name='img', width='1', height='1'),
        dict(name='div', class_='speechify-ignore ab co'),
        dict(name='article', class_='sf'),
    ]
    extra_css = '''
        img {max-width: 100%}
        div.paragraph-image {width: 90%; margin-left: auto; margin-right: auto}
        div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic}
        p.separator {text-align: center}
        hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}
    '''
    preprocess_regexps = [
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">◊◊◊</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">_{10,100}?</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
    def parse_index(self):
        url = ''
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        # The index is the archive post on the subreddit, which links every article
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        header = soup.find("p", string="Reddit archives:")
        # Walk the sibling <p> tags that follow the "Reddit archives:" header;
        # each one holds the number, title/date and links of one article
        node = header.find_next_sibling("p")
        while node is not None and node.name == 'p':
            p = str(node.contents)
            title_num = re.sub(r'\[\'(\d{1,3}).+?\]', r'\1', p, count=1)
            title_desc = re.sub(r'\[\'.+?>(.+?)</a>.+?</a>]', r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.+(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            for link in node.find_all("a", string="Medium"):
                url = link.get('href')
            articles[feed_title].append(
                dict(title=title, url=url, date=date)
            )
            node = node.find_next_sibling()
        return [(feed_title, articles[feed_title])]
    def preprocess_html(self, soup):
        # with open('your\\debug\\path\\soup.html', 'a') as f:          # Useful for debugging auto_cleanup, but it's now disabled
        #     print(soup, file=f)
        for picture in soup.find_all('picture'):
            img_link = None
            # There is another <source> tag with the non-webp links, but those files may be bigger
            for source in picture.findAll('source', attrs={'type': 'image/webp'}):
                img_links = source['srcset']
                # Take the first URL of the srcset and strip the resize directive to get the full-size image
                img_link = re.sub(r'https://miro\.medium\.com/v2/resize:fit:\d{2,4}/(format:webp/)?(.+?)(\.png|\.jpg|\.gif)?\s.+\d{2,4}w', r'https://miro.medium.com/v2/\1\2\3', img_links, count=1)
            if img_link is None:     # picture with no webp source, leave it alone
                continue
            for img in picture.findAll('img'):
                img['src'] = img_link
        return soup
    def postprocess_html(self, soup, first_fetch):
        for source in soup.find_all('source'):          # removes <source> tags from inside <picture>
            source.decompose()
        for x in soup.find_all('span'):                 # removes empty spans
            if len(x.get_text()) == 0:
                x.unwrap()
        return soup


There are still two minor things with which I'm having a hard time:
  1. I want to clean up 7 or 8 nested divs that serve no purpose, but I can't use auto_cleanup. What would be the best way to remove nested tags with BeautifulSoup? I've made some attempts but none worked (a rough idea of what I mean is sketched after this list).
  2. Downloading a 20-article EPUB I noticed some problems caused by subtle differences in the source HTML. I don't want to download 20 pages each time I change the script to test it; is there a simple way to tell calibre to download only the nth article of the feed?
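
For point 1, this is roughly the kind of thing I have in mind: an extra pass in preprocess_html that unwraps divs which only contain a single child div. It's an untested sketch, and the test for what counts as a useless wrapper is just a guess, so it probably needs tuning for Medium's markup.

Code:
# Rough, untested sketch: collapse wrapper divs that contain nothing but another div
for div in soup.find_all('div'):
    children = div.find_all(recursive=False)      # direct child tags only
    if len(children) == 1 and children[0].name == 'div':
        div.unwrap()                              # drop the wrapper, keep its contents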

    Otherwise, for point 2, I'd have to modify the parse_index function, but I really don't want to... at most something quick like the hack sketched below.
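
If it comes to that, the only thing I can think of is a temporary slice at the end of parse_index (again an untested sketch; n is just a placeholder for whichever article index I want to test):

Code:
# Temporary, untested hack: keep only the nth entry while testing, then remove it again
n = 7                                             # placeholder: 0-based index of the article to test
articles[feed_title] = articles[feed_title][n:n + 1]
return [(feed_title, articles[feed_title])]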