View Single Post
Old 07-03-2023, 01:42 PM   #1
413Michele
Enthusiast
413Michele began at the beginning.
 
Posts: 47
Karma: 10
Join Date: Jan 2021
Location: Italy
Device: Kobo Libra 2, Kindle Paperwhite (1st gen)
Question [Help] Recipe for a Medium blog, images missing

Hello! There is this blog on Medium that I really like, and the other day I decided to try and make a calibre recipe to read it on my Kobo. All of Medium's blogs have a hidden RSS feed, obtained by adding "/feed" to the link, so I could easily make a recipe:

Feed recipe
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1688220087(BasicNewsRecipe):
    """Fetch the Admiral Cloudberg Medium blog via its hidden RSS feed.

    Medium feeds ship the full article text, so no per-article scraping
    is needed; this recipe only strips tracking pixels and restyles the
    decorative separators. NOTE: the feed exposes only the ~10 newest
    articles, regardless of max_articles_per_feed.
    """
    title = 'Plane Crash Series (extracted)'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60          # days
    max_articles_per_feed = 100
    no_stylesheets = True
    auto_cleanup = False

    feeds = [
        ('Admiral Cloudberg', 'https://admiralcloudberg.medium.com/feed'),
    ]
    # Medium's obfuscated class names — presumably the article footer;
    # these change whenever Medium redeploys, so expect breakage.
    remove_tags_after = [dict(name='div', class_='ob oc od oe of l bw')]
    # 1x1 images are tracking pixels.
    remove_tags = [dict(name='img', width='1', height='1')]
    extra_css      = 'div > div > img {max-width: 100%} \n div > div {margin-left: 10%; margin-right: 10%} \n div > div > div {font-size: 0.8rem; font-style: italic; margin-left: 0; margin-right: 0} \n p { margin: 2em 0} \n p.separator {text-align: center} \n hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}'
    preprocess_regexps = [
        # Centre the decorative "◊◊◊" section separators.
        (re.compile(r'<p>◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        # Replace runs of underscores with a styled horizontal rule.
        (re.compile(r'<p>_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]


This recipe gets the full text from the feed itself and works perfectly. The only problem with it is that the feed lists only the latest 10 articles, and ideally I'd like a way bigger number. So I read the manual and embarked on making a much more complicated recipe, based on the complete list of articles found here on Reddit.

After some hours of frustration, print statements, and regex hell, I got this more than acceptable result:

New recipe
Spoiler:
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1688290526(BasicNewsRecipe):
    """Build the full Admiral Cloudberg article list from the Reddit archive.

    parse_index() scrapes the pinned archive thread on r/AdmiralCloudberg:
    every <p> sibling between the "Reddit archives:" header and the closing
    <hr> is one article entry, from which the title, date, and Medium URL
    are extracted.
    """
    title = 'Reddit-PLANE'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 60       # days
    max_articles_per_feed = 3
    no_stylesheets = True
    auto_cleanup = True
    # keep_only_tags = [dict(name='article')]
    # remove_tags_after = [dict(name='div', class_='ds dt du dv dw l')]
    # Drop 1x1 tracking pixels and Medium's text-to-speech widget.
    remove_tags = [dict(name='img', width='1', height='1'),
                   dict(name='div', class_='speechify-ignore ab co')]
    extra_css      = 'img {max-width: 100%} \n div.paragraph-image {width:90%; margin-left: auto; margin-right: auto} \n div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic} \n p.separator {text-align: center} \n hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}'
    # Medium's obfuscated paragraph class string is unstable; this matches
    # the markup as served at the time of writing.
    _PARA = (r'<p id=".{0,6}" class="pw-post-body-paragraph lg lh ev li b lj '
             r'lk ll lm ln lo lp lq lr ls lt lu lv lw lx ly lz ma mb mc md eo bj">')
    preprocess_regexps = [
        # Centre the decorative "◊◊◊" section separators.
        (re.compile(_PARA + r'◊◊◊</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<p class="separator">◊◊◊</p>'),
        # Replace runs of underscores with a styled horizontal rule.
        (re.compile(_PARA + r'_{10,100}?</p>', re.DOTALL | re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]

    def parse_index(self):
        """Return [(feed_title, [article dicts])] scraped from the archive post."""
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup(
            'https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/'
            'plane_crash_series_archive_patreon_contact_info/')
        header = soup.find('p', string='Reddit archives:')
        if header is None:
            # Reddit changed the post layout; fail loudly rather than
            # returning an empty book.
            raise ValueError('Could not find the "Reddit archives:" header')
        # Walk the siblings after the header; each <p> is one archive entry
        # and the run ends at the first non-<p> (the closing <hr>).
        node = header.find_next_sibling('p')
        while node is not None and node.name == 'p':
            text = str(node.contents)
            # Leading entry number, e.g. "137", and the link description text.
            title_num = re.sub(r'\[\'(\d{1,3}).+?\]', r'\1', text, count=1)
            title_desc = re.sub(r'\[\'.+?>(.+?)</a>.+?</a>]', r'\1', text, count=1)
            title = title_num + ') ' + title_desc
            # The publication date (m/d/yy) is embedded in the description.
            date = re.sub(r'.+(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            # Reset per entry so a paragraph without a "Medium" link does not
            # silently reuse the previous article's URL.
            url = None
            for link in node.find_all('a', string='Medium'):
                url = link.get('href')
            if url:
                articles[feed_title].append(dict(title=title, url=url, date=date))
            node = node.find_next_sibling()
        return [(feed_title, articles[feed_title])]


The new recipe does all I want it to, except for a big problem: no images are downloaded. The two relevant lines while debugging are:

Code:
Processing images...
Recursion limit reached. Skipping links in https://admiralcloudberg.medium.com/a-sickness-and-its-cure-the-crash-of-trans-colorado-airlines-flight-2286-fab2e7b2fcc3
I tried lots of things: disabling the auto-cleanup, changing the recursion limit, adding a timeout, etc., with no luck. It seems that calibre can't see the images inside the article, even though it could download some decorative ones.

I don't really know Python apart from the basics, so any help debugging or with the parameters would be really appreciated
413Michele is offline   Reply With Quote