#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1688290526(BasicNewsRecipe):
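    # Builds a periodical from Admiral Cloudberg's Plane Crash Series: the
    # article index is scraped from the pinned Reddit archive thread and the
    # article bodies are fetched from Medium.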
    title          = 'Plane Crash Series (Reddit)'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 120
    max_articles_per_feed = 5
    no_stylesheets = True
    auto_cleanup   = False
    compress_news_images = True
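    # Per the calibre docs, images are recompressed so they do not exceed
    # roughly (width * height) / compress_news_images_auto_size KB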
    compress_news_images_auto_size = 12
    # compress_news_images_max_size = 600
    keep_only_tags = [dict(name='article')]
    remove_tags = [
        dict(name='img', width='1', height='1'),             # tracking pixels
        dict(name='div', class_='speechify-ignore ab co'),   # page chrome around the article body
        dict(name='article', class_='sf'),
    ]
    extra_css      = '''
        img {max-width: 100%}
        div.paragraph-image {width: 90%; margin-left: auto; margin-right: auto; text-align: center}
        div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic}
        p.separator {text-align: center; font-size: 1.2em}
        hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}
        blockquote {font-style: italic}
    '''
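    # Medium renders section breaks as a paragraph of '◊◊◊' and horizontal
    # rules as long runs of underscores; restyle both before conversion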
    preprocess_regexps = [
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph [\w\s]+?">◊◊◊</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<p class="separator">◊ ◊ ◊</p>'),
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph [\w\s]+?">_{10,100}?</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
    # Gets cover from Medium's About page
    def get_cover_url(self):
        cover_url = None
        soup = self.index_to_soup('https://admiralcloudberg.medium.com/about')
        # The page embeds image URLs inside inline <style> blocks;
        # keep the last .jpeg URL found, or None if there is no match
        for style in soup.find_all('style'):
            for element in style:
                match = re.search(r'https://.+?\.jpeg', str(element))
                if match is not None:
                    cover_url = match.group()
        return cover_url
    # Gets articles index from Reddit
    def parse_index(self):
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        # The archive thread lists one article per <p> sibling after the
        # 'Reddit archives:' header; stop at the first non-<p> sibling (an <hr>)
        header = soup.find('p', string='Reddit archives:')
        entry = header.find_next_sibling('p')
        while entry is not None and entry.name == 'p':
            p = str(entry.contents)
            title_num = re.sub(r"\['(\d{1,3}).+?\]", r'\1', p, count=1)
            title_desc = re.sub(r"\['.+?>(.+?)</a>.+?</a>]", r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.*?(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            url = ''
            for link in entry.find_all('a', string='Medium'):
                url = link.get('href')
            articles[feed_title].append(dict(title=title, url=url, date=date))
            entry = entry.find_next_sibling()
        return [(feed_title, articles[feed_title])]
    def preprocess_html(self, soup):
        # with open('your\\debug\\path\\soup.html', 'a') as f:          # Useful for debugging auto_cleanup
        #     print(soup, file=f)
        for picture in soup.find_all('picture'):
            img_link = None
            # There is another <source> tag with the non-webp links, but those images may be bigger
            for source in picture.find_all('source', attrs={'type': 'image/webp'}):
                img_links = source['srcset']
                # Rewrite the first srcset entry into a full-size miro.medium.com URL
                img_link = re.sub(r'https://miro\.medium\.com/v2/resize:fit:\d{2,4}/(format:webp/)?(.+?)(\.png|\.jpg|\.gif)?\s.+\d{2,4}w', r'https://miro.medium.com/v2/\1\2\3', img_links, count=1)
            if img_link is not None:
                for img in picture.find_all('img'):
                    img['src'] = img_link
        return soup
    def postprocess_html(self, soup, first_fetch):
        for source in soup.find_all('source'):          # removes <source> tags from inside <picture>
            source.decompose()
        for span in soup.find_all('span'):              # unwraps spans with no text content
            if len(span.get_text()) == 0:
                span.unwrap()
        for image in soup.find_all('img'):              # strips bloat attributes from all <img>
            for attr in ('class', 'width', 'height', 'loading', 'role'):
                del image[attr]
        return soup