#!/usr/bin/env python
# vim:fileencoding=utf-8
import re

from calibre.web.feeds.news import BasicNewsRecipe

class AdvancedUserRecipe1688290526(BasicNewsRecipe):
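    # Builds a periodical from Admiral Cloudberg's Plane Crash Series: the
    # article index is scraped from the pinned Reddit archive thread and the
    # article bodies are fetched from Medium.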
    title          = 'Plane Crash Series (Reddit)'
    language = 'en'
    publication_type = 'blog'
    oldest_article = 120
    max_articles_per_feed = 5
    no_stylesheets = True
    auto_cleanup   = False
    compress_news_images = True
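    # Per the calibre docs, images are recompressed so they do not exceed
    # roughly (width * height) / compress_news_images_auto_size KB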
    compress_news_images_auto_size = 12
    # compress_news_images_max_size = 600
    keep_only_tags = [dict(name='article')]
    remove_tags = [
        dict(name='img', width='1', height='1'),             # tracking pixels
        dict(name='div', class_='speechify-ignore ab co'),   # page chrome around the article body
        dict(name='article', class_='sf'),
    ]
    extra_css      = '''
        img {max-width: 100%}
        div.paragraph-image {width: 90%; margin-left: auto; margin-right: auto; text-align: center}
        div.paragraph-image div:nth-child(2) {font-size: 0.8rem; font-style: italic}
        p.separator {text-align: center; font-size: 1.2em}
        hr.line {border: none; border-top: 0.15em dashed black; margin: 3em 5% 2em 0}
        blockquote {font-style: italic}
    '''
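    # Medium renders section breaks as a paragraph of '◊◊◊' and horizontal
    # rules as long runs of underscores; restyle both before conversion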
    preprocess_regexps = [
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph [\w\s]+?">◊◊◊</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<p class="separator">◊ ◊ ◊</p>'),
        (re.compile(r'<p id=".{0,6}" class="pw-post-body-paragraph [\w\s]+?">_{10,100}?</p>', re.DOTALL|re.IGNORECASE),
         lambda match: '<hr class="line"/>'),
    ]
    # Gets cover from Medium's About page
    def get_cover_url(self):
        cover_url = None
        soup = self.index_to_soup('https://admiralcloudberg.medium.com/about')
        # The page embeds image URLs inside inline <style> blocks;
        # keep the last .jpeg URL found, or None if there is no match
        for style in soup.find_all('style'):
            for element in style:
                match = re.search(r'https://.+?\.jpeg', str(element))
                if match is not None:
                    cover_url = match.group()
        return cover_url
    # Gets articles index from Reddit
    def parse_index(self):
        feed_title = 'Plane Crash Series'
        articles = {feed_title: []}
        soup = self.index_to_soup('https://old.reddit.com/r/AdmiralCloudberg/comments/e6n80m/plane_crash_series_archive_patreon_contact_info/')
        # The archive thread lists one article per <p> sibling after the
        # 'Reddit archives:' header; stop at the first non-<p> sibling (an <hr>)
        header = soup.find('p', string='Reddit archives:')
        entry = header.find_next_sibling('p')
        while entry is not None and entry.name == 'p':
            p = str(entry.contents)
            title_num = re.sub(r"\['(\d{1,3}).+?\]", r'\1', p, count=1)
            title_desc = re.sub(r"\['.+?>(.+?)</a>.+?</a>]", r'\1', p, count=1)
            title = title_num + ') ' + title_desc
            date = re.sub(r'.*?(\d{1,2}/\d{1,2}/\d{2}).+', r'\1', title_desc, count=1)
            url = ''
            for link in entry.find_all('a', string='Medium'):
                url = link.get('href')
            articles[feed_title].append(dict(title=title, url=url, date=date))
            entry = entry.find_next_sibling()
        return [(feed_title, articles[feed_title])]
    def preprocess_html(self, soup):
        # with open('your\\debug\\path\\soup.html', 'a') as f:          # Useful for debugging auto_cleanup
        #     print(soup, file=f)
        for picture in soup.find_all('picture'):
            img_link = None
            # There is another <source> tag with the non-webp links, but those images may be bigger
            for source in picture.find_all('source', attrs={'type': 'image/webp'}):
                img_links = source['srcset']
                # Rewrite the first srcset entry into a full-size miro.medium.com URL
                img_link = re.sub(r'https://miro\.medium\.com/v2/resize:fit:\d{2,4}/(format:webp/)?(.+?)(\.png|\.jpg|\.gif)?\s.+\d{2,4}w', r'https://miro.medium.com/v2/\1\2\3', img_links, count=1)
            if img_link is not None:
                for img in picture.find_all('img'):
                    img['src'] = img_link
        return soup
    def postprocess_html(self, soup, first_fetch):
        for source in soup.find_all('source'):          # removes <source> tags from inside <picture>
            source.decompose()
        for span in soup.find_all('span'):              # unwraps spans with no text content
            if len(span.get_text()) == 0:
                span.unwrap()
        for image in soup.find_all('img'):              # strips bloat attributes from all <img>
            for attr in ('class', 'width', 'height', 'loading', 'role'):
                del image[attr]
        return soup