MobileRead Forums - View Single Post

flobotnik · 08-19-2023, 06:55 PM

Hi there,

This needed updating. Here it is:

Code:

from calibre.web.feeds.news import BasicNewsRecipe, classes


class TheBaffler(BasicNewsRecipe):
    """Calibre news recipe that downloads the latest issue of The Baffler.

    The issue listing at https://thebaffler.com/issues is fetched, its first
    ``<article>`` is treated as the current issue, and one feed is built for
    every ``<section>`` found in that issue's table of contents.
    """

    title = 'The Baffler'
    __author__ = 'flobotnik'  # based on unkn0wn's version
    description = ('This magazine contains left-wing criticism, cultural analysis, shorts'
                   ' stories, poems and art.  They publish six print issues annually.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']

    # NOTE: the site uses the Tailwind class name "lg:text-xs". A literal ':'
    # in a CSS class selector has to be escaped, otherwise the rule is parsed
    # as class "lg" followed by a pseudo-class and never matches.
    extra_css = '''
        .entry-subtitle{color:#202020; font-style:italic; text-align:left;}
        blockquote{color:gray;}
        em{color:#404040;}
        .wp-caption-text{font-size:small; text-align:center;}
        .lg\\:text-xs{color:gray; font-size:small; text-align:center;}
        .author-meta{font-size:small; color:gray;}
    '''

    # Only the <main id="main"> element of an article page is kept.
    keep_only_tags = [
        dict(name='main', attrs={'id': 'main'})
    ]

    # Page furniture that is stripped from every article.
    remove_tags = [
        classes('entry-date issue-number-segment single-article-vertical donation-footer story-footer ml-4 mt-14'),
        dict(name='footer'),
        dict(name='a', class_='ml-4 pr-px font-sans text-sm lg:text-xs whitespace-nowrap')
    ]

    def get_cover_url(self):
        """Return the cover image URL scraped from the Exact Editions shop page.

        The ``src`` of the first ``<img>`` inside the first ``div.row`` is
        stored on ``self.cover_url``. When no such image is found the
        previously set ``cover_url`` (if any) is returned unchanged.
        """
        soup = self.index_to_soup('https://shop.exacteditions.com/us/the-baffler')
        row = soup.find('div', attrs={'class': 'row'})
        img = row.find('img') if row is not None else None
        # Guard against a row without an image (or an image without a src)
        # instead of crashing on a None subscript.
        if img is not None and img.get('src'):
            self.cover_url = img['src']
        return getattr(self, 'cover_url', None)

    def parse_index(self):
        """Build the list of feeds for the most recent issue.

        Side effects: updates ``self.title``, ``self.timefmt`` and
        ``self.description`` with details of the issue when they are present
        on the listing page.

        Returns:
            A list of ``(section_title, articles)`` tuples, where ``articles``
            is a list of dicts with the keys ``title``, ``url`` and
            ``description``. Sections without any linked article are omitted.

        Raises:
            ValueError: if the listing page has no issue entry or issue link,
                or the issue page has no ``<main id="main">`` element.
        """
        soup = self.index_to_soup('https://thebaffler.com/issues')
        issue = soup.find('article')
        if issue is None:
            raise ValueError('No issue found at https://thebaffler.com/issues')

        # The heading is expected to look like "<number> — <edition name>";
        # take the part after the first em dash, if there is one.
        heading = issue.find('h2')
        parts = self.tag_to_string(heading).strip().split('—') if heading is not None else []
        edition = parts[1].strip() if len(parts) > 1 else ''
        if edition:
            self.log('Downloading Issue: ', edition)
            self.title = 'The Baffler : ' + edition

        date_tag = issue.find('div', **classes('font-lion'))
        if date_tag is not None:
            self.timefmt = ' [' + self.tag_to_string(date_tag).strip() + ']'

        a = issue.find('a')
        if a is None or not a.get('href'):
            raise ValueError('Latest issue has no link to its table of contents')

        self.description = (
            u'\n\n' + self.tag_to_string(a).strip() +
            u'\n\n' + self.description
        )

        soup = self.index_to_soup(a['href'])
        main = soup.find('main', attrs={'id': 'main'})
        if main is None:
            raise ValueError('Issue page has no <main id="main"> element')

        ans = []
        for section in main.findAll('section'):
            current_section = self.tag_to_string(section.h3).strip()
            self.log(current_section)
            articles = []
            for h4 in section.findAll('h4'):
                link = h4.a
                # A heading without a link cannot be downloaded; skip it.
                if link is None or not link.get('href'):
                    continue
                title = self.tag_to_string(h4)
                url = link['href']
                desc = ''
                # The first following <span> supplies the description and the
                # one after it is prepended, separated by ' | '.
                span = h4.findNext('span')
                if span is not None:
                    desc = self.tag_to_string(span).strip()
                    # Only look for a second span when the first one exists.
                    span2 = span.findNext('span')
                    if span2 is not None:
                        desc = self.tag_to_string(span2).strip() + ' | ' + desc
                self.log('\t', title, '\n\t', desc, '\n\t\t', url)
                articles.append(
                    {'title': title, 'url': url, 'description': desc})
            if articles:
                ans.append((current_section, articles))
        return ans

    def preprocess_html(self, soup):
        """Promote title and section-heading elements to real heading tags.

        The first ``div.entry-title`` becomes an ``<h1>`` and every
        ``p.parasectionhed`` becomes an ``<h4>``. Returns the modified soup.
        """
        div = soup.find('div', **classes('entry-title'))
        if div:
            div.name = 'h1'
        for p in soup.findAll('p', attrs={'class': 'parasectionhed'}):
            p.name = 'h4'
        return soup