View Single Post
Old 12-06-2024, 07:30 AM   #5
oneillpt
Connoisseur
oneillpt began at the beginning.
 
Posts: 63
Karma: 46
Join Date: Feb 2011
Device: Kindle 3 (cracked screen!); PW1; Oasis
Updated Popular Science recipe

Required replacing 'PostsContainer' with 'category-content-wrapper lg:pb-12' (the 'lg:pb-12' part is possibly not needed), 'PostItem' with 'card-post', 'PostItem-link' with 'card-post-image-link', 'PostItem-title' with 'card-post-title', and 'PostItem-excerpt' with 'card-post-excerpt'. I have left the corresponding original lines commented out in the recipe.

Also added the classes article-title, article-dek, article-paragraph, and articlebody to the original keep_only_tags. (I have left the original keep_only_tags entries unchanged although they are no longer used; they might be needed again for some future revision of the publication's source coding.)

I have also left some 'self.log' statements in the recipe (some commented out, some still active) as these may be helpful to anyone learning recipe construction.
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2020, Kovid Goyal <kovid at kovidgoyal.net>

from __future__ import absolute_import, division, print_function, unicode_literals

from calibre.web.feeds.news import BasicNewsRecipe


def classes(classes):
    """Build a BeautifulSoup ``find``/``findAll`` keyword query that matches
    any element carrying at least one of the space-separated CSS classes."""
    wanted = frozenset(classes.split(' '))

    def has_any_class(value):
        # value is the element's class string (or None when absent)
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_any_class}}


class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Popular Science recipe, updated for the site's redesigned markup
    ('PostsContainer' → 'category-content-wrapper', 'PostItem*' → 'card-post*')."""
    title = 'Popular Science 2'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Popular Science'
    publisher = 'Popular Science'
    max_articles_per_feed = 100
    ignore_duplicate_articles = {'url'}
    no_stylesheets = True
    timefmt = ' [%a, %d %b, %Y  %H:%M]'
    # The 'Article-*' classes are from the previous site design and are kept
    # in case the publisher reverts; the 'article-*'/'articlebody' classes
    # match the current markup.
    keep_only_tags = [
        classes('Article-header Article-excerpt Article-author Article-thumbnail Article-bodyText article-title article-dek article-paragraph articlebody'),
    ]
    remove_tags = [
        dict(name='section', attrs={'class': ['recurrent-share']})
    ]

    def parse_section_index(self, slug):
        """Yield {'title', 'url', 'description'} dicts for every article card
        found on the section page https://www.popsci.com/<slug>/."""
        url = 'https://www.popsci.com/{}/'.format(slug)
        soup = self.index_to_soup(url)
        # Section listing wrapper; 'lg:pb-12' is a Tailwind utility class and
        # may not be required for the match.
        main = soup.find(**classes('category-content-wrapper lg:pb-12'))
        if main is None:
            self.log('No article container found for section:', slug)
            return
        for div in main.findAll(**classes('card-post')):
            a = div.find('a', href=True, **classes('card-post-image-link'))
            if a is None:
                # Card without an article link (e.g. an ad slot) — skip it
                # instead of crashing on a['href'].
                continue
            url = a['href']
            title = self.tag_to_string(div.find(**classes('card-post-title')))
            desc = ''
            dek = div.find(**classes('card-post-excerpt'))
            if dek is not None:
                desc = self.tag_to_string(dek)
            self.log('title:', title, 'url:', url, 'description:', desc)
            yield {'title': title, 'url': url, 'description': desc}

    def parse_index(self):
        """Build the feed list: one (section title, articles) pair per slug,
        omitting sections that yielded no articles."""
        sections = []
        for slug, title in {
            'science': 'Science',
            'technology': 'Technology',
            'diy': 'DIY',
            'reviews': 'Reviews',
        }.items():
            self.log('slug:', slug)
            articles = list(self.parse_section_index(slug))
            if articles:
                sections.append((title, articles))
        return sections

    def preprocess_html(self, soup):
        """Promote the medium-resolution lazy-load source to the real src so
        images survive conversion."""
        for img in soup.findAll('img', attrs={'data-medsrc': True}):
            img['src'] = img['data-medsrc']
        return soup


# NOTE(review): stray module-level assignment — appears to be a paste artifact
# from the forum post rather than part of the recipe; the recipe code above
# never reads it. Confirm whether calibre consumes this name before removing.
calibre_most_common_ua = 'Gen Software Updater'

Last edited by PeterT; 12-06-2024 at 08:00 AM. Reason: added in code tags
oneillpt is offline   Reply With Quote