Thread: HTTP/3 support?
View Single Post
Old 07-26-2024, 05:24 AM   #6
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 45,377
Karma: 27230406
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
Something like this (untested)

Code:
#!/usr/bin/env python
import threading
from io import BytesIO

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.scraper.simple import read_url
from calibre.web.feeds.news import BasicNewsRecipe, classes


def absurl(url):
    '''Return *url* as an absolute https://www.science.org address.

    Site-relative links (those beginning with '/') get the site origin
    prepended; every other value is returned unchanged.
    '''
    return 'https://www.science.org' + url if url.startswith('/') else url



class Response(BytesIO):
    '''File-like HTTP response stand-in.

    Holds the page text as UTF-8 bytes (readable via the inherited
    BytesIO interface) and remembers the URL it came from, mimicking
    the response object a mechanize-style browser returns.
    '''

    def __init__(self, url, text):
        # The body is stored encoded, so callers read bytes as they
        # would from a real network response.
        BytesIO.__init__(self, text.encode('utf-8'))
        self.url = url

    def geturl(self):
        # mechanize-compatible accessor for the originating URL.
        return self.url


class Browser:
    '''Minimal browser-like object handed to calibre in place of
    mechanize: it fetches pages through calibre's scraper backend
    (``read_url``), so sites that reject plain HTTP clients still load.
    '''

    def __init__(self, local):
        # Thread-local storage shared with the owning recipe; read_url()
        # keeps its per-thread scraper state in this list.
        self.thread_local = local
        self.thread_local.storage = []

    def open(self, url, *a, **kw):
        # BUG FIX: the original called Response(read_url(...), url),
        # i.e. passed the downloaded HTML as the *url* argument and the
        # URL string as the body, so geturl() returned page markup and
        # read() returned the URL bytes. Response's signature is
        # (url, text) — pass the arguments in that order.
        return Response(url, read_url(self.thread_local.storage, url))
    open_novisit = open


class science(BasicNewsRecipe):
    '''Calibre news recipe for the current weekly issue of Science
    (science.org).

    The table of contents page is scraped in parse_index(); article
    pages are fetched through the Browser/Response helpers defined
    earlier in this file, which route requests via calibre's scraper.
    '''
    title = 'Science Journal'
    __author__ = 'unkn0wn'
    description = (
        'Science continues to publish the very best in research across the sciences, with articles that '
        'consistently rank among the most cited in the world.'
    )
    encoding = 'utf-8'
    # NOTE(review): BasicNewsRecipe's documented attribute is usually
    # remove_javascript — confirm no_javascript is actually recognized.
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    language = 'en'
    # One article at a time — presumably because the scraper-backed
    # Browser is not safe for concurrent use; confirm before raising.
    simultaneous_downloads = 1

    extra_css = '''
        .news-article__figure__caption {font-size:small; text-align:center;}
        .contributors, .core-self-citation, .meta-panel__left-content, .news-article__hero__top-meta,
		.news-article__hero__bottom-meta, #bibliography, #elettersSection {font-size:small;}
        img {display:block; margin:0 auto;}
        .core-lede {font-style:italic; color:#202020;}
    '''

    # Some articles appear under more than one TOC section; keep only
    # the first occurrence of each URL.
    ignore_duplicate_articles = {'url'}

    keep_only_tags = [
        classes('meta-panel__left-content news-article__hero__info news-article__hero__figure bodySection'),
        dict(name='h1', attrs={'property':'name'}),
        classes('core-lede contributors core-self-citation'),
        dict(attrs={'data-core-wrapper':'content'})
    ]

    remove_tags = [
        classes('pb-ad')  # advertisement containers
    ]

    def __init__(self, *a, **kw):
        # Thread-local storage shared with every Browser this recipe
        # creates (see get_browser); created before the base class
        # __init__ runs so it exists when downloads start.
        self.thread_local = threading.local()
        super().__init__(*a, **kw)

    def get_browser(self, *a, **kw):
        # Hand calibre the scraper-backed Browser instead of the
        # default mechanize browser.
        return Browser(self.thread_local)

    def preprocess_html(self, soup):
        # The site marks paragraphs with role="paragraph" on generic
        # elements; rename them to real <p> tags so the conversion
        # produces clean paragraph markup.
        for p in soup.findAll(attrs={'role':'paragraph'}):
            p.name = 'p'
        return soup

    def parse_index(self):
        '''Scrape the current-issue TOC and return [(section, articles)]
        in the format BasicNewsRecipe expects.

        Side effects: sets self.timefmt to the issue volume/date and
        self.description to the issue details when those elements are
        found on the page.
        '''
        url = 'https://www.science.org/toc/science/current'

        soup = BeautifulSoup(read_url([], url))
        # Issue volume/date string, shown next to the title.
        tme = soup.find(**classes('journal-issue__vol'))
        if tme:
            self.timefmt = ' [%s]' % self.tag_to_string(tme).strip()
        det = soup.find(attrs={'id':'journal-issue-details'})
        if det:
            self.description = self.tag_to_string(det).strip()

        feeds = []

        # One feed per TOC section; each "card" inside a section is one
        # article. NOTE(review): div / ti.a are assumed present — a page
        # layout change would raise AttributeError here.
        div = soup.find('div', attrs={'class':'toc__body'})
        for sec in div.findAll('section', **classes('toc__section')):
            name = sec.find(**classes('sidebar-article-title--decorated'))
            section = self.tag_to_string(name).strip()
            self.log(section)

            articles = []

            for card in sec.findAll(**classes('card-header')):
                ti = card.find(**classes('article-title'))
                url = absurl(ti.a['href'])
                title = self.tag_to_string(ti).strip()
                desc = ''
                meta = card.find(**classes('card-meta'))
                if meta:
                    desc = self.tag_to_string(meta).strip()
                self.log('          ', title, '\n\t', desc, '\n\t', url)
                articles.append({'title': title, 'description':desc, 'url': url})
            feeds.append((section, articles))
        return feeds
kovidgoyal is online now   Reply With Quote