MobileRead Forums - View Single Post

lui1 · 06-10-2019, 08:03 PM

Hello Samdiggly,

This downloads the articles displayed on the homepage.

Recipe for AINOnline:

Code:

#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Jose Ortiz <jlortiz84 at gmail.com>

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
from calibre.web.feeds.recipes import BasicNewsRecipe


# Homepage of the site: fetched to build the article index and used as the
# prefix when turning site-relative links into absolute URLs.
INDEX = 'https://www.ainonline.com/'


def absurl(url):
    """Return ``url`` as an absolute URL.

    A link beginning with ``/`` is treated as site-relative and is appended
    to ``INDEX`` (whose trailing slash replaces the link's leading one).
    Any other value is handed back unchanged.
    """
    if not url.startswith('/'):
        return url
    return INDEX + url[1:]


def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS classes.

    ``classes`` is a space-separated list of class names.  The returned dict
    has a single ``attrs`` entry whose ``class`` matcher is truthy when the
    element's class string shares at least one name with that list.
    """
    wanted = frozenset(classes.split(' '))

    def matches(value):
        # A missing or empty class attribute is returned as-is (falsy).
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': matches}}

class AINOnline(BasicNewsRecipe):
    """Calibre recipe that downloads the articles listed on the AINonline homepage."""

    title = 'Aviation International News'
    __author__ = 'Jose Ortiz'
    description = ('Aviation International News covers all sectors of the aviation'
                   ' industry, from business aviation to air transport to defense and'
                   ' unmanned aerial vehicles.')
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
    remove_javascript = True
    masthead_url = 'https://www.ainonline.com/sites/ainonline.com/themes/ain30/images/ainlogo-small.jpg'
    # Keep only the main article container of each downloaded page ...
    keep_only_tags=[classes('main-content')]
    # ... and drop form controls and anything whose class mentions comments.
    remove_tags = [
        dict(name=['button','input']),
        dict(attrs={'class': lambda x: x and 'comments' in x})
    ]

    # (id of an enclosing element, section title) pairs, tested in this order.
    _SECTION_IDS = (
        ('featured', 'Featured'),
        ('home-top-stories', 'Top Stories'),
        ('quicktabs-container-latest_trending', 'Latest/Trending'),
    )

    def _section_title(self, section):
        """Return the section title for a ``view-content`` element.

        The title is chosen from the id of an enclosing element; elements
        outside every known container fall under ``'Articles'``.
        """
        for element_id, name in self._SECTION_IDS:
            if section.findParent(attrs=dict(id=element_id)) is not None:
                return name
        return 'Articles'

    @staticmethod
    def _article_link(div):
        """Return the ``<a>`` tag holding an article's title, or ``None``."""
        css = div.get('class') or ''
        # Depending on the BeautifulSoup version the class attribute is
        # either a plain string or a list of names; normalise to a string.
        if isinstance(css, (list, tuple)):
            css = ' '.join(css)
        if 'views-row' in css.split():
            holder = div.find(**classes('title'))
            return None if holder is None else holder.a
        if 'featured-story' in css:
            # Featured stories wrap their headline element in the link.
            return div.find(
                lambda tag: tag.name == 'a'
                and tag.find(['h1','h2','h3','h4','h5','h6'])
                is not None)
        return None

    def parse_index(self):
        """Scrape the homepage and return its articles grouped by section.

        Returns a list of ``(section title, articles)`` tuples in order of
        first appearance, where each article is a dict with ``title``,
        ``url`` and ``description`` keys.  Entries without a usable link
        are skipped instead of aborting the whole download.
        """
        soup = self.index_to_soup(INDEX)

        # css selectors for articles
        #     .view-content [class *= 'featured-story']
        #     .view-content .views-row
        article_attrs = {
            'class': lambda x: x and (
                'featured-story' in x
                or {'views-row'}.intersection(x.split()))}

        ans = []
        # section title -> the same list object stored in ``ans``, so several
        # ``view-content`` elements of one section are merged into one entry.
        by_section = {}

        for section in soup.findAll(**classes('view-content')):
            current_section = self._section_title(section)

            articles = []
            for div in section.findAll(attrs=article_attrs):
                a = self._article_link(div)
                href = a.get('href') if a is not None else None
                if not href:
                    # No link to follow: nothing to download for this entry.
                    continue
                desc = ''
                r = div.find(**classes('teaser'))
                if r is not None:
                    desc = self.tag_to_string(r)
                articles.append({
                    'title': self.tag_to_string(a),
                    'url': absurl(href),
                    'description': desc,
                })

            if not articles:
                continue
            if current_section in by_section:
                by_section[current_section].extend(articles)
            else:
                by_section[current_section] = articles
                ans.append((current_section, articles))

        return ans