Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 01-08-2019, 05:41 PM   #1
lui1
Member
lui1 began at the beginning.
 
Posts: 13
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
Nature Journal Recipe

Here is a recipe for the 'Nature' journal.

Nature Recipe
Code:
#!/usr/bin/env python2

from collections import defaultdict
from calibre.web.feeds.news import BasicNewsRecipe

BASE = 'https://www.nature.com'


def absurl(url):
    """Return *url* as an absolute URL on the nature.com host, forcing https.

    Site-relative paths are prefixed with BASE; explicit http:// URLs are
    upgraded to https; anything else is returned unchanged.
    """
    if url.startswith('/'):
        return BASE + url
    if url.startswith('http://'):
        return 'https' + url[4:]
    return url


def check_words(words):
    """Build an attribute matcher for BeautifulSoup lookups.

    The returned callable is truthy when the candidate attribute value
    shares at least one whitespace-separated word with *words*.
    """
    wanted = frozenset(words.split())

    def matcher(value):
        return value and wanted.intersection(value.split())

    return matcher


class Nature(BasicNewsRecipe):
    """Calibre news recipe that builds an e-book from Nature's current issue."""

    title = 'Nature'
    __author__ = 'Jose Ortiz'
    description = ('Nature is a weekly international multidisciplinary scientific journal'
                   ' publishing peer-reviewed research in all fields of science and'
                   ' technology on the basis of its originality, importance,'
                   ' interdisciplinary interest, timeliness, accessibility, elegance and'
                   ' surprising conclusions.  Nature also provides rapid, authoritative,'
                   ' insightful and arresting news and interpretation of topical and coming'
                   ' trends affecting science, scientists and the wider public.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True  # NOTE(review): BasicNewsRecipe's documented flag is remove_javascript — confirm this name is honored
    no_stylesheets = True

    # Keep only the main article container; strip elements hidden in print view.
    keep_only_tags = [
        dict(name='div', attrs={'data-component': check_words('article-container')})
    ]

    remove_tags = [
        dict(attrs={'class': check_words('hide-print')})
    ]

    def parse_index(self):
        """Scrape the current-issue page; return [(section_title, [article_dict, ...]), ...]."""
        soup = self.index_to_soup(BASE + '/nature/current-issue')
        # Word-wise match (fix): the original compared the data-test attribute
        # for exact equality, which breaks as soon as the attribute carries any
        # extra token; every other lookup here already uses check_words.
        self.cover_url = 'https:' + soup.find('img', attrs={'data-test': check_words('issue-cover-image')})['src']
        section_tags = soup.find('div', {'data-container-type': check_words('issue-section-list')})
        section_tags = section_tags.findAll('div', {'class': check_words('article-section')})

        sections = defaultdict(list)
        ordered_sec_titles = []  # remembers the on-page order of the sections
        index = []

        for sec in section_tags:
            sec_title = self.tag_to_string(sec.find('h2'))
            ordered_sec_titles.append(sec_title)
            for article in sec.findAll('article'):
                title = self.tag_to_string(article.find('h3', {'itemprop': check_words('name headline')}))
                date = ' [' + self.tag_to_string(article.find('time', {'itemprop': check_words('datePublished')})) + ']'
                author = self.tag_to_string(article.find('li', {'itemprop': check_words('creator')}))
                url = absurl(article.find('a', {'itemprop': check_words('url')})['href'])
                # Article-type label (e.g. News, Letter) is prefixed to the description.
                label = self.tag_to_string(article.find(attrs={'data-test': check_words('article.type')}))
                description = label + ': ' + self.tag_to_string(article.find('div', attrs={'itemprop': check_words('description')}))
                sections[sec_title].append(
                    {'title': title, 'url': url, 'description': description, 'date': date, 'author': author})

        # Emit sections in the order they appeared on the page.
        for k in ordered_sec_titles:
            index.append((k, sections[k]))
        return index

    def preprocess_html(self, soup):
        """Copy lazy-load 'data-src' image URLs into 'src' so images are fetched."""
        for img in soup.findAll('img', {'data-src': True}):
            if img['data-src'].startswith('//'):
                # Protocol-relative URL: pin it to https.
                img['src'] = 'https:' + img['data-src']
            else:
                img['src'] = img['data-src']
        return soup
lui1 is offline   Reply With Quote
Old 01-09-2019, 12:39 AM   #2
kovidgoyal
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
 
kovidgoyal's Avatar
 
Posts: 33,930
Karma: 10254170
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
thanks, added.
kovidgoyal is offline   Reply With Quote
Old 01-17-2019, 04:04 PM   #3
lui1
Member
lui1 began at the beginning.
 
Posts: 13
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
Update to Nature

Hello Kovid, thanks for adding my recipe. Here's an update that fixes an error I found this morning.

Update to Nature:
Code:
#!/usr/bin/env python2

from collections import defaultdict
from calibre.web.feeds.news import BasicNewsRecipe

BASE = 'https://www.nature.com'


def absurl(url):
    """Absolutize *url* against nature.com and upgrade plain http to https."""
    if url.startswith('/'):
        # Site-relative path: anchor it to the nature.com origin.
        url = '%s%s' % (BASE, url)
    elif url.startswith('http://'):
        # Swap the scheme, keeping the '://...' tail intact.
        url = 'https' + url[4:]
    return url


def check_words(words):
    """Attribute matcher: truthy when the value shares any word with *words*."""
    targets = frozenset(words.split())
    return lambda value: value and targets & frozenset(value.split())


def has_all_of(words):
    """Attribute matcher: truthy only when every word of *words* is present."""
    required = frozenset(words.split())

    def matcher(value):
        return value and required.issubset(value.split())

    return matcher

    
class Nature(BasicNewsRecipe):
    """Calibre news recipe that builds an e-book from Nature's current issue.

    Updated version: skips link-less article teasers and de-duplicates
    repeated article containers in the fetched pages.
    """

    title = 'Nature'
    __author__ = 'Jose Ortiz'
    description = ('Nature is a weekly international multidisciplinary scientific journal'
                   ' publishing peer-reviewed research in all fields of science and'
                   ' technology on the basis of its originality, importance,'
                   ' interdisciplinary interest, timeliness, accessibility, elegance and'
                   ' surprising conclusions.  Nature also provides rapid, authoritative,'
                   ' insightful and arresting news and interpretation of topical and coming'
                   ' trends affecting science, scientists and the wider public.')
    language = 'en'
    encoding = 'UTF-8'
    no_javascript = True  # NOTE(review): BasicNewsRecipe's documented flag is remove_javascript — confirm this name is honored
    no_stylesheets = True

    # Keep only the main article container; strip elements hidden in print view.
    keep_only_tags = [
        dict(name='div', attrs={'data-component': check_words('article-container')})
    ]

    remove_tags = [
        dict(attrs={'class': check_words('hide-print')})
    ]

    def parse_index(self):
        """Scrape the current-issue page; return [(section_title, [article_dict, ...]), ...]."""
        soup = self.index_to_soup(BASE + '/nature/current-issue')
        self.cover_url = 'https:' + soup.find('img', attrs={'data-test': check_words('issue-cover-image')})['src']
        section_tags = soup.find('div', {'data-container-type': check_words('issue-section-list')})
        section_tags = section_tags.findAll('div', {'class': check_words('article-section')})

        sections = defaultdict(list)
        ordered_sec_titles = []  # remembers the on-page order of the sections
        index = []

        for sec in section_tags:
            sec_title = self.tag_to_string(sec.find('h2'))
            ordered_sec_titles.append(sec_title)
            for article in sec.findAll('article'):
                # Entries without an <a itemprop="url"> make find() return None,
                # so subscripting ['href'] raises TypeError: skip those entries
                # instead of aborting the whole build.
                try:
                    url = absurl(article.find('a', {'itemprop': check_words('url')})['href'])
                except TypeError:
                    continue
                title = self.tag_to_string(article.find('h3', {'itemprop': has_all_of('name headline')}))
                date = ' [' + self.tag_to_string(article.find('time', {'itemprop': check_words('datePublished')})) + ']'
                author = self.tag_to_string(article.find('li', {'itemprop': check_words('creator')}))
                # Description is "<article type> • <summary>".
                description = self.tag_to_string(article.find(attrs={'data-test': check_words('article.type')})) + u' • '
                description += self.tag_to_string(article.find('div', attrs={'itemprop': check_words('description')}))
                sections[sec_title].append(
                    {'title': title, 'url': url, 'description': description, 'date': date, 'author': author})

        # Emit sections in the order they appeared on the page.
        for k in ordered_sec_titles:
            index.append((k, sections[k]))
        return index

    def preprocess_html(self, soup):
        """Fix up fetched article pages before conversion."""
        # Copy lazy-load 'data-src' image URLs into 'src' so images are fetched.
        for img in soup.findAll('img', {'data-src': True}):
            if img['data-src'].startswith('//'):
                # Protocol-relative URL: pin it to https.
                img['src'] = 'https:' + img['data-src']
            else:
                img['src'] = img['data-src']
        # Keep only the first article container; drop any duplicates.
        for div in soup.findAll('div', {'data-component': check_words('article-container')})[1:]:
            div.extract()
        return soup
lui1 is offline   Reply With Quote
Reply

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Nature (journal) special on scientific publishing jehane News 9 07-03-2013 12:53 PM
Nature news - updated recipe Alexis Recipes 3 10-05-2012 03:36 PM
Nature recipe request whitecow Recipes 0 03-13-2012 03:28 PM
bbc nature recipe update scissors Recipes 0 01-28-2012 04:58 AM
BBC Nature Recipe scissors Recipes 0 12-28-2011 05:44 AM


All times are GMT -4. The time now is 05:58 PM.


MobileRead.com is a privately owned, operated and funded community.