﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import re
import mechanize
import urllib
import urllib2
import time
import cookielib
import json
import urlparse

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_only_date

from calibre_plugins.overdrive_link.numbers import value_unit
from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook)
from calibre_plugins.overdrive_link.formats import (FORMAT_SCRIBD_READER, FORMAT_SCRIBD_AUDIOBOOK)
from calibre_plugins.overdrive_link.json import js_value
from calibre_plugins.overdrive_link.library import SearchableLibrary
from calibre_plugins.overdrive_link.net import (browse_url, open_url)
from calibre_plugins.overdrive_link.match import (normalize_author, normalize_title)
from calibre_plugins.overdrive_link.parseweb import (LibraryError, class_contains, must_find, text_only)

# When True, find_books searches through the author's own book list (find_books_using_author) whenever
# no title or keyword is given. This avoids large numbers of incorrect results, but misses some books.
SEARCH_BY_AUTHOR = False

# Scribd returns a 'forbidden' (403) error when requests are received too quickly. Requests are paced to
# try to avoid this condition, and a long delay is taken before retrying when it is detected.

OVERALL_MAX_QPS = 1.0       # maximum queries per second for Scribd across all jobs
THROTTLE_DELAY = 300.0      # seconds to delay when throttled by Scribd

# Language names that can be used to restrict a search, mapped to the number sent as the 'language'
# parameter of a Scribd search query. A configured search language that is not listed here results in
# an unrestricted search. (The numbers are presumably Scribd's own language ids -- TODO confirm.)
LANGUAGES = {
    'English': 1,
    'Chinese': 6,
    'Spanish': 4,
    'Arabic': 11,
    'Portuguese': 13,
    'Japanese': 3,
    'German': 9,
    'French': 5,
    'Korean': 7,
    'Turkish': 78,
    'Vietnamese': 103,
    'Russian': 14,
    'Tamil': 104,
    'Italian': 8,
    'Thai': 60,
    'Polish': 89,
    }
    
# Lower-case words taken as evidence that a book description is not written in English. get_book_info
# compares each whitespace-separated word of the description (lower-cased) against this set when the
# configured search language is English.
NON_ENGLISH_WORDS = {
    # German
    "aber", "als", "andere", "anderer", "anderes", "auch", "auf", "aus", "bei", "beispiel",
    "bin", "bis", "da", "damit", "dann", "das", "dass", "dem", "den", "denn", "der", "dich", "dir", "doch",
    "du", "durch", "eigentlich", "ein", "eine", "einen", "er", "erste", "erster", "erstes", "es", "für", "ganz",
    "geben", "gehen", "groß", "habe", "haben", "hier", "ich", "ihm", "ihn", "ihr", "immer", "ist", "ja", "jahr",
    "jede", "jeder", "jedes", "jetzt", "können", "kann", "kein", "kommen", "lassen", "müssen", "machen", "mehr",
    "mein", "mich", "mir", "mit", "nach", "nein", "neu", "nicht", "noch", "nur", "oben", "oder", "sagen", "schon",
    "sehen", "sehr", "sein", "selber", "selbst", "sich", "sie", "sind", "sollen", "stehen", "über", "um", "und",
    "uns", "unser", "unter", "viel", "von", "vor", "weil", "wenn", "werden", "wie", "wieder", "wir", "wissen",
    "wo", "wollen", "zeit", "zu", "zwei",

    # NOTE(review): the following look like Romanian words -- confirm
    "la", "de", "și", "ce", "ca", "un", "cu", "nu", "va", "să", "că", "fi", "mai", 
    }
    
# Character class (as a re pattern) of accented and other letters not used in plain English text.
# get_book_info searches each description word for these, without lower-casing the word first.
NON_ENGLISH_LETTERS = r"[àáâãäăåāăąèéêëēĕėęěìíîïĩīĭįıòóôõöōŏőơùúûüũūŭůűųñńņňýÿƴßșțçćĉċč]"    # as a re pattern
    

class Scribd(SearchableLibrary):
    """
    Access to the Scribd web site (www.scribd.com) as a searchable library: optional sign in,
    searching for books, audiobooks and comics, and retrieval of the details of individual books.
    """
    id = 'sc'           # short identifier for this library type (presumably used as a link prefix -- confirm)
    name = 'Scribd'     # name used in log messages
    formats_supported = {FORMAT_SCRIBD_READER, FORMAT_SCRIBD_AUDIOBOOK}

    @staticmethod    
    def validate_library_id(library_id, migrate=True, config=None):
        if library_id:
            raise ValueError('Scribd library id must be left blank: "%s"' % library_id)

        return library_id
             

    @staticmethod    
    def validate_book_id(book_id, library_id):
        if not re.match(r'^([0-9]+)$', book_id):
            raise ValueError('Scribd book id must be numeric: "%s"' % book_id)
            
        return book_id
            

    @staticmethod    
    def book_url(library_id, book_id):
        # Using /book/ will redirect properly for audiobooks
        return 'https://www.scribd.com/book/%s' % book_id


    def __init__(self):
        self.cookiejar = cookielib.CookieJar()
        self.authors_searched = set()
        self.ids_of_author = {}
        self.user_id_of_user_name = {}
        self.user_content_keys = {}
        self.filter_reported = False
        
        
    def sign_in(self, use_credentials):
        """
        Sign in to Scribd with the configured card number (email/username) and PIN (password).

        Nothing is done when no card number is configured. The use_credentials argument is not
        consulted. Raises LibraryError if the credentials are rejected or the page returned after
        the login attempt does not indicate a signed-in session.
        """
        if not self.card_number:
            return

        # sign in to Scribd to produce selective results
        self.signin_required = True
        self.log.info('Signing in to %s' % self.name)

        def is_sign_in_form(form):
            return form.attrs.get('action','').endswith('/login')

        browser = mechanize.Browser()
        browser.set_cookiejar(self.cookiejar)
        browse_url(self.log, browser, mechanize.Request('https://www.scribd.com/login'))

        browser.select_form(predicate=is_sign_in_form)     # Exception if not found

        # User credentials
        browser.form['login_or_email'] = self.card_number
        browser.form['login_password'] = self.card_pin

        # Login
        page = browse_url(self.log, browser, None)

        failure_markers = ('Invalid username or password.', 'Log in to Scribd')
        if any(marker in page for marker in failure_markers):
            raise LibraryError('Sign in failed. Check Email/username (card number) and password (PIN).')

        if 'Scribd.logged_in = true;' not in page:
            raise LibraryError('Sign in unsuccessful.')

        self.log.info('Sign in to Scribd successful')
        self.signed_in = True
        
        
    def open_scribd_url(self, url, **kwargs):
        """
        Open a Scribd URL using open_url, waiting and retrying for as long as Scribd is throttling.

        Keyword arguments are passed through to open_url unchanged.

        Returns the response object from open_url.
        Raises LibraryError if the request ended up at the site home page (a path shorter than two
        characters). Any error other than a throttling response is passed on to the caller.
        """
        while True:
            try:
                response = open_url(self.log, url, **kwargs)

            except urllib2.HTTPError as e:
                # error 403 (forbidden) occurs for throttling if requests are received too quickly by scribd.
                # The body text is read defensively so that a 403 carrying no response_data is re-raised
                # as the HTTP error it is, instead of being hidden behind an AttributeError.
                if e.code == 403:
                    response_data = getattr(e, 'response_data', None) or ''

                    if 'your computer or network may be sending automated search queries' in response_data:
                        self.log.info('Delaying due to throttling')
                        time.sleep(THROTTLE_DELAY)
                        continue

                raise   # report other errors

            redirect_path = urlparse.urlparse(response.geturl()).path

            if len(redirect_path) < 2:
                raise LibraryError('Scribd redirected query to home page - Retry search later')

            return response
        
    
    def report_filter(self):
        if (not self.filter_reported) and (not self.signed_in):
            self.log.error('Scribd results are being filtered. Providing credentials to sign in to Scribd may prevent this problem.')
            self.filter_reported = True


    def find_books(self, books, search_author, search_title, keyword_search, find_recommendable):
        '''
        Search Scribd for books, audiobooks and comics matching an author and/or a title.

        Each content type whose format is among the configured search formats is queried page by page.
        Every result cell found is passed to process_search_result, which adds available books to 'books'.

        search_author and search_title are each sent as a quoted phrase when not empty. keyword_search is
        only tested for truth (together with search_title it disables the switch to searching by author).
        find_recommendable is not used by this method.

        Returns True if the search was abandoned because it produced too many results, otherwise False.
        Raises Exception if a result page lacks the embedded search data.
        '''
        # doing search of Scribd by query performs poorly since it returns excessive non-matching results

        for content_type in ['books', 'audiobooks', 'comics']:
            # comics share the reader format with books
            search_format = FORMAT_SCRIBD_AUDIOBOOK if (content_type == 'audiobooks') else FORMAT_SCRIBD_READER

            if search_format not in self.config.search_formats:
                continue    # skip undesired formats

            if SEARCH_BY_AUTHOR and not (keyword_search or search_title):
                if self.find_books_using_author(books, search_author, content_type):
                    return True     # Too many results
            else:
                RESULTS_PER_PAGE = 30                   # used to work out the number of pages from the result count
                MAX_PAGES_FOR_NON_AUTHOR_SEARCH = 3     # pages read before switching to search by author
                MAX_RESULTS_ALLOWED = 500               # give up on this search once this many results are processed

                page_num = 1
                total_pages = 1         # corrected once the first page reports the result count
                total_results = 0
                results_processed = 0


                while (page_num <= total_pages):
                    if (page_num > MAX_PAGES_FOR_NON_AUTHOR_SEARCH) and not (keyword_search or search_title):
                        # too many results - author name is a common term. Do search specifically by author.
                        self.log.info('Too many results -- switching to search by author')
                        if self.find_books_using_author(books, search_author, content_type):
                            return True

                        results_processed = total_results   # prevent warning
                        break

                    # build the query string parameters for this page
                    data = {}
                    query = []

                    if search_author:
                        query.append('"%s"' % search_author.replace("'", ""))

                    if search_title:
                        query.append('"%s"' % search_title)

                    data['query'] = ' '.join(query)

                    if self.config.search_language and self.config.search_language in LANGUAGES:
                        language = self.config.search_language
                        data['language'] = unicode(LANGUAGES[self.config.search_language])
                    else:
                        language = ''   # search is not restricted by language, so the language of results is unknown

                    data['page'] = unicode(page_num)

                    # tops (all), books, audiobooks, comics, authors, documents, sheet_music, collections, users
                    data['content_type'] = content_type

                    response = self.open_scribd_url('https://www.scribd.com/search?%s' % urllib.urlencode(data),
                                        cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

                    # Parse the html results for analysis
                    soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

                    # the search results are held as a javascript value within one of the scripts on the page
                    scripts = soup.findAll('script', attrs={'type':"text/javascript"})
                    for script in scripts:
                        if 'Scribd.R.render(Scribd.R.Search.App(' in unicode(script):
                            search_info = js_value(self.log, unicode(script), 'Scribd.R.render(Scribd.R.Search.App(')
                            break
                    else:
                        # no script contained the search data
                        raise Exception('Missing Scribd.R.render(Scribd.R.Search.App)')

                    #self.log.info('Scribd.R.Search.App: %s' % unicode(search_info))

                    result_count = search_info['result_count']
                    if result_count == 0:
                        break

                    if total_results and (result_count != total_results):
                        self.log.info('Total results changed from %d to %d'%(total_results, result_count))

                    total_results = result_count
                    total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1  # floor division

                    self.log.info('Response: page %d of %d. %d total results'%(page_num, total_pages, total_results))

                    # the result cells for this content type are supplied as a fragment of html
                    html_results = search_info['results'][content_type]['content']['document_cells_html']
                    results_container = BeautifulSoup(html_results, convertEntities=BeautifulSoup.HTML_ENTITIES)

                    # class="object_cell document_cell   has_flag is_geo_restricted  is_book"
                    for doc in results_container.findAll('div', attrs=class_contains('document_cell'), recursive=True):
                        self.process_search_result(books, search_author, doc, content_type, language)
                        results_processed += 1

                    if results_processed >= MAX_RESULTS_ALLOWED:
                        return True     # too many results

                    page_num += 1

                if results_processed != total_results:
                    # this happens frequently for Scribd
                    self.log.info('Expected %s results but found %d'%(value_unit(total_results,'book'), results_processed))
                    self.report_filter()

        return False
        
        
    def process_search_result(self, books, search_author, doc, content_type, language):
        """
        Examine one result cell from a Scribd listing and add the book it describes to 'books'.

        The book id, title, authors and availability are taken from the cell. Results flagged as
        "not available" or "sample", and results that are a series, are logged and not added.

        Raises LibraryError if the cell's audiobook markers disagree with content_type.
        """
        book_id = doc['data-object_id']
        css_classes = doc['class'].split()

        title = ''
        title_div = doc.find('div', attrs={'class': 'document_title'})
        if title_div:
            raw_title = text_only(title_div)
            # a title ending in '...' is not used and the title is left empty
            if not raw_title.endswith('...'):
                title = normalize_title(raw_title)

        authors = []
        author_div = doc.find('div', attrs={'class': 'document_author'})
        if author_div:
            # May contain non-author contributors. These will be eliminated by get_book_info
            authors = [normalize_author(text_only(link), unreverse=False)
                       for link in author_div.findAll('a', recursive=True)]

        available = True
        for flag_div in doc.findAll('div', attrs={'class': 'flag'}, recursive=True):
            if text_only(flag_div).lower() in ("not available", "sample"):
                available = False

            #flag='!' for audiobook requiring a credit

        is_audiobook = (doc.find('div', attrs={'class': 'audiobook_indicator'}) is not None
                        or ('is_audiobook' in css_classes))
        audiobook_expected = (content_type == 'audiobooks')

        if is_audiobook != audiobook_expected:
            raise LibraryError('Format mismatch: content_type=%s, audiobook_indicator=%s' %
                    (content_type, unicode(is_audiobook)))

        if not {'is_series', 'is_comic_series'}.isdisjoint(css_classes):
            available = False    # series, not book

        # don't specify format yet since may have been removed even if shown in search results
        lbook = LibraryBook(authors=authors, title=title, language=language,
            available=available, recommendable=not available, lib=self, book_id=book_id,
            search_author=search_author)

        if available:
            self.log.info('Found: %s' % repr(lbook))
            books.add(lbook)
        else:
            self.log.info('Ignoring unavailable: %s' % repr(lbook))
                            

        
        
    def find_books_using_author(self, books, search_author, content_type):
        '''
        List the works of each Scribd author profile matching search_author and pass every one found
        to process_search_result. Returns True if the result limit was exceeded, otherwise False.

        Faster than a general search using a common author name, but cannot filter by language or title.

        This is not used by default because it takes more queries to produce results and because some books
        are not findable in the author's book list even though they link back to the proper author.
        Some examples are:
            https://www.scribd.com/book/206531958/Nightside-CIty
            https://www.scribd.com/book/205643330/The-Vondish-Ambassador-A-Legend-of-Ethshar
        '''
        RESULTS_PER_PAGE = 20
        MAX_RESULTS_ALLOWED = 500

        content_key = {
            'books': 'authored_documents',
            'audiobooks': 'authored_audiobooks',
            'comics': 'authored_comics',
            }[content_type]

        for author_id in self.find_author_ids(search_author):
            search_key = (author_id, content_key)

            if search_key in self.authors_searched:
                self.log.info('Already searched Scribd author user id %s for %s' % search_key)
                continue

            if content_key not in self.user_content_keys[author_id]:
                continue    # this author has no content of this type

            self.authors_searched.add(search_key)

            results_processed = 0
            page_num = 1
            has_more = True

            while has_more:
                params = {}
                params['id'] = author_id
                params['content_key'] = content_key
                params['page'] = unicode(page_num)

                response = self.open_scribd_url(
                    'https://www.scribd.com/profiles/content.json?%s' % urllib.urlencode(params),
                    cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

                final_url = response.geturl()
                if urlparse.urlparse(final_url).path != '/profiles/content.json':
                    self.log.error('Author user id %s redirects to page %s' % (author_id, final_url))
                    break

                payload = json.loads(response.data)

                # Parse the html results for analysis
                soup = BeautifulSoup(payload['documents'], convertEntities=BeautifulSoup.HTML_ENTITIES)

                # class="document_grid document_drop object_grid has_document_cells"
                grid = must_find(soup, 'div', attrs=class_contains('document_grid'))

                # class="object_cell document_cell   has_flag is_geo_restricted  is_book"
                cells = grid.findAll('div', attrs=class_contains('document_cell'), recursive=True)

                # has_more will show true even on the last page to force the load of an empty one.
                # Treat a short page as the last to avoid that extra query.
                has_more = payload['has_more'] and not (len(cells) < RESULTS_PER_PAGE)

                for cell in cells:
                    self.process_search_result(books, search_author, cell, content_type, '')
                    results_processed += 1

                    if results_processed > MAX_RESULTS_ALLOWED:
                        return True     # limit_exceeded

                page_num += 1

        return False
        
        
    def find_author_ids(self, search_author):
        """
        Return the set of Scribd user ids found by searching Scribd authors for search_author.

        The result is remembered per author name, so Scribd is queried only the first time a name is
        seen. An empty name produces an empty set without any query. The content keys of every newly
        seen user id are collected using find_user_content_keys as a side effect.

        Raises Exception if a result page lacks the embedded search data.
        """
        if search_author in self.ids_of_author:
            return self.ids_of_author[search_author]

        author_ids = set()
        self.ids_of_author[search_author] = author_ids

        if not search_author:
            return author_ids

        SEARCH_APP_MARKER = 'Scribd.R.render(Scribd.R.Search.App('
        RESULTS_PER_PAGE = 10
        MAX_RESULTS_ALLOWED = 20

        start_time = time.time()
        total_results = 0
        results_processed = 0
        total_pages = 1
        page_num = 1

        while page_num <= total_pages:
            params = {}
            params['query'] = '"%s"' % search_author
            params['page'] = unicode(page_num)
            params['content_type'] = 'authors'

            response = self.open_scribd_url(
                'https://www.scribd.com/search?%s' % urllib.urlencode(params),
                cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

            # Parse the html results
            soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

            # locate the script that holds the search results as a javascript value
            script_text = None
            for script in soup.findAll('script', attrs={'type':"text/javascript"}):
                candidate = unicode(script)
                if SEARCH_APP_MARKER in candidate:
                    script_text = candidate
                    break

            if script_text is None:
                raise Exception('Missing Scribd.R.render(Scribd.R.Search.App)')

            search_info = js_value(self.log, script_text, SEARCH_APP_MARKER)

            result_count = search_info['result_count']
            if result_count == 0:
                break

            if total_results and (result_count != total_results):
                self.log.info('Total results changed from %d to %d'%(total_results, result_count))

            total_results = result_count
            total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1  # floor division

            self.log.info('Response: page %d of %d. %d total results'%(page_num, total_pages, total_results))

            for author in search_info['results']['authors']['content']['authors']:
                user_id = unicode(author['id'])
                author_ids.add(user_id)

                if user_id not in self.user_content_keys:
                    self.user_content_keys[user_id] = self.find_user_content_keys(author['profile_url'])

                results_processed += 1

            if results_processed >= MAX_RESULTS_ALLOWED:
                self.log.info('author limit reached')
                break

            page_num += 1

        if results_processed != total_results and results_processed < MAX_RESULTS_ALLOWED:
            # this happens frequently for Scribd
            self.log.info('Expected %s but found %d'%(value_unit(total_results,'author user id'), results_processed))
            # self.report_filter() -- This just happens without filtering

        self.log.info('Search returned %s in %.1f sec: %s'%(
                value_unit(total_results, 'author user id'),
                time.time() - start_time,
                ', '.join(author_ids)))

        return author_ids

        
    def find_user_content_keys(self, user_url):
        """
        Return the set of 'authored' section names shown on a Scribd user profile page.

        The set is empty, and a message is logged, if the page has no search form whose action
        starts with the Scribd profile search address.
        """
        PROFILE_SEARCH_PREFIX = 'https://www.scribd.com/profiles/search/'

        response = self.open_scribd_url(user_url, cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

        # Parse the html results
        soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

        search_forms = soup.findAll('form', attrs=class_contains('search_form'), recursive=True)
        has_profile_search = any(
            form.get('action', '').startswith(PROFILE_SEARCH_PREFIX) for form in search_forms)

        if not has_profile_search:
            self.log.info('User has no content: %s' % user_url)
            return set()

        center_tabs = soup.find('div', attrs=class_contains('center_tabs'))
        if not center_tabs:
            return set()

        authored_sections = center_tabs.findAll('div', attrs={'data-section': re.compile('authored')})
        return {section['data-section'] for section in authored_sections}
    

    def get_book_info(self, book_id):
        """
        Retrieve the details of a single Scribd book from its web page.

        Returns an InfoBook holding the authors, title, ISBN, publisher, publication date and formats
        found for book_id. The set of formats is left empty when the page shows the item to be a
        series, flagged as not available, too short and not readable, or (when the configured search
        language is English) apparently not written in English.

        Returns None when Scribd responds with HTTP error 410 or 500, when the page address is neither
        a book nor an audiobook, or when the page holds a personal document. Other errors are passed
        on to the caller. A page without an overview section is handled by get_book_info_via_popup.
        """
        try:
            response = self.open_scribd_url(self.book_url(self.library_id, book_id),
                    cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

        except urllib2.HTTPError as e:
            if e.code == 410:
                # error 410 (gone) occurs if book is not authorized or has been removed from scribd.
                response_data = getattr(e, 'response_data', None) or ''

                if 'This document is not publicly available.' in response_data:
                    self.log.info('Access denied: %s is not publically available' % book_id)
                elif 'Deletion notice' in response_data:
                    self.log.info('Deleted: %s is no longer available' % book_id)
                else:
                    self.log.info('Gone: %s is not available' % book_id)

                return None

            if e.code == 500:
                # occurs consistently for some books that are not accessible
                self.log.warn('Internal Server Error: %s is not available' % book_id)
                return None

            raise   # report other errors

        # the path of the page reached (after any redirect) tells the type of document
        book_path = urlparse.urlparse(response.geturl()).path
        if book_path.startswith('/book/'):
            formats = {FORMAT_SCRIBD_READER}    # comics are still identified as "book" in URL
        elif book_path.startswith('/audiobook/'):
            formats = {FORMAT_SCRIBD_AUDIOBOOK}
        else:
            self.log.info('unknown document_type=%s' % book_path)
            return None

        soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

        doc_container = soup.find('div', attrs={'class': 'doc_container'})
        if doc_container:
            self.log.info('Found a personal document instead of an e-book')
            return None

        title = ''
        authors = []
        publisher = ''
        pubdate = None
        isbn = ''

        content = soup.find('div', attrs=class_contains('overview'))
        if not content:
            # some books (a few fixed format graphic novels) do not contain book details on the main
            # book page but go directly to the actual content instead. Use json metadata instead.
            return self.get_book_info_via_popup(book_id)

        for badge in content.findAll('div', attrs={'class': 'badge'}):
            badge_text = text_only(badge).lower()
            if badge_text in ['book series', 'comic series', 'audiobook series']:
                formats = set()     # series, not an actual book
            elif badge_text in ['book', 'comic', 'audiobook']:
                pass
            else:
                self.log.warn('Unknown book type: %s' % badge_text)

        flag_div = content.find('div', attrs={'class': 'flag'})
        if flag_div:
            flag_text = text_only(flag_div).lower()
            if not flag_text:
                pass    # no flag
            elif flag_text == 'not available':
                self.log.info('Restricted: %s' % flag_text)
                formats = set()    # not available
            elif flag_text in ['!', '1']:
                self.log.info('Restricted audiobook - requires credit')
            elif flag_text == 'expiring':
                pass    # book will be expiring from Scribd
            else:
                self.log.warn('Unknown book flag: %s' % flag_text)

        # detect occasional too-short books that cannot be read
        pages_s = content.find('span', attrs={'itemprop': 'numberOfPages'})
        if pages_s:
            try:
                number_pages = int(text_only(pages_s).split()[0].replace(',',''))
            except (IndexError, ValueError):
                # an empty or non-numeric page count only means this optional check cannot be
                # made; it is not a reason to fail the whole lookup
                number_pages = None

            if number_pages is not None and number_pages <= 15:
                read_response = self.open_scribd_url(self.book_url(self.library_id, book_id).replace('/book/','/read/'),
                        cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)
                redirect_path = urlparse.urlparse(read_response.geturl()).path
                if not redirect_path.startswith('/read/'):
                    self.log.info('Non-functional book: %s, %s' % (text_only(pages_s), redirect_path))
                    formats = set()    # not available

        title_h = content.find('h1')
        if title_h:
            title = normalize_title(text_only(title_h))

        author_h = content.find('h2')
        if author_h:
            for author_a in author_h.findAll('a', recursive=True):
                authors.append(normalize_author(text_only(author_a), unreverse=False))

        publisher_d = content.find('div', attrs={'class': 'published'})
        if publisher_d:
            publisher_a = publisher_d.find('a')
            if publisher_a:
                publisher = text_only(publisher_a)

            pub_text = text_only(publisher_d)
            m = re.search(r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-9]+, [0-9]+', pub_text, flags=re.IGNORECASE)
            if m:
                try:
                    pubdate = parse_only_date(m.group(0), assume_utc=True)
                except Exception:
                    # ignore invalid dates such as Feb 30, 1970 on https://www.scribd.com/book/150477853/Tales-from-the-White-Hart
                    # (Exception rather than a bare except so that KeyboardInterrupt/SystemExit are not swallowed)
                    pass

        isbn_s = content.find('span', attrs={'itemprop': 'isbn'})
        if isbn_s:
            isbn = text_only(isbn_s)

        if self.config.search_language == 'English':
            # detect books in other languages mistakenly found when doing English search
            description_d = content.find('div', attrs={'class': 'description'})
            if description_d and self._description_is_non_english(text_only(description_d)):
                self.log.info('Ignoring non-English book found using English search')
                formats = set()    # not available

        return InfoBook(authors=authors, title=title, isbn=isbn,
                publisher=publisher, pubdate=pubdate,
                formats=formats, lib=self, book_id=book_id)


    @staticmethod
    def _description_is_non_english(description):
        """
        Return True if enough of the words of a book description appear not to be English.

        A word counts as non-English if its lower-case form is in NON_ENGLISH_WORDS or it contains
        a character matched by NON_ENGLISH_LETTERS. A description of fewer than 10 words is rejected
        on a single such word; a longer one when they exceed 5% of all words.
        """
        words = description.split()
        if not words:
            return False    # nothing to judge, and the ratio below would divide by zero

        total_word_count = len(words)
        non_english_word_count = 0
        for word in words:
            if (word.lower() in NON_ENGLISH_WORDS) or re.search(NON_ENGLISH_LETTERS, word):
                non_english_word_count += 1

        return ((total_word_count < 10 and non_english_word_count > 0) or
                (float(non_english_word_count)/float(total_word_count) > 0.05))
                
              
    def get_book_info_via_popup(self, book_id):
        """
        Retrieve the details of a Scribd book from the json 'popup_data' for its id.

        Returns an InfoBook holding the authors, title, publisher and formats. The set of formats is
        left empty when the data carries a view restriction message or marks the item as a series.
        No ISBN or publication date is available from this source.

        Returns None when Scribd responds with HTTP error 410 or 500, or when the document type is
        not a book, comic or audiobook. Other errors are passed on to the caller.
        """
        try:
            response = self.open_scribd_url('https://www.scribd.com/documents/popup_data?id=%s' % book_id,
                    cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

        except urllib2.HTTPError as e:
            if e.code == 410:
                # error 410 (gone) occurs if book is not authorized or has been removed from scribd.
                response_data = getattr(e, 'response_data', None) or ''

                if 'This document is not publicly available.' in response_data:
                    self.log.info('Access denied: %s is not publically available' % book_id)
                elif 'Deletion notice' in response_data:
                    self.log.info('Deleted: %s is no longer available' % book_id)
                else:
                    self.log.info('Gone: %s is not available' % book_id)

                return None

            if e.code == 500:
                # occurs consistently for some books that are not accessible
                self.log.warn('Internal Server Error: %s is not available' % book_id)
                return None

            raise   # report other errors

        popup_data = json.loads(response.data)      # Parse the json results

        document_type = popup_data['document_type']
        if document_type in ["book", "comic"]:
            formats = {FORMAT_SCRIBD_READER}
        elif document_type == "audiobook":
            formats = {FORMAT_SCRIBD_AUDIOBOOK}
        else:
            self.log.info('unknown document_type=%s' % document_type)
            return None

        # No indication of restricted audiobook (requires credit) in json

        view_restriction_message = popup_data.get("view_restriction_message", '')
        if view_restriction_message:
            self.log.info('Restricted: %s' % view_restriction_message)
            formats = set()    # not available

        if popup_data.get("is_series", False) or popup_data.get("is_comic_series", False):
            formats = set()    # series, not a book

        title = normalize_title(popup_data['title'])

        authors = [normalize_author(author_info['name'], unreverse=False)
                   for author_info in popup_data['authors']]

        # the publisher entry may be absent, and a json null or empty entry is treated the same way
        publisher_info = popup_data.get('publisher')
        publisher = publisher_info['name'] if publisher_info else ''

        # No ISBN or pubdate in JSON

        return InfoBook(authors=authors, title=title, publisher=publisher,
                    formats=formats, lib=self, book_id=book_id)

                    
    def check_book_obtainable(self, book_id):
        """Check whether a Scribd book can be read now or requires a credit.

        Loads the book's page and looks for the argument of the
        Scribd.R.Shared.React.LibraryButton render call, first in the page's
        scripts and, failing that, in the book's shadow_loader data.

        Returns:
            None if Scribd answers HTTP 410 or 500 for the book page, or if the
            page is a personal document rather than an e-book;
            1 if the button data has a true "credit_restricted" value;
            0 otherwise (treated as always available).

        Raises:
            Exception: if the LibraryButton render call is found in neither the
            page nor the shadow_loader data. Other request errors propagate
            unchanged.
        """
        button_marker = 'Scribd.R.render(Scribd.R.Shared.React.LibraryButton('

        def find_button_script(parsed):
            # Return the text of the first javascript <script> containing the
            # LibraryButton render call, or None when there is none.
            for script in parsed.findAll('script', attrs={'type': "text/javascript"}):
                script_text = unicode(script)
                if button_marker in script_text:
                    return script_text
            return None

        try:
            response = self.open_scribd_url(self.book_url(self.library_id, book_id),
                    cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

        except urllib2.HTTPError as e:
            # Catch by class (not by exact type comparison) so that HTTPError
            # subclasses carrying the same status codes are handled as well.
            if e.code == 410:
                # error 410 (gone) occurs if book is not authorized or has been removed from scribd.
                # The error body may be absent on the exception; treat that as empty.
                response_data = getattr(e, 'response_data', None) or ''

                if 'This document is not publicly available.' in response_data:
                    self.log.info('Access denied: %s is not publically available' % book_id)
                elif 'Deletion notice' in response_data:
                    self.log.info('Deleted: %s is no longer available' % book_id)
                else:
                    self.log.info('Gone: %s is not available' % book_id)

                return None

            if e.code == 500:
                # occurs consistently for some books that are not accessible
                self.log.warn('Internal Server Error: %s is not available' % book_id)
                return None

            raise   # report other errors

        soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

        if soup.find('div', attrs={'class': 'doc_container'}):
            self.log.info('Found a personal document instead of an e-book')
            return None

        button_script = find_button_script(soup)

        if button_script is None:
            # check for additional shadow data
            response = self.open_scribd_url('https://www.scribd.com/book-preview/%s/shadow_loader' % book_id,
                    cookiejar=self.cookiejar, qps=OVERALL_MAX_QPS)

            json_data = json.loads(response.data)
            shadow_soup = BeautifulSoup(json_data["bookpreview_actions"],
                    convertEntities=BeautifulSoup.HTML_ENTITIES)
            button_script = find_button_script(shadow_soup)

            if button_script is None:
                raise Exception('Missing Scribd.R.render(Scribd.R.Shared.React.LibraryButton)')

        book_info = js_value(self.log, button_script, button_marker)

        if book_info["credit_restricted"]:
            return 1    # credit required

        return 0    # always available, assuming no pre-release titles
