﻿#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

from __future__ import (unicode_literals, division, absolute_import, print_function)

__license__   = 'GPL v3'
__copyright__ = '2016, John Howell <jhowell@acm.org>'
__docformat__ = 'restructuredtext en'

import re
import time
import urllib
import dateutil.parser
import cookielib
from dateutil.tz import tzutc
from collections import defaultdict

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.utils.date import parse_only_date
from calibre.utils.config_base import tweaks

from calibre_plugins.overdrive_link.numbers import (value_unit, word_number)
from calibre_plugins.overdrive_link.link import IDENT_AMAZON
from calibre_plugins.overdrive_link.formats import FORMAT_KINDLE_BOOK
from calibre_plugins.overdrive_link.book import (LibraryBook, InfoBook)
from calibre_plugins.overdrive_link.library import (SearchableLibrary, LendingLibrary)
from calibre_plugins.overdrive_link.match import (normalize_author, normalize_title)
from calibre_plugins.overdrive_link.net import open_url
from calibre_plugins.overdrive_link.parseweb import (
    LibraryError, must_find, text_only, double_quote, class_contains)
from calibre_plugins.overdrive_link.language import ALL_LANGUAGE_NAMES
from calibre_plugins.overdrive_link.tweak import TWEAK_SAVE_RESPONSES_ON_ERROR

# Amazon sends a captcha when requests are received too quickly. Take a long delay when this is detected to make Amazon happy.
# Pace requests to try to avoid this condition.

OVERALL_MAX_QPS = 0.5       # maximum queries per second for Amazon across all jobs
THROTTLE_DELAY = 120.0      # base seconds to delay when throttled by Amazon; multiplied by the count of consecutive captchas


'''
Browse node IDs are locale-specific and may change without warning. Arranged in a hierarchy.
http://docs.aws.amazon.com/AWSECommerceService/latest/DG/BrowseNodeIDs.html

Find the name of the browse node:
http://www.amazon.com/exec/obidos/tg/browse/-/[Browse Node ID]

See contents of node:
http://www.amazon.com/b/?node=[Browse Node ID]
'''

# search index types
SI_BOOKS = 'Books'
SI_KINDLE_STORE = 'KindleStore'

# Locale (country) prefixes used in library ids. The USA is the empty string: its library ids have no locale prefix.
LOCALE_USA = ''
LOCALE_BRAZIL = 'br'
LOCALE_CANADA = 'ca'    # not supported by calibre
LOCALE_CHINA = 'cn'     # not supported by calibre
LOCALE_GERMANY = 'de'
LOCALE_SPAIN = 'es'
LOCALE_FRANCE = 'fr'
LOCALE_INDIA = 'in'     # not supported by calibre
LOCALE_ITALY = 'it'
LOCALE_JAPAN = 'jp'
LOCALE_UK = 'uk'

# Web site host name for each enabled locale. Only locales present here are accepted in a library id.
WEB_HOSTS = {
    LOCALE_USA: 'www.amazon.com',
    LOCALE_UK: 'www.amazon.co.uk',
    LOCALE_CANADA: 'www.amazon.ca',

    # Unable to detect prime eligible books at these sites
    #LOCALE_GERMANY: 'www.amazon.de',
    #LOCALE_FRANCE: 'www.amazon.fr',

    # Enable the following amazon sites if they support prime lending
    #LOCALE_BRAZIL: 'www.amazon.com.br',
    #LOCALE_SPAIN: 'www.amazon.es',
    #LOCALE_ITALY: 'www.amazon.it',
    #LOCALE_JAPAN: 'www.amazon.co.jp',
    }

# Collection names: the final component of a library id
COLLECTION_KOLL = 'prime'                           # kindle owners' lending library
COLLECTION_UNLIMITED = 'unlimited'                  # kindle unlimited
COLLECTION_UNLIMITED_WITH_NARRATION = 'kuwn'        # kindle unlimited with narration
COLLECTION_STORE = 'store'                          # kindle e-book store
COLLECTION_WHISPERSYNC_FOR_VOICE = 'w4v.store'

# every collection name that is accepted in a library id
COLLECTIONS = {COLLECTION_KOLL, COLLECTION_UNLIMITED, COLLECTION_UNLIMITED_WITH_NARRATION, 
    COLLECTION_STORE, COLLECTION_WHISPERSYNC_FOR_VOICE}

# collections whose books are reported as purchasable rather than available
PURCHASABLE_COLLECTIONS = {COLLECTION_STORE, COLLECTION_WHISPERSYNC_FOR_VOICE}

# separates the optional locale prefix from the collection in a library id, e.g. "uk-unlimited"
LIBRARY_ID_SEPERATOR = '-'

# Search settings for each supported (locale, collection) combination. Combinations missing from this
# table are rejected when a library id is validated. The values of each tuple are stored as attributes
# of the library object at sign in, using the names in COUNTRY_COLLECTION_INFO_ATTRS (same order).
COUNTRY_COLLECTION_INFO_ATTRS = ('use_api_search', 'search_index', 'browse_node')
COUNTRY_COLLECTION_INFO = {
    (LOCALE_USA, COLLECTION_KOLL):                     (False, SI_BOOKS, '618073011'),
    (LOCALE_USA, COLLECTION_UNLIMITED):                (True, SI_BOOKS, '9069934011'),
    (LOCALE_USA, COLLECTION_UNLIMITED_WITH_NARRATION): (True, SI_BOOKS, '9630682011'),
    (LOCALE_USA, COLLECTION_STORE):                    (True, SI_BOOKS, '154606011'),  # Kindle Store (133140011)->Kindle eBooks
    (LOCALE_USA, COLLECTION_WHISPERSYNC_FOR_VOICE):    (True, SI_BOOKS, '5744819011'),

    (LOCALE_UK, COLLECTION_UNLIMITED):                 (True, SI_BOOKS, '4764713031'),
    (LOCALE_UK, COLLECTION_UNLIMITED_WITH_NARRATION):  (True, SI_BOOKS, '5232194031'),
    (LOCALE_UK, COLLECTION_STORE):                     (True, SI_BOOKS, '341689031'),
    (LOCALE_UK, COLLECTION_WHISPERSYNC_FOR_VOICE):     (True, SI_BOOKS, '4824710031'),

    (LOCALE_CANADA, COLLECTION_UNLIMITED):             (True, SI_BOOKS, '9333751011'),
    (LOCALE_CANADA, COLLECTION_STORE):                 (True, SI_BOOKS, '2980423011'),

    }
    
    
# Contributor roles found in the byline of a book page.
# A contributor is kept as an author only if at least one of its roles is not in UNDESIRED_ROLES.
DESIRED_ROLES = {'Editor', 'Author', 'Collaborator', 'Contributor', 'Illustrator'}

UNDESIRED_ROLES = {'Preface', 'Introduction', 'Foreword', 'Afterword',
                   'Photographer', 'Translator', 'Narrator', 'Reader', 'Compiler'}

# roles outside of this set are logged as unexpected
KNOWN_ROLES = DESIRED_ROLES | UNDESIRED_ROLES


# Extra HTTP request headers passed to open_url (as addheaders) for every Amazon request
ADD_HEADERS = [
    # Chrome Windows
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
    ('Upgrade-Insecure-Requests', '1'),
    ('Accept-Language', 'en-US,en;q=0.8'),
    ('Pragma', 'no-cache'),
    ('Cache-Control', 'no-cache'),
    ('Connection', 'keep-alive'),
    ]


def lib_locale(library_id):
    '''
    Return the locale (country) part of an Amazon library id: everything before the last
    separator, or the empty string (USA) when the id has no separator.

    Raises ValueError if the locale has no entry in WEB_HOSTS.
    '''
    locale, _, _ = library_id.rpartition(LIBRARY_ID_SEPERATOR)

    if locale in WEB_HOSTS:
        return locale

    raise ValueError('Invalid Amazon library id (country): "%s"' % locale)

    
def lib_collection(library_id):
    '''
    Return the collection part of an Amazon library id: everything after the last separator,
    or the whole id when it has no separator.

    Raises ValueError if the collection is not one of COLLECTIONS.
    '''
    collection = library_id.rpartition(LIBRARY_ID_SEPERATOR)[-1]

    if collection in COLLECTIONS:
        return collection

    allowed = ' or '.join(COLLECTIONS)
    raise ValueError('Amazon library id (collection) may only be %s, found: "%s"' % (allowed, collection))

    
    

class Amazon(SearchableLibrary):
    # Amazon.com
    
    id = 'ak'
    name = 'Amazon'
    formats_supported = {FORMAT_KINDLE_BOOK}
    
    is_amazon = True
    
    
    @staticmethod
    def validate_library_id(library_id, migrate=True, config=None):
        '''
        Check an Amazon library id of the form "[locale-]collection" and return it in
        lower case canonical form.

        When migrate is set an empty id is replaced by the kindle owners' lending library.
        config is not used. Raises ValueError if the locale, the collection, or the
        combination of the two is not supported.
        '''
        normalized = library_id.lower()

        # migrate old allowed values
        if migrate and normalized == '':
            normalized = COLLECTION_KOLL

        locale = lib_locale(normalized)             # check country
        collection = lib_collection(normalized)     # check collection

        if (locale, collection) not in COUNTRY_COLLECTION_INFO:
            raise ValueError('Invalid Amazon locale-collection combination: "%s"' % normalized)

        # the USA has an empty locale, so its ids consist of the collection alone
        return ('%s-%s' % (locale, collection)) if locale else collection
        
         
    @staticmethod    
    def validate_book_id(book_id, library_id):
        # book id for Amazon is 10-character alphanumeric ASIN
        if not re.match(r'^([0-9A-Za-z]{10})$', book_id):
            raise ValueError('Amazon book id must be 10 alphanumberic characters: "%s"' % book_id)
             
        return book_id
        
    @staticmethod
    def book_url(library_id, book_id):
        '''Return the product page URL for a book id at the web site of the library's locale.'''
        host = WEB_HOSTS[lib_locale(library_id)]
        return 'http://%s/dp/%s' % (host, book_id)
        
    @staticmethod
    def book_key_library_id(library_id):
        '''Return the locale of the library id, the part of the id that book ids depend on.'''
        # book ids differ between countries for Amazon
        locale = lib_locale(library_id)
        return locale
         
    @staticmethod
    def amazon_ident(library_id):
        '''Return the Amazon identifier name for the library's locale: IDENT_AMAZON plus "_locale" outside the USA.'''
        locale = lib_locale(library_id)

        if not locale:
            return IDENT_AMAZON     # us - no locale

        return IDENT_AMAZON + ('_' + locale)
        
    @staticmethod
    def supports_purchase(library_id):
        '''Return True if the collection named by the library id is one of PURCHASABLE_COLLECTIONS.'''
        collection = lib_collection(library_id)
        return collection in PURCHASABLE_COLLECTIONS
        

    def __init__(self):
        self.cookiejar = cookielib.CookieJar()  # having cookies enabled causes Amazon to give more consistent results
        self.signed_in = False
        self.item_cache = {}
        
    
    def sign_in(self, use_credentials):
        '''
        Derive the per-library settings from self.library_id: locale, collection, web host
        and the COUNTRY_COLLECTION_INFO values for that locale and collection.

        use_credentials is not used. Raises ValueError (from the library id parsers) for an
        invalid id and KeyError for an unsupported locale/collection combination.
        '''
        self.locale = lib_locale(self.library_id)
        self.collection = lib_collection(self.library_id)

        self.web_host = WEB_HOSTS[self.locale]

        # store each info value under its attribute name (use_api_search, search_index, browse_node)
        info_values = COUNTRY_COLLECTION_INFO[(self.locale, self.collection)]
        for name_and_value in zip(COUNTRY_COLLECTION_INFO_ATTRS, info_values):
            setattr(self, *name_and_value)
            
            
    def open_amazon_url(self, url=None, book_id=None):
        '''
        Fetch a page from Amazon and return the response, waiting and retrying for as long as
        Amazon answers with its captcha page.

        url: address to fetch; ignored when book_id is given
        book_id: when given, the product page for this book at this library is fetched instead
        '''
        if book_id:
            url = self.book_url(self.library_id, book_id)

        captcha_marker = '<h4>Enter the characters you see below</h4>'
        attempt = 1

        while True:
            response = open_url(self.log, url, qps=OVERALL_MAX_QPS, cookiejar=self.cookiejar,
                            addheaders=ADD_HEADERS)

            if captcha_marker not in response.data:
                return response

            # Amazon thinks we are a bot.

            if tweaks.get(TWEAK_SAVE_RESPONSES_ON_ERROR, False):
                self.log.response()

            # wait longer after each consecutive captcha
            wait_seconds = THROTTLE_DELAY * attempt
            self.log.info('Delaying %d seconds due to throttling' % int(wait_seconds))
            time.sleep(wait_seconds)

            self.cookiejar = cookielib.CookieJar()  # flush cookies
            attempt += 1
        
        
    def find_books(self, books, search_author, search_title, keyword_search, find_recommendable):
        '''
        Search this Amazon collection and add a LibraryBook to books for every result found.

        Amazon sometimes switches back and forth between multiple sets of results, possibly due to different databases
        being queried internally. Try to get one complete set, even if it isn't the largest.

        books: collection that receives (via add) a LibraryBook for each search result
        search_author: author to search for, or empty for none
        search_title: title to search for, or empty for none
        keyword_search: if set, search_title is sent as a double quoted keyword phrase instead of a title
            (has no effect for the kindle owners' lending library)
        find_recommendable: not used

        Returns True if the search was abandoned because it matched more than MAX_RESULTS_ALLOWED
        results, otherwise False. Raises LibraryError if a results page cannot be interpreted.
        '''

        page_num = 1
        RESULTS_PER_PAGE = 12 if self.collection == COLLECTION_KOLL else 16
        MAX_RESULTS_ALLOWED = 500
        MAX_SEARCH_PASSES = 2

        # search languages that are passed on as field-language for the USA locale
        US_SEARCH_LANGUAGES = {'English', 'French', 'German', 'Spanish'}

        result_id_re = re.compile('^result_[0-9]+')

        # total result count reported by Amazon -> set of page numbers still to be processed for that result set
        pages_needed_by_results = defaultdict(set)

        passes = 0

        search_language = ''

        while True:
            data = {}

            if self.collection != COLLECTION_KOLL:
                data['unfiltered'] = '1'    # don't filter out less relevant results
                data['fap'] = '1'           # don't filter out adult content
                data['url'] = 'node=' + self.browse_node
                data['sort'] = 'price'      # low to high

                if search_author:
                    data['field-author'] = search_author

                if keyword_search:
                    data['field-keywords'] = double_quote(search_title)    # double quote grouping only works for keywords, not author
                elif search_title:
                    data['field-title'] = search_title

                if self.locale == LOCALE_USA:
                    if self.config.search_language in US_SEARCH_LANGUAGES:
                        data['field-language'] = self.config.search_language
                        search_language = self.config.search_language

            else:
                # kindle owners' lending library

                data['unfiltered'] = '1'    # don't filter out less relevant results
                data['fap'] = '1'           # don't filter out adult content

                if search_author:
                    data['field-author'] = search_author

                if search_title:
                    data['field-title'] = search_title

                data['search-alias'] = 'stripbooks' if self.locale != LOCALE_BRAZIL else 'digital-text'

                # sort by
                if self.locale == LOCALE_USA:
                    data['sort'] = 'titlerank'
                elif self.locale == LOCALE_UK:
                    data['sort'] = ''
                elif self.locale == LOCALE_GERMANY:
                    data['sort'] = 'relevancerank'

                # Only Books -> Kindle books
                if self.locale == LOCALE_USA:
                    data['field-feature_browse-bin'] = self.browse_node
                elif self.locale == LOCALE_UK:
                    data['field-binding_browse-bin'] = '368165031'
                elif self.locale == LOCALE_GERMANY:
                    data['field-binding_browse-bin'] = '600849031'
                else:
                    raise LibraryError('kindle books browse node unknown for ' + self.locale)

                # Filter on only Kindle Owners' Lending Library - Amazon Prime
                if self.locale == LOCALE_USA:
                    data['rh'] = 'p_85:2470955011'  # Refine: limit search to Prime Eligible
                else:
                    raise LibraryError('Prime Eligible-only value unknown for ' + self.locale)

                # Select language
                if self.locale == LOCALE_USA:
                    if self.config.search_language in US_SEARCH_LANGUAGES:
                        data['field-language'] = self.config.search_language
                        search_language = self.config.search_language

                elif self.locale == LOCALE_UK:
                    language_feature = {
                        'Dutch': '400531011',
                        'English': '400530011',
                        'Finnish': '405250011',
                        'French': '400532011',
                        'German': '400533011',
                        'Italian': '400534011',
                        'Latin': '405232011',
                        'Portuguese': '400535011',
                        'Spanish': '400537011',
                        }

                    lf = language_feature.get(self.config.search_language)
                    if lf:
                        data['field-feature_browse-bin'] = lf
                        search_language = self.config.search_language

            if page_num > 1:
                data['page'] = '%d' % page_num

            response = self.open_amazon_url(url='http://%s/s/?%s' % (self.web_host, urllib.urlencode(data)))

            # Parse the html results for analysis
            soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)
            soup_text = text_only(soup)

            if 'did not match any products' in soup_text:
                break

            result_count_h = soup.find('h2', attrs={'id': "resultCount"})
            if result_count_h:
                # Showing 1 - 12 of 67 Results # Showing 2 Results

                result_count = text_only(result_count_h).split()

                if (len(result_count) >= 3 and result_count[0] == 'Showing' and
                        (result_count[2] == 'Results' or result_count[2] == 'Result')):
                    first_result = 1
                    total_results = int(result_count[1].replace(',', ''))

                elif (len(result_count) >= 7 and result_count[0] == 'Showing' and result_count[2] == '-' and
                        result_count[4] == 'of' and
                        (result_count[6] == 'Results' or result_count[6] == 'Result')):

                    first_result = int(result_count[1].replace(',', ''))
                    total_results = int(result_count[5].replace(',', ''))

                else:
                    raise LibraryError('Bad resultCount: ' + text_only(result_count_h))
            else:
                result_count_h = must_find(soup, 'h2', attrs={'id': "s-result-count"})

                # 1 result for Prime Eligible : Books : Kindle Edition : English : "john brunner"
                # 1–12 of 23 results for Prime Eligible : Books : Kindle Edition : English : "kurt vonnegut"

                # unicode dashes to ascii dash (the pattern is a unicode string since unicode_literals is in effect)
                result_count = re.sub(r'[-–‒—―]', ' - ', text_only(result_count_h).lower()).split()

                # the length checks must cover the highest index tested so that a short header
                # is reported as a LibraryError instead of failing with an IndexError
                if len(result_count) >= 2 and (result_count[1] == 'results' or result_count[1] == 'result'):
                    first_result = 1
                    if result_count[0] == 'one':
                        total_results = 1
                    else:
                        total_results = int(result_count[0].replace(',', ''))

                elif len(result_count) >= 6 and result_count[1] == '-' and result_count[3] == 'of' and \
                        (result_count[5] == 'results' or result_count[5] == 'result'):

                    first_result = int(result_count[0].replace(',', ''))
                    total_results = int(result_count[4].replace(',', ''))

                else:
                    raise LibraryError('Bad s-result-count: ' + text_only(result_count_h))

            total_pages = ((total_results - 1) // RESULTS_PER_PAGE) + 1  # floor division
            self.log.info('Response: page %d of %d. %d total results' % (page_num, total_pages, total_results))

            expected_first_result = ((page_num - 1) * RESULTS_PER_PAGE) + 1
            if first_result != expected_first_result:
                raise LibraryError('Unexpected first result %d instead of %d' % (first_result, expected_first_result))

            if total_results > MAX_RESULTS_ALLOWED:
                return True

            pages_needed = pages_needed_by_results[total_results]
            if not pages_needed:
                # first time seeing this number of results. Seems to happen regularly at Amazon
                if len(pages_needed_by_results) > 1:
                    self.log.info('Result set changed between pages')

                pages_needed.update(set(range(1, total_pages + 1)))     # need all pages for this result set

            if page_num in pages_needed:
                book_elems = []

                atf_results = must_find(soup, 'div', attrs={'id': 'atfResults'})
                book_elems.extend(atf_results.findAll('div', attrs={'id': result_id_re}))
                book_elems.extend(atf_results.findAll('li', attrs={'id': result_id_re}))

                btf_results = soup.find('div', attrs={'id': 'btfResults'})
                if btf_results:
                    book_elems.extend(btf_results.findAll('div', attrs={'id': result_id_re}))
                    book_elems.extend(btf_results.findAll('li', attrs={'id': result_id_re}))

                hero_results = len(soup.findAll('div', attrs=class_contains('sx-hero-container'), recursive=True))
                page_results = 0

                for book_elem in book_elems:
                    kindle_edition = False
                    kindle_unlimited = False
                    amazon_prime = False
                    voice = False

                    available = False
                    purchasable = False
                    title = ''
                    authors = []
                    series = ''
                    series_index = 0
                    language = search_language

                    result_num = int(book_elem['id'][7:])     # "result_123"
                    expected_result_num = first_result - 1 + page_results
                    if result_num != expected_result_num:
                        # missing results may occur due to adult content filtering
                        self.log.info('Expected result %d, found %d' % (expected_result_num, result_num))

                        if result_num == 0:
                            break   # restart of result counting indicates a new result table, possibly suggested books

                    if book_elem.has_key("data-asin"):
                        book_id = book_elem["data-asin"]
                    elif book_elem.has_key("name"):
                        book_id = book_elem["name"]
                    else:
                        raise LibraryError('Missing data-asin/name')

                    book_text = text_only(book_elem)
                    new_aps = book_elem.find('h3', attrs={'class': "newaps"})

                    # the title is located differently in each of the result page layouts
                    title_elem = book_elem.find('a', attrs={'class': "title"})

                    if (not title_elem) and new_aps:
                        title_elem = new_aps.find('span', attrs={'class': "lrg bold"})

                    if not title_elem:
                        title_elem = book_elem.find('h2', attrs=class_contains('s-access-title'))

                    if title_elem:
                        title_plus = text_only(title_elem).strip()

                        title_words = title_plus.split()    # look for "(German Edition)" suffix
                        if len(title_words) > 2 and title_words[-1] == 'Edition)' and title_words[-2][0] == '(':
                            lang = title_words[-2][1:]
                            if lang in ALL_LANGUAGE_NAMES:
                                language = lang
                                title_plus = ' '.join(title_words[:-2])

                        if '...' not in title_plus:
                            title = normalize_title(title_plus)  # not truncated

                    author_elem = book_elem.find('span', attrs={'class': "ptBrand"})

                    if (not author_elem) and new_aps:
                        author_elem = new_aps.find('span', attrs={'class': "med reg"})

                    if not author_elem:
                        author_elem = book_elem.find('div', attrs={'class': "a-row a-spacing-none"})

                    if author_elem:
                        author = text_only(author_elem)
                        if ' (' in author:
                            author = author.partition(' (')[0]   # " (Feb 1, 2011) - Kindle eBook"

                        if (re.match(r'^by [a-zàáâãäåāăąèéêëēĕėęěìíîïĩīĭįıòóôõöōŏőơùúûüũūŭůűųñńņňýÿƴçćĉċč .-]+$', author.lower()) and
                                (' and ' not in author)):
                            authors = [normalize_author(author[3:])]  # simple case of single author
                        else:
                            # Cannot parse: "by Isaac Asimov, Philip K. Dick, John Gregory Betancourt and Milton Lesser"
                            pass

                    series_li = book_elem.find('li', attrs={'class': "seriesInfo"})
                    if series_li:
                        # Book 3 of 4 in the Space Odyssey Series
                        series_text = text_only(series_li)
                        split_text = series_text.split()
                        if len(split_text) > 6 and split_text[0] == 'Book' and split_text[2] == 'of' and \
                                split_text[4] == 'in' and split_text[5] == 'the':

                            series = ' '.join(split_text[6:-1 if split_text[-1] == 'Series' else None])
                            series_index = int(split_text[1])

                    # book type flags change over time and are often not present in search results
                    # (they are only reported in the log message below)

                    if not kindle_edition:
                        for a in book_elem.findAll('a'):
                            if (a.has_key("title") and a["title"] == "Kindle Edition") or \
                                    (a.has_key("href") and "s=digital-text" in a["href"]):
                                kindle_edition = True
                                break

                    if not kindle_edition:
                        for type_td in book_elem.findAll('td', attrs={'class': re.compile("tpType")}):
                            if 'Kindle Edition' in text_only(type_td):
                                kindle_edition = True
                                break

                    if not kindle_edition:
                        for kindle_tag in ['Auto-delivered wirelessly', 'Whispersync for Voice-ready', 'Print Price',
                                'Subscribers read for free.', 'Kindle Purchase', 'Kindle Edition', 'Kindle eBook']:
                            if kindle_tag in book_text:
                                kindle_edition = True
                                break

                    if book_elem.find('span', attrs=class_contains('sprPrime')):
                        amazon_prime = True     # seems only present if signed in

                    if book_elem.find('span', attrs=class_contains('sprKindleUnlimited')) or \
                            book_elem.find('span', attrs=class_contains('s-icon-kindle-unlimited')) or \
                            "Subscribers read for free." in book_text:
                        kindle_unlimited = True

                    if 'Whispersync for Voice-ready' in book_text:
                        voice = True

                    # assume results will match what we are looking for
                    purchasable = (self.collection in PURCHASABLE_COLLECTIONS)
                    available = (not purchasable)

                    lbook = LibraryBook(authors=authors, title=title, formats=set(),
                            language=language, series=series, series_index=series_index,
                            available=available, purchasable=purchasable, lib=self, book_id=book_id,
                            search_author=search_author)

                    flags = ('k' if kindle_edition else '') + ('p' if amazon_prime else '') + \
                            ('u' if kindle_unlimited else '') + ('v' if voice else '')

                    self.log.info('Found (%s) %s' % (flags, repr(lbook)))
                    books.add(lbook)

                    page_results += 1

                if page_num < total_pages:
                    expected_results = RESULTS_PER_PAGE
                else:
                    expected_results = total_results - ((page_num - 1) * RESULTS_PER_PAGE)

                if (page_results + hero_results) != expected_results:
                    self.log.info('Expected %s but found %d actual and %d hero' % (
                            value_unit(expected_results, 'page result'), page_results, hero_results))

                pages_needed.remove(page_num)       # done with this page

                if not pages_needed:
                    break                           # done with all pages for a result set!

            # get a page number from the pages needed, keeping to roughly sequential order
            page_num += 1
            all_pages_needed = set()

            for result_set in pages_needed_by_results.values():
                all_pages_needed.update(result_set)

            if not all_pages_needed:
                # happens when Amazon reports a total of zero results: there are no pages to fetch
                break

            needed_page_list = sorted(all_pages_needed)
            lowest_needed_page = needed_page_list[0]
            highest_needed_page = needed_page_list[-1]

            while page_num not in all_pages_needed:
                if page_num > highest_needed_page:
                    page_num = lowest_needed_page  # wrap around
                    passes += 1
                    break

                page_num += 1

            if passes >= MAX_SEARCH_PASSES:
                '''
                amazon sometimes fails to give all of the pages for a result, bouncing between
                different numbers of results for a query.
                this happens often enough to consider it a somewhat normal occurrence.
                '''
                self.log.warn('Unable to get a complete set of results')
                break

        return False
        
        
    def get_book_info(self, book_id):
    
        MAX_RETRIES = 4
        retries = 0
    
        while (True):
             
            response = self.open_amazon_url(book_id=book_id)
                
            # Parse page
            
            authors = []
            title = ''
            language = ''
            publisher = ''
            pubdate = None
            series = ''
            series_index = 0.0
            formats = {FORMAT_KINDLE_BOOK}
            
            soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)
            
            alert_div = soup.find('div', attrs={'class':'a-alert-content'})
            if alert_div:
                alert_text = text_only(alert_div)
                # "Please retry" is the default message when there is no real alert -- ignore
                    
                if alert_text and alert_text != '"Please retry"':
                    self.log.info('Alert: %s' % alert_text)
                    
                    if 'Please reload this page' in alert_text:
                        # We're sorry, an error has occurred. Please reload this page and try again.
                    
                        retries += 1
                        if retries > MAX_RETRIES:
                            raise LibraryError('Repeating alert: %s' % alert_text)
                            
                        time.sleep(30)  # delay before retry to give temporary problems time to clear
                        continue
                    
            break
                
        
        #if soup.find('span', attrs=class_contains('series-detail-title')):
        if soup.find('div', attrs=class_contains('series-header')):
            # page representing a series, rather than an individual book
            self.log.info('%s at %s is a series, not a book' % (book_id, self.name))
            return None
        
        title_div = soup.find('div', attrs={'id':'booksTitle'})
        if title_div:
            # new Kindle book web page format starting 05/2015
            title_span = must_find(title_div, 'span', attrs={'id':re.compile('productTitle|ebooksProductTitle')})
            title = normalize_title(text_only(title_span))
            
            bylines = title_div.findAll('span', attrs={'class':re.compile('contribution')})
            
        else:
            title_h1 = soup.find('h1', attrs={'class':re.compile('parseasinTitle')})
            if title_h1:
                title_span = must_find(title_h1, 'span', attrs={'id':'btAsinTitle'})
                title = normalize_title(text_only(title_span))
                
                bylines = soup.findAll('span', attrs={'class':re.compile("byLinePipe")})
                    
            else:
                raise LibraryError('Unable to determine Amazon title format')
                
                        
        for byline in bylines:
            if byline.get('id') == "pubdatelabel":
                continue    # not an author
                
            role_m = re.search(r'\((.+)\)', text_only(byline).replace(' ',''))
            if not role_m:
                self.log.warn('Failed to parse creator roles: %s' % text_only(byline))
                continue
                
            roles = set(role_m.group(1).split(','))
                
            author_tag = byline.previousSibling   # author precedes byline
            
            while author_tag and (getattr(author_tag, 'name', '') not in ['a', 'b', 'div', 'span']):
                author_tag = author_tag.previousSibling     # skip backward to tag containing the author name
            
            if author_tag:
                author = normalize_author(text_only(author_tag).strip())
            else:
                author = None
                
            if author and (author not in KNOWN_ROLES):
                for role in list(roles):
                    if role not in KNOWN_ROLES:
                        self.log.warn('Unexpected role %s for creator %s' % (role, author))
                        
                    if role in UNDESIRED_ROLES:
                        roles.remove(role)
                        
                if len(roles) > 0:
                    authors.append(author)
            #else:
            #    self.log.warn('Failed to parse %s name %s' % (','.join(roles), unicode(author)))
                
                        
        content_div = soup.find('div', attrs={'id':'ps-content'})
        if content_div:
            metadata_div = content_div.find('div', attrs={'id':'bookmetadata'})
            if metadata_div:
                pub_date_input = metadata_div.find('input',attrs={'id':'pubdate'})
                if pub_date_input:
                    #self.log.info('pubdate from div id-pubdate: %s'%pub_date_input['value'])
                    pubdate = dateutil.parser.parse(pub_date_input['value']).replace(tzinfo=tzutc())  # ISO 8601 format
                    #self.log.info('pubdate = %s'%pubdate.isoformat())

                else:
                    pub_date_span = metadata_div.find('span', attrs={'id':'pubdatevalue'})
                    if pub_date_span:
                        #self.log.info('pubdate from span id-pubdatevalue: %s'%text_only(pub_date_span))
                        pubdate = parse_only_date(text_only(pub_date_span), assume_utc=True)
                        #self.log.info('pubdate = %s'%pubdate.isoformat())
                     
        details = []
        product_details_table = soup.find('table', attrs={'id':'productDetailsTable'})
        if product_details_table:
            details.extend(product_details_table.findAll('li', recursive=True))
            
        etextbook_bullet_list = soup.find('ul', attrs={'id':'eTextbookBulletList'})
        if etextbook_bullet_list:
            details.extend(etextbook_bullet_list.findAll('li', recursive=True))
                    
        for buying_div in soup.findAll('div', attrs={'class': 'buying'}, recursive=True):
            details.extend(buying_div.findAll('li', attrs={'class': 'listItem'}, recursive=True))
            
        about_section = soup.find('div', attrs={'id':'aboutEbooksSection'})
        if about_section:
            details.extend(about_section.findAll('td', recursive=True))
            
        for detail in details:    
            t = text_only(detail)
            if t:
                key,sep,val = t.partition(':')
                val = val.strip()
                #self.log.info('detail: %s' % t)
                
                if key == 'Publisher' or (key == 'Sold by' and not publisher):
                    publisher = val
                    if publisher and publisher[-1] == ')':
                        publisher,sep,date = publisher.rpartition('(')
                        date = date.replace(')', '').strip()
                        if date and not pubdate:
                            #self.log.info('pubdate from publisher: %s'%date)
                            pubdate = parse_only_date(date, assume_utc=True)
                            #self.log.info('pubdate = %s'%pubdate.isoformat())
                            
                        publisher = publisher.strip()
                        
                        if ';' in publisher:
                            publisher = publisher.partition(';')[0].strip()    # drop edition
                            
                elif key == 'Publication Date':
                        if not pubdate:
                            pubdate = parse_only_date(val, assume_utc=True)
                            
                elif key == 'Language':
                    language = val
                    
                else:
                    match = re.match(r'^(Series|Collection): (.+) \(Book ([0-9]+)\)$', t, flags=re.IGNORECASE)
                    if match:
                        series = match.group(2)
                        series_index = float(match.group(3))
                    else:
                        match = re.match(r'^Book ([0-9]+) of ([0-9]+) in (.+)$', t, flags=re.IGNORECASE)
                        if match:
                            series = match.group(3)
                            series_index = float(match.group(1))
                            
                        else:
                            detail_a = detail.find('a')
                            if detail_a and detail_a.get('href', '').startswith('/s/ref=series'):
                                detail_a_text = text_only(detail_a)
                                if detail_a_text != 'Similar books':
                                    series = detail_a_text
                                    series_index = 0.0
                                    
                                    if series != t:
                                        self.log.warn('Unexpected series format: %s' % t)
                                        
                            elif tweaks.get(TWEAK_SAVE_RESPONSES_ON_ERROR, False):
                                for msg in [r'Length:', r"Don't have a Kindle", r'Whispersync for Voice:', r'Word Wise:',
                                            r'Available only on these devices', r'Due to its large file size', 
                                            r'Prime members can borrow', r'Available on these devices',
                                            r'File Size:', r'Print Length:', r'Page Numbers Source ISBN:',
                                            r'Sold by:', r'ASIN:', r'Text-to-Speech:', r'X-Ray:', r'Lending:',
                                            r'Amazon Best Sellers Rank:', r'#[0-9]+ in Books', r'Format:',
                                            r'Average Customer Review:', r'#[0-9]+ in Kindle Store',
                                            r'Amazon Bestsellers Rank:', r'Simultaneous Device Usage:',
                                            r'Optimised for larger screens', r'ISBN-10:', r'ISBN-13:',
                                            r'Optimized for larger screens', r'Due to large size of this book,',
                                            r'Due to large file size,', r'Enhanced Typesetting:',
                                            r'Matchbook Price:', r'Age Level:', r'Grade Level:',
                                            r'Audible Narration', r'X-Ray for Textbooks:', r'Similar books',
                                            r'Page Flip:', r'Highlight, take notes, and search in the book',
                                            r'Thousands of books are eligible,', r'Look for the Kindle MatchBook icon',
                                            r'Read the Kindle edition on any Kindle device', r'Print edition must be purchased new',
                                            r'Gifting of the Kindle edition at the Kindle MatchBook price is not available.',
                                            r'Page numbers are just like the physical edition', r'Create digital flashcards instantly',
                                            ]:
                                    if re.match(msg, t, flags=re.IGNORECASE):
                                        break
                                else:
                                    self.log.warn('unexpected detail: %s' % t)
                     
        if re.search(r'book [0-9]+', series, flags=re.IGNORECASE) or series == 'With Active Table of Contents':
            self.log.info('Ignoring incorrect series name: %s' % series)
            series = ''
            series_index = 0.0
            
        if series:
            series = re.sub(r' \([0-9]+ Book Series\)$', '', series)    # drop optional series count
            #self.log.info('found series "%s" [%02d]' %(series, int(series_index)))
            
            # Try variations of how the series may have been combined with the title to remove it
            
            index_pat = ur'(%d|%s)' % (int(series_index), word_number(int(series_index)))
            
            for pat1 in [
                        '(The |)%s(,|:|) (Book |Volume |Vol |Issue |)%s' % (re.escape(series), index_pat),
                        '(Book|Volume|Vol|Issue) %s of( The)? %s' % (index_pat, re.escape(series)),
                        '(The |)%s' % re.escape(series),
                        'Book %s' % index_pat,
                        ]:
                        
                for pat in [
                        ': %s$' % pat1,
                        ur' \(%s\)' % pat1,
                        ]:
                        
                    new_title = re.sub(pat, '', title, flags=re.IGNORECASE).strip()
                    if new_title != title:
                        break
                        
                if new_title != title:
                    #self.log.info('shortened title from "%s" to "%s"' % (title, new_title))
                    title = new_title
                    break
                    
            #else:
            #    self.log.warn('title unchanged "%s" for series "%s" [%02d]' % (title, series, int(series_index)))
            
            if series.endswith(' series'):
                series = series[:-len(' series')]   # e.g.: "The Dagger and the Coin series"
                #self.log.info('shortened series to "%s"' % series)
        
       
        warning_msg = soup.find('div', attrs={'class':'message warning'})
        if warning_msg:
            msg_text = text_only(warning_msg)
            self.log.info('Book warning message: ' + msg_text)
            
            if "book is currently unavailable" in msg_text:
                formats = set()     # not available
                
        
        no_offer_msg = soup.find('span', attrs=class_contains('no-kindle-offer-message'))
        if no_offer_msg:
            self.log.info('Book no-offer message: ' + text_only(no_offer_msg))
            formats = set()     # not available
                
        
        return InfoBook(
            authors=authors, title=title, publisher=publisher, pubdate=pubdate, 
            language=language, series=series, series_index=series_index,
            formats=formats, lib=self, book_id=book_id)
            

    def check_book_obtainable(self, book_id):
        '''
        Fetch the Amazon page for book_id and report when the book can be obtained.

        The purchase-related areas of the page are scanned for a pre-order notice
        ("This title will be auto-delivered to your Kindle ... on <date>."). The date
        from the first such notice, if any, is passed as the release date.

        Returns the result of self.when_obtainable(library_copies=True, release_date=...),
        where release_date is None if no pre-order notice was found.
        '''
        page = self.open_amazon_url(book_id=book_id)
        soup = BeautifulSoup(page.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

        # Page elements that may hold the pre-order notice, in the order they are checked.
        selectors = (
            ('div', {'class':'buying'}),
            ('div', {'id':'deliverTo'}),
            ('form', {'id':'buyOneClick'}),    # 05/2015
            )

        candidates = []
        for tag_name, tag_attrs in selectors:
            candidates.extend(soup.findAll(tag_name, attrs=tag_attrs))

        # e.g.: "This title will be auto-delivered to your Kindle on February 16, 2016."
        notice_pat = r'This title will be auto-delivered to your Kindle( and you will be charged)? on ([a-z0-9, ]+)\.'

        release_date = None
        for elem in candidates:
            found = re.search(notice_pat, text_only(elem), flags=re.IGNORECASE)
            if found is None:
                continue

            # only the first notice found is used
            release_date = parse_only_date(found.group(2), assume_utc=True)
            break

        return self.when_obtainable(library_copies=True, release_date=release_date)
                        
                        
def asins_with_names(asins, names):
    '''
    Return a dict that maps each ASIN in asins to its entry in names.

    ASINs missing from names map to an empty string. Keys are added in sorted ASIN order.
    '''
    return {asin: names.get(asin, "") for asin in sorted(asins)}
    

def inventory_amazon_et(abort, log, status, config):
    '''
    Determine which Amazon US books in a set support enhanced typesetting.

    For each book in config.calibre_books the Amazon product page for the book's
    ASIN (initially book.orig_et_asin) is fetched and its detail entries are scanned
    for "Enhanced Typesetting" and "Page Flip". The outcome is stored on the book:

        book.et_asin    ASIN that was last checked (may be replaced by the Kindle edition ASIN)
        book.et_status  True/False if an enhanced typesetting indicator was found, else None
        book.pf_status  True/False if a page flip indicator was found, else None

    If the page has no enhanced typesetting indicator, a link to the Kindle edition
    is looked for and, when found, that ASIN is tried next. Each ASIN is tried at
    most once per book. Errors for an individual book are logged and do not stop
    processing of the remaining books.

    abort is accepted but not used by this function.
    status.update() is called with the fraction of books processed and the book text.

    Returns a tuple of (config.calibre_books, []).
    '''

    lending_lib = LendingLibrary(library_id='store', name='Amazon', enabled=True,
        card_number='', card_pin='', branch_id='', provider_id=Amazon.id)

    lib = SearchableLibrary.create(log, config, lending_lib)
    lib.sign_in(False)

    for i,book in enumerate(config.calibre_books):
        status.update(i / len(config.calibre_books), unicode(book))

        book.preserve_all_links()   # don't alter links
        book.et_asin = book.orig_et_asin
        book.et_status = None
        book.pf_status = None
        asins_tried = set()     # guards against looping between ASINs that refer to each other

        while book.et_asin and (book.et_asin not in asins_tried):
            asins_tried.add(book.et_asin)

            try:
                response = lib.open_amazon_url(book_id=book.et_asin)
                soup = BeautifulSoup(response.data, convertEntities=BeautifulSoup.HTML_ENTITIES)

                # gather candidate detail entries from each page layout that may be present
                details = []
                product_details_table = soup.find('table', attrs={'id':'productDetailsTable'})
                if product_details_table:
                    details.extend(product_details_table.findAll('li', recursive=True))

                etextbook_bullet_list = soup.find('ul', attrs={'id':'eTextbookBulletList'})
                if etextbook_bullet_list:
                    details.extend(etextbook_bullet_list.findAll('li', recursive=True))

                for buying_div in soup.findAll('div', attrs={'class': 'buying'}, recursive=True):
                    details.extend(buying_div.findAll('li', attrs={'class': 'listItem'}, recursive=True))

                about_section = soup.find('div', attrs={'id':'aboutEbooksSection'})
                if about_section:
                    details.extend(about_section.findAll('td', recursive=True))

                for detail in details:
                    t = text_only(detail)
                    if t:
                        key,sep,val = t.partition(':')
                        #log.info('detail: %s' % t)

                        # only the first occurrence of each indicator is used;
                        # a value containing "not" is treated as unsupported
                        if key == 'Enhanced Typesetting' and book.et_status is None:
                            book.et_status = ('not' not in val.lower())
                            log.info('Enhanced typesetting %s for %s' % (unicode(book.et_status), unicode(book)))

                        if key == 'Page Flip' and book.pf_status is None:
                            book.pf_status = ('not' not in val.lower())
                            # report the page flip status (previously logged et_status by mistake)
                            log.info('Page flip %s for %s' % (unicode(book.pf_status), unicode(book)))

                if book.et_status is None:
                    # No indicator on this page: look for a link to the Kindle edition and retry with its ASIN
                    for button_a in soup.findAll('a', recursive=True):
                        if text_only(button_a).startswith("Kindle"):
                            href = button_a.get('href', '')
                            m = re.search(r'/dp/(B[A-Z0-9]{9})/ref=(tmm_kin|mt_kindle)', href)
                            if m:
                                book.et_asin = m.group(1)
                                log.info('Book id should be %s for %s' % (book.et_asin, unicode(book)))
                                break
                    else:
                        # loop finished without finding a Kindle edition link
                        log.error('Enhanced typesetting indicator not found for %s' % unicode(book))

                elif (book.et_status is True) != (book.pf_status is True):
                    log.warn('Enhanced typesetting %s != Page flip %s for %s' % (
                            unicode(book.et_status), unicode(book.pf_status), unicode(book)))

            except Exception as e:
                # best effort: log the failure for this book and move on
                # (the ASIN is already in asins_tried so the while loop ends)
                log.exception(unicode(book), e)

    return (config.calibre_books, [])
