#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3' #Based on B&N plugin by Grant Drake
__copyright__ = '2011, Rodrigo Coin Curvo <rodrigoccurvo@gmail.com>'
__docformat__ = 'en'

import httplib
from urlparse import urlparse
import socket, re, datetime
from collections import OrderedDict
from threading import Thread

from lxml.html import fromstring, tostring

from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.library.comments import sanitize_comments_html
from calibre.utils.cleantext import clean_ascii_chars

from calibre_plugins.skoob_metadata import Skoob
import calibre_plugins.skoob_metadata.config as cfg

class Worker(Thread): # Get details

    AI_LABELS = [
                    ('^Editora:$', 'publisher'),
                    ('^ISBN:$', 'isbn'),
                    ('^Ano:$', 'year')
                ]

    '''
    Get book details from Skoob book page in a separate thread
    '''

    def __init__(self, url, result_queue, browser, log, relevance, plugin, timeout=20):
        '''
        Set up a daemon worker thread for a single Skoob book page.

        The arguments are stored on the instance, ``browser`` is cloned for
        this worker's own use, and the cover URL, Skoob id and ISBN start
        out as None.
        '''
        Thread.__init__(self)
        self.daemon = True

        self.url = url
        self.result_queue = result_queue
        self.log = log
        self.timeout = timeout
        self.relevance = relevance
        self.plugin = plugin

        # Work on a clone instead of the browser object that was passed in
        self.browser = browser.clone_browser()

        # Filled in later by get_details()
        self.cover_url = None
        self.skoob_id = None
        self.isbn = None

    def run(self):
        '''
        Thread entry point: fetch the book details, logging any failure.

        Ordinary errors are logged and swallowed so that a broken page does
        not produce an unhandled exception in the thread. Interpreter-level
        signals (KeyboardInterrupt, SystemExit) are deliberately not caught.
        '''
        try:
            self.get_details()
        except Exception:
            self.log.exception('get_details failed for url: %r'%self.url)

    def get_root(self, url):
        try:
            self.log.info('Skoob url: %r'%url)
            raw = self.browser.open_novisit(url, timeout=self.timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                self.log.error('URL malformed: %r'%url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Skoob timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r'%url
                self.log.exception(msg)
            return

        raw = raw.decode('iso-8859-1', errors='replace')
        #open('E:\\t.html', 'wb').write(raw)

        if 'Erro 404: p' in raw:
            self.log.error('URL malformed: %r'%url)
            return

        try:
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse Skoob details page: %r'%url
            self.log.exception(msg)
            return None

        return root

    def get_additional_url(self):
        '''
        Return the URL of the editions page that belongs to ``self.url``.

        The address is derived by swapping the ``/livro/`` path segment for
        ``/livro/edicoes/``.
        '''
        book_segment, editions_segment = '/livro/', '/livro/edicoes/'
        return self.url.replace(book_segment, editions_segment)

    def get_details(self):
        '''
        Fetch the book page, turn it into a Metadata object and queue it.

        Nothing is queued when the page cannot be loaded or when the title,
        the authors or the Skoob id cannot be determined. All other fields
        are optional: a failure while parsing one of them is logged and the
        remaining fields are still processed. When enabled in the plugin
        preferences, the editions page is fetched as well to obtain the
        ISBN, publisher and publication date.
        '''
        root = self.get_root(self.url)

        # get_root() reports failure with None. Compare explicitly: an lxml
        # element without children is falsy, so a truthiness test could
        # reject a page that was parsed successfully.
        if root is None:
            return

        try:
            skoob_id = self.parse_skoob_id(self.url)
        except Exception:
            self.log.exception('Error parsing Skoob id for url: %r'%self.url)
            skoob_id = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except Exception:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except Exception:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        if not title or not authors or not skoob_id:
            self.log.error('Could not find title/authors/Skoob id for %r'%self.url)
            self.log.error('Skoob: %r Title: %r Authors: %r'%(skoob_id, title,
                authors))
            return

        mi = Metadata(title, authors)

        # This should never happen, since series is not supported yet
        if series:
            mi.series = series
            mi.series_index = series_index

        mi.set_identifier('skoob', skoob_id)
        # Must be set before parse_cover(), which uses it as the cache key
        self.skoob_id = skoob_id

        try:
            mi.rating = self.parse_rating(root)
        except Exception:
            self.log.exception('Error parsing ratings for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except Exception:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except Exception:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)

        try:
            mi.tags = self.parse_tags(root)
        except Exception:
            # Tags come from the main page, so report self.url here (the
            # editions URL does not exist yet at this point)
            self.log.exception('Error parsing tags for url: %r'%self.url)

        default_get_additional_info = cfg.DEFAULT_STORE_VALUES[cfg.KEY_GET_ADDITIONAL_INFO]
        get_additional_info = cfg.plugin_prefs[cfg.STORE_NAME].get(cfg.KEY_GET_ADDITIONAL_INFO, default_get_additional_info)

        if get_additional_info:
            ai_url = self.get_additional_url()
            ai_root = self.get_root(ai_url)

            if ai_root is not None:
                # The editions page is optional extra data: a failure here
                # must not discard the metadata gathered so far
                try:
                    additional_info = self.parse_additional_info(ai_root)
                except Exception:
                    self.log.exception('Error parsing additional info for url: %r'%ai_url)
                    additional_info = {}

                try:
                    isbn = self.get_isbn(additional_info)
                    if isbn:
                        self.isbn = mi.isbn = isbn
                except Exception:
                    self.log.exception('Error parsing ISBN for url: %r'%ai_url)

                try:
                    mi.publisher = self.get_publisher(additional_info)
                except Exception:
                    self.log.exception('Error parsing publisher for url: %r'%ai_url)

                try:
                    mi.pubdate = self.get_pubdate(additional_info)
                except Exception:
                    self.log.exception('Error parsing published date for url: %r'%ai_url)

        mi.source_relevance = self.relevance

        if self.skoob_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.skoob_id)

        self.plugin.clean_downloaded_metadata(mi)

        self._log_metadata(mi)

        self.result_queue.put(mi)

    def get_additional_info_label(self, text):
        '''
        Map a caption found on the editions page to its internal label.

        Returns the label of the first Worker.AI_LABELS entry whose regular
        expression matches ``text``, or None when no entry matches.
        '''
        matching_labels = (name for pattern, name in Worker.AI_LABELS
                           if re.search(pattern, text))
        return next(matching_labels, None)

    def parse_additional_info(self, root):
        '''
        Collect the labelled values found on the editions page.

        Returns a dict that maps each recognised label (see
        Worker.AI_LABELS) to the list of values found for it, in page order.
        A value is the first text node that follows a recognised ``<span>``
        caption.
        '''
        # Additional info page is not very well structured, so this tries
        # to find the info (hopefully) without breaking too easily
        ai_nodes = root.xpath('//img[@src="/img/icones/adicionar_barra_100.gif"]/../../../../../div/div/following-sibling::node()')

        # Accept byte and unicode strings alike. On Python 2 ``str`` only
        # matches byte strings, so unicode text nodes would be skipped
        # (presumably what lxml returns for accented text; verify).
        text_types = (type(b''), type(u''))

        ai_dict = {}
        # No caption seen yet; without this a leading text node would hit an
        # unbound local variable
        ai_label = None

        for ai_node in ai_nodes:
            if getattr(ai_node, 'tag', None) == 'span':
                # A caption: remember which label (if any) it announces
                ai_label = self.get_additional_info_label(ai_node.text_content().strip())
            elif ai_label and isinstance(ai_node, text_types):
                ai_dict.setdefault(ai_label, []).append(ai_node.strip())
                # Each caption owns only the first text node after it
                ai_label = None

        return ai_dict

    def _log_metadata(self, mi):
        '''
        Write ``mi`` to the log, framed by two 70-dash separator lines.
        '''
        separator = '-'*70
        for entry in (separator, mi, separator):
            self.log.info(entry)

    def parse_skoob_id(self, url):
        '''
        Extract the numeric Skoob book id from a book page URL.

        Returns the digits that follow ``skoob.com.br/livro/`` as a string.
        Raises AttributeError when ``url`` does not contain such a path.
        '''
        # Dots are escaped so that they only match literal dots in the domain
        match = re.search(r'skoob\.com\.br/livro/([0-9]+)', url)
        return match.group(1)

    def parse_title_series(self, root):
        '''
        Return a ``(title, series, series_index)`` tuple for the book page.

        The title is the stripped text of the first heading inside the
        'barra_titulo' element, or None when there is no such heading.
        Series and series index are always None.
        '''
        headings = root.xpath("id('barra_titulo')/h1")
        title = headings[0].text_content().strip() if headings else None

        # TODO Try to guess series based on title and subtitle

        return (title, None, None)

    def parse_tags(self, root):
        '''
        Return the stripped text of every element inside the items of the
        'tags' list, in page order (an empty list when there are none).
        '''
        return [node.text_content().strip()
                for node in root.xpath("id('tags')/li/*")]

    def parse_authors(self, root):
        '''
        Return the list of author names found in the 'barra_autor' element.
        '''
        # Sometimes there's a photo <div>/<a>, so the links nested in a
        # <div> are tried first. Only when that finds nothing is every link
        # taken; that can't be the only query because the photo is an <a>
        # too.
        links = (root.xpath("id('barra_autor')/div//a")
                 or root.xpath("id('barra_autor')//a"))

        return [link.text_content().strip() for link in links]

    def parse_rating(self, root):
        '''
        Return the text of the 'bt_ranking' element converted to a float,
        or None when the page has no such element.
        '''
        ranking_nodes = root.xpath("id('bt_ranking')")
        if not ranking_nodes:
            return None

        rating_text = ranking_nodes[0].text.strip()
        return float(rating_text)

    def get_isbn(self, ai):
        '''
        Pick an ISBN from the additional info dict ``ai``.

        Returns the first entry under 'isbn' that check_isbn() accepts. When
        none is accepted, the first entry is returned as a fallback. Returns
        None when there are no ISBN entries at all.
        '''
        candidates = ai.get('isbn')
        if not candidates:
            return None

        # An explicit loop avoids relying on filter() returning a list,
        # which is only the case on Python 2
        for candidate in candidates:
            if check_isbn(candidate):
                return candidate

        return candidates[0]
                

    def get_publisher(self, ai):
        '''
        Return the first publisher listed in the additional info dict
        ``ai``, or None when no publisher was found.
        '''
        publishers = ai.get('publisher')
        if not publishers:
            return None
        return publishers[0]

    def get_pubdate(self, ai):
        '''
        Return the earliest publication date found in the additional info.

        Every entry under 'year' in ``ai`` that parses as an integer within
        the supported range is considered. The result is January 1st of the
        smallest such year as a UTC datetime, or None when there is no
        usable year.
        '''
        valid_years = []
        for raw_year in ai.get('year') or ():
            try:
                candidate = int(raw_year)
            except (TypeError, ValueError):
                # Not a number: ignore this entry and keep looking
                continue
            # Calibre's year range is 101-7999
            if 100 < candidate < 8000:
                valid_years.append(candidate)

        if not valid_years:
            return None

        from calibre.utils.date import utc_tz
        # We only have the year, so let's use Jan 1st
        return datetime.datetime(min(valid_years), 1, 1, tzinfo=utc_tz)

    def parse_comments(self, root):
        '''
        Return the book description as sanitized HTML, or None when the page
        has no 'csinopse' element.

        The element is serialized to HTML, its <h2> and <strong> parts are
        removed, and the result is passed through sanitize_comments_html().
        '''
        # Look for description in hidden div
        synopsis_nodes = root.xpath("id('csinopse')")
        if not synopsis_nodes:
            return None

        html = tostring(synopsis_nodes[0], method='html')
        for unwanted in (r'<h2.*</h2>', r'<strong>.*</strong>'):
            html = re.sub(unwanted, '', html, flags=re.IGNORECASE)

        return sanitize_comments_html(html)

    def parse_cover(self, root):
        '''
        Return the cover image URL of the book, or None if the page has no
        cover image.

        A relative address is prefixed with Skoob.BASE_URL. The big variant
        of the image (file name ending in 'B' instead of 'P' or 'Mini') is
        preferred when _url_exists() confirms it. When the Skoob id is
        known, the chosen URL is also stored in the plugin's cover cache.
        '''
        sources = root.xpath("id('livro_capa')/img/@src")
        if not sources:
            return None

        cover_url = sources[0]
        has_host = urlparse(cover_url).netloc or \
            re.search(Skoob.BASE_DOMAIN, cover_url, flags=re.IGNORECASE)
        if not has_host:
            cover_url = Skoob.BASE_URL + cover_url

        # Swap the size marker right before the file extension
        big_cover_url = re.sub('(P|Mini)(\\.[^.]*)$', 'B\\2', cover_url, count=1)
        if self._url_exists(big_cover_url):
            cover_url = big_cover_url

        if self.skoob_id:
            self.plugin.cache_identifier_to_cover_url(self.skoob_id, cover_url)

        return cover_url

    def _url_exists(self, url):
        '''
        Return True if a HEAD request for ``url`` is answered with status
        200.

        Any other status yields False. A connection or protocol error is
        logged and also yields False, since this is only a best-effort
        probe. The request uses the worker's timeout so that an
        unresponsive server cannot block the thread indefinitely.
        '''
        parts = urlparse(url)

        if parts.scheme == 'https':
            connection_class = httplib.HTTPSConnection
        else:
            connection_class = httplib.HTTPConnection

        # An empty path is not a valid request target, and the query string
        # is part of what identifies the resource
        target = parts.path or '/'
        if parts.query:
            target += '?' + parts.query

        conn = connection_class(parts.netloc, timeout=self.timeout)
        try:
            conn.request('HEAD', target)
            return conn.getresponse().status == 200
        except (httplib.HTTPException, socket.error):
            self.log.exception('Failed to check if url exists: %r'%url)
            return False
        finally:
            # Release the socket even when the request fails
            conn.close()




