MobileRead Forums - View Single Post

druss67 · 10-04-2020, 12:09 PM

Hello,

I try to convert the babelio plugin. I did conversion using the command

Code:

python-modernize -w __init__.py

and I also tried this command

Code:

python-modernize -w --future-unicode __init__.py

I have corrected some error about ASCII but I am struggled with this error:

Quote:

Running identify query with parameters:
{'title': 'Victime 2117', 'authors': ['Jussi Adler-Olsen'], 'identifiers': {'isbn': '9782226396334', 'mobi-asin': 'B0814FYJJJ'}, 'timeout': 30}
Using plugins: Babelio (0, 4, 0)
The log from individual plugins is below

****************************** Babelio (0, 4, 0) ******************************
Found 0 results
Downloading from Babelio took 0.004427194595336914
Plugin Babelio failed
Traceback (most recent call last):
File "site-packages/calibre/ebooks/metadata/sources/identify.py", line 47, in run
File "calibre_plugins.babelio.__init__", line 72, in identify
File "calibre_plugins.babelio.__init__", line 64, in create_query
TypeError: can only concatenate str (not "bytes") to str

************************************************** ******************************
The identify phase took 0.23 seconds
The longest time (0.004427) was taken by: Babelio
Merging results from different sources
We have 0 merged results, merging took: 0.00 seconds

I did some search and understand that this error come from the fact in Python 2 the strings could be text and bytes but in Python3 this is no more possible.

But, as I do not know anything in code I am not able to find the way to solve this issue, I did some tries but without any success

Can someone help me to find a solution, please ?

Here below the code of __init__.py

Code:

#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
from six.moves import range
from six.moves import zip

__license__ = 'GPL v3'
__copyright__ = '2014, VdF>'
__docformat__ = 'restructuredtext'

import time, six.moves.http_cookiejar, unicodedata
from six.moves.urllib.parse import quote, unquote
from six.moves.queue import Queue, Empty
from difflib import SequenceMatcher
from lxml.html import fromstring, tostring

from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.config import JSONConfig

class Babelio(Source):

    name = 'Babelio'
    description = 'Telecharge les metadonnees et couverture depuis Babelio.com'
    author = 'VdF'
    version = (0, 4, 0)
    minimum_calibre_version = (0, 8, 0)
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'tags'])
    has_html_comments = False
    supports_gzip_transfer_encoding = True
    BASE_URL = 'https://www.babelio.com'

    def config_widget(self):
        from calibre_plugins.babelio.config import ConfigWidget
        return ConfigWidget(self)

    def create_query(self, log, title=None, authors=None, identifiers={}):
        q = ''
        isbn = check_isbn(identifiers.get('isbn', None))
        tokens = []
        if title is not None:
            tokens += title.replace('\u2019', ' ').replace("'", ' ').replace('  ', ' ').replace('\u2013', ' ').replace('\u0153', '\u006f\u0065')
        if authors is not None and len(authors) >= 1:
            for i in range(0, len(authors)):
                tokens += ' '
                if ',' in authors[i] :
                    auteur = authors[i].split(',')[0]
                elif ' ' in authors[i]  :
                    auteur = authors[i].rsplit(' ')[-1]
                else :
                    auteur = authors[i]
                tokens += auteur
        tokens = [quote(t.encode('iso-8859-1')) for t in tokens]
        q = ''.join(tokens)
        q = '/resrecherche.php?Recherche=' + q + '&page=1&item_recherche=livres&tri=titre'

        if not q:
            return None
        if isinstance(q, str):
            q = q.encode('utf-8')
        return Babelio.BASE_URL + q

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}, timeout=30):
        matches = []
        br = self.browser
        cj = six.moves.http_cookiejar.LWPCookieJar()
        br.set_cookiejar(cj)
        query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        if query is None:
            log.error(b'Metadonnees insuffisantes pour la requete'.encode('latin-1'))
            return
        log.info(b'Recherche de : %s' % unquote(query).encode('latin-1'))
        response = br.open_novisit(query, timeout=timeout)
        try:
            raw = response.read().strip()
            raw = raw.decode('latin-1', errors='replace')
            #open('E:\\babelio.html', 'wb').write(raw)
            if not raw:
                log.error(b'Pas de resultat pour la requete : %r'.encode('latin-1') % unquote(query).encode('latin-1'))
                return
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = b'Impossible de parcourir la page babelio avec la requete : %r'.encode('latin-1') % unquote(query).encode('latin-1')
            log.exception(msg)
            return msg
        self._parse_search_results(log, title, authors, root, matches, timeout)

        if abort.is_set():
            return

        if not matches:
            if title and authors and len(authors) > 1:
                log.info(b'Pas de resultat avec les auteurs, on utilise uniquement le premier.'.encode('latin-1'))
                return self.identify(log, result_queue, abort, title=title,
                        authors=[authors[0]], timeout=timeout)
            elif authors and len(authors) == 1 :
                log.info(b'Pas de resultat, on utilise uniquement le titre.'.encode('latin-1'))
                return self.identify(log, result_queue, abort, title=title, timeout=timeout)
            log.error(b'Pas de resultat pour la requete : %r'.encode('latin-1') % unquote(query.encode('latin-1')))
            return

        from calibre_plugins.babelio.worker import Worker
        workers = [Worker(url, result_queue, br, log, i, self) for i, url in
                enumerate(matches)]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.1)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None

    def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout):
        orig_aut = None
        if orig_authors is not None:
            orig_aut = [author.split(',')[0] for author in orig_authors if (',' in author)] \
                        + [author.split(' ')[1] for author in orig_authors if (' ' in author)]
        # log.info([author.split(',')[0] for author in orig_authors if (',' in author)])
        # log.info([author.split(' ')[1] for author in orig_authors if (' ' in author)])
        non_trouve = root.xpath('//div[@class="module_t1"]/h2')
        '''if non_trouve :
            non_trouve_text = non_trouve[0].text_content()
            if '(0)' in non_trouve_text :
                return'''

        def minussa(chaine):
            chaine = str(chaine.lower())
            chnorm = unicodedata.normalize('NFKD', chaine)
            return "".join([car for car in chnorm if not unicodedata.combining(car)])

        def simil(mot1, mot2, ratio):
            mot1, mot2 = minussa(mot1), minussa(mot2)
            return SequenceMatcher(None, mot1, mot2).ratio() >= ratio

        def is_simil(orig_aut, dict_res, ratio):
            for aut_compl in (v.text for v in dict_res.values()) :
                for a in orig_aut :
                    if simil(aut_compl.split()[-1], a, ratio):
                        return True
                    return False

        titre_res = root.xpath(".//*[@id='page_corps']/div/div[3]/div[2]/table/tbody/tr/td[2]/a[1]")
        # log.info('t_res', titre_res)
        if len(titre_res) == 0 :
            return
        else :
            matches.append(Babelio.BASE_URL + titre_res[0].get('href'))
            return
        aut_res = root.xpath(".//*[@id='page_corps']/div/div[3]/div[3]/table/tbody/tr/td[3]/a")
        dict_res = dict(list(zip(titre_res, aut_res)))
        # log.info('dict', dict_res)
        if orig_aut is not None :
            ratio = 0.7
            for k in dict_res.keys():
                if is_simil(orig_aut, dict_res, ratio):
                    matches.append(Babelio.BASE_URL + k.get('href'))
        else :
            for i in range(0, len(titre_res)):
                matches.append(Babelio.BASE_URL + titre_res[i].get('href'))
                matches = matches[:5]
        # log.info('mat', matches)

    def get_cached_cover_url(self, identifiers):
        if JSONConfig('plugins/Babelio').get('cover', False) == False:
            return None
        url = None
        bab_id = identifiers.get('babelio', None)
        if bab_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                bab_id = self.cached_isbn_to_identifier(isbn)
        if bab_id is not None:
            url = self.cached_identifier_to_cover_url(bab_id)
        return url

    def download_cover(self, log, result_queue, abort,
        title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if JSONConfig('plugins/Babelio').get('cover', False) == False:
            return
        cached_url = self.get_cached_cover_url(identifiers)
        log.info('cache :', cached_url)
        if cached_url is None:
            log.info('Pas de cache, on lance identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            # results.sort(key=self.identify_results_keygen(
                # title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info(b'Pas de couverture trouvee.'.encode('latin-1'))
            return

        if abort.is_set():
            return
        br = self.browser

        log.info(b'On telecharge la couverture depuis :'.encode('latin-1'), cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception(b'Impossible de telecharger la couverture depuis :'.encode('latin-1'), cached_url)