Connoisseur
Posts: 79
Karma: 24492
Join Date: Jul 2013
Location: France
Device: Kindle 4, PBk Lux 2, PBk Lux 3, K Aura, K Libra H2O, K Libra2
|
Metadata plugin Babelio
Hello,
I try to convert the babelio plugin. I did conversion using the command
Code:
python-modernize -w __init__.py
and I also tried this command
Code:
python-modernize -w --future-unicode __init__.py
I have corrected some error about ASCII but I am struggled with this error:
Quote:
Running identify query with parameters:
{'title': 'Victime 2117', 'authors': ['Jussi Adler-Olsen'], 'identifiers': {'isbn': '9782226396334', 'mobi-asin': 'B0814FYJJJ'}, 'timeout': 30}
Using plugins: Babelio (0, 4, 0)
The log from individual plugins is below
****************************** Babelio (0, 4, 0) ******************************
Found 0 results
Downloading from Babelio took 0.004427194595336914
Plugin Babelio failed
Traceback (most recent call last):
File "site-packages/calibre/ebooks/metadata/sources/identify.py", line 47, in run
File "calibre_plugins.babelio.__init__", line 72, in identify
File "calibre_plugins.babelio.__init__", line 64, in create_query
TypeError: can only concatenate str (not "bytes") to str
************************************************** ******************************
The identify phase took 0.23 seconds
The longest time (0.004427) was taken by: Babelio
Merging results from different sources
We have 0 merged results, merging took: 0.00 seconds
|
I did some search and understand that this error come from the fact in Python 2 the strings could be text and bytes but in Python3 this is no more possible.
But, as I do not know anything in code I am not able to find the way to solve this issue, I did some tries but without any success
Can someone help me to find a solution, please ?
Here below the code of __init__.py
Code:
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
print_function)
from six.moves import range
from six.moves import zip
__license__ = 'GPL v3'
__copyright__ = '2014, VdF>'
__docformat__ = 'restructuredtext'
import time, six.moves.http_cookiejar, unicodedata
from six.moves.urllib.parse import quote, unquote
from six.moves.queue import Queue, Empty
from difflib import SequenceMatcher
from lxml.html import fromstring, tostring
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.config import JSONConfig
class Babelio(Source):
name = 'Babelio'
description = 'Telecharge les metadonnees et couverture depuis Babelio.com'
author = 'VdF'
version = (0, 4, 0)
minimum_calibre_version = (0, 8, 0)
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'tags'])
has_html_comments = False
supports_gzip_transfer_encoding = True
BASE_URL = 'https://www.babelio.com'
def config_widget(self):
from calibre_plugins.babelio.config import ConfigWidget
return ConfigWidget(self)
def create_query(self, log, title=None, authors=None, identifiers={}):
q = ''
isbn = check_isbn(identifiers.get('isbn', None))
tokens = []
if title is not None:
tokens += title.replace('\u2019', ' ').replace("'", ' ').replace(' ', ' ').replace('\u2013', ' ').replace('\u0153', '\u006f\u0065')
if authors is not None and len(authors) >= 1:
for i in range(0, len(authors)):
tokens += ' '
if ',' in authors[i] :
auteur = authors[i].split(',')[0]
elif ' ' in authors[i] :
auteur = authors[i].rsplit(' ')[-1]
else :
auteur = authors[i]
tokens += auteur
tokens = [quote(t.encode('iso-8859-1')) for t in tokens]
q = ''.join(tokens)
q = '/resrecherche.php?Recherche=' + q + '&page=1&item_recherche=livres&tri=titre'
if not q:
return None
if isinstance(q, str):
q = q.encode('utf-8')
return Babelio.BASE_URL + q
def identify(self, log, result_queue, abort, title=None, authors=None,
identifiers={}, timeout=30):
matches = []
br = self.browser
cj = six.moves.http_cookiejar.LWPCookieJar()
br.set_cookiejar(cj)
query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
if query is None:
log.error(b'Metadonnees insuffisantes pour la requete'.encode('latin-1'))
return
log.info(b'Recherche de : %s' % unquote(query).encode('latin-1'))
response = br.open_novisit(query, timeout=timeout)
try:
raw = response.read().strip()
raw = raw.decode('latin-1', errors='replace')
#open('E:\\babelio.html', 'wb').write(raw)
if not raw:
log.error(b'Pas de resultat pour la requete : %r'.encode('latin-1') % unquote(query).encode('latin-1'))
return
root = fromstring(clean_ascii_chars(raw))
except:
msg = b'Impossible de parcourir la page babelio avec la requete : %r'.encode('latin-1') % unquote(query).encode('latin-1')
log.exception(msg)
return msg
self._parse_search_results(log, title, authors, root, matches, timeout)
if abort.is_set():
return
if not matches:
if title and authors and len(authors) > 1:
log.info(b'Pas de resultat avec les auteurs, on utilise uniquement le premier.'.encode('latin-1'))
return self.identify(log, result_queue, abort, title=title,
authors=[authors[0]], timeout=timeout)
elif authors and len(authors) == 1 :
log.info(b'Pas de resultat, on utilise uniquement le titre.'.encode('latin-1'))
return self.identify(log, result_queue, abort, title=title, timeout=timeout)
log.error(b'Pas de resultat pour la requete : %r'.encode('latin-1') % unquote(query.encode('latin-1')))
return
from calibre_plugins.babelio.worker import Worker
workers = [Worker(url, result_queue, br, log, i, self) for i, url in
enumerate(matches)]
for w in workers:
w.start()
# Don't send all requests at the same time
time.sleep(0.1)
while not abort.is_set():
a_worker_is_alive = False
for w in workers:
w.join(0.1)
if abort.is_set():
break
if w.is_alive():
a_worker_is_alive = True
if not a_worker_is_alive:
break
return None
def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout):
orig_aut = None
if orig_authors is not None:
orig_aut = [author.split(',')[0] for author in orig_authors if (',' in author)] \
+ [author.split(' ')[1] for author in orig_authors if (' ' in author)]
# log.info([author.split(',')[0] for author in orig_authors if (',' in author)])
# log.info([author.split(' ')[1] for author in orig_authors if (' ' in author)])
non_trouve = root.xpath('//div[@class="module_t1"]/h2')
'''if non_trouve :
non_trouve_text = non_trouve[0].text_content()
if '(0)' in non_trouve_text :
return'''
def minussa(chaine):
chaine = str(chaine.lower())
chnorm = unicodedata.normalize('NFKD', chaine)
return "".join([car for car in chnorm if not unicodedata.combining(car)])
def simil(mot1, mot2, ratio):
mot1, mot2 = minussa(mot1), minussa(mot2)
return SequenceMatcher(None, mot1, mot2).ratio() >= ratio
def is_simil(orig_aut, dict_res, ratio):
for aut_compl in (v.text for v in dict_res.values()) :
for a in orig_aut :
if simil(aut_compl.split()[-1], a, ratio):
return True
return False
titre_res = root.xpath(".//*[@id='page_corps']/div/div[3]/div[2]/table/tbody/tr/td[2]/a[1]")
# log.info('t_res', titre_res)
if len(titre_res) == 0 :
return
else :
matches.append(Babelio.BASE_URL + titre_res[0].get('href'))
return
aut_res = root.xpath(".//*[@id='page_corps']/div/div[3]/div[3]/table/tbody/tr/td[3]/a")
dict_res = dict(list(zip(titre_res, aut_res)))
# log.info('dict', dict_res)
if orig_aut is not None :
ratio = 0.7
for k in dict_res.keys():
if is_simil(orig_aut, dict_res, ratio):
matches.append(Babelio.BASE_URL + k.get('href'))
else :
for i in range(0, len(titre_res)):
matches.append(Babelio.BASE_URL + titre_res[i].get('href'))
matches = matches[:5]
# log.info('mat', matches)
def get_cached_cover_url(self, identifiers):
if JSONConfig('plugins/Babelio').get('cover', False) == False:
return None
url = None
bab_id = identifiers.get('babelio', None)
if bab_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
bab_id = self.cached_isbn_to_identifier(isbn)
if bab_id is not None:
url = self.cached_identifier_to_cover_url(bab_id)
return url
def download_cover(self, log, result_queue, abort,
title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
if JSONConfig('plugins/Babelio').get('cover', False) == False:
return
cached_url = self.get_cached_cover_url(identifiers)
log.info('cache :', cached_url)
if cached_url is None:
log.info('Pas de cache, on lance identify')
rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
results = []
while True:
try:
results.append(rq.get_nowait())
except Empty:
break
# results.sort(key=self.identify_results_keygen(
# title=title, authors=authors, identifiers=identifiers))
for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None:
break
if cached_url is None:
log.info(b'Pas de couverture trouvee.'.encode('latin-1'))
return
if abort.is_set():
return
br = self.browser
log.info(b'On telecharge la couverture depuis :'.encode('latin-1'), cached_url)
try:
cdata = br.open_novisit(cached_url, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception(b'Impossible de telecharger la couverture depuis :'.encode('latin-1'), cached_url)
Last edited by druss67; 10-04-2020 at 11:32 AM.
|