@Kovid - the Find Duplicates plugin does its binary comparison in two passes.
The first pass adds candidates to a map keyed by file size, using os.stat() to get each format's size and modified datetime:
Spoiler:
Code:
def _find_candidate_by_file_size(self, book_id, candidates_map):
    formats = self.db.formats(book_id, index_is_id=True, verify_formats=False)
    count = 0
    if not formats:
        return count
    for fmt in formats.split(','):
        fmt_path = self.db.format_abspath(book_id, fmt, index_is_id=True)
        if fmt_path:
            try:
                # Key the candidates map by file size; the mtime is carried
                # along so the hashing pass can reuse results cached from a
                # previous run.
                stats = os.stat(fmt_path)
                mtime = stats.st_mtime
                size = stats.st_size
                candidates_map[size].add((book_id, fmt, fmt_path, mtime))
                count += 1
            except Exception:
                traceback.print_exc()
    return count
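For context on how this pass is driven, here is a minimal sketch of the kind of surrounding code the method above assumes - the defaultdict(set), the collect_size_candidates name and the finder parameter are mine for illustration, not the plugin's actual code. It keeps only sizes that collide, since a unique size can never be a binary duplicate:
Spoiler:
Code:
from collections import defaultdict

def collect_size_candidates(finder, book_ids):
    # 'finder' stands in for the plugin object that exposes
    # _find_candidate_by_file_size(); the defaultdict(set) is what that
    # method relies on when it does candidates_map[size].add(...)
    candidates_map = defaultdict(set)
    for book_id in book_ids:
        finder._find_candidate_by_file_size(book_id, candidates_map)
    # A size seen only once cannot be a binary duplicate, so drop it
    # before the more expensive hashing pass
    return {size: entries for size, entries in candidates_map.items()
            if len(entries) > 1}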
The second pass runs over the reduced subset (formats whose sizes collide) and computes a SHA-256 hash for each, reusing a hash cached from a previous run when the modified datetime is unchanged:
Spoiler:
Code:
def _find_candidate_by_hash(self, book_id, fmt, fmt_path, mtime, size,
                            candidates_map, hash_map, result_hash_map):
    # Work out whether we can skip hashing this file by reusing
    # book plugin data stored by a previous run
    book_data = hash_map.get(book_id, {}).get(fmt, {})
    if book_data.get('mtime', None) == mtime:
        sha = book_data.get('sha', None)
        size = book_data.get('size', None)
        if sha and size:
            candidates_map[(sha, size)].add(book_id)
            self._add_to_hash_map(result_hash_map, book_id, fmt, book_data)
            return
    try:
        with open(fmt_path, 'rb') as f:
            content = f.read()
        sha = hashlib.sha256()
        sha.update(content)
        key = (sha.hexdigest(), size)
        candidates_map[key].add(book_id)
        # Store our plugin book data for future repeat scanning
        book_data['mtime'] = mtime
        book_data['sha'] = sha.hexdigest()
        book_data['size'] = size
        self._add_to_hash_map(result_hash_map, book_id, fmt, book_data)
    except Exception:
        traceback.print_exc()
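And a similar sketch of how the second pass could be driven and the duplicate groups read back out - again, collect_hash_duplicates, finder and size_collisions are illustrative names and assumptions, not the plugin's real driver. Every (sha, size) key that ends up with more than one book id is a group of binary-identical formats, and result_hash_map holds the data to persist for the next run:
Spoiler:
Code:
from collections import defaultdict

def collect_hash_duplicates(finder, size_collisions, hash_map):
    # 'finder', 'size_collisions' (output of the size pass) and the persisted
    # 'hash_map' are illustrative assumptions, not the plugin's real driver
    candidates_map = defaultdict(set)
    result_hash_map = {}
    for size, entries in size_collisions.items():
        for book_id, fmt, fmt_path, mtime in entries:
            finder._find_candidate_by_hash(book_id, fmt, fmt_path, mtime, size,
                                           candidates_map, hash_map,
                                           result_hash_map)
    # Each (sha, size) key with more than one book id is a group of
    # binary-identical formats; result_hash_map gets persisted for next time
    duplicate_groups = [ids for ids in candidates_map.values() if len(ids) > 1]
    return duplicate_groups, result_hash_map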