/usr/lib/python3/dist-packages/joblib/hashing.py is in python3-joblib 0.9.4-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | """
Fast cryptographic hash of Python objects, with a special case for fast
hashing of numpy arrays.
"""
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# Copyright (c) 2009 Gael Varoquaux
# License: BSD Style, 3 clauses.
import pickle
import hashlib
import sys
import types
import struct
import io
from ._compat import _bytes_or_unicode, PY3_OR_LATER
if PY3_OR_LATER:
Pickler = pickle._Pickler
else:
Pickler = pickle.Pickler
class _ConsistentSet(object):
""" Class used to ensure the hash of Sets is preserved
whatever the order of its items.
"""
def __init__(self, set_sequence):
# Forces order of elements in set to ensure consistent hash.
try:
# Trying first to order the set assuming the type of elements is
# consistent and orderable.
# This fails on python 3 when elements are unorderable
# but we keep it in a try as it's faster.
self._sequence = sorted(set_sequence)
except TypeError:
# If elements are unorderable, sorting them using their hash.
# This is slower but works in any case.
self._sequence = sorted((hash(e) for e in set_sequence))
class _MyHash(object):
""" Class used to hash objects that won't normally pickle """
def __init__(self, *args):
self.args = args
class Hasher(Pickler):
""" A subclass of pickler, to do cryptographic hashing, rather than
pickling.
"""
def __init__(self, hash_name='md5'):
self.stream = io.BytesIO()
# By default we want a pickle protocol that only changes with
# the major python version and not the minor one
protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER
else pickle.HIGHEST_PROTOCOL)
Pickler.__init__(self, self.stream, protocol=protocol)
# Initialise the hash obj
self._hash = hashlib.new(hash_name)
def hash(self, obj, return_digest=True):
try:
self.dump(obj)
except pickle.PicklingError as e:
e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
raise
dumps = self.stream.getvalue()
self._hash.update(dumps)
if return_digest:
return self._hash.hexdigest()
def save(self, obj):
if isinstance(obj, (types.MethodType, type({}.pop))):
# the Pickler cannot pickle instance methods; here we decompose
# them into components that make them uniquely identifiable
if hasattr(obj, '__func__'):
func_name = obj.__func__.__name__
else:
func_name = obj.__name__
inst = obj.__self__
if type(inst) == type(pickle):
obj = _MyHash(func_name, inst.__name__)
elif inst is None:
# type(None) or type(module) do not pickle
obj = _MyHash(func_name, inst)
else:
cls = obj.__self__.__class__
obj = _MyHash(func_name, inst, cls)
Pickler.save(self, obj)
def memoize(self, obj):
# We want hashing to be sensitive to value instead of reference.
# For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
# to hash to the same value and that's why we disable memoization
# for strings
if isinstance(obj, _bytes_or_unicode):
return
Pickler.memoize(self, obj)
# The dispatch table of the pickler is not accessible in Python
# 3, as these lines are only bugware for IPython, we skip them.
def save_global(self, obj, name=None, pack=struct.pack):
# We have to override this method in order to deal with objects
# defined interactively in IPython that are not injected in
# __main__
kwargs = dict(name=name, pack=pack)
if sys.version_info >= (3, 4):
del kwargs['pack']
try:
Pickler.save_global(self, obj, **kwargs)
except pickle.PicklingError:
Pickler.save_global(self, obj, **kwargs)
module = getattr(obj, "__module__", None)
if module == '__main__':
my_name = name
if my_name is None:
my_name = obj.__name__
mod = sys.modules[module]
if not hasattr(mod, my_name):
# IPython doesn't inject the variables define
# interactively in __main__
setattr(mod, my_name, obj)
dispatch = Pickler.dispatch.copy()
# builtin
dispatch[type(len)] = save_global
# type
dispatch[type(object)] = save_global
# classobj
dispatch[type(Pickler)] = save_global
# function
dispatch[type(pickle.dump)] = save_global
def _batch_setitems(self, items):
# forces order of keys in dict to ensure consistent hash.
try:
# Trying first to compare dict assuming the type of keys is
# consistent and orderable.
# This fails on python 3 when keys are unorderable
# but we keep it in a try as it's faster.
Pickler._batch_setitems(self, iter(sorted(items)))
except TypeError:
# If keys are unorderable, sorting them using their hash. This is
# slower but works in any case.
Pickler._batch_setitems(self, iter(sorted((hash(k), v)
for k, v in items)))
def save_set(self, set_items):
# forces order of items in Set to ensure consistent hash
Pickler.save(self, _ConsistentSet(set_items))
dispatch[type(set())] = save_set
class NumpyHasher(Hasher):
""" Special case the hasher for when numpy is loaded.
"""
def __init__(self, hash_name='md5', coerce_mmap=False):
"""
Parameters
----------
hash_name: string
The hash algorithm to be used
coerce_mmap: boolean
Make no difference between np.memmap and np.ndarray
objects.
"""
self.coerce_mmap = coerce_mmap
Hasher.__init__(self, hash_name=hash_name)
# delayed import of numpy, to avoid tight coupling
import numpy as np
self.np = np
if hasattr(np, 'getbuffer'):
self._getbuffer = np.getbuffer
else:
self._getbuffer = memoryview
def save(self, obj):
""" Subclass the save method, to hash ndarray subclass, rather
than pickling them. Off course, this is a total abuse of
the Pickler class.
"""
if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
# Compute a hash of the object:
try:
# memoryview is not supported for some dtypes,
# e.g. datetime64, see
# https://github.com/numpy/numpy/issues/4983. The
# workaround is to view the array as bytes before
# taking the memoryview
obj_bytes_view = obj.view(self.np.uint8)
self._hash.update(self._getbuffer(obj_bytes_view))
# ValueError is raised by .view when the array is not contiguous
# BufferError is raised by Python 3 in the hash update if
# the array is Fortran rather than C contiguous
except (ValueError, BufferError):
# Cater for non-single-segment arrays: this creates a
# copy, and thus aleviates this issue.
# XXX: There might be a more efficient way of doing this
obj_bytes_view = obj.flatten().view(self.np.uint8)
self._hash.update(self._getbuffer(obj_bytes_view))
# We store the class, to be able to distinguish between
# Objects with the same binary content, but different
# classes.
if self.coerce_mmap and isinstance(obj, self.np.memmap):
# We don't make the difference between memmap and
# normal ndarrays, to be able to reload previously
# computed results with memmap.
klass = self.np.ndarray
else:
klass = obj.__class__
# We also return the dtype and the shape, to distinguish
# different views on the same data with different dtypes.
# The object will be pickled by the pickler hashed at the end.
obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
elif isinstance(obj, self.np.dtype):
# Atomic dtype objects are interned by their default constructor:
# np.dtype('f8') is np.dtype('f8')
# This interning is not maintained by a
# pickle.loads + pickle.dumps cycle, because __reduce__
# uses copy=True in the dtype constructor. This
# non-deterministic behavior causes the internal memoizer
# of the hasher to generate different hash values
# depending on the history of the dtype object.
# To prevent the hash from being sensitive to this, we use
# .descr which is a full (and never interned) description of
# the array dtype according to the numpy doc.
klass = obj.__class__
obj = (klass, ('HASHED', obj.descr))
Hasher.save(self, obj)
def hash(obj, hash_name='md5', coerce_mmap=False):
""" Quick calculation of a hash to identify uniquely Python objects
containing numpy arrays.
Parameters
-----------
hash_name: 'md5' or 'sha1'
Hashing algorithm used. sha1 is supposedly safer, but md5 is
faster.
coerce_mmap: boolean
Make no difference between np.memmap and np.ndarray
"""
if 'numpy' in sys.modules:
hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
else:
hasher = Hasher(hash_name=hash_name)
return hasher.hash(obj)
|