1""" 2data hash pandas / numpy objects 3""" 4import itertools 5from typing import Optional 6 7import numpy as np 8 9import pandas._libs.hashing as hashing 10 11from pandas.core.dtypes.common import ( 12 is_categorical_dtype, 13 is_extension_array_dtype, 14 is_list_like, 15) 16from pandas.core.dtypes.generic import ( 17 ABCDataFrame, 18 ABCIndexClass, 19 ABCMultiIndex, 20 ABCSeries, 21) 22 23# 16 byte long hashing key 24_default_hash_key = "0123456789123456" 25 26 27def combine_hash_arrays(arrays, num_items: int): 28 """ 29 Parameters 30 ---------- 31 arrays : generator 32 num_items : int 33 34 Should be the same as CPython's tupleobject.c 35 """ 36 try: 37 first = next(arrays) 38 except StopIteration: 39 return np.array([], dtype=np.uint64) 40 41 arrays = itertools.chain([first], arrays) 42 43 mult = np.uint64(1000003) 44 out = np.zeros_like(first) + np.uint64(0x345678) 45 for i, a in enumerate(arrays): 46 inverse_i = num_items - i 47 out ^= a 48 out *= mult 49 mult += np.uint64(82520 + inverse_i + inverse_i) 50 assert i + 1 == num_items, "Fed in wrong num_items" 51 out += np.uint64(97531) 52 return out 53 54 55def hash_pandas_object( 56 obj, 57 index: bool = True, 58 encoding: str = "utf8", 59 hash_key: Optional[str] = _default_hash_key, 60 categorize: bool = True, 61): 62 """ 63 Return a data hash of the Index/Series/DataFrame. 64 65 Parameters 66 ---------- 67 index : bool, default True 68 Include the index in the hash (if Series/DataFrame). 69 encoding : str, default 'utf8' 70 Encoding for data & key when strings. 71 hash_key : str, default _default_hash_key 72 Hash_key for string key to encode. 73 categorize : bool, default True 74 Whether to first categorize object arrays before hashing. This is more 75 efficient when the array contains duplicate values. 76 77 Returns 78 ------- 79 Series of uint64, same length as the object 80 """ 81 from pandas import Series 82 83 if hash_key is None: 84 hash_key = _default_hash_key 85 86 if isinstance(obj, ABCMultiIndex): 87 return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) 88 89 elif isinstance(obj, ABCIndexClass): 90 h = hash_array(obj._values, encoding, hash_key, categorize).astype( 91 "uint64", copy=False 92 ) 93 h = Series(h, index=obj, dtype="uint64", copy=False) 94 95 elif isinstance(obj, ABCSeries): 96 h = hash_array(obj._values, encoding, hash_key, categorize).astype( 97 "uint64", copy=False 98 ) 99 if index: 100 index_iter = ( 101 hash_pandas_object( 102 obj.index, 103 index=False, 104 encoding=encoding, 105 hash_key=hash_key, 106 categorize=categorize, 107 )._values 108 for _ in [None] 109 ) 110 arrays = itertools.chain([h], index_iter) 111 h = combine_hash_arrays(arrays, 2) 112 113 h = Series(h, index=obj.index, dtype="uint64", copy=False) 114 115 elif isinstance(obj, ABCDataFrame): 116 hashes = (hash_array(series._values) for _, series in obj.items()) 117 num_items = len(obj.columns) 118 if index: 119 index_hash_generator = ( 120 hash_pandas_object( 121 obj.index, 122 index=False, 123 encoding=encoding, 124 hash_key=hash_key, 125 categorize=categorize, 126 )._values 127 for _ in [None] 128 ) 129 num_items += 1 130 131 # keep `hashes` specifically a generator to keep mypy happy 132 _hashes = itertools.chain(hashes, index_hash_generator) 133 hashes = (x for x in _hashes) 134 h = combine_hash_arrays(hashes, num_items) 135 136 h = Series(h, index=obj.index, dtype="uint64", copy=False) 137 else: 138 raise TypeError(f"Unexpected type for hashing {type(obj)}") 139 return h 140 141 142def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): 143 """ 144 Hash an MultiIndex / list-of-tuples efficiently 145 146 Parameters 147 ---------- 148 vals : MultiIndex, list-of-tuples, or single tuple 149 encoding : str, default 'utf8' 150 hash_key : str, default _default_hash_key 151 152 Returns 153 ------- 154 ndarray of hashed values array 155 """ 156 is_tuple = False 157 if isinstance(vals, tuple): 158 vals = [vals] 159 is_tuple = True 160 elif not is_list_like(vals): 161 raise TypeError("must be convertible to a list-of-tuples") 162 163 from pandas import Categorical, MultiIndex 164 165 if not isinstance(vals, ABCMultiIndex): 166 vals = MultiIndex.from_tuples(vals) 167 168 # create a list-of-Categoricals 169 vals = [ 170 Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True) 171 for level in range(vals.nlevels) 172 ] 173 174 # hash the list-of-ndarrays 175 hashes = ( 176 _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals 177 ) 178 h = combine_hash_arrays(hashes, len(vals)) 179 if is_tuple: 180 h = h[0] 181 182 return h 183 184 185def _hash_categorical(c, encoding: str, hash_key: str): 186 """ 187 Hash a Categorical by hashing its categories, and then mapping the codes 188 to the hashes 189 190 Parameters 191 ---------- 192 c : Categorical 193 encoding : str 194 hash_key : str 195 196 Returns 197 ------- 198 ndarray of hashed values array, same size as len(c) 199 """ 200 # Convert ExtensionArrays to ndarrays 201 values = np.asarray(c.categories._values) 202 hashed = hash_array(values, encoding, hash_key, categorize=False) 203 204 # we have uint64, as we don't directly support missing values 205 # we don't want to use take_nd which will coerce to float 206 # instead, directly construct the result with a 207 # max(np.uint64) as the missing value indicator 208 # 209 # TODO: GH 15362 210 211 mask = c.isna() 212 if len(hashed): 213 result = hashed.take(c.codes) 214 else: 215 result = np.zeros(len(mask), dtype="uint64") 216 217 if mask.any(): 218 result[mask] = np.iinfo(np.uint64).max 219 220 return result 221 222 223def hash_array( 224 vals, 225 encoding: str = "utf8", 226 hash_key: str = _default_hash_key, 227 categorize: bool = True, 228): 229 """ 230 Given a 1d array, return an array of deterministic integers. 231 232 Parameters 233 ---------- 234 vals : ndarray, Categorical 235 encoding : str, default 'utf8' 236 Encoding for data & key when strings. 237 hash_key : str, default _default_hash_key 238 Hash_key for string key to encode. 239 categorize : bool, default True 240 Whether to first categorize object arrays before hashing. This is more 241 efficient when the array contains duplicate values. 242 243 Returns 244 ------- 245 1d uint64 numpy array of hash values, same length as the vals 246 """ 247 if not hasattr(vals, "dtype"): 248 raise TypeError("must pass a ndarray-like") 249 dtype = vals.dtype 250 251 # For categoricals, we hash the categories, then remap the codes to the 252 # hash values. (This check is above the complex check so that we don't ask 253 # numpy if categorical is a subdtype of complex, as it will choke). 254 if is_categorical_dtype(dtype): 255 return _hash_categorical(vals, encoding, hash_key) 256 elif is_extension_array_dtype(dtype): 257 vals, _ = vals._values_for_factorize() 258 dtype = vals.dtype 259 260 # we'll be working with everything as 64-bit values, so handle this 261 # 128-bit value early 262 if np.issubdtype(dtype, np.complex128): 263 return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) 264 265 # First, turn whatever array this is into unsigned 64-bit ints, if we can 266 # manage it. 267 elif isinstance(dtype, bool): 268 vals = vals.astype("u8") 269 elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): 270 vals = vals.view("i8").astype("u8", copy=False) 271 elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: 272 vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8") 273 else: 274 # With repeated values, its MUCH faster to categorize object dtypes, 275 # then hash and rename categories. We allow skipping the categorization 276 # when the values are known/likely to be unique. 277 if categorize: 278 from pandas import Categorical, Index, factorize 279 280 codes, categories = factorize(vals, sort=False) 281 cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) 282 return _hash_categorical(cat, encoding, hash_key) 283 284 try: 285 vals = hashing.hash_object_array(vals, hash_key, encoding) 286 except TypeError: 287 # we have mixed types 288 vals = hashing.hash_object_array( 289 vals.astype(str).astype(object), hash_key, encoding 290 ) 291 292 # Then, redistribute these 64-bit ints within the space of 64-bit ints 293 vals ^= vals >> 30 294 vals *= np.uint64(0xBF58476D1CE4E5B9) 295 vals ^= vals >> 27 296 vals *= np.uint64(0x94D049BB133111EB) 297 vals ^= vals >> 31 298 return vals 299