1"""
2data hash pandas / numpy objects
3"""
4import itertools
5from typing import Optional
6
7import numpy as np
8
9import pandas._libs.hashing as hashing
10
11from pandas.core.dtypes.common import (
12    is_categorical_dtype,
13    is_extension_array_dtype,
14    is_list_like,
15)
16from pandas.core.dtypes.generic import (
17    ABCDataFrame,
18    ABCIndexClass,
19    ABCMultiIndex,
20    ABCSeries,
21)
22
# 16 byte long hashing key; pandas._libs.hashing requires the key to be
# exactly 16 bytes once encoded
_default_hash_key = "0123456789123456"
25
26
def combine_hash_arrays(arrays, num_items: int):
    """
    Combine a stream of uint64 hash arrays into a single uint64 hash array.

    Parameters
    ----------
    arrays : generator
        Yields uint64 ndarrays, all of the same length.
    num_items : int
        Expected number of arrays yielded by ``arrays``.

    Returns
    -------
    ndarray of uint64

    Notes
    -----
    Should be the same as CPython's tupleobject.c
    """
    try:
        head = next(arrays)
    except StopIteration:
        # nothing to combine
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    result = np.zeros_like(head) + np.uint64(0x345678)
    count = 0
    for arr in itertools.chain([head], arrays):
        remaining = num_items - count
        result ^= arr
        result *= multiplier
        multiplier += np.uint64(82520 + remaining + remaining)
        count += 1
    assert count == num_items, "Fed in wrong num_items"
    result += np.uint64(97531)
    return result
53
54
def hash_pandas_object(
    obj,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: Optional[str] = _default_hash_key,
    categorize: bool = True,
):
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series, or DataFrame.
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndexClass):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        h = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # single-element generator so the index hash is computed lazily
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        # BUG FIX: forward encoding/hash_key/categorize to the per-column
        # hashing. Previously the defaults were always used here, so a
        # caller-supplied hash_key/encoding was silently ignored for
        # DataFrame columns (while the Index/Series branches honored it).
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        h = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")
    return h
140
141
def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
    """
    Hash an MultiIndex / list-of-tuples efficiently

    Parameters
    ----------
    vals : MultiIndex, list-of-tuples, or single tuple
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray of hashed values array
    """
    # a bare tuple is treated as a length-1 list of tuples; remember this so
    # we can return a scalar hash at the end
    single_tuple = isinstance(vals, tuple)
    if single_tuple:
        vals = [vals]
    elif not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import Categorical, MultiIndex

    if not isinstance(vals, ABCMultiIndex):
        vals = MultiIndex.from_tuples(vals)

    # represent each level as a Categorical of its codes/categories
    level_cats = [
        Categorical(vals.codes[lev], vals.levels[lev], ordered=False, fastpath=True)
        for lev in range(vals.nlevels)
    ]

    # hash each level lazily, then fold the per-level hashes together
    level_hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key)
        for cat in level_cats
    )
    result = combine_hash_arrays(level_hashes, len(level_cats))

    return result[0] if single_tuple else result
183
184
def _hash_categorical(c, encoding: str, hash_key: str):
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes.

    Parameters
    ----------
    c : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray of hashed values array, same size as len(c)
    """
    # Convert ExtensionArrays to ndarrays before hashing the categories
    cat_values = np.asarray(c.categories._values)
    hashed_cats = hash_array(cat_values, encoding, hash_key, categorize=False)

    # We have uint64 and don't directly support missing values, so we avoid
    # take_nd (which would coerce to float) and instead construct the result
    # directly, using max(np.uint64) as the missing-value indicator.
    #
    # TODO: GH 15362

    na_mask = c.isna()
    if len(hashed_cats):
        out = hashed_cats.take(c.codes)
    else:
        # no categories at all: every position is missing
        out = np.zeros(len(na_mask), dtype="uint64")

    if na_mask.any():
        out[na_mask] = np.iinfo(np.uint64).max

    return out
221
222
def hash_array(
    vals,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    Raises
    ------
    TypeError
        If ``vals`` does not have a ``dtype`` attribute.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == bool:
        # BUG FIX: the previous check was `isinstance(dtype, bool)`, which is
        # always False (`dtype` is a np.dtype instance, never a bool), so
        # boolean arrays fell through to the slow object/categorize path
        # below instead of taking this cheap uint64 cast. NOTE: this changes
        # the hash values produced for bool arrays relative to the buggy
        # version, matching the upstream pandas fix.
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # datetimelike: reinterpret the i8 epoch values as u8
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        # reinterpret the raw bytes as an unsigned int of the same width,
        # then widen to u8
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import Categorical, Index, factorize

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (bit-mixing finalizer, same constants as splitmix64)
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
299