""" Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ {{py: # name cimported_types = ['float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'pymap', 'str', 'strbox', 'uint8', 'uint16', 'uint32', 'uint64'] }} {{for name in cimported_types}} from pandas._libs.khash cimport ( kh_destroy_{{name}}, kh_exist_{{name}}, kh_get_{{name}}, kh_init_{{name}}, kh_put_{{name}}, kh_resize_{{name}}, ) {{endfor}} # ---------------------------------------------------------------------- # VectorData # ---------------------------------------------------------------------- from pandas._libs.tslibs.util cimport get_c_string from pandas._libs.missing cimport C_NA {{py: # name, dtype, c_type # the generated StringVector is not actually used # but is included for completeness (rather ObjectVector is used # for uniques in hashtables) dtypes = [('Float64', 'float64', 'float64_t'), ('Float32', 'float32', 'float32_t'), ('Int64', 'int64', 'int64_t'), ('Int32', 'int32', 'int32_t'), ('Int16', 'int16', 'int16_t'), ('Int8', 'int8', 'int8_t'), ('String', 'string', 'char *'), ('UInt64', 'uint64', 'uint64_t'), ('UInt32', 'uint32', 'uint32_t'), ('UInt16', 'uint16', 'uint16_t'), ('UInt8', 'uint8', 'uint8_t')] }} {{for name, dtype, c_type in dtypes}} {{if dtype != 'int64'}} ctypedef struct {{name}}VectorData: {{c_type}} *data Py_ssize_t n, m {{endif}} @cython.wraparound(False) @cython.boundscheck(False) cdef inline void append_data_{{dtype}}({{name}}VectorData *data, {{c_type}} x) nogil: data.data[data.n] = x data.n += 1 {{endfor}} ctypedef fused vector_data: Int64VectorData Int32VectorData Int16VectorData Int8VectorData UInt64VectorData UInt32VectorData UInt16VectorData UInt8VectorData Float64VectorData Float32VectorData StringVectorData cdef inline bint needs_resize(vector_data *data) nogil: return data.n == data.m # ---------------------------------------------------------------------- # Vector # 
# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------

{{py:

# name, dtype, c_type
dtypes = [('Float64', 'float64', 'float64_t'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('Int32', 'int32', 'int32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('Int16', 'int16', 'int16_t'),
          ('UInt8', 'uint8', 'uint8_t'),
          ('Int8', 'int8', 'int8_t')]

}}

{{for name, dtype, c_type in dtypes}}

cdef class {{name}}Vector:
    """
    Growable vector of {{c_type}} backed by an ndarray, so the collected
    values can be handed back to Python (via ``to_array``) without a copy.
    """

    {{if dtype != 'int64'}}
    # NOTE(review): attributes for the int64 variant are presumably
    # declared in the .pxd (it is cimported elsewhere) -- TODO confirm.
    cdef:
        bint external_view_exists
        {{name}}VectorData *data
        ndarray ao
    {{endif}}

    def __cinit__(self):
        self.data = <{{name}}VectorData *>PyMem_Malloc(
            sizeof({{name}}VectorData))
        if not self.data:
            raise MemoryError()
        self.external_view_exists = False
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
        self.data.data = <{{c_type}}*>self.ao.data

    cdef resize(self):
        # Grow geometrically (4x) and re-point the raw buffer, since
        # ndarray.resize may reallocate the underlying memory.
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
        self.ao.resize(self.data.m, refcheck=False)
        self.data.data = <{{c_type}}*>self.ao.data

    def __dealloc__(self):
        if self.data is not NULL:
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef to_array(self):
        """
        Trim the backing ndarray to the filled length and return it.

        Afterwards the buffer is externally visible, so further in-place
        resizing is forbidden (``append`` will raise instead).
        """
        if self.data.m != self.data.n:
            if self.external_view_exists:
                # should never happen
                raise ValueError("should have raised on append()")

            # overwrite ao with the trimmed array
            self.ao.resize(self.data.n, refcheck=False)
            self.data.m = self.data.n
        self.external_view_exists = True
        return self.ao

    cdef inline void append(self, {{c_type}} x):

        if needs_resize(self.data):
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.resize()

        append_data_{{dtype}}(self.data, x)

    cdef extend(self, const {{c_type}}[:] x):
        for i in range(len(x)):
            self.append(x[i])

{{endfor}}


cdef class StringVector:
    """
    Growable vector of C string pointers.  Not used for uniques in the
    hashtables (ObjectVector is), but kept for completeness.
    """

    cdef:
        StringVectorData *data
        bint external_view_exists

    def __cinit__(self):
        # cast restored: PyMem_Malloc returns void*, which Cython will
        # not implicitly convert to StringVectorData*
        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
        if not self.data:
            raise MemoryError()
        self.external_view_exists = False
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()

    cdef resize(self):
        cdef:
            char **orig_data
            Py_ssize_t i, m

        m = self.data.m
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

        orig_data = self.data.data
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()
        for i in range(m):
            self.data.data[i] = orig_data[i]
        # Free the old pointer array now that its contents have been
        # copied over; previously it was leaked on every resize.
        free(orig_data)

    def __dealloc__(self):
        if self.data is not NULL:
            if self.data.data is not NULL:
                free(self.data.data)
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    def to_array(self):
        """Copy the stored C strings into a new object-dtype ndarray."""
        cdef:
            ndarray ao
            Py_ssize_t n
            object val

        ao = np.empty(self.data.n, dtype=object)
        for i in range(self.data.n):
            val = self.data.data[i]
            ao[i] = val
        self.external_view_exists = True
        self.data.m = self.data.n
        return ao

    cdef inline void append(self, char *x):

        if needs_resize(self.data):
            self.resize()

        append_data_string(self.data, x)

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])


cdef class ObjectVector:
    """Growable vector of PyObject* backed by an object-dtype ndarray."""

    cdef:
        PyObject **data
        Py_ssize_t n, m
        ndarray ao
        bint external_view_exists

    def __cinit__(self):
        self.external_view_exists = False
        self.n = 0
        self.m = _INIT_VEC_CAP
        self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
        # cast restored: ndarray.data is char*, not PyObject**
        self.data = <PyObject**>self.ao.data

    def __len__(self) -> int:
        return self.n

    cdef inline append(self, object obj):
        if self.n == self.m:
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.m = max(self.m * 2, _INIT_VEC_CAP)
            self.ao.resize(self.m, refcheck=False)
            self.data = <PyObject**>self.ao.data

        # The slot stores a raw pointer, so take a reference explicitly
        # before writing it; cast restored for the raw-pointer store.
        Py_INCREF(obj)
        self.data[self.n] = <PyObject*>obj
        self.n += 1

    def to_array(self):
        """Trim the backing ndarray to the filled length and return it."""
        if self.m != self.n:
            if self.external_view_exists:
                raise ValueError("should have raised on append()")
            self.ao.resize(self.n, refcheck=False)
            self.m = self.n
        self.external_view_exists = True
        return self.ao

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])


# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------


cdef class HashTable:
    # Abstract base for the concrete khash-backed tables below.

    pass

{{py:

# name, dtype, float_group
dtypes = [('Float64', 'float64', True),
          ('UInt64', 'uint64', False),
          ('Int64', 'int64', False),
          ('Float32', 'float32', True),
          ('UInt32', 'uint32', False),
          ('Int32', 'int32', False),
          ('UInt16', 'uint16', False),
          ('Int16', 'int16', False),
          ('UInt8', 'uint8', False),
          ('Int8', 'int8', False)]

}}


{{for name, dtype, float_group in dtypes}}

cdef class {{name}}HashTable(HashTable):
    """khash-backed map from {{dtype}} keys to Py_ssize_t positions."""

    def __cinit__(self, int64_t size_hint=1):
        self.table = kh_init_{{dtype}}()
        # NOTE(review): size_hint is a C int64_t, so `is not None` is
        # always true after coercion; kept for interface parity.
        if size_hint is not None:
            size_hint = min(size_hint, SIZE_HINT_LIMIT)
            kh_resize_{{dtype}}(self.table, size_hint)

    def __len__(self) -> int:
        return self.table.size

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_{{dtype}}(self.table)
            self.table = NULL

    def __contains__(self, object key):
        cdef:
            khiter_t k
        k = kh_get_{{dtype}}(self.table, key)
        # khash returns n_buckets as the "not found" iterator
        return k != self.table.n_buckets

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    cpdef get_item(self, {{dtype}}_t val):
        """Return the stored position for `val`; raise KeyError if absent."""
        cdef:
            khiter_t k
        k = kh_get_{{dtype}}(self.table, val)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
        """Insert or overwrite the mapping key -> val."""
        cdef:
            khiter_t k
            int ret = 0

        k = kh_put_{{dtype}}(self.table, key, &ret)
        self.table.keys[k] = key
        if kh_exist_{{dtype}}(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    @cython.boundscheck(False)
    def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values):
        # Bulk-insert keys[i] -> values[i].
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{dtype}}_t key
            khiter_t k

        with nogil:
            for i in range(n):
                key = keys[i]
                k = kh_put_{{dtype}}(self.table, key, &ret)
                self.table.vals[k] = values[i]

    @cython.boundscheck(False)
    def map_locations(self, const {{dtype}}_t[:] values):
        # Map each value to its (last) position in `values`.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{dtype}}_t val
            khiter_t k

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_put_{{dtype}}(self.table, val, &ret)
                self.table.vals[k] = i

    @cython.boundscheck(False)
    def lookup(self, const {{dtype}}_t[:] values):
        # Return the stored position for each value, -1 where absent.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{dtype}}_t val
            khiter_t k
            intp_t[:] locs = np.empty(n, dtype=np.intp)

        with nogil:
            for i in range(n):
                val = values[i]
                k = kh_get_{{dtype}}(self.table, val)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                object mask=None, bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        uniques : {{name}}Vector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : boolean, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            int64_t[:] labels
            int ret = 0
            {{dtype}}_t val, na_value2
            khiter_t k
            {{name}}VectorData *ud
            bint use_na_value, use_mask
            uint8_t[:] mask_values

        if return_inverse:
            labels = np.empty(n, dtype=np.int64)
        ud = uniques.data
        use_na_value = na_value is not None
        use_mask = mask is not None

        if use_mask:
            mask_values = mask.view("uint8")

        if use_na_value:
            # We need this na_value2 because we want to allow users
            # to *optionally* specify an NA sentinel *of the correct* type.
            # We use None, to make it optional, which requires `object` type
            # for the parameter. To please the compiler, we use na_value2,
            # which is only used if it's *specified*.
            na_value2 = <{{dtype}}_t>na_value
        else:
            na_value2 = 0

        with nogil:
            for i in range(n):
                val = values[i]

                if ignore_na and use_mask:
                    if mask_values[i]:
                        labels[i] = na_sentinel
                        continue
                elif ignore_na and (
                    {{if not name.lower().startswith(("uint", "int"))}}
                    val != val or
                    {{endif}}
                    (use_na_value and val == na_value2)
                ):
                    # if missing values do not count as unique values (i.e. if
                    # ignore_na is True), skip the hashtable entry for them,
                    # and replace the corresponding label with na_sentinel
                    labels[i] = na_sentinel
                    continue

                k = kh_get_{{dtype}}(self.table, val)
                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    if needs_resize(ud):
                        with gil:
                            if uniques.external_view_exists:
                                raise ValueError("external reference to "
                                                 "uniques held, but "
                                                 "Vector.resize() needed")
                            uniques.resize()
                    append_data_{{dtype}}(ud, val)
                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                        count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        if return_inverse:
            return uniques.to_array(), np.asarray(labels)
        return uniques.to_array()

    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse)
            The labels from values to uniques
        """
        uniques = {{name}}Vector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[int64]
            The labels from values to uniques
        """
        uniques_vector = {{name}}Vector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=True, mask=mask,
                            return_inverse=True)

    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # Thin wrapper around _unique returning only the inverse labels.
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels

    @cython.boundscheck(False)
    def get_labels_groupby(self, const {{dtype}}_t[:] values):
        # Like factorize, but negative values are mapped straight to -1
        # (groupby-specific convention for signed dtypes).
        cdef:
            Py_ssize_t i, n = len(values)
            intp_t[:] labels
            Py_ssize_t idx, count = 0
            int ret = 0
            {{dtype}}_t val
            khiter_t k
            {{name}}Vector uniques = {{name}}Vector()
            {{name}}VectorData *ud

        labels = np.empty(n, dtype=np.intp)
        ud = uniques.data

        with nogil:
            for i in range(n):
                val = values[i]

                # specific for groupby
                {{if dtype != 'uint64'}}
                if val < 0:
                    labels[i] = -1
                    continue
                {{endif}}

                k = kh_get_{{dtype}}(self.table, val)
                if k != self.table.n_buckets:
                    idx = self.table.vals[k]
                    labels[i] = idx
                else:
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = count

                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{{dtype}}(ud, val)
                    labels[i] = count
                    count += 1

        arr_uniques = uniques.to_array()

        return np.asarray(labels), arr_uniques

{{endfor}}


cdef class StringHashTable(HashTable):
    # these by-definition *must* be strings
    # or a sentinel np.nan / None missing value
    na_string_sentinel = '__nan__'

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_str()
        if size_hint is not None:
            size_hint = min(size_hint, SIZE_HINT_LIMIT)
            kh_resize_str(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_str(self.table)
            self.table = NULL

    def sizeof(self, deep=False):
        """Return an estimate of the table's memory footprint in bytes."""
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(char *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    cpdef get_item(self, str val):
        """Return the stored position for `val`; raise KeyError if absent."""
        cdef:
            khiter_t k
            const char *v
        v = get_c_string(val)

        k = kh_get_str(self.table, v)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, str key, Py_ssize_t val):
        """Insert or overwrite the mapping key -> val."""
        cdef:
            khiter_t k
            int ret = 0
            const char *v

        v = get_c_string(key)

        k = kh_put_str(self.table, v, &ret)
        self.table.keys[k] = v
        if kh_exist_str(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    @cython.boundscheck(False)
    def get_indexer(self, ndarray[object] values):
        # Vectorized lookup: -1 for values not in the table.
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
            intp_t *resbuf = <intp_t*>labels.data
            khiter_t k
            kh_str_t *table = self.table
            const char *v
            const char **vecs

        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            v = get_c_string(val)
            vecs[i] = v

        with nogil:
            for i in range(n):
                k = kh_get_str(table, vecs[i])
                if k != table.n_buckets:
                    resbuf[i] = table.vals[k]
                else:
                    resbuf[i] = -1

        free(vecs)
        return labels

    @cython.boundscheck(False)
    def lookup(self, ndarray[object] values):
        # Like get_indexer, but non-strings are replaced by the NA sentinel.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            khiter_t k
            intp_t[:] locs = np.empty(n, dtype=np.intp)

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        free(vecs)
        return np.asarray(locs)

    @cython.boundscheck(False)
    def map_locations(self, ndarray[object] values):
        # Map each (stringified) value to its position in `values`.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_put_str(self.table, v, &ret)
                self.table.vals[k] = i
        free(vecs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        ignore_na : boolean, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            int64_t[:] labels
            int64_t[:] uindexer
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.zeros(n, dtype=np.int64)
        uindexer = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        # assign pointers and pre-filter out missing (if ignore_na)
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if (ignore_na
                    and (not isinstance(val, str)
                         or (use_na_value and val == na_value))):
                # if missing values do not count as unique values (i.e. if
                # ignore_na is True), we can skip the actual value, and
                # replace the label with na_sentinel directly
                labels[i] = na_sentinel
            else:
                # if ignore_na is False, we also stringify NaN/None/etc.
                try:
                    v = get_c_string(<str>val)
                except UnicodeEncodeError:
                    v = get_c_string(<str>repr(val))
                vecs[i] = v

        # compute
        with nogil:
            for i in range(n):
                if ignore_na and labels[i] == na_sentinel:
                    # skip entries for ignored missing values (see above)
                    continue

                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        free(vecs)

        # uniques
        for i in range(count):
            uniques.append(values[uindexer[i]])

        if return_inverse:
            return uniques.to_array(), np.asarray(labels)
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=True,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # Thin wrapper around _unique returning only the inverse labels.
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels


cdef class PyObjectHashTable(HashTable):
    """khash-backed map from arbitrary hashable objects to positions."""

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_pymap()
        if size_hint is not None:
            size_hint = min(size_hint, SIZE_HINT_LIMIT)
            kh_resize_pymap(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_pymap(self.table)
            self.table = NULL

    def __len__(self) -> int:
        return self.table.size

    def __contains__(self, object key):
        cdef:
            khiter_t k
        # eagerly raise TypeError on unhashable keys
        hash(key)

        k = kh_get_pymap(self.table, <PyObject*>key)
        return k != self.table.n_buckets

    def sizeof(self, deep=False):
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(PyObject *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    cpdef get_item(self, object val):
        """Return the stored position for `val`; raise KeyError if absent."""
        cdef:
            khiter_t k

        k = kh_get_pymap(self.table, <PyObject*>val)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, object key, Py_ssize_t val):
        """Insert or overwrite the mapping key -> val."""
        cdef:
            khiter_t k
            int ret = 0
            char* buf

        hash(key)

        k = kh_put_pymap(self.table, <PyObject*>key, &ret)
        self.table.keys[k] = <PyObject*>key
        if kh_exist_pymap(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    def map_locations(self, ndarray[object] values):
        # Map each value to its (last) position in `values`.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_put_pymap(self.table, <PyObject*>val, &ret)
            self.table.vals[k] = i

    def lookup(self, ndarray[object] values):
        # Return the stored position for each value, -1 where absent.
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k
            intp_t[:] locs = np.empty(n, dtype=np.intp)

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : boolean, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            int64_t[:] labels
            int ret = 0
            object val
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        for i in range(n):
            val = values[i]
            hash(val)

            if ignore_na and (
                (val is C_NA)
                or (val != val)
                or (val is None)
                or (use_na_value and val == na_value)
            ):
                # if missing values do not count as unique values (i.e. if
                # ignore_na is True), skip the hashtable entry for them, and
                # replace the corresponding label with na_sentinel
                labels[i] = na_sentinel
                continue

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k == self.table.n_buckets:
                # k hasn't been seen yet
                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                uniques.append(val)
                if return_inverse:
                    self.table.vals[k] = count
                    labels[i] = count
                    count += 1
            elif return_inverse:
                # k falls into a previous bucket
                # only relevant in case we need to construct the inverse
                idx = self.table.vals[k]
                labels[i] = idx

        if return_inverse:
            return uniques.to_array(), np.asarray(labels)
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=True,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # Thin wrapper around _unique returning only the inverse labels.
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels