1""" 2Template for each `dtype` helper function for hashtable 3 4WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in 5""" 6 7 8{{py: 9 10# name 11cimported_types = ['float32', 12 'float64', 13 'int8', 14 'int16', 15 'int32', 16 'int64', 17 'pymap', 18 'str', 19 'strbox', 20 'uint8', 21 'uint16', 22 'uint32', 23 'uint64'] 24}} 25 26{{for name in cimported_types}} 27from pandas._libs.khash cimport ( 28 kh_destroy_{{name}}, 29 kh_exist_{{name}}, 30 kh_get_{{name}}, 31 kh_init_{{name}}, 32 kh_put_{{name}}, 33 kh_resize_{{name}}, 34) 35{{endfor}} 36 37# ---------------------------------------------------------------------- 38# VectorData 39# ---------------------------------------------------------------------- 40 41from pandas._libs.tslibs.util cimport get_c_string 42from pandas._libs.missing cimport C_NA 43 44{{py: 45 46# name, dtype, c_type 47# the generated StringVector is not actually used 48# but is included for completeness (rather ObjectVector is used 49# for uniques in hashtables) 50 51dtypes = [('Float64', 'float64', 'float64_t'), 52 ('Float32', 'float32', 'float32_t'), 53 ('Int64', 'int64', 'int64_t'), 54 ('Int32', 'int32', 'int32_t'), 55 ('Int16', 'int16', 'int16_t'), 56 ('Int8', 'int8', 'int8_t'), 57 ('String', 'string', 'char *'), 58 ('UInt64', 'uint64', 'uint64_t'), 59 ('UInt32', 'uint32', 'uint32_t'), 60 ('UInt16', 'uint16', 'uint16_t'), 61 ('UInt8', 'uint8', 'uint8_t')] 62}} 63 64{{for name, dtype, c_type in dtypes}} 65 66 67{{if dtype != 'int64'}} 68 69ctypedef struct {{name}}VectorData: 70 {{c_type}} *data 71 Py_ssize_t n, m 72 73{{endif}} 74 75 76@cython.wraparound(False) 77@cython.boundscheck(False) 78cdef inline void append_data_{{dtype}}({{name}}VectorData *data, 79 {{c_type}} x) nogil: 80 81 data.data[data.n] = x 82 data.n += 1 83 84{{endfor}} 85 86ctypedef fused vector_data: 87 Int64VectorData 88 Int32VectorData 89 Int16VectorData 90 Int8VectorData 91 UInt64VectorData 92 UInt32VectorData 93 UInt16VectorData 94 UInt8VectorData 
95 Float64VectorData 96 Float32VectorData 97 StringVectorData 98 99cdef inline bint needs_resize(vector_data *data) nogil: 100 return data.n == data.m 101 102# ---------------------------------------------------------------------- 103# Vector 104# ---------------------------------------------------------------------- 105 106{{py: 107 108# name, dtype, c_type 109dtypes = [('Float64', 'float64', 'float64_t'), 110 ('UInt64', 'uint64', 'uint64_t'), 111 ('Int64', 'int64', 'int64_t'), 112 ('Float32', 'float32', 'float32_t'), 113 ('UInt32', 'uint32', 'uint32_t'), 114 ('Int32', 'int32', 'int32_t'), 115 ('UInt16', 'uint16', 'uint16_t'), 116 ('Int16', 'int16', 'int16_t'), 117 ('UInt8', 'uint8', 'uint8_t'), 118 ('Int8', 'int8', 'int8_t')] 119 120}} 121 122{{for name, dtype, c_type in dtypes}} 123 124cdef class {{name}}Vector: 125 126 {{if dtype != 'int64'}} 127 cdef: 128 bint external_view_exists 129 {{name}}VectorData *data 130 ndarray ao 131 {{endif}} 132 133 def __cinit__(self): 134 self.data = <{{name}}VectorData *>PyMem_Malloc( 135 sizeof({{name}}VectorData)) 136 if not self.data: 137 raise MemoryError() 138 self.external_view_exists = False 139 self.data.n = 0 140 self.data.m = _INIT_VEC_CAP 141 self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) 142 self.data.data = <{{c_type}}*>self.ao.data 143 144 cdef resize(self): 145 self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) 146 self.ao.resize(self.data.m, refcheck=False) 147 self.data.data = <{{c_type}}*>self.ao.data 148 149 def __dealloc__(self): 150 if self.data is not NULL: 151 PyMem_Free(self.data) 152 self.data = NULL 153 154 def __len__(self) -> int: 155 return self.data.n 156 157 cpdef to_array(self): 158 if self.data.m != self.data.n: 159 if self.external_view_exists: 160 # should never happen 161 raise ValueError("should have raised on append()") 162 self.ao.resize(self.data.n, refcheck=False) 163 self.data.m = self.data.n 164 self.external_view_exists = True 165 return self.ao 166 167 cdef inline void 
append(self, {{c_type}} x): 168 169 if needs_resize(self.data): 170 if self.external_view_exists: 171 raise ValueError("external reference but " 172 "Vector.resize() needed") 173 self.resize() 174 175 append_data_{{dtype}}(self.data, x) 176 177 cdef extend(self, const {{c_type}}[:] x): 178 for i in range(len(x)): 179 self.append(x[i]) 180 181{{endfor}} 182 183cdef class StringVector: 184 185 cdef: 186 StringVectorData *data 187 bint external_view_exists 188 189 def __cinit__(self): 190 self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData)) 191 if not self.data: 192 raise MemoryError() 193 self.external_view_exists = False 194 self.data.n = 0 195 self.data.m = _INIT_VEC_CAP 196 self.data.data = <char **>malloc(self.data.m * sizeof(char *)) 197 if not self.data.data: 198 raise MemoryError() 199 200 cdef resize(self): 201 cdef: 202 char **orig_data 203 Py_ssize_t i, m 204 205 m = self.data.m 206 self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) 207 208 orig_data = self.data.data 209 self.data.data = <char **>malloc(self.data.m * sizeof(char *)) 210 if not self.data.data: 211 raise MemoryError() 212 for i in range(m): 213 self.data.data[i] = orig_data[i] 214 215 def __dealloc__(self): 216 if self.data is not NULL: 217 if self.data.data is not NULL: 218 free(self.data.data) 219 PyMem_Free(self.data) 220 self.data = NULL 221 222 def __len__(self) -> int: 223 return self.data.n 224 225 def to_array(self): 226 cdef: 227 ndarray ao 228 Py_ssize_t n 229 object val 230 231 ao = np.empty(self.data.n, dtype=object) 232 for i in range(self.data.n): 233 val = self.data.data[i] 234 ao[i] = val 235 self.external_view_exists = True 236 self.data.m = self.data.n 237 return ao 238 239 cdef inline void append(self, char *x): 240 241 if needs_resize(self.data): 242 self.resize() 243 244 append_data_string(self.data, x) 245 246 cdef extend(self, ndarray[object] x): 247 for i in range(len(x)): 248 self.append(x[i]) 249 250 251cdef class ObjectVector: 252 253 cdef: 254 
PyObject **data 255 Py_ssize_t n, m 256 ndarray ao 257 bint external_view_exists 258 259 def __cinit__(self): 260 self.external_view_exists = False 261 self.n = 0 262 self.m = _INIT_VEC_CAP 263 self.ao = np.empty(_INIT_VEC_CAP, dtype=object) 264 self.data = <PyObject**>self.ao.data 265 266 def __len__(self) -> int: 267 return self.n 268 269 cdef inline append(self, object obj): 270 if self.n == self.m: 271 if self.external_view_exists: 272 raise ValueError("external reference but " 273 "Vector.resize() needed") 274 self.m = max(self.m * 2, _INIT_VEC_CAP) 275 self.ao.resize(self.m, refcheck=False) 276 self.data = <PyObject**>self.ao.data 277 278 Py_INCREF(obj) 279 self.data[self.n] = <PyObject*>obj 280 self.n += 1 281 282 def to_array(self): 283 if self.m != self.n: 284 if self.external_view_exists: 285 raise ValueError("should have raised on append()") 286 self.ao.resize(self.n, refcheck=False) 287 self.m = self.n 288 self.external_view_exists = True 289 return self.ao 290 291 cdef extend(self, ndarray[object] x): 292 for i in range(len(x)): 293 self.append(x[i]) 294 295# ---------------------------------------------------------------------- 296# HashTable 297# ---------------------------------------------------------------------- 298 299 300cdef class HashTable: 301 302 pass 303 304{{py: 305 306# name, dtype, float_group 307dtypes = [('Float64', 'float64', True), 308 ('UInt64', 'uint64', False), 309 ('Int64', 'int64', False), 310 ('Float32', 'float32', True), 311 ('UInt32', 'uint32', False), 312 ('Int32', 'int32', False), 313 ('UInt16', 'uint16', False), 314 ('Int16', 'int16', False), 315 ('UInt8', 'uint8', False), 316 ('Int8', 'int8', False)] 317 318}} 319 320 321{{for name, dtype, float_group in dtypes}} 322 323cdef class {{name}}HashTable(HashTable): 324 325 def __cinit__(self, int64_t size_hint=1): 326 self.table = kh_init_{{dtype}}() 327 if size_hint is not None: 328 size_hint = min(size_hint, SIZE_HINT_LIMIT) 329 kh_resize_{{dtype}}(self.table, size_hint) 
330 331 def __len__(self) -> int: 332 return self.table.size 333 334 def __dealloc__(self): 335 if self.table is not NULL: 336 kh_destroy_{{dtype}}(self.table) 337 self.table = NULL 338 339 def __contains__(self, object key): 340 cdef: 341 khiter_t k 342 k = kh_get_{{dtype}}(self.table, key) 343 return k != self.table.n_buckets 344 345 def sizeof(self, deep=False): 346 """ return the size of my table in bytes """ 347 overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) 348 for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) 349 for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys 350 sizeof(Py_ssize_t)) # vals 351 return overhead + for_flags + for_pairs 352 353 cpdef get_item(self, {{dtype}}_t val): 354 cdef: 355 khiter_t k 356 k = kh_get_{{dtype}}(self.table, val) 357 if k != self.table.n_buckets: 358 return self.table.vals[k] 359 else: 360 raise KeyError(val) 361 362 cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): 363 cdef: 364 khiter_t k 365 int ret = 0 366 367 k = kh_put_{{dtype}}(self.table, key, &ret) 368 self.table.keys[k] = key 369 if kh_exist_{{dtype}}(self.table, k): 370 self.table.vals[k] = val 371 else: 372 raise KeyError(key) 373 374 @cython.boundscheck(False) 375 def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values): 376 cdef: 377 Py_ssize_t i, n = len(values) 378 int ret = 0 379 {{dtype}}_t key 380 khiter_t k 381 382 with nogil: 383 for i in range(n): 384 key = keys[i] 385 k = kh_put_{{dtype}}(self.table, key, &ret) 386 self.table.vals[k] = <Py_ssize_t>values[i] 387 388 @cython.boundscheck(False) 389 def map_locations(self, const {{dtype}}_t[:] values): 390 cdef: 391 Py_ssize_t i, n = len(values) 392 int ret = 0 393 {{dtype}}_t val 394 khiter_t k 395 396 with nogil: 397 for i in range(n): 398 val = values[i] 399 k = kh_put_{{dtype}}(self.table, val, &ret) 400 self.table.vals[k] = i 401 402 @cython.boundscheck(False) 403 def lookup(self, const {{dtype}}_t[:] values): 404 cdef: 405 Py_ssize_t i, n = 
len(values) 406 int ret = 0 407 {{dtype}}_t val 408 khiter_t k 409 intp_t[:] locs = np.empty(n, dtype=np.intp) 410 411 with nogil: 412 for i in range(n): 413 val = values[i] 414 k = kh_get_{{dtype}}(self.table, val) 415 if k != self.table.n_buckets: 416 locs[i] = self.table.vals[k] 417 else: 418 locs[i] = -1 419 420 return np.asarray(locs) 421 422 @cython.boundscheck(False) 423 @cython.wraparound(False) 424 def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, 425 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, 426 object na_value=None, bint ignore_na=False, 427 object mask=None, bint return_inverse=False): 428 """ 429 Calculate unique values and labels (no sorting!) 430 431 Parameters 432 ---------- 433 values : ndarray[{{dtype}}] 434 Array of values of which unique will be calculated 435 uniques : {{name}}Vector 436 Vector into which uniques will be written 437 count_prior : Py_ssize_t, default 0 438 Number of existing entries in uniques 439 na_sentinel : Py_ssize_t, default -1 440 Sentinel value used for all NA-values in inverse 441 na_value : object, default None 442 Value to identify as missing. If na_value is None, then 443 any value "val" satisfying val != val is considered missing. 444 If na_value is not None, then _additionally_, any value "val" 445 satisfying val == na_value is considered missing. 446 ignore_na : boolean, default False 447 Whether NA-values should be ignored for calculating the uniques. If 448 True, the labels corresponding to missing values will be set to 449 na_sentinel. 450 mask : ndarray[bool], optional 451 If not None, the mask is used as indicator for missing values 452 (True = missing, False = valid) instead of `na_value` or 453 condition "val != val". 454 return_inverse : boolean, default False 455 Whether the mapping of the original array values to their location 456 in the vector of uniques should be returned. 
457 458 Returns 459 ------- 460 uniques : ndarray[{{dtype}}] 461 Unique values of input, not sorted 462 labels : ndarray[int64] (if return_inverse=True) 463 The labels from values to uniques 464 """ 465 cdef: 466 Py_ssize_t i, idx, count = count_prior, n = len(values) 467 int64_t[:] labels 468 int ret = 0 469 {{dtype}}_t val, na_value2 470 khiter_t k 471 {{name}}VectorData *ud 472 bint use_na_value, use_mask 473 uint8_t[:] mask_values 474 475 if return_inverse: 476 labels = np.empty(n, dtype=np.int64) 477 ud = uniques.data 478 use_na_value = na_value is not None 479 use_mask = mask is not None 480 481 if use_mask: 482 mask_values = mask.view("uint8") 483 484 if use_na_value: 485 # We need this na_value2 because we want to allow users 486 # to *optionally* specify an NA sentinel *of the correct* type. 487 # We use None, to make it optional, which requires `object` type 488 # for the parameter. To please the compiler, we use na_value2, 489 # which is only used if it's *specified*. 490 na_value2 = <{{dtype}}_t>na_value 491 else: 492 na_value2 = 0 493 494 with nogil: 495 for i in range(n): 496 val = values[i] 497 498 if ignore_na and use_mask: 499 if mask_values[i]: 500 labels[i] = na_sentinel 501 continue 502 elif ignore_na and ( 503 {{if not name.lower().startswith(("uint", "int"))}} 504 val != val or 505 {{endif}} 506 (use_na_value and val == na_value2) 507 ): 508 # if missing values do not count as unique values (i.e. 
if 509 # ignore_na is True), skip the hashtable entry for them, 510 # and replace the corresponding label with na_sentinel 511 labels[i] = na_sentinel 512 continue 513 514 k = kh_get_{{dtype}}(self.table, val) 515 516 if k == self.table.n_buckets: 517 # k hasn't been seen yet 518 k = kh_put_{{dtype}}(self.table, val, &ret) 519 520 if needs_resize(ud): 521 with gil: 522 if uniques.external_view_exists: 523 raise ValueError("external reference to " 524 "uniques held, but " 525 "Vector.resize() needed") 526 uniques.resize() 527 append_data_{{dtype}}(ud, val) 528 if return_inverse: 529 self.table.vals[k] = count 530 labels[i] = count 531 count += 1 532 elif return_inverse: 533 # k falls into a previous bucket 534 # only relevant in case we need to construct the inverse 535 idx = self.table.vals[k] 536 labels[i] = idx 537 538 if return_inverse: 539 return uniques.to_array(), np.asarray(labels) 540 return uniques.to_array() 541 542 def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): 543 """ 544 Calculate unique values and labels (no sorting!) 545 546 Parameters 547 ---------- 548 values : ndarray[{{dtype}}] 549 Array of values of which unique will be calculated 550 return_inverse : boolean, default False 551 Whether the mapping of the original array values to their location 552 in the vector of uniques should be returned. 553 554 Returns 555 ------- 556 uniques : ndarray[{{dtype}}] 557 Unique values of input, not sorted 558 labels : ndarray[int64] (if return_inverse) 559 The labels from values to uniques 560 """ 561 uniques = {{name}}Vector() 562 return self._unique(values, uniques, ignore_na=False, 563 return_inverse=return_inverse) 564 565 def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, 566 object na_value=None, object mask=None): 567 """ 568 Calculate unique values and labels (no sorting!) 569 570 Missing values are not included in the "uniques" for this method. 
571 The labels for any missing values will be set to "na_sentinel" 572 573 Parameters 574 ---------- 575 values : ndarray[{{dtype}}] 576 Array of values of which unique will be calculated 577 na_sentinel : Py_ssize_t, default -1 578 Sentinel value used for all NA-values in inverse 579 na_value : object, default None 580 Value to identify as missing. If na_value is None, then 581 any value "val" satisfying val != val is considered missing. 582 If na_value is not None, then _additionally_, any value "val" 583 satisfying val == na_value is considered missing. 584 mask : ndarray[bool], optional 585 If not None, the mask is used as indicator for missing values 586 (True = missing, False = valid) instead of `na_value` or 587 condition "val != val". 588 589 Returns 590 ------- 591 uniques : ndarray[{{dtype}}] 592 Unique values of input, not sorted 593 labels : ndarray[int64] 594 The labels from values to uniques 595 """ 596 uniques_vector = {{name}}Vector() 597 return self._unique(values, uniques_vector, na_sentinel=na_sentinel, 598 na_value=na_value, ignore_na=True, mask=mask, 599 return_inverse=True) 600 601 def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, 602 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, 603 object na_value=None): 604 _, labels = self._unique(values, uniques, count_prior=count_prior, 605 na_sentinel=na_sentinel, na_value=na_value, 606 ignore_na=True, return_inverse=True) 607 return labels 608 609 @cython.boundscheck(False) 610 def get_labels_groupby(self, const {{dtype}}_t[:] values): 611 cdef: 612 Py_ssize_t i, n = len(values) 613 intp_t[:] labels 614 Py_ssize_t idx, count = 0 615 int ret = 0 616 {{dtype}}_t val 617 khiter_t k 618 {{name}}Vector uniques = {{name}}Vector() 619 {{name}}VectorData *ud 620 621 labels = np.empty(n, dtype=np.intp) 622 ud = uniques.data 623 624 with nogil: 625 for i in range(n): 626 val = values[i] 627 628 # specific for groupby 629 {{if dtype != 'uint64'}} 630 if val < 0: 631 labels[i] = -1 
632 continue 633 {{endif}} 634 635 k = kh_get_{{dtype}}(self.table, val) 636 if k != self.table.n_buckets: 637 idx = self.table.vals[k] 638 labels[i] = idx 639 else: 640 k = kh_put_{{dtype}}(self.table, val, &ret) 641 self.table.vals[k] = count 642 643 if needs_resize(ud): 644 with gil: 645 uniques.resize() 646 append_data_{{dtype}}(ud, val) 647 labels[i] = count 648 count += 1 649 650 arr_uniques = uniques.to_array() 651 652 return np.asarray(labels), arr_uniques 653 654{{endfor}} 655 656 657cdef class StringHashTable(HashTable): 658 # these by-definition *must* be strings 659 # or a sentinel np.nan / None missing value 660 na_string_sentinel = '__nan__' 661 662 def __init__(self, int64_t size_hint=1): 663 self.table = kh_init_str() 664 if size_hint is not None: 665 size_hint = min(size_hint, SIZE_HINT_LIMIT) 666 kh_resize_str(self.table, size_hint) 667 668 def __dealloc__(self): 669 if self.table is not NULL: 670 kh_destroy_str(self.table) 671 self.table = NULL 672 673 def sizeof(self, deep=False): 674 overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) 675 for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) 676 for_pairs = self.table.n_buckets * (sizeof(char *) + # keys 677 sizeof(Py_ssize_t)) # vals 678 return overhead + for_flags + for_pairs 679 680 cpdef get_item(self, str val): 681 cdef: 682 khiter_t k 683 const char *v 684 v = get_c_string(val) 685 686 k = kh_get_str(self.table, v) 687 if k != self.table.n_buckets: 688 return self.table.vals[k] 689 else: 690 raise KeyError(val) 691 692 cpdef set_item(self, str key, Py_ssize_t val): 693 cdef: 694 khiter_t k 695 int ret = 0 696 const char *v 697 698 v = get_c_string(key) 699 700 k = kh_put_str(self.table, v, &ret) 701 self.table.keys[k] = v 702 if kh_exist_str(self.table, k): 703 self.table.vals[k] = val 704 else: 705 raise KeyError(key) 706 707 @cython.boundscheck(False) 708 def get_indexer(self, ndarray[object] values): 709 cdef: 710 Py_ssize_t i, n = len(values) 711 ndarray[intp_t] labels 
= np.empty(n, dtype=np.intp) 712 intp_t *resbuf = <intp_t*>labels.data 713 khiter_t k 714 kh_str_t *table = self.table 715 const char *v 716 const char **vecs 717 718 vecs = <const char **>malloc(n * sizeof(char *)) 719 for i in range(n): 720 val = values[i] 721 v = get_c_string(val) 722 vecs[i] = v 723 724 with nogil: 725 for i in range(n): 726 k = kh_get_str(table, vecs[i]) 727 if k != table.n_buckets: 728 resbuf[i] = table.vals[k] 729 else: 730 resbuf[i] = -1 731 732 free(vecs) 733 return labels 734 735 @cython.boundscheck(False) 736 def lookup(self, ndarray[object] values): 737 cdef: 738 Py_ssize_t i, n = len(values) 739 int ret = 0 740 object val 741 const char *v 742 khiter_t k 743 intp_t[:] locs = np.empty(n, dtype=np.intp) 744 745 # these by-definition *must* be strings 746 vecs = <const char **>malloc(n * sizeof(char *)) 747 for i in range(n): 748 val = values[i] 749 750 if isinstance(val, str): 751 # GH#31499 if we have a np.str_ get_c_string won't recognize 752 # it as a str, even though isinstance does. 753 v = get_c_string(<str>val) 754 else: 755 v = get_c_string(self.na_string_sentinel) 756 vecs[i] = v 757 758 with nogil: 759 for i in range(n): 760 v = vecs[i] 761 k = kh_get_str(self.table, v) 762 if k != self.table.n_buckets: 763 locs[i] = self.table.vals[k] 764 else: 765 locs[i] = -1 766 767 free(vecs) 768 return np.asarray(locs) 769 770 @cython.boundscheck(False) 771 def map_locations(self, ndarray[object] values): 772 cdef: 773 Py_ssize_t i, n = len(values) 774 int ret = 0 775 object val 776 const char *v 777 const char **vecs 778 khiter_t k 779 780 # these by-definition *must* be strings 781 vecs = <const char **>malloc(n * sizeof(char *)) 782 for i in range(n): 783 val = values[i] 784 785 if isinstance(val, str): 786 # GH#31499 if we have a np.str_ get_c_string won't recognize 787 # it as a str, even though isinstance does. 
788 v = get_c_string(<str>val) 789 else: 790 v = get_c_string(self.na_string_sentinel) 791 vecs[i] = v 792 793 with nogil: 794 for i in range(n): 795 v = vecs[i] 796 k = kh_put_str(self.table, v, &ret) 797 self.table.vals[k] = i 798 free(vecs) 799 800 @cython.boundscheck(False) 801 @cython.wraparound(False) 802 def _unique(self, ndarray[object] values, ObjectVector uniques, 803 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, 804 object na_value=None, bint ignore_na=False, 805 bint return_inverse=False): 806 """ 807 Calculate unique values and labels (no sorting!) 808 809 Parameters 810 ---------- 811 values : ndarray[object] 812 Array of values of which unique will be calculated 813 uniques : ObjectVector 814 Vector into which uniques will be written 815 count_prior : Py_ssize_t, default 0 816 Number of existing entries in uniques 817 na_sentinel : Py_ssize_t, default -1 818 Sentinel value used for all NA-values in inverse 819 na_value : object, default None 820 Value to identify as missing. If na_value is None, then any value 821 that is not a string is considered missing. If na_value is 822 not None, then _additionally_ any value "val" satisfying 823 val == na_value is considered missing. 824 ignore_na : boolean, default False 825 Whether NA-values should be ignored for calculating the uniques. If 826 True, the labels corresponding to missing values will be set to 827 na_sentinel. 828 return_inverse : boolean, default False 829 Whether the mapping of the original array values to their location 830 in the vector of uniques should be returned. 
831 832 Returns 833 ------- 834 uniques : ndarray[object] 835 Unique values of input, not sorted 836 labels : ndarray[int64] (if return_inverse=True) 837 The labels from values to uniques 838 """ 839 cdef: 840 Py_ssize_t i, idx, count = count_prior, n = len(values) 841 int64_t[:] labels 842 int64_t[:] uindexer 843 int ret = 0 844 object val 845 const char *v 846 const char **vecs 847 khiter_t k 848 bint use_na_value 849 850 if return_inverse: 851 labels = np.zeros(n, dtype=np.int64) 852 uindexer = np.empty(n, dtype=np.int64) 853 use_na_value = na_value is not None 854 855 # assign pointers and pre-filter out missing (if ignore_na) 856 vecs = <const char **>malloc(n * sizeof(char *)) 857 for i in range(n): 858 val = values[i] 859 860 if (ignore_na 861 and (not isinstance(val, str) 862 or (use_na_value and val == na_value))): 863 # if missing values do not count as unique values (i.e. if 864 # ignore_na is True), we can skip the actual value, and 865 # replace the label with na_sentinel directly 866 labels[i] = na_sentinel 867 else: 868 # if ignore_na is False, we also stringify NaN/None/etc. 
869 try: 870 v = get_c_string(<str>val) 871 except UnicodeEncodeError: 872 v = get_c_string(<str>repr(val)) 873 vecs[i] = v 874 875 # compute 876 with nogil: 877 for i in range(n): 878 if ignore_na and labels[i] == na_sentinel: 879 # skip entries for ignored missing values (see above) 880 continue 881 882 v = vecs[i] 883 k = kh_get_str(self.table, v) 884 if k == self.table.n_buckets: 885 # k hasn't been seen yet 886 k = kh_put_str(self.table, v, &ret) 887 uindexer[count] = i 888 if return_inverse: 889 self.table.vals[k] = count 890 labels[i] = <int64_t>count 891 count += 1 892 elif return_inverse: 893 # k falls into a previous bucket 894 # only relevant in case we need to construct the inverse 895 idx = self.table.vals[k] 896 labels[i] = <int64_t>idx 897 898 free(vecs) 899 900 # uniques 901 for i in range(count): 902 uniques.append(values[uindexer[i]]) 903 904 if return_inverse: 905 return uniques.to_array(), np.asarray(labels) 906 return uniques.to_array() 907 908 def unique(self, ndarray[object] values, bint return_inverse=False): 909 """ 910 Calculate unique values and labels (no sorting!) 911 912 Parameters 913 ---------- 914 values : ndarray[object] 915 Array of values of which unique will be calculated 916 return_inverse : boolean, default False 917 Whether the mapping of the original array values to their location 918 in the vector of uniques should be returned. 919 920 Returns 921 ------- 922 uniques : ndarray[object] 923 Unique values of input, not sorted 924 labels : ndarray[int64] (if return_inverse) 925 The labels from values to uniques 926 """ 927 uniques = ObjectVector() 928 return self._unique(values, uniques, ignore_na=False, 929 return_inverse=return_inverse) 930 931 def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, 932 object na_value=None, object mask=None): 933 """ 934 Calculate unique values and labels (no sorting!) 935 936 Missing values are not included in the "uniques" for this method. 
937 The labels for any missing values will be set to "na_sentinel" 938 939 Parameters 940 ---------- 941 values : ndarray[object] 942 Array of values of which unique will be calculated 943 na_sentinel : Py_ssize_t, default -1 944 Sentinel value used for all NA-values in inverse 945 na_value : object, default None 946 Value to identify as missing. If na_value is None, then any value 947 that is not a string is considered missing. If na_value is 948 not None, then _additionally_ any value "val" satisfying 949 val == na_value is considered missing. 950 mask : ndarray[bool], optional 951 Not yet implementd for StringHashTable. 952 953 Returns 954 ------- 955 uniques : ndarray[object] 956 Unique values of input, not sorted 957 labels : ndarray[int64] 958 The labels from values to uniques 959 """ 960 uniques_vector = ObjectVector() 961 return self._unique(values, uniques_vector, na_sentinel=na_sentinel, 962 na_value=na_value, ignore_na=True, 963 return_inverse=True) 964 965 def get_labels(self, ndarray[object] values, ObjectVector uniques, 966 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, 967 object na_value=None): 968 _, labels = self._unique(values, uniques, count_prior=count_prior, 969 na_sentinel=na_sentinel, na_value=na_value, 970 ignore_na=True, return_inverse=True) 971 return labels 972 973 974cdef class PyObjectHashTable(HashTable): 975 976 def __init__(self, int64_t size_hint=1): 977 self.table = kh_init_pymap() 978 if size_hint is not None: 979 size_hint = min(size_hint, SIZE_HINT_LIMIT) 980 kh_resize_pymap(self.table, size_hint) 981 982 def __dealloc__(self): 983 if self.table is not NULL: 984 kh_destroy_pymap(self.table) 985 self.table = NULL 986 987 def __len__(self) -> int: 988 return self.table.size 989 990 def __contains__(self, object key): 991 cdef: 992 khiter_t k 993 hash(key) 994 995 k = kh_get_pymap(self.table, <PyObject*>key) 996 return k != self.table.n_buckets 997 998 def sizeof(self, deep=False): 999 """ return the size of my table in 
bytes """ 1000 overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*) 1001 for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t) 1002 for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys 1003 sizeof(Py_ssize_t)) # vals 1004 return overhead + for_flags + for_pairs 1005 1006 cpdef get_item(self, object val): 1007 cdef: 1008 khiter_t k 1009 1010 k = kh_get_pymap(self.table, <PyObject*>val) 1011 if k != self.table.n_buckets: 1012 return self.table.vals[k] 1013 else: 1014 raise KeyError(val) 1015 1016 cpdef set_item(self, object key, Py_ssize_t val): 1017 cdef: 1018 khiter_t k 1019 int ret = 0 1020 char* buf 1021 1022 hash(key) 1023 1024 k = kh_put_pymap(self.table, <PyObject*>key, &ret) 1025 # self.table.keys[k] = key 1026 if kh_exist_pymap(self.table, k): 1027 self.table.vals[k] = val 1028 else: 1029 raise KeyError(key) 1030 1031 def map_locations(self, ndarray[object] values): 1032 cdef: 1033 Py_ssize_t i, n = len(values) 1034 int ret = 0 1035 object val 1036 khiter_t k 1037 1038 for i in range(n): 1039 val = values[i] 1040 hash(val) 1041 1042 k = kh_put_pymap(self.table, <PyObject*>val, &ret) 1043 self.table.vals[k] = i 1044 1045 def lookup(self, ndarray[object] values): 1046 cdef: 1047 Py_ssize_t i, n = len(values) 1048 int ret = 0 1049 object val 1050 khiter_t k 1051 intp_t[:] locs = np.empty(n, dtype=np.intp) 1052 1053 for i in range(n): 1054 val = values[i] 1055 hash(val) 1056 1057 k = kh_get_pymap(self.table, <PyObject*>val) 1058 if k != self.table.n_buckets: 1059 locs[i] = self.table.vals[k] 1060 else: 1061 locs[i] = -1 1062 1063 return np.asarray(locs) 1064 1065 @cython.boundscheck(False) 1066 @cython.wraparound(False) 1067 def _unique(self, ndarray[object] values, ObjectVector uniques, 1068 Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, 1069 object na_value=None, bint ignore_na=False, 1070 bint return_inverse=False): 1071 """ 1072 Calculate unique values and labels (no sorting!) 

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : boolean, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            int64_t[:] labels
            int ret = 0
            object val
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        for i in range(n):
            val = values[i]
            # hash() raises TypeError for unhashable values up front,
            # before we try to insert the object into the table.
            hash(val)

            # Missing-value test: pd.NA, NaN-like (val != val), None, or
            # an explicitly supplied na_value.
            if ignore_na and (
                (val is C_NA)
                or (val != val)
                or (val is None)
                or (use_na_value and val == na_value)
            ):
                # if missing values do not count as unique values (i.e. if
                # ignore_na is True), skip the hashtable entry for them, and
                # replace the corresponding label with na_sentinel
                labels[i] = na_sentinel
                continue

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k == self.table.n_buckets:
                # k hasn't been seen yet
                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                uniques.append(val)
                if return_inverse:
                    self.table.vals[k] = count
                    labels[i] = count
                count += 1
            elif return_inverse:
                # k falls into a previous bucket
                # only relevant in case we need to construct the inverse
                idx = self.table.vals[k]
                labels[i] = idx

        if return_inverse:
            return uniques.to_array(), np.asarray(labels)
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : boolean, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[int64]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=True,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # Thin wrapper over _unique that discards the uniques array and
        # returns only the inverse labels.
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels