1"""
2Template for each `dtype` helper function for hashtable
3
4WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
5"""
6
7
8{{py:
9
10# name
11cimported_types = ['float32',
12                   'float64',
13                   'int8',
14                   'int16',
15                   'int32',
16                   'int64',
17                   'pymap',
18                   'str',
19                   'strbox',
20                   'uint8',
21                   'uint16',
22                   'uint32',
23                   'uint64']
24}}
25
26{{for name in cimported_types}}
27from pandas._libs.khash cimport (
28    kh_destroy_{{name}},
29    kh_exist_{{name}},
30    kh_get_{{name}},
31    kh_init_{{name}},
32    kh_put_{{name}},
33    kh_resize_{{name}},
34)
35{{endfor}}
36
37# ----------------------------------------------------------------------
38# VectorData
39# ----------------------------------------------------------------------
40
41from pandas._libs.tslibs.util cimport get_c_string
42from pandas._libs.missing cimport C_NA
43
44{{py:
45
46# name, dtype, c_type
47# the generated StringVector is not actually used
48# but is included for completeness (rather ObjectVector is used
49# for uniques in hashtables)
50
51dtypes = [('Float64', 'float64', 'float64_t'),
52          ('Float32', 'float32', 'float32_t'),
53          ('Int64', 'int64', 'int64_t'),
54          ('Int32', 'int32', 'int32_t'),
55          ('Int16', 'int16', 'int16_t'),
56          ('Int8', 'int8', 'int8_t'),
57          ('String', 'string', 'char *'),
58          ('UInt64', 'uint64', 'uint64_t'),
59          ('UInt32', 'uint32', 'uint32_t'),
60          ('UInt16', 'uint16', 'uint16_t'),
61          ('UInt8', 'uint8', 'uint8_t')]
62}}
63
64{{for name, dtype, c_type in dtypes}}
65
66
67{{if dtype != 'int64'}}
68
69ctypedef struct {{name}}VectorData:
70    {{c_type}} *data
71    Py_ssize_t n, m
72
73{{endif}}
74
75
76@cython.wraparound(False)
77@cython.boundscheck(False)
78cdef inline void append_data_{{dtype}}({{name}}VectorData *data,
79                                       {{c_type}} x) nogil:
80
81    data.data[data.n] = x
82    data.n += 1
83
84{{endfor}}
85
86ctypedef fused vector_data:
87    Int64VectorData
88    Int32VectorData
89    Int16VectorData
90    Int8VectorData
91    UInt64VectorData
92    UInt32VectorData
93    UInt16VectorData
94    UInt8VectorData
95    Float64VectorData
96    Float32VectorData
97    StringVectorData
98
99cdef inline bint needs_resize(vector_data *data) nogil:
100    return data.n == data.m
101
102# ----------------------------------------------------------------------
103# Vector
104# ----------------------------------------------------------------------
105
106{{py:
107
108# name, dtype, c_type
109dtypes = [('Float64', 'float64', 'float64_t'),
110          ('UInt64', 'uint64', 'uint64_t'),
111          ('Int64', 'int64', 'int64_t'),
112          ('Float32', 'float32', 'float32_t'),
113          ('UInt32', 'uint32', 'uint32_t'),
114          ('Int32', 'int32', 'int32_t'),
115          ('UInt16', 'uint16', 'uint16_t'),
116          ('Int16', 'int16', 'int16_t'),
117          ('UInt8', 'uint8', 'uint8_t'),
118          ('Int8', 'int8', 'int8_t')]
119
120}}
121
122{{for name, dtype, c_type in dtypes}}
123
124cdef class {{name}}Vector:
125
126    {{if dtype != 'int64'}}
127    cdef:
128        bint external_view_exists
129        {{name}}VectorData *data
130        ndarray ao
131    {{endif}}
132
133    def __cinit__(self):
134        self.data = <{{name}}VectorData *>PyMem_Malloc(
135            sizeof({{name}}VectorData))
136        if not self.data:
137            raise MemoryError()
138        self.external_view_exists = False
139        self.data.n = 0
140        self.data.m = _INIT_VEC_CAP
141        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
142        self.data.data = <{{c_type}}*>self.ao.data
143
144    cdef resize(self):
145        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
146        self.ao.resize(self.data.m, refcheck=False)
147        self.data.data = <{{c_type}}*>self.ao.data
148
149    def __dealloc__(self):
150        if self.data is not NULL:
151            PyMem_Free(self.data)
152            self.data = NULL
153
154    def __len__(self) -> int:
155        return self.data.n
156
157    cpdef to_array(self):
158        if self.data.m != self.data.n:
159            if self.external_view_exists:
160                # should never happen
161                raise ValueError("should have raised on append()")
162            self.ao.resize(self.data.n, refcheck=False)
163            self.data.m = self.data.n
164        self.external_view_exists = True
165        return self.ao
166
167    cdef inline void append(self, {{c_type}} x):
168
169        if needs_resize(self.data):
170            if self.external_view_exists:
171                raise ValueError("external reference but "
172                                 "Vector.resize() needed")
173            self.resize()
174
175        append_data_{{dtype}}(self.data, x)
176
177    cdef extend(self, const {{c_type}}[:] x):
178        for i in range(len(x)):
179            self.append(x[i])
180
181{{endfor}}
182
183cdef class StringVector:
184
185    cdef:
186        StringVectorData *data
187        bint external_view_exists
188
189    def __cinit__(self):
190        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
191        if not self.data:
192            raise MemoryError()
193        self.external_view_exists = False
194        self.data.n = 0
195        self.data.m = _INIT_VEC_CAP
196        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
197        if not self.data.data:
198            raise MemoryError()
199
200    cdef resize(self):
201        cdef:
202            char **orig_data
203            Py_ssize_t i, m
204
205        m = self.data.m
206        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
207
208        orig_data = self.data.data
209        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
210        if not self.data.data:
211            raise MemoryError()
212        for i in range(m):
213            self.data.data[i] = orig_data[i]
214
215    def __dealloc__(self):
216        if self.data is not NULL:
217            if self.data.data is not NULL:
218                free(self.data.data)
219            PyMem_Free(self.data)
220            self.data = NULL
221
222    def __len__(self) -> int:
223        return self.data.n
224
225    def to_array(self):
226        cdef:
227            ndarray ao
228            Py_ssize_t n
229            object val
230
231        ao = np.empty(self.data.n, dtype=object)
232        for i in range(self.data.n):
233            val = self.data.data[i]
234            ao[i] = val
235        self.external_view_exists = True
236        self.data.m = self.data.n
237        return ao
238
239    cdef inline void append(self, char *x):
240
241        if needs_resize(self.data):
242            self.resize()
243
244        append_data_string(self.data, x)
245
246    cdef extend(self, ndarray[object] x):
247        for i in range(len(x)):
248            self.append(x[i])
249
250
251cdef class ObjectVector:
252
253    cdef:
254        PyObject **data
255        Py_ssize_t n, m
256        ndarray ao
257        bint external_view_exists
258
259    def __cinit__(self):
260        self.external_view_exists = False
261        self.n = 0
262        self.m = _INIT_VEC_CAP
263        self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
264        self.data = <PyObject**>self.ao.data
265
266    def __len__(self) -> int:
267        return self.n
268
269    cdef inline append(self, object obj):
270        if self.n == self.m:
271            if self.external_view_exists:
272                raise ValueError("external reference but "
273                                 "Vector.resize() needed")
274            self.m = max(self.m * 2, _INIT_VEC_CAP)
275            self.ao.resize(self.m, refcheck=False)
276            self.data = <PyObject**>self.ao.data
277
278        Py_INCREF(obj)
279        self.data[self.n] = <PyObject*>obj
280        self.n += 1
281
282    def to_array(self):
283        if self.m != self.n:
284            if self.external_view_exists:
285                raise ValueError("should have raised on append()")
286            self.ao.resize(self.n, refcheck=False)
287            self.m = self.n
288        self.external_view_exists = True
289        return self.ao
290
291    cdef extend(self, ndarray[object] x):
292        for i in range(len(x)):
293            self.append(x[i])
294
295# ----------------------------------------------------------------------
296# HashTable
297# ----------------------------------------------------------------------
298
299
300cdef class HashTable:
301
302    pass
303
304{{py:
305
306# name, dtype, float_group
307dtypes = [('Float64', 'float64', True),
308          ('UInt64', 'uint64', False),
309          ('Int64', 'int64', False),
310          ('Float32', 'float32', True),
311          ('UInt32', 'uint32', False),
312          ('Int32', 'int32', False),
313          ('UInt16', 'uint16', False),
314          ('Int16', 'int16', False),
315          ('UInt8', 'uint8', False),
316          ('Int8', 'int8', False)]
317
318}}
319
320
321{{for name, dtype, float_group in dtypes}}
322
323cdef class {{name}}HashTable(HashTable):
324
325    def __cinit__(self, int64_t size_hint=1):
326        self.table = kh_init_{{dtype}}()
327        if size_hint is not None:
328            size_hint = min(size_hint, SIZE_HINT_LIMIT)
329            kh_resize_{{dtype}}(self.table, size_hint)
330
331    def __len__(self) -> int:
332        return self.table.size
333
334    def __dealloc__(self):
335        if self.table is not NULL:
336            kh_destroy_{{dtype}}(self.table)
337            self.table = NULL
338
339    def __contains__(self, object key):
340        cdef:
341            khiter_t k
342        k = kh_get_{{dtype}}(self.table, key)
343        return k != self.table.n_buckets
344
345    def sizeof(self, deep=False):
346        """ return the size of my table in bytes """
347        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
348        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
349        for_pairs =  self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
350                                             sizeof(Py_ssize_t))   # vals
351        return overhead + for_flags + for_pairs
352
353    cpdef get_item(self, {{dtype}}_t val):
354        cdef:
355            khiter_t k
356        k = kh_get_{{dtype}}(self.table, val)
357        if k != self.table.n_buckets:
358            return self.table.vals[k]
359        else:
360            raise KeyError(val)
361
362    cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
363        cdef:
364            khiter_t k
365            int ret = 0
366
367        k = kh_put_{{dtype}}(self.table, key, &ret)
368        self.table.keys[k] = key
369        if kh_exist_{{dtype}}(self.table, k):
370            self.table.vals[k] = val
371        else:
372            raise KeyError(key)
373
374    @cython.boundscheck(False)
375    def map(self, const {{dtype}}_t[:] keys, const int64_t[:] values):
376        cdef:
377            Py_ssize_t i, n = len(values)
378            int ret = 0
379            {{dtype}}_t key
380            khiter_t k
381
382        with nogil:
383            for i in range(n):
384                key = keys[i]
385                k = kh_put_{{dtype}}(self.table, key, &ret)
386                self.table.vals[k] = <Py_ssize_t>values[i]
387
388    @cython.boundscheck(False)
389    def map_locations(self, const {{dtype}}_t[:] values):
390        cdef:
391            Py_ssize_t i, n = len(values)
392            int ret = 0
393            {{dtype}}_t val
394            khiter_t k
395
396        with nogil:
397            for i in range(n):
398                val = values[i]
399                k = kh_put_{{dtype}}(self.table, val, &ret)
400                self.table.vals[k] = i
401
402    @cython.boundscheck(False)
403    def lookup(self, const {{dtype}}_t[:] values):
404        cdef:
405            Py_ssize_t i, n = len(values)
406            int ret = 0
407            {{dtype}}_t val
408            khiter_t k
409            intp_t[:] locs = np.empty(n, dtype=np.intp)
410
411        with nogil:
412            for i in range(n):
413                val = values[i]
414                k = kh_get_{{dtype}}(self.table, val)
415                if k != self.table.n_buckets:
416                    locs[i] = self.table.vals[k]
417                else:
418                    locs[i] = -1
419
420        return np.asarray(locs)
421
422    @cython.boundscheck(False)
423    @cython.wraparound(False)
424    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
425                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
426                object na_value=None, bint ignore_na=False,
427                object mask=None, bint return_inverse=False):
428        """
429        Calculate unique values and labels (no sorting!)
430
431        Parameters
432        ----------
433        values : ndarray[{{dtype}}]
434            Array of values of which unique will be calculated
435        uniques : {{name}}Vector
436            Vector into which uniques will be written
437        count_prior : Py_ssize_t, default 0
438            Number of existing entries in uniques
439        na_sentinel : Py_ssize_t, default -1
440            Sentinel value used for all NA-values in inverse
441        na_value : object, default None
442            Value to identify as missing. If na_value is None, then
443            any value "val" satisfying val != val is considered missing.
444            If na_value is not None, then _additionally_, any value "val"
445            satisfying val == na_value is considered missing.
446        ignore_na : boolean, default False
447            Whether NA-values should be ignored for calculating the uniques. If
448            True, the labels corresponding to missing values will be set to
449            na_sentinel.
450        mask : ndarray[bool], optional
451            If not None, the mask is used as indicator for missing values
452            (True = missing, False = valid) instead of `na_value` or
453            condition "val != val".
454        return_inverse : boolean, default False
455            Whether the mapping of the original array values to their location
456            in the vector of uniques should be returned.
457
458        Returns
459        -------
460        uniques : ndarray[{{dtype}}]
461            Unique values of input, not sorted
462        labels : ndarray[int64] (if return_inverse=True)
463            The labels from values to uniques
464        """
465        cdef:
466            Py_ssize_t i, idx, count = count_prior, n = len(values)
467            int64_t[:] labels
468            int ret = 0
469            {{dtype}}_t val, na_value2
470            khiter_t k
471            {{name}}VectorData *ud
472            bint use_na_value, use_mask
473            uint8_t[:] mask_values
474
475        if return_inverse:
476            labels = np.empty(n, dtype=np.int64)
477        ud = uniques.data
478        use_na_value = na_value is not None
479        use_mask = mask is not None
480
481        if use_mask:
482            mask_values = mask.view("uint8")
483
484        if use_na_value:
485            # We need this na_value2 because we want to allow users
486            # to *optionally* specify an NA sentinel *of the correct* type.
487            # We use None, to make it optional, which requires `object` type
488            # for the parameter. To please the compiler, we use na_value2,
489            # which is only used if it's *specified*.
490            na_value2 = <{{dtype}}_t>na_value
491        else:
492            na_value2 = 0
493
494        with nogil:
495            for i in range(n):
496                val = values[i]
497
498                if ignore_na and use_mask:
499                    if mask_values[i]:
500                        labels[i] = na_sentinel
501                        continue
502                elif ignore_na and (
503                {{if not name.lower().startswith(("uint", "int"))}}
504                val != val or
505                {{endif}}
506                (use_na_value and val == na_value2)
507                ):
508                    # if missing values do not count as unique values (i.e. if
509                    # ignore_na is True), skip the hashtable entry for them,
510                    # and replace the corresponding label with na_sentinel
511                    labels[i] = na_sentinel
512                    continue
513
514                k = kh_get_{{dtype}}(self.table, val)
515
516                if k == self.table.n_buckets:
517                    # k hasn't been seen yet
518                    k = kh_put_{{dtype}}(self.table, val, &ret)
519
520                    if needs_resize(ud):
521                        with gil:
522                            if uniques.external_view_exists:
523                                raise ValueError("external reference to "
524                                                 "uniques held, but "
525                                                 "Vector.resize() needed")
526                            uniques.resize()
527                    append_data_{{dtype}}(ud, val)
528                    if return_inverse:
529                        self.table.vals[k] = count
530                        labels[i] = count
531                        count += 1
532                elif return_inverse:
533                    # k falls into a previous bucket
534                    # only relevant in case we need to construct the inverse
535                    idx = self.table.vals[k]
536                    labels[i] = idx
537
538        if return_inverse:
539            return uniques.to_array(), np.asarray(labels)
540        return uniques.to_array()
541
542    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False):
543        """
544        Calculate unique values and labels (no sorting!)
545
546        Parameters
547        ----------
548        values : ndarray[{{dtype}}]
549            Array of values of which unique will be calculated
550        return_inverse : boolean, default False
551            Whether the mapping of the original array values to their location
552            in the vector of uniques should be returned.
553
554        Returns
555        -------
556        uniques : ndarray[{{dtype}}]
557            Unique values of input, not sorted
558        labels : ndarray[int64] (if return_inverse)
559            The labels from values to uniques
560        """
561        uniques = {{name}}Vector()
562        return self._unique(values, uniques, ignore_na=False,
563                            return_inverse=return_inverse)
564
565    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
566                  object na_value=None, object mask=None):
567        """
568        Calculate unique values and labels (no sorting!)
569
570        Missing values are not included in the "uniques" for this method.
571        The labels for any missing values will be set to "na_sentinel"
572
573        Parameters
574        ----------
575        values : ndarray[{{dtype}}]
576            Array of values of which unique will be calculated
577        na_sentinel : Py_ssize_t, default -1
578            Sentinel value used for all NA-values in inverse
579        na_value : object, default None
580            Value to identify as missing. If na_value is None, then
581            any value "val" satisfying val != val is considered missing.
582            If na_value is not None, then _additionally_, any value "val"
583            satisfying val == na_value is considered missing.
584        mask : ndarray[bool], optional
585            If not None, the mask is used as indicator for missing values
586            (True = missing, False = valid) instead of `na_value` or
587            condition "val != val".
588
589        Returns
590        -------
591        uniques : ndarray[{{dtype}}]
592            Unique values of input, not sorted
593        labels : ndarray[int64]
594            The labels from values to uniques
595        """
596        uniques_vector = {{name}}Vector()
597        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
598                            na_value=na_value, ignore_na=True, mask=mask,
599                            return_inverse=True)
600
601    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
602                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
603                   object na_value=None):
604        _, labels = self._unique(values, uniques, count_prior=count_prior,
605                                 na_sentinel=na_sentinel, na_value=na_value,
606                                 ignore_na=True, return_inverse=True)
607        return labels
608
609    @cython.boundscheck(False)
610    def get_labels_groupby(self, const {{dtype}}_t[:] values):
611        cdef:
612            Py_ssize_t i, n = len(values)
613            intp_t[:] labels
614            Py_ssize_t idx, count = 0
615            int ret = 0
616            {{dtype}}_t val
617            khiter_t k
618            {{name}}Vector uniques = {{name}}Vector()
619            {{name}}VectorData *ud
620
621        labels = np.empty(n, dtype=np.intp)
622        ud = uniques.data
623
624        with nogil:
625            for i in range(n):
626                val = values[i]
627
628                # specific for groupby
629                {{if dtype != 'uint64'}}
630                if val < 0:
631                    labels[i] = -1
632                    continue
633                {{endif}}
634
635                k = kh_get_{{dtype}}(self.table, val)
636                if k != self.table.n_buckets:
637                    idx = self.table.vals[k]
638                    labels[i] = idx
639                else:
640                    k = kh_put_{{dtype}}(self.table, val, &ret)
641                    self.table.vals[k] = count
642
643                    if needs_resize(ud):
644                        with gil:
645                            uniques.resize()
646                    append_data_{{dtype}}(ud, val)
647                    labels[i] = count
648                    count += 1
649
650        arr_uniques = uniques.to_array()
651
652        return np.asarray(labels), arr_uniques
653
654{{endfor}}
655
656
657cdef class StringHashTable(HashTable):
658    # these by-definition *must* be strings
659    # or a sentinel np.nan / None missing value
660    na_string_sentinel = '__nan__'
661
662    def __init__(self, int64_t size_hint=1):
663        self.table = kh_init_str()
664        if size_hint is not None:
665            size_hint = min(size_hint, SIZE_HINT_LIMIT)
666            kh_resize_str(self.table, size_hint)
667
668    def __dealloc__(self):
669        if self.table is not NULL:
670            kh_destroy_str(self.table)
671            self.table = NULL
672
673    def sizeof(self, deep=False):
674        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
675        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
676        for_pairs =  self.table.n_buckets * (sizeof(char *) +      # keys
677                                             sizeof(Py_ssize_t))   # vals
678        return overhead + for_flags + for_pairs
679
680    cpdef get_item(self, str val):
681        cdef:
682            khiter_t k
683            const char *v
684        v = get_c_string(val)
685
686        k = kh_get_str(self.table, v)
687        if k != self.table.n_buckets:
688            return self.table.vals[k]
689        else:
690            raise KeyError(val)
691
692    cpdef set_item(self, str key, Py_ssize_t val):
693        cdef:
694            khiter_t k
695            int ret = 0
696            const char *v
697
698        v = get_c_string(key)
699
700        k = kh_put_str(self.table, v, &ret)
701        self.table.keys[k] = v
702        if kh_exist_str(self.table, k):
703            self.table.vals[k] = val
704        else:
705            raise KeyError(key)
706
707    @cython.boundscheck(False)
708    def get_indexer(self, ndarray[object] values):
709        cdef:
710            Py_ssize_t i, n = len(values)
711            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
712            intp_t *resbuf = <intp_t*>labels.data
713            khiter_t k
714            kh_str_t *table = self.table
715            const char *v
716            const char **vecs
717
718        vecs = <const char **>malloc(n * sizeof(char *))
719        for i in range(n):
720            val = values[i]
721            v = get_c_string(val)
722            vecs[i] = v
723
724        with nogil:
725            for i in range(n):
726                k = kh_get_str(table, vecs[i])
727                if k != table.n_buckets:
728                    resbuf[i] = table.vals[k]
729                else:
730                    resbuf[i] = -1
731
732        free(vecs)
733        return labels
734
735    @cython.boundscheck(False)
736    def lookup(self, ndarray[object] values):
737        cdef:
738            Py_ssize_t i, n = len(values)
739            int ret = 0
740            object val
741            const char *v
742            khiter_t k
743            intp_t[:] locs = np.empty(n, dtype=np.intp)
744
745        # these by-definition *must* be strings
746        vecs = <const char **>malloc(n * sizeof(char *))
747        for i in range(n):
748            val = values[i]
749
750            if isinstance(val, str):
751                # GH#31499 if we have a np.str_ get_c_string won't recognize
752                #  it as a str, even though isinstance does.
753                v = get_c_string(<str>val)
754            else:
755                v = get_c_string(self.na_string_sentinel)
756            vecs[i] = v
757
758        with nogil:
759            for i in range(n):
760                v = vecs[i]
761                k = kh_get_str(self.table, v)
762                if k != self.table.n_buckets:
763                    locs[i] = self.table.vals[k]
764                else:
765                    locs[i] = -1
766
767        free(vecs)
768        return np.asarray(locs)
769
770    @cython.boundscheck(False)
771    def map_locations(self, ndarray[object] values):
772        cdef:
773            Py_ssize_t i, n = len(values)
774            int ret = 0
775            object val
776            const char *v
777            const char **vecs
778            khiter_t k
779
780        # these by-definition *must* be strings
781        vecs = <const char **>malloc(n * sizeof(char *))
782        for i in range(n):
783            val = values[i]
784
785            if isinstance(val, str):
786                # GH#31499 if we have a np.str_ get_c_string won't recognize
787                #  it as a str, even though isinstance does.
788                v = get_c_string(<str>val)
789            else:
790                v = get_c_string(self.na_string_sentinel)
791            vecs[i] = v
792
793        with nogil:
794            for i in range(n):
795                v = vecs[i]
796                k = kh_put_str(self.table, v, &ret)
797                self.table.vals[k] = i
798        free(vecs)
799
800    @cython.boundscheck(False)
801    @cython.wraparound(False)
802    def _unique(self, ndarray[object] values, ObjectVector uniques,
803                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
804                object na_value=None, bint ignore_na=False,
805                bint return_inverse=False):
806        """
807        Calculate unique values and labels (no sorting!)
808
809        Parameters
810        ----------
811        values : ndarray[object]
812            Array of values of which unique will be calculated
813        uniques : ObjectVector
814            Vector into which uniques will be written
815        count_prior : Py_ssize_t, default 0
816            Number of existing entries in uniques
817        na_sentinel : Py_ssize_t, default -1
818            Sentinel value used for all NA-values in inverse
819        na_value : object, default None
820            Value to identify as missing. If na_value is None, then any value
821            that is not a string is considered missing. If na_value is
822            not None, then _additionally_ any value "val" satisfying
823            val == na_value is considered missing.
824        ignore_na : boolean, default False
825            Whether NA-values should be ignored for calculating the uniques. If
826            True, the labels corresponding to missing values will be set to
827            na_sentinel.
828        return_inverse : boolean, default False
829            Whether the mapping of the original array values to their location
830            in the vector of uniques should be returned.
831
832        Returns
833        -------
834        uniques : ndarray[object]
835            Unique values of input, not sorted
836        labels : ndarray[int64] (if return_inverse=True)
837            The labels from values to uniques
838        """
839        cdef:
840            Py_ssize_t i, idx, count = count_prior, n = len(values)
841            int64_t[:] labels
842            int64_t[:] uindexer
843            int ret = 0
844            object val
845            const char *v
846            const char **vecs
847            khiter_t k
848            bint use_na_value
849
850        if return_inverse:
851            labels = np.zeros(n, dtype=np.int64)
852        uindexer = np.empty(n, dtype=np.int64)
853        use_na_value = na_value is not None
854
855        # assign pointers and pre-filter out missing (if ignore_na)
856        vecs = <const char **>malloc(n * sizeof(char *))
857        for i in range(n):
858            val = values[i]
859
860            if (ignore_na
861                and (not isinstance(val, str)
862                     or (use_na_value and val == na_value))):
863                # if missing values do not count as unique values (i.e. if
864                # ignore_na is True), we can skip the actual value, and
865                # replace the label with na_sentinel directly
866                labels[i] = na_sentinel
867            else:
868                # if ignore_na is False, we also stringify NaN/None/etc.
869                try:
870                    v = get_c_string(<str>val)
871                except UnicodeEncodeError:
872                    v = get_c_string(<str>repr(val))
873                vecs[i] = v
874
875        # compute
876        with nogil:
877            for i in range(n):
878                if ignore_na and labels[i] == na_sentinel:
879                    # skip entries for ignored missing values (see above)
880                    continue
881
882                v = vecs[i]
883                k = kh_get_str(self.table, v)
884                if k == self.table.n_buckets:
885                    # k hasn't been seen yet
886                    k = kh_put_str(self.table, v, &ret)
887                    uindexer[count] = i
888                    if return_inverse:
889                        self.table.vals[k] = count
890                        labels[i] = <int64_t>count
891                    count += 1
892                elif return_inverse:
893                    # k falls into a previous bucket
894                    # only relevant in case we need to construct the inverse
895                    idx = self.table.vals[k]
896                    labels[i] = <int64_t>idx
897
898        free(vecs)
899
900        # uniques
901        for i in range(count):
902            uniques.append(values[uindexer[i]])
903
904        if return_inverse:
905            return uniques.to_array(), np.asarray(labels)
906        return uniques.to_array()
907
908    def unique(self, ndarray[object] values, bint return_inverse=False):
909        """
910        Calculate unique values and labels (no sorting!)
911
912        Parameters
913        ----------
914        values : ndarray[object]
915            Array of values of which unique will be calculated
916        return_inverse : boolean, default False
917            Whether the mapping of the original array values to their location
918            in the vector of uniques should be returned.
919
920        Returns
921        -------
922        uniques : ndarray[object]
923            Unique values of input, not sorted
924        labels : ndarray[int64] (if return_inverse)
925            The labels from values to uniques
926        """
927        uniques = ObjectVector()
928        return self._unique(values, uniques, ignore_na=False,
929                            return_inverse=return_inverse)
930
931    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
932                  object na_value=None, object mask=None):
933        """
934        Calculate unique values and labels (no sorting!)
935
936        Missing values are not included in the "uniques" for this method.
937        The labels for any missing values will be set to "na_sentinel"
938
939        Parameters
940        ----------
941        values : ndarray[object]
942            Array of values of which unique will be calculated
943        na_sentinel : Py_ssize_t, default -1
944            Sentinel value used for all NA-values in inverse
945        na_value : object, default None
946            Value to identify as missing. If na_value is None, then any value
947            that is not a string is considered missing. If na_value is
948            not None, then _additionally_ any value "val" satisfying
949            val == na_value is considered missing.
950        mask : ndarray[bool], optional
951            Not yet implementd for StringHashTable.
952
953        Returns
954        -------
955        uniques : ndarray[object]
956            Unique values of input, not sorted
957        labels : ndarray[int64]
958            The labels from values to uniques
959        """
960        uniques_vector = ObjectVector()
961        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
962                            na_value=na_value, ignore_na=True,
963                            return_inverse=True)
964
965    def get_labels(self, ndarray[object] values, ObjectVector uniques,
966                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
967                   object na_value=None):
968        _, labels = self._unique(values, uniques, count_prior=count_prior,
969                                 na_sentinel=na_sentinel, na_value=na_value,
970                                 ignore_na=True, return_inverse=True)
971        return labels
972
973
974cdef class PyObjectHashTable(HashTable):
975
976    def __init__(self, int64_t size_hint=1):
977        self.table = kh_init_pymap()
978        if size_hint is not None:
979            size_hint = min(size_hint, SIZE_HINT_LIMIT)
980            kh_resize_pymap(self.table, size_hint)
981
982    def __dealloc__(self):
983        if self.table is not NULL:
984            kh_destroy_pymap(self.table)
985            self.table = NULL
986
987    def __len__(self) -> int:
988        return self.table.size
989
990    def __contains__(self, object key):
991        cdef:
992            khiter_t k
993        hash(key)
994
995        k = kh_get_pymap(self.table, <PyObject*>key)
996        return k != self.table.n_buckets
997
998    def sizeof(self, deep=False):
999        """ return the size of my table in bytes """
1000        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
1001        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
1002        for_pairs =  self.table.n_buckets * (sizeof(PyObject *) +  # keys
1003                                             sizeof(Py_ssize_t))   # vals
1004        return overhead + for_flags + for_pairs
1005
1006    cpdef get_item(self, object val):
1007        cdef:
1008            khiter_t k
1009
1010        k = kh_get_pymap(self.table, <PyObject*>val)
1011        if k != self.table.n_buckets:
1012            return self.table.vals[k]
1013        else:
1014            raise KeyError(val)
1015
1016    cpdef set_item(self, object key, Py_ssize_t val):
1017        cdef:
1018            khiter_t k
1019            int ret = 0
1020            char* buf
1021
1022        hash(key)
1023
1024        k = kh_put_pymap(self.table, <PyObject*>key, &ret)
1025        # self.table.keys[k] = key
1026        if kh_exist_pymap(self.table, k):
1027            self.table.vals[k] = val
1028        else:
1029            raise KeyError(key)
1030
1031    def map_locations(self, ndarray[object] values):
1032        cdef:
1033            Py_ssize_t i, n = len(values)
1034            int ret = 0
1035            object val
1036            khiter_t k
1037
1038        for i in range(n):
1039            val = values[i]
1040            hash(val)
1041
1042            k = kh_put_pymap(self.table, <PyObject*>val, &ret)
1043            self.table.vals[k] = i
1044
1045    def lookup(self, ndarray[object] values):
1046        cdef:
1047            Py_ssize_t i, n = len(values)
1048            int ret = 0
1049            object val
1050            khiter_t k
1051            intp_t[:] locs = np.empty(n, dtype=np.intp)
1052
1053        for i in range(n):
1054            val = values[i]
1055            hash(val)
1056
1057            k = kh_get_pymap(self.table, <PyObject*>val)
1058            if k != self.table.n_buckets:
1059                locs[i] = self.table.vals[k]
1060            else:
1061                locs[i] = -1
1062
1063        return np.asarray(locs)
1064
1065    @cython.boundscheck(False)
1066    @cython.wraparound(False)
1067    def _unique(self, ndarray[object] values, ObjectVector uniques,
1068                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1069                object na_value=None, bint ignore_na=False,
1070                bint return_inverse=False):
1071        """
1072        Calculate unique values and labels (no sorting!)
1073
1074        Parameters
1075        ----------
1076        values : ndarray[object]
1077            Array of values of which unique will be calculated
1078        uniques : ObjectVector
1079            Vector into which uniques will be written
1080        count_prior : Py_ssize_t, default 0
1081            Number of existing entries in uniques
1082        na_sentinel : Py_ssize_t, default -1
1083            Sentinel value used for all NA-values in inverse
1084        na_value : object, default None
1085            Value to identify as missing. If na_value is None, then None _plus_
1086            any value "val" satisfying val != val is considered missing.
1087            If na_value is not None, then _additionally_, any value "val"
1088            satisfying val == na_value is considered missing.
1089        ignore_na : boolean, default False
1090            Whether NA-values should be ignored for calculating the uniques. If
1091            True, the labels corresponding to missing values will be set to
1092            na_sentinel.
1093        return_inverse : boolean, default False
1094            Whether the mapping of the original array values to their location
1095            in the vector of uniques should be returned.
1096
1097        Returns
1098        -------
1099        uniques : ndarray[object]
1100            Unique values of input, not sorted
1101        labels : ndarray[int64] (if return_inverse=True)
1102            The labels from values to uniques
1103        """
1104        cdef:
1105            Py_ssize_t i, idx, count = count_prior, n = len(values)
1106            int64_t[:] labels
1107            int ret = 0
1108            object val
1109            khiter_t k
1110            bint use_na_value
1111
1112        if return_inverse:
1113            labels = np.empty(n, dtype=np.int64)
1114        use_na_value = na_value is not None
1115
1116        for i in range(n):
1117            val = values[i]
1118            hash(val)
1119
1120            if ignore_na and (
1121                (val is C_NA)
1122                or (val != val)
1123                or (val is None)
1124                or (use_na_value and val == na_value)
1125            ):
1126                # if missing values do not count as unique values (i.e. if
1127                # ignore_na is True), skip the hashtable entry for them, and
1128                # replace the corresponding label with na_sentinel
1129                labels[i] = na_sentinel
1130                continue
1131
1132            k = kh_get_pymap(self.table, <PyObject*>val)
1133            if k == self.table.n_buckets:
1134                # k hasn't been seen yet
1135                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
1136                uniques.append(val)
1137                if return_inverse:
1138                    self.table.vals[k] = count
1139                    labels[i] = count
1140                    count += 1
1141            elif return_inverse:
1142                # k falls into a previous bucket
1143                # only relevant in case we need to construct the inverse
1144                idx = self.table.vals[k]
1145                labels[i] = idx
1146
1147        if return_inverse:
1148            return uniques.to_array(), np.asarray(labels)
1149        return uniques.to_array()
1150
1151    def unique(self, ndarray[object] values, bint return_inverse=False):
1152        """
1153        Calculate unique values and labels (no sorting!)
1154
1155        Parameters
1156        ----------
1157        values : ndarray[object]
1158            Array of values of which unique will be calculated
1159        return_inverse : boolean, default False
1160            Whether the mapping of the original array values to their location
1161            in the vector of uniques should be returned.
1162
1163        Returns
1164        -------
1165        uniques : ndarray[object]
1166            Unique values of input, not sorted
1167        labels : ndarray[int64] (if return_inverse)
1168            The labels from values to uniques
1169        """
1170        uniques = ObjectVector()
1171        return self._unique(values, uniques, ignore_na=False,
1172                            return_inverse=return_inverse)
1173
1174    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1175                  object na_value=None, object mask=None):
1176        """
1177        Calculate unique values and labels (no sorting!)
1178
1179        Missing values are not included in the "uniques" for this method.
1180        The labels for any missing values will be set to "na_sentinel"
1181
1182        Parameters
1183        ----------
1184        values : ndarray[object]
1185            Array of values of which unique will be calculated
1186        na_sentinel : Py_ssize_t, default -1
1187            Sentinel value used for all NA-values in inverse
1188        na_value : object, default None
1189            Value to identify as missing. If na_value is None, then None _plus_
1190            any value "val" satisfying val != val is considered missing.
1191            If na_value is not None, then _additionally_, any value "val"
1192            satisfying val == na_value is considered missing.
1193        mask : ndarray[bool], optional
1194            Not yet implemented for PyObjectHashTable.
1195
1196        Returns
1197        -------
1198        uniques : ndarray[object]
1199            Unique values of input, not sorted
1200        labels : ndarray[int64]
1201            The labels from values to uniques
1202        """
1203        uniques_vector = ObjectVector()
1204        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1205                            na_value=na_value, ignore_na=True,
1206                            return_inverse=True)
1207
1208    def get_labels(self, ndarray[object] values, ObjectVector uniques,
1209                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
1210                   object na_value=None):
1211        _, labels = self._unique(values, uniques, count_prior=count_prior,
1212                                 na_sentinel=na_sentinel, na_value=na_value,
1213                                 ignore_na=True, return_inverse=True)
1214        return labels
1215