1"""
2Generic data algorithms. This module is experimental at the moment and not
3intended for public consumption
4"""
5from __future__ import annotations
6
7import operator
8from textwrap import dedent
9from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
10from warnings import catch_warnings, simplefilter, warn
11
12import numpy as np
13
14from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib
15from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion
16from pandas.util._decorators import doc
17
18from pandas.core.dtypes.cast import (
19    construct_1d_object_array_from_listlike,
20    infer_dtype_from_array,
21    maybe_promote,
22)
23from pandas.core.dtypes.common import (
24    ensure_float64,
25    ensure_int64,
26    ensure_object,
27    ensure_platform_int,
28    ensure_uint64,
29    is_array_like,
30    is_bool_dtype,
31    is_categorical_dtype,
32    is_complex_dtype,
33    is_datetime64_dtype,
34    is_datetime64_ns_dtype,
35    is_extension_array_dtype,
36    is_float_dtype,
37    is_integer,
38    is_integer_dtype,
39    is_list_like,
40    is_numeric_dtype,
41    is_object_dtype,
42    is_period_dtype,
43    is_scalar,
44    is_signed_integer_dtype,
45    is_timedelta64_dtype,
46    is_unsigned_integer_dtype,
47    needs_i8_conversion,
48    pandas_dtype,
49)
50from pandas.core.dtypes.generic import (
51    ABCDatetimeArray,
52    ABCExtensionArray,
53    ABCIndexClass,
54    ABCMultiIndex,
55    ABCRangeIndex,
56    ABCSeries,
57    ABCTimedeltaArray,
58)
59from pandas.core.dtypes.missing import isna, na_value_for_dtype
60
61from pandas.core.construction import array, extract_array
62from pandas.core.indexers import validate_indices
63
64if TYPE_CHECKING:
65    from pandas import Categorical, DataFrame, Index, Series
66
67_shared_docs: Dict[str, str] = {}
68
69
70# --------------- #
71# dtype access    #
72# --------------- #
73def _ensure_data(
74    values: ArrayLike, dtype: Optional[DtypeObj] = None
75) -> Tuple[np.ndarray, DtypeObj]:
76    """
77    routine to ensure that our data is of the correct
78    input dtype for lower-level routines
79
80    This will coerce:
81    - ints -> int64
82    - uint -> uint64
83    - bool -> uint64 (TODO this should be uint8)
84    - datetimelike -> i8
85    - datetime64tz -> i8 (in local tz)
86    - categorical -> codes
87
88    Parameters
89    ----------
90    values : array-like
91    dtype : pandas_dtype, optional
92        coerce to this dtype
93
94    Returns
95    -------
96    values : ndarray
97    pandas_dtype : np.dtype or ExtensionDtype
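
    Examples
    --------
    A minimal sketch of the bool coercion (illustrative; reprs are
    indicative):

    >>> import numpy as np
    >>> _ensure_data(np.array([True, False]))
    (array([1, 0], dtype=uint64), dtype('bool'))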
98    """
99
100    if dtype is not None:
101        # We only have non-None dtype when called from `isin`, and
102        #  both Datetimelike and Categorical dispatch before getting here.
103        assert not needs_i8_conversion(dtype)
104        assert not is_categorical_dtype(dtype)
105
106    if not isinstance(values, ABCMultiIndex):
107        # extract_array would raise
108        values = extract_array(values, extract_numpy=True)
109
110    # we check some simple dtypes first
111    if is_object_dtype(dtype):
112        return ensure_object(np.asarray(values)), np.dtype("object")
113    elif is_object_dtype(values) and dtype is None:
114        return ensure_object(np.asarray(values)), np.dtype("object")
115
116    try:
117        if is_bool_dtype(values) or is_bool_dtype(dtype):
118            # we are actually coercing to uint64
119            # until our algos support uint8 directly (see TODO)
120            return np.asarray(values).astype("uint64"), np.dtype("bool")
121        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
122            return ensure_int64(values), np.dtype("int64")
123        elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
124            return ensure_uint64(values), np.dtype("uint64")
125        elif is_float_dtype(values) or is_float_dtype(dtype):
126            return ensure_float64(values), np.dtype("float64")
127        elif is_complex_dtype(values) or is_complex_dtype(dtype):
128
129            # ignore the fact that we are casting to float
130            # which discards complex parts
131            with catch_warnings():
132                simplefilter("ignore", np.ComplexWarning)
133                values = ensure_float64(values)
134            return values, np.dtype("float64")
135
136    except (TypeError, ValueError, OverflowError):
137        # if we are trying to coerce to a dtype
138        # and it is incompatible this will fall through to here
139        return ensure_object(values), np.dtype("object")
140
141    # datetimelike
142    if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype):
143        if is_period_dtype(values.dtype) or is_period_dtype(dtype):
144            from pandas import PeriodIndex
145
146            values = PeriodIndex(values)._data
147            dtype = values.dtype
148        elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype):
149            from pandas import TimedeltaIndex
150
151            values = TimedeltaIndex(values)._data
152            dtype = values.dtype
153        else:
154            # Datetime
155            if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype):
156                # Avoid calling the DatetimeIndex constructor as it is 1D only
157                # Note: this is reached by DataFrame.rank calls GH#27027
158                # TODO(EA2D): special case not needed with 2D EAs
159                asi8 = values.view("i8")
160                dtype = values.dtype
161                return asi8, dtype
162
163            from pandas import DatetimeIndex
164
165            values = DatetimeIndex(values)._data
166            dtype = values.dtype
167
168        return values.asi8, dtype
169
170    elif is_categorical_dtype(values.dtype) and (
171        is_categorical_dtype(dtype) or dtype is None
172    ):
173        values = cast("Categorical", values)
174        values = values.codes
175        dtype = pandas_dtype("category")
176
177        # we are actually coercing to int64
178        # until our algos support int* directly (not all do)
179        values = ensure_int64(values)
180
181        return values, dtype
182
183    # we have failed, return object
184    values = np.asarray(values, dtype=object)
185    return ensure_object(values), np.dtype("object")
186
187
188def _reconstruct_data(
189    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
190) -> ArrayLike:
191    """
192    reverse of _ensure_data
193
194    Parameters
195    ----------
196    values : np.ndarray or ExtensionArray
197    dtype : np.ndtype or ExtensionDtype
198    original : AnyArrayLike
199
200    Returns
201    -------
202    ExtensionArray or np.ndarray
203    """
204    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
205        # Catch DatetimeArray/TimedeltaArray
206        return values
207
208    if is_extension_array_dtype(dtype):
209        cls = dtype.construct_array_type()
210        if isinstance(values, cls) and values.dtype == dtype:
211            return values
212
213        values = cls._from_sequence(values)
214    elif is_bool_dtype(dtype):
215        values = values.astype(dtype, copy=False)
216
217        # we only support object dtypes bool Index
218        if isinstance(original, ABCIndexClass):
219            values = values.astype(object, copy=False)
220    elif dtype is not None:
221        if is_datetime64_dtype(dtype):
222            dtype = "datetime64[ns]"
223        elif is_timedelta64_dtype(dtype):
224            dtype = "timedelta64[ns]"
225
226        values = values.astype(dtype, copy=False)
227
228    return values
229
230
231def _ensure_arraylike(values):
232    """
233    ensure that we are arraylike if not already
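
    Examples
    --------
    A minimal sketch (illustrative only):

    >>> _ensure_arraylike(["a", 1])
    array(['a', 1], dtype=object)
    >>> _ensure_arraylike([1, 2])
    array([1, 2])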
234    """
235    if not is_array_like(values):
236        inferred = lib.infer_dtype(values, skipna=False)
237        if inferred in ["mixed", "string", "mixed-integer"]:
238            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
239            if isinstance(values, tuple):
240                values = list(values)
241            values = construct_1d_object_array_from_listlike(values)
242        else:
243            values = np.asarray(values)
244    return values
245
246
247_hashtables = {
248    "float64": htable.Float64HashTable,
249    "uint64": htable.UInt64HashTable,
250    "int64": htable.Int64HashTable,
251    "string": htable.StringHashTable,
252    "object": htable.PyObjectHashTable,
253}
254
255
256def _get_hashtable_algo(values: np.ndarray):
257    """
258    Parameters
259    ----------
260    values : np.ndarray
261
262    Returns
263    -------
264    htable : HashTable subclass
265    values : ndarray
266    """
267    values, _ = _ensure_data(values)
268
269    ndtype = _check_object_for_strings(values)
270    htable = _hashtables[ndtype]
271    return htable, values
272
273
274def _get_values_for_rank(values: ArrayLike):
275    if is_categorical_dtype(values):
276        values = cast("Categorical", values)._values_for_rank()
277
278    values, _ = _ensure_data(values)
279    return values
280
281
282def get_data_algo(values: ArrayLike):
283    values = _get_values_for_rank(values)
284
285    ndtype = _check_object_for_strings(values)
286    htable = _hashtables.get(ndtype, _hashtables["object"])
287
288    return htable, values
289
290
291def _check_object_for_strings(values) -> str:
292    """
293    Check if we can use string hashtable instead of object hashtable.
294
295    Parameters
296    ----------
297    values : ndarray
298
299    Returns
300    -------
301    str
302    """
303    ndtype = values.dtype.name
304    if ndtype == "object":
305
306        # it's cheaper to use a String Hash Table than Object; we infer
307        # including nulls because that is the only difference between
308        # StringHashTable and ObjectHashtable
309        if lib.infer_dtype(values, skipna=False) in ["string"]:
310            ndtype = "string"
311    return ndtype
312
313
314# --------------- #
315# top-level algos #
316# --------------- #
317
318
319def unique(values):
320    """
321    Hash table-based unique. Uniques are returned in order
322    of appearance. This does NOT sort.
323
324    Significantly faster than numpy.unique. Includes NA values.
325
326    Parameters
327    ----------
328    values : 1d array-like
329
330    Returns
331    -------
332    numpy.ndarray or ExtensionArray
333
334        The return can be:
335
336        * Index : when the input is an Index
337        * Categorical : when the input is a Categorical dtype
338        * ndarray : when the input is a Series/ndarray
339
340        Return numpy.ndarray or ExtensionArray.
341
342    See Also
343    --------
344    Index.unique : Return unique values from an Index.
345    Series.unique : Return unique values of Series object.
346
347    Examples
348    --------
349    >>> pd.unique(pd.Series([2, 1, 3, 3]))
350    array([2, 1, 3])
351
352    >>> pd.unique(pd.Series([2] + [1] * 5))
353    array([2, 1])
354
355    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
356    ...                     pd.Timestamp('20160101')]))
357    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
358
359    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
360    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
361    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
362          dtype=object)
363
364    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
365    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
366    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
367    ...           dtype='datetime64[ns, US/Eastern]', freq=None)
368
369    >>> pd.unique(list('baabc'))
370    array(['b', 'a', 'c'], dtype=object)
371
372    An unordered Categorical will return categories in the
373    order of appearance.
374
375    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
376    [b, a, c]
377    Categories (3, object): [b, a, c]
378
379    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
380    ...                                    categories=list('abc'))))
381    [b, a, c]
382    Categories (3, object): [b, a, c]
383
384    An ordered Categorical preserves the category ordering.
385
386    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
387    ...                                    categories=list('abc'),
388    ...                                    ordered=True)))
389    [b, a, c]
390    Categories (3, object): [a < b < c]
391
392    An array of tuples
393
394    >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
395    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
396    """
397    values = _ensure_arraylike(values)
398
399    if is_extension_array_dtype(values):
400        # Dispatch to extension dtype's unique.
401        return values.unique()
402
403    original = values
404    htable, values = _get_hashtable_algo(values)
405
406    table = htable(len(values))
407    uniques = table.unique(values)
408    uniques = _reconstruct_data(uniques, original.dtype, original)
409    return uniques
410
411
412unique1d = unique
413
414
415def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
416    """
417    Compute the isin boolean array.
418
419    Parameters
420    ----------
421    comps : array-like
422    values : array-like
423
424    Returns
425    -------
426    ndarray[bool]
427        Same length as `comps`.
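
    Examples
    --------
    A minimal sketch:

    >>> import numpy as np
    >>> isin(np.array([1, 2, 3]), [1, 3])
    array([ True, False,  True])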
428    """
429    if not is_list_like(comps):
430        raise TypeError(
431            "only list-like objects are allowed to be passed "
432            f"to isin(), you passed a [{type(comps).__name__}]"
433        )
434    if not is_list_like(values):
435        raise TypeError(
436            "only list-like objects are allowed to be passed "
437            f"to isin(), you passed a [{type(values).__name__}]"
438        )
439
440    if not isinstance(
441        values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray)
442    ):
443        values = _ensure_arraylike(list(values))
444    elif isinstance(values, ABCMultiIndex):
445        # Avoid raising in extract_array
446        values = np.array(values)
447    else:
448        values = extract_array(values, extract_numpy=True)
449
450    comps = _ensure_arraylike(comps)
451    comps = extract_array(comps, extract_numpy=True)
452    if is_categorical_dtype(comps.dtype):
453        # TODO(extension)
454        # handle categoricals
455        return cast("Categorical", comps).isin(values)
456
457    if needs_i8_conversion(comps.dtype):
458        # Dispatch to DatetimeLikeArrayMixin.isin
459        return array(comps).isin(values)
460    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype):
461        # e.g. comps are integers and values are datetime64s
462        return np.zeros(comps.shape, dtype=bool)
463        # TODO: not quite right ... Sparse/Categorical
464    elif needs_i8_conversion(values.dtype):
465        return isin(comps, values.astype(object))
466
467    elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype(
468        values.dtype
469    ):
470        return isin(np.asarray(comps), np.asarray(values))
471
472    # GH16012
473    # Ensure np.in1d doesn't get object types or it *may* throw an exception
474    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
475    # in1d is faster for small sizes
476    if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
477        # If the values include nan we need to check for nan explicitly
478        # since np.nan it not equal to np.nan
479        if isna(values).any():
480            f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
481        else:
482            f = np.in1d
483
484    else:
485        common = np.find_common_type([values.dtype, comps.dtype], [])
486        values = values.astype(common, copy=False)
487        comps = comps.astype(common, copy=False)
488        name = common.name
489        if name == "bool":
490            name = "uint8"
491        f = getattr(htable, f"ismember_{name}")
492
493    return f(comps, values)
494
495
496def factorize_array(
497    values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None
498) -> Tuple[np.ndarray, np.ndarray]:
499    """
500    Factorize an array-like to codes and uniques.
501
502    This doesn't do any coercion of types or unboxing before factorization.
503
504    Parameters
505    ----------
506    values : ndarray
507    na_sentinel : int, default -1
508    size_hint : int, optional
509        Passed through to the hashtable's 'get_labels' method
510    na_value : object, optional
511        A value in `values` to consider missing. Note: only use this
512        parameter when you know that you don't have any values pandas would
513        consider missing in the array (NaN for float data, iNaT for
514        datetimes, etc.).
515    mask : ndarray[bool], optional
516        If not None, the mask is used as indicator for missing values
517        (True = missing, False = valid) instead of `na_value` or
518        condition "val != val".
519
520    Returns
521    -------
522    codes : ndarray
523    uniques : ndarray
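
    Examples
    --------
    A small sketch (the exact dtype of `codes` is platform-dependent):

    >>> import numpy as np
    >>> codes, uniques = factorize_array(np.array(["b", "a", "b"], dtype=object))
    >>> codes
    array([0, 1, 0])
    >>> uniques
    array(['b', 'a'], dtype=object)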
524    """
525    hash_klass, values = get_data_algo(values)
526
527    table = hash_klass(size_hint or len(values))
528    uniques, codes = table.factorize(
529        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
530    )
531
532    codes = ensure_platform_int(codes)
533    return codes, uniques
534
535
536@doc(
537    values=dedent(
538        """\
539    values : sequence
540        A 1-D sequence. Sequences that aren't pandas objects are
541        coerced to ndarrays before factorization.
542    """
543    ),
544    sort=dedent(
545        """\
546    sort : bool, default False
547        Sort `uniques` and shuffle `codes` to maintain the
548        relationship.
549    """
550    ),
551    size_hint=dedent(
552        """\
553    size_hint : int, optional
554        Hint to the hashtable sizer.
555    """
556    ),
557)
558def factorize(
559    values,
560    sort: bool = False,
561    na_sentinel: Optional[int] = -1,
562    size_hint: Optional[int] = None,
563) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]:
564    """
565    Encode the object as an enumerated type or categorical variable.
566
567    This method is useful for obtaining a numeric representation of an
568    array when all that matters is identifying distinct values. `factorize`
569    is available as both a top-level function :func:`pandas.factorize`,
570    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
571
572    Parameters
573    ----------
574    {values}{sort}
575    na_sentinel : int or None, default -1
576        Value to mark "not found". If None, will not drop the NaN
577        from the uniques of the values.
578
579        .. versionchanged:: 1.1.2
580    {size_hint}\
581
582    Returns
583    -------
584    codes : ndarray
585        An integer ndarray that's an indexer into `uniques`.
586        ``uniques.take(codes)`` will have the same values as `values`.
587    uniques : ndarray, Index, or Categorical
588        The unique valid values. When `values` is Categorical, `uniques`
589        is a Categorical. When `values` is some other pandas object, an
590        `Index` is returned. Otherwise, a 1-D ndarray is returned.
591
592        .. note ::
593
594           Even if there's a missing value in `values`, `uniques` will
595           *not* contain an entry for it.
596
597    See Also
598    --------
599    cut : Discretize continuous-valued array.
600    unique : Find the unique value in an array.
601
602    Examples
603    --------
604    These examples all show factorize as a top-level method like
605    ``pd.factorize(values)``. The results are identical for methods like
606    :meth:`Series.factorize`.
607
608    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
609    >>> codes
610    array([0, 0, 1, 2, 0]...)
611    >>> uniques
612    array(['b', 'a', 'c'], dtype=object)
613
614    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
615    shuffled so that the relationship is the maintained.
616
617    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
618    >>> codes
619    array([1, 1, 0, 2, 1]...)
620    >>> uniques
621    array(['a', 'b', 'c'], dtype=object)
622
623    Missing values are indicated in `codes` with `na_sentinel`
624    (``-1`` by default). Note that missing values are never
625    included in `uniques`.
626
627    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
628    >>> codes
629    array([ 0, -1,  1,  2,  0]...)
630    >>> uniques
631    array(['b', 'a', 'c'], dtype=object)
632
633    Thus far, we've only factorized lists (which are internally coerced to
634    NumPy arrays). When factorizing pandas objects, the type of `uniques`
635    will differ. For Categoricals, a `Categorical` is returned.
636
637    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
638    >>> codes, uniques = pd.factorize(cat)
639    >>> codes
640    array([0, 0, 1]...)
641    >>> uniques
642    ['a', 'c']
643    Categories (3, object): ['a', 'b', 'c']
644
645    Notice that ``'b'`` is in ``uniques.categories``, despite not being
646    present in ``cat.values``.
647
648    For all other pandas objects, an Index of the appropriate type is
649    returned.
650
651    >>> cat = pd.Series(['a', 'a', 'c'])
652    >>> codes, uniques = pd.factorize(cat)
653    >>> codes
654    array([0, 0, 1]...)
655    >>> uniques
656    Index(['a', 'c'], dtype='object')
657
658    If NaN is in the values, and we want to include NaN in the uniques of the
659    values, it can be achieved by setting ``na_sentinel=None``.
660
661    >>> values = np.array([1, 2, 1, np.nan])
662    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
663    >>> codes
664    array([ 0,  1,  0, -1])
665    >>> uniques
666    array([1., 2.])
667
668    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
669    >>> codes
670    array([0, 1, 0, 2])
671    >>> uniques
672    array([ 1.,  2., nan])
673    """
674    # Implementation notes: This method is responsible for 3 things
675    # 1.) coercing data to array-like (ndarray, Index, extension array)
676    # 2.) factorizing codes and uniques
677    # 3.) Maybe boxing the uniques in an Index
678    #
679    # Step 2 is dispatched to extension types (like Categorical). They are
680    # responsible only for factorization. All data coercion, sorting and boxing
681    # should happen here.
682
683    if isinstance(values, ABCRangeIndex):
684        return values.factorize(sort=sort)
685
686    values = _ensure_arraylike(values)
687    original = values
688    if not isinstance(values, ABCMultiIndex):
689        values = extract_array(values, extract_numpy=True)
690
691    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
692    # of values, assign na_sentinel=-1 to replace code value for NaN.
693    dropna = True
694    if na_sentinel is None:
695        na_sentinel = -1
696        dropna = False
697
698    if (
699        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
700        and values.freq is not None
701    ):
702        codes, uniques = values.factorize(sort=sort)
703        if isinstance(original, ABCIndexClass):
704            uniques = original._shallow_copy(uniques, name=None)
705        elif isinstance(original, ABCSeries):
706            from pandas import Index
707
708            uniques = Index(uniques)
709        return codes, uniques
710
711    if is_extension_array_dtype(values.dtype):
712        codes, uniques = values.factorize(na_sentinel=na_sentinel)
713        dtype = original.dtype
714    else:
715        values, dtype = _ensure_data(values)
716
717        if original.dtype.kind in ["m", "M"]:
718            na_value = na_value_for_dtype(original.dtype)
719        else:
720            na_value = None
721
722        codes, uniques = factorize_array(
723            values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
724        )
725
726    if sort and len(uniques) > 0:
727        uniques, codes = safe_sort(
728            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
729        )
730
731    code_is_na = codes == na_sentinel
732    if not dropna and code_is_na.any():
733        # na_value is set based on the dtype of uniques, and compat set to False is
734        # because we do not want na_value to be 0 for integers
735        na_value = na_value_for_dtype(uniques.dtype, compat=False)
736        uniques = np.append(uniques, [na_value])
737        codes = np.where(code_is_na, len(uniques) - 1, codes)
738
739    uniques = _reconstruct_data(uniques, dtype, original)
740
741    # return original tenor
742    if isinstance(original, ABCIndexClass):
743        if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray):
744            uniques = type(original._data)._simple_new(uniques, dtype=original.dtype)
745        uniques = original._shallow_copy(uniques, name=None)
746    elif isinstance(original, ABCSeries):
747        from pandas import Index
748
749        uniques = Index(uniques)
750
751    return codes, uniques
752
753
754def value_counts(
755    values,
756    sort: bool = True,
757    ascending: bool = False,
758    normalize: bool = False,
759    bins=None,
760    dropna: bool = True,
761) -> Series:
762    """
763    Compute a histogram of the counts of non-null values.
764
765    Parameters
766    ----------
767    values : ndarray (1-d)
768    sort : bool, default True
769        Sort by values
770    ascending : bool, default False
771        Sort in ascending order
772    normalize: bool, default False
773        If True then compute a relative histogram
774    bins : integer, optional
775        Rather than count values, group them into half-open bins,
776        convenience for pd.cut, only works with numeric data
777    dropna : bool, default True
778        Don't include counts of NaN
779
780    Returns
781    -------
782    Series
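
    Examples
    --------
    A minimal sketch (output shown is indicative):

    >>> import numpy as np
    >>> value_counts(np.array([1, 1, 2]))
    1    2
    2    1
    dtype: int64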
783    """
784    from pandas.core.series import Series
785
786    name = getattr(values, "name", None)
787
788    if bins is not None:
789        from pandas.core.reshape.tile import cut
790
791        values = Series(values)
792        try:
793            ii = cut(values, bins, include_lowest=True)
794        except TypeError as err:
795            raise TypeError("bins argument only works with numeric data.") from err
796
797        # count, remove nulls (from the index), and but the bins
798        result = ii.value_counts(dropna=dropna)
799        result = result[result.index.notna()]
800        result.index = result.index.astype("interval")
801        result = result.sort_index()
802
803        # if we are dropna and we have NO values
804        if dropna and (result._values == 0).all():
805            result = result.iloc[0:0]
806
807        # normalizing is by len of all (regardless of dropna)
808        counts = np.array([len(ii)])
809
810    else:
811
812        if is_extension_array_dtype(values):
813
814            # handle Categorical and sparse,
815            result = Series(values)._values.value_counts(dropna=dropna)
816            result.name = name
817            counts = result._values
818
819        else:
820            keys, counts = value_counts_arraylike(values, dropna)
821
822            result = Series(counts, index=keys, name=name)
823
824    if sort:
825        result = result.sort_values(ascending=ascending)
826
827    if normalize:
828        result = result / float(counts.sum())
829
830    return result
831
832
833# Called once from SparseArray, otherwise could be private
834def value_counts_arraylike(values, dropna: bool):
835    """
836    Parameters
837    ----------
838    values : arraylike
839    dropna : bool
840
841    Returns
842    -------
843    uniques : np.ndarray or ExtensionArray
844    counts : np.ndarray
845    """
846    values = _ensure_arraylike(values)
847    original = values
848    values, _ = _ensure_data(values)
849    ndtype = values.dtype.name
850
851    if needs_i8_conversion(original.dtype):
852        # datetime, timedelta, or period
853
854        keys, counts = htable.value_count_int64(values, dropna)
855
856        if dropna:
857            msk = keys != iNaT
858            keys, counts = keys[msk], counts[msk]
859
860    else:
861        # ndarray like
862
863        # TODO: handle uint8
864        f = getattr(htable, f"value_count_{ndtype}")
865        keys, counts = f(values, dropna)
866
867        mask = isna(values)
868        if not dropna and mask.any():
869            if not isna(keys).any():
870                keys = np.insert(keys, 0, np.NaN)
871                counts = np.insert(counts, 0, mask.sum())
872
873    keys = _reconstruct_data(keys, original.dtype, original)
874
875    return keys, counts
876
877
878def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray:
879    """
880    Return boolean ndarray denoting duplicate values.
881
882    Parameters
883    ----------
884    values : ndarray-like
885        Array over which to check for duplicate values.
886    keep : {'first', 'last', False}, default 'first'
887        - ``first`` : Mark duplicates as ``True`` except for the first
888          occurrence.
889        - ``last`` : Mark duplicates as ``True`` except for the last
890          occurrence.
891        - False : Mark all duplicates as ``True``.
892
893    Returns
894    -------
895    duplicated : ndarray
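
    Examples
    --------
    A minimal sketch:

    >>> import numpy as np
    >>> duplicated(np.array([1, 2, 1, 3]))
    array([False, False,  True, False])
    >>> duplicated(np.array([1, 2, 1, 3]), keep="last")
    array([ True, False, False, False])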
896    """
897    values, _ = _ensure_data(values)
898    ndtype = values.dtype.name
899    f = getattr(htable, f"duplicated_{ndtype}")
900    return f(values, keep=keep)
901
902
903def mode(values, dropna: bool = True) -> Series:
904    """
905    Returns the mode(s) of an array.
906
907    Parameters
908    ----------
909    values : array-like
910        Array over which to check for duplicate values.
911    dropna : boolean, default True
912        Don't consider counts of NaN/NaT.
913
914        .. versionadded:: 0.24.0
915
916    Returns
917    -------
918    mode : Series
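
    Examples
    --------
    A minimal sketch:

    >>> import numpy as np
    >>> mode(np.array([1, 2, 2, 3]))
    0    2
    dtype: int64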
919    """
920    from pandas import Series
921
922    values = _ensure_arraylike(values)
923    original = values
924
925    # categorical is a fast-path
926    if is_categorical_dtype(values):
927        if isinstance(values, Series):
928            # TODO: should we be passing `name` below?
929            return Series(values._values.mode(dropna=dropna), name=values.name)
930        return values.mode(dropna=dropna)
931
932    if dropna and needs_i8_conversion(values.dtype):
933        mask = values.isnull()
934        values = values[~mask]
935
936    values, _ = _ensure_data(values)
937    ndtype = values.dtype.name
938
939    f = getattr(htable, f"mode_{ndtype}")
940    result = f(values, dropna=dropna)
941    try:
942        result = np.sort(result)
943    except TypeError as err:
944        warn(f"Unable to sort modes: {err}")
945
946    result = _reconstruct_data(result, original.dtype, original)
947    return Series(result)
948
949
950def rank(
951    values,
952    axis: int = 0,
953    method: str = "average",
954    na_option: str = "keep",
955    ascending: bool = True,
956    pct: bool = False,
957):
958    """
959    Rank the values along a given axis.
960
961    Parameters
962    ----------
963    values : array-like
964        Array whose values will be ranked. The number of dimensions in this
965        array must not exceed 2.
966    axis : int, default 0
967        Axis over which to perform rankings.
968    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
969        The method by which tiebreaks are broken during the ranking.
970    na_option : {'keep', 'top'}, default 'keep'
971        The method by which NaNs are placed in the ranking.
972        - ``keep``: rank each NaN value with a NaN ranking
973        - ``top``: replace each NaN with either +/- inf so that they
974                   there are ranked at the top
975    ascending : boolean, default True
976        Whether or not the elements should be ranked in ascending order.
977    pct : boolean, default False
978        Whether or not to the display the returned rankings in integer form
979        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
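
    Examples
    --------
    A minimal sketch ranking a 1-D array (ties are averaged by default):

    >>> import numpy as np
    >>> rank(np.array([3.0, 1.0, 2.0]))
    array([3., 1., 2.])
    >>> rank(np.array([1.0, 1.0, 2.0]))
    array([1.5, 1.5, 3. ])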
980    """
981    if values.ndim == 1:
982        values = _get_values_for_rank(values)
983        ranks = algos.rank_1d(
984            values,
985            ties_method=method,
986            ascending=ascending,
987            na_option=na_option,
988            pct=pct,
989        )
990    elif values.ndim == 2:
991        values = _get_values_for_rank(values)
992        ranks = algos.rank_2d(
993            values,
994            axis=axis,
995            ties_method=method,
996            ascending=ascending,
997            na_option=na_option,
998            pct=pct,
999        )
1000    else:
1001        raise TypeError("Array with ndim > 2 are not supported.")
1002
1003    return ranks
1004
1005
1006def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
1007    """
1008    Perform array addition that checks for underflow and overflow.
1009
1010    Performs the addition of an int64 array and an int64 integer (or array)
1011    but checks that they do not result in overflow first. For elements that
1012    are indicated to be NaN, whether or not there is overflow for that element
1013    is automatically ignored.
1014
1015    Parameters
1016    ----------
1017    arr : array addend.
1018    b : array or scalar addend.
1019    arr_mask : boolean array or None
1020        array indicating which elements to exclude from checking
1021    b_mask : boolean array or boolean or None
1022        array or scalar indicating which element(s) to exclude from checking
1023
1024    Returns
1025    -------
1026    sum : An array for elements x + b for each element x in arr if b is
1027          a scalar or an array for elements x + y for each element pair
1028          (x, y) in (arr, b).
1029
1030    Raises
1031    ------
1032    OverflowError if any x + y exceeds the maximum or minimum int64 value.
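
    Examples
    --------
    A minimal sketch (reprs are indicative; the second call is expected
    to raise):

    >>> import numpy as np
    >>> checked_add_with_arr(np.array([1, 2], dtype="int64"), 1)
    array([2, 3])
    >>> checked_add_with_arr(np.array([np.iinfo(np.int64).max], dtype="int64"), 1)
    Traceback (most recent call last):
        ...
    OverflowError: Overflow in int64 addition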
1033    """
1034    # For performance reasons, we broadcast 'b' to the new array 'b2'
1035    # so that it has the same size as 'arr'.
1036    b2 = np.broadcast_to(b, arr.shape)
1037    if b_mask is not None:
1038        # We do the same broadcasting for b_mask as well.
1039        b2_mask = np.broadcast_to(b_mask, arr.shape)
1040    else:
1041        b2_mask = None
1042
1043    # For elements that are NaN, regardless of their value, we should
1044    # ignore whether they overflow or not when doing the checked add.
1045    if arr_mask is not None and b2_mask is not None:
1046        not_nan = np.logical_not(arr_mask | b2_mask)
1047    elif arr_mask is not None:
1048        not_nan = np.logical_not(arr_mask)
1049    elif b_mask is not None:
1050        not_nan = np.logical_not(b2_mask)
1051    else:
1052        not_nan = np.empty(arr.shape, dtype=bool)
1053        not_nan.fill(True)
1054
1055    # gh-14324: For each element in 'arr' and its corresponding element
1056    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
1057    # we then check whether its sum with the element in 'arr' exceeds
1058    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
1059    # it is negative, we then check whether its sum with the element in
1060    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
1061    # error as well.
1062    mask1 = b2 > 0
1063    mask2 = b2 < 0
1064
1065    if not mask1.any():
1066        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
1067    elif not mask2.any():
1068        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
1069    else:
1070        to_raise = (
1071            (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]
1072        ).any() or (
1073            (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
1074        ).any()
1075
1076    if to_raise:
1077        raise OverflowError("Overflow in int64 addition")
1078    return arr + b
1079
1080
1081def quantile(x, q, interpolation_method="fraction"):
1082    """
1083    Compute sample quantile or quantiles of the input array. For example, q=0.5
1084    computes the median.
1085
1086    The `interpolation_method` parameter supports three values, namely
1087    `fraction` (default), `lower` and `higher`. Interpolation is done only,
1088    if the desired quantile lies between two data points `i` and `j`. For
1089    `fraction`, the result is an interpolated value between `i` and `j`;
1090    for `lower`, the result is `i`, for `higher` the result is `j`.
1091
1092    Parameters
1093    ----------
1094    x : ndarray
1095        Values from which to extract score.
1096    q : scalar or array
1097        Percentile at which to extract score.
1098    interpolation_method : {'fraction', 'lower', 'higher'}, optional
1099        This optional parameter specifies the interpolation method to use,
1100        when the desired quantile lies between two data points `i` and `j`:
1101
1102        - fraction: `i + (j - i)*fraction`, where `fraction` is the
1103                    fractional part of the index surrounded by `i` and `j`.
1104        -lower: `i`.
1105        - higher: `j`.
1106
1107    Returns
1108    -------
1109    score : float
1110        Score at percentile.
1111
1112    Examples
1113    --------
1114    >>> from scipy import stats
1115    >>> a = np.arange(100)
1116    >>> stats.scoreatpercentile(a, 50)
1117    49.5
1118
1119    """
1120    x = np.asarray(x)
1121    mask = isna(x)
1122
1123    x = x[~mask]
1124
1125    values = np.sort(x)
1126
1127    def _interpolate(a, b, fraction):
1128        """
1129        Returns the point at the given fraction between a and b, where
1130        'fraction' must be between 0 and 1.
1131        """
1132        return a + (b - a) * fraction
1133
1134    def _get_score(at):
1135        if len(values) == 0:
1136            return np.nan
1137
1138        idx = at * (len(values) - 1)
1139        if idx % 1 == 0:
1140            score = values[int(idx)]
1141        else:
1142            if interpolation_method == "fraction":
1143                score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
1144            elif interpolation_method == "lower":
1145                score = values[np.floor(idx)]
1146            elif interpolation_method == "higher":
1147                score = values[np.ceil(idx)]
1148            else:
1149                raise ValueError(
1150                    "interpolation_method can only be 'fraction' "
1151                    ", 'lower' or 'higher'"
1152                )
1153
1154        return score
1155
1156    if is_scalar(q):
1157        return _get_score(q)
1158    else:
1159        q = np.asarray(q, np.float64)
1160        result = [_get_score(x) for x in q]
1161        result = np.array(result, dtype=np.float64)
1162        return result
1163
1164
1165# --------------- #
1166# select n        #
1167# --------------- #
1168
1169
1170class SelectN:
1171    def __init__(self, obj, n: int, keep: str):
1172        self.obj = obj
1173        self.n = n
1174        self.keep = keep
1175
1176        if self.keep not in ("first", "last", "all"):
1177            raise ValueError('keep must be either "first", "last" or "all"')
1178
1179    def compute(self, method: str) -> FrameOrSeriesUnion:
1180        raise NotImplementedError
1181
1182    def nlargest(self):
1183        return self.compute("nlargest")
1184
1185    def nsmallest(self):
1186        return self.compute("nsmallest")
1187
1188    @staticmethod
1189    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
1190        """
1191        Helper function to determine if dtype is valid for
1192        nsmallest/nlargest methods
1193        """
1194        return (
1195            is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
1196        ) or needs_i8_conversion(dtype)
1197
1198
1199class SelectNSeries(SelectN):
1200    """
1201    Implement n largest/smallest for Series
1202
1203    Parameters
1204    ----------
1205    obj : Series
1206    n : int
1207    keep : {'first', 'last'}, default 'first'
1208
1209    Returns
1210    -------
1211    nordered : Series
1212    """
1213
1214    def compute(self, method: str) -> Series:
1215
1216        n = self.n
1217        dtype = self.obj.dtype
1218        if not self.is_valid_dtype_n_method(dtype):
1219            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")
1220
1221        if n <= 0:
1222            return self.obj[[]]
1223
1224        dropped = self.obj.dropna()
1225
1226        # slow method
1227        if n >= len(self.obj):
1228            ascending = method == "nsmallest"
1229            return dropped.sort_values(ascending=ascending).head(n)
1230
1231        # fast method
1232        arr, pandas_dtype = _ensure_data(dropped.values)
1233        if method == "nlargest":
1234            arr = -arr
1235            if is_integer_dtype(pandas_dtype):
1236                # GH 21426: ensure reverse ordering at boundaries
1237                arr -= 1
1238
1239            elif is_bool_dtype(pandas_dtype):
1240                # GH 26154: ensure False is smaller than True
1241                arr = 1 - (-arr)
1242
1243        if self.keep == "last":
1244            arr = arr[::-1]
1245
1246        narr = len(arr)
1247        n = min(n, narr)
1248
1249        kth_val = algos.kth_smallest(arr.copy(), n - 1)
1250        (ns,) = np.nonzero(arr <= kth_val)
1251        inds = ns[arr[ns].argsort(kind="mergesort")]
1252
1253        if self.keep != "all":
1254            inds = inds[:n]
1255
1256        if self.keep == "last":
1257            # reverse indices
1258            inds = narr - 1 - inds
1259
1260        return dropped.iloc[inds]
1261
1262
1263class SelectNFrame(SelectN):
1264    """
1265    Implement n largest/smallest for DataFrame
1266
1267    Parameters
1268    ----------
1269    obj : DataFrame
1270    n : int
1271    keep : {'first', 'last'}, default 'first'
1272    columns : list or str
1273
1274    Returns
1275    -------
1276    nordered : DataFrame
1277    """
1278
1279    def __init__(self, obj, n: int, keep: str, columns):
1280        super().__init__(obj, n, keep)
1281        if not is_list_like(columns) or isinstance(columns, tuple):
1282            columns = [columns]
1283        columns = list(columns)
1284        self.columns = columns
1285
1286    def compute(self, method: str) -> DataFrame:
1287
1288        from pandas import Int64Index
1289
1290        n = self.n
1291        frame = self.obj
1292        columns = self.columns
1293
1294        for column in columns:
1295            dtype = frame[column].dtype
1296            if not self.is_valid_dtype_n_method(dtype):
1297                raise TypeError(
1298                    f"Column {repr(column)} has dtype {dtype}, "
1299                    f"cannot use method {repr(method)} with this dtype"
1300                )
1301
1302        def get_indexer(current_indexer, other_indexer):
1303            """
1304            Helper function to concat `current_indexer` and `other_indexer`
1305            depending on `method`
1306            """
1307            if method == "nsmallest":
1308                return current_indexer.append(other_indexer)
1309            else:
1310                return other_indexer.append(current_indexer)
1311
1312        # Below we save and reset the index in case index contains duplicates
1313        original_index = frame.index
1314        cur_frame = frame = frame.reset_index(drop=True)
1315        cur_n = n
1316        indexer = Int64Index([])
1317
1318        for i, column in enumerate(columns):
1319            # For each column we apply method to cur_frame[column].
1320            # If it's the last column or if we have the number of
1321            # results desired we are done.
1322            # Otherwise there are duplicates of the largest/smallest
1323            # value and we need to look at the rest of the columns
1324            # to determine which of the rows with the largest/smallest
1325            # value in the column to keep.
1326            series = cur_frame[column]
1327            is_last_column = len(columns) - 1 == i
1328            values = getattr(series, method)(
1329                cur_n, keep=self.keep if is_last_column else "all"
1330            )
1331
1332            if is_last_column or len(values) <= cur_n:
1333                indexer = get_indexer(indexer, values.index)
1334                break
1335
1336            # Now find all values which are equal to
1337            # the (nsmallest: largest)/(nlargest: smallest)
1338            # from our series.
1339            border_value = values == values[values.index[-1]]
1340
1341            # Some of these values are among the top-n
1342            # some aren't.
1343            unsafe_values = values[border_value]
1344
1345            # These values are definitely among the top-n
1346            safe_values = values[~border_value]
1347            indexer = get_indexer(indexer, safe_values.index)
1348
1349            # Go on and separate the unsafe_values on the remaining
1350            # columns.
1351            cur_frame = cur_frame.loc[unsafe_values.index]
1352            cur_n = n - len(indexer)
1353
1354        frame = frame.take(indexer)
1355
1356        # Restore the index on frame
1357        frame.index = original_index.take(indexer)
1358
1359        # If there is only one column, the frame is already sorted.
1360        if len(columns) == 1:
1361            return frame
1362
1363        ascending = method == "nsmallest"
1364
1365        return frame.sort_values(columns, ascending=ascending, kind="mergesort")
1366
1367
1368# ---- #
1369# take #
1370# ---- #
1371
1372
1373def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
1374    def wrapper(arr, indexer, out, fill_value=np.nan):
1375        if arr_dtype is not None:
1376            arr = arr.view(arr_dtype)
1377        if out_dtype is not None:
1378            out = out.view(out_dtype)
1379        if fill_wrap is not None:
1380            fill_value = fill_wrap(fill_value)
1381        f(arr, indexer, out, fill_value=fill_value)
1382
1383    return wrapper
1384
1385
1386def _convert_wrapper(f, conv_dtype):
1387    def wrapper(arr, indexer, out, fill_value=np.nan):
1388        arr = arr.astype(conv_dtype)
1389        f(arr, indexer, out, fill_value=fill_value)
1390
1391    return wrapper
1392
1393
1394def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
1395    # this is not ideal, performance-wise, but it's better than raising
1396    # an exception (best to optimize in Cython to avoid getting here)
1397    row_idx, col_idx = indexer
1398    if mask_info is not None:
1399        (row_mask, col_mask), (row_needs, col_needs) = mask_info
1400    else:
1401        row_mask = row_idx == -1
1402        col_mask = col_idx == -1
1403        row_needs = row_mask.any()
1404        col_needs = col_mask.any()
1405    if fill_value is not None:
1406        if row_needs:
1407            out[row_mask, :] = fill_value
1408        if col_needs:
1409            out[:, col_mask] = fill_value
1410    for i in range(len(row_idx)):
1411        u_ = row_idx[i]
1412        for j in range(len(col_idx)):
1413            v = col_idx[j]
1414            out[i, j] = arr[u_, v]
1415
1416
1417def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
1418    if mask_info is not None:
1419        mask, needs_masking = mask_info
1420    else:
1421        mask = indexer == -1
1422        needs_masking = mask.any()
1423    if arr.dtype != out.dtype:
1424        arr = arr.astype(out.dtype)
1425    if arr.shape[axis] > 0:
1426        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
1427    if needs_masking:
1428        outindexer = [slice(None)] * arr.ndim
1429        outindexer[axis] = mask
1430        out[tuple(outindexer)] = fill_value
1431
1432
1433_take_1d_dict = {
1434    ("int8", "int8"): algos.take_1d_int8_int8,
1435    ("int8", "int32"): algos.take_1d_int8_int32,
1436    ("int8", "int64"): algos.take_1d_int8_int64,
1437    ("int8", "float64"): algos.take_1d_int8_float64,
1438    ("int16", "int16"): algos.take_1d_int16_int16,
1439    ("int16", "int32"): algos.take_1d_int16_int32,
1440    ("int16", "int64"): algos.take_1d_int16_int64,
1441    ("int16", "float64"): algos.take_1d_int16_float64,
1442    ("int32", "int32"): algos.take_1d_int32_int32,
1443    ("int32", "int64"): algos.take_1d_int32_int64,
1444    ("int32", "float64"): algos.take_1d_int32_float64,
1445    ("int64", "int64"): algos.take_1d_int64_int64,
1446    ("int64", "float64"): algos.take_1d_int64_float64,
1447    ("float32", "float32"): algos.take_1d_float32_float32,
1448    ("float32", "float64"): algos.take_1d_float32_float64,
1449    ("float64", "float64"): algos.take_1d_float64_float64,
1450    ("object", "object"): algos.take_1d_object_object,
1451    ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
1452    ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
1453    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
1454        algos.take_1d_int64_int64, np.int64, np.int64, np.int64
1455    ),
1456}
1457
1458_take_2d_axis0_dict = {
1459    ("int8", "int8"): algos.take_2d_axis0_int8_int8,
1460    ("int8", "int32"): algos.take_2d_axis0_int8_int32,
1461    ("int8", "int64"): algos.take_2d_axis0_int8_int64,
1462    ("int8", "float64"): algos.take_2d_axis0_int8_float64,
1463    ("int16", "int16"): algos.take_2d_axis0_int16_int16,
1464    ("int16", "int32"): algos.take_2d_axis0_int16_int32,
1465    ("int16", "int64"): algos.take_2d_axis0_int16_int64,
1466    ("int16", "float64"): algos.take_2d_axis0_int16_float64,
1467    ("int32", "int32"): algos.take_2d_axis0_int32_int32,
1468    ("int32", "int64"): algos.take_2d_axis0_int32_int64,
1469    ("int32", "float64"): algos.take_2d_axis0_int32_float64,
1470    ("int64", "int64"): algos.take_2d_axis0_int64_int64,
1471    ("int64", "float64"): algos.take_2d_axis0_int64_float64,
1472    ("float32", "float32"): algos.take_2d_axis0_float32_float32,
1473    ("float32", "float64"): algos.take_2d_axis0_float32_float64,
1474    ("float64", "float64"): algos.take_2d_axis0_float64_float64,
1475    ("object", "object"): algos.take_2d_axis0_object_object,
1476    ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
1477    ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
1478    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
1479        algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
1480    ),
1481}
1482
1483_take_2d_axis1_dict = {
1484    ("int8", "int8"): algos.take_2d_axis1_int8_int8,
1485    ("int8", "int32"): algos.take_2d_axis1_int8_int32,
1486    ("int8", "int64"): algos.take_2d_axis1_int8_int64,
1487    ("int8", "float64"): algos.take_2d_axis1_int8_float64,
1488    ("int16", "int16"): algos.take_2d_axis1_int16_int16,
1489    ("int16", "int32"): algos.take_2d_axis1_int16_int32,
1490    ("int16", "int64"): algos.take_2d_axis1_int16_int64,
1491    ("int16", "float64"): algos.take_2d_axis1_int16_float64,
1492    ("int32", "int32"): algos.take_2d_axis1_int32_int32,
1493    ("int32", "int64"): algos.take_2d_axis1_int32_int64,
1494    ("int32", "float64"): algos.take_2d_axis1_int32_float64,
1495    ("int64", "int64"): algos.take_2d_axis1_int64_int64,
1496    ("int64", "float64"): algos.take_2d_axis1_int64_float64,
1497    ("float32", "float32"): algos.take_2d_axis1_float32_float32,
1498    ("float32", "float64"): algos.take_2d_axis1_float32_float64,
1499    ("float64", "float64"): algos.take_2d_axis1_float64_float64,
1500    ("object", "object"): algos.take_2d_axis1_object_object,
1501    ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
1502    ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
1503    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
1504        algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
1505    ),
1506}
1507
1508_take_2d_multi_dict = {
1509    ("int8", "int8"): algos.take_2d_multi_int8_int8,
1510    ("int8", "int32"): algos.take_2d_multi_int8_int32,
1511    ("int8", "int64"): algos.take_2d_multi_int8_int64,
1512    ("int8", "float64"): algos.take_2d_multi_int8_float64,
1513    ("int16", "int16"): algos.take_2d_multi_int16_int16,
1514    ("int16", "int32"): algos.take_2d_multi_int16_int32,
1515    ("int16", "int64"): algos.take_2d_multi_int16_int64,
1516    ("int16", "float64"): algos.take_2d_multi_int16_float64,
1517    ("int32", "int32"): algos.take_2d_multi_int32_int32,
1518    ("int32", "int64"): algos.take_2d_multi_int32_int64,
1519    ("int32", "float64"): algos.take_2d_multi_int32_float64,
1520    ("int64", "int64"): algos.take_2d_multi_int64_int64,
1521    ("int64", "float64"): algos.take_2d_multi_int64_float64,
1522    ("float32", "float32"): algos.take_2d_multi_float32_float32,
1523    ("float32", "float64"): algos.take_2d_multi_float32_float64,
1524    ("float64", "float64"): algos.take_2d_multi_float64_float64,
1525    ("object", "object"): algos.take_2d_multi_object_object,
1526    ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
1527    ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
1528    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
1529        algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
1530    ),
1531}
1532
1533
1534def _get_take_nd_function(
1535    ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None
1536):
1537    if ndim <= 2:
1538        tup = (arr_dtype.name, out_dtype.name)
1539        if ndim == 1:
1540            func = _take_1d_dict.get(tup, None)
1541        elif ndim == 2:
1542            if axis == 0:
1543                func = _take_2d_axis0_dict.get(tup, None)
1544            else:
1545                func = _take_2d_axis1_dict.get(tup, None)
1546        if func is not None:
1547            return func
1548
1549        tup = (out_dtype.name, out_dtype.name)
1550        if ndim == 1:
1551            func = _take_1d_dict.get(tup, None)
1552        elif ndim == 2:
1553            if axis == 0:
1554                func = _take_2d_axis0_dict.get(tup, None)
1555            else:
1556                func = _take_2d_axis1_dict.get(tup, None)
1557        if func is not None:
1558            func = _convert_wrapper(func, out_dtype)
1559            return func
1560
1561    def func2(arr, indexer, out, fill_value=np.nan):
1562        indexer = ensure_int64(indexer)
1563        _take_nd_object(
1564            arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
1565        )
1566
1567    return func2


def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the dtype of `arr` is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_1d(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


def take_nd(
    arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True
):
    """
    Specialized Cython take which sets NaN values in one pass.

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : array-like
        Input array.
    indexer : ndarray
        1-D array of indices to take; subarrays corresponding to -1 value
        indices are filled with fill_value.
    axis : int, default 0
        Axis to take from.
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        maybe_promote to determine this type for any fill_value.
    fill_value : any, default np.nan
        Fill value to replace -1 values with.
    allow_fill : bool, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done.  This short-circuits computation of a mask.  The result
        is undefined if ``allow_fill=False`` and -1 is present in indexer.

    Returns
    -------
    subarray : array-like
        May be the same type as the input, or cast to an ndarray.
    """
    mask_info = None

    if isinstance(arr, ABCExtensionArray):
        # Check for EA to catch DatetimeArray, TimedeltaArray
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = extract_array(arr)
    arr = np.asarray(arr)

    if indexer is None:
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                mask = indexer == -1
                needs_masking = mask.any()
                mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError("Incompatible type for fill_value")
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape_ = list(arr.shape)
        out_shape_[axis] = len(indexer)
        out_shape = tuple(out_shape_)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order="F")
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out


take_1d = take_nd
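

# Illustrative sketch (hypothetical helper, not part of pandas): unlike
# ``ndarray.take``, ``take_nd`` promotes the dtype when the indexer contains
# -1 ("missing"), so an int64 input comes back as float64 with NaN in the
# masked positions.
def _example_take_nd_promotion():  # pragma: no cover
    arr = np.array([1, 2, 3], dtype=np.int64)
    return take_nd(arr, np.array([0, -1, 2], dtype=np.int64))
    # array([ 1., nan,  3.])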


def take_2d_multi(arr, indexer, fill_value=np.nan):
    """
    Specialized Cython take which sets NaN values in one pass.
    """
    # This is only called from one place in DataFrame._reindex_multi,
    #  so we know indexer is well-behaved.
    assert indexer is not None
    assert indexer[0] is not None
    assert indexer[1] is not None

    row_idx, col_idx = indexer

    row_idx = ensure_int64(row_idx)
    col_idx = ensure_int64(col_idx)
    indexer = row_idx, col_idx
    mask_info = None

    # check for promotion based on types only (do this first because
    # it's faster than computing a mask)
    dtype, fill_value = maybe_promote(arr.dtype, fill_value)
    if dtype != arr.dtype:
        # check if promotion is actually required based on indexer
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
        mask_info = (row_mask, col_mask), (row_needs, col_needs)

        if not (row_needs or col_needs):
            # if not, then depromote, set fill_value to dummy
            # (it won't be used but we don't want the cython code
            # to crash when trying to cast it to dtype)
            dtype, fill_value = arr.dtype, arr.dtype.type()

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out_shape = len(row_idx), len(col_idx)
    out = np.empty(out_shape, dtype=dtype)

    func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
    if func is None and arr.dtype != out.dtype:
        func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
        if func is not None:
            func = _convert_wrapper(func, out.dtype)
    if func is None:

        def func(arr, indexer, out, fill_value=np.nan):
            _take_2d_multi_object(
                arr, indexer, out, fill_value=fill_value, mask_info=mask_info
            )

    func(arr, indexer, out=out, fill_value=fill_value)
    return out
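

# Illustrative sketch (hypothetical helper, not part of pandas):
# DataFrame._reindex_multi reindexes both axes in one pass this way; -1
# entries in either indexer become fill_value, with dtype promotion as in
# take_nd.
def _example_take_2d_multi():  # pragma: no cover
    arr = np.arange(6, dtype=np.int64).reshape(2, 3)
    indexer = np.array([1, -1], dtype=np.int64), np.array([0, 2], dtype=np.int64)
    return take_2d_multi(arr, indexer)
    # array([[ 3.,  5.],
    #        [nan, nan]])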


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray:
    """
    Find indices where elements should be inserted to maintain order.

    .. versionadded:: 0.25.0

    Find the indices into a sorted array `arr` such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : array-like
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array_like
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index.  If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array_like, optional
        Optional array of integer indices that sort `arr` into ascending
        order. They are typically the result of ``argsort``.

    Returns
    -------
    array of ints
        Array of insertion points with the same shape as `value`.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr.dtype)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_scalar(value):
            value = dtype.type(value)
        else:
            value = array(value, dtype=dtype)
    elif not (
        is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
    ):
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        value_ser = array([value]) if is_scalar(value) else array(value)
        value = value_ser[0] if is_scalar(value) else value_ser
        if isinstance(value, Timestamp) and value.tzinfo is None:
            value = value.to_datetime64()

    result = arr.searchsorted(value, side=side, sorter=sorter)
    return result
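

# Illustrative sketch (hypothetical helper, not part of pandas) of the
# integer fast path above: casting the scalar to the array's dtype (when it
# fits) avoids numpy recasting the whole array to a wider common dtype
# before the binary search.
def _example_searchsorted():  # pragma: no cover
    arr = np.array([1, 3, 5, 7], dtype=np.int64)
    assert searchsorted(arr, 5, side="left") == 2
    assert searchsorted(arr, 5, side="right") == 3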


# ---- #
# diff #
# ---- #

_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0, stacklevel=3):
    """
    Calculate the difference of each array element from its n-th previous
    element, analogous to ``s - s.shift(n)``.

    Parameters
    ----------
    arr : ndarray
    n : int
        Number of periods to shift.
    axis : int
        Axis to shift on.
    stacklevel : int
        The stacklevel for the lost dtype warning.

    Returns
    -------
    shifted : array-like
    """
    from pandas.core.arrays import PandasDtype

    n = int(n)
    na = np.nan
    dtype = arr.dtype

    if dtype.kind == "b":
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = np.asarray(arr)
        dtype = arr.dtype

    if is_extension_array_dtype(dtype):
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            warn(
                "dtype lost in 'diff()'. In the future this will raise a "
                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                FutureWarning,
                stacklevel=stacklevel,
            )
            arr = np.asarray(arr)
            dtype = arr.dtype

    is_timedelta = False
    is_bool = False
    if needs_i8_conversion(arr.dtype):
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool_dtype(dtype):
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_
        is_bool = True

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.ndim == 2 and arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        #  out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        #  a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * arr.ndim
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * arr.ndim
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        # need to make sure that we account for na for datelike/timedelta
        # we don't actually want to subtract these i8 numbers
        if is_timedelta:
            res = arr[res_indexer]
            lag = arr[lag_indexer]

            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
            if mask.any():
                res = res.copy()
                res[mask] = 0
                lag = lag.copy()
                lag[mask] = 0

            result = res - lag
            result[mask] = na
            out_arr[res_indexer] = result
        elif is_bool:
            out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer]
        else:
            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr
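

# Illustrative sketch (hypothetical helper, not part of pandas) of the
# casting rules above: bool input diffs via xor into object dtype (so NaN
# can be held), and int8/int16 widen to float32 rather than float64 (see the
# cython issue referenced in the comments).
def _example_diff():  # pragma: no cover
    boolean = diff(np.array([True, False, True]), 1)
    # array([nan, True, True], dtype=object)
    small = diff(np.array([1, 2, 4], dtype=np.int8), 1)
    assert small.dtype == np.float32  # array([nan, 1., 2.], dtype=float32)
    return boolean, small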


# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
#  low-dependency, is used in this module, and uses private methods from
#  this module.
def safe_sort(
    values,
    codes=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str); orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list-like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``codes`` to mark "not found".
        Ignored when ``codes`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, (np.ndarray, ABCExtensionArray)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    sorter = None

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # argsort is not applicable (e.g. mixed types); fall back on
            # _sort_mixed, except for the special case of 1d arrays of
            # tuples, which _sort_mixed cannot handle.
            if values.size and isinstance(values[0], tuple):
                ordered = _sort_tuples(values)
            else:
                ordered = _sort_mixed(values)

    # codes:

    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_1d(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        mask = codes == na_sentinel
        if verify:
            mask = mask | (codes < -len(values)) | (codes >= len(values))

    if mask is not None:
        np.putmask(new_codes, mask, na_sentinel)

    return ordered, ensure_platform_int(new_codes)
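

# Illustrative sketch (hypothetical helper, not part of pandas): sorting a
# factorization in one shot — codes are remapped to follow the sorted
# values, with -1 kept as the NA sentinel.
def _example_safe_sort():  # pragma: no cover
    values = np.array(["b", "a", "c"], dtype=object)
    codes = np.array([0, 1, 2, -1, 0])
    ordered, new_codes = safe_sort(values, codes)
    # ordered   -> array(['a', 'b', 'c'], dtype=object)
    # new_codes -> array([ 1,  0,  2, -1,  1])
    return ordered, new_codes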


def _sort_mixed(values):
    """order ints before strings in 1d arrays, safe in py3"""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    nums = np.sort(values[~str_pos])
    strs = np.sort(values[str_pos])
    return np.concatenate([nums, np.asarray(strs, dtype=object)])
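

# Illustrative sketch (hypothetical helper, not part of pandas) of the
# mixed-type ordering contract used by safe_sort:
def _example_sort_mixed():  # pragma: no cover
    values = np.array(["b", 3, "a", 1], dtype=object)
    return _sort_mixed(values)  # array([1, 3, 'a', 'b'], dtype=object)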


def _sort_tuples(values: np.ndarray):
    """
    Sort a 1d array of tuples.

    Convert the array of tuples (1d) to an array of arrays (2d).
    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    arrays, _ = to_arrays(values, None)
    indexer = lexsort_indexer(arrays, orders=True)
    return values[indexer]

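
# Illustrative sketch (hypothetical helper, not part of pandas): tuples are
# split into per-position columns so each column can be ordered
# independently, since str and nan cannot be compared within one column.
def _example_sort_tuples():  # pragma: no cover
    values = construct_1d_object_array_from_listlike([(1, "b"), (1, "a"), (0, "z")])
    return _sort_tuples(values)  # [(0, 'z'), (1, 'a'), (1, 'b')]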