1from csv import QUOTE_NONNUMERIC
2from functools import partial
3import operator
4from shutil import get_terminal_size
5from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast
6from warnings import warn
7
8import numpy as np
9
10from pandas._config import get_option
11
12from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib
13from pandas._libs.lib import no_default
14from pandas._typing import ArrayLike, Dtype, Ordered, Scalar
15from pandas.compat.numpy import function as nv
16from pandas.util._decorators import cache_readonly, deprecate_kwarg
17from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
18
19from pandas.core.dtypes.cast import (
20    coerce_indexer_dtype,
21    maybe_cast_to_extension_array,
22    maybe_infer_to_datetimelike,
23)
24from pandas.core.dtypes.common import (
25    ensure_int64,
26    ensure_object,
27    is_categorical_dtype,
28    is_datetime64_dtype,
29    is_dict_like,
30    is_dtype_equal,
31    is_extension_array_dtype,
32    is_hashable,
33    is_integer_dtype,
34    is_list_like,
35    is_object_dtype,
36    is_scalar,
37    is_timedelta64_dtype,
38    needs_i8_conversion,
39)
40from pandas.core.dtypes.dtypes import CategoricalDtype
41from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
42from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna
43
44from pandas.core import ops
45from pandas.core.accessor import PandasDelegate, delegate_names
46import pandas.core.algorithms as algorithms
47from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d
48from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
49from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject
50import pandas.core.common as com
51from pandas.core.construction import array, extract_array, sanitize_array
52from pandas.core.indexers import deprecate_ndim_indexing
53from pandas.core.missing import interpolate_2d
54from pandas.core.ops.common import unpack_zerodim_and_defer
55from pandas.core.sorting import nargsort
56from pandas.core.strings.object_array import ObjectStringArrayMixin
57
58from pandas.io.formats import console
59
60CategoricalT = TypeVar("CategoricalT", bound="Categorical")
61
62
def _cat_compare_op(op):
    """
    Create the comparison method of :class:`Categorical` for ``op``.

    Parameters
    ----------
    op : callable
        A comparison function from :mod:`operator`; its ``__name__`` is used
        to derive the dunder name of the generated method.

    Returns
    -------
    callable
        A method ``func(self, other)`` named ``__<op.__name__>__``.
    """
    opname = f"__{op.__name__}__"
    # Value written wherever a missing code (-1) takes part in the comparison:
    # True for ``!=`` and False for every other operator.
    fill_value = op is operator.ne

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not hashable:
            # in hashable case we may have a tuple that is itself a category
            raise ValueError("Lengths must match.")

        if not self.ordered:
            if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
                raise TypeError(
                    "Unordered Categoricals can only compare equality or not"
                )
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)

            msg = "Categoricals can only be compared if 'categories' are the same."
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(msg)

            if not self.ordered and not self.categories.equals(other.categories):
                # both unordered and different order
                other_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )
            else:
                other_codes = other._codes

            ret = op(self._codes, other_codes)
            # A missing value on either side yields fill_value
            mask = (self._codes == -1) | (other_codes == -1)
            if mask.any():
                ret[mask] = fill_value
            return ret

        if hashable:
            if other in self.categories:
                i = self._unbox_scalar(other)
                ret = op(self._codes, i)

                if opname not in {"__eq__", "__ge__", "__gt__"}:
                    # GH#29820 performance trick; get_loc will always give i>=0,
                    #  so for (__eq__, __ge__, __gt__) a missing code (-1)
                    #  already compares False, which equals fill_value, and
                    #  the masking is skipped. It is only performed for the
                    #  remaining ops.
                    mask = self._codes == -1
                    ret[mask] = fill_value
                return ret
            else:
                return ops.invalid_comparison(self, other, op)
        else:
            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if opname not in ["__eq__", "__ne__"]:
                raise TypeError(
                    f"Cannot compare a Categorical for op {opname} with "
                    f"type {type(other)}.\nIf you want to compare values, "
                    "use 'np.asarray(cat) <op> other'."
                )

            if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
                # We would return NotImplemented here, but that messes up
                #  ExtensionIndex's wrapped methods
                return op(other, self)
            return getattr(np.array(self), opname)(np.array(other))

    func.__name__ = opname

    return func
134
135
def contains(cat, key, container):
    """
    Membership test for ``key`` among the values of ``cat``.

    Shared helper for ``Categorical.__contains__`` and
    ``CategoricalIndex.__contains__``.

    ``key`` counts as contained only when it is one of ``cat.categories``
    *and* its position within the categories occurs in ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : hashable
        The value to look for.
    container : Container (e.g. list-like or mapping)
        Collection that is searched for the position of ``key``.

    Returns
    -------
    bool
        True if ``key`` is a category and its location within the
        categories is found in ``container``; False otherwise.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    NaN is not handled here; callers must check for it beforehand.
    """
    # Fail early (TypeError) on unhashable keys.
    hash(key)

    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        # Not a category at all, so it cannot be one of the values either.
        return False

    # Being a category is not enough: the category may be unused, e.g.
    # 'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(pos in container for pos in position)
    return position in container
185
186
187class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
188    """
189    Represent a categorical variable in classic R / S-plus fashion.
190
    `Categoricals` can take on only a limited, and usually fixed, number
192    of possible values (`categories`). In contrast to statistical categorical
193    variables, a `Categorical` might have an order, but numerical operations
194    (additions, divisions, ...) are not possible.
195
196    All values of the `Categorical` are either in `categories` or `np.nan`.
197    Assigning values outside of `categories` will raise a `ValueError`. Order
198    is defined by the order of the `categories`, not lexical order of the
199    values.
200
201    Parameters
202    ----------
203    values : list-like
204        The values of the categorical. If categories are given, values not in
205        categories will be replaced with NaN.
206    categories : Index-like (unique), optional
207        The unique categories for this categorical. If not given, the
208        categories are assumed to be the unique values of `values` (sorted, if
209        possible, otherwise in the order in which they appear).
210    ordered : bool, default False
        Whether or not this categorical is treated as an ordered categorical.
212        If True, the resulting categorical will be ordered.
213        An ordered categorical respects, when sorted, the order of its
214        `categories` attribute (which in turn is the `categories` argument, if
215        provided).
216    dtype : CategoricalDtype
217        An instance of ``CategoricalDtype`` to use for this categorical.
218
219    Attributes
220    ----------
221    categories : Index
222        The categories of this categorical
223    codes : ndarray
224        The codes (integer positions, which point to the categories) of this
225        categorical, read only.
226    ordered : bool
227        Whether or not this Categorical is ordered.
228    dtype : CategoricalDtype
229        The instance of ``CategoricalDtype`` storing the ``categories``
230        and ``ordered``.
231
232    Methods
233    -------
234    from_codes
235    __array__
236
237    Raises
238    ------
239    ValueError
240        If the categories do not validate.
241    TypeError
242        If an explicit ``ordered=True`` is given but no `categories` and the
243        `values` are not sortable.
244
245    See Also
246    --------
247    CategoricalDtype : Type for categorical data.
248    CategoricalIndex : An Index with an underlying ``Categorical``.
249
250    Notes
251    -----
252    See the `user guide
253    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_
254    for more.
255
256    Examples
257    --------
258    >>> pd.Categorical([1, 2, 3, 1, 2, 3])
259    [1, 2, 3, 1, 2, 3]
260    Categories (3, int64): [1, 2, 3]
261
262    >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
263    ['a', 'b', 'c', 'a', 'b', 'c']
264    Categories (3, object): ['a', 'b', 'c']
265
266    Missing values are not included as a category.
267
268    >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
269    >>> c
270    [1, 2, 3, 1, 2, 3, NaN]
271    Categories (3, int64): [1, 2, 3]
272
273    However, their presence is indicated in the `codes` attribute
274    by code `-1`.
275
276    >>> c.codes
277    array([ 0,  1,  2,  0,  1,  2, -1], dtype=int8)
278
279    Ordered `Categoricals` can be sorted according to the custom order
280    of the categories and can have a min and max value.
281
282    >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
283    ...                    categories=['c', 'b', 'a'])
284    >>> c
285    ['a', 'b', 'c', 'a', 'b', 'c']
286    Categories (3, object): ['c' < 'b' < 'a']
287    >>> c.min()
288    'c'
289    """
290
    # For comparisons, so that numpy uses our implementation of the compare
    # ops, which raise
293    __array_priority__ = 1000
294    _dtype = CategoricalDtype(ordered=False)
295    # tolist is not actually deprecated, just suppressed in the __dir__
296    _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
297    _typ = "categorical"
298    _can_hold_na = True
299
    def __init__(
        self, values, categories=None, ordered=None, dtype=None, fastpath=False
    ):
        """
        Compute and store the codes and dtype of a new Categorical.

        Parameters
        ----------
        values : list-like
            The data to encode. With ``fastpath=True`` they are taken to be
            the codes themselves.
        categories, ordered, dtype
            Passed to ``CategoricalDtype._from_values_or_dtype`` together
            with ``values`` to resolve the dtype.
        fastpath : bool, default False
            If True, skip sanitizing and factorizing: ``values`` are coerced
            with ``coerce_indexer_dtype`` and stored as the codes.

        Raises
        ------
        TypeError
            If the categories have to be inferred, the dtype is ordered and
            sorted factorization of the values raises ``TypeError``.
        NotImplementedError
            If factorizing the values raises ``ValueError`` (reported as
            "> 1 ndim Categorical are not supported at this time").
        """

        dtype = CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype
        )
        # At this point, dtype is always a CategoricalDtype, but
        # we may have dtype.categories be None, and we need to
        # infer categories in a factorization step further below

        if fastpath:
            # `values` are used as codes without any validation
            self._codes = coerce_indexer_dtype(values, dtype.categories)
            self._dtype = self._dtype.update_dtype(dtype)
            return

        # null_mask indicates missing values we want to exclude from inference.
        # This means: only missing values in list-likes (not arrays/ndframes).
        null_mask = np.array(False)

        # sanitize input
        if is_categorical_dtype(values):
            if dtype.categories is None:
                # no categories requested: reuse the ones `values` carries
                dtype = CategoricalDtype(values.categories, dtype.ordered)
        elif not isinstance(values, (ABCIndexClass, ABCSeries)):
            # sanitize_array coerces np.nan to a string under certain versions
            # of numpy
            values = maybe_infer_to_datetimelike(values, convert_dates=True)
            if not isinstance(values, (np.ndarray, ExtensionArray)):
                values = com.convert_to_list_like(values)

                # By convention, empty lists result in object dtype:
                sanitize_dtype = np.dtype("O") if len(values) == 0 else None
                null_mask = isna(values)
                if null_mask.any():
                    # drop the missing entries; their -1 codes are put back
                    # once the codes have been computed
                    values = [values[idx] for idx in np.where(~null_mask)[0]]
                values = sanitize_array(values, None, dtype=sanitize_dtype)

        if dtype.categories is None:
            # categories are inferred by factorizing the values
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError as err:
                # sorting failed: fall back to order of appearance
                codes, categories = factorize(values, sort=False)
                if dtype.ordered:
                    # raise, as we don't have a sortable data structure and so
                    # the user should give us one by specifying categories
                    raise TypeError(
                        "'values' is not ordered, please "
                        "explicitly specify the categories order "
                        "by passing in a categories argument."
                    ) from err
            except ValueError as err:

                # TODO(EA2D)
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                ) from err

            # we're inferring from values
            dtype = CategoricalDtype(categories, dtype.ordered)

        elif is_categorical_dtype(values.dtype):
            # categorical input with explicit target categories: translate
            # the existing codes instead of re-encoding the values
            old_codes = extract_array(values).codes
            codes = recode_for_categories(
                old_codes, values.dtype.categories, dtype.categories
            )

        else:
            codes = _get_codes_for_values(values, dtype.categories)

        if null_mask.any():
            # Reinsert -1 placeholders for previously removed missing values
            full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
            full_codes[~null_mask] = codes
            codes = full_codes

        self._dtype = self._dtype.update_dtype(dtype)
        self._codes = coerce_indexer_dtype(codes, dtype.categories)
378
    @property
    def dtype(self) -> CategoricalDtype:
        """
        The :class:`~pandas.api.types.CategoricalDtype` for this instance.

        Returns
        -------
        CategoricalDtype
            The object stored in ``self._dtype``.
        """
        return self._dtype
385
    @property
    def _constructor(self) -> Type["Categorical"]:
        """
        Return the ``Categorical`` class itself (not ``type(self)``).
        """
        return Categorical
389
390    @classmethod
391    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
392        return Categorical(scalars, dtype=dtype)
393
394    def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike:
395        """
396        Coerce this type to another dtype
397
398        Parameters
399        ----------
400        dtype : numpy dtype or pandas type
401        copy : bool, default True
402            By default, astype always returns a newly allocated object.
403            If copy is set to False and dtype is categorical, the original
404            object is returned.
405        """
406        if self.dtype is dtype:
407            result = self.copy() if copy else self
408
409        elif is_categorical_dtype(dtype):
410            dtype = cast(Union[str, CategoricalDtype], dtype)
411
412            # GH 10696/18593/18630
413            dtype = self.dtype.update_dtype(dtype)
414            self = self.copy() if copy else self
415            result = self._set_dtype(dtype)
416
417        # TODO: consolidate with ndarray case?
418        elif is_extension_array_dtype(dtype):
419            result = array(self, dtype=dtype, copy=copy)
420
421        elif is_integer_dtype(dtype) and self.isna().any():
422            raise ValueError("Cannot convert float NaN to integer")
423
424        elif len(self.codes) == 0 or len(self.categories) == 0:
425            result = np.array(self, dtype=dtype, copy=copy)
426
427        else:
428            # GH8628 (PERF): astype category codes instead of astyping array
429            try:
430                new_cats = np.asarray(self.categories)
431                new_cats = new_cats.astype(dtype=dtype, copy=copy)
432                fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype))
433            except (
434                TypeError,  # downstream error msg for CategoricalIndex is misleading
435                ValueError,
436            ):
437                msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
438                raise ValueError(msg)
439
440            result = take_1d(
441                new_cats,
442                libalgos.ensure_platform_int(self._codes),
443                fill_value=fill_value,
444            )
445
446        return result
447
    @cache_readonly
    def itemsize(self) -> int:
        """
        Return the size of a single category.

        Returns
        -------
        int
            The value of ``self.categories.itemsize``.
        """
        return self.categories.itemsize
454
    def tolist(self) -> List[Scalar]:
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        list
            The elements produced by iterating over this array.
        """
        return list(self)
464
465    to_list = tolist
466
467    @classmethod
468    def _from_inferred_categories(
469        cls, inferred_categories, inferred_codes, dtype, true_values=None
470    ):
471        """
472        Construct a Categorical from inferred values.
473
474        For inferred categories (`dtype` is None) the categories are sorted.
475        For explicit `dtype`, the `inferred_categories` are cast to the
476        appropriate type.
477
478        Parameters
479        ----------
480        inferred_categories : Index
481        inferred_codes : Index
482        dtype : CategoricalDtype or 'category'
483        true_values : list, optional
484            If none are provided, the default ones are
485            "True", "TRUE", and "true."
486
487        Returns
488        -------
489        Categorical
490        """
491        from pandas import Index, to_datetime, to_numeric, to_timedelta
492
493        cats = Index(inferred_categories)
494        known_categories = (
495            isinstance(dtype, CategoricalDtype) and dtype.categories is not None
496        )
497
498        if known_categories:
499            # Convert to a specialized type with `dtype` if specified.
500            if dtype.categories.is_numeric():
501                cats = to_numeric(inferred_categories, errors="coerce")
502            elif is_datetime64_dtype(dtype.categories):
503                cats = to_datetime(inferred_categories, errors="coerce")
504            elif is_timedelta64_dtype(dtype.categories):
505                cats = to_timedelta(inferred_categories, errors="coerce")
506            elif dtype.categories.is_boolean():
507                if true_values is None:
508                    true_values = ["True", "TRUE", "true"]
509
510                cats = cats.isin(true_values)
511
512        if known_categories:
513            # Recode from observation order to dtype.categories order.
514            categories = dtype.categories
515            codes = recode_for_categories(inferred_codes, cats, categories)
516        elif not cats.is_monotonic_increasing:
517            # Sort categories and recode for unknown categories.
518            unsorted = cats.copy()
519            categories = cats.sort_values()
520
521            codes = recode_for_categories(inferred_codes, unsorted, categories)
522            dtype = CategoricalDtype(categories, ordered=False)
523        else:
524            dtype = CategoricalDtype(cats, ordered=False)
525            codes = inferred_codes
526
527        return cls(codes, dtype=dtype, fastpath=True)
528
529    @classmethod
530    def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
531        """
532        Make a Categorical type from codes and categories or dtype.
533
534        This constructor is useful if you already have codes and
535        categories/dtype and so do not need the (computation intensive)
536        factorization step, which is usually done on the constructor.
537
538        If your data does not follow this convention, please use the normal
539        constructor.
540
541        Parameters
542        ----------
543        codes : array-like of int
544            An integer array, where each integer points to a category in
545            categories or dtype.categories, or else is -1 for NaN.
546        categories : index-like, optional
547            The categories for the categorical. Items need to be unique.
548            If the categories are not given here, then they must be provided
549            in `dtype`.
550        ordered : bool, optional
551            Whether or not this categorical is treated as an ordered
552            categorical. If not given here or in `dtype`, the resulting
553            categorical will be unordered.
554        dtype : CategoricalDtype or "category", optional
555            If :class:`CategoricalDtype`, cannot be used together with
556            `categories` or `ordered`.
557
558            .. versionadded:: 0.24.0
559
560               When `dtype` is provided, neither `categories` nor `ordered`
561               should be provided.
562
563        Returns
564        -------
565        Categorical
566
567        Examples
568        --------
569        >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
570        >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
571        ['a', 'b', 'a', 'b']
572        Categories (2, object): ['a' < 'b']
573        """
574        dtype = CategoricalDtype._from_values_or_dtype(
575            categories=categories, ordered=ordered, dtype=dtype
576        )
577        if dtype.categories is None:
578            msg = (
579                "The categories must be provided in 'categories' or "
580                "'dtype'. Both were None."
581            )
582            raise ValueError(msg)
583
584        if is_extension_array_dtype(codes) and is_integer_dtype(codes):
585            # Avoid the implicit conversion of Int to object
586            if isna(codes).any():
587                raise ValueError("codes cannot contain NA values")
588            codes = codes.to_numpy(dtype=np.int64)
589        else:
590            codes = np.asarray(codes)
591        if len(codes) and not is_integer_dtype(codes):
592            raise ValueError("codes need to be array-like integers")
593
594        if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
595            raise ValueError("codes need to be between -1 and len(categories)-1")
596
597        return cls(codes, dtype=dtype, fastpath=True)
598
599    # ------------------------------------------------------------------
600    # Categories/Codes/Ordered
601
    @property
    def categories(self):
        """
        The categories of this categorical.

        Setting assigns new values to each category (effectively a rename of
        each individual category).

        The assigned value has to be a list-like object. All items must be
        unique and the number of items in the new categories must be the same
        as the number of items in the old categories.

        Assigning to `categories` is an inplace operation!

        Raises
        ------
        ValueError
            If the new categories do not validate as categories or if the
            number of new categories differs from the number of old
            categories.

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.
        """
        # The categories are stored on the dtype, not on the array itself.
        return self.dtype.categories
632
633    @categories.setter
634    def categories(self, categories):
635        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
636        if self.dtype.categories is not None and len(self.dtype.categories) != len(
637            new_dtype.categories
638        ):
639            raise ValueError(
640                "new categories need to have the same number of "
641                "items as the old categories!"
642            )
643        self._dtype = new_dtype
644
    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.

        Returns
        -------
        The value of ``self.dtype.ordered``.
        """
        return self.dtype.ordered
651
652    @property
653    def codes(self) -> np.ndarray:
654        """
655        The category codes of this categorical.
656
657        Codes are an array of integers which are the positions of the actual
658        values in the categories array.
659
660        There is no setter, use the other categorical methods and the normal item
661        setter to change values in the categorical.
662
663        Returns
664        -------
665        ndarray[int]
666            A non-writable view of the `codes` array.
667        """
668        v = self._codes.view()
669        v.flags.writeable = False
670        return v
671
672    def _set_categories(self, categories, fastpath=False):
673        """
674        Sets new categories inplace
675
676        Parameters
677        ----------
678        fastpath : bool, default False
679           Don't perform validation of the categories for uniqueness or nulls
680
681        Examples
682        --------
683        >>> c = pd.Categorical(['a', 'b'])
684        >>> c
685        ['a', 'b']
686        Categories (2, object): ['a', 'b']
687
688        >>> c._set_categories(pd.Index(['a', 'c']))
689        >>> c
690        ['a', 'c']
691        Categories (2, object): ['a', 'c']
692        """
693        if fastpath:
694            new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
695        else:
696            new_dtype = CategoricalDtype(categories, ordered=self.ordered)
697        if (
698            not fastpath
699            and self.dtype.categories is not None
700            and len(new_dtype.categories) != len(self.dtype.categories)
701        ):
702            raise ValueError(
703                "new categories need to have the same number of "
704                "items than the old categories!"
705            )
706
707        self._dtype = new_dtype
708
709    def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical":
710        """
711        Internal method for directly updating the CategoricalDtype
712
713        Parameters
714        ----------
715        dtype : CategoricalDtype
716
717        Notes
718        -----
719        We don't do any validation here. It's assumed that the dtype is
720        a (valid) instance of `CategoricalDtype`.
721        """
722        codes = recode_for_categories(self.codes, self.categories, dtype.categories)
723        return type(self)(codes, dtype=dtype, fastpath=True)
724
725    def set_ordered(self, value, inplace=False):
726        """
727        Set the ordered attribute to the boolean value.
728
729        Parameters
730        ----------
731        value : bool
732           Set whether this categorical is ordered (True) or not (False).
733        inplace : bool, default False
734           Whether or not to set the ordered attribute in-place or return
735           a copy of this categorical with ordered set to the value.
736        """
737        inplace = validate_bool_kwarg(inplace, "inplace")
738        new_dtype = CategoricalDtype(self.categories, ordered=value)
739        cat = self if inplace else self.copy()
740        cat._dtype = new_dtype
741        if not inplace:
742            return cat
743
    def as_ordered(self, inplace=False):
        """
        Set the Categorical to be ordered.

        Parameters
        ----------
        inplace : bool, default False
           Whether or not to set the ordered attribute in-place or return
           a copy of this categorical with ordered set to True.

        Returns
        -------
        Categorical or None
            Ordered Categorical or None if ``inplace=True``.
        """
        # `set_ordered` validates `inplace` again; the work is delegated to it.
        inplace = validate_bool_kwarg(inplace, "inplace")
        return self.set_ordered(True, inplace=inplace)
761
    def as_unordered(self, inplace=False):
        """
        Set the Categorical to be unordered.

        Parameters
        ----------
        inplace : bool, default False
           Whether or not to set the ordered attribute in-place or return
           a copy of this categorical with ordered set to False.

        Returns
        -------
        Categorical or None
            Unordered Categorical or None if ``inplace=True``.
        """
        # `set_ordered` validates `inplace` again; the work is delegated to it.
        inplace = validate_bool_kwarg(inplace, "inplace")
        return self.set_ordered(False, inplace=inplace)
779
780    def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
781        """
782        Set the categories to the specified new_categories.
783
784        `new_categories` can include new categories (which will result in
785        unused categories) or remove old categories (which results in values
786        set to NaN). If `rename==True`, the categories will simple be renamed
787        (less or more items than in old categories will result in values set to
788        NaN or in unused categories respectively).
789
790        This method can be used to perform more than one action of adding,
791        removing, and reordering simultaneously and is therefore faster than
792        performing the individual steps via the more specialised methods.
793
794        On the other hand this methods does not do checks (e.g., whether the
795        old categories are included in the new categories on a reorder), which
796        can result in surprising changes, for example when using special string
797        dtypes, which does not considers a S1 string equal to a single char
798        python string.
799
800        Parameters
801        ----------
802        new_categories : Index-like
803           The categories in new order.
804        ordered : bool, default False
805           Whether or not the categorical is treated as a ordered categorical.
806           If not given, do not change the ordered information.
807        rename : bool, default False
808           Whether or not the new_categories should be considered as a rename
809           of the old categories or as reordered categories.
810        inplace : bool, default False
811           Whether or not to reorder the categories in-place or return a copy
812           of this categorical with reordered categories.
813
814        Returns
815        -------
816        Categorical with reordered categories or None if inplace.
817
818        Raises
819        ------
820        ValueError
821            If new_categories does not validate as categories
822
823        See Also
824        --------
825        rename_categories : Rename categories.
826        reorder_categories : Reorder categories.
827        add_categories : Add new categories.
828        remove_categories : Remove the specified categories.
829        remove_unused_categories : Remove categories which are not used.
830        """
831        inplace = validate_bool_kwarg(inplace, "inplace")
832        if ordered is None:
833            ordered = self.dtype.ordered
834        new_dtype = CategoricalDtype(new_categories, ordered=ordered)
835
836        cat = self if inplace else self.copy()
837        if rename:
838            if cat.dtype.categories is not None and len(new_dtype.categories) < len(
839                cat.dtype.categories
840            ):
841                # remove all _codes which are larger and set to -1/NaN
842                cat._codes[cat._codes >= len(new_dtype.categories)] = -1
843        else:
844            codes = recode_for_categories(
845                cat.codes, cat.categories, new_dtype.categories
846            )
847            cat._codes = codes
848        cat._dtype = new_dtype
849
850        if not inplace:
851            return cat
852
853    def rename_categories(self, new_categories, inplace=False):
854        """
855        Rename categories.
856
857        Parameters
858        ----------
859        new_categories : list-like, dict-like or callable
860
861            New categories which will replace old categories.
862
863            * list-like: all items must be unique and the number of items in
864              the new categories must match the existing number of categories.
865
866            * dict-like: specifies a mapping from
867              old categories to new. Categories not contained in the mapping
868              are passed through and extra categories in the mapping are
869              ignored.
870
871            * callable : a callable that is called on all items in the old
872              categories and whose return values comprise the new categories.
873
874        inplace : bool, default False
875            Whether or not to rename the categories inplace or return a copy of
876            this categorical with renamed categories.
877
878        Returns
879        -------
880        cat : Categorical or None
881            Categorical with removed categories or None if ``inplace=True``.
882
883        Raises
884        ------
885        ValueError
886            If new categories are list-like and do not have the same number of
887            items than the current categories or do not validate as categories
888
889        See Also
890        --------
891        reorder_categories : Reorder categories.
892        add_categories : Add new categories.
893        remove_categories : Remove the specified categories.
894        remove_unused_categories : Remove categories which are not used.
895        set_categories : Set the categories to the specified ones.
896
897        Examples
898        --------
899        >>> c = pd.Categorical(['a', 'a', 'b'])
900        >>> c.rename_categories([0, 1])
901        [0, 0, 1]
902        Categories (2, int64): [0, 1]
903
904        For dict-like ``new_categories``, extra keys are ignored and
905        categories not in the dictionary are passed through
906
907        >>> c.rename_categories({'a': 'A', 'c': 'C'})
908        ['A', 'A', 'b']
909        Categories (2, object): ['A', 'b']
910
911        You may also provide a callable to create the new categories
912
913        >>> c.rename_categories(lambda x: x.upper())
914        ['A', 'A', 'B']
915        Categories (2, object): ['A', 'B']
916        """
917        inplace = validate_bool_kwarg(inplace, "inplace")
918        cat = self if inplace else self.copy()
919
920        if is_dict_like(new_categories):
921            cat.categories = [new_categories.get(item, item) for item in cat.categories]
922        elif callable(new_categories):
923            cat.categories = [new_categories(item) for item in cat.categories]
924        else:
925            cat.categories = new_categories
926        if not inplace:
927            return cat
928
929    def reorder_categories(self, new_categories, ordered=None, inplace=False):
930        """
931        Reorder categories as specified in new_categories.
932
933        `new_categories` need to include all old categories and no new category
934        items.
935
936        Parameters
937        ----------
938        new_categories : Index-like
939           The categories in new order.
940        ordered : bool, optional
941           Whether or not the categorical is treated as a ordered categorical.
942           If not given, do not change the ordered information.
943        inplace : bool, default False
944           Whether or not to reorder the categories inplace or return a copy of
945           this categorical with reordered categories.
946
947        Returns
948        -------
949        cat : Categorical or None
950            Categorical with removed categories or None if ``inplace=True``.
951
952        Raises
953        ------
954        ValueError
955            If the new categories do not contain all old category items or any
956            new ones
957
958        See Also
959        --------
960        rename_categories : Rename categories.
961        add_categories : Add new categories.
962        remove_categories : Remove the specified categories.
963        remove_unused_categories : Remove categories which are not used.
964        set_categories : Set the categories to the specified ones.
965        """
966        inplace = validate_bool_kwarg(inplace, "inplace")
967        if set(self.dtype.categories) != set(new_categories):
968            raise ValueError(
969                "items in new_categories are not the same as in old categories"
970            )
971        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
972
973    def add_categories(self, new_categories, inplace=False):
974        """
975        Add new categories.
976
977        `new_categories` will be included at the last/highest place in the
978        categories and will be unused directly after this call.
979
980        Parameters
981        ----------
982        new_categories : category or list-like of category
983           The new categories to be included.
984        inplace : bool, default False
985           Whether or not to add the categories inplace or return a copy of
986           this categorical with added categories.
987
988        Returns
989        -------
990        cat : Categorical or None
991            Categorical with new categories added or None if ``inplace=True``.
992
993        Raises
994        ------
995        ValueError
996            If the new categories include old categories or do not validate as
997            categories
998
999        See Also
1000        --------
1001        rename_categories : Rename categories.
1002        reorder_categories : Reorder categories.
1003        remove_categories : Remove the specified categories.
1004        remove_unused_categories : Remove categories which are not used.
1005        set_categories : Set the categories to the specified ones.
1006        """
1007        inplace = validate_bool_kwarg(inplace, "inplace")
1008        if not is_list_like(new_categories):
1009            new_categories = [new_categories]
1010        already_included = set(new_categories) & set(self.dtype.categories)
1011        if len(already_included) != 0:
1012            raise ValueError(
1013                f"new categories must not include old categories: {already_included}"
1014            )
1015        new_categories = list(self.dtype.categories) + list(new_categories)
1016        new_dtype = CategoricalDtype(new_categories, self.ordered)
1017
1018        cat = self if inplace else self.copy()
1019        cat._dtype = new_dtype
1020        cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
1021        if not inplace:
1022            return cat
1023
1024    def remove_categories(self, removals, inplace=False):
1025        """
1026        Remove the specified categories.
1027
1028        `removals` must be included in the old categories. Values which were in
1029        the removed categories will be set to NaN
1030
1031        Parameters
1032        ----------
1033        removals : category or list of categories
1034           The categories which should be removed.
1035        inplace : bool, default False
1036           Whether or not to remove the categories inplace or return a copy of
1037           this categorical with removed categories.
1038
1039        Returns
1040        -------
1041        cat : Categorical or None
1042            Categorical with removed categories or None if ``inplace=True``.
1043
1044        Raises
1045        ------
1046        ValueError
1047            If the removals are not contained in the categories
1048
1049        See Also
1050        --------
1051        rename_categories : Rename categories.
1052        reorder_categories : Reorder categories.
1053        add_categories : Add new categories.
1054        remove_unused_categories : Remove categories which are not used.
1055        set_categories : Set the categories to the specified ones.
1056        """
1057        inplace = validate_bool_kwarg(inplace, "inplace")
1058        if not is_list_like(removals):
1059            removals = [removals]
1060
1061        removal_set = set(removals)
1062        not_included = removal_set - set(self.dtype.categories)
1063        new_categories = [c for c in self.dtype.categories if c not in removal_set]
1064
1065        # GH 10156
1066        if any(isna(removals)):
1067            not_included = {x for x in not_included if notna(x)}
1068            new_categories = [x for x in new_categories if notna(x)]
1069
1070        if len(not_included) != 0:
1071            raise ValueError(f"removals must all be in old categories: {not_included}")
1072
1073        return self.set_categories(
1074            new_categories, ordered=self.ordered, rename=False, inplace=inplace
1075        )
1076
1077    def remove_unused_categories(self, inplace=no_default):
1078        """
1079        Remove categories which are not used.
1080
1081        Parameters
1082        ----------
1083        inplace : bool, default False
1084           Whether or not to drop unused categories inplace or return a copy of
1085           this categorical with unused categories dropped.
1086
1087           .. deprecated:: 1.2.0
1088
1089        Returns
1090        -------
1091        cat : Categorical or None
1092            Categorical with unused categories dropped or None if ``inplace=True``.
1093
1094        See Also
1095        --------
1096        rename_categories : Rename categories.
1097        reorder_categories : Reorder categories.
1098        add_categories : Add new categories.
1099        remove_categories : Remove the specified categories.
1100        set_categories : Set the categories to the specified ones.
1101        """
1102        if inplace is not no_default:
1103            warn(
1104                "The `inplace` parameter in pandas.Categorical."
1105                "remove_unused_categories is deprecated and "
1106                "will be removed in a future version.",
1107                FutureWarning,
1108                stacklevel=2,
1109            )
1110        else:
1111            inplace = False
1112
1113        inplace = validate_bool_kwarg(inplace, "inplace")
1114        cat = self if inplace else self.copy()
1115        idx, inv = np.unique(cat._codes, return_inverse=True)
1116
1117        if idx.size != 0 and idx[0] == -1:  # na sentinel
1118            idx, inv = idx[1:], inv - 1
1119
1120        new_categories = cat.dtype.categories.take(idx)
1121        new_dtype = CategoricalDtype._from_fastpath(
1122            new_categories, ordered=self.ordered
1123        )
1124        cat._dtype = new_dtype
1125        cat._codes = coerce_indexer_dtype(inv, new_dtype.categories)
1126
1127        if not inplace:
1128            return cat
1129
1130    # ------------------------------------------------------------------
1131
1132    def map(self, mapper):
1133        """
1134        Map categories using input correspondence (dict, Series, or function).
1135
1136        Maps the categories to new categories. If the mapping correspondence is
1137        one-to-one the result is a :class:`~pandas.Categorical` which has the
1138        same order property as the original, otherwise a :class:`~pandas.Index`
1139        is returned. NaN values are unaffected.
1140
1141        If a `dict` or :class:`~pandas.Series` is used any unmapped category is
1142        mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
1143        will be returned.
1144
1145        Parameters
1146        ----------
1147        mapper : function, dict, or Series
1148            Mapping correspondence.
1149
1150        Returns
1151        -------
1152        pandas.Categorical or pandas.Index
1153            Mapped categorical.
1154
1155        See Also
1156        --------
1157        CategoricalIndex.map : Apply a mapping correspondence on a
1158            :class:`~pandas.CategoricalIndex`.
1159        Index.map : Apply a mapping correspondence on an
1160            :class:`~pandas.Index`.
1161        Series.map : Apply a mapping correspondence on a
1162            :class:`~pandas.Series`.
1163        Series.apply : Apply more complex functions on a
1164            :class:`~pandas.Series`.
1165
1166        Examples
1167        --------
1168        >>> cat = pd.Categorical(['a', 'b', 'c'])
1169        >>> cat
1170        ['a', 'b', 'c']
1171        Categories (3, object): ['a', 'b', 'c']
1172        >>> cat.map(lambda x: x.upper())
1173        ['A', 'B', 'C']
1174        Categories (3, object): ['A', 'B', 'C']
1175        >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
1176        ['first', 'second', 'third']
1177        Categories (3, object): ['first', 'second', 'third']
1178
1179        If the mapping is one-to-one the ordering of the categories is
1180        preserved:
1181
1182        >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
1183        >>> cat
1184        ['a', 'b', 'c']
1185        Categories (3, object): ['a' < 'b' < 'c']
1186        >>> cat.map({'a': 3, 'b': 2, 'c': 1})
1187        [3, 2, 1]
1188        Categories (3, int64): [3 < 2 < 1]
1189
1190        If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
1191
1192        >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
1193        Index(['first', 'second', 'first'], dtype='object')
1194
1195        If a `dict` is used, all unmapped categories are mapped to `NaN` and
1196        the result is an :class:`~pandas.Index`:
1197
1198        >>> cat.map({'a': 'first', 'b': 'second'})
1199        Index(['first', 'second', nan], dtype='object')
1200        """
1201        new_categories = self.categories.map(mapper)
1202        try:
1203            return self.from_codes(
1204                self._codes.copy(), categories=new_categories, ordered=self.ordered
1205            )
1206        except ValueError:
1207            # NA values are represented in self._codes with -1
1208            # np.take causes NA values to take final element in new_categories
1209            if np.any(self._codes == -1):
1210                new_categories = new_categories.insert(len(new_categories), np.nan)
1211            return np.take(new_categories, self._codes)
1212
    # Rich comparison dunders: each one is the result of calling
    # ``_cat_compare_op`` with the matching function from ``operator``.
    __eq__ = _cat_compare_op(operator.eq)
    __ne__ = _cat_compare_op(operator.ne)
    __lt__ = _cat_compare_op(operator.lt)
    __gt__ = _cat_compare_op(operator.gt)
    __le__ = _cat_compare_op(operator.le)
    __ge__ = _cat_compare_op(operator.ge)
1219
1220    # -------------------------------------------------------------
1221    # Validators; ideally these can be de-duplicated
1222
1223    def _validate_searchsorted_value(self, value):
1224        # searchsorted is very performance sensitive. By converting codes
1225        # to same dtype as self.codes, we get much faster performance.
1226        if is_scalar(value):
1227            codes = self._unbox_scalar(value)
1228        else:
1229            locs = [self.categories.get_loc(x) for x in value]
1230            codes = np.array(locs, dtype=self.codes.dtype)
1231        return codes
1232
1233    def _validate_fill_value(self, fill_value):
1234        """
1235        Convert a user-facing fill_value to a representation to use with our
1236        underlying ndarray, raising TypeError if this is not possible.
1237
1238        Parameters
1239        ----------
1240        fill_value : object
1241
1242        Returns
1243        -------
1244        fill_value : int
1245
1246        Raises
1247        ------
1248        TypeError
1249        """
1250
1251        if is_valid_nat_for_dtype(fill_value, self.categories.dtype):
1252            fill_value = -1
1253        elif fill_value in self.categories:
1254            fill_value = self._unbox_scalar(fill_value)
1255        else:
1256            raise TypeError(
1257                f"'fill_value={fill_value}' is not present "
1258                "in this Categorical's categories"
1259            )
1260        return fill_value
1261
    # ``_validate_scalar`` is bound to the same function as
    # ``_validate_fill_value``.
    _validate_scalar = _validate_fill_value
1263
1264    # -------------------------------------------------------------
1265
1266    def __array__(self, dtype=None) -> np.ndarray:
1267        """
1268        The numpy array interface.
1269
1270        Returns
1271        -------
1272        numpy.array
1273            A numpy array of either the specified dtype or,
1274            if dtype==None (default), the same dtype as
1275            categorical.categories.dtype.
1276        """
1277        ret = take_1d(self.categories._values, self._codes)
1278        if dtype and not is_dtype_equal(dtype, self.categories.dtype):
1279            return np.asarray(ret, dtype)
1280        # When we're a Categorical[ExtensionArray], like Interval,
1281        # we need to ensure __array__ gets all the way to an
1282        # ndarray.
1283        return np.asarray(ret)
1284
1285    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
1286        # for binary ops, use our custom dunder methods
1287        result = ops.maybe_dispatch_ufunc_to_dunder_op(
1288            self, ufunc, method, *inputs, **kwargs
1289        )
1290        if result is not NotImplemented:
1291            return result
1292
1293        # for all other cases, raise for now (similarly as what happens in
1294        # Series.__array_prepare__)
1295        raise TypeError(
1296            f"Object with dtype {self.dtype} cannot perform "
1297            f"the numpy op {ufunc.__name__}"
1298        )
1299
1300    def __setstate__(self, state):
1301        """Necessary for making this object picklable"""
1302        if not isinstance(state, dict):
1303            raise Exception("invalid pickle state")
1304
1305        if "_dtype" not in state:
1306            state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])
1307
1308        for k, v in state.items():
1309            setattr(self, k, v)
1310
1311    @property
1312    def nbytes(self) -> int:
1313        return self._codes.nbytes + self.dtype.categories.values.nbytes
1314
1315    def memory_usage(self, deep: bool = False) -> int:
1316        """
1317        Memory usage of my values
1318
1319        Parameters
1320        ----------
1321        deep : bool
1322            Introspect the data deeply, interrogate
1323            `object` dtypes for system-level memory consumption
1324
1325        Returns
1326        -------
1327        bytes used
1328
1329        Notes
1330        -----
1331        Memory usage does not include memory consumed by elements that
1332        are not components of the array if deep=False
1333
1334        See Also
1335        --------
1336        numpy.ndarray.nbytes
1337        """
1338        return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep)
1339
1340    def isna(self):
1341        """
1342        Detect missing values
1343
1344        Missing values (-1 in .codes) are detected.
1345
1346        Returns
1347        -------
1348        a boolean array of whether my values are null
1349
1350        See Also
1351        --------
1352        isna : Top-level isna.
1353        isnull : Alias of isna.
1354        Categorical.notna : Boolean inverse of Categorical.isna.
1355
1356        """
1357        return self._codes == -1
1358
    # ``isnull`` is an alias of ``isna``.
    isnull = isna
1360
1361    def notna(self):
1362        """
1363        Inverse of isna
1364
1365        Both missing values (-1 in .codes) and NA as a category are detected as
1366        null.
1367
1368        Returns
1369        -------
1370        a boolean array of whether my values are not null
1371
1372        See Also
1373        --------
1374        notna : Top-level notna.
1375        notnull : Alias of notna.
1376        Categorical.isna : Boolean inverse of Categorical.notna.
1377
1378        """
1379        return ~self.isna()
1380
    # ``notnull`` is an alias of ``notna``.
    notnull = notna
1382
1383    def value_counts(self, dropna=True):
1384        """
1385        Return a Series containing counts of each category.
1386
1387        Every category will have an entry, even those with a count of 0.
1388
1389        Parameters
1390        ----------
1391        dropna : bool, default True
1392            Don't include counts of NaN.
1393
1394        Returns
1395        -------
1396        counts : Series
1397
1398        See Also
1399        --------
1400        Series.value_counts
1401        """
1402        from pandas import CategoricalIndex, Series
1403
1404        code, cat = self._codes, self.categories
1405        ncat, mask = (len(cat), code >= 0)
1406        ix, clean = np.arange(ncat), mask.all()
1407
1408        if dropna or clean:
1409            obs = code if clean else code[mask]
1410            count = np.bincount(obs, minlength=ncat or 0)
1411        else:
1412            count = np.bincount(np.where(mask, code, ncat))
1413            ix = np.append(ix, -1)
1414
1415        ix = self._from_backing_data(ix)
1416
1417        return Series(count, index=CategoricalIndex(ix), dtype="int64")
1418
1419    def _internal_get_values(self):
1420        """
1421        Return the values.
1422
1423        For internal compatibility with pandas formatting.
1424
1425        Returns
1426        -------
1427        np.ndarray or Index
1428            A numpy array of the same dtype as categorical.categories.dtype or
1429            Index if datetime / periods.
1430        """
1431        # if we are a datetime and period index, return Index to keep metadata
1432        if needs_i8_conversion(self.categories.dtype):
1433            return self.categories.take(self._codes, fill_value=NaT)
1434        elif is_integer_dtype(self.categories) and -1 in self._codes:
1435            return self.categories.astype("object").take(self._codes, fill_value=np.nan)
1436        return np.array(self)
1437
1438    def check_for_ordered(self, op):
1439        """ assert that we are ordered """
1440        if not self.ordered:
1441            raise TypeError(
1442                f"Categorical is not ordered for operation {op}\n"
1443                "you can use .as_ordered() to change the "
1444                "Categorical to an ordered one\n"
1445            )
1446
    def argsort(self, ascending=True, kind="quicksort", **kwargs):
        """
        Return the indices that would sort the Categorical.

        The call is forwarded unchanged to the parent class's ``argsort``.

        .. versionchanged:: 0.25.0

           Changed to sort missing values at the end.

        Parameters
        ----------
        ascending : bool, default True
            Whether the indices should result in an ascending
            or descending sort.
        kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
            Sorting algorithm.
        **kwargs
            passed through to :func:`numpy.argsort`.

        Returns
        -------
        numpy.array

        See Also
        --------
        numpy.ndarray.argsort

        Notes
        -----
        While an ordering is applied to the category values, arg-sorting
        in this context refers more to organizing and grouping together
        based on matching category values. Thus, this function can be
        called on an unordered Categorical instance unlike the functions
        'Categorical.min' and 'Categorical.max'.

        Examples
        --------
        >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
        array([2, 0, 1, 3])

        >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
        ...                      categories=['c', 'b', 'a'],
        ...                      ordered=True)
        >>> cat.argsort()
        array([3, 0, 1, 2])

        Missing values are placed at the end

        >>> cat = pd.Categorical([2, None, 1])
        >>> cat.argsort()
        array([2, 0, 1])
        """
        return super().argsort(ascending=ascending, kind=kind, **kwargs)
1499
1500    def sort_values(
1501        self, inplace: bool = False, ascending: bool = True, na_position: str = "last"
1502    ):
1503        """
1504        Sort the Categorical by category value returning a new
1505        Categorical by default.
1506
1507        While an ordering is applied to the category values, sorting in this
1508        context refers more to organizing and grouping together based on
1509        matching category values. Thus, this function can be called on an
1510        unordered Categorical instance unlike the functions 'Categorical.min'
1511        and 'Categorical.max'.
1512
1513        Parameters
1514        ----------
1515        inplace : bool, default False
1516            Do operation in place.
1517        ascending : bool, default True
1518            Order ascending. Passing False orders descending. The
1519            ordering parameter provides the method by which the
1520            category values are organized.
1521        na_position : {'first', 'last'} (optional, default='last')
1522            'first' puts NaNs at the beginning
1523            'last' puts NaNs at the end
1524
1525        Returns
1526        -------
1527        Categorical or None
1528
1529        See Also
1530        --------
1531        Categorical.sort
1532        Series.sort_values
1533
1534        Examples
1535        --------
1536        >>> c = pd.Categorical([1, 2, 2, 1, 5])
1537        >>> c
1538        [1, 2, 2, 1, 5]
1539        Categories (3, int64): [1, 2, 5]
1540        >>> c.sort_values()
1541        [1, 1, 2, 2, 5]
1542        Categories (3, int64): [1, 2, 5]
1543        >>> c.sort_values(ascending=False)
1544        [5, 2, 2, 1, 1]
1545        Categories (3, int64): [1, 2, 5]
1546
1547        Inplace sorting can be done as well:
1548
1549        >>> c.sort_values(inplace=True)
1550        >>> c
1551        [1, 1, 2, 2, 5]
1552        Categories (3, int64): [1, 2, 5]
1553        >>>
1554        >>> c = pd.Categorical([1, 2, 2, 1, 5])
1555
1556        'sort_values' behaviour with NaNs. Note that 'na_position'
1557        is independent of the 'ascending' parameter:
1558
1559        >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
1560        >>> c
1561        [NaN, 2, 2, NaN, 5]
1562        Categories (2, int64): [2, 5]
1563        >>> c.sort_values()
1564        [2, 2, 5, NaN, NaN]
1565        Categories (2, int64): [2, 5]
1566        >>> c.sort_values(ascending=False)
1567        [5, 2, 2, NaN, NaN]
1568        Categories (2, int64): [2, 5]
1569        >>> c.sort_values(na_position='first')
1570        [NaN, NaN, 2, 2, 5]
1571        Categories (2, int64): [2, 5]
1572        >>> c.sort_values(ascending=False, na_position='first')
1573        [NaN, NaN, 5, 2, 2]
1574        Categories (2, int64): [2, 5]
1575        """
1576        inplace = validate_bool_kwarg(inplace, "inplace")
1577        if na_position not in ["last", "first"]:
1578            raise ValueError(f"invalid na_position: {repr(na_position)}")
1579
1580        sorted_idx = nargsort(self, ascending=ascending, na_position=na_position)
1581
1582        if inplace:
1583            self._codes[:] = self._codes[sorted_idx]
1584        else:
1585            codes = self._codes[sorted_idx]
1586            return self._from_backing_data(codes)
1587
1588    def _values_for_rank(self):
1589        """
1590        For correctly ranking ordered categorical data. See GH#15420
1591
1592        Ordered categorical data should be ranked on the basis of
1593        codes with -1 translated to NaN.
1594
1595        Returns
1596        -------
1597        numpy.array
1598
1599        """
1600        from pandas import Series
1601
1602        if self.ordered:
1603            values = self.codes
1604            mask = values == -1
1605            if mask.any():
1606                values = values.astype("float64")
1607                values[mask] = np.nan
1608        elif self.categories.is_numeric():
1609            values = np.array(self)
1610        else:
1611            #  reorder the categories (so rank can use the float codes)
1612            #  instead of passing an object array to rank
1613            values = np.array(
1614                self.rename_categories(Series(self.categories).rank().values)
1615            )
1616        return values
1617
1618    def view(self, dtype=None):
1619        if dtype is not None:
1620            raise NotImplementedError(dtype)
1621        return self._from_backing_data(self._ndarray)
1622
1623    def to_dense(self):
1624        """
1625        Return my 'dense' representation
1626
1627        For internal compatibility with numpy arrays.
1628
1629        Returns
1630        -------
1631        dense : array
1632        """
1633        warn(
1634            "Categorical.to_dense is deprecated and will be removed in "
1635            "a future version.  Use np.asarray(cat) instead.",
1636            FutureWarning,
1637            stacklevel=2,
1638        )
1639        return np.asarray(self)
1640
1641    def fillna(self, value=None, method=None, limit=None):
1642        """
1643        Fill NA/NaN values using the specified method.
1644
1645        Parameters
1646        ----------
1647        value : scalar, dict, Series
1648            If a scalar value is passed it is used to fill all missing values.
1649            Alternatively, a Series or dict can be used to fill in different
1650            values for each index. The value should not be a list. The
1651            value(s) passed should either be in the categories or should be
1652            NaN.
1653        method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
1654            Method to use for filling holes in reindexed Series
1655            pad / ffill: propagate last valid observation forward to next valid
1656            backfill / bfill: use NEXT valid observation to fill gap
1657        limit : int, default None
1658            (Not implemented yet for Categorical!)
1659            If method is specified, this is the maximum number of consecutive
1660            NaN values to forward/backward fill. In other words, if there is
1661            a gap with more than this number of consecutive NaNs, it will only
1662            be partially filled. If method is not specified, this is the
1663            maximum number of entries along the entire axis where NaNs will be
1664            filled.
1665
1666        Returns
1667        -------
1668        filled : Categorical with NA/NaN filled
1669        """
1670        value, method = validate_fillna_kwargs(
1671            value, method, validate_scalar_dict_value=False
1672        )
1673        value = extract_array(value, extract_numpy=True)
1674
1675        if value is None:
1676            value = np.nan
1677        if limit is not None:
1678            raise NotImplementedError(
1679                "specifying a limit for fillna has not been implemented yet"
1680            )
1681
1682        if method is not None:
1683            # pad / bfill
1684
1685            # TODO: dispatch when self.categories is EA-dtype
1686            values = np.asarray(self).reshape(-1, len(self))
1687            values = interpolate_2d(values, method, 0, None).astype(
1688                self.categories.dtype
1689            )[0]
1690            codes = _get_codes_for_values(values, self.categories)
1691
1692        else:
1693            # We copy even if there is nothing to fill
1694            codes = self._ndarray.copy()
1695            mask = self.isna()
1696
1697            new_codes = self._validate_setitem_value(value)
1698
1699            if isinstance(value, (np.ndarray, Categorical)):
1700                # We get ndarray or Categorical if called via Series.fillna,
1701                #  where it will unwrap another aligned Series before getting here
1702                codes[mask] = new_codes[mask]
1703            else:
1704                codes[mask] = new_codes
1705
1706        return self._from_backing_data(codes)
1707
1708    # ------------------------------------------------------------------
1709    # NDArrayBackedExtensionArray compat
1710
    @property
    def _ndarray(self) -> np.ndarray:
        """
        The ndarray backing this Categorical, i.e. its ``_codes``.
        """
        return self._codes
1714
    def _from_backing_data(self, arr: np.ndarray) -> "Categorical":
        """
        Wrap `arr` in a new object that reuses this Categorical's dtype.

        Parameters
        ----------
        arr : np.ndarray
            Passed to ``self._constructor`` as the values, together with
            ``dtype=self.dtype`` and ``fastpath=True``.

        Returns
        -------
        Categorical
        """
        return self._constructor(arr, dtype=self.dtype, fastpath=True)
1717
1718    def _box_func(self, i: int):
1719        if i == -1:
1720            return np.NaN
1721        return self.categories[i]
1722
1723    def _unbox_scalar(self, key) -> int:
1724        # searchsorted is very performance sensitive. By converting codes
1725        # to same dtype as self.codes, we get much faster performance.
1726        code = self.categories.get_loc(key)
1727        code = self._codes.dtype.type(code)
1728        return code
1729
1730    # ------------------------------------------------------------------
1731
    def take_nd(self, indexer, allow_fill: bool = False, fill_value=None):
        """
        Deprecated alias for ``take``.

        Emits a ``FutureWarning`` and then forwards `indexer`, `allow_fill`
        and `fill_value` unchanged to ``self.take``, returning its result.
        """
        # GH#27745 deprecate alias that other EAs dont have
        warn(
            "Categorical.take_nd is deprecated, use Categorical.take instead",
            FutureWarning,
            stacklevel=2,
        )
        return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value)
1740
    def __iter__(self):
        """
        Return an iterator over the values of this Categorical.

        The values are taken from ``self._internal_get_values().tolist()``,
        so the complete list is built before iteration starts.
        """
        return iter(self._internal_get_values().tolist())
1746
    def __contains__(self, key) -> bool:
        """
        Return True if `key` is in this Categorical.

        A `key` that is a valid NA value for the dtype of the categories is
        considered present exactly when at least one element is missing.
        Any other `key` is checked via ``contains``, using the codes as the
        container.
        """
        # if key is a NaN, check if any NaN is in self.
        if is_valid_nat_for_dtype(key, self.categories.dtype):
            return self.isna().any()

        return contains(self, key, container=self._codes)
1756
1757    # ------------------------------------------------------------------
1758    # Rendering Methods
1759
    def _formatter(self, boxed=False):
        """
        Return the per-element formatter for this array: always None.

        `boxed` is accepted but not used by this implementation.
        """
        # Defer to CategoricalFormatter's formatter.
        return None
1763
1764    def _tidy_repr(self, max_vals=10, footer=True) -> str:
1765        """
1766        a short repr displaying only max_vals and an optional (but default
1767        footer)
1768        """
1769        num = max_vals // 2
1770        head = self[:num]._get_repr(length=False, footer=False)
1771        tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
1772
1773        result = f"{head[:-1]}, ..., {tail[1:]}"
1774        if footer:
1775            result = f"{result}\n{self._repr_footer()}"
1776
1777        return str(result)
1778
1779    def _repr_categories(self):
1780        """
1781        return the base repr for the categories
1782        """
1783        max_categories = (
1784            10
1785            if get_option("display.max_categories") == 0
1786            else get_option("display.max_categories")
1787        )
1788        from pandas.io.formats import format as fmt
1789
1790        format_array = partial(
1791            fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
1792        )
1793        if len(self.categories) > max_categories:
1794            num = max_categories // 2
1795            head = format_array(self.categories[:num])
1796            tail = format_array(self.categories[-num:])
1797            category_strs = head + ["..."] + tail
1798        else:
1799            category_strs = format_array(self.categories)
1800
1801        # Strip all leading spaces, which format_array adds for columns...
1802        category_strs = [x.strip() for x in category_strs]
1803        return category_strs
1804
    def _repr_categories_info(self) -> str:
        """
        Return the ``Categories (...)`` part of the repr footer.

        The result is a header of the form ``Categories (<n>, <dtype>): ``
        followed by the bracketed strings from ``_repr_categories``. Entries
        are joined with ``" < "`` when the Categorical is ordered and with
        ``", "`` otherwise, and the listing is broken onto a new, indented
        line whenever the running column count would exceed the maximum
        width.
        """
        category_strs = self._repr_categories()
        dtype = str(self.categories.dtype)
        levheader = f"Categories ({len(self.categories)}, {dtype}): "
        # Prefer the display.width option; fall back to the terminal width
        # when that option is falsy.
        width, height = get_terminal_size()
        max_width = get_option("display.width") or width
        if console.in_ipython_frontend():
            # 0 = no breaks
            max_width = 0
        levstring = ""
        start = True
        cur_col_len = len(levheader)  # header
        sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
        linesep = sep.rstrip() + "\n"  # remove whitespace
        for val in category_strs:
            if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
                # Wrap: end the line with the right-stripped separator and
                # indent the continuation one column past the header.
                levstring += linesep + (" " * (len(levheader) + 1))
                cur_col_len = len(levheader) + 1  # header + a whitespace
            elif not start:
                levstring += sep
                # NOTE(review): only len(val) is added here; the separator
                # width and the first value placed on each line are never
                # counted, so cur_col_len underestimates the real column.
                # Confirm whether that is intended before changing it.
                cur_col_len += len(val)
            levstring += val
            start = False
        # Collapse " < ... < " around the ellipsis to " ... " to save space.
        return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"
1833
    def _repr_footer(self) -> str:
        """
        Return the repr footer: a ``Length: <n>`` line followed by the
        output of ``_repr_categories_info``.
        """
        info = self._repr_categories_info()
        return f"Length: {len(self)}\n{info}"
1837
    def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str:
        """
        Render this Categorical through ``CategoricalFormatter``.

        Parameters
        ----------
        length, na_rep, footer
            Forwarded unchanged to ``CategoricalFormatter``.

        Returns
        -------
        str
            The formatter's ``to_string()`` output.
        """
        # Imported inside the method rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from pandas.io.formats import format as fmt

        formatter = fmt.CategoricalFormatter(
            self, length=length, na_rep=na_rep, footer=footer
        )
        result = formatter.to_string()
        return str(result)
1846
1847    def __repr__(self) -> str:
1848        """
1849        String representation.
1850        """
1851        _maxlen = 10
1852        if len(self._codes) > _maxlen:
1853            result = self._tidy_repr(_maxlen)
1854        elif len(self._codes) > 0:
1855            result = self._get_repr(length=len(self) > _maxlen)
1856        else:
1857            msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
1858            result = f"[], {msg}"
1859
1860        return result
1861
1862    # ------------------------------------------------------------------
1863
    def __getitem__(self, key):
        """
        Return an item.

        Indexing is delegated to the parent class. If the result reports
        more than one dimension, its backing ndarray is returned instead and
        ``deprecate_ndim_indexing`` is called with that ndarray.
        """
        result = super().__getitem__(key)
        if getattr(result, "ndim", 0) > 1:
            # Multi-dimensional result: hand back the raw backing ndarray.
            result = result._ndarray
            deprecate_ndim_indexing(result)
        return result
1873
1874    def _validate_setitem_value(self, value):
1875        value = extract_array(value, extract_numpy=True)
1876
1877        # require identical categories set
1878        if isinstance(value, Categorical):
1879            if not is_dtype_equal(self.dtype, value.dtype):
1880                raise ValueError(
1881                    "Cannot set a Categorical with another, "
1882                    "without identical categories"
1883                )
1884            # is_dtype_equal implies categories_match_up_to_permutation
1885            value = self._encode_with_my_categories(value)
1886            return value._codes
1887
1888        # wrap scalars and hashable-listlikes in list
1889        rvalue = value if not is_hashable(value) else [value]
1890
1891        from pandas import Index
1892
1893        to_add = Index(rvalue).difference(self.categories)
1894
1895        # no assignments of values not in categories, but it's always ok to set
1896        # something to np.nan
1897        if len(to_add) and not isna(to_add).all():
1898            raise ValueError(
1899                "Cannot setitem on a Categorical with a new "
1900                "category, set the categories first"
1901            )
1902
1903        codes = self.categories.get_indexer(rvalue)
1904        return codes.astype(self._ndarray.dtype, copy=False)
1905
1906    def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]:
1907        """
1908        Compute the inverse of a categorical, returning
1909        a dict of categories -> indexers.
1910
1911        *This is an internal function*
1912
1913        Returns
1914        -------
1915        dict of categories -> indexers
1916
1917        Examples
1918        --------
1919        >>> c = pd.Categorical(list('aabca'))
1920        >>> c
1921        ['a', 'a', 'b', 'c', 'a']
1922        Categories (3, object): ['a', 'b', 'c']
1923        >>> c.categories
1924        Index(['a', 'b', 'c'], dtype='object')
1925        >>> c.codes
1926        array([0, 0, 1, 2, 0], dtype=int8)
1927        >>> c._reverse_indexer()
1928        {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
1929
1930        """
1931        categories = self.categories
1932        r, counts = libalgos.groupsort_indexer(
1933            self.codes.astype("int64"), categories.size
1934        )
1935        counts = counts.cumsum()
1936        _result = (r[start:end] for start, end in zip(counts, counts[1:]))
1937        return dict(zip(categories, _result))
1938
1939    # ------------------------------------------------------------------
1940    # Reductions
1941
1942    @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
1943    def min(self, *, skipna=True, **kwargs):
1944        """
1945        The minimum value of the object.
1946
1947        Only ordered `Categoricals` have a minimum!
1948
1949        .. versionchanged:: 1.0.0
1950
1951           Returns an NA value on empty arrays
1952
1953        Raises
1954        ------
1955        TypeError
1956            If the `Categorical` is not `ordered`.
1957
1958        Returns
1959        -------
1960        min : the minimum of this `Categorical`
1961        """
1962        nv.validate_minmax_axis(kwargs.get("axis", 0))
1963        nv.validate_min((), kwargs)
1964        self.check_for_ordered("min")
1965
1966        if not len(self._codes):
1967            return self.dtype.na_value
1968
1969        good = self._codes != -1
1970        if not good.all():
1971            if skipna and good.any():
1972                pointer = self._codes[good].min()
1973            else:
1974                return np.nan
1975        else:
1976            pointer = self._codes.min()
1977        return self._wrap_reduction_result(None, pointer)
1978
1979    @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
1980    def max(self, *, skipna=True, **kwargs):
1981        """
1982        The maximum value of the object.
1983
1984        Only ordered `Categoricals` have a maximum!
1985
1986        .. versionchanged:: 1.0.0
1987
1988           Returns an NA value on empty arrays
1989
1990        Raises
1991        ------
1992        TypeError
1993            If the `Categorical` is not `ordered`.
1994
1995        Returns
1996        -------
1997        max : the maximum of this `Categorical`
1998        """
1999        nv.validate_minmax_axis(kwargs.get("axis", 0))
2000        nv.validate_max((), kwargs)
2001        self.check_for_ordered("max")
2002
2003        if not len(self._codes):
2004            return self.dtype.na_value
2005
2006        good = self._codes != -1
2007        if not good.all():
2008            if skipna and good.any():
2009                pointer = self._codes[good].max()
2010            else:
2011                return np.nan
2012        else:
2013            pointer = self._codes.max()
2014        return self._wrap_reduction_result(None, pointer)
2015
2016    def mode(self, dropna=True):
2017        """
2018        Returns the mode(s) of the Categorical.
2019
2020        Always returns `Categorical` even if only one value.
2021
2022        Parameters
2023        ----------
2024        dropna : bool, default True
2025            Don't consider counts of NaN/NaT.
2026
2027            .. versionadded:: 0.24.0
2028
2029        Returns
2030        -------
2031        modes : `Categorical` (sorted)
2032        """
2033        codes = self._codes
2034        if dropna:
2035            good = self._codes != -1
2036            codes = self._codes[good]
2037        codes = sorted(htable.mode_int64(ensure_int64(codes), dropna))
2038        return self._from_backing_data(codes)
2039
2040    # ------------------------------------------------------------------
2041    # ExtensionArray Interface
2042
2043    def unique(self):
2044        """
2045        Return the ``Categorical`` which ``categories`` and ``codes`` are
2046        unique. Unused categories are NOT returned.
2047
2048        - unordered category: values and categories are sorted by appearance
2049          order.
2050        - ordered category: values are sorted by appearance order, categories
2051          keeps existing order.
2052
2053        Returns
2054        -------
2055        unique values : ``Categorical``
2056
2057        See Also
2058        --------
2059        pandas.unique
2060        CategoricalIndex.unique
2061        Series.unique : Return unique values of Series object.
2062
2063        Examples
2064        --------
2065        An unordered Categorical will return categories in the
2066        order of appearance.
2067
2068        >>> pd.Categorical(list("baabc")).unique()
2069        ['b', 'a', 'c']
2070        Categories (3, object): ['b', 'a', 'c']
2071
2072        >>> pd.Categorical(list("baabc"), categories=list("abc")).unique()
2073        ['b', 'a', 'c']
2074        Categories (3, object): ['b', 'a', 'c']
2075
2076        An ordered Categorical preserves the category ordering.
2077
2078        >>> pd.Categorical(
2079        ...     list("baabc"), categories=list("abc"), ordered=True
2080        ... ).unique()
2081        ['b', 'a', 'c']
2082        Categories (3, object): ['a' < 'b' < 'c']
2083        """
2084        # unlike np.unique, unique1d does not sort
2085        unique_codes = unique1d(self.codes)
2086        cat = self.copy()
2087
2088        # keep nan in codes
2089        cat._codes = unique_codes
2090
2091        # exclude nan from indexer for categories
2092        take_codes = unique_codes[unique_codes != -1]
2093        if self.ordered:
2094            take_codes = np.sort(take_codes)
2095        return cat.set_categories(cat.categories.take(take_codes))
2096
    def _values_for_factorize(self):
        """
        Return ``(self._ndarray, -1)``: the backing codes together with -1,
        the code this class uses for missing values.
        """
        return self._ndarray, -1
2099
    @classmethod
    def _from_factorized(cls, uniques, original):
        """
        Rebuild an array from factorized `uniques`.

        `uniques` is used to ``take`` from ``original.categories``; the
        result is built with ``original``'s constructor and dtype.
        """
        return original._constructor(
            original.categories.take(uniques), dtype=original.dtype
        )
2105
2106    def equals(self, other: object) -> bool:
2107        """
2108        Returns True if categorical arrays are equal.
2109
2110        Parameters
2111        ----------
2112        other : `Categorical`
2113
2114        Returns
2115        -------
2116        bool
2117        """
2118        if not isinstance(other, Categorical):
2119            return False
2120        elif self._categories_match_up_to_permutation(other):
2121            other = self._encode_with_my_categories(other)
2122            return np.array_equal(self._codes, other._codes)
2123        return False
2124
    @classmethod
    def _concat_same_type(
        cls: Type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0
    ) -> CategoricalT:
        """
        Concatenate `to_concat` by delegating to ``union_categoricals``.

        `axis` is accepted but not used by this implementation.
        """
        from pandas.core.dtypes.concat import union_categoricals

        return union_categoricals(to_concat)
2132
2133    # ------------------------------------------------------------------
2134
    def _encode_with_my_categories(self, other: "Categorical") -> "Categorical":
        """
        Re-encode another categorical using this Categorical's categories.

        Parameters
        ----------
        other : Categorical

        Returns
        -------
        Categorical
            Built from `other`'s codes recoded against ``self.categories``,
            carrying this object's dtype.

        Notes
        -----
        This assumes we have already checked
        self._categories_match_up_to_permutation(other).
        """
        # Indexing on codes is more efficient if categories are the same,
        #  so we can apply some optimizations based on the degree of
        #  dtype-matching.
        # copy=False: the codes may be passed through without copying.
        codes = recode_for_categories(
            other.codes, other.categories, self.categories, copy=False
        )
        return self._from_backing_data(codes)
2151
    def _categories_match_up_to_permutation(self, other: "Categorical") -> bool:
        """
        Return True if `other`'s dtype hashes to the same value as ours.

        The check performed is ``hash(self.dtype) == hash(other.dtype)``.
        NOTE(review): going by the method name this is meant to hold when
        both have the same categories (in any order) and the same `ordered`
        flag; that depends on how the dtype computes its hash, which is not
        visible here -- confirm against CategoricalDtype.

        Parameters
        ----------
        other : Categorical

        Returns
        -------
        bool
        """
        return hash(self.dtype) == hash(other.dtype)
2166
    def is_dtype_equal(self, other) -> bool:
        """
        Deprecated: report whether `other` has a matching categorical dtype.

        Emits a ``FutureWarning`` and returns the result of
        ``_categories_match_up_to_permutation(other)``. If that call raises
        ``AttributeError`` or ``TypeError``, False is returned instead.
        """
        warn(
            "Categorical.is_dtype_equal is deprecated and will be removed "
            "in a future version",
            FutureWarning,
            stacklevel=2,
        )
        try:
            return self._categories_match_up_to_permutation(other)
        except (AttributeError, TypeError):
            # `other` could not be compared (e.g. it has no usable dtype).
            return False
2178
2179    def describe(self):
2180        """
2181        Describes this Categorical
2182
2183        Returns
2184        -------
2185        description: `DataFrame`
2186            A dataframe with frequency and counts by category.
2187        """
2188        counts = self.value_counts(dropna=False)
2189        freqs = counts / float(counts.sum())
2190
2191        from pandas.core.reshape.concat import concat
2192
2193        result = concat([counts, freqs], axis=1)
2194        result.columns = ["counts", "freqs"]
2195        result.index.name = "categories"
2196
2197        return result
2198
2199    def isin(self, values) -> np.ndarray:
2200        """
2201        Check whether `values` are contained in Categorical.
2202
2203        Return a boolean NumPy Array showing whether each element in
2204        the Categorical matches an element in the passed sequence of
2205        `values` exactly.
2206
2207        Parameters
2208        ----------
2209        values : set or list-like
2210            The sequence of values to test. Passing in a single string will
2211            raise a ``TypeError``. Instead, turn a single string into a
2212            list of one element.
2213
2214        Returns
2215        -------
2216        isin : numpy.ndarray (bool dtype)
2217
2218        Raises
2219        ------
2220        TypeError
2221          * If `values` is not a set or list-like
2222
2223        See Also
2224        --------
2225        pandas.Series.isin : Equivalent method on Series.
2226
2227        Examples
2228        --------
2229        >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
2230        ...                'hippo'])
2231        >>> s.isin(['cow', 'lama'])
2232        array([ True,  True,  True, False,  True, False])
2233
2234        Passing a single string as ``s.isin('lama')`` will raise an error. Use
2235        a list of one element instead:
2236
2237        >>> s.isin(['lama'])
2238        array([ True, False,  True, False,  True, False])
2239        """
2240        if not is_list_like(values):
2241            values_type = type(values).__name__
2242            raise TypeError(
2243                "only list-like objects are allowed to be passed "
2244                f"to isin(), you passed a [{values_type}]"
2245            )
2246        values = sanitize_array(values, None, None)
2247        null_mask = np.asarray(isna(values))
2248        code_values = self.categories.get_indexer(values)
2249        code_values = code_values[null_mask | (code_values >= 0)]
2250        return algorithms.isin(self.codes, code_values)
2251
    def replace(self, to_replace, value, inplace: bool = False):
        """
        Replaces all instances of one value with another

        Parameters
        ----------
        to_replace : object
            The value to be replaced. If list-like, every element is
            replaced by `value`.

        value : object
            The value to replace it with

        inplace : bool
            Whether the operation is done in-place

        Returns
        -------
        None if inplace is True, otherwise the new Categorical after replacement


        Examples
        --------
        >>> s = pd.Categorical([1, 2, 1, 3])
        >>> s.replace(1, 3)
        [3, 2, 3, 3]
        Categories (2, int64): [2, 3]
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        # Work on self directly when in-place, otherwise on a copy.
        cat = self if inplace else self.copy()

        # build a dict of (to replace -> value) pairs
        if is_list_like(to_replace):
            # if to_replace is list-like and value is scalar
            replace_dict = {replace_value: value for replace_value in to_replace}
        else:
            # if both to_replace and value are scalar
            replace_dict = {to_replace: value}

        # other cases, like if both to_replace and value are list-like or if
        # to_replace is a dict, are handled separately in NDFrame
        for replace_value, new_value in replace_dict.items():
            if new_value == replace_value:
                # Replacing a value with itself is a no-op.
                continue
            # Values that are not among the categories are skipped.
            if replace_value in cat.categories:
                if isna(new_value):
                    # Replacing with NA: just drop the category.
                    cat.remove_categories(replace_value, inplace=True)
                    continue
                categories = cat.categories.tolist()
                index = categories.index(replace_value)
                if new_value in cat.categories:
                    # The target is already a category: repoint the codes of
                    # the old value to it, then drop the old category.
                    value_index = categories.index(new_value)
                    cat._codes[cat._codes == index] = value_index
                    cat.remove_categories(replace_value, inplace=True)
                else:
                    # The target is new: rename the old category in place.
                    categories[index] = new_value
                    cat.rename_categories(categories, inplace=True)
        # When in-place, fall through and return None implicitly.
        if not inplace:
            return cat
2310
2311    # ------------------------------------------------------------------------
2312    # String methods interface
2313    def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)):
2314        # Optimization to apply the callable `f` to the categories once
2315        # and rebuild the result by `take`ing from the result with the codes.
2316        # Returns the same type as the object-dtype implementation though.
2317        from pandas.core.arrays import PandasArray
2318
2319        categories = self.categories
2320        codes = self.codes
2321        result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype)
2322        return take_1d(result, codes, fill_value=na_value)
2323
    def _str_get_dummies(self, sep="|"):
        """
        Compute string dummies by converting to ``str`` and deferring to
        ``PandasArray._str_get_dummies`` with the same `sep`.
        """
        # sep may not be in categories. Just bail on this.
        from pandas.core.arrays import PandasArray

        return PandasArray(self.astype(str))._str_get_dummies(sep)
2329
2330
2331# The Series.cat accessor
2332
2333
@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Be aware that assigning to `categories` is an inplace operation, while all
    methods return new categorical data per default (but can be called with
    `inplace=True`).

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data):
        # Reject non-categorical data before storing anything.
        self._validate(data)
        # The underlying values, plus the index and name needed to wrap
        # results back into a Series.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """
        Raise AttributeError unless ``data.dtype`` is a categorical dtype.
        """
        if not is_categorical_dtype(data.dtype):
            raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """
        Read the delegated property `name` from the underlying values.
        """
        return getattr(self._parent, name)

    def _delegate_property_set(self, name, new_values):
        """
        Set the delegated property `name` on the underlying values.
        """
        return setattr(self._parent, name, new_values)

    @property
    def codes(self):
        """
        Return the codes as a Series that shares the parent's index.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call the delegated method `name` on the underlying values.

        A non-None result is wrapped in a Series carrying the parent's index
        and name; when the method returns None, so does this call.
        """
        from pandas import Series

        method = getattr(self._parent, name)
        res = method(*args, **kwargs)
        if res is not None:
            return Series(res, index=self._index, name=self._name)
2495
2496
2497# utility routines
2498
2499
def _get_codes_for_values(values, categories) -> np.ndarray:
    """
    utility routine to turn values into codes given the specified categories

    If `values` is known to be a Categorical, use recode_for_categories instead.

    Parameters
    ----------
    values : array-like
        Must expose a ``dtype``.
    categories : Index or array-like
        Must expose a ``dtype``.

    Returns
    -------
    np.ndarray
        The looked-up positions, passed through ``coerce_indexer_dtype``.
    """
    # Computed up front, before `values`/`categories` may be reassigned below.
    dtype_equal = is_dtype_equal(values.dtype, categories.dtype)

    if is_extension_array_dtype(categories.dtype) and is_object_dtype(values):
        # Support inferring the correct extension dtype from an array of
        # scalar objects. e.g.
        # Categorical(array[Period, Period], categories=PeriodIndex(...))
        cls = categories.dtype.construct_array_type()
        values = maybe_cast_to_extension_array(cls, values)
        if not isinstance(values, cls):
            # exception raised in _from_sequence
            # The cast did not produce the extension type: compare as objects.
            values = ensure_object(values)
            categories = ensure_object(categories)
    elif not dtype_equal:
        # Differing dtypes: compare both sides as objects.
        values = ensure_object(values)
        categories = ensure_object(categories)

    if isinstance(categories, ABCIndexClass):
        return coerce_indexer_dtype(categories.get_indexer_for(values), categories)

    # Only hit here when we've already coerced to object dtype.

    # Build a hash table over the categories and look each value up in it.
    hash_klass, vals = get_data_algo(values)
    _, cats = get_data_algo(categories)
    t = hash_klass(len(cats))
    t.map_locations(cats)
    return coerce_indexer_dtype(t.lookup(vals), cats)
2532
2533
def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        `codes` itself (or a copy of it, depending on `copy`) when
        `old_categories` is empty or equal to `new_categories`; otherwise
        the recoded array, with -1 used as the fill value.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # With no old categories everything is null anyway, and with identical
    # categories there is nothing to recode: the codes are retained as-is.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    # Position of every old category within the new categories.
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_1d(old_to_new, codes, fill_value=-1)
2575
2576
def factorize_from_iterable(values):
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    *This is an internal function*

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a Categorical with the dtype of `values`, holding each of its
        categories exactly once, in order.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        as_cat = Categorical(values, ordered=False)
        return as_cat.codes, as_cat.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories
    # as values but its codes are by def [0, ..., n_categories - 1]
    n_categories = len(values.categories)
    identity_codes = np.arange(n_categories, dtype=values.codes.dtype)
    categories = Categorical.from_codes(identity_codes, dtype=values.dtype)
    return values.codes, categories
2613
2614
def factorize_from_iterables(iterables):
    """
    A higher-level wrapper over `factorize_from_iterable`.

    *This is an internal function*

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    list of two lists
        ``[codes_list, categories_list]``, where ``codes_list`` holds the
        codes and ``categories_list`` the categories produced by
        `factorize_from_iterable` for each input, in order. The result can
        be unpacked as ``codes_list, categories_list = ...``.

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return a list of 2 lists.
        return [[], []]

    # Transpose the per-input (codes, categories) pairs. The result is
    # materialised as a concrete list of two lists so that it has the same
    # type as the empty case above, rather than being a one-shot lazy
    # ``map`` iterator.
    codes, categories = zip(*(factorize_from_iterable(it) for it in iterables))
    return [list(codes), list(categories)]
2638