1from typing import Any, List, Optional
2import warnings
3
4import numpy as np
5
6from pandas._config import get_option
7
8from pandas._libs import index as libindex
9from pandas._libs.lib import no_default
10from pandas._typing import ArrayLike, Label
11from pandas.util._decorators import Appender, cache_readonly, doc
12
13from pandas.core.dtypes.common import (
14    ensure_platform_int,
15    is_categorical_dtype,
16    is_scalar,
17)
18from pandas.core.dtypes.dtypes import CategoricalDtype
19from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna
20
21from pandas.core import accessor
22from pandas.core.arrays.categorical import Categorical, contains
23from pandas.core.construction import extract_array
24import pandas.core.indexes.base as ibase
25from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name
26from pandas.core.indexes.extension import NDArrayBackedExtensionIndex, inherit_names
27import pandas.core.missing as missing
28
# Start from a copy of the base Index docstring substitutions and override
# the target class name for this module.
_index_doc_kwargs = {**ibase._index_doc_kwargs, "target_klass": "CategoricalIndex"}
31
32
33@inherit_names(
34    [
35        "argsort",
36        "_internal_get_values",
37        "tolist",
38        "codes",
39        "categories",
40        "ordered",
41        "_reverse_indexer",
42        "searchsorted",
43        "is_dtype_equal",
44        "min",
45        "max",
46    ],
47    Categorical,
48)
49@accessor.delegate_names(
50    delegate=Categorical,
51    accessors=[
52        "rename_categories",
53        "reorder_categories",
54        "add_categories",
55        "remove_categories",
56        "remove_unused_categories",
57        "set_categories",
58        "as_ordered",
59        "as_unordered",
60    ],
61    typ="method",
62    overwrite=True,
63)
64class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate):
65    """
66    Index based on an underlying :class:`Categorical`.
67
68    CategoricalIndex, like Categorical, can only take on a limited,
69    and usually fixed, number of possible values (`categories`). Also,
70    like Categorical, it might have an order, but numerical operations
71    (additions, divisions, ...) are not possible.
72
73    Parameters
74    ----------
75    data : array-like (1-dimensional)
76        The values of the categorical. If `categories` are given, values not in
77        `categories` will be replaced with NaN.
78    categories : index-like, optional
79        The categories for the categorical. Items need to be unique.
80        If the categories are not given here (and also not in `dtype`), they
81        will be inferred from the `data`.
82    ordered : bool, optional
83        Whether or not this categorical is treated as an ordered
84        categorical. If not given here or in `dtype`, the resulting
85        categorical will be unordered.
86    dtype : CategoricalDtype or "category", optional
87        If :class:`CategoricalDtype`, cannot be used together with
88        `categories` or `ordered`.
89    copy : bool, default False
90        Make a copy of input ndarray.
91    name : object, optional
92        Name to be stored in the index.
93
94    Attributes
95    ----------
96    codes
97    categories
98    ordered
99
100    Methods
101    -------
102    rename_categories
103    reorder_categories
104    add_categories
105    remove_categories
106    remove_unused_categories
107    set_categories
108    as_ordered
109    as_unordered
110    map
111
112    Raises
113    ------
114    ValueError
115        If the categories do not validate.
116    TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        values in `data` are not sortable.
119
120    See Also
121    --------
122    Index : The base pandas Index type.
123    Categorical : A categorical array.
124    CategoricalDtype : Type for categorical data.
125
126    Notes
127    -----
128    See the `user guide
129    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`_
130    for more.
131
132    Examples
133    --------
134    >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
135    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
136                     categories=['a', 'b', 'c'], ordered=False, dtype='category')
137
138    ``CategoricalIndex`` can also be instantiated from a ``Categorical``:
139
140    >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
141    >>> pd.CategoricalIndex(c)
142    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
143                     categories=['a', 'b', 'c'], ordered=False, dtype='category')
144
145    Ordered ``CategoricalIndex`` can have a min and max value.
146
147    >>> ci = pd.CategoricalIndex(
148    ...     ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
149    ... )
150    >>> ci
151    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
152                     categories=['c', 'b', 'a'], ordered=True, dtype='category')
153    >>> ci.min()
154    'c'
155    """
156
157    _typ = "categoricalindex"
158
159    @property
160    def _can_hold_strings(self):
161        return self.categories._can_hold_strings
162
163    codes: np.ndarray
164    categories: Index
165    _data: Categorical
166    _values: Categorical
167
168    @property
169    def _engine_type(self):
170        # self.codes can have dtype int8, int16, int32 or int64, so we need
171        # to return the corresponding engine type (libindex.Int8Engine, etc.).
172        return {
173            np.int8: libindex.Int8Engine,
174            np.int16: libindex.Int16Engine,
175            np.int32: libindex.Int32Engine,
176            np.int64: libindex.Int64Engine,
177        }[self.codes.dtype.type]
178
179    _attributes = ["name"]
180
181    # --------------------------------------------------------------------
182    # Constructors
183
184    def __new__(
185        cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
186    ):
187
188        dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype)
189
190        name = maybe_extract_name(name, data, cls)
191
192        if not is_categorical_dtype(data):
193            # don't allow scalars
194            # if data is None, then categories must be provided
195            if is_scalar(data):
196                if data is not None or categories is None:
197                    raise cls._scalar_data_error(data)
198                data = []
199
200        assert isinstance(dtype, CategoricalDtype), dtype
201        data = extract_array(data, extract_numpy=True)
202
203        if not isinstance(data, Categorical):
204            data = Categorical(data, dtype=dtype)
205        elif isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
206            # we want to silently ignore dtype='category'
207            data = data._set_dtype(dtype)
208
209        data = data.copy() if copy else data
210
211        return cls._simple_new(data, name=name)
212
213    @classmethod
214    def _simple_new(cls, values: Categorical, name: Label = None):
215        assert isinstance(values, Categorical), type(values)
216        result = object.__new__(cls)
217
218        result._data = values
219        result.name = name
220        result._cache = {}
221
222        result._reset_identity()
223        return result
224
225    # --------------------------------------------------------------------
226
227    # error: Argument 1 of "_shallow_copy" is incompatible with supertype
228    #  "ExtensionIndex"; supertype defines the argument type as
229    #  "Optional[ExtensionArray]"  [override]
230    @doc(Index._shallow_copy)
231    def _shallow_copy(  # type:ignore[override]
232        self,
233        values: Optional[Categorical] = None,
234        name: Label = no_default,
235    ):
236        name = self.name if name is no_default else name
237
238        if values is not None:
239            # In tests we only get here with Categorical objects that
240            #  have matching .ordered, and values.categories a subset of
241            #  our own.  However we do _not_ have a dtype match in general.
242            values = Categorical(values, dtype=self.dtype)
243
244        return super()._shallow_copy(values=values, name=name)
245
246    def _is_dtype_compat(self, other) -> Categorical:
247        """
248        *this is an internal non-public method*
249
250        provide a comparison between the dtype of self and other (coercing if
251        needed)
252
253        Parameters
254        ----------
255        other : Index
256
257        Returns
258        -------
259        Categorical
260
261        Raises
262        ------
263        TypeError if the dtypes are not compatible
264        """
265        if is_categorical_dtype(other):
266            other = extract_array(other)
267            if not other._categories_match_up_to_permutation(self):
268                raise TypeError(
269                    "categories must match existing categories when appending"
270                )
271        else:
272            values = other
273
274            cat = Categorical(other, dtype=self.dtype)
275            other = CategoricalIndex(cat)
276            if not other.isin(values).all():
277                raise TypeError(
278                    "cannot append a non-category item to a CategoricalIndex"
279                )
280            other = other._values
281
282            if not ((other == values) | (isna(other) & isna(values))).all():
283                # GH#37667 see test_equals_non_category
284                raise TypeError(
285                    "categories must match existing categories when appending"
286                )
287
288        return other
289
290    def equals(self, other: object) -> bool:
291        """
292        Determine if two CategoricalIndex objects contain the same elements.
293
294        Returns
295        -------
296        bool
297            If two CategoricalIndex objects have equal elements True,
298            otherwise False.
299        """
300        if self.is_(other):
301            return True
302
303        if not isinstance(other, Index):
304            return False
305
306        try:
307            other = self._is_dtype_compat(other)
308        except (TypeError, ValueError):
309            return False
310
311        return self._data.equals(other)
312
313    # --------------------------------------------------------------------
314    # Rendering Methods
315
316    @property
317    def _formatter_func(self):
318        return self.categories._formatter_func
319
320    def _format_attrs(self):
321        """
322        Return a list of tuples of the (attr,formatted_value)
323        """
324        max_categories = (
325            10
326            if get_option("display.max_categories") == 0
327            else get_option("display.max_categories")
328        )
329        attrs = [
330            (
331                "categories",
332                ibase.default_pprint(self.categories, max_seq_items=max_categories),
333            ),
334            # pandas\core\indexes\category.py:315: error: "CategoricalIndex"
335            # has no attribute "ordered"  [attr-defined]
336            ("ordered", self.ordered),  # type: ignore[attr-defined]
337        ]
338        if self.name is not None:
339            attrs.append(("name", ibase.default_pprint(self.name)))
340        attrs.append(("dtype", f"'{self.dtype.name}'"))
341        max_seq_items = get_option("display.max_seq_items") or len(self)
342        if len(self) > max_seq_items:
343            attrs.append(("length", len(self)))
344        return attrs
345
346    def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]:
347        from pandas.io.formats.printing import pprint_thing
348
349        result = [
350            pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
351            for x in self._values
352        ]
353        return header + result
354
355    # --------------------------------------------------------------------
356
357    @property
    def inferred_type(self) -> str:
        """Return the constant string ``"categorical"``."""
        return "categorical"
360
361    @property
    def values(self):
        """Return the underlying data (``self._data``), which is a Categorical."""
        return self._data
365
366    @doc(Index.__contains__)
367    def __contains__(self, key: Any) -> bool:
368        # if key is a NaN, check if any NaN is in self.
369        if is_valid_nat_for_dtype(key, self.categories.dtype):
370            return self.hasnans
371
372        return contains(self, key, container=self._engine)
373
374    @doc(Index.astype)
375    def astype(self, dtype, copy=True):
376        res_data = self._data.astype(dtype, copy=copy)
377        return Index(res_data, name=self.name)
378
379    @doc(Index.fillna)
380    def fillna(self, value, downcast=None):
381        value = self._require_scalar(value)
382        cat = self._data.fillna(value)
383        return type(self)._simple_new(cat, name=self.name)
384
385    @cache_readonly
386    def _engine(self):
387        # we are going to look things up with the codes themselves.
388        # To avoid a reference cycle, bind `codes` to a local variable, so
389        # `self` is not passed into the lambda.
390        codes = self.codes
391        return self._engine_type(lambda: codes, len(self))
392
393    @doc(Index.unique)
394    def unique(self, level=None):
395        if level is not None:
396            self._validate_index_level(level)
397        result = self._values.unique()
398        # Use _simple_new instead of _shallow_copy to ensure we keep dtype
399        #  of result, not self.
400        return type(self)._simple_new(result, name=self.name)
401
    def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
        """
        Create index with target's values (move/add/delete values as necessary)

        Parameters
        ----------
        target : iterable
            Labels to conform to; converted with ``ensure_index``.
        method, level, limit : None
            Not implemented for CategoricalIndex; must be left as None.
        tolerance : optional
            Accepted in the signature but not read by this method.

        Returns
        -------
        new_index : pd.Index
            Resulting index. A CategoricalIndex with ``target``'s dtype when
            ``target`` is categorical, otherwise an Index built from the
            resulting values.
        indexer : np.ndarray or None
            Indices of output values in original index. None when ``target``
            equals self.

        Raises
        ------
        NotImplementedError
            If any of ``method``, ``level`` or ``limit`` is not None.
        """
        if method is not None:
            raise NotImplementedError(
                "argument method is not implemented for CategoricalIndex.reindex"
            )
        if level is not None:
            raise NotImplementedError(
                "argument level is not implemented for CategoricalIndex.reindex"
            )
        if limit is not None:
            raise NotImplementedError(
                "argument limit is not implemented for CategoricalIndex.reindex"
            )

        target = ibase.ensure_index(target)

        # NB: within this method the local ``missing`` shadows the module-level
        # ``pandas.core.missing`` import of the same name.
        missing: List[int]
        if self.equals(target):
            # nothing to look up: no indexer and nothing missing
            indexer = None
            missing = []
        else:
            indexer, missing = self.get_indexer_non_unique(np.array(target))

        # Take from self only when there is something to take from; otherwise
        # the (possibly non-categorical) target is used as-is.
        if len(self.codes) and indexer is not None:
            new_target = self.take(indexer)
        else:
            new_target = target

        # filling in missing if needed
        if len(missing):
            # position of every target label within our categories (-1 if absent)
            cats = self.categories.get_indexer(target)

            if (cats == -1).any():
                # coerce to a regular index here!
                result = Index(np.array(self), name=self.name)
                new_target, indexer, _ = result._reindex_non_unique(np.array(target))
            else:

                # All target labels are known categories: overwrite the codes at
                # the not-found positions with the codes of the missing labels.
                # NOTE(review): when self is empty, new_target is ``target``
                # itself, which only has ``.codes`` if it is categorical --
                # confirm a plain Index target cannot reach this branch.
                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                cat = self._data._from_backing_data(codes)
                new_target = type(self)._simple_new(cat, name=self.name)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        new_target = np.asarray(new_target)
        if is_categorical_dtype(target):
            new_target = Categorical(new_target, dtype=target.dtype)
            new_target = type(self)._simple_new(new_target, name=self.name)
        else:
            new_target = Index(new_target, name=self.name)

        return new_target, indexer
469
470    def _reindex_non_unique(self, target):
471        """
472        reindex from a non-unique; which CategoricalIndex's are almost
473        always
474        """
475        new_target, indexer = self.reindex(target)
476        new_indexer = None
477
478        check = indexer == -1
479        if check.any():
480            new_indexer = np.arange(len(self.take(indexer)))
481            new_indexer[check] = -1
482
483        cats = self.categories.get_indexer(target)
484        if not (cats == -1).any():
485            # .reindex returns normal Index. Revert to CategoricalIndex if
486            # all targets are included in my categories
487            new_target = Categorical(new_target, dtype=self.dtype)
488            new_target = type(self)._simple_new(new_target, name=self.name)
489
490        return new_target, indexer, new_indexer
491
492    # --------------------------------------------------------------------
493    # Indexing Methods
494
495    def _maybe_cast_indexer(self, key) -> int:
496        return self._data._unbox_scalar(key)
497
498    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
499    def get_indexer(self, target, method=None, limit=None, tolerance=None):
500        method = missing.clean_reindex_fill_method(method)
501        target = ibase.ensure_index(target)
502
503        self._check_indexing_method(method)
504
505        if self.is_unique and self.equals(target):
506            return np.arange(len(self), dtype="intp")
507
508        return self._get_indexer_non_unique(target._values)[0]
509
510    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
511    def get_indexer_non_unique(self, target):
512        target = ibase.ensure_index(target)
513        return self._get_indexer_non_unique(target._values)
514
515    def _get_indexer_non_unique(self, values: ArrayLike):
516        """
517        get_indexer_non_unique but after unrapping the target Index object.
518        """
519        # Note: we use engine.get_indexer_non_unique for get_indexer in addition
520        #  to get_indexer_non_unique because, even if `target` is unique, any
521        #  non-category entries in it will be encoded as -1  so `codes` may
522        #  not be unique.
523
524        if isinstance(values, Categorical):
525            # Indexing on codes is more efficient if categories are the same,
526            #  so we can apply some optimizations based on the degree of
527            #  dtype-matching.
528            cat = self._data._encode_with_my_categories(values)
529            codes = cat._codes
530        else:
531            codes = self.categories.get_indexer(values)
532
533        indexer, missing = self._engine.get_indexer_non_unique(codes)
534        return ensure_platform_int(indexer), missing
535
536    @doc(Index._convert_list_indexer)
537    def _convert_list_indexer(self, keyarr):
538        # Return our indexer or raise if all of the values are not included in
539        # the categories
540
541        if self.categories._defer_to_indexing:
542            # See tests.indexing.interval.test_interval:test_loc_getitem_frame
543            indexer = self.categories._convert_list_indexer(keyarr)
544            return Index(self.codes).get_indexer_for(indexer)
545
546        return self.get_indexer_for(keyarr)
547
548    @doc(Index._maybe_cast_slice_bound)
549    def _maybe_cast_slice_bound(self, label, side: str, kind):
550        if kind == "loc":
551            return label
552
553        return super()._maybe_cast_slice_bound(label, side, kind)
554
555    # --------------------------------------------------------------------
556
557    def _is_comparable_dtype(self, dtype):
558        return self.categories._is_comparable_dtype(dtype)
559
560    def take_nd(self, *args, **kwargs):
561        """Alias for `take`"""
562        warnings.warn(
563            "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead",
564            FutureWarning,
565            stacklevel=2,
566        )
567        return self.take(*args, **kwargs)
568
569    def map(self, mapper):
570        """
571        Map values using input correspondence (a dict, Series, or function).
572
573        Maps the values (their categories, not the codes) of the index to new
574        categories. If the mapping correspondence is one-to-one the result is a
575        :class:`~pandas.CategoricalIndex` which has the same order property as
576        the original, otherwise an :class:`~pandas.Index` is returned.
577
578        If a `dict` or :class:`~pandas.Series` is used any unmapped category is
579        mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
580        will be returned.
581
582        Parameters
583        ----------
584        mapper : function, dict, or Series
585            Mapping correspondence.
586
587        Returns
588        -------
589        pandas.CategoricalIndex or pandas.Index
590            Mapped index.
591
592        See Also
593        --------
594        Index.map : Apply a mapping correspondence on an
595            :class:`~pandas.Index`.
596        Series.map : Apply a mapping correspondence on a
597            :class:`~pandas.Series`.
598        Series.apply : Apply more complex functions on a
599            :class:`~pandas.Series`.
600
601        Examples
602        --------
603        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
604        >>> idx
605        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
606                          ordered=False, dtype='category')
607        >>> idx.map(lambda x: x.upper())
608        CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
609                         ordered=False, dtype='category')
610        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
611        CategoricalIndex(['first', 'second', 'third'], categories=['first',
612                         'second', 'third'], ordered=False, dtype='category')
613
614        If the mapping is one-to-one the ordering of the categories is
615        preserved:
616
617        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
618        >>> idx
619        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
620                         ordered=True, dtype='category')
621        >>> idx.map({'a': 3, 'b': 2, 'c': 1})
622        CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
623                         dtype='category')
624
625        If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
626
627        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
628        Index(['first', 'second', 'first'], dtype='object')
629
630        If a `dict` is used, all unmapped categories are mapped to `NaN` and
631        the result is an :class:`~pandas.Index`:
632
633        >>> idx.map({'a': 'first', 'b': 'second'})
634        Index(['first', 'second', nan], dtype='object')
635        """
636        mapped = self._values.map(mapper)
637        return Index(mapped, name=self.name)
638
639    def _concat(self, to_concat: List["Index"], name: Label) -> Index:
640        # if calling index is category, don't check dtype of others
641        try:
642            codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
643        except TypeError:
644            # not all to_concat elements are among our categories (or NA)
645            from pandas.core.dtypes.concat import concat_compat
646
647            res = concat_compat(to_concat)
648            return Index(res, name=name)
649        else:
650            cat = self._data._from_backing_data(codes)
651            return type(self)._simple_new(cat, name=name)
652
653    def _delegate_method(self, name: str, *args, **kwargs):
654        """ method delegation to the ._values """
655        method = getattr(self._values, name)
656        if "inplace" in kwargs:
657            raise ValueError("cannot use inplace with CategoricalIndex")
658        res = method(*args, **kwargs)
659        if is_scalar(res):
660            return res
661        return CategoricalIndex(res, name=self.name)
662