from typing import Any, List, Optional
import warnings

import numpy as np

from pandas._config import get_option

from pandas._libs import index as libindex
from pandas._libs.lib import no_default
from pandas._typing import ArrayLike, Label
from pandas.util._decorators import Appender, cache_readonly, doc

from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_categorical_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna

from pandas.core import accessor
from pandas.core.arrays.categorical import Categorical, contains
from pandas.core.construction import extract_array
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name
from pandas.core.indexes.extension import NDArrayBackedExtensionIndex, inherit_names
import pandas.core.missing as missing

_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update({"target_klass": "CategoricalIndex"})


@inherit_names(
    [
        "argsort",
        "_internal_get_values",
        "tolist",
        "codes",
        "categories",
        "ordered",
        "_reverse_indexer",
        "searchsorted",
        "is_dtype_equal",
        "min",
        "max",
    ],
    Categorical,
)
@accessor.delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
    overwrite=True,
)
class CategoricalIndex(NDArrayBackedExtensionIndex, accessor.PandasDelegate):
    """
    Index based on an underlying :class:`Categorical`.

    CategoricalIndex, like Categorical, can only take on a limited,
    and usually fixed, number of possible values (`categories`). Also,
    like Categorical, it might have an order, but numerical operations
    (additions, divisions, ...) are not possible.

    Parameters
    ----------
    data : array-like (1-dimensional)
        The values of the categorical. If `categories` are given, values not in
        `categories` will be replaced with NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here (and also not in `dtype`), they
        will be inferred from the `data`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.
    copy : bool, default False
        Make a copy of input ndarray.
    name : object, optional
        Name to be stored in the index.

    Attributes
    ----------
    codes
    categories
    ordered

    Methods
    -------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories
    as_ordered
    as_unordered
    map

    Raises
    ------
    ValueError
        If the categories do not validate.
    TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        `values` are not sortable.

    See Also
    --------
    Index : The base pandas Index type.
    Categorical : A categorical array.
    CategoricalDtype : Type for categorical data.

    Notes
    -----
    See the `user guide
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`_
    for more.

    Examples
    --------
    >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')

    ``CategoricalIndex`` can also be instantiated from a ``Categorical``:

    >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
    >>> pd.CategoricalIndex(c)
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')

    Ordered ``CategoricalIndex`` can have a min and max value.

    >>> ci = pd.CategoricalIndex(
    ...     ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
    ... )
    >>> ci
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['c', 'b', 'a'], ordered=True, dtype='category')
    >>> ci.min()
    'c'
    """

    _typ = "categoricalindex"

    @property
    def _can_hold_strings(self):
        """Defer to the categories index for whether strings can be held."""
        return self.categories._can_hold_strings

    codes: np.ndarray
    categories: Index
    _data: Categorical
    _values: Categorical

    @property
    def _engine_type(self):
        """Return the libindex engine class matching the dtype of ``codes``."""
        # self.codes can have dtype int8, int16, int32 or int64, so we need
        # to return the corresponding engine type (libindex.Int8Engine, etc.).
        return {
            np.int8: libindex.Int8Engine,
            np.int16: libindex.Int16Engine,
            np.int32: libindex.Int32Engine,
            np.int64: libindex.Int64Engine,
        }[self.codes.dtype.type]

    _attributes = ["name"]

    # --------------------------------------------------------------------
    # Constructors

    def __new__(
        cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
    ):
        # Parameters are described in the class docstring.

        dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype)

        name = maybe_extract_name(name, data, cls)

        if not is_categorical_dtype(data):
            # don't allow scalars
            # if data is None, then categories must be provided
            if is_scalar(data):
                if data is not None or categories is None:
                    raise cls._scalar_data_error(data)
                data = []

        assert isinstance(dtype, CategoricalDtype), dtype
        data = extract_array(data, extract_numpy=True)

        if not isinstance(data, Categorical):
            data = Categorical(data, dtype=dtype)
        elif isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)

        data = data.copy() if copy else data

        return cls._simple_new(data, name=name)

    @classmethod
    def _simple_new(cls, values: Categorical, name: Label = None):
        """
        Build an instance directly around an existing Categorical.

        ``values`` is stored as-is (no copy, no dtype coercion); the cache is
        initialised empty and the identity is reset.
        """
        assert isinstance(values, Categorical), type(values)
        result = object.__new__(cls)

        result._data = values
        result.name = name
        result._cache = {}

        result._reset_identity()
        return result

    # --------------------------------------------------------------------

    # error: Argument 1 of "_shallow_copy" is incompatible with supertype
    #  "ExtensionIndex"; supertype defines the argument type as
    #  "Optional[ExtensionArray]"  [override]
    @doc(Index._shallow_copy)
    def _shallow_copy(  # type:ignore[override]
        self,
        values: Optional[Categorical] = None,
        name: Label = no_default,
    ):
        name = self.name if name is no_default else name

        if values is not None:
            # In tests we only get here with Categorical objects that
            #  have matching .ordered, and values.categories a subset of
            #  our own.  However we do _not_ have a dtype match in general.
            values = Categorical(values, dtype=self.dtype)

        return super()._shallow_copy(values=values, name=name)

    def _is_dtype_compat(self, other) -> Categorical:
        """
        *this is an internal non-public method*

        provide a comparison between the dtype of self and other (coercing if
        needed)

        Parameters
        ----------
        other : Index

        Returns
        -------
        Categorical

        Raises
        ------
        TypeError if the dtypes are not compatible
        """
        if is_categorical_dtype(other):
            other = extract_array(other)
            if not other._categories_match_up_to_permutation(self):
                raise TypeError(
                    "categories must match existing categories when appending"
                )
        else:
            # Keep the un-coerced input so the coerced result can be compared
            # against it below.
            values = other

            cat = Categorical(other, dtype=self.dtype)
            other = CategoricalIndex(cat)
            if not other.isin(values).all():
                raise TypeError(
                    "cannot append a non-category item to a CategoricalIndex"
                )
            other = other._values

            if not ((other == values) | (isna(other) & isna(values))).all():
                # GH#37667 see test_equals_non_category
                raise TypeError(
                    "categories must match existing categories when appending"
                )

        return other

    def equals(self, other: object) -> bool:
        """
        Determine if two CategoricalIndex objects contain the same elements.

        Returns
        -------
        bool
            If two CategoricalIndex objects have equal elements True,
            otherwise False.
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        try:
            other = self._is_dtype_compat(other)
        except (TypeError, ValueError):
            # Incompatible dtype/categories means "not equal", not an error.
            return False

        return self._data.equals(other)

    # --------------------------------------------------------------------
    # Rendering Methods

    @property
    def _formatter_func(self):
        """Use the formatter of the categories index."""
        return self.categories._formatter_func

    def _format_attrs(self):
        """
        Return a list of tuples of the (attr,formatted_value)
        """
        # An option value of 0 falls back to showing 10 categories.
        max_categories = (
            10
            if get_option("display.max_categories") == 0
            else get_option("display.max_categories")
        )
        attrs = [
            (
                "categories",
                ibase.default_pprint(self.categories, max_seq_items=max_categories),
            ),
            # pandas\core\indexes\category.py:315: error: "CategoricalIndex"
            # has no attribute "ordered"  [attr-defined]
            ("ordered", self.ordered),  # type: ignore[attr-defined]
        ]
        if self.name is not None:
            attrs.append(("name", ibase.default_pprint(self.name)))
        attrs.append(("dtype", f"'{self.dtype.name}'"))
        max_seq_items = get_option("display.max_seq_items") or len(self)
        if len(self) > max_seq_items:
            attrs.append(("length", len(self)))
        return attrs

    def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]:
        """
        Return ``header`` followed by one rendered string per element.

        Missing values are rendered as ``na_rep``; tab, carriage-return and
        newline characters in other values are escaped.
        """
        from pandas.io.formats.printing import pprint_thing

        result = [
            pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
            for x in self._values
        ]
        return header + result

    # --------------------------------------------------------------------

    @property
    def inferred_type(self) -> str:
        """Always the literal string ``"categorical"``."""
        return "categorical"

    @property
    def values(self):
        """ return the underlying data, which is a Categorical """
        return self._data

    @doc(Index.__contains__)
    def __contains__(self, key: Any) -> bool:
        # if key is a NaN, check if any NaN is in self.
        if is_valid_nat_for_dtype(key, self.categories.dtype):
            return self.hasnans

        return contains(self, key, container=self._engine)

    @doc(Index.astype)
    def astype(self, dtype, copy=True):
        # Cast the backing Categorical and let Index pick the result class.
        res_data = self._data.astype(dtype, copy=copy)
        return Index(res_data, name=self.name)

    @doc(Index.fillna)
    def fillna(self, value, downcast=None):
        # `downcast` is accepted for signature compatibility; it is not read.
        value = self._require_scalar(value)
        cat = self._data.fillna(value)
        return type(self)._simple_new(cat, name=self.name)

    @cache_readonly
    def _engine(self):
        """Lookup engine built over ``codes`` (cached)."""
        # we are going to look things up with the codes themselves.
        # To avoid a reference cycle, bind `codes` to a local variable, so
        # `self` is not passed into the lambda.
        codes = self.codes
        return self._engine_type(lambda: codes, len(self))

    @doc(Index.unique)
    def unique(self, level=None):
        if level is not None:
            self._validate_index_level(level)
        result = self._values.unique()
        # Use _simple_new instead of _shallow_copy to ensure we keep dtype
        #  of result, not self.
        return type(self)._simple_new(result, name=self.name)

    def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
        """
        Create index with target's values (move/add/delete values as necessary)

        Parameters
        ----------
        target : array-like
            Labels to conform to.
        method, level, limit : None
            Not implemented; any non-None value raises.
        tolerance : optional
            Accepted for signature compatibility; not used by this
            implementation.

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray or None
            Indices of output values in original index

        Raises
        ------
        NotImplementedError
            If `method`, `level` or `limit` is not None.
        """
        if method is not None:
            raise NotImplementedError(
                "argument method is not implemented for CategoricalIndex.reindex"
            )
        if level is not None:
            raise NotImplementedError(
                "argument level is not implemented for CategoricalIndex.reindex"
            )
        if limit is not None:
            raise NotImplementedError(
                "argument limit is not implemented for CategoricalIndex.reindex"
            )

        target = ibase.ensure_index(target)

        # NB: this local shadows the module-level `missing` import inside
        # this method only.
        missing: List[int]
        if self.equals(target):
            indexer = None
            missing = []
        else:
            indexer, missing = self.get_indexer_non_unique(np.array(target))

        if len(self.codes) and indexer is not None:
            new_target = self.take(indexer)
        else:
            new_target = target

        # filling in missing if needed
        if len(missing):
            cats = self.categories.get_indexer(target)

            if (cats == -1).any():
                # coerce to a regular index here!
                result = Index(np.array(self), name=self.name)
                new_target, indexer, _ = result._reindex_non_unique(np.array(target))
            else:
                # NOTE(review): when `self` is empty, `new_target` is the
                # caller's `target`, which may be a plain Index without
                # `.codes` -- confirm whether that case needs handling.
                codes = new_target.codes.copy()
                codes[indexer == -1] = cats[missing]
                cat = self._data._from_backing_data(codes)
                new_target = type(self)._simple_new(cat, name=self.name)

        # we always want to return an Index type here
        # to be consistent with .reindex for other index types (e.g. they don't
        # coerce based on the actual values, only on the dtype)
        # unless we had an initial Categorical to begin with
        # in which case we are going to conform to the passed Categorical
        new_target = np.asarray(new_target)
        if is_categorical_dtype(target):
            new_target = Categorical(new_target, dtype=target.dtype)
            new_target = type(self)._simple_new(new_target, name=self.name)
        else:
            new_target = Index(new_target, name=self.name)

        return new_target, indexer

    def _reindex_non_unique(self, target):
        """
        reindex from a non-unique; which CategoricalIndex's are almost
        always

        Returns
        -------
        new_target : Index
            A CategoricalIndex when every target label is one of our
            categories, otherwise whatever :meth:`reindex` returned.
        indexer : np.ndarray or None
            As returned by :meth:`reindex`.
        new_indexer : np.ndarray or None
            ``arange`` over the taken positions with -1 where `indexer` is
            -1; None when `indexer` is None or contains no -1.
        """
        new_target, indexer = self.reindex(target)
        new_indexer = None

        # `reindex` returns indexer=None when `target` equals `self`; there
        # is then nothing to scan for -1 (and `None == -1` would be a plain
        # bool without `.any()`).
        if indexer is not None:
            check = indexer == -1
            if check.any():
                new_indexer = np.arange(len(self.take(indexer)))
                new_indexer[check] = -1

        cats = self.categories.get_indexer(target)
        if not (cats == -1).any():
            # .reindex returns normal Index. Revert to CategoricalIndex if
            # all targets are included in my categories
            new_target = Categorical(new_target, dtype=self.dtype)
            new_target = type(self)._simple_new(new_target, name=self.name)

        return new_target, indexer, new_indexer

    # --------------------------------------------------------------------
    # Indexing Methods

    def _maybe_cast_indexer(self, key) -> int:
        """Convert a scalar label via the backing Categorical's ``_unbox_scalar``."""
        return self._data._unbox_scalar(key)

    @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs)
    def get_indexer(self, target, method=None, limit=None, tolerance=None):
        # `missing` here is the module-level pandas.core.missing import.
        method = missing.clean_reindex_fill_method(method)
        target = ibase.ensure_index(target)

        self._check_indexing_method(method)

        if self.is_unique and self.equals(target):
            return np.arange(len(self), dtype="intp")

        return self._get_indexer_non_unique(target._values)[0]

    @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs)
    def get_indexer_non_unique(self, target):
        target = ibase.ensure_index(target)
        return self._get_indexer_non_unique(target._values)

    def _get_indexer_non_unique(self, values: ArrayLike):
        """
        get_indexer_non_unique but after unwrapping the target Index object.
        """
        # Note: we use engine.get_indexer_non_unique for get_indexer in addition
        #  to get_indexer_non_unique because, even if `target` is unique, any
        #  non-category entries in it will be encoded as -1  so `codes` may
        #  not be unique.

        if isinstance(values, Categorical):
            # Indexing on codes is more efficient if categories are the same,
            #  so we can apply some optimizations based on the degree of
            #  dtype-matching.
            cat = self._data._encode_with_my_categories(values)
            codes = cat._codes
        else:
            codes = self.categories.get_indexer(values)

        indexer, missing = self._engine.get_indexer_non_unique(codes)
        return ensure_platform_int(indexer), missing

    @doc(Index._convert_list_indexer)
    def _convert_list_indexer(self, keyarr):
        # Return our indexer or raise if all of the values are not included in
        # the categories

        if self.categories._defer_to_indexing:
            # See tests.indexing.interval.test_interval:test_loc_getitem_frame
            indexer = self.categories._convert_list_indexer(keyarr)
            return Index(self.codes).get_indexer_for(indexer)

        return self.get_indexer_for(keyarr)

    @doc(Index._maybe_cast_slice_bound)
    def _maybe_cast_slice_bound(self, label, side: str, kind):
        # For kind == "loc" the label is passed through unchanged.
        if kind == "loc":
            return label

        return super()._maybe_cast_slice_bound(label, side, kind)

    # --------------------------------------------------------------------

    def _is_comparable_dtype(self, dtype):
        """Defer to the categories index for dtype comparability."""
        return self.categories._is_comparable_dtype(dtype)

    def take_nd(self, *args, **kwargs):
        """Alias for `take`"""
        warnings.warn(
            "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead",
            FutureWarning,
            stacklevel=2,
        )
        return self.take(*args, **kwargs)

    def map(self, mapper):
        """
        Map values using input correspondence (a dict, Series, or function).

        Maps the values (their categories, not the codes) of the index to new
        categories. If the mapping correspondence is one-to-one the result is a
        :class:`~pandas.CategoricalIndex` which has the same order property as
        the original, otherwise an :class:`~pandas.Index` is returned.

        If a `dict` or :class:`~pandas.Series` is used any unmapped category is
        mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
        will be returned.

        Parameters
        ----------
        mapper : function, dict, or Series
            Mapping correspondence.

        Returns
        -------
        pandas.CategoricalIndex or pandas.Index
            Mapped index.

        See Also
        --------
        Index.map : Apply a mapping correspondence on an
            :class:`~pandas.Index`.
        Series.map : Apply a mapping correspondence on a
            :class:`~pandas.Series`.
        Series.apply : Apply more complex functions on a
            :class:`~pandas.Series`.

        Examples
        --------
        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
        >>> idx
        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                          ordered=False, dtype='category')
        >>> idx.map(lambda x: x.upper())
        CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
                         ordered=False, dtype='category')
        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
        CategoricalIndex(['first', 'second', 'third'], categories=['first',
                         'second', 'third'], ordered=False, dtype='category')

        If the mapping is one-to-one the ordering of the categories is
        preserved:

        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
        >>> idx
        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                         ordered=True, dtype='category')
        >>> idx.map({'a': 3, 'b': 2, 'c': 1})
        CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
                         dtype='category')

        If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
        Index(['first', 'second', 'first'], dtype='object')

        If a `dict` is used, all unmapped categories are mapped to `NaN` and
        the result is an :class:`~pandas.Index`:

        >>> idx.map({'a': 'first', 'b': 'second'})
        Index(['first', 'second', nan], dtype='object')
        """
        mapped = self._values.map(mapper)
        return Index(mapped, name=self.name)

    def _concat(self, to_concat: List["Index"], name: Label) -> Index:
        """
        Concatenate ``to_concat`` into a single index named ``name``.

        Returns a CategoricalIndex when every element passes
        ``_is_dtype_compat``; otherwise falls back to ``concat_compat`` and
        wraps the result in ``Index``.
        """
        # if calling index is category, don't check dtype of others
        try:
            codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat])
        except TypeError:
            # not all to_concat elements are among our categories (or NA)
            from pandas.core.dtypes.concat import concat_compat

            res = concat_compat(to_concat)
            return Index(res, name=name)
        else:
            cat = self._data._from_backing_data(codes)
            return type(self)._simple_new(cat, name=name)

    def _delegate_method(self, name: str, *args, **kwargs):
        """ method delegation to the ._values """
        method = getattr(self._values, name)
        if "inplace" in kwargs:
            raise ValueError("cannot use inplace with CategoricalIndex")
        res = method(*args, **kwargs)
        if is_scalar(res):
            return res
        # Re-wrap non-scalar results so callers get a CategoricalIndex back.
        return CategoricalIndex(res, name=self.name)