1from csv import QUOTE_NONNUMERIC 2from functools import partial 3import operator 4from shutil import get_terminal_size 5from typing import Dict, Hashable, List, Sequence, Type, TypeVar, Union, cast 6from warnings import warn 7 8import numpy as np 9 10from pandas._config import get_option 11 12from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib 13from pandas._libs.lib import no_default 14from pandas._typing import ArrayLike, Dtype, Ordered, Scalar 15from pandas.compat.numpy import function as nv 16from pandas.util._decorators import cache_readonly, deprecate_kwarg 17from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs 18 19from pandas.core.dtypes.cast import ( 20 coerce_indexer_dtype, 21 maybe_cast_to_extension_array, 22 maybe_infer_to_datetimelike, 23) 24from pandas.core.dtypes.common import ( 25 ensure_int64, 26 ensure_object, 27 is_categorical_dtype, 28 is_datetime64_dtype, 29 is_dict_like, 30 is_dtype_equal, 31 is_extension_array_dtype, 32 is_hashable, 33 is_integer_dtype, 34 is_list_like, 35 is_object_dtype, 36 is_scalar, 37 is_timedelta64_dtype, 38 needs_i8_conversion, 39) 40from pandas.core.dtypes.dtypes import CategoricalDtype 41from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries 42from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna 43 44from pandas.core import ops 45from pandas.core.accessor import PandasDelegate, delegate_names 46import pandas.core.algorithms as algorithms 47from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d 48from pandas.core.arrays._mixins import NDArrayBackedExtensionArray 49from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject 50import pandas.core.common as com 51from pandas.core.construction import array, extract_array, sanitize_array 52from pandas.core.indexers import deprecate_ndim_indexing 53from pandas.core.missing import interpolate_2d 54from pandas.core.ops.common import unpack_zerodim_and_defer 55from pandas.core.sorting import nargsort 56from pandas.core.strings.object_array import ObjectStringArrayMixin 57 58from pandas.io.formats import console 59 60CategoricalT = TypeVar("CategoricalT", bound="Categorical") 61 62 63def _cat_compare_op(op): 64 opname = f"__{op.__name__}__" 65 fill_value = True if op is operator.ne else False 66 67 @unpack_zerodim_and_defer(opname) 68 def func(self, other): 69 hashable = is_hashable(other) 70 if is_list_like(other) and len(other) != len(self) and not hashable: 71 # in hashable case we may have a tuple that is itself a category 72 raise ValueError("Lengths must match.") 73 74 if not self.ordered: 75 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: 76 raise TypeError( 77 "Unordered Categoricals can only compare equality or not" 78 ) 79 if isinstance(other, Categorical): 80 # Two Categoricals can only be compared if the categories are 81 # the same (maybe up to ordering, depending on ordered) 82 83 msg = "Categoricals can only be compared if 'categories' are the same." 84 if not self._categories_match_up_to_permutation(other): 85 raise TypeError(msg) 86 87 if not self.ordered and not self.categories.equals(other.categories): 88 # both unordered and different order 89 other_codes = recode_for_categories( 90 other.codes, other.categories, self.categories, copy=False 91 ) 92 else: 93 other_codes = other._codes 94 95 ret = op(self._codes, other_codes) 96 mask = (self._codes == -1) | (other_codes == -1) 97 if mask.any(): 98 ret[mask] = fill_value 99 return ret 100 101 if hashable: 102 if other in self.categories: 103 i = self._unbox_scalar(other) 104 ret = op(self._codes, i) 105 106 if opname not in {"__eq__", "__ge__", "__gt__"}: 107 # GH#29820 performance trick; get_loc will always give i>=0, 108 # so in the cases (__ne__, __le__, __lt__) the setting 109 # here is a no-op, so can be skipped. 110 mask = self._codes == -1 111 ret[mask] = fill_value 112 return ret 113 else: 114 return ops.invalid_comparison(self, other, op) 115 else: 116 # allow categorical vs object dtype array comparisons for equality 117 # these are only positional comparisons 118 if opname not in ["__eq__", "__ne__"]: 119 raise TypeError( 120 f"Cannot compare a Categorical for op {opname} with " 121 f"type {type(other)}.\nIf you want to compare values, " 122 "use 'np.asarray(cat) <op> other'." 123 ) 124 125 if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype): 126 # We would return NotImplemented here, but that messes up 127 # ExtensionIndex's wrapped methods 128 return op(other, self) 129 return getattr(np.array(self), opname)(np.array(other)) 130 131 func.__name__ = opname 132 133 return func 134 135 136def contains(cat, key, container): 137 """ 138 Helper for membership check for ``key`` in ``cat``. 139 140 This is a helper method for :method:`__contains__` 141 and :class:`CategoricalIndex.__contains__`. 142 143 Returns True if ``key`` is in ``cat.categories`` and the 144 location of ``key`` in ``categories`` is in ``container``. 145 146 Parameters 147 ---------- 148 cat : :class:`Categorical`or :class:`categoricalIndex` 149 key : a hashable object 150 The key to check membership for. 151 container : Container (e.g. list-like or mapping) 152 The container to check for membership in. 153 154 Returns 155 ------- 156 is_in : bool 157 True if ``key`` is in ``self.categories`` and location of 158 ``key`` in ``categories`` is in ``container``, else False. 159 160 Notes 161 ----- 162 This method does not check for NaN values. Do that separately 163 before calling this method. 164 """ 165 hash(key) 166 167 # get location of key in categories. 168 # If a KeyError, the key isn't in categories, so logically 169 # can't be in container either. 170 try: 171 loc = cat.categories.get_loc(key) 172 except (KeyError, TypeError): 173 return False 174 175 # loc is the location of key in categories, but also the *value* 176 # for key in container. So, `key` may be in categories, 177 # but still not in `container`. Example ('b' in categories, 178 # but not in values): 179 # 'b' in Categorical(['a'], categories=['a', 'b']) # False 180 if is_scalar(loc): 181 return loc in container 182 else: 183 # if categories is an IntervalIndex, loc is an array. 184 return any(loc_ in container for loc_ in loc) 185 186 187class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): 188 """ 189 Represent a categorical variable in classic R / S-plus fashion. 190 191 `Categoricals` can only take on only a limited, and usually fixed, number 192 of possible values (`categories`). In contrast to statistical categorical 193 variables, a `Categorical` might have an order, but numerical operations 194 (additions, divisions, ...) are not possible. 195 196 All values of the `Categorical` are either in `categories` or `np.nan`. 197 Assigning values outside of `categories` will raise a `ValueError`. Order 198 is defined by the order of the `categories`, not lexical order of the 199 values. 200 201 Parameters 202 ---------- 203 values : list-like 204 The values of the categorical. If categories are given, values not in 205 categories will be replaced with NaN. 206 categories : Index-like (unique), optional 207 The unique categories for this categorical. If not given, the 208 categories are assumed to be the unique values of `values` (sorted, if 209 possible, otherwise in the order in which they appear). 210 ordered : bool, default False 211 Whether or not this categorical is treated as a ordered categorical. 212 If True, the resulting categorical will be ordered. 213 An ordered categorical respects, when sorted, the order of its 214 `categories` attribute (which in turn is the `categories` argument, if 215 provided). 216 dtype : CategoricalDtype 217 An instance of ``CategoricalDtype`` to use for this categorical. 218 219 Attributes 220 ---------- 221 categories : Index 222 The categories of this categorical 223 codes : ndarray 224 The codes (integer positions, which point to the categories) of this 225 categorical, read only. 226 ordered : bool 227 Whether or not this Categorical is ordered. 228 dtype : CategoricalDtype 229 The instance of ``CategoricalDtype`` storing the ``categories`` 230 and ``ordered``. 231 232 Methods 233 ------- 234 from_codes 235 __array__ 236 237 Raises 238 ------ 239 ValueError 240 If the categories do not validate. 241 TypeError 242 If an explicit ``ordered=True`` is given but no `categories` and the 243 `values` are not sortable. 244 245 See Also 246 -------- 247 CategoricalDtype : Type for categorical data. 248 CategoricalIndex : An Index with an underlying ``Categorical``. 249 250 Notes 251 ----- 252 See the `user guide 253 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`_ 254 for more. 255 256 Examples 257 -------- 258 >>> pd.Categorical([1, 2, 3, 1, 2, 3]) 259 [1, 2, 3, 1, 2, 3] 260 Categories (3, int64): [1, 2, 3] 261 262 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) 263 ['a', 'b', 'c', 'a', 'b', 'c'] 264 Categories (3, object): ['a', 'b', 'c'] 265 266 Missing values are not included as a category. 267 268 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) 269 >>> c 270 [1, 2, 3, 1, 2, 3, NaN] 271 Categories (3, int64): [1, 2, 3] 272 273 However, their presence is indicated in the `codes` attribute 274 by code `-1`. 275 276 >>> c.codes 277 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) 278 279 Ordered `Categoricals` can be sorted according to the custom order 280 of the categories and can have a min and max value. 281 282 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, 283 ... categories=['c', 'b', 'a']) 284 >>> c 285 ['a', 'b', 'c', 'a', 'b', 'c'] 286 Categories (3, object): ['c' < 'b' < 'a'] 287 >>> c.min() 288 'c' 289 """ 290 291 # For comparisons, so that numpy uses our implementation if the compare 292 # ops, which raise 293 __array_priority__ = 1000 294 _dtype = CategoricalDtype(ordered=False) 295 # tolist is not actually deprecated, just suppressed in the __dir__ 296 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) 297 _typ = "categorical" 298 _can_hold_na = True 299 300 def __init__( 301 self, values, categories=None, ordered=None, dtype=None, fastpath=False 302 ): 303 304 dtype = CategoricalDtype._from_values_or_dtype( 305 values, categories, ordered, dtype 306 ) 307 # At this point, dtype is always a CategoricalDtype, but 308 # we may have dtype.categories be None, and we need to 309 # infer categories in a factorization step further below 310 311 if fastpath: 312 self._codes = coerce_indexer_dtype(values, dtype.categories) 313 self._dtype = self._dtype.update_dtype(dtype) 314 return 315 316 # null_mask indicates missing values we want to exclude from inference. 317 # This means: only missing values in list-likes (not arrays/ndframes). 318 null_mask = np.array(False) 319 320 # sanitize input 321 if is_categorical_dtype(values): 322 if dtype.categories is None: 323 dtype = CategoricalDtype(values.categories, dtype.ordered) 324 elif not isinstance(values, (ABCIndexClass, ABCSeries)): 325 # sanitize_array coerces np.nan to a string under certain versions 326 # of numpy 327 values = maybe_infer_to_datetimelike(values, convert_dates=True) 328 if not isinstance(values, (np.ndarray, ExtensionArray)): 329 values = com.convert_to_list_like(values) 330 331 # By convention, empty lists result in object dtype: 332 sanitize_dtype = np.dtype("O") if len(values) == 0 else None 333 null_mask = isna(values) 334 if null_mask.any(): 335 values = [values[idx] for idx in np.where(~null_mask)[0]] 336 values = sanitize_array(values, None, dtype=sanitize_dtype) 337 338 if dtype.categories is None: 339 try: 340 codes, categories = factorize(values, sort=True) 341 except TypeError as err: 342 codes, categories = factorize(values, sort=False) 343 if dtype.ordered: 344 # raise, as we don't have a sortable data structure and so 345 # the user should give us one by specifying categories 346 raise TypeError( 347 "'values' is not ordered, please " 348 "explicitly specify the categories order " 349 "by passing in a categories argument." 350 ) from err 351 except ValueError as err: 352 353 # TODO(EA2D) 354 raise NotImplementedError( 355 "> 1 ndim Categorical are not supported at this time" 356 ) from err 357 358 # we're inferring from values 359 dtype = CategoricalDtype(categories, dtype.ordered) 360 361 elif is_categorical_dtype(values.dtype): 362 old_codes = extract_array(values).codes 363 codes = recode_for_categories( 364 old_codes, values.dtype.categories, dtype.categories 365 ) 366 367 else: 368 codes = _get_codes_for_values(values, dtype.categories) 369 370 if null_mask.any(): 371 # Reinsert -1 placeholders for previously removed missing values 372 full_codes = -np.ones(null_mask.shape, dtype=codes.dtype) 373 full_codes[~null_mask] = codes 374 codes = full_codes 375 376 self._dtype = self._dtype.update_dtype(dtype) 377 self._codes = coerce_indexer_dtype(codes, dtype.categories) 378 379 @property 380 def dtype(self) -> CategoricalDtype: 381 """ 382 The :class:`~pandas.api.types.CategoricalDtype` for this instance. 383 """ 384 return self._dtype 385 386 @property 387 def _constructor(self) -> Type["Categorical"]: 388 return Categorical 389 390 @classmethod 391 def _from_sequence(cls, scalars, *, dtype=None, copy=False): 392 return Categorical(scalars, dtype=dtype) 393 394 def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: 395 """ 396 Coerce this type to another dtype 397 398 Parameters 399 ---------- 400 dtype : numpy dtype or pandas type 401 copy : bool, default True 402 By default, astype always returns a newly allocated object. 403 If copy is set to False and dtype is categorical, the original 404 object is returned. 405 """ 406 if self.dtype is dtype: 407 result = self.copy() if copy else self 408 409 elif is_categorical_dtype(dtype): 410 dtype = cast(Union[str, CategoricalDtype], dtype) 411 412 # GH 10696/18593/18630 413 dtype = self.dtype.update_dtype(dtype) 414 self = self.copy() if copy else self 415 result = self._set_dtype(dtype) 416 417 # TODO: consolidate with ndarray case? 418 elif is_extension_array_dtype(dtype): 419 result = array(self, dtype=dtype, copy=copy) 420 421 elif is_integer_dtype(dtype) and self.isna().any(): 422 raise ValueError("Cannot convert float NaN to integer") 423 424 elif len(self.codes) == 0 or len(self.categories) == 0: 425 result = np.array(self, dtype=dtype, copy=copy) 426 427 else: 428 # GH8628 (PERF): astype category codes instead of astyping array 429 try: 430 new_cats = np.asarray(self.categories) 431 new_cats = new_cats.astype(dtype=dtype, copy=copy) 432 fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) 433 except ( 434 TypeError, # downstream error msg for CategoricalIndex is misleading 435 ValueError, 436 ): 437 msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" 438 raise ValueError(msg) 439 440 result = take_1d( 441 new_cats, 442 libalgos.ensure_platform_int(self._codes), 443 fill_value=fill_value, 444 ) 445 446 return result 447 448 @cache_readonly 449 def itemsize(self) -> int: 450 """ 451 return the size of a single category 452 """ 453 return self.categories.itemsize 454 455 def tolist(self) -> List[Scalar]: 456 """ 457 Return a list of the values. 458 459 These are each a scalar type, which is a Python scalar 460 (for str, int, float) or a pandas scalar 461 (for Timestamp/Timedelta/Interval/Period) 462 """ 463 return list(self) 464 465 to_list = tolist 466 467 @classmethod 468 def _from_inferred_categories( 469 cls, inferred_categories, inferred_codes, dtype, true_values=None 470 ): 471 """ 472 Construct a Categorical from inferred values. 473 474 For inferred categories (`dtype` is None) the categories are sorted. 475 For explicit `dtype`, the `inferred_categories` are cast to the 476 appropriate type. 477 478 Parameters 479 ---------- 480 inferred_categories : Index 481 inferred_codes : Index 482 dtype : CategoricalDtype or 'category' 483 true_values : list, optional 484 If none are provided, the default ones are 485 "True", "TRUE", and "true." 486 487 Returns 488 ------- 489 Categorical 490 """ 491 from pandas import Index, to_datetime, to_numeric, to_timedelta 492 493 cats = Index(inferred_categories) 494 known_categories = ( 495 isinstance(dtype, CategoricalDtype) and dtype.categories is not None 496 ) 497 498 if known_categories: 499 # Convert to a specialized type with `dtype` if specified. 500 if dtype.categories.is_numeric(): 501 cats = to_numeric(inferred_categories, errors="coerce") 502 elif is_datetime64_dtype(dtype.categories): 503 cats = to_datetime(inferred_categories, errors="coerce") 504 elif is_timedelta64_dtype(dtype.categories): 505 cats = to_timedelta(inferred_categories, errors="coerce") 506 elif dtype.categories.is_boolean(): 507 if true_values is None: 508 true_values = ["True", "TRUE", "true"] 509 510 cats = cats.isin(true_values) 511 512 if known_categories: 513 # Recode from observation order to dtype.categories order. 514 categories = dtype.categories 515 codes = recode_for_categories(inferred_codes, cats, categories) 516 elif not cats.is_monotonic_increasing: 517 # Sort categories and recode for unknown categories. 518 unsorted = cats.copy() 519 categories = cats.sort_values() 520 521 codes = recode_for_categories(inferred_codes, unsorted, categories) 522 dtype = CategoricalDtype(categories, ordered=False) 523 else: 524 dtype = CategoricalDtype(cats, ordered=False) 525 codes = inferred_codes 526 527 return cls(codes, dtype=dtype, fastpath=True) 528 529 @classmethod 530 def from_codes(cls, codes, categories=None, ordered=None, dtype=None): 531 """ 532 Make a Categorical type from codes and categories or dtype. 533 534 This constructor is useful if you already have codes and 535 categories/dtype and so do not need the (computation intensive) 536 factorization step, which is usually done on the constructor. 537 538 If your data does not follow this convention, please use the normal 539 constructor. 540 541 Parameters 542 ---------- 543 codes : array-like of int 544 An integer array, where each integer points to a category in 545 categories or dtype.categories, or else is -1 for NaN. 546 categories : index-like, optional 547 The categories for the categorical. Items need to be unique. 548 If the categories are not given here, then they must be provided 549 in `dtype`. 550 ordered : bool, optional 551 Whether or not this categorical is treated as an ordered 552 categorical. If not given here or in `dtype`, the resulting 553 categorical will be unordered. 554 dtype : CategoricalDtype or "category", optional 555 If :class:`CategoricalDtype`, cannot be used together with 556 `categories` or `ordered`. 557 558 .. versionadded:: 0.24.0 559 560 When `dtype` is provided, neither `categories` nor `ordered` 561 should be provided. 562 563 Returns 564 ------- 565 Categorical 566 567 Examples 568 -------- 569 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) 570 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) 571 ['a', 'b', 'a', 'b'] 572 Categories (2, object): ['a' < 'b'] 573 """ 574 dtype = CategoricalDtype._from_values_or_dtype( 575 categories=categories, ordered=ordered, dtype=dtype 576 ) 577 if dtype.categories is None: 578 msg = ( 579 "The categories must be provided in 'categories' or " 580 "'dtype'. Both were None." 581 ) 582 raise ValueError(msg) 583 584 if is_extension_array_dtype(codes) and is_integer_dtype(codes): 585 # Avoid the implicit conversion of Int to object 586 if isna(codes).any(): 587 raise ValueError("codes cannot contain NA values") 588 codes = codes.to_numpy(dtype=np.int64) 589 else: 590 codes = np.asarray(codes) 591 if len(codes) and not is_integer_dtype(codes): 592 raise ValueError("codes need to be array-like integers") 593 594 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): 595 raise ValueError("codes need to be between -1 and len(categories)-1") 596 597 return cls(codes, dtype=dtype, fastpath=True) 598 599 # ------------------------------------------------------------------ 600 # Categories/Codes/Ordered 601 602 @property 603 def categories(self): 604 """ 605 The categories of this categorical. 606 607 Setting assigns new values to each category (effectively a rename of 608 each individual category). 609 610 The assigned value has to be a list-like object. All items must be 611 unique and the number of items in the new categories must be the same 612 as the number of items in the old categories. 613 614 Assigning to `categories` is a inplace operation! 615 616 Raises 617 ------ 618 ValueError 619 If the new categories do not validate as categories or if the 620 number of new categories is unequal the number of old categories 621 622 See Also 623 -------- 624 rename_categories : Rename categories. 625 reorder_categories : Reorder categories. 626 add_categories : Add new categories. 627 remove_categories : Remove the specified categories. 628 remove_unused_categories : Remove categories which are not used. 629 set_categories : Set the categories to the specified ones. 630 """ 631 return self.dtype.categories 632 633 @categories.setter 634 def categories(self, categories): 635 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 636 if self.dtype.categories is not None and len(self.dtype.categories) != len( 637 new_dtype.categories 638 ): 639 raise ValueError( 640 "new categories need to have the same number of " 641 "items as the old categories!" 642 ) 643 self._dtype = new_dtype 644 645 @property 646 def ordered(self) -> Ordered: 647 """ 648 Whether the categories have an ordered relationship. 649 """ 650 return self.dtype.ordered 651 652 @property 653 def codes(self) -> np.ndarray: 654 """ 655 The category codes of this categorical. 656 657 Codes are an array of integers which are the positions of the actual 658 values in the categories array. 659 660 There is no setter, use the other categorical methods and the normal item 661 setter to change values in the categorical. 662 663 Returns 664 ------- 665 ndarray[int] 666 A non-writable view of the `codes` array. 667 """ 668 v = self._codes.view() 669 v.flags.writeable = False 670 return v 671 672 def _set_categories(self, categories, fastpath=False): 673 """ 674 Sets new categories inplace 675 676 Parameters 677 ---------- 678 fastpath : bool, default False 679 Don't perform validation of the categories for uniqueness or nulls 680 681 Examples 682 -------- 683 >>> c = pd.Categorical(['a', 'b']) 684 >>> c 685 ['a', 'b'] 686 Categories (2, object): ['a', 'b'] 687 688 >>> c._set_categories(pd.Index(['a', 'c'])) 689 >>> c 690 ['a', 'c'] 691 Categories (2, object): ['a', 'c'] 692 """ 693 if fastpath: 694 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) 695 else: 696 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 697 if ( 698 not fastpath 699 and self.dtype.categories is not None 700 and len(new_dtype.categories) != len(self.dtype.categories) 701 ): 702 raise ValueError( 703 "new categories need to have the same number of " 704 "items than the old categories!" 705 ) 706 707 self._dtype = new_dtype 708 709 def _set_dtype(self, dtype: CategoricalDtype) -> "Categorical": 710 """ 711 Internal method for directly updating the CategoricalDtype 712 713 Parameters 714 ---------- 715 dtype : CategoricalDtype 716 717 Notes 718 ----- 719 We don't do any validation here. It's assumed that the dtype is 720 a (valid) instance of `CategoricalDtype`. 721 """ 722 codes = recode_for_categories(self.codes, self.categories, dtype.categories) 723 return type(self)(codes, dtype=dtype, fastpath=True) 724 725 def set_ordered(self, value, inplace=False): 726 """ 727 Set the ordered attribute to the boolean value. 728 729 Parameters 730 ---------- 731 value : bool 732 Set whether this categorical is ordered (True) or not (False). 733 inplace : bool, default False 734 Whether or not to set the ordered attribute in-place or return 735 a copy of this categorical with ordered set to the value. 736 """ 737 inplace = validate_bool_kwarg(inplace, "inplace") 738 new_dtype = CategoricalDtype(self.categories, ordered=value) 739 cat = self if inplace else self.copy() 740 cat._dtype = new_dtype 741 if not inplace: 742 return cat 743 744 def as_ordered(self, inplace=False): 745 """ 746 Set the Categorical to be ordered. 747 748 Parameters 749 ---------- 750 inplace : bool, default False 751 Whether or not to set the ordered attribute in-place or return 752 a copy of this categorical with ordered set to True. 753 754 Returns 755 ------- 756 Categorical or None 757 Ordered Categorical or None if ``inplace=True``. 758 """ 759 inplace = validate_bool_kwarg(inplace, "inplace") 760 return self.set_ordered(True, inplace=inplace) 761 762 def as_unordered(self, inplace=False): 763 """ 764 Set the Categorical to be unordered. 765 766 Parameters 767 ---------- 768 inplace : bool, default False 769 Whether or not to set the ordered attribute in-place or return 770 a copy of this categorical with ordered set to False. 771 772 Returns 773 ------- 774 Categorical or None 775 Unordered Categorical or None if ``inplace=True``. 776 """ 777 inplace = validate_bool_kwarg(inplace, "inplace") 778 return self.set_ordered(False, inplace=inplace) 779 780 def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): 781 """ 782 Set the categories to the specified new_categories. 783 784 `new_categories` can include new categories (which will result in 785 unused categories) or remove old categories (which results in values 786 set to NaN). If `rename==True`, the categories will simple be renamed 787 (less or more items than in old categories will result in values set to 788 NaN or in unused categories respectively). 789 790 This method can be used to perform more than one action of adding, 791 removing, and reordering simultaneously and is therefore faster than 792 performing the individual steps via the more specialised methods. 793 794 On the other hand this methods does not do checks (e.g., whether the 795 old categories are included in the new categories on a reorder), which 796 can result in surprising changes, for example when using special string 797 dtypes, which does not considers a S1 string equal to a single char 798 python string. 799 800 Parameters 801 ---------- 802 new_categories : Index-like 803 The categories in new order. 804 ordered : bool, default False 805 Whether or not the categorical is treated as a ordered categorical. 806 If not given, do not change the ordered information. 807 rename : bool, default False 808 Whether or not the new_categories should be considered as a rename 809 of the old categories or as reordered categories. 810 inplace : bool, default False 811 Whether or not to reorder the categories in-place or return a copy 812 of this categorical with reordered categories. 813 814 Returns 815 ------- 816 Categorical with reordered categories or None if inplace. 817 818 Raises 819 ------ 820 ValueError 821 If new_categories does not validate as categories 822 823 See Also 824 -------- 825 rename_categories : Rename categories. 826 reorder_categories : Reorder categories. 827 add_categories : Add new categories. 828 remove_categories : Remove the specified categories. 829 remove_unused_categories : Remove categories which are not used. 830 """ 831 inplace = validate_bool_kwarg(inplace, "inplace") 832 if ordered is None: 833 ordered = self.dtype.ordered 834 new_dtype = CategoricalDtype(new_categories, ordered=ordered) 835 836 cat = self if inplace else self.copy() 837 if rename: 838 if cat.dtype.categories is not None and len(new_dtype.categories) < len( 839 cat.dtype.categories 840 ): 841 # remove all _codes which are larger and set to -1/NaN 842 cat._codes[cat._codes >= len(new_dtype.categories)] = -1 843 else: 844 codes = recode_for_categories( 845 cat.codes, cat.categories, new_dtype.categories 846 ) 847 cat._codes = codes 848 cat._dtype = new_dtype 849 850 if not inplace: 851 return cat 852 853 def rename_categories(self, new_categories, inplace=False): 854 """ 855 Rename categories. 856 857 Parameters 858 ---------- 859 new_categories : list-like, dict-like or callable 860 861 New categories which will replace old categories. 862 863 * list-like: all items must be unique and the number of items in 864 the new categories must match the existing number of categories. 865 866 * dict-like: specifies a mapping from 867 old categories to new. Categories not contained in the mapping 868 are passed through and extra categories in the mapping are 869 ignored. 870 871 * callable : a callable that is called on all items in the old 872 categories and whose return values comprise the new categories. 873 874 inplace : bool, default False 875 Whether or not to rename the categories inplace or return a copy of 876 this categorical with renamed categories. 877 878 Returns 879 ------- 880 cat : Categorical or None 881 Categorical with removed categories or None if ``inplace=True``. 882 883 Raises 884 ------ 885 ValueError 886 If new categories are list-like and do not have the same number of 887 items than the current categories or do not validate as categories 888 889 See Also 890 -------- 891 reorder_categories : Reorder categories. 892 add_categories : Add new categories. 893 remove_categories : Remove the specified categories. 894 remove_unused_categories : Remove categories which are not used. 895 set_categories : Set the categories to the specified ones. 896 897 Examples 898 -------- 899 >>> c = pd.Categorical(['a', 'a', 'b']) 900 >>> c.rename_categories([0, 1]) 901 [0, 0, 1] 902 Categories (2, int64): [0, 1] 903 904 For dict-like ``new_categories``, extra keys are ignored and 905 categories not in the dictionary are passed through 906 907 >>> c.rename_categories({'a': 'A', 'c': 'C'}) 908 ['A', 'A', 'b'] 909 Categories (2, object): ['A', 'b'] 910 911 You may also provide a callable to create the new categories 912 913 >>> c.rename_categories(lambda x: x.upper()) 914 ['A', 'A', 'B'] 915 Categories (2, object): ['A', 'B'] 916 """ 917 inplace = validate_bool_kwarg(inplace, "inplace") 918 cat = self if inplace else self.copy() 919 920 if is_dict_like(new_categories): 921 cat.categories = [new_categories.get(item, item) for item in cat.categories] 922 elif callable(new_categories): 923 cat.categories = [new_categories(item) for item in cat.categories] 924 else: 925 cat.categories = new_categories 926 if not inplace: 927 return cat 928 929 def reorder_categories(self, new_categories, ordered=None, inplace=False): 930 """ 931 Reorder categories as specified in new_categories. 932 933 `new_categories` need to include all old categories and no new category 934 items. 935 936 Parameters 937 ---------- 938 new_categories : Index-like 939 The categories in new order. 940 ordered : bool, optional 941 Whether or not the categorical is treated as a ordered categorical. 942 If not given, do not change the ordered information. 943 inplace : bool, default False 944 Whether or not to reorder the categories inplace or return a copy of 945 this categorical with reordered categories. 946 947 Returns 948 ------- 949 cat : Categorical or None 950 Categorical with removed categories or None if ``inplace=True``. 951 952 Raises 953 ------ 954 ValueError 955 If the new categories do not contain all old category items or any 956 new ones 957 958 See Also 959 -------- 960 rename_categories : Rename categories. 961 add_categories : Add new categories. 962 remove_categories : Remove the specified categories. 963 remove_unused_categories : Remove categories which are not used. 964 set_categories : Set the categories to the specified ones. 965 """ 966 inplace = validate_bool_kwarg(inplace, "inplace") 967 if set(self.dtype.categories) != set(new_categories): 968 raise ValueError( 969 "items in new_categories are not the same as in old categories" 970 ) 971 return self.set_categories(new_categories, ordered=ordered, inplace=inplace) 972 973 def add_categories(self, new_categories, inplace=False): 974 """ 975 Add new categories. 976 977 `new_categories` will be included at the last/highest place in the 978 categories and will be unused directly after this call. 979 980 Parameters 981 ---------- 982 new_categories : category or list-like of category 983 The new categories to be included. 984 inplace : bool, default False 985 Whether or not to add the categories inplace or return a copy of 986 this categorical with added categories. 987 988 Returns 989 ------- 990 cat : Categorical or None 991 Categorical with new categories added or None if ``inplace=True``. 992 993 Raises 994 ------ 995 ValueError 996 If the new categories include old categories or do not validate as 997 categories 998 999 See Also 1000 -------- 1001 rename_categories : Rename categories. 1002 reorder_categories : Reorder categories. 1003 remove_categories : Remove the specified categories. 1004 remove_unused_categories : Remove categories which are not used. 1005 set_categories : Set the categories to the specified ones. 1006 """ 1007 inplace = validate_bool_kwarg(inplace, "inplace") 1008 if not is_list_like(new_categories): 1009 new_categories = [new_categories] 1010 already_included = set(new_categories) & set(self.dtype.categories) 1011 if len(already_included) != 0: 1012 raise ValueError( 1013 f"new categories must not include old categories: {already_included}" 1014 ) 1015 new_categories = list(self.dtype.categories) + list(new_categories) 1016 new_dtype = CategoricalDtype(new_categories, self.ordered) 1017 1018 cat = self if inplace else self.copy() 1019 cat._dtype = new_dtype 1020 cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) 1021 if not inplace: 1022 return cat 1023 1024 def remove_categories(self, removals, inplace=False): 1025 """ 1026 Remove the specified categories. 1027 1028 `removals` must be included in the old categories. Values which were in 1029 the removed categories will be set to NaN 1030 1031 Parameters 1032 ---------- 1033 removals : category or list of categories 1034 The categories which should be removed. 1035 inplace : bool, default False 1036 Whether or not to remove the categories inplace or return a copy of 1037 this categorical with removed categories. 1038 1039 Returns 1040 ------- 1041 cat : Categorical or None 1042 Categorical with removed categories or None if ``inplace=True``. 1043 1044 Raises 1045 ------ 1046 ValueError 1047 If the removals are not contained in the categories 1048 1049 See Also 1050 -------- 1051 rename_categories : Rename categories. 1052 reorder_categories : Reorder categories. 1053 add_categories : Add new categories. 1054 remove_unused_categories : Remove categories which are not used. 1055 set_categories : Set the categories to the specified ones. 1056 """ 1057 inplace = validate_bool_kwarg(inplace, "inplace") 1058 if not is_list_like(removals): 1059 removals = [removals] 1060 1061 removal_set = set(removals) 1062 not_included = removal_set - set(self.dtype.categories) 1063 new_categories = [c for c in self.dtype.categories if c not in removal_set] 1064 1065 # GH 10156 1066 if any(isna(removals)): 1067 not_included = {x for x in not_included if notna(x)} 1068 new_categories = [x for x in new_categories if notna(x)] 1069 1070 if len(not_included) != 0: 1071 raise ValueError(f"removals must all be in old categories: {not_included}") 1072 1073 return self.set_categories( 1074 new_categories, ordered=self.ordered, rename=False, inplace=inplace 1075 ) 1076 1077 def remove_unused_categories(self, inplace=no_default): 1078 """ 1079 Remove categories which are not used. 1080 1081 Parameters 1082 ---------- 1083 inplace : bool, default False 1084 Whether or not to drop unused categories inplace or return a copy of 1085 this categorical with unused categories dropped. 1086 1087 .. deprecated:: 1.2.0 1088 1089 Returns 1090 ------- 1091 cat : Categorical or None 1092 Categorical with unused categories dropped or None if ``inplace=True``. 1093 1094 See Also 1095 -------- 1096 rename_categories : Rename categories. 1097 reorder_categories : Reorder categories. 1098 add_categories : Add new categories. 1099 remove_categories : Remove the specified categories. 1100 set_categories : Set the categories to the specified ones. 1101 """ 1102 if inplace is not no_default: 1103 warn( 1104 "The `inplace` parameter in pandas.Categorical." 1105 "remove_unused_categories is deprecated and " 1106 "will be removed in a future version.", 1107 FutureWarning, 1108 stacklevel=2, 1109 ) 1110 else: 1111 inplace = False 1112 1113 inplace = validate_bool_kwarg(inplace, "inplace") 1114 cat = self if inplace else self.copy() 1115 idx, inv = np.unique(cat._codes, return_inverse=True) 1116 1117 if idx.size != 0 and idx[0] == -1: # na sentinel 1118 idx, inv = idx[1:], inv - 1 1119 1120 new_categories = cat.dtype.categories.take(idx) 1121 new_dtype = CategoricalDtype._from_fastpath( 1122 new_categories, ordered=self.ordered 1123 ) 1124 cat._dtype = new_dtype 1125 cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) 1126 1127 if not inplace: 1128 return cat 1129 1130 # ------------------------------------------------------------------ 1131 1132 def map(self, mapper): 1133 """ 1134 Map categories using input correspondence (dict, Series, or function). 1135 1136 Maps the categories to new categories. If the mapping correspondence is 1137 one-to-one the result is a :class:`~pandas.Categorical` which has the 1138 same order property as the original, otherwise a :class:`~pandas.Index` 1139 is returned. NaN values are unaffected. 1140 1141 If a `dict` or :class:`~pandas.Series` is used any unmapped category is 1142 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` 1143 will be returned. 1144 1145 Parameters 1146 ---------- 1147 mapper : function, dict, or Series 1148 Mapping correspondence. 1149 1150 Returns 1151 ------- 1152 pandas.Categorical or pandas.Index 1153 Mapped categorical. 1154 1155 See Also 1156 -------- 1157 CategoricalIndex.map : Apply a mapping correspondence on a 1158 :class:`~pandas.CategoricalIndex`. 1159 Index.map : Apply a mapping correspondence on an 1160 :class:`~pandas.Index`. 1161 Series.map : Apply a mapping correspondence on a 1162 :class:`~pandas.Series`. 1163 Series.apply : Apply more complex functions on a 1164 :class:`~pandas.Series`. 1165 1166 Examples 1167 -------- 1168 >>> cat = pd.Categorical(['a', 'b', 'c']) 1169 >>> cat 1170 ['a', 'b', 'c'] 1171 Categories (3, object): ['a', 'b', 'c'] 1172 >>> cat.map(lambda x: x.upper()) 1173 ['A', 'B', 'C'] 1174 Categories (3, object): ['A', 'B', 'C'] 1175 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}) 1176 ['first', 'second', 'third'] 1177 Categories (3, object): ['first', 'second', 'third'] 1178 1179 If the mapping is one-to-one the ordering of the categories is 1180 preserved: 1181 1182 >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) 1183 >>> cat 1184 ['a', 'b', 'c'] 1185 Categories (3, object): ['a' < 'b' < 'c'] 1186 >>> cat.map({'a': 3, 'b': 2, 'c': 1}) 1187 [3, 2, 1] 1188 Categories (3, int64): [3 < 2 < 1] 1189 1190 If the mapping is not one-to-one an :class:`~pandas.Index` is returned: 1191 1192 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}) 1193 Index(['first', 'second', 'first'], dtype='object') 1194 1195 If a `dict` is used, all unmapped categories are mapped to `NaN` and 1196 the result is an :class:`~pandas.Index`: 1197 1198 >>> cat.map({'a': 'first', 'b': 'second'}) 1199 Index(['first', 'second', nan], dtype='object') 1200 """ 1201 new_categories = self.categories.map(mapper) 1202 try: 1203 return self.from_codes( 1204 self._codes.copy(), categories=new_categories, ordered=self.ordered 1205 ) 1206 except ValueError: 1207 # NA values are represented in self._codes with -1 1208 # np.take causes NA values to take final element in new_categories 1209 if np.any(self._codes == -1): 1210 new_categories = new_categories.insert(len(new_categories), np.nan) 1211 return np.take(new_categories, self._codes) 1212 1213 __eq__ = _cat_compare_op(operator.eq) 1214 __ne__ = _cat_compare_op(operator.ne) 1215 __lt__ = _cat_compare_op(operator.lt) 1216 __gt__ = _cat_compare_op(operator.gt) 1217 __le__ = _cat_compare_op(operator.le) 1218 __ge__ = _cat_compare_op(operator.ge) 1219 1220 # ------------------------------------------------------------- 1221 # Validators; ideally these can be de-duplicated 1222 1223 def _validate_searchsorted_value(self, value): 1224 # searchsorted is very performance sensitive. By converting codes 1225 # to same dtype as self.codes, we get much faster performance. 1226 if is_scalar(value): 1227 codes = self._unbox_scalar(value) 1228 else: 1229 locs = [self.categories.get_loc(x) for x in value] 1230 codes = np.array(locs, dtype=self.codes.dtype) 1231 return codes 1232 1233 def _validate_fill_value(self, fill_value): 1234 """ 1235 Convert a user-facing fill_value to a representation to use with our 1236 underlying ndarray, raising TypeError if this is not possible. 1237 1238 Parameters 1239 ---------- 1240 fill_value : object 1241 1242 Returns 1243 ------- 1244 fill_value : int 1245 1246 Raises 1247 ------ 1248 TypeError 1249 """ 1250 1251 if is_valid_nat_for_dtype(fill_value, self.categories.dtype): 1252 fill_value = -1 1253 elif fill_value in self.categories: 1254 fill_value = self._unbox_scalar(fill_value) 1255 else: 1256 raise TypeError( 1257 f"'fill_value={fill_value}' is not present " 1258 "in this Categorical's categories" 1259 ) 1260 return fill_value 1261 1262 _validate_scalar = _validate_fill_value 1263 1264 # ------------------------------------------------------------- 1265 1266 def __array__(self, dtype=None) -> np.ndarray: 1267 """ 1268 The numpy array interface. 1269 1270 Returns 1271 ------- 1272 numpy.array 1273 A numpy array of either the specified dtype or, 1274 if dtype==None (default), the same dtype as 1275 categorical.categories.dtype. 1276 """ 1277 ret = take_1d(self.categories._values, self._codes) 1278 if dtype and not is_dtype_equal(dtype, self.categories.dtype): 1279 return np.asarray(ret, dtype) 1280 # When we're a Categorical[ExtensionArray], like Interval, 1281 # we need to ensure __array__ gets all the way to an 1282 # ndarray. 1283 return np.asarray(ret) 1284 1285 def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): 1286 # for binary ops, use our custom dunder methods 1287 result = ops.maybe_dispatch_ufunc_to_dunder_op( 1288 self, ufunc, method, *inputs, **kwargs 1289 ) 1290 if result is not NotImplemented: 1291 return result 1292 1293 # for all other cases, raise for now (similarly as what happens in 1294 # Series.__array_prepare__) 1295 raise TypeError( 1296 f"Object with dtype {self.dtype} cannot perform " 1297 f"the numpy op {ufunc.__name__}" 1298 ) 1299 1300 def __setstate__(self, state): 1301 """Necessary for making this object picklable""" 1302 if not isinstance(state, dict): 1303 raise Exception("invalid pickle state") 1304 1305 if "_dtype" not in state: 1306 state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"]) 1307 1308 for k, v in state.items(): 1309 setattr(self, k, v) 1310 1311 @property 1312 def nbytes(self) -> int: 1313 return self._codes.nbytes + self.dtype.categories.values.nbytes 1314 1315 def memory_usage(self, deep: bool = False) -> int: 1316 """ 1317 Memory usage of my values 1318 1319 Parameters 1320 ---------- 1321 deep : bool 1322 Introspect the data deeply, interrogate 1323 `object` dtypes for system-level memory consumption 1324 1325 Returns 1326 ------- 1327 bytes used 1328 1329 Notes 1330 ----- 1331 Memory usage does not include memory consumed by elements that 1332 are not components of the array if deep=False 1333 1334 See Also 1335 -------- 1336 numpy.ndarray.nbytes 1337 """ 1338 return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) 1339 1340 def isna(self): 1341 """ 1342 Detect missing values 1343 1344 Missing values (-1 in .codes) are detected. 1345 1346 Returns 1347 ------- 1348 a boolean array of whether my values are null 1349 1350 See Also 1351 -------- 1352 isna : Top-level isna. 1353 isnull : Alias of isna. 1354 Categorical.notna : Boolean inverse of Categorical.isna. 1355 1356 """ 1357 return self._codes == -1 1358 1359 isnull = isna 1360 1361 def notna(self): 1362 """ 1363 Inverse of isna 1364 1365 Both missing values (-1 in .codes) and NA as a category are detected as 1366 null. 1367 1368 Returns 1369 ------- 1370 a boolean array of whether my values are not null 1371 1372 See Also 1373 -------- 1374 notna : Top-level notna. 1375 notnull : Alias of notna. 1376 Categorical.isna : Boolean inverse of Categorical.notna. 1377 1378 """ 1379 return ~self.isna() 1380 1381 notnull = notna 1382 1383 def value_counts(self, dropna=True): 1384 """ 1385 Return a Series containing counts of each category. 1386 1387 Every category will have an entry, even those with a count of 0. 1388 1389 Parameters 1390 ---------- 1391 dropna : bool, default True 1392 Don't include counts of NaN. 1393 1394 Returns 1395 ------- 1396 counts : Series 1397 1398 See Also 1399 -------- 1400 Series.value_counts 1401 """ 1402 from pandas import CategoricalIndex, Series 1403 1404 code, cat = self._codes, self.categories 1405 ncat, mask = (len(cat), code >= 0) 1406 ix, clean = np.arange(ncat), mask.all() 1407 1408 if dropna or clean: 1409 obs = code if clean else code[mask] 1410 count = np.bincount(obs, minlength=ncat or 0) 1411 else: 1412 count = np.bincount(np.where(mask, code, ncat)) 1413 ix = np.append(ix, -1) 1414 1415 ix = self._from_backing_data(ix) 1416 1417 return Series(count, index=CategoricalIndex(ix), dtype="int64") 1418 1419 def _internal_get_values(self): 1420 """ 1421 Return the values. 1422 1423 For internal compatibility with pandas formatting. 1424 1425 Returns 1426 ------- 1427 np.ndarray or Index 1428 A numpy array of the same dtype as categorical.categories.dtype or 1429 Index if datetime / periods. 1430 """ 1431 # if we are a datetime and period index, return Index to keep metadata 1432 if needs_i8_conversion(self.categories.dtype): 1433 return self.categories.take(self._codes, fill_value=NaT) 1434 elif is_integer_dtype(self.categories) and -1 in self._codes: 1435 return self.categories.astype("object").take(self._codes, fill_value=np.nan) 1436 return np.array(self) 1437 1438 def check_for_ordered(self, op): 1439 """ assert that we are ordered """ 1440 if not self.ordered: 1441 raise TypeError( 1442 f"Categorical is not ordered for operation {op}\n" 1443 "you can use .as_ordered() to change the " 1444 "Categorical to an ordered one\n" 1445 ) 1446 1447 def argsort(self, ascending=True, kind="quicksort", **kwargs): 1448 """ 1449 Return the indices that would sort the Categorical. 1450 1451 .. versionchanged:: 0.25.0 1452 1453 Changed to sort missing values at the end. 1454 1455 Parameters 1456 ---------- 1457 ascending : bool, default True 1458 Whether the indices should result in an ascending 1459 or descending sort. 1460 kind : {'quicksort', 'mergesort', 'heapsort'}, optional 1461 Sorting algorithm. 1462 **kwargs: 1463 passed through to :func:`numpy.argsort`. 1464 1465 Returns 1466 ------- 1467 numpy.array 1468 1469 See Also 1470 -------- 1471 numpy.ndarray.argsort 1472 1473 Notes 1474 ----- 1475 While an ordering is applied to the category values, arg-sorting 1476 in this context refers more to organizing and grouping together 1477 based on matching category values. Thus, this function can be 1478 called on an unordered Categorical instance unlike the functions 1479 'Categorical.min' and 'Categorical.max'. 1480 1481 Examples 1482 -------- 1483 >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort() 1484 array([2, 0, 1, 3]) 1485 1486 >>> cat = pd.Categorical(['b', 'b', 'a', 'c'], 1487 ... categories=['c', 'b', 'a'], 1488 ... ordered=True) 1489 >>> cat.argsort() 1490 array([3, 0, 1, 2]) 1491 1492 Missing values are placed at the end 1493 1494 >>> cat = pd.Categorical([2, None, 1]) 1495 >>> cat.argsort() 1496 array([2, 0, 1]) 1497 """ 1498 return super().argsort(ascending=ascending, kind=kind, **kwargs) 1499 1500 def sort_values( 1501 self, inplace: bool = False, ascending: bool = True, na_position: str = "last" 1502 ): 1503 """ 1504 Sort the Categorical by category value returning a new 1505 Categorical by default. 1506 1507 While an ordering is applied to the category values, sorting in this 1508 context refers more to organizing and grouping together based on 1509 matching category values. Thus, this function can be called on an 1510 unordered Categorical instance unlike the functions 'Categorical.min' 1511 and 'Categorical.max'. 1512 1513 Parameters 1514 ---------- 1515 inplace : bool, default False 1516 Do operation in place. 1517 ascending : bool, default True 1518 Order ascending. Passing False orders descending. The 1519 ordering parameter provides the method by which the 1520 category values are organized. 1521 na_position : {'first', 'last'} (optional, default='last') 1522 'first' puts NaNs at the beginning 1523 'last' puts NaNs at the end 1524 1525 Returns 1526 ------- 1527 Categorical or None 1528 1529 See Also 1530 -------- 1531 Categorical.sort 1532 Series.sort_values 1533 1534 Examples 1535 -------- 1536 >>> c = pd.Categorical([1, 2, 2, 1, 5]) 1537 >>> c 1538 [1, 2, 2, 1, 5] 1539 Categories (3, int64): [1, 2, 5] 1540 >>> c.sort_values() 1541 [1, 1, 2, 2, 5] 1542 Categories (3, int64): [1, 2, 5] 1543 >>> c.sort_values(ascending=False) 1544 [5, 2, 2, 1, 1] 1545 Categories (3, int64): [1, 2, 5] 1546 1547 Inplace sorting can be done as well: 1548 1549 >>> c.sort_values(inplace=True) 1550 >>> c 1551 [1, 1, 2, 2, 5] 1552 Categories (3, int64): [1, 2, 5] 1553 >>> 1554 >>> c = pd.Categorical([1, 2, 2, 1, 5]) 1555 1556 'sort_values' behaviour with NaNs. Note that 'na_position' 1557 is independent of the 'ascending' parameter: 1558 1559 >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5]) 1560 >>> c 1561 [NaN, 2, 2, NaN, 5] 1562 Categories (2, int64): [2, 5] 1563 >>> c.sort_values() 1564 [2, 2, 5, NaN, NaN] 1565 Categories (2, int64): [2, 5] 1566 >>> c.sort_values(ascending=False) 1567 [5, 2, 2, NaN, NaN] 1568 Categories (2, int64): [2, 5] 1569 >>> c.sort_values(na_position='first') 1570 [NaN, NaN, 2, 2, 5] 1571 Categories (2, int64): [2, 5] 1572 >>> c.sort_values(ascending=False, na_position='first') 1573 [NaN, NaN, 5, 2, 2] 1574 Categories (2, int64): [2, 5] 1575 """ 1576 inplace = validate_bool_kwarg(inplace, "inplace") 1577 if na_position not in ["last", "first"]: 1578 raise ValueError(f"invalid na_position: {repr(na_position)}") 1579 1580 sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) 1581 1582 if inplace: 1583 self._codes[:] = self._codes[sorted_idx] 1584 else: 1585 codes = self._codes[sorted_idx] 1586 return self._from_backing_data(codes) 1587 1588 def _values_for_rank(self): 1589 """ 1590 For correctly ranking ordered categorical data. See GH#15420 1591 1592 Ordered categorical data should be ranked on the basis of 1593 codes with -1 translated to NaN. 1594 1595 Returns 1596 ------- 1597 numpy.array 1598 1599 """ 1600 from pandas import Series 1601 1602 if self.ordered: 1603 values = self.codes 1604 mask = values == -1 1605 if mask.any(): 1606 values = values.astype("float64") 1607 values[mask] = np.nan 1608 elif self.categories.is_numeric(): 1609 values = np.array(self) 1610 else: 1611 # reorder the categories (so rank can use the float codes) 1612 # instead of passing an object array to rank 1613 values = np.array( 1614 self.rename_categories(Series(self.categories).rank().values) 1615 ) 1616 return values 1617 1618 def view(self, dtype=None): 1619 if dtype is not None: 1620 raise NotImplementedError(dtype) 1621 return self._from_backing_data(self._ndarray) 1622 1623 def to_dense(self): 1624 """ 1625 Return my 'dense' representation 1626 1627 For internal compatibility with numpy arrays. 1628 1629 Returns 1630 ------- 1631 dense : array 1632 """ 1633 warn( 1634 "Categorical.to_dense is deprecated and will be removed in " 1635 "a future version. Use np.asarray(cat) instead.", 1636 FutureWarning, 1637 stacklevel=2, 1638 ) 1639 return np.asarray(self) 1640 1641 def fillna(self, value=None, method=None, limit=None): 1642 """ 1643 Fill NA/NaN values using the specified method. 1644 1645 Parameters 1646 ---------- 1647 value : scalar, dict, Series 1648 If a scalar value is passed it is used to fill all missing values. 1649 Alternatively, a Series or dict can be used to fill in different 1650 values for each index. The value should not be a list. The 1651 value(s) passed should either be in the categories or should be 1652 NaN. 1653 method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None 1654 Method to use for filling holes in reindexed Series 1655 pad / ffill: propagate last valid observation forward to next valid 1656 backfill / bfill: use NEXT valid observation to fill gap 1657 limit : int, default None 1658 (Not implemented yet for Categorical!) 1659 If method is specified, this is the maximum number of consecutive 1660 NaN values to forward/backward fill. In other words, if there is 1661 a gap with more than this number of consecutive NaNs, it will only 1662 be partially filled. If method is not specified, this is the 1663 maximum number of entries along the entire axis where NaNs will be 1664 filled. 1665 1666 Returns 1667 ------- 1668 filled : Categorical with NA/NaN filled 1669 """ 1670 value, method = validate_fillna_kwargs( 1671 value, method, validate_scalar_dict_value=False 1672 ) 1673 value = extract_array(value, extract_numpy=True) 1674 1675 if value is None: 1676 value = np.nan 1677 if limit is not None: 1678 raise NotImplementedError( 1679 "specifying a limit for fillna has not been implemented yet" 1680 ) 1681 1682 if method is not None: 1683 # pad / bfill 1684 1685 # TODO: dispatch when self.categories is EA-dtype 1686 values = np.asarray(self).reshape(-1, len(self)) 1687 values = interpolate_2d(values, method, 0, None).astype( 1688 self.categories.dtype 1689 )[0] 1690 codes = _get_codes_for_values(values, self.categories) 1691 1692 else: 1693 # We copy even if there is nothing to fill 1694 codes = self._ndarray.copy() 1695 mask = self.isna() 1696 1697 new_codes = self._validate_setitem_value(value) 1698 1699 if isinstance(value, (np.ndarray, Categorical)): 1700 # We get ndarray or Categorical if called via Series.fillna, 1701 # where it will unwrap another aligned Series before getting here 1702 codes[mask] = new_codes[mask] 1703 else: 1704 codes[mask] = new_codes 1705 1706 return self._from_backing_data(codes) 1707 1708 # ------------------------------------------------------------------ 1709 # NDArrayBackedExtensionArray compat 1710 1711 @property 1712 def _ndarray(self) -> np.ndarray: 1713 return self._codes 1714 1715 def _from_backing_data(self, arr: np.ndarray) -> "Categorical": 1716 return self._constructor(arr, dtype=self.dtype, fastpath=True) 1717 1718 def _box_func(self, i: int): 1719 if i == -1: 1720 return np.NaN 1721 return self.categories[i] 1722 1723 def _unbox_scalar(self, key) -> int: 1724 # searchsorted is very performance sensitive. By converting codes 1725 # to same dtype as self.codes, we get much faster performance. 1726 code = self.categories.get_loc(key) 1727 code = self._codes.dtype.type(code) 1728 return code 1729 1730 # ------------------------------------------------------------------ 1731 1732 def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): 1733 # GH#27745 deprecate alias that other EAs dont have 1734 warn( 1735 "Categorical.take_nd is deprecated, use Categorical.take instead", 1736 FutureWarning, 1737 stacklevel=2, 1738 ) 1739 return self.take(indexer, allow_fill=allow_fill, fill_value=fill_value) 1740 1741 def __iter__(self): 1742 """ 1743 Returns an Iterator over the values of this Categorical. 1744 """ 1745 return iter(self._internal_get_values().tolist()) 1746 1747 def __contains__(self, key) -> bool: 1748 """ 1749 Returns True if `key` is in this Categorical. 1750 """ 1751 # if key is a NaN, check if any NaN is in self. 1752 if is_valid_nat_for_dtype(key, self.categories.dtype): 1753 return self.isna().any() 1754 1755 return contains(self, key, container=self._codes) 1756 1757 # ------------------------------------------------------------------ 1758 # Rendering Methods 1759 1760 def _formatter(self, boxed=False): 1761 # Defer to CategoricalFormatter's formatter. 1762 return None 1763 1764 def _tidy_repr(self, max_vals=10, footer=True) -> str: 1765 """ 1766 a short repr displaying only max_vals and an optional (but default 1767 footer) 1768 """ 1769 num = max_vals // 2 1770 head = self[:num]._get_repr(length=False, footer=False) 1771 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) 1772 1773 result = f"{head[:-1]}, ..., {tail[1:]}" 1774 if footer: 1775 result = f"{result}\n{self._repr_footer()}" 1776 1777 return str(result) 1778 1779 def _repr_categories(self): 1780 """ 1781 return the base repr for the categories 1782 """ 1783 max_categories = ( 1784 10 1785 if get_option("display.max_categories") == 0 1786 else get_option("display.max_categories") 1787 ) 1788 from pandas.io.formats import format as fmt 1789 1790 format_array = partial( 1791 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC 1792 ) 1793 if len(self.categories) > max_categories: 1794 num = max_categories // 2 1795 head = format_array(self.categories[:num]) 1796 tail = format_array(self.categories[-num:]) 1797 category_strs = head + ["..."] + tail 1798 else: 1799 category_strs = format_array(self.categories) 1800 1801 # Strip all leading spaces, which format_array adds for columns... 1802 category_strs = [x.strip() for x in category_strs] 1803 return category_strs 1804 1805 def _repr_categories_info(self) -> str: 1806 """ 1807 Returns a string representation of the footer. 1808 """ 1809 category_strs = self._repr_categories() 1810 dtype = str(self.categories.dtype) 1811 levheader = f"Categories ({len(self.categories)}, {dtype}): " 1812 width, height = get_terminal_size() 1813 max_width = get_option("display.width") or width 1814 if console.in_ipython_frontend(): 1815 # 0 = no breaks 1816 max_width = 0 1817 levstring = "" 1818 start = True 1819 cur_col_len = len(levheader) # header 1820 sep_len, sep = (3, " < ") if self.ordered else (2, ", ") 1821 linesep = sep.rstrip() + "\n" # remove whitespace 1822 for val in category_strs: 1823 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: 1824 levstring += linesep + (" " * (len(levheader) + 1)) 1825 cur_col_len = len(levheader) + 1 # header + a whitespace 1826 elif not start: 1827 levstring += sep 1828 cur_col_len += len(val) 1829 levstring += val 1830 start = False 1831 # replace to simple save space by 1832 return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" 1833 1834 def _repr_footer(self) -> str: 1835 info = self._repr_categories_info() 1836 return f"Length: {len(self)}\n{info}" 1837 1838 def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str: 1839 from pandas.io.formats import format as fmt 1840 1841 formatter = fmt.CategoricalFormatter( 1842 self, length=length, na_rep=na_rep, footer=footer 1843 ) 1844 result = formatter.to_string() 1845 return str(result) 1846 1847 def __repr__(self) -> str: 1848 """ 1849 String representation. 1850 """ 1851 _maxlen = 10 1852 if len(self._codes) > _maxlen: 1853 result = self._tidy_repr(_maxlen) 1854 elif len(self._codes) > 0: 1855 result = self._get_repr(length=len(self) > _maxlen) 1856 else: 1857 msg = self._get_repr(length=False, footer=True).replace("\n", ", ") 1858 result = f"[], {msg}" 1859 1860 return result 1861 1862 # ------------------------------------------------------------------ 1863 1864 def __getitem__(self, key): 1865 """ 1866 Return an item. 1867 """ 1868 result = super().__getitem__(key) 1869 if getattr(result, "ndim", 0) > 1: 1870 result = result._ndarray 1871 deprecate_ndim_indexing(result) 1872 return result 1873 1874 def _validate_setitem_value(self, value): 1875 value = extract_array(value, extract_numpy=True) 1876 1877 # require identical categories set 1878 if isinstance(value, Categorical): 1879 if not is_dtype_equal(self.dtype, value.dtype): 1880 raise ValueError( 1881 "Cannot set a Categorical with another, " 1882 "without identical categories" 1883 ) 1884 # is_dtype_equal implies categories_match_up_to_permutation 1885 value = self._encode_with_my_categories(value) 1886 return value._codes 1887 1888 # wrap scalars and hashable-listlikes in list 1889 rvalue = value if not is_hashable(value) else [value] 1890 1891 from pandas import Index 1892 1893 to_add = Index(rvalue).difference(self.categories) 1894 1895 # no assignments of values not in categories, but it's always ok to set 1896 # something to np.nan 1897 if len(to_add) and not isna(to_add).all(): 1898 raise ValueError( 1899 "Cannot setitem on a Categorical with a new " 1900 "category, set the categories first" 1901 ) 1902 1903 codes = self.categories.get_indexer(rvalue) 1904 return codes.astype(self._ndarray.dtype, copy=False) 1905 1906 def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: 1907 """ 1908 Compute the inverse of a categorical, returning 1909 a dict of categories -> indexers. 1910 1911 *This is an internal function* 1912 1913 Returns 1914 ------- 1915 dict of categories -> indexers 1916 1917 Examples 1918 -------- 1919 >>> c = pd.Categorical(list('aabca')) 1920 >>> c 1921 ['a', 'a', 'b', 'c', 'a'] 1922 Categories (3, object): ['a', 'b', 'c'] 1923 >>> c.categories 1924 Index(['a', 'b', 'c'], dtype='object') 1925 >>> c.codes 1926 array([0, 0, 1, 2, 0], dtype=int8) 1927 >>> c._reverse_indexer() 1928 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} 1929 1930 """ 1931 categories = self.categories 1932 r, counts = libalgos.groupsort_indexer( 1933 self.codes.astype("int64"), categories.size 1934 ) 1935 counts = counts.cumsum() 1936 _result = (r[start:end] for start, end in zip(counts, counts[1:])) 1937 return dict(zip(categories, _result)) 1938 1939 # ------------------------------------------------------------------ 1940 # Reductions 1941 1942 @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") 1943 def min(self, *, skipna=True, **kwargs): 1944 """ 1945 The minimum value of the object. 1946 1947 Only ordered `Categoricals` have a minimum! 1948 1949 .. versionchanged:: 1.0.0 1950 1951 Returns an NA value on empty arrays 1952 1953 Raises 1954 ------ 1955 TypeError 1956 If the `Categorical` is not `ordered`. 1957 1958 Returns 1959 ------- 1960 min : the minimum of this `Categorical` 1961 """ 1962 nv.validate_minmax_axis(kwargs.get("axis", 0)) 1963 nv.validate_min((), kwargs) 1964 self.check_for_ordered("min") 1965 1966 if not len(self._codes): 1967 return self.dtype.na_value 1968 1969 good = self._codes != -1 1970 if not good.all(): 1971 if skipna and good.any(): 1972 pointer = self._codes[good].min() 1973 else: 1974 return np.nan 1975 else: 1976 pointer = self._codes.min() 1977 return self._wrap_reduction_result(None, pointer) 1978 1979 @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") 1980 def max(self, *, skipna=True, **kwargs): 1981 """ 1982 The maximum value of the object. 1983 1984 Only ordered `Categoricals` have a maximum! 1985 1986 .. versionchanged:: 1.0.0 1987 1988 Returns an NA value on empty arrays 1989 1990 Raises 1991 ------ 1992 TypeError 1993 If the `Categorical` is not `ordered`. 1994 1995 Returns 1996 ------- 1997 max : the maximum of this `Categorical` 1998 """ 1999 nv.validate_minmax_axis(kwargs.get("axis", 0)) 2000 nv.validate_max((), kwargs) 2001 self.check_for_ordered("max") 2002 2003 if not len(self._codes): 2004 return self.dtype.na_value 2005 2006 good = self._codes != -1 2007 if not good.all(): 2008 if skipna and good.any(): 2009 pointer = self._codes[good].max() 2010 else: 2011 return np.nan 2012 else: 2013 pointer = self._codes.max() 2014 return self._wrap_reduction_result(None, pointer) 2015 2016 def mode(self, dropna=True): 2017 """ 2018 Returns the mode(s) of the Categorical. 2019 2020 Always returns `Categorical` even if only one value. 2021 2022 Parameters 2023 ---------- 2024 dropna : bool, default True 2025 Don't consider counts of NaN/NaT. 2026 2027 .. versionadded:: 0.24.0 2028 2029 Returns 2030 ------- 2031 modes : `Categorical` (sorted) 2032 """ 2033 codes = self._codes 2034 if dropna: 2035 good = self._codes != -1 2036 codes = self._codes[good] 2037 codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) 2038 return self._from_backing_data(codes) 2039 2040 # ------------------------------------------------------------------ 2041 # ExtensionArray Interface 2042 2043 def unique(self): 2044 """ 2045 Return the ``Categorical`` which ``categories`` and ``codes`` are 2046 unique. Unused categories are NOT returned. 2047 2048 - unordered category: values and categories are sorted by appearance 2049 order. 2050 - ordered category: values are sorted by appearance order, categories 2051 keeps existing order. 2052 2053 Returns 2054 ------- 2055 unique values : ``Categorical`` 2056 2057 See Also 2058 -------- 2059 pandas.unique 2060 CategoricalIndex.unique 2061 Series.unique : Return unique values of Series object. 2062 2063 Examples 2064 -------- 2065 An unordered Categorical will return categories in the 2066 order of appearance. 2067 2068 >>> pd.Categorical(list("baabc")).unique() 2069 ['b', 'a', 'c'] 2070 Categories (3, object): ['b', 'a', 'c'] 2071 2072 >>> pd.Categorical(list("baabc"), categories=list("abc")).unique() 2073 ['b', 'a', 'c'] 2074 Categories (3, object): ['b', 'a', 'c'] 2075 2076 An ordered Categorical preserves the category ordering. 2077 2078 >>> pd.Categorical( 2079 ... list("baabc"), categories=list("abc"), ordered=True 2080 ... ).unique() 2081 ['b', 'a', 'c'] 2082 Categories (3, object): ['a' < 'b' < 'c'] 2083 """ 2084 # unlike np.unique, unique1d does not sort 2085 unique_codes = unique1d(self.codes) 2086 cat = self.copy() 2087 2088 # keep nan in codes 2089 cat._codes = unique_codes 2090 2091 # exclude nan from indexer for categories 2092 take_codes = unique_codes[unique_codes != -1] 2093 if self.ordered: 2094 take_codes = np.sort(take_codes) 2095 return cat.set_categories(cat.categories.take(take_codes)) 2096 2097 def _values_for_factorize(self): 2098 return self._ndarray, -1 2099 2100 @classmethod 2101 def _from_factorized(cls, uniques, original): 2102 return original._constructor( 2103 original.categories.take(uniques), dtype=original.dtype 2104 ) 2105 2106 def equals(self, other: object) -> bool: 2107 """ 2108 Returns True if categorical arrays are equal. 2109 2110 Parameters 2111 ---------- 2112 other : `Categorical` 2113 2114 Returns 2115 ------- 2116 bool 2117 """ 2118 if not isinstance(other, Categorical): 2119 return False 2120 elif self._categories_match_up_to_permutation(other): 2121 other = self._encode_with_my_categories(other) 2122 return np.array_equal(self._codes, other._codes) 2123 return False 2124 2125 @classmethod 2126 def _concat_same_type( 2127 cls: Type[CategoricalT], to_concat: Sequence[CategoricalT], axis: int = 0 2128 ) -> CategoricalT: 2129 from pandas.core.dtypes.concat import union_categoricals 2130 2131 return union_categoricals(to_concat) 2132 2133 # ------------------------------------------------------------------ 2134 2135 def _encode_with_my_categories(self, other: "Categorical") -> "Categorical": 2136 """ 2137 Re-encode another categorical using this Categorical's categories. 2138 2139 Notes 2140 ----- 2141 This assumes we have already checked 2142 self._categories_match_up_to_permutation(other). 2143 """ 2144 # Indexing on codes is more efficient if categories are the same, 2145 # so we can apply some optimizations based on the degree of 2146 # dtype-matching. 2147 codes = recode_for_categories( 2148 other.codes, other.categories, self.categories, copy=False 2149 ) 2150 return self._from_backing_data(codes) 2151 2152 def _categories_match_up_to_permutation(self, other: "Categorical") -> bool: 2153 """ 2154 Returns True if categoricals are the same dtype 2155 same categories, and same ordered 2156 2157 Parameters 2158 ---------- 2159 other : Categorical 2160 2161 Returns 2162 ------- 2163 bool 2164 """ 2165 return hash(self.dtype) == hash(other.dtype) 2166 2167 def is_dtype_equal(self, other) -> bool: 2168 warn( 2169 "Categorical.is_dtype_equal is deprecated and will be removed " 2170 "in a future version", 2171 FutureWarning, 2172 stacklevel=2, 2173 ) 2174 try: 2175 return self._categories_match_up_to_permutation(other) 2176 except (AttributeError, TypeError): 2177 return False 2178 2179 def describe(self): 2180 """ 2181 Describes this Categorical 2182 2183 Returns 2184 ------- 2185 description: `DataFrame` 2186 A dataframe with frequency and counts by category. 2187 """ 2188 counts = self.value_counts(dropna=False) 2189 freqs = counts / float(counts.sum()) 2190 2191 from pandas.core.reshape.concat import concat 2192 2193 result = concat([counts, freqs], axis=1) 2194 result.columns = ["counts", "freqs"] 2195 result.index.name = "categories" 2196 2197 return result 2198 2199 def isin(self, values) -> np.ndarray: 2200 """ 2201 Check whether `values` are contained in Categorical. 2202 2203 Return a boolean NumPy Array showing whether each element in 2204 the Categorical matches an element in the passed sequence of 2205 `values` exactly. 2206 2207 Parameters 2208 ---------- 2209 values : set or list-like 2210 The sequence of values to test. Passing in a single string will 2211 raise a ``TypeError``. Instead, turn a single string into a 2212 list of one element. 2213 2214 Returns 2215 ------- 2216 isin : numpy.ndarray (bool dtype) 2217 2218 Raises 2219 ------ 2220 TypeError 2221 * If `values` is not a set or list-like 2222 2223 See Also 2224 -------- 2225 pandas.Series.isin : Equivalent method on Series. 2226 2227 Examples 2228 -------- 2229 >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', 2230 ... 'hippo']) 2231 >>> s.isin(['cow', 'lama']) 2232 array([ True, True, True, False, True, False]) 2233 2234 Passing a single string as ``s.isin('lama')`` will raise an error. Use 2235 a list of one element instead: 2236 2237 >>> s.isin(['lama']) 2238 array([ True, False, True, False, True, False]) 2239 """ 2240 if not is_list_like(values): 2241 values_type = type(values).__name__ 2242 raise TypeError( 2243 "only list-like objects are allowed to be passed " 2244 f"to isin(), you passed a [{values_type}]" 2245 ) 2246 values = sanitize_array(values, None, None) 2247 null_mask = np.asarray(isna(values)) 2248 code_values = self.categories.get_indexer(values) 2249 code_values = code_values[null_mask | (code_values >= 0)] 2250 return algorithms.isin(self.codes, code_values) 2251 2252 def replace(self, to_replace, value, inplace: bool = False): 2253 """ 2254 Replaces all instances of one value with another 2255 2256 Parameters 2257 ---------- 2258 to_replace: object 2259 The value to be replaced 2260 2261 value: object 2262 The value to replace it with 2263 2264 inplace: bool 2265 Whether the operation is done in-place 2266 2267 Returns 2268 ------- 2269 None if inplace is True, otherwise the new Categorical after replacement 2270 2271 2272 Examples 2273 -------- 2274 >>> s = pd.Categorical([1, 2, 1, 3]) 2275 >>> s.replace(1, 3) 2276 [3, 2, 3, 3] 2277 Categories (2, int64): [2, 3] 2278 """ 2279 inplace = validate_bool_kwarg(inplace, "inplace") 2280 cat = self if inplace else self.copy() 2281 2282 # build a dict of (to replace -> value) pairs 2283 if is_list_like(to_replace): 2284 # if to_replace is list-like and value is scalar 2285 replace_dict = {replace_value: value for replace_value in to_replace} 2286 else: 2287 # if both to_replace and value are scalar 2288 replace_dict = {to_replace: value} 2289 2290 # other cases, like if both to_replace and value are list-like or if 2291 # to_replace is a dict, are handled separately in NDFrame 2292 for replace_value, new_value in replace_dict.items(): 2293 if new_value == replace_value: 2294 continue 2295 if replace_value in cat.categories: 2296 if isna(new_value): 2297 cat.remove_categories(replace_value, inplace=True) 2298 continue 2299 categories = cat.categories.tolist() 2300 index = categories.index(replace_value) 2301 if new_value in cat.categories: 2302 value_index = categories.index(new_value) 2303 cat._codes[cat._codes == index] = value_index 2304 cat.remove_categories(replace_value, inplace=True) 2305 else: 2306 categories[index] = new_value 2307 cat.rename_categories(categories, inplace=True) 2308 if not inplace: 2309 return cat 2310 2311 # ------------------------------------------------------------------------ 2312 # String methods interface 2313 def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): 2314 # Optimization to apply the callable `f` to the categories once 2315 # and rebuild the result by `take`ing from the result with the codes. 2316 # Returns the same type as the object-dtype implementation though. 2317 from pandas.core.arrays import PandasArray 2318 2319 categories = self.categories 2320 codes = self.codes 2321 result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) 2322 return take_1d(result, codes, fill_value=na_value) 2323 2324 def _str_get_dummies(self, sep="|"): 2325 # sep may not be in categories. Just bail on this. 2326 from pandas.core.arrays import PandasArray 2327 2328 return PandasArray(self.astype(str))._str_get_dummies(sep) 2329 2330 2331# The Series.cat accessor 2332 2333 2334@delegate_names( 2335 delegate=Categorical, accessors=["categories", "ordered"], typ="property" 2336) 2337@delegate_names( 2338 delegate=Categorical, 2339 accessors=[ 2340 "rename_categories", 2341 "reorder_categories", 2342 "add_categories", 2343 "remove_categories", 2344 "remove_unused_categories", 2345 "set_categories", 2346 "as_ordered", 2347 "as_unordered", 2348 ], 2349 typ="method", 2350) 2351class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 2352 """ 2353 Accessor object for categorical properties of the Series values. 2354 2355 Be aware that assigning to `categories` is a inplace operation, while all 2356 methods return new categorical data per default (but can be called with 2357 `inplace=True`). 2358 2359 Parameters 2360 ---------- 2361 data : Series or CategoricalIndex 2362 2363 Examples 2364 -------- 2365 >>> s = pd.Series(list("abbccc")).astype("category") 2366 >>> s 2367 0 a 2368 1 b 2369 2 b 2370 3 c 2371 4 c 2372 5 c 2373 dtype: category 2374 Categories (3, object): ['a', 'b', 'c'] 2375 2376 >>> s.cat.categories 2377 Index(['a', 'b', 'c'], dtype='object') 2378 2379 >>> s.cat.rename_categories(list("cba")) 2380 0 c 2381 1 b 2382 2 b 2383 3 a 2384 4 a 2385 5 a 2386 dtype: category 2387 Categories (3, object): ['c', 'b', 'a'] 2388 2389 >>> s.cat.reorder_categories(list("cba")) 2390 0 a 2391 1 b 2392 2 b 2393 3 c 2394 4 c 2395 5 c 2396 dtype: category 2397 Categories (3, object): ['c', 'b', 'a'] 2398 2399 >>> s.cat.add_categories(["d", "e"]) 2400 0 a 2401 1 b 2402 2 b 2403 3 c 2404 4 c 2405 5 c 2406 dtype: category 2407 Categories (5, object): ['a', 'b', 'c', 'd', 'e'] 2408 2409 >>> s.cat.remove_categories(["a", "c"]) 2410 0 NaN 2411 1 b 2412 2 b 2413 3 NaN 2414 4 NaN 2415 5 NaN 2416 dtype: category 2417 Categories (1, object): ['b'] 2418 2419 >>> s1 = s.cat.add_categories(["d", "e"]) 2420 >>> s1.cat.remove_unused_categories() 2421 0 a 2422 1 b 2423 2 b 2424 3 c 2425 4 c 2426 5 c 2427 dtype: category 2428 Categories (3, object): ['a', 'b', 'c'] 2429 2430 >>> s.cat.set_categories(list("abcde")) 2431 0 a 2432 1 b 2433 2 b 2434 3 c 2435 4 c 2436 5 c 2437 dtype: category 2438 Categories (5, object): ['a', 'b', 'c', 'd', 'e'] 2439 2440 >>> s.cat.as_ordered() 2441 0 a 2442 1 b 2443 2 b 2444 3 c 2445 4 c 2446 5 c 2447 dtype: category 2448 Categories (3, object): ['a' < 'b' < 'c'] 2449 2450 >>> s.cat.as_unordered() 2451 0 a 2452 1 b 2453 2 b 2454 3 c 2455 4 c 2456 5 c 2457 dtype: category 2458 Categories (3, object): ['a', 'b', 'c'] 2459 """ 2460 2461 def __init__(self, data): 2462 self._validate(data) 2463 self._parent = data.values 2464 self._index = data.index 2465 self._name = data.name 2466 self._freeze() 2467 2468 @staticmethod 2469 def _validate(data): 2470 if not is_categorical_dtype(data.dtype): 2471 raise AttributeError("Can only use .cat accessor with a 'category' dtype") 2472 2473 def _delegate_property_get(self, name): 2474 return getattr(self._parent, name) 2475 2476 def _delegate_property_set(self, name, new_values): 2477 return setattr(self._parent, name, new_values) 2478 2479 @property 2480 def codes(self): 2481 """ 2482 Return Series of codes as well as the index. 2483 """ 2484 from pandas import Series 2485 2486 return Series(self._parent.codes, index=self._index) 2487 2488 def _delegate_method(self, name, *args, **kwargs): 2489 from pandas import Series 2490 2491 method = getattr(self._parent, name) 2492 res = method(*args, **kwargs) 2493 if res is not None: 2494 return Series(res, index=self._index, name=self._name) 2495 2496 2497# utility routines 2498 2499 2500def _get_codes_for_values(values, categories) -> np.ndarray: 2501 """ 2502 utility routine to turn values into codes given the specified categories 2503 2504 If `values` is known to be a Categorical, use recode_for_categories instead. 2505 """ 2506 dtype_equal = is_dtype_equal(values.dtype, categories.dtype) 2507 2508 if is_extension_array_dtype(categories.dtype) and is_object_dtype(values): 2509 # Support inferring the correct extension dtype from an array of 2510 # scalar objects. e.g. 2511 # Categorical(array[Period, Period], categories=PeriodIndex(...)) 2512 cls = categories.dtype.construct_array_type() 2513 values = maybe_cast_to_extension_array(cls, values) 2514 if not isinstance(values, cls): 2515 # exception raised in _from_sequence 2516 values = ensure_object(values) 2517 categories = ensure_object(categories) 2518 elif not dtype_equal: 2519 values = ensure_object(values) 2520 categories = ensure_object(categories) 2521 2522 if isinstance(categories, ABCIndexClass): 2523 return coerce_indexer_dtype(categories.get_indexer_for(values), categories) 2524 2525 # Only hit here when we've already coerced to object dtypee. 2526 2527 hash_klass, vals = get_data_algo(values) 2528 _, cats = get_data_algo(categories) 2529 t = hash_klass(len(cats)) 2530 t.map_locations(cats) 2531 return coerce_indexer_dtype(t.lookup(vals), cats) 2532 2533 2534def recode_for_categories( 2535 codes: np.ndarray, old_categories, new_categories, copy: bool = True 2536) -> np.ndarray: 2537 """ 2538 Convert a set of codes for to a new set of categories 2539 2540 Parameters 2541 ---------- 2542 codes : np.ndarray 2543 old_categories, new_categories : Index 2544 copy: bool, default True 2545 Whether to copy if the codes are unchanged. 2546 2547 Returns 2548 ------- 2549 new_codes : np.ndarray[np.int64] 2550 2551 Examples 2552 -------- 2553 >>> old_cat = pd.Index(['b', 'a', 'c']) 2554 >>> new_cat = pd.Index(['a', 'b']) 2555 >>> codes = np.array([0, 1, 1, 2]) 2556 >>> recode_for_categories(codes, old_cat, new_cat) 2557 array([ 1, 0, 0, -1], dtype=int8) 2558 """ 2559 if len(old_categories) == 0: 2560 # All null anyway, so just retain the nulls 2561 if copy: 2562 return codes.copy() 2563 return codes 2564 elif new_categories.equals(old_categories): 2565 # Same categories, so no need to actually recode 2566 if copy: 2567 return codes.copy() 2568 return codes 2569 2570 indexer = coerce_indexer_dtype( 2571 new_categories.get_indexer(old_categories), new_categories 2572 ) 2573 new_codes = take_1d(indexer, codes, fill_value=-1) 2574 return new_codes 2575 2576 2577def factorize_from_iterable(values): 2578 """ 2579 Factorize an input `values` into `categories` and `codes`. Preserves 2580 categorical dtype in `categories`. 2581 2582 *This is an internal function* 2583 2584 Parameters 2585 ---------- 2586 values : list-like 2587 2588 Returns 2589 ------- 2590 codes : ndarray 2591 categories : Index 2592 If `values` has a categorical dtype, then `categories` is 2593 a CategoricalIndex keeping the categories and order of `values`. 2594 """ 2595 if not is_list_like(values): 2596 raise TypeError("Input must be list-like") 2597 2598 if is_categorical_dtype(values): 2599 values = extract_array(values) 2600 # The Categorical we want to build has the same categories 2601 # as values but its codes are by def [0, ..., len(n_categories) - 1] 2602 cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) 2603 categories = Categorical.from_codes(cat_codes, dtype=values.dtype) 2604 codes = values.codes 2605 else: 2606 # The value of ordered is irrelevant since we don't use cat as such, 2607 # but only the resulting categories, the order of which is independent 2608 # from ordered. Set ordered to False as default. See GH #15457 2609 cat = Categorical(values, ordered=False) 2610 categories = cat.categories 2611 codes = cat.codes 2612 return codes, categories 2613 2614 2615def factorize_from_iterables(iterables): 2616 """ 2617 A higher-level wrapper over `factorize_from_iterable`. 2618 2619 *This is an internal function* 2620 2621 Parameters 2622 ---------- 2623 iterables : list-like of list-likes 2624 2625 Returns 2626 ------- 2627 codes_list : list of ndarrays 2628 categories_list : list of Indexes 2629 2630 Notes 2631 ----- 2632 See `factorize_from_iterable` for more info. 2633 """ 2634 if len(iterables) == 0: 2635 # For consistency, it should return a list of 2 lists. 2636 return [[], []] 2637 return map(list, zip(*(factorize_from_iterable(it) for it in iterables))) 2638