"""
Generic data algorithms. This module is experimental at the moment and not
intended for public consumption
"""
from __future__ import annotations

import operator
from textwrap import dedent
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
from warnings import catch_warnings, simplefilter, warn

import numpy as np

from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib
from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion
from pandas.util._decorators import doc

from pandas.core.dtypes.cast import (
    construct_1d_object_array_from_listlike,
    infer_dtype_from_array,
    maybe_promote,
)
from pandas.core.dtypes.common import (
    ensure_float64,
    ensure_int64,
    ensure_object,
    ensure_platform_int,
    ensure_uint64,
    is_array_like,
    is_bool_dtype,
    is_categorical_dtype,
    is_complex_dtype,
    is_datetime64_dtype,
    is_datetime64_ns_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_numeric_dtype,
    is_object_dtype,
    is_period_dtype,
    is_scalar,
    is_signed_integer_dtype,
    is_timedelta64_dtype,
    is_unsigned_integer_dtype,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.generic import (
    ABCDatetimeArray,
    ABCExtensionArray,
    ABCIndexClass,
    ABCMultiIndex,
    ABCRangeIndex,
    ABCSeries,
    ABCTimedeltaArray,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype

from pandas.core.construction import array, extract_array
from pandas.core.indexers import validate_indices

if TYPE_CHECKING:
    from pandas import Categorical, DataFrame, Index, Series

# Shared docstring templates, filled in by other modules at import time.
_shared_docs: Dict[str, str] = {}


# --------------- #
# dtype access    #
# --------------- #
def _ensure_data(
    values: ArrayLike, dtype: Optional[DtypeObj] = None
) -> Tuple[np.ndarray, DtypeObj]:
    """
    Routine to ensure that our data is of the correct
    input dtype for lower-level routines.

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint64 (TODO this should be uint8)
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : array-like
    dtype : pandas_dtype, optional
        coerce to this dtype

    Returns
    -------
    values : ndarray
        The coerced values, suitable for the cython/hashtable layer.
    pandas_dtype : np.dtype or ExtensionDtype
        The "logical" dtype to reconstruct with (see _reconstruct_data).
    """

    if dtype is not None:
        # We only have non-None dtype when called from `isin`, and
        # both Datetimelike and Categorical dispatch before getting here.
        assert not needs_i8_conversion(dtype)
        assert not is_categorical_dtype(dtype)

    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    # we check some simple dtypes first
    if is_object_dtype(dtype):
        return ensure_object(np.asarray(values)), np.dtype("object")
    elif is_object_dtype(values) and dtype is None:
        return ensure_object(np.asarray(values)), np.dtype("object")

    try:
        if is_bool_dtype(values) or is_bool_dtype(dtype):
            # we are actually coercing to uint64
            # until our algos support uint8 directly (see TODO)
            return np.asarray(values).astype("uint64"), np.dtype("bool")
        elif is_signed_integer_dtype(values) or is_signed_integer_dtype(dtype):
            return ensure_int64(values), np.dtype("int64")
        elif is_unsigned_integer_dtype(values) or is_unsigned_integer_dtype(dtype):
            return ensure_uint64(values), np.dtype("uint64")
        elif is_float_dtype(values) or is_float_dtype(dtype):
            return ensure_float64(values), np.dtype("float64")
        elif is_complex_dtype(values) or is_complex_dtype(dtype):

            # ignore the fact that we are casting to float
            # which discards complex parts
            with catch_warnings():
                simplefilter("ignore", np.ComplexWarning)
                values = ensure_float64(values)
            return values, np.dtype("float64")

    except (TypeError, ValueError, OverflowError):
        # if we are trying to coerce to a dtype
        # and it is incompatible this will fall through to here
        return ensure_object(values), np.dtype("object")

    # datetimelike
    if needs_i8_conversion(values.dtype) or needs_i8_conversion(dtype):
        if is_period_dtype(values.dtype) or is_period_dtype(dtype):
            from pandas import PeriodIndex

            values = PeriodIndex(values)._data
            dtype = values.dtype
        elif is_timedelta64_dtype(values.dtype) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex

            values = TimedeltaIndex(values)._data
            dtype = values.dtype
        else:
            # Datetime
            if values.ndim > 1 and is_datetime64_ns_dtype(values.dtype):
                # Avoid calling the DatetimeIndex constructor as it is 1D only
                # Note: this is reached by DataFrame.rank calls GH#27027
                # TODO(EA2D): special case not needed with 2D EAs
                asi8 = values.view("i8")
                dtype = values.dtype
                return asi8, dtype

            from pandas import DatetimeIndex

            values = DatetimeIndex(values)._data
            dtype = values.dtype

        # datetimelike arrays are hashed/compared via their int64 view
        return values.asi8, dtype

    elif is_categorical_dtype(values.dtype) and (
        is_categorical_dtype(dtype) or dtype is None
    ):
        values = cast("Categorical", values)
        values = values.codes
        dtype = pandas_dtype("category")

        # we are actually coercing to int64
        # until our algos support int* directly (not all do)
        values = ensure_int64(values)

        return values, dtype

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values), np.dtype("object")


def _reconstruct_data(
    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
    """
    Reverse of _ensure_data: box low-level values back into `dtype`.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
        The logical dtype returned by _ensure_data.
    original : AnyArrayLike
        The object originally passed to _ensure_data (used to special-case
        bool Index below).

    Returns
    -------
    ExtensionArray or np.ndarray
    """
    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
        # Catch DatetimeArray/TimedeltaArray
        return values

    if is_extension_array_dtype(dtype):
        cls = dtype.construct_array_type()
        if isinstance(values, cls) and values.dtype == dtype:
            return values

        values = cls._from_sequence(values)
    elif is_bool_dtype(dtype):
        values = values.astype(dtype, copy=False)

        # we only support object dtypes bool Index
        if isinstance(original, ABCIndexClass):
            values = values.astype(object, copy=False)
    elif dtype is not None:
        if is_datetime64_dtype(dtype):
            # normalize any datetime64 unit back to nanoseconds
            dtype = "datetime64[ns]"
        elif is_timedelta64_dtype(dtype):
            dtype = "timedelta64[ns]"

        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values):
    """
    Ensure that we are arraylike (ndarray/Index/Series/EA) if not already.
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "mixed-integer"]:
            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


# Mapping of coerced-dtype name -> cython hashtable class, used by the
# hashtable-based algorithms below (unique, factorize, isin, ...).
_hashtables = {
    "float64": htable.Float64HashTable,
    "uint64": htable.UInt64HashTable,
    "int64": htable.Int64HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values: np.ndarray):
    """
    Select the hashtable class appropriate for `values` after coercion.

    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
        The coerced values to feed into `htable`.
    """
    values, _ = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    htable = _hashtables[ndtype]
    return htable, values


def _get_values_for_rank(values: ArrayLike):
    # Categorical ranks by its own ordering, not by raw codes.
    if is_categorical_dtype(values):
        values = cast("Categorical", values)._values_for_rank()

    values, _ = _ensure_data(values)
    return values
def get_data_algo(values: ArrayLike):
    """
    Return (hashtable class, coerced values) for `values`, via the same
    coercion used for ranking.
    """
    values = _get_values_for_rank(values)

    ndtype = _check_object_for_strings(values)
    # unknown coerced dtypes fall back to the generic object hashtable
    htable = _hashtables.get(ndtype, _hashtables["object"])

    return htable, values


def _check_object_for_strings(values) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
        Dtype name key into ``_hashtables``.
    """
    ndtype = values.dtype.name
    if ndtype == "object":

        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype


# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Hash table-based unique. Uniques are returned in order
    of appearance. This does NOT sort.

    Significantly faster than numpy.unique. Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

        Return numpy.ndarray or ExtensionArray.

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp('20160101'),
    ...                     pd.Timestamp('20160101')]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(pd.Series([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                      pd.Timestamp('20160101', tz='US/Eastern')]))
    array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
          dtype=object)

    >>> pd.unique(pd.Index([pd.Timestamp('20160101', tz='US/Eastern'),
    ...                     pd.Timestamp('20160101', tz='US/Eastern')]))
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
    ...           dtype='datetime64[ns, US/Eastern]', freq=None)

    >>> pd.unique(list('baabc'))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'))))
    [b, a, c]
    Categories (3, object): [b, a, c]

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(pd.Series(pd.Categorical(list('baabc'),
    ...                                    categories=list('abc'),
    ...                                    ordered=True)))
    [b, a, c]
    Categories (3, object): [a < b < c]

    An array of tuples

    >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    values = _ensure_arraylike(values)

    if is_extension_array_dtype(values):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    htable, values = _get_hashtable_algo(values)

    table = htable(len(values))
    uniques = table.unique(values)
    # re-box the uniques into the original's dtype/container
    uniques = _reconstruct_data(uniques, original.dtype, original)
    return uniques


# alias kept for backwards compatibility with internal callers
unique1d = unique


def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(
        values, (ABCIndexClass, ABCSeries, ABCExtensionArray, np.ndarray)
    ):
        values = _ensure_arraylike(list(values))
    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True)

    comps = _ensure_arraylike(comps)
    comps = extract_array(comps, extract_numpy=True)
    if is_categorical_dtype(comps.dtype):
        # TODO(extension)
        # handle categoricals
        return cast("Categorical", comps).isin(values)

    if needs_i8_conversion(comps.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return array(comps).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps.dtype):
        # e.g. comps are integers and values are datetime64s
        return np.zeros(comps.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        # object-dtype comps vs datetimelike values: compare as objects
        return isin(comps, values.astype(object))

    elif is_extension_array_dtype(comps.dtype) or is_extension_array_dtype(
        values.dtype
    ):
        return isin(np.asarray(comps), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
    # in1d is faster for small sizes
    if len(comps) > 1_000_000 and len(values) <= 26 and not is_object_dtype(comps):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():
            f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c))
        else:
            f = np.in1d

    else:
        # hashtable path: coerce both sides to one common dtype first
        common = np.find_common_type([values.dtype, comps.dtype], [])
        values = values.astype(common, copy=False)
        comps = comps.astype(common, copy=False)
        name = common.name
        if name == "bool":
            # the hashtable has no bool specialization; uint8 is layout-compatible
            name = "uint8"
        f = getattr(htable, f"ismember_{name}")

    return f(comps, values)


def factorize_array(
    values: np.ndarray, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Factorize an array-like to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    na_sentinel : int, default -1
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray
    uniques : ndarray
    """
    hash_klass, values = get_data_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
    )

    # codes come back as int64; narrow to platform int for use as indexers
    codes = ensure_platform_int(codes)
    return codes, uniques
@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    na_sentinel: Optional[int] = -1,
    size_hint: Optional[int] = None,
) -> Tuple[np.ndarray, Union[np.ndarray, "Index"]]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    na_sentinel : int or None, default -1
        Value to mark "not found". If None, will not drop the NaN
        from the uniques of the values.

        .. versionchanged:: 1.1.2
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note ::

           Even if there's a missing value in `values`, `uniques` will
           *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique value in an array.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is the maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1]...)
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    Missing values are indicated in `codes` with `na_sentinel`
    (``-1`` by default). Note that missing values are never
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0]...)
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1]...)
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``na_sentinel=None``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.

    if isinstance(values, ABCRangeIndex):
        # RangeIndex can factorize itself without materializing values
        return values.factorize(sort=sort)

    values = _ensure_arraylike(values)
    original = values
    if not isinstance(values, ABCMultiIndex):
        values = extract_array(values, extract_numpy=True)

    # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
    # of values, assign na_sentinel=-1 to replace code value for NaN.
    dropna = True
    if na_sentinel is None:
        na_sentinel = -1
        dropna = False

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # regular-frequency datetimelike arrays have a fast factorize path
        codes, uniques = values.factorize(sort=sort)
        if isinstance(original, ABCIndexClass):
            uniques = original._shallow_copy(uniques, name=None)
        elif isinstance(original, ABCSeries):
            from pandas import Index

            uniques = Index(uniques)
        return codes, uniques

    if is_extension_array_dtype(values.dtype):
        codes, uniques = values.factorize(na_sentinel=na_sentinel)
        dtype = original.dtype
    else:
        values, dtype = _ensure_data(values)

        if original.dtype.kind in ["m", "M"]:
            # datetimelike: use iNaT (not NaN) as the missing marker
            na_value = na_value_for_dtype(original.dtype)
        else:
            na_value = None

        codes, uniques = factorize_array(
            values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )

    code_is_na = codes == na_sentinel
    if not dropna and code_is_na.any():
        # na_value is set based on the dtype of uniques, and compat set to False is
        # because we do not want na_value to be 0 for integers
        na_value = na_value_for_dtype(uniques.dtype, compat=False)
        uniques = np.append(uniques, [na_value])
        codes = np.where(code_is_na, len(uniques) - 1, codes)

    uniques = _reconstruct_data(uniques, dtype, original)

    # box `uniques` to match the container type of the original input
    if isinstance(original, ABCIndexClass):
        if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray):
            uniques = type(original._data)._simple_new(uniques, dtype=original.dtype)
        uniques = original._shallow_copy(uniques, name=None)
    elif isinstance(original, ABCSeries):
        from pandas import Index

        uniques = Index(uniques)

    return codes, uniques


def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize: bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas.core.series import Series

    name = getattr(values, "name", None)

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), cast the index back to
        # interval dtype, and sort by the bins
        result = ii.value_counts(dropna=dropna)
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:

        if is_extension_array_dtype(values):

            # handle Categorical and sparse,
            result = Series(values)._values.value_counts(dropna=dropna)
            result.name = name
            counts = result._values

        else:
            keys, counts = value_counts_arraylike(values, dropna)

            result = Series(counts, index=keys, name=name)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(values, dropna: bool):
    """
    Count unique values, dispatching to the dtype-specific hashtable routine.

    Parameters
    ----------
    values : arraylike
    dropna : bool
        If True, NA values are excluded from the keys/counts.

    Returns
    -------
    uniques : np.ndarray or ExtensionArray
    counts : np.ndarray
    """
    values = _ensure_arraylike(values)
    original = values
    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period

        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            # iNaT is the i8 representation of NaT; drop it from the keys
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

    else:
        # ndarray like

        # TODO: handle uint8
        f = getattr(htable, f"value_count_{ndtype}")
        keys, counts = f(values, dropna)

        mask = isna(values)
        if not dropna and mask.any():
            # ensure NaN appears exactly once in keys with the full NA count
            if not isna(keys).any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

    keys = _reconstruct_data(keys, original.dtype, original)

    return keys, counts


def duplicated(values: ArrayLike, keep: str = "first") -> np.ndarray:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : ndarray-like
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray
    """
    values, _ = _ensure_data(values)
    ndtype = values.dtype.name
    f = getattr(htable, f"duplicated_{ndtype}")
    return f(values, keep=keep)


def mode(values, dropna: bool = True) -> Series:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array from which to compute the mode(s).
    dropna : boolean, default True
        Don't consider counts of NaN/NaT.

        .. versionadded:: 0.24.0

    Returns
    -------
    mode : Series
    """
    from pandas import Series

    values = _ensure_arraylike(values)
    original = values

    # categorical is a fast-path
    if is_categorical_dtype(values):
        if isinstance(values, Series):
            # TODO: should we be passing `name` below?
            return Series(values._values.mode(dropna=dropna), name=values.name)
        return values.mode(dropna=dropna)

    if dropna and needs_i8_conversion(values.dtype):
        mask = values.isnull()
        values = values[~mask]

    values, _ = _ensure_data(values)
    ndtype = values.dtype.name

    f = getattr(htable, f"mode_{ndtype}")
    result = f(values, dropna=dropna)
    try:
        # modes are returned in hashtable order; sort for determinism
        result = np.sort(result)
    except TypeError as err:
        warn(f"Unable to sort modes: {err}")

    result = _reconstruct_data(result, original.dtype, original)
    return Series(result)


def rank(
    values,
    axis: int = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : array-like
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that
          they are ranked at the top
    ascending : boolean, default True
        Whether or not the elements should be ranked in ascending order.
    pct : boolean, default False
        Whether to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    if values.ndim == 1:
        values = _get_values_for_rank(values)
        ranks = algos.rank_1d(
            values,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        values = _get_values_for_rank(values)
        ranks = algos.rank_2d(
            values,
            axis=axis,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Array with ndim > 2 are not supported.")

    return ranks
def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # it is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        # all addends are <= 0: only underflow is possible
        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        # all addends are >= 0: only overflow is possible
        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
    else:
        # mixed signs: check overflow for the positive addends and
        # underflow for the negative addends separately
        to_raise = (
            (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]
        ).any() or (
            (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2]
        ).any()

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b


def quantile(x, q, interpolation_method="fraction"):
    """
    Compute sample quantile or quantiles of the input array. For example, q=0.5
    computes the median.

    The `interpolation_method` parameter supports three values, namely
    `fraction` (default), `lower` and `higher`. Interpolation is done only,
    if the desired quantile lies between two data points `i` and `j`. For
    `fraction`, the result is an interpolated value between `i` and `j`;
    for `lower`, the result is `i`, for `higher` the result is `j`.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score.
    q : scalar or array
        Percentile at which to extract score.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        This optional parameter specifies the interpolation method to use,
        when the desired quantile lies between two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(100)
    >>> stats.scoreatpercentile(a, 50)
    49.5

    """
    x = np.asarray(x)
    mask = isna(x)

    # quantiles are computed over non-missing values only
    x = x[~mask]

    values = np.sort(x)

    def _interpolate(a, b, fraction):
        """
        Returns the point at the given fraction between a and b, where
        'fraction' must be between 0 and 1.
        """
        return a + (b - a) * fraction

    def _get_score(at):
        # empty after NA-removal -> quantile is undefined
        if len(values) == 0:
            return np.nan

        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            score = values[int(idx)]
        else:
            if interpolation_method == "fraction":
                score = _interpolate(values[int(idx)], values[int(idx) + 1], idx % 1)
            elif interpolation_method == "lower":
                # np.floor returns a float; ndarray indices must be integers
                score = values[int(np.floor(idx))]
            elif interpolation_method == "higher":
                score = values[int(np.ceil(idx))]
            else:
                raise ValueError(
                    "interpolation_method can only be 'fraction' "
                    ", 'lower' or 'higher'"
                )

        return score

    if is_scalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        result = [_get_score(x) for x in q]
        result = np.array(result, dtype=np.float64)
        return result


# --------------- #
# select n        #
# --------------- #


class SelectN:
    """
    Base implementation for nlargest/nsmallest.

    Parameters
    ----------
    obj : Series or DataFrame
    n : int
        Number of elements to select.
    keep : {'first', 'last', 'all'}
        How to resolve ties at the selection boundary.
    """

    def __init__(self, obj, n: int, keep: str):
        self.obj = obj
        self.n = n
        self.keep = keep

        if self.keep not in ("first", "last", "all"):
            raise ValueError('keep must be either "first", "last" or "all"')

    def compute(self, method: str) -> FrameOrSeriesUnion:
        # implemented by subclasses; method is "nlargest" or "nsmallest"
        raise NotImplementedError

    def nlargest(self):
        return self.compute("nlargest")

    def nsmallest(self):
        return self.compute("nsmallest")

    @staticmethod
    def is_valid_dtype_n_method(dtype: DtypeObj) -> bool:
        """
        Helper function to determine if dtype is valid for
        nsmallest/nlargest methods
        """
        return (
            is_numeric_dtype(dtype) and not is_complex_dtype(dtype)
        ) or needs_i8_conversion(dtype)


class SelectNSeries(SelectN):
    """
    Implement n largest/smallest for Series

    Parameters
    ----------
    obj : Series
    n : int
    keep : {'first', 'last'}, default 'first'

    Returns
    -------
    nordered : Series
    """
    def compute(self, method: str) -> Series:
        # Select the n extreme values using a selection algorithm
        # (kth_smallest) instead of a full sort when possible.

        n = self.n
        dtype = self.obj.dtype
        if not self.is_valid_dtype_n_method(dtype):
            raise TypeError(f"Cannot use method '{method}' with dtype {dtype}")

        if n <= 0:
            return self.obj[[]]

        dropped = self.obj.dropna()

        # slow method
        if n >= len(self.obj):
            ascending = method == "nsmallest"
            return dropped.sort_values(ascending=ascending).head(n)

        # fast method
        arr, pandas_dtype = _ensure_data(dropped.values)
        if method == "nlargest":
            # negate so nlargest reduces to nsmallest on -arr
            arr = -arr
            if is_integer_dtype(pandas_dtype):
                # GH 21426: ensure reverse ordering at boundaries
                arr -= 1

            elif is_bool_dtype(pandas_dtype):
                # GH 26154: ensure False is smaller than True
                arr = 1 - (-arr)

        if self.keep == "last":
            arr = arr[::-1]

        narr = len(arr)
        n = min(n, narr)

        # kth_val is the n-th smallest value; all elements <= it are candidates
        kth_val = algos.kth_smallest(arr.copy(), n - 1)
        (ns,) = np.nonzero(arr <= kth_val)
        # stable sort so ties keep their original order (keep="first")
        inds = ns[arr[ns].argsort(kind="mergesort")]

        if self.keep != "all":
            inds = inds[:n]

        if self.keep == "last":
            # reverse indices
            inds = narr - 1 - inds

        return dropped.iloc[inds]


class SelectNFrame(SelectN):
    """
    Implement n largest/smallest for DataFrame

    Parameters
    ----------
    obj : DataFrame
    n : int
    keep : {'first', 'last'}, default 'first'
    columns : list or str

    Returns
    -------
    nordered : DataFrame
    """

    def __init__(self, obj, n: int, keep: str, columns):
        super().__init__(obj, n, keep)
        # a tuple is treated as a single (MultiIndex) column label,
        # not as a list of labels
        if not is_list_like(columns) or isinstance(columns, tuple):
            columns = [columns]
        columns = list(columns)
        self.columns = columns

    def compute(self, method: str) -> DataFrame:

        from pandas import Int64Index

        n = self.n
        frame = self.obj
        columns = self.columns

        # validate every requested column up front
        for column in columns:
            dtype = frame[column].dtype
            if not self.is_valid_dtype_n_method(dtype):
                raise TypeError(
                    f"Column {repr(column)} has dtype {dtype}, "
                    f"cannot use method {repr(method)} with this dtype"
                )

        def get_indexer(current_indexer, other_indexer):
            """
            Helper function to concat `current_indexer` and `other_indexer`
            depending on `method`
            """
            if method == "nsmallest":
                return current_indexer.append(other_indexer)
            else:
                return other_indexer.append(current_indexer)

        # Below we save and reset the index in case index contains duplicates
        original_index = frame.index
        cur_frame = frame = frame.reset_index(drop=True)
        cur_n = n
        indexer = Int64Index([])

        for i, column in enumerate(columns):
            # For each column we apply method to cur_frame[column].
            # If it's the last column or if we have the number of
            # results desired we are done.
            # Otherwise there are duplicates of the largest/smallest
            # value and we need to look at the rest of the columns
            # to determine which of the rows with the largest/smallest
            # value in the column to keep.
            series = cur_frame[column]
            is_last_column = len(columns) - 1 == i
            values = getattr(series, method)(
                cur_n, keep=self.keep if is_last_column else "all"
            )

            if is_last_column or len(values) <= cur_n:
                indexer = get_indexer(indexer, values.index)
                break

            # Now find all values which are equal to
            # the (nsmallest: largest)/(nlargest: smallest)
            # from our series.
            border_value = values == values[values.index[-1]]

            # Some of these values are among the top-n
            # some aren't.
            unsafe_values = values[border_value]

            # These values are definitely among the top-n
            safe_values = values[~border_value]
            indexer = get_indexer(indexer, safe_values.index)

            # Go on and separate the unsafe_values on the remaining
            # columns.
            cur_frame = cur_frame.loc[unsafe_values.index]
            cur_n = n - len(indexer)

        frame = frame.take(indexer)

        # Restore the index on frame
        frame.index = original_index.take(indexer)

        # If there is only one column, the frame is already sorted.
        if len(columns) == 1:
            return frame

        ascending = method == "nsmallest"

        return frame.sort_values(columns, ascending=ascending, kind="mergesort")


# ---- #
# take #
# ---- #


# Wrap a cython take kernel, re-viewing arr/out (and optionally fill_value)
# as other dtypes so dtypes without a direct kernel (bool via uint8,
# datetime64[ns] via int64) can reuse an existing one.
def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        if arr_dtype is not None:
            arr = arr.view(arr_dtype)
        if out_dtype is not None:
            out = out.view(out_dtype)
        if fill_wrap is not None:
            fill_value = fill_wrap(fill_value)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


# Wrap a cython take kernel, casting arr to conv_dtype first; used when
# only an (out_dtype, out_dtype) kernel is available.
def _convert_wrapper(f, conv_dtype):
    def wrapper(arr, indexer, out, fill_value=np.nan):
        arr = arr.astype(conv_dtype)
        f(arr, indexer, out, fill_value=fill_value)

    return wrapper


def _take_2d_multi_object(arr, indexer, out, fill_value, mask_info):
    # this is not ideal, performance-wise, but it's better than raising
    # an exception (best to optimize in Cython to avoid getting here)
    row_idx, col_idx = indexer
    if mask_info is not None:
        (row_mask, col_mask), (row_needs, col_needs) = mask_info
    else:
        # -1 entries in the indexers mark positions to be filled
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
    if fill_value is not None:
        if row_needs:
            out[row_mask, :] = fill_value
        if col_needs:
            out[:, col_mask] = fill_value
    for i in range(len(row_idx)):
        u_ = row_idx[i]
        for j in range(len(col_idx)):
            v = col_idx[j]
            out[i, j] = arr[u_, v]


def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info):
    # Generic (non-cython) fallback: ndarray.take plus explicit masking
    # of the -1 positions with fill_value.
    if mask_info is not None:
        mask, needs_masking = mask_info
    else:
        mask = indexer == -1
        needs_masking = mask.any()
    if arr.dtype != out.dtype:
        arr = arr.astype(out.dtype)
    if arr.shape[axis] > 0:
        arr.take(ensure_platform_int(indexer), axis=axis, out=out)
    if needs_masking:
        outindexer = [slice(None)] * arr.ndim
        outindexer[axis] = mask
        out[tuple(outindexer)] = fill_value


# Mapping (arr_dtype.name, out_dtype.name) -> specialized cython take kernel.
_take_1d_dict = {
    ("int8", "int8"): algos.take_1d_int8_int8,
    ("int8", "int32"): algos.take_1d_int8_int32,
    ("int8", "int64"): algos.take_1d_int8_int64,
    ("int8", "float64"): algos.take_1d_int8_float64,
    ("int16", "int16"): algos.take_1d_int16_int16,
    ("int16", "int32"): algos.take_1d_int16_int32,
    ("int16", "int64"): algos.take_1d_int16_int64,
    ("int16", "float64"): algos.take_1d_int16_float64,
    ("int32", "int32"): algos.take_1d_int32_int32,
    ("int32", "int64"): algos.take_1d_int32_int64,
    ("int32", "float64"): algos.take_1d_int32_float64,
    ("int64", "int64"): algos.take_1d_int64_int64,
    ("int64", "float64"): algos.take_1d_int64_float64,
    ("float32", "float32"): algos.take_1d_float32_float32,
    ("float32", "float64"): algos.take_1d_float32_float64,
    ("float64", "float64"): algos.take_1d_float64_float64,
    ("object", "object"): algos.take_1d_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_1d_int64_int64, np.int64, np.int64, np.int64
    ),
}

# Same keying as _take_1d_dict, for 2-D take along axis 0.
_take_2d_axis0_dict = {
    ("int8", "int8"): algos.take_2d_axis0_int8_int8,
    ("int8", "int32"): algos.take_2d_axis0_int8_int32,
    ("int8", "int64"): algos.take_2d_axis0_int8_int64,
    ("int8", "float64"): algos.take_2d_axis0_int8_float64,
    ("int16", "int16"): algos.take_2d_axis0_int16_int16,
    ("int16", "int32"): algos.take_2d_axis0_int16_int32,
    ("int16", "int64"): algos.take_2d_axis0_int16_int64,
    ("int16", "float64"): algos.take_2d_axis0_int16_float64,
    ("int32", "int32"): algos.take_2d_axis0_int32_int32,
    ("int32", "int64"): algos.take_2d_axis0_int32_int64,
    ("int32", "float64"): algos.take_2d_axis0_int32_float64,
    ("int64", "int64"): algos.take_2d_axis0_int64_int64,
    ("int64", "float64"): algos.take_2d_axis0_int64_float64,
    ("float32", "float32"): algos.take_2d_axis0_float32_float32,
    ("float32", "float64"): algos.take_2d_axis0_float32_float64,
    ("float64", "float64"): algos.take_2d_axis0_float64_float64,
    ("object", "object"): algos.take_2d_axis0_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

# Same keying as _take_1d_dict, for 2-D take along axis 1.
_take_2d_axis1_dict = {
    ("int8", "int8"): algos.take_2d_axis1_int8_int8,
    ("int8", "int32"): algos.take_2d_axis1_int8_int32,
    ("int8", "int64"): algos.take_2d_axis1_int8_int64,
    ("int8", "float64"): algos.take_2d_axis1_int8_float64,
    ("int16", "int16"): algos.take_2d_axis1_int16_int16,
    ("int16", "int32"): algos.take_2d_axis1_int16_int32,
    ("int16", "int64"): algos.take_2d_axis1_int16_int64,
    ("int16", "float64"): algos.take_2d_axis1_int16_float64,
    ("int32", "int32"): algos.take_2d_axis1_int32_int32,
    ("int32", "int64"): algos.take_2d_axis1_int32_int64,
    ("int32", "float64"): algos.take_2d_axis1_int32_float64,
    ("int64", "int64"): algos.take_2d_axis1_int64_int64,
    ("int64", "float64"): algos.take_2d_axis1_int64_float64,
    ("float32", "float32"): algos.take_2d_axis1_float32_float32,
    ("float32", "float64"): algos.take_2d_axis1_float32_float64,
    ("float64", "float64"): algos.take_2d_axis1_float64_float64,
    ("object", "object"): algos.take_2d_axis1_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}

# Same keying as _take_1d_dict, for 2-D take with a (row, col) indexer pair.
_take_2d_multi_dict = {
    ("int8", "int8"): algos.take_2d_multi_int8_int8,
    ("int8", "int32"): algos.take_2d_multi_int8_int32,
    ("int8", "int64"): algos.take_2d_multi_int8_int64,
    ("int8", "float64"): algos.take_2d_multi_int8_float64,
    ("int16", "int16"): algos.take_2d_multi_int16_int16,
    ("int16", "int32"): algos.take_2d_multi_int16_int32,
    ("int16", "int64"): algos.take_2d_multi_int16_int64,
    ("int16", "float64"): algos.take_2d_multi_int16_float64,
    ("int32", "int32"): algos.take_2d_multi_int32_int32,
    ("int32", "int64"): algos.take_2d_multi_int32_int64,
    ("int32", "float64"): algos.take_2d_multi_int32_float64,
    ("int64", "int64"): algos.take_2d_multi_int64_int64,
    ("int64", "float64"): algos.take_2d_multi_int64_float64,
    ("float32", "float32"): algos.take_2d_multi_float32_float32,
    ("float32", "float64"): algos.take_2d_multi_float32_float64,
    ("float64", "float64"): algos.take_2d_multi_float64_float64,
    ("object", "object"): algos.take_2d_multi_object_object,
    ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8),
    ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None),
    ("datetime64[ns]", "datetime64[ns]"): _view_wrapper(
        algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64
    ),
}


def _get_take_nd_function(
    ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None
):
    if ndim <= 2:
        tup = (arr_dtype.name, out_dtype.name)
        # first try an exact (arr_dtype, out_dtype) cython kernel
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            return func

        # fall back to an (out_dtype, out_dtype) kernel, casting arr first
        tup = (out_dtype.name, out_dtype.name)
        if ndim == 1:
            func = _take_1d_dict.get(tup, None)
        elif ndim == 2:
            if axis == 0:
                func = _take_2d_axis0_dict.get(tup, None)
            else:
                func = _take_2d_axis1_dict.get(tup, None)
        if func is not None:
            func = _convert_wrapper(func, out_dtype)
            return func

    # generic fallback: ndarray.take with explicit masking
    def func2(arr, indexer, out, fill_value=np.nan):
        indexer = ensure_int64(indexer)
        _take_nd_object(
            arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info
        )

    return func2


def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : sequence
        Non array-likes (sequences without a dtype) are coerced
        to an ndarray.
    indices : sequence of integers
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> from pandas.api.extensions import take

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...      fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_1d(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


def take_nd(
    arr, indexer, axis: int = 0, out=None, fill_value=np.nan, allow_fill: bool = True
):
    """
    Specialized Cython take which sets NaN values in one pass

    This dispatches to ``take`` defined on ExtensionArrays. It does not
    currently dispatch to ``SparseArray.take`` for sparse ``arr``.

    Parameters
    ----------
    arr : array-like
        Input array.
    indexer : ndarray
        1-D array of indices to take, subarrays corresponding to -1 value
        indices are filled with fill_value
    axis : int, default 0
        Axis to take from
    out : ndarray or None, default None
        Optional output array, must be appropriate type to hold input and
        fill_value together, if indexer has any -1 value entries; call
        maybe_promote to determine this type for any fill_value
    fill_value : any, default np.nan
        Fill value to replace -1 values with
    allow_fill : boolean, default True
        If False, indexer is assumed to contain no -1 values so no filling
        will be done. This short-circuits computation of a mask. Result is
        undefined if allow_fill == False and -1 is present in indexer.

    Returns
    -------
    subarray : array-like
        May be the same type as the input, or cast to an ndarray.
    """
    mask_info = None

    if isinstance(arr, ABCExtensionArray):
        # Check for EA to catch DatetimeArray, TimedeltaArray
        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

    arr = extract_array(arr)
    arr = np.asarray(arr)

    if indexer is None:
        # no indexer -> identity take; no fill needed
        indexer = np.arange(arr.shape[axis], dtype=np.int64)
        dtype, fill_value = arr.dtype, arr.dtype.type()
    else:
        indexer = ensure_int64(indexer, copy=False)
        if not allow_fill:
            dtype, fill_value = arr.dtype, arr.dtype.type()
            mask_info = None, False
        else:
            # check for promotion based on types only (do this first because
            # it's faster than computing a mask)
            dtype, fill_value = maybe_promote(arr.dtype, fill_value)
            if dtype != arr.dtype and (out is None or out.dtype != dtype):
                # check if promotion is actually required based on indexer
                mask = indexer == -1
                needs_masking = mask.any()
                mask_info = mask, needs_masking
                if needs_masking:
                    if out is not None and out.dtype != dtype:
                        raise TypeError("Incompatible type for fill_value")
                else:
                    # if not, then depromote, set fill_value to dummy
                    # (it won't be used but we don't want the cython code
                    # to crash when trying to cast it to dtype)
                    dtype, fill_value = arr.dtype, arr.dtype.type()

    # For F-contiguous 2-D input, operate on the transpose so the take runs
    # along contiguous memory; transpose back at the end.
    flip_order = False
    if arr.ndim == 2:
        if arr.flags.f_contiguous:
            flip_order = True

    if flip_order:
        arr = arr.T
        axis = arr.ndim - axis - 1
        if out is not None:
            out = out.T

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    if out is None:
        out_shape_ = list(arr.shape)
        out_shape_[axis] = len(indexer)
        out_shape = tuple(out_shape_)
        if arr.flags.f_contiguous and axis == arr.ndim - 1:
            # minor tweak that can make an order-of-magnitude difference
            # for dataframes initialized directly from 2-d ndarrays
            # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its
            # f-contiguous transpose)
            out = np.empty(out_shape, dtype=dtype, order="F")
        else:
            out = np.empty(out_shape, dtype=dtype)

    func = _get_take_nd_function(
        arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info
    )
    func(arr, indexer, out, fill_value)

    if flip_order:
        out = out.T
    return out


# NOTE(review): alias — take_1d is used elsewhere in this module
# (e.g. safe_sort) and by callers expecting the 1-D name.
take_1d = take_nd


def take_2d_multi(arr, indexer, fill_value=np.nan):
    """
    Specialized Cython take which sets NaN values in one pass.
    """
    # This is only called from one place in DataFrame._reindex_multi,
    # so we know indexer is well-behaved.
    assert indexer is not None
    assert indexer[0] is not None
    assert indexer[1] is not None

    row_idx, col_idx = indexer

    row_idx = ensure_int64(row_idx)
    col_idx = ensure_int64(col_idx)
    indexer = row_idx, col_idx
    mask_info = None

    # check for promotion based on types only (do this first because
    # it's faster than computing a mask)
    dtype, fill_value = maybe_promote(arr.dtype, fill_value)
    if dtype != arr.dtype:
        # check if promotion is actually required based on indexer
        row_mask = row_idx == -1
        col_mask = col_idx == -1
        row_needs = row_mask.any()
        col_needs = col_mask.any()
        mask_info = (row_mask, col_mask), (row_needs, col_needs)

        if not (row_needs or col_needs):
            # if not, then depromote, set fill_value to dummy
            # (it won't be used but we don't want the cython code
            # to crash when trying to cast it to dtype)
            dtype, fill_value = arr.dtype, arr.dtype.type()

    # at this point, it's guaranteed that dtype can hold both the arr values
    # and the fill_value
    out_shape = len(row_idx), len(col_idx)
    out = np.empty(out_shape, dtype=dtype)

    func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None)
    if func is None and arr.dtype != out.dtype:
        # fall back to an (out_dtype, out_dtype) kernel, casting arr first
        func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None)
        if func is not None:
            func = _convert_wrapper(func, out.dtype)
    if func is None:

        # generic object fallback
        def func(arr, indexer, out, fill_value=np.nan):
            _take_2d_multi_object(
                arr, indexer, out, fill_value=fill_value, mask_info=mask_info
            )

    func(arr, indexer, out=out, fill_value=fill_value)
    return out


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray:
    """
    Find indices where elements should be inserted to maintain order.

    .. versionadded:: 0.25.0

    Find the indices into a sorted array `arr` (a) such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr: array-like
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array_like
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array_like, optional
        Optional array of integer indices that sort array a into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints
        Array of insertion points with the same shape as `value`.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr.dtype)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            # would overflow arr's dtype: keep value's own dtype instead
            dtype = value_arr.dtype

        if is_scalar(value):
            value = dtype.type(value)
        else:
            value = array(value, dtype=dtype)
    elif not (
        is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
    ):
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        value_ser = array([value]) if is_scalar(value) else array(value)
        value = value_ser[0] if is_scalar(value) else value_ser
        if isinstance(value, Timestamp) and value.tzinfo is None:
            value = value.to_datetime64()

    result = arr.searchsorted(value, side=side, sorter=sorter)
    return result


# ---- #
# diff #
# ---- #

# dtype names with a specialized algos.diff_2d implementation
_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: int = 0, stacklevel=3):
    """
    difference of n between self,
    analogous to s-s.shift(n)

    Parameters
    ----------
    arr : ndarray
    n : int
        number of periods
    axis : int
        axis to shift on
    stacklevel : int
        The stacklevel for the lost dtype warning.

    Returns
    -------
    shifted
    """
    from pandas.core.arrays import PandasDtype

    n = int(n)
    na = np.nan
    dtype = arr.dtype

    # for bool data, diff is defined as xor rather than subtraction
    if dtype.kind == "b":
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = np.asarray(arr)
        dtype = arr.dtype

    if is_extension_array_dtype(dtype):
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            # EA without the needed dunder: degrade to ndarray with a warning
            warn(
                "dtype lost in 'diff()'. In the future this will raise a "
                "TypeError. Convert to a suitable dtype prior to calling 'diff'.",
                FutureWarning,
                stacklevel=stacklevel,
            )
            arr = np.asarray(arr)
            dtype = arr.dtype

    is_timedelta = False
    is_bool = False
    if needs_i8_conversion(arr.dtype):
        # datetimelike: work on the i8 view, with iNaT as the NA marker
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool_dtype(dtype):
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_
        is_bool = True

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    # the first |n| positions along `axis` have no prior value to diff against
    na_indexer = [slice(None)] * arr.ndim
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.ndim == 2 and arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * arr.ndim
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * arr.ndim
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        # need to make sure that we account for na for datelike/timedelta
        # we don't actually want to subtract these i8 numbers
        if is_timedelta:
            res = arr[res_indexer]
            lag = arr[lag_indexer]

            # zero out NA (iNaT) positions before subtracting, then restore
            mask = (arr[res_indexer] == na) | (arr[lag_indexer] == na)
            if mask.any():
                res = res.copy()
                res[mask] = 0
                lag = lag.copy()
                lag[mask] = 0

            result = res - lag
            result[mask] = na
            out_arr[res_indexer] = result
        elif is_bool:
            out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer]
        else:
            out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr


# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and used private methods from
# this module.
def safe_sort(
    values,
    codes=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list_like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``codes`` to mark "not found".
        Ignored when ``codes`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, (np.ndarray, ABCExtensionArray)):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    sorter = None

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # mixed ints and strings: argsort would raise, use dedicated helper
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
2129 if values.size and isinstance(values[0], tuple): 2130 ordered = _sort_tuples(values) 2131 else: 2132 ordered = _sort_mixed(values) 2133 2134 # codes: 2135 2136 if codes is None: 2137 return ordered 2138 2139 if not is_list_like(codes): 2140 raise TypeError( 2141 "Only list-like objects or None are allowed to " 2142 "be passed to safe_sort as codes" 2143 ) 2144 codes = ensure_platform_int(np.asarray(codes)) 2145 2146 if not assume_unique and not len(unique(values)) == len(values): 2147 raise ValueError("values should be unique if codes is not None") 2148 2149 if sorter is None: 2150 # mixed types 2151 hash_klass, values = get_data_algo(values) 2152 t = hash_klass(len(values)) 2153 t.map_locations(values) 2154 sorter = ensure_platform_int(t.lookup(ordered)) 2155 2156 if na_sentinel == -1: 2157 # take_1d is faster, but only works for na_sentinels of -1 2158 order2 = sorter.argsort() 2159 new_codes = take_1d(order2, codes, fill_value=-1) 2160 if verify: 2161 mask = (codes < -len(values)) | (codes >= len(values)) 2162 else: 2163 mask = None 2164 else: 2165 reverse_indexer = np.empty(len(sorter), dtype=np.int_) 2166 reverse_indexer.put(sorter, np.arange(len(sorter))) 2167 # Out of bound indices will be masked with `na_sentinel` next, so we 2168 # may deal with them here without performance loss using `mode='wrap'` 2169 new_codes = reverse_indexer.take(codes, mode="wrap") 2170 2171 mask = codes == na_sentinel 2172 if verify: 2173 mask = mask | (codes < -len(values)) | (codes >= len(values)) 2174 2175 if mask is not None: 2176 np.putmask(new_codes, mask, na_sentinel) 2177 2178 return ordered, ensure_platform_int(new_codes) 2179 2180 2181def _sort_mixed(values): 2182 """ order ints before strings in 1d arrays, safe in py3 """ 2183 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) 2184 nums = np.sort(values[~str_pos]) 2185 strs = np.sort(values[str_pos]) 2186 return np.concatenate([nums, np.asarray(strs, dtype=object)]) 2187 2188 2189def 
_sort_tuples(values: np.ndarray[tuple]): 2190 """ 2191 Convert array of tuples (1d) to array or array (2d). 2192 We need to keep the columns separately as they contain different types and 2193 nans (can't use `np.sort` as it may fail when str and nan are mixed in a 2194 column as types cannot be compared). 2195 """ 2196 from pandas.core.internals.construction import to_arrays 2197 from pandas.core.sorting import lexsort_indexer 2198 2199 arrays, _ = to_arrays(values, None) 2200 indexer = lexsort_indexer(arrays, orders=True) 2201 return values[indexer] 2202