import numbers
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
import warnings

import numpy as np

from pandas._libs import iNaT, lib, missing as libmissing
from pandas._typing import ArrayLike, DtypeObj
from pandas.compat.numpy import function as nv
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_datetime64_dtype,
    is_float,
    is_float_dtype,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.ops import invalid_comparison
from pandas.core.tools.numeric import to_numeric

from .masked import BaseMaskedArray, BaseMaskedDtype
from .numeric import NumericArray

if TYPE_CHECKING:
    import pyarrow


class _IntegerDtype(BaseMaskedDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    _IntegerDtype. For example we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    def __repr__(self) -> str:
        sign = "U" if self.is_unsigned_integer else ""
        return f"{sign}Int{8 * self.itemsize}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    @classmethod
    def construct_array_type(cls) -> Type["IntegerArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        """
        Return the nullable dtype all of ``dtypes`` can be promoted to.

        Parameters
        ----------
        dtypes : list of dtype
            Masked extension dtypes and/or numpy dtypes.

        Returns
        -------
        dtype or None
            An entry of ``INT_STR_TO_DTYPE`` when the promoted numpy dtype is
            an integer, an entry of ``FLOAT_STR_TO_DTYPE`` when it is a float,
            otherwise None. None is also returned for an empty list or when
            any entry is neither a masked dtype nor a numeric/bool numpy dtype.
        """
        # An empty list has no common dtype; np.result_type would raise on it.
        if not dtypes:
            return None
        # we only handle nullable EA dtypes and numeric numpy dtypes
        if not all(
            isinstance(t, BaseMaskedDtype)
            or (
                isinstance(t, np.dtype)
                and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_))
            )
            for t in dtypes
        ):
            return None
        # np.find_common_type was deprecated in NumPy 1.25 and removed in 2.0;
        # np.result_type applies the same promotion to plain dtypes.
        np_dtype = np.result_type(
            *[t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes]
        )
        if np.issubdtype(np_dtype, np.integer):
            return INT_STR_TO_DTYPE[str(np_dtype)]
        elif np.issubdtype(np_dtype, np.floating):
            # local import, presumably to avoid a circular import -- TODO confirm
            from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

            return FLOAT_STR_TO_DTYPE[str(np_dtype)]
        return None

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "IntegerArray":
        """
        Construct IntegerArray from pyarrow Array/ChunkedArray.
        """
        import pyarrow

        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type):
            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        results = []
        for arr in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
            # the helper's mask is inverted before use: IntegerArray expects
            # True to mean "missing"
            int_arr = IntegerArray(data.copy(), ~mask, copy=False)
            results.append(int_arr)

        if not results:
            # A ChunkedArray with zero chunks: there is nothing to
            # concatenate, so build the empty array directly.
            return IntegerArray(
                np.array([], dtype=self.type), np.array([], dtype=bool)
            )

        return IntegerArray._concat_same_type(results)


def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray":
    """
    Infer and return an integer array of the values.

    Parameters
    ----------
    values : 1D list-like
    dtype : dtype, optional
        dtype to coerce
    copy : bool, default False

    Returns
    -------
    IntegerArray

    Raises
    ------
    TypeError if incompatible types
    """
    values, mask = coerce_to_array(values, dtype=dtype, copy=copy)
    return IntegerArray(values, mask)


def safe_cast(values, dtype, copy: bool):
    """
    Safely cast the values to the dtype if they
    are equivalent, meaning floats must be equivalent to the
    ints.

    Parameters
    ----------
    values : numpy.ndarray
    dtype : numpy dtype or type
    copy : bool
        Forwarded to ``ndarray.astype``.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    TypeError
        If the "safe" cast is rejected and the unchecked cast does not
        round-trip to values equal to the input.
    """
    try:
        return values.astype(dtype, casting="safe", copy=copy)
    except TypeError as err:
        # "safe" casting refuses e.g. float -> int; accept the cast anyway
        # when every element compares equal after conversion.
        casted = values.astype(dtype, copy=copy)
        if (casted == values).all():
            return casted

        raise TypeError(
            f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
        ) from err


def coerce_to_array(
    values, dtype, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` does not map to one of the integer extension dtypes.
    TypeError
        If ``values`` cannot be converted, or ``values``/``mask`` is not 1D.
    """
    # if values is integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, _IntegerDtype):
            try:
                dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, IntegerArray):
        # already masked: reuse its buffers, only converting dtype if asked
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # np.array(..., copy=False) raises under NumPy 2 whenever a copy cannot
    # be avoided (e.g. list input); np.asarray keeps "copy only if needed".
    values = np.array(values) if copy else np.asarray(values)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            # replace with a float array of NaN so every entry is masked below
            values = np.full(len(values), np.nan)
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    elif is_bool_dtype(values) and is_integer_dtype(dtype):
        # bool -> int always needs a conversion, so the no-copy request is
        # expressed with np.asarray for the same NumPy 2 reason as above.
        if copy:
            values = np.array(values, dtype=int)
        else:
            values = np.asarray(values, dtype=int)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("int64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        # masked slots get a placeholder so the cast below does not see NaN
        values[mask] = 1
    values = safe_cast(values, dtype, copy=False)

    return values, mask


class IntegerArray(NumericArray):
    """
    Array of integer (optional missing) values.

    .. versionadded:: 0.24.0

    .. versionchanged:: 1.0.0

       Now uses :attr:`pandas.NA` as the missing value rather
       than :attr:`numpy.nan`.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1

    @cache_readonly
    def dtype(self) -> _IntegerDtype:
        return INT_STR_TO_DTYPE[str(self._data.dtype)]

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]):
            raise TypeError(
                "values should be integer numpy array. Use "
                "the 'pd.array' function instead"
            )
        super().__init__(values, mask, copy=copy)

    def __neg__(self):
        return type(self)(-self._data, self._mask.copy())

    def __pos__(self):
        # Return a copy rather than ``self`` so that mutating the result of
        # ``+arr`` cannot silently modify ``arr`` (consistent with __neg__
        # and __abs__, which both return new arrays).
        return self.copy()

    def __abs__(self):
        return type(self)(np.abs(self._data), self._mask.copy())

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype=None, copy: bool = False
    ) -> "IntegerArray":
        return integer_array(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype=None, copy: bool = False
    ) -> "IntegerArray":
        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
        """
        Apply a numpy ufunc to the underlying data and re-apply the mask.

        Returns NotImplemented when any input or ``out`` argument is not an
        ndarray, a number or an IntegerArray.

        Raises
        ------
        NotImplementedError
            If ``method`` is "reduce".
        """
        # For IntegerArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # combined mask: an output slot is missing if any input slot is
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, IntegerArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.

            if is_integer_dtype(x.dtype):
                m = mask.copy()
                return IntegerArray(x, m)
            else:
                x[mask] = np.nan
                return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
        return coerce_to_array(value, dtype=self.dtype)

    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        ndarray or ExtensionArray
            NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with an IntegerDtype, equivalent of same_kind
            casting
        """
        from pandas.core.arrays.masked import BaseMaskedDtype
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)

        # if the dtype is exactly the same, we can fastpath
        if self.dtype == dtype:
            # return the same object for copy=False
            return self.copy() if copy else self
        # if we are astyping to another nullable masked dtype, we can fastpath
        if isinstance(dtype, BaseMaskedDtype):
            data = self._data.astype(dtype.numpy_dtype, copy=copy)
            # mask is copied depending on whether the data was copied, and
            # not directly depending on the `copy` keyword
            mask = self._mask if data is self._data else self._mask.copy()
            return dtype.construct_array_type()(data, mask, copy=False)
        elif isinstance(dtype, StringDtype):
            return dtype.construct_array_type()._from_sequence(self, copy=False)

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            na_value = np.nan
        elif is_datetime64_dtype(dtype):
            na_value = np.datetime64("NaT")
        else:
            na_value = lib.no_default

        return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)

    def _values_for_argsort(self) -> np.ndarray:
        """
        Return values for sorting.

        Returns
        -------
        ndarray
            The transformed values should maintain the ordering between values
            within the array.

        See Also
        --------
        ExtensionArray.argsort : Return the indices that would sort this array.
        """
        data = self._data.copy()
        if self._mask.any():
            # NOTE(review): ``data.min()`` is taken over all slots, masked
            # ones included, and ``min - 1`` can wrap around when the minimum
            # is already the dtype's lowest value (e.g. 0 for unsigned) --
            # confirm callers place missing values using the mask instead.
            data[self._mask] = data.min() - 1
        return data

    def _cmp_method(self, other, op):
        """
        Compare element-wise with ``other`` using ``op``.

        Returns a BooleanArray whose mask is the union of this array's mask
        and, when ``other`` is a masked array, its mask. Comparing against
        ``pd.NA`` yields an all-missing result.

        Raises
        ------
        NotImplementedError
            If a list-like ``other`` has more than one dimension.
        ValueError
            If a list-like ``other`` has a different length.
        """
        from pandas.core.arrays import BooleanArray

        mask = None

        if isinstance(other, BaseMaskedArray):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            # This may be fixed by NA.__array_ufunc__. Revisit this check
            # once that's implemented.
            result = np.zeros(self._data.shape, dtype="bool")
            mask = np.ones(self._data.shape, dtype="bool")
        else:
            with warnings.catch_warnings():
                # numpy may show a FutureWarning:
                #     elementwise comparison failed; returning scalar instead,
                #     but in the future will perform elementwise comparison
                # before returning NotImplemented. We fall back to the correct
                # behavior today, so that should be fine to ignore.
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                with np.errstate(all="ignore"):
                    method = getattr(self._data, f"__{op.__name__}__")
                    result = method(other)

                if result is NotImplemented:
                    result = invalid_comparison(self._data, other, op)

        # nans propagate
        if mask is None:
            mask = self._mask.copy()
        else:
            mask = self._mask | mask

        return BooleanArray(result, mask)

    def sum(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_sum((), kwargs)
        return super()._reduce("sum", skipna=skipna, min_count=min_count)

    def prod(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_prod((), kwargs)
        return super()._reduce("prod", skipna=skipna, min_count=min_count)

    def min(self, *, skipna=True, **kwargs):
        nv.validate_min((), kwargs)
        return super()._reduce("min", skipna=skipna)

    def max(self, *, skipna=True, **kwargs):
        nv.validate_max((), kwargs)
        return super()._reduce("max", skipna=skipna)

    def _maybe_mask_result(self, result, mask, other, op_name: str):
        """
        Wrap a raw operation result and its mask in the matching array type.

        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str

        Returns
        -------
        FloatingArray, TimedeltaArray or IntegerArray
            FloatingArray for a float operand or a true division,
            TimedeltaArray for a ``timedelta64[ns]`` result, otherwise an
            array of this array's own type.
        """
        # if we have a float operand we are by-definition
        # a float result
        # or our op is a divide
        if (is_float_dtype(other) or is_float(other)) or (
            op_name in ["rtruediv", "truediv"]
        ):
            from pandas.core.arrays import FloatingArray

            return FloatingArray(result, mask, copy=False)

        if result.dtype == "timedelta64[ns]":
            from pandas.core.arrays import TimedeltaArray

            # TimedeltaArray carries no mask; missing slots are written as iNaT
            result[mask] = iNaT
            return TimedeltaArray._simple_new(result)

        return type(self)(result, mask, copy=False)


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

.. versionchanged:: 1.0.0

   Now uses :attr:`pandas.NA` as its missing value,
   rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None
"""

# create the Dtype


@register_extension_dtype
class Int8Dtype(_IntegerDtype):
    type = np.int8
    name = "Int8"
    __doc__ = _dtype_docstring.format(dtype="int8")


@register_extension_dtype
class Int16Dtype(_IntegerDtype):
    type = np.int16
    name = "Int16"
    __doc__ = _dtype_docstring.format(dtype="int16")


@register_extension_dtype
class Int32Dtype(_IntegerDtype):
    type = np.int32
    name = "Int32"
    __doc__ = _dtype_docstring.format(dtype="int32")


@register_extension_dtype
class Int64Dtype(_IntegerDtype):
    type = np.int64
    name = "Int64"
    __doc__ = _dtype_docstring.format(dtype="int64")


@register_extension_dtype
class UInt8Dtype(_IntegerDtype):
    type = np.uint8
    name = "UInt8"
    __doc__ = _dtype_docstring.format(dtype="uint8")


@register_extension_dtype
class UInt16Dtype(_IntegerDtype):
    type = np.uint16
    name = "UInt16"
    __doc__ = _dtype_docstring.format(dtype="uint16")


@register_extension_dtype
class UInt32Dtype(_IntegerDtype):
    type = np.uint32
    name = "UInt32"
    __doc__ = _dtype_docstring.format(dtype="uint32")


@register_extension_dtype
class UInt64Dtype(_IntegerDtype):
    type = np.uint64
    name = "UInt64"
    __doc__ = _dtype_docstring.format(dtype="uint64")


# Maps a numpy integer dtype name to the singleton instance of the
# corresponding nullable extension dtype.
INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = {
    "int8": Int8Dtype(),
    "int16": Int16Dtype(),
    "int32": Int32Dtype(),
    "int64": Int64Dtype(),
    "uint8": UInt8Dtype(),
    "uint16": UInt16Dtype(),
    "uint32": UInt32Dtype(),
    "uint64": UInt64Dtype(),
}