1import numbers
2from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type, Union
3import warnings
4
5import numpy as np
6
7from pandas._libs import iNaT, lib, missing as libmissing
8from pandas._typing import ArrayLike, DtypeObj
9from pandas.compat.numpy import function as nv
10from pandas.util._decorators import cache_readonly
11
12from pandas.core.dtypes.base import register_extension_dtype
13from pandas.core.dtypes.common import (
14    is_bool_dtype,
15    is_datetime64_dtype,
16    is_float,
17    is_float_dtype,
18    is_integer_dtype,
19    is_list_like,
20    is_object_dtype,
21    pandas_dtype,
22)
23from pandas.core.dtypes.missing import isna
24
25from pandas.core import ops
26from pandas.core.ops import invalid_comparison
27from pandas.core.tools.numeric import to_numeric
28
29from .masked import BaseMaskedArray, BaseMaskedDtype
30from .numeric import NumericArray
31
32if TYPE_CHECKING:
33    import pyarrow
34
35
class _IntegerDtype(BaseMaskedDtype):
    """
    An ExtensionDtype to hold a single size & kind of integer dtype.

    These specific implementations are subclasses of the non-public
    _IntegerDtype. For example we have Int8Dtype to represent signed int 8s.

    The attributes name & type are set when these subclasses are created.
    """

    def __repr__(self) -> str:
        # e.g. "Int8Dtype()" / "UInt64Dtype()": the bit width is derived from
        # the item size in bytes.
        sign = "U" if self.is_unsigned_integer else ""
        return f"{sign}Int{8 * self.itemsize}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether the dtype kind is ``"i"``."""
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether the dtype kind is ``"u"``."""
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        return True

    @classmethod
    def construct_array_type(cls) -> Type["IntegerArray"]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return IntegerArray

    def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]:
        """
        Find a nullable dtype that can hold all of ``dtypes``.

        Parameters
        ----------
        dtypes : list of dtype
            Masked extension dtypes and/or numpy dtypes.

        Returns
        -------
        dtype or None
            The matching nullable integer or floating dtype, or None when
            ``dtypes`` is empty, contains an unsupported dtype, or promotes
            to something that is neither integer nor floating.
        """
        if not dtypes:
            return None
        # we only handle nullable EA dtypes and numeric numpy dtypes
        if not all(
            isinstance(t, BaseMaskedDtype)
            or (
                isinstance(t, np.dtype)
                and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_))
            )
            for t in dtypes
        ):
            return None
        # np.find_common_type is deprecated (NumPy 1.25) and removed (2.0);
        # np.result_type applies the same promotion to the numeric/bool
        # dtypes that pass the filter above.
        np_dtype = np.result_type(
            *[t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes]
        )
        if np.issubdtype(np_dtype, np.integer):
            return INT_STR_TO_DTYPE[str(np_dtype)]
        elif np.issubdtype(np_dtype, np.floating):
            # imported here rather than at module level
            from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE

            return FLOAT_STR_TO_DTYPE[str(np_dtype)]
        return None

    def __from_arrow__(
        self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
    ) -> "IntegerArray":
        """
        Construct IntegerArray from pyarrow Array/ChunkedArray.

        The input is cast to the pyarrow type matching ``self.type`` when it
        differs. A ChunkedArray without any chunks yields an empty
        IntegerArray.
        """
        import pyarrow

        from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

        pyarrow_type = pyarrow.from_numpy_dtype(self.type)
        if not array.type.equals(pyarrow_type):
            array = array.cast(pyarrow_type)

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            # pyarrow.ChunkedArray
            chunks = array.chunks

        results = []
        for arr in chunks:
            data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
            # the helper's mask is inverted before being stored as the
            # IntegerArray mask (presumably it flags valid entries)
            int_arr = IntegerArray(data.copy(), ~mask, copy=False)
            results.append(int_arr)

        if not results:
            # zero chunks: there is nothing to concatenate
            return IntegerArray(
                np.array([], dtype=self.type), np.array([], dtype=bool)
            )
        if len(results) == 1:
            # already a fresh array; avoid an additional copy in concat
            return results[0]
        return IntegerArray._concat_same_type(results)
122
123
def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray":
    """
    Build an IntegerArray from list-like input, inferring the dtype if needed.

    Parameters
    ----------
    values : 1D list-like
        Data to convert.
    dtype : dtype, optional
        Integer dtype to coerce the values to.
    copy : bool, default False
        Forwarded to ``coerce_to_array``.

    Returns
    -------
    IntegerArray

    Raises
    ------
    TypeError
        If the values are of an incompatible type.
    """
    # coerce_to_array does all validation and returns (data, mask)
    data, na_mask = coerce_to_array(values, dtype=dtype, copy=copy)
    return IntegerArray(data, na_mask)
145
146
def safe_cast(values, dtype, copy: bool):
    """
    Cast ``values`` to ``dtype`` only when no information is lost.

    A cast that numpy itself considers "safe" is returned directly.
    Otherwise the cast is performed anyway and accepted only if every
    element still compares equal to its original value (e.g. floats that
    are whole numbers being cast to an integer dtype).

    Raises
    ------
    TypeError
        If at least one element changes value under the cast.
    """
    try:
        return values.astype(dtype, casting="safe", copy=copy)
    except TypeError as err:
        # numpy refused the "safe" cast; fall back to an element-wise check
        converted = values.astype(dtype, copy=copy)
        if not (converted == values).all():
            raise TypeError(
                f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
            ) from err
        return converted
165
166
def coerce_to_array(
    values, dtype, mask=None, copy: bool = False
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
        May be None, in which case the dtype of an integer-typed ``values``
        is preserved and int64 is used otherwise.
    mask : bool 1D array, optional
        Ignored when ``values`` is already an IntegerArray; computed with
        ``isna`` when not given.
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` does not correspond to one of the integer dtypes.
    TypeError
        If ``values`` cannot be converted, ``values`` or ``mask`` is not
        1-dimensional, or the values cannot be cast without loss.
    """
    # if values is integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not isinstance(dtype, _IntegerDtype):
            try:
                dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, IntegerArray):
        # already split into data and mask; only a dtype change may be needed
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # np.array(..., copy=False) raises on NumPy >= 2 whenever a copy cannot
    # be avoided (e.g. list input); np.asarray copies only if needed on
    # every NumPy version.
    values = np.array(values, copy=True) if copy else np.asarray(values)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            # nothing but missing values: represent them as float NaN
            values = np.full(len(values), np.nan)
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    elif is_bool_dtype(values) and is_integer_dtype(dtype):
        # bool -> int always needs a new array, so astype replaces the
        # former np.array(..., dtype=int, copy=copy) call
        values = values.astype(int)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("int64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast
    if mask.any():
        # we copy as need to coerce here: masked slots get a placeholder
        # value so they cannot make the cast fail
        values = values.copy()
        values[mask] = 1

    return safe_cast(values, dtype, copy=False), mask
263
264
class IntegerArray(NumericArray):
    """
    Array of integer (optional missing) values.

    .. versionadded:: 0.24.0

    .. versionchanged:: 1.0.0

       Now uses :attr:`pandas.NA` as the missing value rather
       than :attr:`numpy.nan`.

    .. warning::

       IntegerArray is currently experimental, and its API or internal
       implementation may change without warning.

    We represent an IntegerArray with 2 numpy arrays:

    - data: contains a numpy integer array of the appropriate dtype
    - mask: a boolean array holding a mask on the data, True is missing

    To construct an IntegerArray from generic array-like input, use
    :func:`pandas.array` with one of the integer dtypes (see examples).

    See :ref:`integer_na` for more.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d integer-dtype array.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values.
    copy : bool, default False
        Whether to copy the `values` and `mask`.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArray

    Examples
    --------
    Create an IntegerArray with :func:`pandas.array`.

    >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype())
    >>> int_array
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    String aliases for the dtypes are also available. They are capitalized.

    >>> pd.array([1, None, 3], dtype='Int32')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: Int32

    >>> pd.array([1, None, 3], dtype='UInt16')
    <IntegerArray>
    [1, <NA>, 3]
    Length: 3, dtype: UInt16
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = 1

    @cache_readonly
    def dtype(self) -> _IntegerDtype:
        # looked up from the numpy dtype of the underlying data
        return INT_STR_TO_DTYPE[str(self._data.dtype)]

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]):
            raise TypeError(
                "values should be integer numpy array. Use "
                "the 'pd.array' function instead"
            )
        super().__init__(values, mask, copy=copy)

    def __neg__(self):
        return type(self)(-self._data, self._mask.copy())

    def __pos__(self):
        # Return a copy rather than ``self`` so that the result of unary plus
        # never aliases the operand, in line with __neg__ and __abs__ which
        # both build a new array.
        return self.copy()

    def __abs__(self):
        return type(self)(np.abs(self._data), self._mask.copy())

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype=None, copy: bool = False
    ) -> "IntegerArray":
        return integer_array(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype=None, copy: bool = False
    ) -> "IntegerArray":
        # parse the strings to numbers first, then go through the regular
        # sequence constructor
        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs):
        """
        Apply a numpy ufunc to the underlying data and re-apply the mask.

        Returns NotImplemented when any input or ``out`` argument is not an
        ndarray, a number or an IntegerArray.

        Raises
        ------
        NotImplementedError
            For ``method == "reduce"``.
        """
        # For IntegerArray inputs, we apply the ufunc to ._data
        # and mask the result.
        if method == "reduce":
            # Not clear how to handle missing values in reductions. Raise.
            raise NotImplementedError("The 'reduce' method is not supported.")
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (IntegerArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # combined mask: a position is missing if it is missing in any
        # IntegerArray input
        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, IntegerArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.

            if is_integer_dtype(x.dtype):
                m = mask.copy()
                return IntegerArray(x, m)
            else:
                # non-integer result: missing positions are written as NaN
                # into the plain ndarray
                x[mask] = np.nan
            return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if isinstance(result, tuple):
            return tuple(reconstruct(x) for x in result)
        else:
            return reconstruct(result)

    def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]:
        return coerce_to_array(value, dtype=self.dtype)

    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        ndarray or ExtensionArray
            NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

        Raises
        ------
        TypeError
            if incompatible type with an IntegerDtype, equivalent of same_kind
            casting
        """
        from pandas.core.arrays.masked import BaseMaskedDtype
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)

        # if the dtype is exactly the same, we can fastpath
        if self.dtype == dtype:
            # return the same object for copy=False
            return self.copy() if copy else self
        # if we are astyping to another nullable masked dtype, we can fastpath
        if isinstance(dtype, BaseMaskedDtype):
            data = self._data.astype(dtype.numpy_dtype, copy=copy)
            # mask is copied depending on whether the data was copied, and
            # not directly depending on the `copy` keyword
            mask = self._mask if data is self._data else self._mask.copy()
            return dtype.construct_array_type()(data, mask, copy=False)
        elif isinstance(dtype, StringDtype):
            return dtype.construct_array_type()._from_sequence(self, copy=False)

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            na_value = np.nan
        elif is_datetime64_dtype(dtype):
            na_value = np.datetime64("NaT")
        else:
            na_value = lib.no_default

        return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)

    def _values_for_argsort(self) -> np.ndarray:
        """
        Return values for sorting.

        Returns
        -------
        ndarray
            The transformed values should maintain the ordering between values
            within the array.

        See Also
        --------
        ExtensionArray.argsort : Return the indices that would sort this array.
        """
        data = self._data.copy()
        if self._mask.any():
            # missing slots are given a value one below the current minimum.
            # NOTE(review): if data.min() is already the lowest value of the
            # dtype, ``- 1`` is not representable in that dtype -- confirm
            # callers do not rely on the masked slots in that case.
            data[self._mask] = data.min() - 1
        return data

    def _cmp_method(self, other, op):
        """
        Compare with ``other`` using ``op`` and return a BooleanArray.

        Positions that are missing in either operand are masked in the
        result; comparing against ``pd.NA`` masks every position.

        Raises
        ------
        NotImplementedError
            If a list-like ``other`` has more than one dimension.
        ValueError
            If a list-like ``other`` has a different length.
        """
        from pandas.core.arrays import BooleanArray

        mask = None

        if isinstance(other, BaseMaskedArray):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            # This may be fixed by NA.__array_ufunc__. Revisit this check
            # once that's implemented.
            result = np.zeros(self._data.shape, dtype="bool")
            mask = np.ones(self._data.shape, dtype="bool")
        else:
            with warnings.catch_warnings():
                # numpy may show a FutureWarning:
                #     elementwise comparison failed; returning scalar instead,
                #     but in the future will perform elementwise comparison
                # before returning NotImplemented. We fall back to the correct
                # behavior today, so that should be fine to ignore.
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                with np.errstate(all="ignore"):
                    method = getattr(self._data, f"__{op.__name__}__")
                    result = method(other)

                if result is NotImplemented:
                    result = invalid_comparison(self._data, other, op)

        # nans propagate
        if mask is None:
            mask = self._mask.copy()
        else:
            mask = self._mask | mask

        return BooleanArray(result, mask)

    # The reductions below validate numpy-compat keyword arguments and then
    # defer to the parent class' ``_reduce``.

    def sum(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_sum((), kwargs)
        return super()._reduce("sum", skipna=skipna, min_count=min_count)

    def prod(self, *, skipna=True, min_count=0, **kwargs):
        nv.validate_prod((), kwargs)
        return super()._reduce("prod", skipna=skipna, min_count=min_count)

    def min(self, *, skipna=True, **kwargs):
        nv.validate_min((), kwargs)
        return super()._reduce("min", skipna=skipna)

    def max(self, *, skipna=True, **kwargs):
        nv.validate_max((), kwargs)
        return super()._reduce("max", skipna=skipna)

    def _maybe_mask_result(self, result, mask, other, op_name: str):
        """
        Wrap the raw result of an arithmetic op in the appropriate array type.

        Parameters
        ----------
        result : array-like
        mask : array-like bool
        other : scalar or array-like
        op_name : str

        Returns
        -------
        FloatingArray, TimedeltaArray or IntegerArray
            FloatingArray for float operands and true division,
            TimedeltaArray for timedelta64[ns] results, otherwise the same
            type as ``self``.
        """
        # if we have a float operand we are by-definition
        # a float result
        # or our op is a divide
        if (is_float_dtype(other) or is_float(other)) or (
            op_name in ["rtruediv", "truediv"]
        ):
            from pandas.core.arrays import FloatingArray

            return FloatingArray(result, mask, copy=False)

        if result.dtype == "timedelta64[ns]":
            from pandas.core.arrays import TimedeltaArray

            # masked positions are overwritten with iNaT in the raw values
            result[mask] = iNaT
            return TimedeltaArray._simple_new(result)

        return type(self)(result, mask, copy=False)
583
# Docstring template shared by the concrete integer dtype classes defined
# below; each class fills the ``{dtype}`` placeholder via ``str.format`` and
# assigns the result to its ``__doc__``.
_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.

.. versionchanged:: 1.0.0

   Now uses :attr:`pandas.NA` as its missing value,
   rather than :attr:`numpy.nan`.

Attributes
----------
None

Methods
-------
None
"""
600
# Create the concrete dtype classes, one per integer width and signedness.
602
603
@register_extension_dtype
class Int8Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.int8; its string alias is "Int8".
    name = "Int8"
    type = np.int8
    __doc__ = _dtype_docstring.format(dtype="int8")
609
610
@register_extension_dtype
class Int16Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.int16; its string alias is "Int16".
    name = "Int16"
    type = np.int16
    __doc__ = _dtype_docstring.format(dtype="int16")
616
617
@register_extension_dtype
class Int32Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.int32; its string alias is "Int32".
    name = "Int32"
    type = np.int32
    __doc__ = _dtype_docstring.format(dtype="int32")
623
624
@register_extension_dtype
class Int64Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.int64; its string alias is "Int64".
    name = "Int64"
    type = np.int64
    __doc__ = _dtype_docstring.format(dtype="int64")
630
631
@register_extension_dtype
class UInt8Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.uint8; its string alias is "UInt8".
    name = "UInt8"
    type = np.uint8
    __doc__ = _dtype_docstring.format(dtype="uint8")
637
638
@register_extension_dtype
class UInt16Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.uint16; its string alias is "UInt16".
    name = "UInt16"
    type = np.uint16
    __doc__ = _dtype_docstring.format(dtype="uint16")
644
645
@register_extension_dtype
class UInt32Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.uint32; its string alias is "UInt32".
    name = "UInt32"
    type = np.uint32
    __doc__ = _dtype_docstring.format(dtype="uint32")
651
652
@register_extension_dtype
class UInt64Dtype(_IntegerDtype):
    # Extension dtype backed by numpy.uint64; its string alias is "UInt64".
    name = "UInt64"
    type = np.uint64
    __doc__ = _dtype_docstring.format(dtype="uint64")
658
659
# Maps the string form of a numpy integer dtype (e.g. "int8") to one shared
# instance of the corresponding extension dtype.
INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = {
    numpy_name: dtype_cls()
    for numpy_name, dtype_cls in (
        ("int8", Int8Dtype),
        ("int16", Int16Dtype),
        ("int32", Int32Dtype),
        ("int64", Int64Dtype),
        ("uint8", UInt8Dtype),
        ("uint16", UInt16Dtype),
        ("uint32", UInt32Dtype),
        ("uint64", UInt64Dtype),
    )
}
670