1from __future__ import annotations
2
3from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union
4
5import numpy as np
6
7from pandas._libs import lib, missing as libmissing
8from pandas._typing import Scalar
9from pandas.errors import AbstractMethodError
10from pandas.util._decorators import cache_readonly, doc
11
12from pandas.core.dtypes.base import ExtensionDtype
13from pandas.core.dtypes.common import (
14    is_integer,
15    is_object_dtype,
16    is_scalar,
17    is_string_dtype,
18)
19from pandas.core.dtypes.missing import isna, notna
20
21from pandas.core import nanops
22from pandas.core.algorithms import factorize_array, take
23from pandas.core.array_algos import masked_reductions
24from pandas.core.arraylike import OpsMixin
25from pandas.core.arrays import ExtensionArray
26from pandas.core.indexers import check_array_indexer
27
28if TYPE_CHECKING:
29    from pandas import Series
30
31
32BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
33
34
class BaseMaskedDtype(ExtensionDtype):
    """
    Common dtype base class for BaseMaskedArray subclasses.
    """

    # subclasses must supply a dtype name and a scalar type
    name: str
    base = None
    type: Type

    # shared missing-value sentinel for all masked dtypes
    na_value = libmissing.NA

    @cache_readonly
    def numpy_dtype(self) -> np.dtype:
        """Return the numpy dtype corresponding to ``self.type``."""
        return np.dtype(self.type)

    @cache_readonly
    def kind(self) -> str:
        """Return the single-character numpy kind code of this dtype."""
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self) -> int:
        """Return the number of bytes per element of this dtype."""
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls) -> Type[BaseMaskedArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise NotImplementedError
70
71
class BaseMaskedArray(OpsMixin, ExtensionArray):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: Scalar

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        """
        Parameters
        ----------
        values : np.ndarray
            1D data array; assumed already validated by the subclass.
        mask : np.ndarray
            1D boolean array, True where a value is missing.
        copy : bool, default False
            If True, store copies of ``values`` and ``mask``.
        """
        # values is supposed to already be validated in the subclass
        if not isinstance(mask, np.ndarray) or mask.dtype != np.bool_:
            raise TypeError(
                "mask should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        for arr, label in ((values, "values"), (mask, "mask")):
            if arr.ndim != 1:
                raise ValueError(f"{label} must be a 1D array")

        if copy:
            values = values.copy()
            mask = mask.copy()

        self._data = values
        self._mask = mask
100
    @property
    def dtype(self) -> BaseMaskedDtype:
        """The :class:`BaseMaskedDtype` of this array; subclasses must override."""
        raise AbstractMethodError(self)
104
105    def __getitem__(
106        self, item: Union[int, slice, np.ndarray]
107    ) -> Union[BaseMaskedArray, Any]:
108        if is_integer(item):
109            if self._mask[item]:
110                return self.dtype.na_value
111            return self._data[item]
112
113        item = check_array_indexer(self, item)
114
115        return type(self)(self._data[item], self._mask[item])
116
    def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
        """Coerce ``values`` to a ``(data, mask)`` ndarray pair; subclass hook."""
        raise AbstractMethodError(self)
119
120    def __setitem__(self, key, value) -> None:
121        _is_scalar = is_scalar(value)
122        if _is_scalar:
123            value = [value]
124        value, mask = self._coerce_to_array(value)
125
126        if _is_scalar:
127            value = value[0]
128            mask = mask[0]
129
130        key = check_array_indexer(self, key)
131        self._data[key] = value
132        self._mask[key] = mask
133
134    def __iter__(self):
135        for i in range(len(self)):
136            if self._mask[i]:
137                yield self.dtype.na_value
138            else:
139                yield self._data[i]
140
141    def __len__(self) -> int:
142        return len(self._data)
143
144    def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
145        return type(self)(~self._data, self._mask.copy())
146
    def to_numpy(
        self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is a not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
             Scalar missing value indicator to use in numpy array. Defaults
             to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Raises
        ------
        ValueError
            If missing values are present, `na_value` is left as ``pd.NA``,
            and `dtype` is not an object or string dtype (``pd.NA`` cannot
            be represented in other dtypes).

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values.
         Specify an appropriate 'na_value' for this dtype.

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            # pd.NA only fits in object/string arrays; anything else needs an
            # explicit na_value from the caller
            if (
                not is_object_dtype(dtype)
                and not is_string_dtype(dtype)
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data
231
    # higher than ndarray so numpy binary ops dispatch to us, not ndarray
    __array_priority__ = 1000

    def __array__(self, dtype=None) -> np.ndarray:
        """
        The array interface, return my values.
        We return an object array here to preserve our scalar values
        (``to_numpy`` defaults to object dtype when ``dtype`` is None).
        """
        return self.to_numpy(dtype=dtype)
240
    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.

        The boolean mask is forwarded to ``pa.array``; per the pyarrow API,
        True entries in the mask mark null values in the resulting Array.
        """
        import pyarrow as pa

        return pa.array(self._data, mask=self._mask, type=type)
248
    @property
    def _hasna(self) -> bool:
        """True if any entry is missing (i.e. the mask has any True value)."""
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..
        # NOTE(review): ndarray.any() actually returns np.bool_, not builtin
        # bool, despite the annotation; callers use it in boolean context only.
        return self._mask.any()
255
    def isna(self) -> np.ndarray:
        """Return the boolean mask of missing entries (the mask itself, not a copy)."""
        return self._mask
258
    @property
    def _na_value(self):
        """The missing-value sentinel of this array's dtype (``pd.NA``)."""
        return self.dtype.na_value
262
263    @property
264    def nbytes(self) -> int:
265        return self._data.nbytes + self._mask.nbytes
266
267    @classmethod
268    def _concat_same_type(
269        cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT]
270    ) -> BaseMaskedArrayT:
271        data = np.concatenate([x._data for x in to_concat])
272        mask = np.concatenate([x._mask for x in to_concat])
273        return cls(data, mask)
274
275    def take(
276        self: BaseMaskedArrayT,
277        indexer,
278        *,
279        allow_fill: bool = False,
280        fill_value: Optional[Scalar] = None,
281    ) -> BaseMaskedArrayT:
282        # we always fill with 1 internally
283        # to avoid upcasting
284        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
285        result = take(
286            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
287        )
288
289        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)
290
291        # if we are filling
292        # we only fill where the indexer is null
293        # not existing missing values
294        # TODO(jreback) what if we have a non-na float as a fill value?
295        if allow_fill and notna(fill_value):
296            fill_mask = np.asarray(indexer) == -1
297            result[fill_mask] = fill_value
298            mask = mask ^ fill_mask
299
300        return type(self)(result, mask, copy=False)
301
302    def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
303        data, mask = self._data, self._mask
304        data = data.copy()
305        mask = mask.copy()
306        return type(self)(data, mask, copy=False)
307
308    @doc(ExtensionArray.factorize)
309    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
310        arr = self._data
311        mask = self._mask
312
313        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)
314
315        # the hashtables don't handle all different types of bits
316        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
317        uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
318        return codes, uniques
319
320    def value_counts(self, dropna: bool = True) -> "Series":
321        """
322        Returns a Series containing counts of each unique value.
323
324        Parameters
325        ----------
326        dropna : bool, default True
327            Don't include counts of missing values.
328
329        Returns
330        -------
331        counts : Series
332
333        See Also
334        --------
335        Series.value_counts
336        """
337        from pandas import Index, Series
338        from pandas.arrays import IntegerArray
339
340        # compute counts on the data with no nans
341        data = self._data[~self._mask]
342        value_counts = Index(data).value_counts()
343
344        # TODO(extension)
345        # if we have allow Index to hold an ExtensionArray
346        # this is easier
347        index = value_counts.index._values.astype(object)
348
349        # if we want nans, count the mask
350        if dropna:
351            counts = value_counts._values
352        else:
353            counts = np.empty(len(value_counts) + 1, dtype="int64")
354            counts[:-1] = value_counts
355            counts[-1] = self._mask.sum()
356
357            index = Index(
358                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
359                dtype=object,
360            )
361
362        mask = np.zeros(len(counts), dtype="bool")
363        counts = IntegerArray(counts, mask)
364
365        return Series(counts, index=index)
366
367    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
368        data = self._data
369        mask = self._mask
370
371        if name in {"sum", "prod", "min", "max"}:
372            op = getattr(masked_reductions, name)
373            return op(data, mask, skipna=skipna, **kwargs)
374
375        # coerce to a nan-aware float if needed
376        # (we explicitly use NaN within reductions)
377        if self._hasna:
378            data = self.to_numpy("float64", na_value=np.nan)
379
380        op = getattr(nanops, "nan" + name)
381        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
382
383        if np.isnan(result):
384            return libmissing.NA
385
386        return result
387