from __future__ import annotations

from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, TypeVar, Union

import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas._typing import Scalar
from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly, doc

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
    is_integer,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import nanops
from pandas.core.algorithms import factorize_array, take
from pandas.core.array_algos import masked_reductions
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
    from pandas import Series


# TypeVar so methods like `take`/`copy` return the concrete subclass type.
BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")


class BaseMaskedDtype(ExtensionDtype):
    """
    Base class for dtypes for BaseMaskedArray subclasses.
    """

    name: str
    base = None
    # The scalar numpy type (e.g. np.int64) backing this dtype; set by subclasses.
    type: Type

    # The scalar used to represent missing values to users (pd.NA).
    na_value = libmissing.NA

    @cache_readonly
    def numpy_dtype(self) -> np.dtype:
        """ Return an instance of our numpy dtype """
        return np.dtype(self.type)

    @cache_readonly
    def kind(self) -> str:
        # numpy type-kind character of the backing dtype (e.g. "i", "u", "b", "f")
        return self.numpy_dtype.kind

    @cache_readonly
    def itemsize(self) -> int:
        """ Return the number of bytes in this dtype """
        return self.numpy_dtype.itemsize

    @classmethod
    def construct_array_type(cls) -> Type[BaseMaskedArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise NotImplementedError


class BaseMaskedArray(OpsMixin, ExtensionArray):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: Scalar

    def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
        """
        Parameters
        ----------
        values : np.ndarray
            1D array of the underlying data (dtype validation is the
            subclass's responsibility).
        mask : np.ndarray
            1D boolean array; True marks a missing (NA) position.
        copy : bool, default False
            Whether to copy both ``values`` and ``mask`` instead of storing
            them by reference.
        """
        # values is supposed to already be validated in the subclass
        if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
            raise TypeError(
                "mask should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.ndim != 1:
            raise ValueError("values must be a 1D array")
        if mask.ndim != 1:
            raise ValueError("mask must be a 1D array")

        if copy:
            values = values.copy()
            mask = mask.copy()

        self._data = values
        self._mask = mask

    @property
    def dtype(self) -> BaseMaskedDtype:
        raise AbstractMethodError(self)

    def __getitem__(
        self, item: Union[int, slice, np.ndarray]
    ) -> Union[BaseMaskedArray, Any]:
        # Scalar indexing: return pd.NA for masked positions, else the raw value.
        if is_integer(item):
            if self._mask[item]:
                return self.dtype.na_value
            return self._data[item]

        item = check_array_indexer(self, item)

        # Array/slice indexing: index data and mask in parallel, rewrap.
        return type(self)(self._data[item], self._mask[item])

    def _coerce_to_array(self, values) -> Tuple[np.ndarray, np.ndarray]:
        # Subclass hook: convert arbitrary input to a (values, mask) pair.
        raise AbstractMethodError(self)

    def __setitem__(self, key, value) -> None:
        # Coerce via a length-1 list so scalar and array assignment share one path.
        _is_scalar = is_scalar(value)
        if _is_scalar:
            value = [value]
        value, mask = self._coerce_to_array(value)

        if _is_scalar:
            value = value[0]
            mask = mask[0]

        key = check_array_indexer(self, key)
        self._data[key] = value
        self._mask[key] = mask

    def __iter__(self):
        # Yield pd.NA for masked positions, raw values otherwise.
        for i in range(len(self)):
            if self._mask[i]:
                yield self.dtype.na_value
            else:
                yield self._data[i]

    def __len__(self) -> int:
        return len(self._data)

    def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        # ~ inverts the data; NA positions stay NA (mask is preserved, copied).
        return type(self)(~self._data, self._mask.copy())

    def to_numpy(
        self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is a not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
            Scalar missing value indicator to use in numpy array. Defaults
            to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to 'bool'-dtype NumPy array with missing values.
        Specify an appropriate 'na_value' for this dtype.

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            # pd.NA can only live in object/string arrays; any other target
            # dtype requires the caller to supply a concrete na_value.
            if (
                not is_object_dtype(dtype)
                and not is_string_dtype(dtype)
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data

    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us

    def __array__(self, dtype=None) -> np.ndarray:
        """
        the array interface, return my values
        We return an object array here to preserve our scalar values
        """
        return self.to_numpy(dtype=dtype)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        # pyarrow interprets mask=True as null, matching our convention.
        return pa.array(self._data, mask=self._mask, type=type)

    @property
    def _hasna(self) -> bool:
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..
        return self._mask.any()

    def isna(self) -> np.ndarray:
        # NOTE(review): returns the mask itself, not a copy — callers mutating
        # the result would mutate this array's mask.
        return self._mask

    @property
    def _na_value(self):
        return self.dtype.na_value

    @property
    def nbytes(self) -> int:
        # Total memory: data buffer plus the boolean mask buffer.
        return self._data.nbytes + self._mask.nbytes

    @classmethod
    def _concat_same_type(
        cls: Type[BaseMaskedArrayT], to_concat: Sequence[BaseMaskedArrayT]
    ) -> BaseMaskedArrayT:
        # Concatenate data and masks independently, then rewrap.
        data = np.concatenate([x._data for x in to_concat])
        mask = np.concatenate([x._mask for x in to_concat])
        return cls(data, mask)

    def take(
        self: BaseMaskedArrayT,
        indexer,
        *,
        allow_fill: bool = False,
        fill_value: Optional[Scalar] = None,
    ) -> BaseMaskedArrayT:
        # we always fill with 1 internally
        # to avoid upcasting
        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
        result = take(
            self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill
        )

        # -1 positions in the indexer come back masked (fill_value=True).
        mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill)

        # if we are filling
        # we only fill where the indexer is null
        # not existing missing values
        # TODO(jreback) what if we have a non-na float as a fill value?
        if allow_fill and notna(fill_value):
            fill_mask = np.asarray(indexer) == -1
            result[fill_mask] = fill_value
            # XOR clears the mask exactly at filled positions, leaving
            # pre-existing missing values untouched.
            mask = mask ^ fill_mask

        return type(self)(result, mask, copy=False)

    def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        # Deep-copy both buffers so the result shares no memory with self.
        data, mask = self._data, self._mask
        data = data.copy()
        mask = mask.copy()
        return type(self)(data, mask, copy=False)

    @doc(ExtensionArray.factorize)
    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
        arr = self._data
        mask = self._mask

        codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask)

        # the hashtables don't handle all different types of bits
        uniques = uniques.astype(self.dtype.numpy_dtype, copy=False)
        # uniques never contain NA, so an all-False mask is correct.
        uniques = type(self)(uniques, np.zeros(len(uniques), dtype=bool))
        return codes, uniques

    def value_counts(self, dropna: bool = True) -> "Series":
        """
        Returns a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import Index, Series
        from pandas.arrays import IntegerArray

        # compute counts on the data with no nans
        data = self._data[~self._mask]
        value_counts = Index(data).value_counts()

        # TODO(extension)
        # if we have allow Index to hold an ExtensionArray
        # this is easier
        index = value_counts.index._values.astype(object)

        # if we want nans, count the mask
        if dropna:
            counts = value_counts._values
        else:
            # Append one extra slot holding the number of masked (NA) entries.
            counts = np.empty(len(value_counts) + 1, dtype="int64")
            counts[:-1] = value_counts
            counts[-1] = self._mask.sum()

            index = Index(
                np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]),
                dtype=object,
            )

        # Counts themselves are never missing, hence the all-False mask.
        mask = np.zeros(len(counts), dtype="bool")
        counts = IntegerArray(counts, mask)

        return Series(counts, index=index)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Dispatch a named reduction ("sum", "mean", ...) over this array.

        sum/prod/min/max go through mask-aware implementations; everything
        else falls back to the nan-aware ``nanops`` functions after coercing
        missing values to NaN.
        """
        data = self._data
        mask = self._mask

        if name in {"sum", "prod", "min", "max"}:
            op = getattr(masked_reductions, name)
            return op(data, mask, skipna=skipna, **kwargs)

        # coerce to a nan-aware float if needed
        # (we explicitly use NaN within reductions)
        if self._hasna:
            data = self.to_numpy("float64", na_value=np.nan)

        op = getattr(nanops, "nan" + name)
        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

        # A NaN result signals an all-NA reduction -> surface as pd.NA.
        if np.isnan(result):
            return libmissing.NA

        return result