import operator

import numpy as np
import pytest

from pandas.core.dtypes.common import is_bool_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.sorting import nargsort

from .base import BaseExtensionTests


class BaseMethodsTests(BaseExtensionTests):
    """Various Series and DataFrame methods."""

    @pytest.mark.parametrize("dropna", [True, False])
    def test_value_counts(self, all_data, dropna):
        """value_counts on the extension data matches value_counts on `other`.

        With ``dropna=True`` the reference is a NumPy array of the non-missing
        values; otherwise it is the extension data itself.
        """
        all_data = all_data[:10]
        if dropna:
            other = np.array(all_data[~all_data.isna()])
        else:
            other = all_data

        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()

        self.assert_series_equal(result, expected)

    def test_value_counts_with_normalize(self, data):
        """Normalized counts of unique values are all ``1 / n_non_missing``."""
        # GH 33172
        data = data[:10].unique()
        values = np.array(data[~data.isna()])

        result = (
            pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index()
        )

        expected = pd.Series([1 / len(values)] * len(values), index=result.index)
        self.assert_series_equal(result, expected)

    def test_count(self, data_missing):
        """Row-wise DataFrame.count gives 0 for the first row and 1 for the second."""
        df = pd.DataFrame({"A": data_missing})
        result = df.count(axis="columns")
        expected = pd.Series([0, 1])
        self.assert_series_equal(result, expected)

    def test_series_count(self, data_missing):
        """Series.count on `data_missing` returns 1."""
        # GH#26835
        ser = pd.Series(data_missing)
        result = ser.count()
        expected = 1
        assert result == expected

    def test_apply_simple_series(self, data):
        """Series.apply with a plain Python callable returns a Series."""
        result = pd.Series(data).apply(id)
        assert isinstance(result, pd.Series)

    def test_argsort(self, data_for_sorting):
        """Series.argsort on `data_for_sorting` yields positions [2, 0, 1]."""
        result = pd.Series(data_for_sorting).argsort()
        expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
        self.assert_series_equal(result, expected)

    def test_argsort_missing_array(self, data_missing_for_sorting):
        """Array-level argsort with a missing value yields [2, 0, 1]."""
        result = data_missing_for_sorting.argsort()
        expected = np.array([2, 0, 1], dtype=np.dtype("int"))
        # we don't care whether it's int32 or int64
        result = result.astype("int64", casting="safe")
        expected = expected.astype("int64", casting="safe")
        tm.assert_numpy_array_equal(result, expected)

    def test_argsort_missing(self, data_missing_for_sorting):
        """Series.argsort with a missing value yields [1, -1, 0]."""
        result = pd.Series(data_missing_for_sorting).argsort()
        expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
        self.assert_series_equal(result, expected)

    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
        """argmin/argmax return the first position of the extreme value.

        NOTE(review): the ``na_value`` fixture is requested but not used in
        the body; it is kept so the signature stays unchanged.
        """
        # GH 24382

        # data_for_sorting -> [B, C, A] with A < B < C
        assert data_for_sorting.argmax() == 1
        assert data_for_sorting.argmin() == 2

        # with repeated values -> first occurrence
        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
        assert data.argmax() == 3
        assert data.argmin() == 0

        # with missing values
        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
        assert data_missing_for_sorting.argmax() == 0
        assert data_missing_for_sorting.argmin() == 2

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_empty_array(self, method, data):
        """argmin/argmax on an empty array raise ValueError."""
        # GH 24382
        err_msg = "attempt to get"
        with pytest.raises(ValueError, match=err_msg):
            getattr(data[:0], method)()

    @pytest.mark.parametrize("method", ["argmax", "argmin"])
    def test_argmin_argmax_all_na(self, method, data, na_value):
        """argmin/argmax on an all-missing array raise ValueError."""
        # all missing with skipna=True is the same as empty
        err_msg = "attempt to get"
        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
        with pytest.raises(ValueError, match=err_msg):
            getattr(data_na, method)()

    @pytest.mark.parametrize(
        "na_position, expected",
        [
            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
        ],
    )
    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
        """nargsort places the missing value according to `na_position`."""
        # GH 25439
        result = nargsort(data_missing_for_sorting, na_position=na_position)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
        """Series.sort_values orders `data_for_sorting` in both directions."""
        ser = pd.Series(data_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        expected = ser.iloc[[2, 0, 1]]
        if not ascending:
            # GH 35922. Expect stable sort
            if ser.nunique() == 2:
                expected = ser.iloc[[0, 1, 2]]
            else:
                expected = ser.iloc[[1, 0, 2]]

        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_missing(
        self, data_missing_for_sorting, ascending, sort_by_key
    ):
        """Series.sort_values keeps the missing value last in both directions."""
        ser = pd.Series(data_missing_for_sorting)
        result = ser.sort_values(ascending=ascending, key=sort_by_key)
        if ascending:
            expected = ser.iloc[[2, 0, 1]]
        else:
            expected = ser.iloc[[0, 2, 1]]
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("ascending", [True, False])
    def test_sort_values_frame(self, data_for_sorting, ascending):
        """DataFrame.sort_values on ["A", "B"] honours `ascending`.

        The ``ascending`` parameter is forwarded to ``sort_values`` so that the
        descending parametrization checks the descending order rather than
        repeating the ascending check.
        """
        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
        result = df.sort_values(["A", "B"], ascending=ascending)
        if ascending:
            # A == 1 rows first (ordered by B: row 2, then row 0), then A == 2
            order = [2, 0, 1]
            a_sorted = [1, 1, 2]
        else:
            # A == 2 row first, then A == 1 rows by descending B (row 0, row 2)
            order = [1, 0, 2]
            a_sorted = [2, 1, 1]
        expected = pd.DataFrame(
            {"A": a_sorted, "B": data_for_sorting.take(order)}, index=order
        )
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
    def test_unique(self, data, box, method):
        """unique on two equal values returns one value of the same array type."""
        duplicated = box(data._from_sequence([data[0], data[0]]))

        result = method(duplicated)

        assert len(result) == 1
        assert isinstance(result, type(data))
        assert result[0] == duplicated[0]

    @pytest.mark.parametrize("na_sentinel", [-1, -2])
    def test_factorize(self, data_for_grouping, na_sentinel):
        """pd.factorize returns the expected codes and uniques, using `na_sentinel`."""
        codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        expected_codes = np.array(
            [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
        )
        expected_uniques = data_for_grouping.take([0, 4, 7])

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("na_sentinel", [-1, -2])
    def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
        """pd.factorize and the array's own factorize method agree."""
        codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
        codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)

        tm.assert_numpy_array_equal(codes_1, codes_2)
        self.assert_extension_array_equal(uniques_1, uniques_2)
        assert len(uniques_1) == len(pd.unique(uniques_1))
        assert uniques_1.dtype == data_for_grouping.dtype

    def test_factorize_empty(self, data):
        """Factorizing an empty array gives empty codes and empty uniques."""
        codes, uniques = pd.factorize(data[:0])
        expected_codes = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(codes, expected_codes)
        self.assert_extension_array_equal(uniques, expected_uniques)

    def test_fillna_copy_frame(self, data_missing):
        """DataFrame.fillna returns values that are a different object."""
        arr = data_missing.take([1, 1])
        df = pd.DataFrame({"A": arr})

        filled_val = df.iloc[0, 0]
        result = df.fillna(filled_val)

        assert df.A.values is not result.A.values

    def test_fillna_copy_series(self, data_missing):
        """Series.fillna returns new values and leaves the original array in place."""
        arr = data_missing.take([1, 1])
        ser = pd.Series(arr)

        filled_val = ser[0]
        result = ser.fillna(filled_val)

        assert ser._values is not result._values
        assert ser._values is arr

    def test_fillna_length_mismatch(self, data_missing):
        """fillna with an array of the wrong length raises ValueError."""
        msg = "Length of 'value' does not match."
        with pytest.raises(ValueError, match=msg):
            data_missing.fillna(data_missing.take([1]))

    def test_combine_le(self, data_repeated):
        """Series.combine with ``<=`` works against a Series and a scalar."""
        # GH 20825
        # Test that combine works when doing a <= (le) comparison
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
        expected = pd.Series(
            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
        )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 <= x2)
        expected = pd.Series([a <= val for a in list(orig_data1)])
        self.assert_series_equal(result, expected)

    def test_combine_add(self, data_repeated):
        """Series.combine with ``+`` works against a Series and a scalar."""
        # GH 20825
        orig_data1, orig_data2 = data_repeated(2)
        s1 = pd.Series(orig_data1)
        s2 = pd.Series(orig_data2)
        result = s1.combine(s2, lambda x1, x2: x1 + x2)
        # overflow warnings from the element-wise reference sum are ignored
        with np.errstate(over="ignore"):
            expected = pd.Series(
                orig_data1._from_sequence(
                    [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
                )
            )
        self.assert_series_equal(result, expected)

        val = s1.iloc[0]
        result = s1.combine(val, lambda x1, x2: x1 + x2)
        expected = pd.Series(
            orig_data1._from_sequence([a + val for a in list(orig_data1)])
        )
        self.assert_series_equal(result, expected)

    def test_combine_first(self, data):
        """combine_first of two overlapping Series reproduces the first five values."""
        # https://github.com/pandas-dev/pandas/issues/24147
        a = pd.Series(data[:3])
        b = pd.Series(data[2:5], index=[2, 3, 4])
        result = a.combine_first(b)
        expected = pd.Series(data[:5])
        self.assert_series_equal(result, expected)

    @pytest.mark.parametrize("frame", [True, False])
    @pytest.mark.parametrize(
        "periods, indices",
        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
    )
    def test_container_shift(self, data, frame, periods, indices):
        """Series/DataFrame.shift matches ``take(indices, allow_fill=True)``."""
        # https://github.com/pandas-dev/pandas/issues/22386
        subset = data[:5]
        data = pd.Series(subset, name="A")
        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")

        if frame:
            result = data.to_frame(name="A").assign(B=1).shift(periods)
            expected = pd.concat(
                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
            )
            compare = self.assert_frame_equal
        else:
            result = data.shift(periods)
            compare = self.assert_series_equal

        compare(result, expected)

    def test_shift_0_periods(self, data):
        """Mutating the original after ``shift(0)`` must not change the result."""
        # GH#33856 shifting with periods=0 should return a copy, not same obj
        result = data.shift(0)
        assert data[0] != data[1]  # otherwise below is invalid
        data[0] = data[1]
        assert result[0] != result[1]  # i.e. not the same object/view

    @pytest.mark.parametrize("periods", [1, -2])
    def test_diff(self, data, periods):
        """Series.diff and DataFrame.diff match ``op(data, data.shift(periods))``.

        ``op`` is xor for boolean dtypes and subtraction otherwise. The test is
        skipped when the array does not support that operation.
        """
        data = data[:5]
        if is_bool_dtype(data.dtype):
            op = operator.xor
        else:
            op = operator.sub
        try:
            # does this array implement ops?
            op(data, data)
        except Exception:
            # deliberately broad: any failure here means the op is unsupported
            pytest.skip(f"{type(data)} does not support diff")
        s = pd.Series(data)
        result = s.diff(periods)
        expected = pd.Series(op(data, data.shift(periods)))
        self.assert_series_equal(result, expected)

        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
        result = df.diff(periods)
        if periods == 1:
            b = [np.nan, 0, 0, 0, 0]
        else:
            b = [0, 0, 0, np.nan, np.nan]
        expected = pd.DataFrame({"A": expected, "B": b})
        self.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "periods, indices",
        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
    )
    def test_shift_non_empty_array(self, data, periods, indices):
        """Array-level shift matches ``take(indices, allow_fill=True)``."""
        # https://github.com/pandas-dev/pandas/issues/23911
        subset = data[:2]
        result = subset.shift(periods)
        expected = subset.take(indices, allow_fill=True)
        self.assert_extension_array_equal(result, expected)

    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
    def test_shift_empty_array(self, data, periods):
        """Shifting an empty array returns an equal empty array."""
        # https://github.com/pandas-dev/pandas/issues/23911
        empty = data[:0]
        result = empty.shift(periods)
        expected = empty
        self.assert_extension_array_equal(result, expected)

    def test_shift_zero_copies(self, data):
        """shift returns a new object for ``periods=0`` and for an empty array."""
        result = data.shift(0)
        assert result is not data

        # Compare against the empty slice that was actually shifted; comparing
        # against the full `data` would be trivially true.
        empty = data[:0]
        result = empty.shift(2)
        assert result is not empty

    def test_shift_fill_value(self, data):
        """shift fills the vacated positions with the given `fill_value`."""
        arr = data[:4]
        fill_value = data[0]
        result = arr.shift(1, fill_value=fill_value)
        expected = data.take([0, 0, 1, 2])
        self.assert_extension_array_equal(result, expected)

        result = arr.shift(-2, fill_value=fill_value)
        expected = data.take([2, 3, 0, 0])
        self.assert_extension_array_equal(result, expected)

    def test_not_hashable(self, data):
        """Calling ``hash`` on the array raises TypeError."""
        # We are in general mutable, so not hashable
        with pytest.raises(TypeError, match="unhashable type"):
            hash(data)

    def test_hash_pandas_object_works(self, data, as_frame):
        """hash_pandas_object gives equal results on repeated calls."""
        # https://github.com/pandas-dev/pandas/issues/23066
        data = pd.Series(data)
        if as_frame:
            data = data.to_frame()
        a = pd.util.hash_pandas_object(data)
        b = pd.util.hash_pandas_object(data)
        self.assert_equal(a, b)

    def test_searchsorted(self, data_for_sorting, as_series):
        """searchsorted finds scalar and array values, with and without `sorter`."""
        b, c, a = data_for_sorting
        arr = type(data_for_sorting)._from_sequence([a, b, c])

        if as_series:
            arr = pd.Series(arr)
        assert arr.searchsorted(a) == 0
        assert arr.searchsorted(a, side="right") == 1

        assert arr.searchsorted(b) == 1
        assert arr.searchsorted(b, side="right") == 2

        assert arr.searchsorted(c) == 2
        assert arr.searchsorted(c, side="right") == 3

        result = arr.searchsorted(arr.take([0, 2]))
        expected = np.array([0, 2], dtype=np.intp)

        tm.assert_numpy_array_equal(result, expected)

        # sorter
        sorter = np.array([1, 2, 0])
        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0

    def test_where_series(self, data, na_value, as_frame):
        """where masks with `na_value` by default and with `other` when given."""
        assert data[0] != data[1]
        cls = type(data)
        a, b = data[:2]

        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
        cond = np.array([True, True, False, False])

        if as_frame:
            ser = ser.to_frame(name="a")
            cond = cond.reshape(-1, 1)

        result = ser.where(cond)
        expected = pd.Series(
            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
        )

        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

        # array other
        cond = np.array([True, False, True, True])
        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
        if as_frame:
            other = pd.DataFrame({"a": other})
            cond = pd.DataFrame({"a": cond})
        result = ser.where(cond, other)
        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
        if as_frame:
            expected = expected.to_frame(name="a")
        self.assert_equal(result, expected)

    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
    def test_repeat(self, data, repeats, as_series, use_numpy):
        """repeat (method or np.repeat) matches an element-wise Python repeat."""
        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
        if as_series:
            arr = pd.Series(arr)

        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)

        # broadcast a scalar repeat count to one count per element
        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
        expected = type(data)._from_sequence(expected, dtype=data.dtype)
        if as_series:
            expected = pd.Series(expected, index=arr.index.repeat(repeats))

        self.assert_equal(result, expected)

    @pytest.mark.parametrize(
        "repeats, kwargs, error, msg",
        [
            (2, {"axis": 1}, ValueError, "axis"),
            (-1, {}, ValueError, "negative"),
            ([1, 2], {}, ValueError, "shape"),
            (2, {"foo": "bar"}, TypeError, "'foo'"),
        ],
    )
    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
        """Invalid repeat arguments raise the expected error and message."""
        with pytest.raises(error, match=msg):
            if use_numpy:
                np.repeat(data, repeats, **kwargs)
            else:
                data.repeat(repeats, **kwargs)

    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
    def test_equals(self, data, na_value, as_series, box):
        """equals returns a real Python bool for equal and unequal inputs.

        NOTE(review): the ``as_series`` fixture is requested but not used in
        the body; it is kept so the signature stays unchanged.
        """
        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)

        data = tm.box_expected(data, box, transpose=False)
        data2 = tm.box_expected(data2, box, transpose=False)
        data_na = tm.box_expected(data_na, box, transpose=False)

        # we are asserting with `is True/False` explicitly, to test that the
        # result is an actual Python bool, and not something "truthy"

        assert data.equals(data) is True
        assert data.equals(data.copy()) is True

        # unequal other data
        assert data.equals(data2) is False
        assert data.equals(data_na) is False

        # different length
        assert data[:2].equals(data[:3]) is False

        # empty are equal
        assert data[:0].equals(data[:0]) is True

        # other types
        assert data.equals(None) is False
        assert data[[0]].equals(data[0]) is False