1import operator
2
3import numpy as np
4import pytest
5
6from pandas.core.dtypes.common import is_bool_dtype
7
8import pandas as pd
9import pandas._testing as tm
10from pandas.core.sorting import nargsort
11
12from .base import BaseExtensionTests
13
14
15class BaseMethodsTests(BaseExtensionTests):
16    """Various Series and DataFrame methods."""
17
18    @pytest.mark.parametrize("dropna", [True, False])
19    def test_value_counts(self, all_data, dropna):
20        all_data = all_data[:10]
21        if dropna:
22            other = np.array(all_data[~all_data.isna()])
23        else:
24            other = all_data
25
26        result = pd.Series(all_data).value_counts(dropna=dropna).sort_index()
27        expected = pd.Series(other).value_counts(dropna=dropna).sort_index()
28
29        self.assert_series_equal(result, expected)
30
31    def test_value_counts_with_normalize(self, data):
32        # GH 33172
33        data = data[:10].unique()
34        values = np.array(data[~data.isna()])
35
36        result = (
37            pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index()
38        )
39
40        expected = pd.Series([1 / len(values)] * len(values), index=result.index)
41        self.assert_series_equal(result, expected)
42
43    def test_count(self, data_missing):
44        df = pd.DataFrame({"A": data_missing})
45        result = df.count(axis="columns")
46        expected = pd.Series([0, 1])
47        self.assert_series_equal(result, expected)
48
49    def test_series_count(self, data_missing):
50        # GH#26835
51        ser = pd.Series(data_missing)
52        result = ser.count()
53        expected = 1
54        assert result == expected
55
56    def test_apply_simple_series(self, data):
57        result = pd.Series(data).apply(id)
58        assert isinstance(result, pd.Series)
59
60    def test_argsort(self, data_for_sorting):
61        result = pd.Series(data_for_sorting).argsort()
62        expected = pd.Series(np.array([2, 0, 1], dtype=np.int64))
63        self.assert_series_equal(result, expected)
64
65    def test_argsort_missing_array(self, data_missing_for_sorting):
66        result = data_missing_for_sorting.argsort()
67        expected = np.array([2, 0, 1], dtype=np.dtype("int"))
68        # we don't care whether it's int32 or int64
69        result = result.astype("int64", casting="safe")
70        expected = expected.astype("int64", casting="safe")
71        tm.assert_numpy_array_equal(result, expected)
72
73    def test_argsort_missing(self, data_missing_for_sorting):
74        result = pd.Series(data_missing_for_sorting).argsort()
75        expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
76        self.assert_series_equal(result, expected)
77
78    def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
79        # GH 24382
80
81        # data_for_sorting -> [B, C, A] with A < B < C
82        assert data_for_sorting.argmax() == 1
83        assert data_for_sorting.argmin() == 2
84
85        # with repeated values -> first occurence
86        data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
87        assert data.argmax() == 3
88        assert data.argmin() == 0
89
90        # with missing values
91        # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
92        assert data_missing_for_sorting.argmax() == 0
93        assert data_missing_for_sorting.argmin() == 2
94
95    @pytest.mark.parametrize("method", ["argmax", "argmin"])
96    def test_argmin_argmax_empty_array(self, method, data):
97        # GH 24382
98        err_msg = "attempt to get"
99        with pytest.raises(ValueError, match=err_msg):
100            getattr(data[:0], method)()
101
102    @pytest.mark.parametrize("method", ["argmax", "argmin"])
103    def test_argmin_argmax_all_na(self, method, data, na_value):
104        # all missing with skipna=True is the same as emtpy
105        err_msg = "attempt to get"
106        data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
107        with pytest.raises(ValueError, match=err_msg):
108            getattr(data_na, method)()
109
110    @pytest.mark.parametrize(
111        "na_position, expected",
112        [
113            ("last", np.array([2, 0, 1], dtype=np.dtype("intp"))),
114            ("first", np.array([1, 2, 0], dtype=np.dtype("intp"))),
115        ],
116    )
117    def test_nargsort(self, data_missing_for_sorting, na_position, expected):
118        # GH 25439
119        result = nargsort(data_missing_for_sorting, na_position=na_position)
120        tm.assert_numpy_array_equal(result, expected)
121
122    @pytest.mark.parametrize("ascending", [True, False])
123    def test_sort_values(self, data_for_sorting, ascending, sort_by_key):
124        ser = pd.Series(data_for_sorting)
125        result = ser.sort_values(ascending=ascending, key=sort_by_key)
126        expected = ser.iloc[[2, 0, 1]]
127        if not ascending:
128            # GH 35922. Expect stable sort
129            if ser.nunique() == 2:
130                expected = ser.iloc[[0, 1, 2]]
131            else:
132                expected = ser.iloc[[1, 0, 2]]
133
134        self.assert_series_equal(result, expected)
135
136    @pytest.mark.parametrize("ascending", [True, False])
137    def test_sort_values_missing(
138        self, data_missing_for_sorting, ascending, sort_by_key
139    ):
140        ser = pd.Series(data_missing_for_sorting)
141        result = ser.sort_values(ascending=ascending, key=sort_by_key)
142        if ascending:
143            expected = ser.iloc[[2, 0, 1]]
144        else:
145            expected = ser.iloc[[0, 2, 1]]
146        self.assert_series_equal(result, expected)
147
148    @pytest.mark.parametrize("ascending", [True, False])
149    def test_sort_values_frame(self, data_for_sorting, ascending):
150        df = pd.DataFrame({"A": [1, 2, 1], "B": data_for_sorting})
151        result = df.sort_values(["A", "B"])
152        expected = pd.DataFrame(
153            {"A": [1, 1, 2], "B": data_for_sorting.take([2, 0, 1])}, index=[2, 0, 1]
154        )
155        self.assert_frame_equal(result, expected)
156
157    @pytest.mark.parametrize("box", [pd.Series, lambda x: x])
158    @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
159    def test_unique(self, data, box, method):
160        duplicated = box(data._from_sequence([data[0], data[0]]))
161
162        result = method(duplicated)
163
164        assert len(result) == 1
165        assert isinstance(result, type(data))
166        assert result[0] == duplicated[0]
167
168    @pytest.mark.parametrize("na_sentinel", [-1, -2])
169    def test_factorize(self, data_for_grouping, na_sentinel):
170        codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
171        expected_codes = np.array(
172            [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
173        )
174        expected_uniques = data_for_grouping.take([0, 4, 7])
175
176        tm.assert_numpy_array_equal(codes, expected_codes)
177        self.assert_extension_array_equal(uniques, expected_uniques)
178
179    @pytest.mark.parametrize("na_sentinel", [-1, -2])
180    def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
181        codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
182        codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
183
184        tm.assert_numpy_array_equal(codes_1, codes_2)
185        self.assert_extension_array_equal(uniques_1, uniques_2)
186        assert len(uniques_1) == len(pd.unique(uniques_1))
187        assert uniques_1.dtype == data_for_grouping.dtype
188
189    def test_factorize_empty(self, data):
190        codes, uniques = pd.factorize(data[:0])
191        expected_codes = np.array([], dtype=np.intp)
192        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
193
194        tm.assert_numpy_array_equal(codes, expected_codes)
195        self.assert_extension_array_equal(uniques, expected_uniques)
196
197    def test_fillna_copy_frame(self, data_missing):
198        arr = data_missing.take([1, 1])
199        df = pd.DataFrame({"A": arr})
200
201        filled_val = df.iloc[0, 0]
202        result = df.fillna(filled_val)
203
204        assert df.A.values is not result.A.values
205
206    def test_fillna_copy_series(self, data_missing):
207        arr = data_missing.take([1, 1])
208        ser = pd.Series(arr)
209
210        filled_val = ser[0]
211        result = ser.fillna(filled_val)
212
213        assert ser._values is not result._values
214        assert ser._values is arr
215
216    def test_fillna_length_mismatch(self, data_missing):
217        msg = "Length of 'value' does not match."
218        with pytest.raises(ValueError, match=msg):
219            data_missing.fillna(data_missing.take([1]))
220
221    def test_combine_le(self, data_repeated):
222        # GH 20825
223        # Test that combine works when doing a <= (le) comparison
224        orig_data1, orig_data2 = data_repeated(2)
225        s1 = pd.Series(orig_data1)
226        s2 = pd.Series(orig_data2)
227        result = s1.combine(s2, lambda x1, x2: x1 <= x2)
228        expected = pd.Series(
229            [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))]
230        )
231        self.assert_series_equal(result, expected)
232
233        val = s1.iloc[0]
234        result = s1.combine(val, lambda x1, x2: x1 <= x2)
235        expected = pd.Series([a <= val for a in list(orig_data1)])
236        self.assert_series_equal(result, expected)
237
238    def test_combine_add(self, data_repeated):
239        # GH 20825
240        orig_data1, orig_data2 = data_repeated(2)
241        s1 = pd.Series(orig_data1)
242        s2 = pd.Series(orig_data2)
243        result = s1.combine(s2, lambda x1, x2: x1 + x2)
244        with np.errstate(over="ignore"):
245            expected = pd.Series(
246                orig_data1._from_sequence(
247                    [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
248                )
249            )
250        self.assert_series_equal(result, expected)
251
252        val = s1.iloc[0]
253        result = s1.combine(val, lambda x1, x2: x1 + x2)
254        expected = pd.Series(
255            orig_data1._from_sequence([a + val for a in list(orig_data1)])
256        )
257        self.assert_series_equal(result, expected)
258
259    def test_combine_first(self, data):
260        # https://github.com/pandas-dev/pandas/issues/24147
261        a = pd.Series(data[:3])
262        b = pd.Series(data[2:5], index=[2, 3, 4])
263        result = a.combine_first(b)
264        expected = pd.Series(data[:5])
265        self.assert_series_equal(result, expected)
266
267    @pytest.mark.parametrize("frame", [True, False])
268    @pytest.mark.parametrize(
269        "periods, indices",
270        [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])],
271    )
272    def test_container_shift(self, data, frame, periods, indices):
273        # https://github.com/pandas-dev/pandas/issues/22386
274        subset = data[:5]
275        data = pd.Series(subset, name="A")
276        expected = pd.Series(subset.take(indices, allow_fill=True), name="A")
277
278        if frame:
279            result = data.to_frame(name="A").assign(B=1).shift(periods)
280            expected = pd.concat(
281                [expected, pd.Series([1] * 5, name="B").shift(periods)], axis=1
282            )
283            compare = self.assert_frame_equal
284        else:
285            result = data.shift(periods)
286            compare = self.assert_series_equal
287
288        compare(result, expected)
289
290    def test_shift_0_periods(self, data):
291        # GH#33856 shifting with periods=0 should return a copy, not same obj
292        result = data.shift(0)
293        assert data[0] != data[1]  # otherwise below is invalid
294        data[0] = data[1]
295        assert result[0] != result[1]  # i.e. not the same object/view
296
297    @pytest.mark.parametrize("periods", [1, -2])
298    def test_diff(self, data, periods):
299        data = data[:5]
300        if is_bool_dtype(data.dtype):
301            op = operator.xor
302        else:
303            op = operator.sub
304        try:
305            # does this array implement ops?
306            op(data, data)
307        except Exception:
308            pytest.skip(f"{type(data)} does not support diff")
309        s = pd.Series(data)
310        result = s.diff(periods)
311        expected = pd.Series(op(data, data.shift(periods)))
312        self.assert_series_equal(result, expected)
313
314        df = pd.DataFrame({"A": data, "B": [1.0] * 5})
315        result = df.diff(periods)
316        if periods == 1:
317            b = [np.nan, 0, 0, 0, 0]
318        else:
319            b = [0, 0, 0, np.nan, np.nan]
320        expected = pd.DataFrame({"A": expected, "B": b})
321        self.assert_frame_equal(result, expected)
322
323    @pytest.mark.parametrize(
324        "periods, indices",
325        [[-4, [-1, -1]], [-1, [1, -1]], [0, [0, 1]], [1, [-1, 0]], [4, [-1, -1]]],
326    )
327    def test_shift_non_empty_array(self, data, periods, indices):
328        # https://github.com/pandas-dev/pandas/issues/23911
329        subset = data[:2]
330        result = subset.shift(periods)
331        expected = subset.take(indices, allow_fill=True)
332        self.assert_extension_array_equal(result, expected)
333
334    @pytest.mark.parametrize("periods", [-4, -1, 0, 1, 4])
335    def test_shift_empty_array(self, data, periods):
336        # https://github.com/pandas-dev/pandas/issues/23911
337        empty = data[:0]
338        result = empty.shift(periods)
339        expected = empty
340        self.assert_extension_array_equal(result, expected)
341
342    def test_shift_zero_copies(self, data):
343        result = data.shift(0)
344        assert result is not data
345
346        result = data[:0].shift(2)
347        assert result is not data
348
349    def test_shift_fill_value(self, data):
350        arr = data[:4]
351        fill_value = data[0]
352        result = arr.shift(1, fill_value=fill_value)
353        expected = data.take([0, 0, 1, 2])
354        self.assert_extension_array_equal(result, expected)
355
356        result = arr.shift(-2, fill_value=fill_value)
357        expected = data.take([2, 3, 0, 0])
358        self.assert_extension_array_equal(result, expected)
359
360    def test_not_hashable(self, data):
361        # We are in general mutable, so not hashable
362        with pytest.raises(TypeError, match="unhashable type"):
363            hash(data)
364
365    def test_hash_pandas_object_works(self, data, as_frame):
366        # https://github.com/pandas-dev/pandas/issues/23066
367        data = pd.Series(data)
368        if as_frame:
369            data = data.to_frame()
370        a = pd.util.hash_pandas_object(data)
371        b = pd.util.hash_pandas_object(data)
372        self.assert_equal(a, b)
373
374    def test_searchsorted(self, data_for_sorting, as_series):
375        b, c, a = data_for_sorting
376        arr = type(data_for_sorting)._from_sequence([a, b, c])
377
378        if as_series:
379            arr = pd.Series(arr)
380        assert arr.searchsorted(a) == 0
381        assert arr.searchsorted(a, side="right") == 1
382
383        assert arr.searchsorted(b) == 1
384        assert arr.searchsorted(b, side="right") == 2
385
386        assert arr.searchsorted(c) == 2
387        assert arr.searchsorted(c, side="right") == 3
388
389        result = arr.searchsorted(arr.take([0, 2]))
390        expected = np.array([0, 2], dtype=np.intp)
391
392        tm.assert_numpy_array_equal(result, expected)
393
394        # sorter
395        sorter = np.array([1, 2, 0])
396        assert data_for_sorting.searchsorted(a, sorter=sorter) == 0
397
398    def test_where_series(self, data, na_value, as_frame):
399        assert data[0] != data[1]
400        cls = type(data)
401        a, b = data[:2]
402
403        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
404        cond = np.array([True, True, False, False])
405
406        if as_frame:
407            ser = ser.to_frame(name="a")
408            cond = cond.reshape(-1, 1)
409
410        result = ser.where(cond)
411        expected = pd.Series(
412            cls._from_sequence([a, a, na_value, na_value], dtype=data.dtype)
413        )
414
415        if as_frame:
416            expected = expected.to_frame(name="a")
417        self.assert_equal(result, expected)
418
419        # array other
420        cond = np.array([True, False, True, True])
421        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
422        if as_frame:
423            other = pd.DataFrame({"a": other})
424            cond = pd.DataFrame({"a": cond})
425        result = ser.where(cond, other)
426        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
427        if as_frame:
428            expected = expected.to_frame(name="a")
429        self.assert_equal(result, expected)
430
431    @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
432    def test_repeat(self, data, repeats, as_series, use_numpy):
433        arr = type(data)._from_sequence(data[:3], dtype=data.dtype)
434        if as_series:
435            arr = pd.Series(arr)
436
437        result = np.repeat(arr, repeats) if use_numpy else arr.repeat(repeats)
438
439        repeats = [repeats] * 3 if isinstance(repeats, int) else repeats
440        expected = [x for x, n in zip(arr, repeats) for _ in range(n)]
441        expected = type(data)._from_sequence(expected, dtype=data.dtype)
442        if as_series:
443            expected = pd.Series(expected, index=arr.index.repeat(repeats))
444
445        self.assert_equal(result, expected)
446
447    @pytest.mark.parametrize(
448        "repeats, kwargs, error, msg",
449        [
450            (2, {"axis": 1}, ValueError, "axis"),
451            (-1, {}, ValueError, "negative"),
452            ([1, 2], {}, ValueError, "shape"),
453            (2, {"foo": "bar"}, TypeError, "'foo'"),
454        ],
455    )
456    def test_repeat_raises(self, data, repeats, kwargs, error, msg, use_numpy):
457        with pytest.raises(error, match=msg):
458            if use_numpy:
459                np.repeat(data, repeats, **kwargs)
460            else:
461                data.repeat(repeats, **kwargs)
462
463    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
464    def test_equals(self, data, na_value, as_series, box):
465        data2 = type(data)._from_sequence([data[0]] * len(data), dtype=data.dtype)
466        data_na = type(data)._from_sequence([na_value] * len(data), dtype=data.dtype)
467
468        data = tm.box_expected(data, box, transpose=False)
469        data2 = tm.box_expected(data2, box, transpose=False)
470        data_na = tm.box_expected(data_na, box, transpose=False)
471
472        # we are asserting with `is True/False` explicitly, to test that the
473        # result is an actual Python bool, and not something "truthy"
474
475        assert data.equals(data) is True
476        assert data.equals(data.copy()) is True
477
478        # unequal other data
479        assert data.equals(data2) is False
480        assert data.equals(data_na) is False
481
482        # different length
483        assert data[:2].equals(data[:3]) is False
484
485        # emtpy are equal
486        assert data[:0].equals(data[:0]) is True
487
488        # other types
489        assert data.equals(None) is False
490        assert data[[0]].equals(data[0]) is False
491