1import numpy as np
2import pytest
3
4from pandas.core.dtypes.base import registry as ea_registry
5from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype
6
7from pandas import (
8    Categorical,
9    DataFrame,
10    Index,
11    Interval,
12    NaT,
13    Period,
14    PeriodIndex,
15    Series,
16    Timestamp,
17    date_range,
18    notna,
19    period_range,
20)
21import pandas._testing as tm
22from pandas.core.arrays import SparseArray
23
24
25class TestDataFrameSetItem:
26    @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
27    def test_setitem_dtype(self, dtype, float_frame):
28        arr = np.random.randn(len(float_frame))
29
30        float_frame[dtype] = np.array(arr, dtype=dtype)
31        assert float_frame[dtype].dtype.name == dtype
32
33    def test_setitem_list_not_dataframe(self, float_frame):
34        data = np.random.randn(len(float_frame), 2)
35        float_frame[["A", "B"]] = data
36        tm.assert_almost_equal(float_frame[["A", "B"]].values, data)
37
38    def test_setitem_error_msmgs(self):
39
40        # GH 7432
41        df = DataFrame(
42            {"bar": [1, 2, 3], "baz": ["d", "e", "f"]},
43            index=Index(["a", "b", "c"], name="foo"),
44        )
45        ser = Series(
46            ["g", "h", "i", "j"],
47            index=Index(["a", "b", "c", "a"], name="foo"),
48            name="fiz",
49        )
50        msg = "cannot reindex from a duplicate axis"
51        with pytest.raises(ValueError, match=msg):
52            df["newcol"] = ser
53
54        # GH 4107, more descriptive error message
55        df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
56
57        msg = "incompatible index of inserted column with frame index"
58        with pytest.raises(TypeError, match=msg):
59            df["gr"] = df.groupby(["b", "c"]).count()
60
61    def test_setitem_benchmark(self):
62        # from the vb_suite/frame_methods/frame_insert_columns
63        N = 10
64        K = 5
65        df = DataFrame(index=range(N))
66        new_col = np.random.randn(N)
67        for i in range(K):
68            df[i] = new_col
69        expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
70        tm.assert_frame_equal(df, expected)
71
72    def test_setitem_different_dtype(self):
73        df = DataFrame(
74            np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
75        )
76        df.insert(0, "foo", df["a"])
77        df.insert(2, "bar", df["c"])
78
79        # diff dtype
80
81        # new item
82        df["x"] = df["a"].astype("float32")
83        result = df.dtypes
84        expected = Series(
85            [np.dtype("float64")] * 5 + [np.dtype("float32")],
86            index=["foo", "c", "bar", "b", "a", "x"],
87        )
88        tm.assert_series_equal(result, expected)
89
90        # replacing current (in different block)
91        df["a"] = df["a"].astype("float32")
92        result = df.dtypes
93        expected = Series(
94            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
95            index=["foo", "c", "bar", "b", "a", "x"],
96        )
97        tm.assert_series_equal(result, expected)
98
99        df["y"] = df["a"].astype("int32")
100        result = df.dtypes
101        expected = Series(
102            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
103            index=["foo", "c", "bar", "b", "a", "x", "y"],
104        )
105        tm.assert_series_equal(result, expected)
106
107    def test_setitem_empty_columns(self):
108        # GH 13522
109        df = DataFrame(index=["A", "B", "C"])
110        df["X"] = df.index
111        df["X"] = ["x", "y", "z"]
112        exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
113        tm.assert_frame_equal(df, exp)
114
115    def test_setitem_dt64_index_empty_columns(self):
116        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
117        df = DataFrame(index=np.arange(len(rng)))
118
119        df["A"] = rng
120        assert df["A"].dtype == np.dtype("M8[ns]")
121
122    def test_setitem_timestamp_empty_columns(self):
123        # GH#19843
124        df = DataFrame(index=range(3))
125        df["now"] = Timestamp("20130101", tz="UTC")
126
127        expected = DataFrame(
128            [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
129        )
130        tm.assert_frame_equal(df, expected)
131
132    def test_setitem_wrong_length_categorical_dtype_raises(self):
133        # GH#29523
134        cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
135        df = DataFrame(range(10), columns=["bar"])
136
137        msg = (
138            rf"Length of values \({len(cat)}\) "
139            rf"does not match length of index \({len(df)}\)"
140        )
141        with pytest.raises(ValueError, match=msg):
142            df["foo"] = cat
143
144    def test_setitem_with_sparse_value(self):
145        # GH#8131
146        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
147        sp_array = SparseArray([0, 0, 1])
148        df["new_column"] = sp_array
149
150        expected = Series(sp_array, name="new_column")
151        tm.assert_series_equal(df["new_column"], expected)
152
153    def test_setitem_with_unaligned_sparse_value(self):
154        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
155        sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])
156
157        df["new_column"] = sp_series
158        expected = Series(SparseArray([1, 0, 0]), name="new_column")
159        tm.assert_series_equal(df["new_column"], expected)
160
161    def test_setitem_dict_preserves_dtypes(self):
162        # https://github.com/pandas-dev/pandas/issues/34573
163        expected = DataFrame(
164            {
165                "a": Series([0, 1, 2], dtype="int64"),
166                "b": Series([1, 2, 3], dtype=float),
167                "c": Series([1, 2, 3], dtype=float),
168            }
169        )
170        df = DataFrame(
171            {
172                "a": Series([], dtype="int64"),
173                "b": Series([], dtype=float),
174                "c": Series([], dtype=float),
175            }
176        )
177        for idx, b in enumerate([1, 2, 3]):
178            df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
179        tm.assert_frame_equal(df, expected)
180
181    @pytest.mark.parametrize(
182        "obj,dtype",
183        [
184            (Period("2020-01"), PeriodDtype("M")),
185            (Interval(left=0, right=5), IntervalDtype("int64")),
186            (
187                Timestamp("2011-01-01", tz="US/Eastern"),
188                DatetimeTZDtype(tz="US/Eastern"),
189            ),
190        ],
191    )
192    def test_setitem_extension_types(self, obj, dtype):
193        # GH: 34832
194        expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})
195
196        df = DataFrame({"idx": [1, 2, 3]})
197        df["obj"] = obj
198
199        tm.assert_frame_equal(df, expected)
200
201    @pytest.mark.parametrize(
202        "ea_name",
203        [
204            dtype.name
205            for dtype in ea_registry.dtypes
206            # property would require instantiation
207            if not isinstance(dtype.name, property)
208        ]
209        # mypy doesn't allow adding lists of different types
210        # https://github.com/python/mypy/issues/5492
211        + ["datetime64[ns, UTC]", "period[D]"],  # type: ignore[list-item]
212    )
213    def test_setitem_with_ea_name(self, ea_name):
214        # GH 38386
215        result = DataFrame([0])
216        result[ea_name] = [1]
217        expected = DataFrame({0: [0], ea_name: [1]})
218        tm.assert_frame_equal(result, expected)
219
220    def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
221        # GH#7492
222        data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
223        result = Series(data_ns).to_frame()
224        result["new"] = data_ns
225        expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
226        tm.assert_frame_equal(result, expected)
227
228        # OutOfBoundsDatetime error shouldn't occur
229        data_s = np.array([1, "nat"], dtype="datetime64[s]")
230        result["new"] = data_s
231        expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
232        tm.assert_frame_equal(result, expected)
233
234    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
235    def test_frame_setitem_datetime64_col_other_units(self, unit):
236        # Check that non-nano dt64 values get cast to dt64 on setitem
237        #  into a not-yet-existing column
238        n = 100
239
240        dtype = np.dtype(f"M8[{unit}]")
241        vals = np.arange(n, dtype=np.int64).view(dtype)
242        ex_vals = vals.astype("datetime64[ns]")
243
244        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
245        df[unit] = vals
246
247        assert df[unit].dtype == np.dtype("M8[ns]")
248        assert (df[unit].values == ex_vals).all()
249
250    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
251    def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
252        # Check that non-nano dt64 values get cast to dt64 on setitem
253        #  into an already-existing dt64 column
254        n = 100
255
256        dtype = np.dtype(f"M8[{unit}]")
257        vals = np.arange(n, dtype=np.int64).view(dtype)
258        ex_vals = vals.astype("datetime64[ns]")
259
260        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
261        df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]")
262
263        # We overwrite existing dt64 column with new, non-nano dt64 vals
264        df["dates"] = vals
265        assert (df["dates"].values == ex_vals).all()
266
267    def test_setitem_dt64tz(self, timezone_frame):
268
269        df = timezone_frame
270        idx = df["B"].rename("foo")
271
272        # setitem
273        df["C"] = idx
274        tm.assert_series_equal(df["C"], Series(idx, name="C"))
275
276        df["D"] = "foo"
277        df["D"] = idx
278        tm.assert_series_equal(df["D"], Series(idx, name="D"))
279        del df["D"]
280
281        # assert that A & C are not sharing the same base (e.g. they
282        # are copies)
283        b1 = df._mgr.blocks[1]
284        b2 = df._mgr.blocks[2]
285        tm.assert_extension_array_equal(b1.values, b2.values)
286        b1base = b1.values._data.base
287        b2base = b2.values._data.base
288        assert b1base is None or (id(b1base) != id(b2base))
289
290        # with nan
291        df2 = df.copy()
292        df2.iloc[1, 1] = NaT
293        df2.iloc[1, 2] = NaT
294        result = df2["B"]
295        tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
296        tm.assert_series_equal(df2.dtypes, df.dtypes)
297
298    def test_setitem_periodindex(self):
299        rng = period_range("1/1/2000", periods=5, name="index")
300        df = DataFrame(np.random.randn(5, 3), index=rng)
301
302        df["Index"] = rng
303        rs = Index(df["Index"])
304        tm.assert_index_equal(rs, rng, check_names=False)
305        assert rs.name == "Index"
306        assert rng.name == "index"
307
308        rs = df.reset_index().set_index("index")
309        assert isinstance(rs.index, PeriodIndex)
310        tm.assert_index_equal(rs.index, rng)
311
312    def test_setitem_complete_column_with_array(self):
313        # GH#37954
314        df = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]})
315        arr = np.array([[1, 1], [3, 1], [5, 1]])
316        df[["c", "d"]] = arr
317        expected = DataFrame(
318            {
319                "a": ["one", "two", "three"],
320                "b": [1, 2, 3],
321                "c": [1, 3, 5],
322                "d": [1, 1, 1],
323            }
324        )
325        tm.assert_frame_equal(df, expected)
326
327    @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
328    def test_setitem_bool_with_numeric_index(self, dtype):
329        # GH#36319
330        cols = Index([1, 2, 3], dtype=dtype)
331        df = DataFrame(np.random.randn(3, 3), columns=cols)
332
333        df[False] = ["a", "b", "c"]
334
335        expected_cols = Index([1, 2, 3, False], dtype=object)
336        if dtype == "f8":
337            expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object)
338
339        tm.assert_index_equal(df.columns, expected_cols)
340
341
class TestDataFrameSetItemWithExpansion:
    def test_setitem_listlike_views(self):
        # GH#38148
        frame = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})

        # keep a reference to one column before the frame is widened
        col_a = frame["a"]

        # widen the frame through a list-like column indexer
        extra = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])
        frame[["c", "d"]] = extra

        # write into the first column through the frame; the reference taken
        # earlier is expected to see the new value (view semantics)
        frame.iloc[0, 0] = 100

        tm.assert_series_equal(col_a, Series([100, 2, 3], name="a"))

    def test_setitem_string_column_numpy_dtype_raising(self):
        # GH#39010: the string label "0 - Name" can be added next to the
        # integer column labels 0 and 1.
        frame = DataFrame([[1, 2], [3, 4]])
        frame["0 - Name"] = [5, 6]

        expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"])
        tm.assert_frame_equal(frame, expected)
365
366
class TestDataFrameSetItemSlicing:
    def test_setitem_slice_position(self):
        # GH#31469: a negative slice in frame[...] = value should assign to
        # the same rows as the identical slice on a plain ndarray.
        raw = np.zeros((100, 1))
        frame = DataFrame(np.zeros((100, 1)))

        frame[-4:] = 1
        raw[-4:] = 1

        tm.assert_frame_equal(frame, DataFrame(raw))
376
377
class TestDataFrameSetItemCallable:
    def test_setitem_callable(self):
        # GH#12533: a callable key that returns "A" should behave like the
        # key "A" itself, overwriting that column.
        frame = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})

        def pick_a(_frame):
            return "A"

        frame[pick_a] = [11, 12, 13, 14]

        expected = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]})
        tm.assert_frame_equal(frame, expected)
386
387
class TestDataFrameSetItemBooleanMask:
    @pytest.mark.parametrize(
        "mask_type",
        [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values],
        ids=["dataframe", "array"],
    )
    def test_setitem_boolean_mask(self, mask_type, float_frame):
        # Test for issue #18582: setting through a boolean mask, given either
        # as a DataFrame or as a plain ndarray, overwrites the masked cells.
        df = float_frame.copy()
        mask = mask_type(df)

        # index with boolean mask
        result = df.copy()
        result[mask] = np.nan

        # Build the expectation on an explicit ndarray copy instead of
        # writing through `expected.values`: whether `.values` is a writable
        # view of the frame's storage is an implementation detail.
        # NOTE(review): assumes float_frame has a single dtype, so rebuilding
        # from `.values` keeps the column dtypes -- confirm against conftest.
        expected_values = df.values.copy()
        expected_values[np.array(mask)] = np.nan
        expected = DataFrame(expected_values, index=df.index, columns=df.columns)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("indexer", [lambda x: x, lambda x: x.loc])
    def test_setitem_boolean_mask_aligning(self, indexer):
        # GH#39931: rows assigned through a boolean mask are aligned on the
        # index, so assigning the masked rows back in sorted order must
        # leave the frame unchanged (for both df[...] and df.loc[...]).
        df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]})
        expected = df.copy()
        mask = df["a"] >= 3
        indexer(df)[mask] = indexer(df)[mask].sort_values("a")
        tm.assert_frame_equal(df, expected)
416