"""Tests for ``DataFrame.__setitem__`` (``df[key] = value``)."""
import numpy as np
import pytest

from pandas.core.dtypes.base import registry as ea_registry
from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype

from pandas import (
    Categorical,
    DataFrame,
    Index,
    Interval,
    NaT,
    Period,
    PeriodIndex,
    Series,
    Timestamp,
    date_range,
    notna,
    period_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray


class TestDataFrameSetItem:
    """Setting whole columns on a DataFrame via ``df[key] = value``."""

    @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
    def test_setitem_dtype(self, dtype, float_frame):
        """A new column built from an ndarray keeps that ndarray's dtype."""
        arr = np.random.randn(len(float_frame))

        # the dtype string doubles as the column label
        float_frame[dtype] = np.array(arr, dtype=dtype)
        assert float_frame[dtype].dtype.name == dtype

    def test_setitem_list_not_dataframe(self, float_frame):
        """A list of column labels can be assigned a 2D ndarray."""
        data = np.random.randn(len(float_frame), 2)
        float_frame[["A", "B"]] = data
        tm.assert_almost_equal(float_frame[["A", "B"]].values, data)

    def test_setitem_error_msmgs(self):
        """Check the error type and message for two invalid column inserts."""

        # GH 7432
        df = DataFrame(
            {"bar": [1, 2, 3], "baz": ["d", "e", "f"]},
            index=Index(["a", "b", "c"], name="foo"),
        )
        # the Series index repeats the label "a"
        ser = Series(
            ["g", "h", "i", "j"],
            index=Index(["a", "b", "c", "a"], name="foo"),
            name="fiz",
        )
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            df["newcol"] = ser

        # GH 4107, more descriptive error message
        df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])

        msg = "incompatible index of inserted column with frame index"
        with pytest.raises(TypeError, match=msg):
            df["gr"] = df.groupby(["b", "c"]).count()

    def test_setitem_benchmark(self):
        """Inserting the same array under K integer labels yields K equal columns."""
        # from the vb_suite/frame_methods/frame_insert_columns
        N = 10
        K = 5
        df = DataFrame(index=range(N))
        new_col = np.random.randn(N)
        for i in range(K):
            df[i] = new_col
        expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
        tm.assert_frame_equal(df, expected)

    def test_setitem_different_dtype(self):
        """Adding or replacing columns with other dtypes updates ``df.dtypes``."""
        df = DataFrame(
            np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
        )
        df.insert(0, "foo", df["a"])
        df.insert(2, "bar", df["c"])

        # diff dtype

        # new item
        df["x"] = df["a"].astype("float32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 5 + [np.dtype("float32")],
            index=["foo", "c", "bar", "b", "a", "x"],
        )
        tm.assert_series_equal(result, expected)

        # replacing current (in different block)
        df["a"] = df["a"].astype("float32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
            index=["foo", "c", "bar", "b", "a", "x"],
        )
        tm.assert_series_equal(result, expected)

        # another new item, this time with an integer dtype
        df["y"] = df["a"].astype("int32")
        result = df.dtypes
        expected = Series(
            [np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
            index=["foo", "c", "bar", "b", "a", "x", "y"],
        )
        tm.assert_series_equal(result, expected)

    def test_setitem_empty_columns(self):
        """A column set from the index can then be overwritten with a list."""
        # GH 13522
        df = DataFrame(index=["A", "B", "C"])
        df["X"] = df.index
        df["X"] = ["x", "y", "z"]
        exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
        tm.assert_frame_equal(df, exp)

    def test_setitem_dt64_index_empty_columns(self):
        """Setting a ``date_range`` on a column-less frame gives an M8[ns] column."""
        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
        df = DataFrame(index=np.arange(len(rng)))

        df["A"] = rng
        assert df["A"].dtype == np.dtype("M8[ns]")

    def test_setitem_timestamp_empty_columns(self):
        """A scalar tz-aware Timestamp is broadcast to every row."""
        # GH#19843
        df = DataFrame(index=range(3))
        df["now"] = Timestamp("20130101", tz="UTC")

        expected = DataFrame(
            [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
        )
        tm.assert_frame_equal(df, expected)

    def test_setitem_wrong_length_categorical_dtype_raises(self):
        """Assigning a Categorical of the wrong length raises ValueError."""
        # GH#29523
        cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
        df = DataFrame(range(10), columns=["bar"])

        # parentheses are escaped because ``match`` is used as a regex
        msg = (
            rf"Length of values \({len(cat)}\) "
            rf"does not match length of index \({len(df)}\)"
        )
        with pytest.raises(ValueError, match=msg):
            df["foo"] = cat

    def test_setitem_with_sparse_value(self):
        """A SparseArray assigned as a column reads back as an equal Series."""
        # GH#8131
        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        sp_array = SparseArray([0, 0, 1])
        df["new_column"] = sp_array

        expected = Series(sp_array, name="new_column")
        tm.assert_series_equal(df["new_column"], expected)

    def test_setitem_with_unaligned_sparse_value(self):
        """A sparse Series with a reversed index is aligned on label when set."""
        df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])

        df["new_column"] = sp_series
        # values [0, 0, 1] at labels [2, 1, 0] become [1, 0, 0] at [0, 1, 2]
        expected = Series(SparseArray([1, 0, 0]), name="new_column")
        tm.assert_series_equal(df["new_column"], expected)

    def test_setitem_dict_preserves_dtypes(self):
        """Appending dict rows via ``.loc`` to an empty typed frame keeps dtypes."""
        # https://github.com/pandas-dev/pandas/issues/34573
        expected = DataFrame(
            {
                "a": Series([0, 1, 2], dtype="int64"),
                "b": Series([1, 2, 3], dtype=float),
                "c": Series([1, 2, 3], dtype=float),
            }
        )
        df = DataFrame(
            {
                "a": Series([], dtype="int64"),
                "b": Series([], dtype=float),
                "c": Series([], dtype=float),
            }
        )
        for idx, b in enumerate([1, 2, 3]):
            # df.shape[0] is the next unused integer label, so this appends a row
            df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "obj,dtype",
        [
            (Period("2020-01"), PeriodDtype("M")),
            (Interval(left=0, right=5), IntervalDtype("int64")),
            (
                Timestamp("2011-01-01", tz="US/Eastern"),
                DatetimeTZDtype(tz="US/Eastern"),
            ),
        ],
    )
    def test_setitem_extension_types(self, obj, dtype):
        """A scalar Period/Interval/tz-Timestamp yields the matching EA dtype."""
        # GH: 34832
        expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})

        df = DataFrame({"idx": [1, 2, 3]})
        df["obj"] = obj

        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize(
        "ea_name",
        [
            dtype.name
            for dtype in ea_registry.dtypes
            # property would require instantiation
            if not isinstance(dtype.name, property)
        ]
        # mypy doesn't allow adding lists of different types
        # https://github.com/python/mypy/issues/5492
        + ["datetime64[ns, UTC]", "period[D]"],  # type: ignore[list-item]
    )
    def test_setitem_with_ea_name(self, ea_name):
        """A string that is also an extension-dtype name works as a column label."""
        # GH 38386
        result = DataFrame([0])
        result[ea_name] = [1]
        expected = DataFrame({0: [0], ea_name: [1]})
        tm.assert_frame_equal(result, expected)

    def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
        """dt64 arrays containing NaT can be set in both ns and s resolution."""
        # GH#7492
        data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
        result = Series(data_ns).to_frame()
        result["new"] = data_ns
        expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
        tm.assert_frame_equal(result, expected)

        # OutOfBoundsDatetime error shouldn't occur
        data_s = np.array([1, "nat"], dtype="datetime64[s]")
        result["new"] = data_s
        # one second expressed in nanoseconds is 1e9
        expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
    def test_frame_setitem_datetime64_col_other_units(self, unit):
        """Non-nano dt64 values set into a new column are stored as M8[ns]."""
        # Check that non-nano dt64 values get cast to dt64 on setitem
        #  into a not-yet-existing column
        n = 100

        dtype = np.dtype(f"M8[{unit}]")
        vals = np.arange(n, dtype=np.int64).view(dtype)
        ex_vals = vals.astype("datetime64[ns]")

        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
        df[unit] = vals

        assert df[unit].dtype == np.dtype("M8[ns]")
        assert (df[unit].values == ex_vals).all()

    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
    def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
        """Non-nano dt64 values overwriting a dt64 column match their ns cast."""
        # Check that non-nano dt64 values get cast to dt64 on setitem
        #  into an already-existing dt64 column
        n = 100

        dtype = np.dtype(f"M8[{unit}]")
        vals = np.arange(n, dtype=np.int64).view(dtype)
        ex_vals = vals.astype("datetime64[ns]")

        df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
        df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]")

        # We overwrite existing dt64 column with new, non-nano dt64 vals
        df["dates"] = vals
        assert (df["dates"].values == ex_vals).all()

    def test_setitem_dt64tz(self, timezone_frame):
        """Setting a tz-aware Series as a column copies it and keeps its values."""

        df = timezone_frame
        idx = df["B"].rename("foo")

        # setitem
        df["C"] = idx
        tm.assert_series_equal(df["C"], Series(idx, name="C"))

        # overwrite an existing (string) column as well
        df["D"] = "foo"
        df["D"] = idx
        tm.assert_series_equal(df["D"], Series(idx, name="D"))
        del df["D"]

        # assert that the two blocks hold equal values but do not share the
        # same base (e.g. they are copies)
        # NOTE(review): the original comment said "A & C", but the blocks
        # compared are indices 1 and 2 -- presumably "B" and the "C" column
        # just assigned from it; confirm against the timezone_frame fixture.
        b1 = df._mgr.blocks[1]
        b2 = df._mgr.blocks[2]
        tm.assert_extension_array_equal(b1.values, b2.values)
        b1base = b1.values._data.base
        b2base = b2.values._data.base
        assert b1base is None or b1base is not b2base

        # with nan
        df2 = df.copy()
        df2.iloc[1, 1] = NaT
        df2.iloc[1, 2] = NaT
        result = df2["B"]
        tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
        tm.assert_series_equal(df2.dtypes, df.dtypes)

    def test_setitem_periodindex(self):
        """A PeriodIndex set as a column round-trips back to an equal index."""
        rng = period_range("1/1/2000", periods=5, name="index")
        df = DataFrame(np.random.randn(5, 3), index=rng)

        df["Index"] = rng
        rs = Index(df["Index"])
        tm.assert_index_equal(rs, rng, check_names=False)
        # the column label becomes the name; the source index keeps its own
        assert rs.name == "Index"
        assert rng.name == "index"

        rs = df.reset_index().set_index("index")
        assert isinstance(rs.index, PeriodIndex)
        tm.assert_index_equal(rs.index, rng)

    def test_setitem_complete_column_with_array(self):
        """A 2D array assigned to two new labels creates one column per label."""
        # GH#37954
        df = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]})
        arr = np.array([[1, 1], [3, 1], [5, 1]])
        df[["c", "d"]] = arr
        expected = DataFrame(
            {
                "a": ["one", "two", "three"],
                "b": [1, 2, 3],
                "c": [1, 3, 5],
                "d": [1, 1, 1],
            }
        )
        tm.assert_frame_equal(df, expected)

    @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
    def test_setitem_bool_with_numeric_index(self, dtype):
        """Adding a ``False`` label to numeric columns gives an object Index."""
        # GH#36319
        cols = Index([1, 2, 3], dtype=dtype)
        df = DataFrame(np.random.randn(3, 3), columns=cols)

        df[False] = ["a", "b", "c"]

        if dtype == "f8":
            # float labels stay floats inside the object Index
            expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object)
        else:
            expected_cols = Index([1, 2, 3, False], dtype=object)

        tm.assert_index_equal(df.columns, expected_cols)


class TestDataFrameSetItemWithExpansion:
    """Setitem calls that add new columns to an existing frame."""

    def test_setitem_listlike_views(self):
        """A column taken earlier still sees an in-place edit after expansion."""
        # GH#38148
        df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})

        # get one column as a view of df
        ser = df["a"]

        # add columns with list-like indexer
        df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])

        # edit in place the first column to check view semantics
        df.iloc[0, 0] = 100

        expected = Series([100, 2, 3], name="a")
        tm.assert_series_equal(ser, expected)

    def test_setitem_string_column_numpy_dtype_raising(self):
        """A string label such as ``"0 - Name"`` can be added next to int labels."""
        # GH#39010
        df = DataFrame([[1, 2], [3, 4]])
        df["0 - Name"] = [5, 6]
        expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"])
        tm.assert_frame_equal(df, expected)


class TestDataFrameSetItemSlicing:
    """Setitem with a slice key."""

    def test_setitem_slice_position(self):
        """``df[-4:] = 1`` sets the last four rows, like the same ndarray slice."""
        # GH#31469
        df = DataFrame(np.zeros((100, 1)))
        df[-4:] = 1
        arr = np.zeros((100, 1))
        arr[-4:] = 1
        expected = DataFrame(arr)
        tm.assert_frame_equal(df, expected)


class TestDataFrameSetItemCallable:
    """Setitem with a callable key."""

    def test_setitem_callable(self):
        """A callable key returning ``"A"`` assigns to column ``"A"``."""
        # GH#12533
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
        df[lambda x: "A"] = [11, 12, 13, 14]

        exp = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]})
        tm.assert_frame_equal(df, exp)


class TestDataFrameSetItemBooleanMask:
    """Setitem with a boolean mask key."""

    @pytest.mark.parametrize(
        "mask_type",
        [lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values],
        ids=["dataframe", "array"],
    )
    def test_setitem_boolean_mask(self, mask_type, float_frame):
        """A DataFrame or ndarray boolean mask sets the selected cells to NaN."""

        # Test for issue #18582
        df = float_frame.copy()
        mask = mask_type(df)

        # index with boolean mask
        result = df.copy()
        result[mask] = np.nan

        # build the expectation by masking the underlying values directly
        expected = df.copy()
        expected.values[np.array(mask)] = np.nan
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("indexer", [lambda x: x, lambda x: x.loc])
    def test_setitem_boolean_mask_aligning(self, indexer):
        """Assigning re-sorted masked rows back leaves the frame unchanged."""
        # GH#39931
        df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]})
        expected = df.copy()
        mask = df["a"] >= 3
        # the right-hand side has the masked rows in a different order, so the
        # frame only stays equal if rows are matched by label, not position
        indexer(df)[mask] = indexer(df)[mask].sort_values("a")
        tm.assert_frame_equal(df, expected)