1from datetime import datetime 2import re 3 4import numpy as np 5import pytest 6 7from pandas import DataFrame, NaT 8import pandas._testing as tm 9 10 11@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) 12def test_drop_duplicates_with_misspelled_column_name(subset): 13 # GH 19730 14 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) 15 msg = re.escape("Index(['a'], dtype='object')") 16 17 with pytest.raises(KeyError, match=msg): 18 df.drop_duplicates(subset) 19 20 21def test_drop_duplicates(): 22 df = DataFrame( 23 { 24 "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], 25 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 26 "C": [1, 1, 2, 2, 2, 2, 1, 2], 27 "D": range(8), 28 } 29 ) 30 # single column 31 result = df.drop_duplicates("AAA") 32 expected = df[:2] 33 tm.assert_frame_equal(result, expected) 34 35 result = df.drop_duplicates("AAA", keep="last") 36 expected = df.loc[[6, 7]] 37 tm.assert_frame_equal(result, expected) 38 39 result = df.drop_duplicates("AAA", keep=False) 40 expected = df.loc[[]] 41 tm.assert_frame_equal(result, expected) 42 assert len(result) == 0 43 44 # multi column 45 expected = df.loc[[0, 1, 2, 3]] 46 result = df.drop_duplicates(np.array(["AAA", "B"])) 47 tm.assert_frame_equal(result, expected) 48 result = df.drop_duplicates(["AAA", "B"]) 49 tm.assert_frame_equal(result, expected) 50 51 result = df.drop_duplicates(("AAA", "B"), keep="last") 52 expected = df.loc[[0, 5, 6, 7]] 53 tm.assert_frame_equal(result, expected) 54 55 result = df.drop_duplicates(("AAA", "B"), keep=False) 56 expected = df.loc[[0]] 57 tm.assert_frame_equal(result, expected) 58 59 # consider everything 60 df2 = df.loc[:, ["AAA", "B", "C"]] 61 62 result = df2.drop_duplicates() 63 # in this case only 64 expected = df2.drop_duplicates(["AAA", "B"]) 65 tm.assert_frame_equal(result, expected) 66 67 result = df2.drop_duplicates(keep="last") 68 expected = df2.drop_duplicates(["AAA", "B"], keep="last") 69 tm.assert_frame_equal(result, expected) 70 71 result = df2.drop_duplicates(keep=False) 72 expected = df2.drop_duplicates(["AAA", "B"], keep=False) 73 tm.assert_frame_equal(result, expected) 74 75 # integers 76 result = df.drop_duplicates("C") 77 expected = df.iloc[[0, 2]] 78 tm.assert_frame_equal(result, expected) 79 result = df.drop_duplicates("C", keep="last") 80 expected = df.iloc[[-2, -1]] 81 tm.assert_frame_equal(result, expected) 82 83 df["E"] = df["C"].astype("int8") 84 result = df.drop_duplicates("E") 85 expected = df.iloc[[0, 2]] 86 tm.assert_frame_equal(result, expected) 87 result = df.drop_duplicates("E", keep="last") 88 expected = df.iloc[[-2, -1]] 89 tm.assert_frame_equal(result, expected) 90 91 # GH 11376 92 df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]}) 93 expected = df.loc[df.index != 3] 94 tm.assert_frame_equal(df.drop_duplicates(), expected) 95 96 df = DataFrame([[1, 0], [0, 2]]) 97 tm.assert_frame_equal(df.drop_duplicates(), df) 98 99 df = DataFrame([[-2, 0], [0, -4]]) 100 tm.assert_frame_equal(df.drop_duplicates(), df) 101 102 x = np.iinfo(np.int64).max / 3 * 2 103 df = DataFrame([[-x, x], [0, x + 4]]) 104 tm.assert_frame_equal(df.drop_duplicates(), df) 105 106 df = DataFrame([[-x, x], [x, x + 4]]) 107 tm.assert_frame_equal(df.drop_duplicates(), df) 108 109 # GH 11864 110 df = DataFrame([i] * 9 for i in range(16)) 111 df = df.append([[1] + [0] * 8], ignore_index=True) 112 113 for keep in ["first", "last", False]: 114 assert df.duplicated(keep=keep).sum() == 0 115 116 117def test_drop_duplicates_with_duplicate_column_names(): 118 # GH17836 119 df = DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) 120 121 result0 = df.drop_duplicates() 122 tm.assert_frame_equal(result0, df) 123 124 result1 = df.drop_duplicates("a") 125 expected1 = df[:2] 126 tm.assert_frame_equal(result1, expected1) 127 128 129def test_drop_duplicates_for_take_all(): 130 df = DataFrame( 131 { 132 "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"], 133 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 134 "C": [1, 1, 2, 2, 2, 2, 1, 2], 135 "D": range(8), 136 } 137 ) 138 # single column 139 result = df.drop_duplicates("AAA") 140 expected = df.iloc[[0, 1, 2, 6]] 141 tm.assert_frame_equal(result, expected) 142 143 result = df.drop_duplicates("AAA", keep="last") 144 expected = df.iloc[[2, 5, 6, 7]] 145 tm.assert_frame_equal(result, expected) 146 147 result = df.drop_duplicates("AAA", keep=False) 148 expected = df.iloc[[2, 6]] 149 tm.assert_frame_equal(result, expected) 150 151 # multiple columns 152 result = df.drop_duplicates(["AAA", "B"]) 153 expected = df.iloc[[0, 1, 2, 3, 4, 6]] 154 tm.assert_frame_equal(result, expected) 155 156 result = df.drop_duplicates(["AAA", "B"], keep="last") 157 expected = df.iloc[[0, 1, 2, 5, 6, 7]] 158 tm.assert_frame_equal(result, expected) 159 160 result = df.drop_duplicates(["AAA", "B"], keep=False) 161 expected = df.iloc[[0, 1, 2, 6]] 162 tm.assert_frame_equal(result, expected) 163 164 165def test_drop_duplicates_tuple(): 166 df = DataFrame( 167 { 168 ("AA", "AB"): ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], 169 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 170 "C": [1, 1, 2, 2, 2, 2, 1, 2], 171 "D": range(8), 172 } 173 ) 174 # single column 175 result = df.drop_duplicates(("AA", "AB")) 176 expected = df[:2] 177 tm.assert_frame_equal(result, expected) 178 179 result = df.drop_duplicates(("AA", "AB"), keep="last") 180 expected = df.loc[[6, 7]] 181 tm.assert_frame_equal(result, expected) 182 183 result = df.drop_duplicates(("AA", "AB"), keep=False) 184 expected = df.loc[[]] # empty df 185 assert len(result) == 0 186 tm.assert_frame_equal(result, expected) 187 188 # multi column 189 expected = df.loc[[0, 1, 2, 3]] 190 result = df.drop_duplicates((("AA", "AB"), "B")) 191 tm.assert_frame_equal(result, expected) 192 193 194@pytest.mark.parametrize( 195 "df", 196 [ 197 DataFrame(), 198 DataFrame(columns=[]), 199 DataFrame(columns=["A", "B", "C"]), 200 DataFrame(index=[]), 201 DataFrame(index=["A", "B", "C"]), 202 ], 203) 204def test_drop_duplicates_empty(df): 205 # GH 20516 206 result = df.drop_duplicates() 207 tm.assert_frame_equal(result, df) 208 209 result = df.copy() 210 result.drop_duplicates(inplace=True) 211 tm.assert_frame_equal(result, df) 212 213 214def test_drop_duplicates_NA(): 215 # none 216 df = DataFrame( 217 { 218 "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"], 219 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 220 "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], 221 "D": range(8), 222 } 223 ) 224 # single column 225 result = df.drop_duplicates("A") 226 expected = df.loc[[0, 2, 3]] 227 tm.assert_frame_equal(result, expected) 228 229 result = df.drop_duplicates("A", keep="last") 230 expected = df.loc[[1, 6, 7]] 231 tm.assert_frame_equal(result, expected) 232 233 result = df.drop_duplicates("A", keep=False) 234 expected = df.loc[[]] # empty df 235 tm.assert_frame_equal(result, expected) 236 assert len(result) == 0 237 238 # multi column 239 result = df.drop_duplicates(["A", "B"]) 240 expected = df.loc[[0, 2, 3, 6]] 241 tm.assert_frame_equal(result, expected) 242 243 result = df.drop_duplicates(["A", "B"], keep="last") 244 expected = df.loc[[1, 5, 6, 7]] 245 tm.assert_frame_equal(result, expected) 246 247 result = df.drop_duplicates(["A", "B"], keep=False) 248 expected = df.loc[[6]] 249 tm.assert_frame_equal(result, expected) 250 251 # nan 252 df = DataFrame( 253 { 254 "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], 255 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 256 "C": [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0], 257 "D": range(8), 258 } 259 ) 260 # single column 261 result = df.drop_duplicates("C") 262 expected = df[:2] 263 tm.assert_frame_equal(result, expected) 264 265 result = df.drop_duplicates("C", keep="last") 266 expected = df.loc[[3, 7]] 267 tm.assert_frame_equal(result, expected) 268 269 result = df.drop_duplicates("C", keep=False) 270 expected = df.loc[[]] # empty df 271 tm.assert_frame_equal(result, expected) 272 assert len(result) == 0 273 274 # multi column 275 result = df.drop_duplicates(["C", "B"]) 276 expected = df.loc[[0, 1, 2, 4]] 277 tm.assert_frame_equal(result, expected) 278 279 result = df.drop_duplicates(["C", "B"], keep="last") 280 expected = df.loc[[1, 3, 6, 7]] 281 tm.assert_frame_equal(result, expected) 282 283 result = df.drop_duplicates(["C", "B"], keep=False) 284 expected = df.loc[[1]] 285 tm.assert_frame_equal(result, expected) 286 287 288def test_drop_duplicates_NA_for_take_all(): 289 # none 290 df = DataFrame( 291 { 292 "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"], 293 "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0], 294 } 295 ) 296 297 # single column 298 result = df.drop_duplicates("A") 299 expected = df.iloc[[0, 2, 3, 5, 7]] 300 tm.assert_frame_equal(result, expected) 301 302 result = df.drop_duplicates("A", keep="last") 303 expected = df.iloc[[1, 4, 5, 6, 7]] 304 tm.assert_frame_equal(result, expected) 305 306 result = df.drop_duplicates("A", keep=False) 307 expected = df.iloc[[5, 7]] 308 tm.assert_frame_equal(result, expected) 309 310 # nan 311 312 # single column 313 result = df.drop_duplicates("C") 314 expected = df.iloc[[0, 1, 5, 6]] 315 tm.assert_frame_equal(result, expected) 316 317 result = df.drop_duplicates("C", keep="last") 318 expected = df.iloc[[3, 5, 6, 7]] 319 tm.assert_frame_equal(result, expected) 320 321 result = df.drop_duplicates("C", keep=False) 322 expected = df.iloc[[5, 6]] 323 tm.assert_frame_equal(result, expected) 324 325 326def test_drop_duplicates_inplace(): 327 orig = DataFrame( 328 { 329 "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"], 330 "B": ["one", "one", "two", "two", "two", "two", "one", "two"], 331 "C": [1, 1, 2, 2, 2, 2, 1, 2], 332 "D": range(8), 333 } 334 ) 335 # single column 336 df = orig.copy() 337 return_value = df.drop_duplicates("A", inplace=True) 338 expected = orig[:2] 339 result = df 340 tm.assert_frame_equal(result, expected) 341 assert return_value is None 342 343 df = orig.copy() 344 return_value = df.drop_duplicates("A", keep="last", inplace=True) 345 expected = orig.loc[[6, 7]] 346 result = df 347 tm.assert_frame_equal(result, expected) 348 assert return_value is None 349 350 df = orig.copy() 351 return_value = df.drop_duplicates("A", keep=False, inplace=True) 352 expected = orig.loc[[]] 353 result = df 354 tm.assert_frame_equal(result, expected) 355 assert len(df) == 0 356 assert return_value is None 357 358 # multi column 359 df = orig.copy() 360 return_value = df.drop_duplicates(["A", "B"], inplace=True) 361 expected = orig.loc[[0, 1, 2, 3]] 362 result = df 363 tm.assert_frame_equal(result, expected) 364 assert return_value is None 365 366 df = orig.copy() 367 return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True) 368 expected = orig.loc[[0, 5, 6, 7]] 369 result = df 370 tm.assert_frame_equal(result, expected) 371 assert return_value is None 372 373 df = orig.copy() 374 return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True) 375 expected = orig.loc[[0]] 376 result = df 377 tm.assert_frame_equal(result, expected) 378 assert return_value is None 379 380 # consider everything 381 orig2 = orig.loc[:, ["A", "B", "C"]].copy() 382 383 df2 = orig2.copy() 384 return_value = df2.drop_duplicates(inplace=True) 385 # in this case only 386 expected = orig2.drop_duplicates(["A", "B"]) 387 result = df2 388 tm.assert_frame_equal(result, expected) 389 assert return_value is None 390 391 df2 = orig2.copy() 392 return_value = df2.drop_duplicates(keep="last", inplace=True) 393 expected = orig2.drop_duplicates(["A", "B"], keep="last") 394 result = df2 395 tm.assert_frame_equal(result, expected) 396 assert return_value is None 397 398 df2 = orig2.copy() 399 return_value = df2.drop_duplicates(keep=False, inplace=True) 400 expected = orig2.drop_duplicates(["A", "B"], keep=False) 401 result = df2 402 tm.assert_frame_equal(result, expected) 403 assert return_value is None 404 405 406@pytest.mark.parametrize("inplace", [True, False]) 407@pytest.mark.parametrize( 408 "origin_dict, output_dict, ignore_index, output_index", 409 [ 410 ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), 411 ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), 412 ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), 413 ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), 414 ], 415) 416def test_drop_duplicates_ignore_index( 417 inplace, origin_dict, output_dict, ignore_index, output_index 418): 419 # GH 30114 420 df = DataFrame(origin_dict) 421 expected = DataFrame(output_dict, index=output_index) 422 423 if inplace: 424 result_df = df.copy() 425 result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) 426 else: 427 result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) 428 429 tm.assert_frame_equal(result_df, expected) 430 tm.assert_frame_equal(df, DataFrame(origin_dict)) 431 432 433def test_drop_duplicates_null_in_object_column(nulls_fixture): 434 # https://github.com/pandas-dev/pandas/issues/32992 435 df = DataFrame([[1, nulls_fixture], [2, "a"]], dtype=object) 436 result = df.drop_duplicates() 437 tm.assert_frame_equal(result, df) 438 439 440@pytest.mark.parametrize("keep", ["first", "last", False]) 441def test_drop_duplicates_series_vs_dataframe(keep): 442 # GH#14192 443 df = DataFrame( 444 { 445 "a": [1, 1, 1, "one", "one"], 446 "b": [2, 2, np.nan, np.nan, np.nan], 447 "c": [3, 3, np.nan, np.nan, "three"], 448 "d": [1, 2, 3, 4, 4], 449 "e": [ 450 datetime(2015, 1, 1), 451 datetime(2015, 1, 1), 452 datetime(2015, 2, 1), 453 NaT, 454 NaT, 455 ], 456 } 457 ) 458 for column in df.columns: 459 dropped_frame = df[[column]].drop_duplicates(keep=keep) 460 dropped_series = df[column].drop_duplicates(keep=keep) 461 tm.assert_frame_equal(dropped_frame, dropped_series.to_frame()) 462 463 464@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0]) 465def test_drop_duplicates_non_boolean_ignore_index(arg): 466 # GH#38274 467 df = DataFrame({"a": [1, 2, 1, 3]}) 468 msg = '^For argument "ignore_index" expected type bool, received type .*.$' 469 with pytest.raises(ValueError, match=msg): 470 df.drop_duplicates(ignore_index=arg) 471