from datetime import datetime
import re

import numpy as np
import pytest

from pandas import DataFrame, NaT, concat
import pandas._testing as tm
9
10
@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]])
def test_drop_duplicates_with_misspelled_column_name(subset):
    """A subset containing the unknown label "a" raises KeyError (GH 19730)."""
    # Every real column label is upper-case, so lower-case "a" never resolves.
    frame = DataFrame({label: [0, 0, 1] for label in ("A", "B", "C")})
    expected_message = re.escape("Index(['a'], dtype='object')")

    with pytest.raises(KeyError, match=expected_message):
        frame.drop_duplicates(subset)
19
20
def test_drop_duplicates():
    """
    Exercise ``DataFrame.drop_duplicates`` over subset and ``keep`` variants.

    Covers a single label, several labels passed as an ndarray, list or
    tuple, the default of considering every column, integer key columns
    (including an ``int8`` cast), and the integer-frame cases tagged
    GH 11376 and GH 11864.
    """
    df = DataFrame(
        {
            "AAA": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )
    # single column
    result = df.drop_duplicates("AAA")
    expected = df[:2]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates("AAA", keep="last")
    expected = df.loc[[6, 7]]
    tm.assert_frame_equal(result, expected)

    # both "foo" and "bar" repeat, so keep=False leaves no rows at all
    result = df.drop_duplicates("AAA", keep=False)
    expected = df.loc[[]]
    tm.assert_frame_equal(result, expected)
    assert len(result) == 0

    # multi column: the subset may be an ndarray, a list or a tuple of labels
    expected = df.loc[[0, 1, 2, 3]]
    result = df.drop_duplicates(np.array(["AAA", "B"]))
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep="last")
    expected = df.loc[[0, 5, 6, 7]]
    tm.assert_frame_equal(result, expected)

    result = df.drop_duplicates(("AAA", "B"), keep=False)
    expected = df.loc[[0]]
    tm.assert_frame_equal(result, expected)

    # consider everything
    df2 = df.loc[:, ["AAA", "B", "C"]]

    result = df2.drop_duplicates()
    # in this case only: for this data, using every column must give the
    # same rows as using just "AAA" and "B"
    expected = df2.drop_duplicates(["AAA", "B"])
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep="last")
    expected = df2.drop_duplicates(["AAA", "B"], keep="last")
    tm.assert_frame_equal(result, expected)

    result = df2.drop_duplicates(keep=False)
    expected = df2.drop_duplicates(["AAA", "B"], keep=False)
    tm.assert_frame_equal(result, expected)

    # integers
    result = df.drop_duplicates("C")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("C", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # same values as "C" but stored as int8
    df["E"] = df["C"].astype("int8")
    result = df.drop_duplicates("E")
    expected = df.iloc[[0, 2]]
    tm.assert_frame_equal(result, expected)
    result = df.drop_duplicates("E", keep="last")
    expected = df.iloc[[-2, -1]]
    tm.assert_frame_equal(result, expected)

    # GH 11376
    # rows 2 and 3 are both (3, 5); only the second of them is dropped
    df = DataFrame({"x": [7, 6, 3, 3, 4, 8, 0], "y": [0, 6, 5, 5, 9, 1, 2]})
    expected = df.loc[df.index != 3]
    tm.assert_frame_equal(df.drop_duplicates(), expected)

    # the remaining frames have no repeated rows and must come back unchanged
    df = DataFrame([[1, 0], [0, 2]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-2, 0], [0, -4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # true division makes x a float of roughly two thirds of the int64 maximum
    x = np.iinfo(np.int64).max / 3 * 2
    df = DataFrame([[-x, x], [0, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    df = DataFrame([[-x, x], [x, x + 4]])
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # GH 11864
    df = DataFrame([i] * 9 for i in range(16))
    # DataFrame.append was removed in pandas 2.0; concat adds the extra row
    # and renumbers the index in the same way.
    df = concat([df, DataFrame([[1] + [0] * 8])], ignore_index=True)

    # all 17 rows are distinct, so nothing may be flagged whatever ``keep`` is
    for keep in ["first", "last", False]:
        assert df.duplicated(keep=keep).sum() == 0
115
116
def test_drop_duplicates_with_duplicate_column_names():
    """Check ``drop_duplicates`` on a frame whose label "a" appears twice."""
    # GH17836
    rows = [[1, 2, 5], [3, 4, 6], [3, 4, 7]]
    frame = DataFrame(rows, columns=["a", "a", "b"])

    # with every column considered, the differing "b" values keep all rows
    tm.assert_frame_equal(frame.drop_duplicates(), frame)

    # restricted to "a", the last two rows collide and only the first stays
    tm.assert_frame_equal(frame.drop_duplicates("a"), frame[:2])
127
128
def test_drop_duplicates_for_take_all():
    """
    Check ``drop_duplicates`` on a frame where some "AAA" values occur once.

    "baz" and "qux" each appear a single time, so ``keep=False`` still
    leaves rows behind.
    """
    frame = DataFrame(
        {
            "AAA": ["foo", "bar", "baz", "bar", "foo", "bar", "qux", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # (subset, keep, positions of the rows expected to survive)
    scenarios = [
        # single column
        ("AAA", "first", [0, 1, 2, 6]),
        ("AAA", "last", [2, 5, 6, 7]),
        ("AAA", False, [2, 6]),
        # multiple columns
        (["AAA", "B"], "first", [0, 1, 2, 3, 4, 6]),
        (["AAA", "B"], "last", [0, 1, 2, 5, 6, 7]),
        (["AAA", "B"], False, [0, 1, 2, 6]),
    ]
    for subset, keep, positions in scenarios:
        deduplicated = frame.drop_duplicates(subset, keep=keep)
        tm.assert_frame_equal(deduplicated, frame.iloc[positions])
163
164
def test_drop_duplicates_tuple():
    """Check ``drop_duplicates`` when one column label is itself a tuple."""
    tuple_label = ("AA", "AB")
    frame = DataFrame(
        {
            tuple_label: ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # single column: the tuple is passed as the subset on its own
    kept_first = frame.drop_duplicates(tuple_label)
    tm.assert_frame_equal(kept_first, frame[:2])

    kept_last = frame.drop_duplicates(tuple_label, keep="last")
    tm.assert_frame_equal(kept_last, frame.loc[[6, 7]])

    kept_none = frame.drop_duplicates(tuple_label, keep=False)
    assert len(kept_none) == 0
    tm.assert_frame_equal(kept_none, frame.loc[[]])  # empty df

    # multi column: the tuple label paired with a plain string label
    combined = frame.drop_duplicates((tuple_label, "B"))
    tm.assert_frame_equal(combined, frame.loc[[0, 1, 2, 3]])
192
193
@pytest.mark.parametrize(
    "df",
    [
        DataFrame(),
        DataFrame(columns=[]),
        DataFrame(columns=["A", "B", "C"]),
        DataFrame(index=[]),
        DataFrame(index=["A", "B", "C"]),
    ],
)
def test_drop_duplicates_empty(df):
    """Frames without any data pass through ``drop_duplicates`` unchanged."""
    # GH 20516
    # returning variant
    tm.assert_frame_equal(df.drop_duplicates(), df)

    # in-place variant, run on a copy so the parametrized frame is untouched
    mutated = df.copy()
    mutated.drop_duplicates(inplace=True)
    tm.assert_frame_equal(mutated, df)
212
213
def test_drop_duplicates_NA():
    """
    Check ``drop_duplicates`` when the key columns hold missing values.

    The first frame has ``None`` entries in object column "A"; the second
    uses the ``np.nan`` entries of float column "C" as keys.
    """

    def check(frame, scenarios):
        # Each scenario is (subset, keep, expected frame).
        for subset, keep, expected in scenarios:
            result = frame.drop_duplicates(subset, keep=keep)
            tm.assert_frame_equal(result, expected)
            if expected.empty:
                assert len(result) == 0

    shared_b = ["one", "one", "two", "two", "two", "two", "one", "two"]
    shared_c = [1.0, np.nan, np.nan, np.nan, 1.0, 1.0, 1, 1.0]

    # none
    with_none = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": shared_b,
            "C": shared_c,
            "D": range(8),
        }
    )
    check(
        with_none,
        [
            # single column
            ("A", "first", with_none.loc[[0, 2, 3]]),
            ("A", "last", with_none.loc[[1, 6, 7]]),
            ("A", False, with_none.loc[[]]),  # empty df
            # multi column
            (["A", "B"], "first", with_none.loc[[0, 2, 3, 6]]),
            (["A", "B"], "last", with_none.loc[[1, 5, 6, 7]]),
            (["A", "B"], False, with_none.loc[[6]]),
        ],
    )

    # nan
    with_nan = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": shared_b,
            "C": shared_c,
            "D": range(8),
        }
    )
    check(
        with_nan,
        [
            # single column
            ("C", "first", with_nan[:2]),
            ("C", "last", with_nan.loc[[3, 7]]),
            ("C", False, with_nan.loc[[]]),  # empty df
            # multi column
            (["C", "B"], "first", with_nan.loc[[0, 1, 2, 4]]),
            (["C", "B"], "last", with_nan.loc[[1, 3, 6, 7]]),
            (["C", "B"], False, with_nan.loc[[1]]),
        ],
    )
286
287
def test_drop_duplicates_NA_for_take_all():
    """
    Check ``drop_duplicates`` with missing keys when some values occur once.

    Column "A" mixes ``None`` with strings and column "C" mixes ``np.nan``
    with floats; each column has values that are never repeated.
    """
    frame = DataFrame(
        {
            "A": [None, None, "foo", "bar", "foo", "baz", "bar", "qux"],
            "C": [1.0, np.nan, np.nan, np.nan, 1.0, 2.0, 3, 1.0],
        }
    )

    # (single-column subset, keep, positions of the rows expected to survive)
    scenarios = [
        # none
        ("A", "first", [0, 2, 3, 5, 7]),
        ("A", "last", [1, 4, 5, 6, 7]),
        ("A", False, [5, 7]),
        # nan
        ("C", "first", [0, 1, 5, 6]),
        ("C", "last", [3, 5, 6, 7]),
        ("C", False, [5, 6]),
    ]
    for column, keep, positions in scenarios:
        deduplicated = frame.drop_duplicates(column, keep=keep)
        tm.assert_frame_equal(deduplicated, frame.iloc[positions])
324
325
def test_drop_duplicates_inplace():
    """
    Check ``drop_duplicates(inplace=True)``.

    Every scenario works on a fresh copy, compares the mutated copy with the
    expected rows, and verifies that the call itself returned ``None``.
    """
    pristine = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "bar", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": [1, 1, 2, 2, 2, 2, 1, 2],
            "D": range(8),
        }
    )

    # (subset, keep, expected frame taken from the untouched original)
    subset_scenarios = [
        # single column
        ("A", "first", pristine[:2]),
        ("A", "last", pristine.loc[[6, 7]]),
        ("A", False, pristine.loc[[]]),
        # multi column
        (["A", "B"], "first", pristine.loc[[0, 1, 2, 3]]),
        (["A", "B"], "last", pristine.loc[[0, 5, 6, 7]]),
        (["A", "B"], False, pristine.loc[[0]]),
    ]
    for subset, keep, expected in subset_scenarios:
        mutated = pristine.copy()
        outcome = mutated.drop_duplicates(subset, keep=keep, inplace=True)
        tm.assert_frame_equal(mutated, expected)
        if expected.empty:
            assert len(mutated) == 0
        assert outcome is None

    # consider everything
    narrowed = pristine.loc[:, ["A", "B", "C"]].copy()
    for keep in ("first", "last", False):
        mutated = narrowed.copy()
        outcome = mutated.drop_duplicates(keep=keep, inplace=True)
        # in this case only: for this data, using every column must give the
        # same rows as using just "A" and "B"
        expected = narrowed.drop_duplicates(["A", "B"], keep=keep)
        tm.assert_frame_equal(mutated, expected)
        assert outcome is None
404
405
@pytest.mark.parametrize("inplace", [True, False])
@pytest.mark.parametrize(
    "origin_dict, output_dict, ignore_index, output_index",
    [
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]),
        ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]),
        ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]),
    ],
)
def test_drop_duplicates_ignore_index(
    inplace, origin_dict, output_dict, ignore_index, output_index
):
    """Check the resulting index of ``drop_duplicates`` for ``ignore_index``."""
    # GH 30114
    source = DataFrame(origin_dict)

    # The in-place call mutates a copy so ``source`` can be checked afterwards.
    target = source.copy() if inplace else source
    returned = target.drop_duplicates(ignore_index=ignore_index, inplace=inplace)
    deduplicated = target if inplace else returned

    tm.assert_frame_equal(deduplicated, DataFrame(output_dict, index=output_index))
    # the frame built from ``origin_dict`` must be left exactly as constructed
    tm.assert_frame_equal(source, DataFrame(origin_dict))
431
432
def test_drop_duplicates_null_in_object_column(nulls_fixture):
    """A null value inside an object-dtype frame must not drop any row."""
    # https://github.com/pandas-dev/pandas/issues/32992
    rows = [[1, nulls_fixture], [2, "a"]]
    frame = DataFrame(rows, dtype=object)

    # the two rows differ, so the frame is expected back unchanged
    tm.assert_frame_equal(frame.drop_duplicates(), frame)
438
439
@pytest.mark.parametrize("keep", ["first", "last", False])
def test_drop_duplicates_series_vs_dataframe(keep):
    """A one-column frame and the matching Series must deduplicate alike."""
    # GH#14192
    january = datetime(2015, 1, 1)
    february = datetime(2015, 2, 1)
    frame = DataFrame(
        {
            "a": [1, 1, 1, "one", "one"],
            "b": [2, 2, np.nan, np.nan, np.nan],
            "c": [3, 3, np.nan, np.nan, "three"],
            "d": [1, 2, 3, 4, 4],
            "e": [january, january, february, NaT, NaT],
        }
    )
    for label in frame.columns:
        via_frame = frame[[label]].drop_duplicates(keep=keep)
        via_series = frame[label].drop_duplicates(keep=keep).to_frame()
        tm.assert_frame_equal(via_frame, via_series)
462
463
@pytest.mark.parametrize("arg", [[1], 1, "True", [], 0])
def test_drop_duplicates_non_boolean_ignore_index(arg):
    """Non-bool ``ignore_index`` arguments must raise ValueError."""
    # GH#38274
    frame = DataFrame({"a": [1, 2, 1, 3]})
    pattern = '^For argument "ignore_index" expected type bool, received type .*.$'

    with pytest.raises(ValueError, match=pattern):
        frame.drop_duplicates(ignore_index=arg)
471