1import datetime
2
3import dateutil
4import numpy as np
5import pytest
6
7import pandas as pd
8from pandas import DataFrame, Series
9import pandas._testing as tm
10
11
12class TestDataFrameMissingData:
13    def test_dropEmptyRows(self, float_frame):
14        N = len(float_frame.index)
15        mat = np.random.randn(N)
16        mat[:5] = np.nan
17
18        frame = DataFrame({"foo": mat}, index=float_frame.index)
19        original = Series(mat, index=float_frame.index, name="foo")
20        expected = original.dropna()
21        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
22
23        smaller_frame = frame.dropna(how="all")
24        # check that original was preserved
25        tm.assert_series_equal(frame["foo"], original)
26        return_value = inplace_frame1.dropna(how="all", inplace=True)
27        tm.assert_series_equal(smaller_frame["foo"], expected)
28        tm.assert_series_equal(inplace_frame1["foo"], expected)
29        assert return_value is None
30
31        smaller_frame = frame.dropna(how="all", subset=["foo"])
32        return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
33        tm.assert_series_equal(smaller_frame["foo"], expected)
34        tm.assert_series_equal(inplace_frame2["foo"], expected)
35        assert return_value is None
36
37    def test_dropIncompleteRows(self, float_frame):
38        N = len(float_frame.index)
39        mat = np.random.randn(N)
40        mat[:5] = np.nan
41
42        frame = DataFrame({"foo": mat}, index=float_frame.index)
43        frame["bar"] = 5
44        original = Series(mat, index=float_frame.index, name="foo")
45        inp_frame1, inp_frame2 = frame.copy(), frame.copy()
46
47        smaller_frame = frame.dropna()
48        tm.assert_series_equal(frame["foo"], original)
49        return_value = inp_frame1.dropna(inplace=True)
50
51        exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
52        tm.assert_series_equal(smaller_frame["foo"], exp)
53        tm.assert_series_equal(inp_frame1["foo"], exp)
54        assert return_value is None
55
56        samesize_frame = frame.dropna(subset=["bar"])
57        tm.assert_series_equal(frame["foo"], original)
58        assert (frame["bar"] == 5).all()
59        return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
60        tm.assert_index_equal(samesize_frame.index, float_frame.index)
61        tm.assert_index_equal(inp_frame2.index, float_frame.index)
62        assert return_value is None
63
64    def test_dropna(self):
65        df = DataFrame(np.random.randn(6, 4))
66        df[2][:2] = np.nan
67
68        dropped = df.dropna(axis=1)
69        expected = df.loc[:, [0, 1, 3]]
70        inp = df.copy()
71        return_value = inp.dropna(axis=1, inplace=True)
72        tm.assert_frame_equal(dropped, expected)
73        tm.assert_frame_equal(inp, expected)
74        assert return_value is None
75
76        dropped = df.dropna(axis=0)
77        expected = df.loc[list(range(2, 6))]
78        inp = df.copy()
79        return_value = inp.dropna(axis=0, inplace=True)
80        tm.assert_frame_equal(dropped, expected)
81        tm.assert_frame_equal(inp, expected)
82        assert return_value is None
83
84        # threshold
85        dropped = df.dropna(axis=1, thresh=5)
86        expected = df.loc[:, [0, 1, 3]]
87        inp = df.copy()
88        return_value = inp.dropna(axis=1, thresh=5, inplace=True)
89        tm.assert_frame_equal(dropped, expected)
90        tm.assert_frame_equal(inp, expected)
91        assert return_value is None
92
93        dropped = df.dropna(axis=0, thresh=4)
94        expected = df.loc[range(2, 6)]
95        inp = df.copy()
96        return_value = inp.dropna(axis=0, thresh=4, inplace=True)
97        tm.assert_frame_equal(dropped, expected)
98        tm.assert_frame_equal(inp, expected)
99        assert return_value is None
100
101        dropped = df.dropna(axis=1, thresh=4)
102        tm.assert_frame_equal(dropped, df)
103
104        dropped = df.dropna(axis=1, thresh=3)
105        tm.assert_frame_equal(dropped, df)
106
107        # subset
108        dropped = df.dropna(axis=0, subset=[0, 1, 3])
109        inp = df.copy()
110        return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
111        tm.assert_frame_equal(dropped, df)
112        tm.assert_frame_equal(inp, df)
113        assert return_value is None
114
115        # all
116        dropped = df.dropna(axis=1, how="all")
117        tm.assert_frame_equal(dropped, df)
118
119        df[2] = np.nan
120        dropped = df.dropna(axis=1, how="all")
121        expected = df.loc[:, [0, 1, 3]]
122        tm.assert_frame_equal(dropped, expected)
123
124        # bad input
125        msg = "No axis named 3 for object type DataFrame"
126        with pytest.raises(ValueError, match=msg):
127            df.dropna(axis=3)
128
129    def test_drop_and_dropna_caching(self):
130        # tst that cacher updates
131        original = Series([1, 2, np.nan], name="A")
132        expected = Series([1, 2], dtype=original.dtype, name="A")
133        df = DataFrame({"A": original.values.copy()})
134        df2 = df.copy()
135        df["A"].dropna()
136        tm.assert_series_equal(df["A"], original)
137
138        ser = df["A"]
139        return_value = ser.dropna(inplace=True)
140        tm.assert_series_equal(ser, expected)
141        tm.assert_series_equal(df["A"], original)
142        assert return_value is None
143
144        df2["A"].drop([1])
145        tm.assert_series_equal(df2["A"], original)
146
147        ser = df2["A"]
148        return_value = ser.drop([1], inplace=True)
149        tm.assert_series_equal(ser, original.drop([1]))
150        tm.assert_series_equal(df2["A"], original)
151        assert return_value is None
152
153    def test_dropna_corner(self, float_frame):
154        # bad input
155        msg = "invalid how option: foo"
156        with pytest.raises(ValueError, match=msg):
157            float_frame.dropna(how="foo")
158        msg = "must specify how or thresh"
159        with pytest.raises(TypeError, match=msg):
160            float_frame.dropna(how=None)
161        # non-existent column - 8303
162        with pytest.raises(KeyError, match=r"^\['X'\]$"):
163            float_frame.dropna(subset=["A", "X"])
164
165    def test_dropna_multiple_axes(self):
166        df = DataFrame(
167            [
168                [1, np.nan, 2, 3],
169                [4, np.nan, 5, 6],
170                [np.nan, np.nan, np.nan, np.nan],
171                [7, np.nan, 8, 9],
172            ]
173        )
174
175        # GH20987
176        with pytest.raises(TypeError, match="supplying multiple axes"):
177            df.dropna(how="all", axis=[0, 1])
178        with pytest.raises(TypeError, match="supplying multiple axes"):
179            df.dropna(how="all", axis=(0, 1))
180
181        inp = df.copy()
182        with pytest.raises(TypeError, match="supplying multiple axes"):
183            inp.dropna(how="all", axis=(0, 1), inplace=True)
184
185    def test_dropna_tz_aware_datetime(self):
186        # GH13407
187        df = DataFrame()
188        dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
189        dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
190        df["Time"] = [dt1]
191        result = df.dropna(axis=0)
192        expected = DataFrame({"Time": [dt1]})
193        tm.assert_frame_equal(result, expected)
194
195        # Ex2
196        df = DataFrame({"Time": [dt1, None, np.nan, dt2]})
197        result = df.dropna(axis=0)
198        expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3])
199        tm.assert_frame_equal(result, expected)
200
201    def test_dropna_categorical_interval_index(self):
202        # GH 25087
203        ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28])
204        ci = pd.CategoricalIndex(ii)
205        df = DataFrame({"A": list("abc")}, index=ci)
206
207        expected = df
208        result = df.dropna()
209        tm.assert_frame_equal(result, expected)
210