1import datetime 2 3import dateutil 4import numpy as np 5import pytest 6 7import pandas as pd 8from pandas import DataFrame, Series 9import pandas._testing as tm 10 11 12class TestDataFrameMissingData: 13 def test_dropEmptyRows(self, float_frame): 14 N = len(float_frame.index) 15 mat = np.random.randn(N) 16 mat[:5] = np.nan 17 18 frame = DataFrame({"foo": mat}, index=float_frame.index) 19 original = Series(mat, index=float_frame.index, name="foo") 20 expected = original.dropna() 21 inplace_frame1, inplace_frame2 = frame.copy(), frame.copy() 22 23 smaller_frame = frame.dropna(how="all") 24 # check that original was preserved 25 tm.assert_series_equal(frame["foo"], original) 26 return_value = inplace_frame1.dropna(how="all", inplace=True) 27 tm.assert_series_equal(smaller_frame["foo"], expected) 28 tm.assert_series_equal(inplace_frame1["foo"], expected) 29 assert return_value is None 30 31 smaller_frame = frame.dropna(how="all", subset=["foo"]) 32 return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) 33 tm.assert_series_equal(smaller_frame["foo"], expected) 34 tm.assert_series_equal(inplace_frame2["foo"], expected) 35 assert return_value is None 36 37 def test_dropIncompleteRows(self, float_frame): 38 N = len(float_frame.index) 39 mat = np.random.randn(N) 40 mat[:5] = np.nan 41 42 frame = DataFrame({"foo": mat}, index=float_frame.index) 43 frame["bar"] = 5 44 original = Series(mat, index=float_frame.index, name="foo") 45 inp_frame1, inp_frame2 = frame.copy(), frame.copy() 46 47 smaller_frame = frame.dropna() 48 tm.assert_series_equal(frame["foo"], original) 49 return_value = inp_frame1.dropna(inplace=True) 50 51 exp = Series(mat[5:], index=float_frame.index[5:], name="foo") 52 tm.assert_series_equal(smaller_frame["foo"], exp) 53 tm.assert_series_equal(inp_frame1["foo"], exp) 54 assert return_value is None 55 56 samesize_frame = frame.dropna(subset=["bar"]) 57 tm.assert_series_equal(frame["foo"], original) 58 assert (frame["bar"] == 5).all() 59 return_value = inp_frame2.dropna(subset=["bar"], inplace=True) 60 tm.assert_index_equal(samesize_frame.index, float_frame.index) 61 tm.assert_index_equal(inp_frame2.index, float_frame.index) 62 assert return_value is None 63 64 def test_dropna(self): 65 df = DataFrame(np.random.randn(6, 4)) 66 df[2][:2] = np.nan 67 68 dropped = df.dropna(axis=1) 69 expected = df.loc[:, [0, 1, 3]] 70 inp = df.copy() 71 return_value = inp.dropna(axis=1, inplace=True) 72 tm.assert_frame_equal(dropped, expected) 73 tm.assert_frame_equal(inp, expected) 74 assert return_value is None 75 76 dropped = df.dropna(axis=0) 77 expected = df.loc[list(range(2, 6))] 78 inp = df.copy() 79 return_value = inp.dropna(axis=0, inplace=True) 80 tm.assert_frame_equal(dropped, expected) 81 tm.assert_frame_equal(inp, expected) 82 assert return_value is None 83 84 # threshold 85 dropped = df.dropna(axis=1, thresh=5) 86 expected = df.loc[:, [0, 1, 3]] 87 inp = df.copy() 88 return_value = inp.dropna(axis=1, thresh=5, inplace=True) 89 tm.assert_frame_equal(dropped, expected) 90 tm.assert_frame_equal(inp, expected) 91 assert return_value is None 92 93 dropped = df.dropna(axis=0, thresh=4) 94 expected = df.loc[range(2, 6)] 95 inp = df.copy() 96 return_value = inp.dropna(axis=0, thresh=4, inplace=True) 97 tm.assert_frame_equal(dropped, expected) 98 tm.assert_frame_equal(inp, expected) 99 assert return_value is None 100 101 dropped = df.dropna(axis=1, thresh=4) 102 tm.assert_frame_equal(dropped, df) 103 104 dropped = df.dropna(axis=1, thresh=3) 105 tm.assert_frame_equal(dropped, df) 106 107 # subset 108 dropped = df.dropna(axis=0, subset=[0, 1, 3]) 109 inp = df.copy() 110 return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) 111 tm.assert_frame_equal(dropped, df) 112 tm.assert_frame_equal(inp, df) 113 assert return_value is None 114 115 # all 116 dropped = df.dropna(axis=1, how="all") 117 tm.assert_frame_equal(dropped, df) 118 119 df[2] = np.nan 120 dropped = df.dropna(axis=1, how="all") 121 expected = df.loc[:, [0, 1, 3]] 122 tm.assert_frame_equal(dropped, expected) 123 124 # bad input 125 msg = "No axis named 3 for object type DataFrame" 126 with pytest.raises(ValueError, match=msg): 127 df.dropna(axis=3) 128 129 def test_drop_and_dropna_caching(self): 130 # tst that cacher updates 131 original = Series([1, 2, np.nan], name="A") 132 expected = Series([1, 2], dtype=original.dtype, name="A") 133 df = DataFrame({"A": original.values.copy()}) 134 df2 = df.copy() 135 df["A"].dropna() 136 tm.assert_series_equal(df["A"], original) 137 138 ser = df["A"] 139 return_value = ser.dropna(inplace=True) 140 tm.assert_series_equal(ser, expected) 141 tm.assert_series_equal(df["A"], original) 142 assert return_value is None 143 144 df2["A"].drop([1]) 145 tm.assert_series_equal(df2["A"], original) 146 147 ser = df2["A"] 148 return_value = ser.drop([1], inplace=True) 149 tm.assert_series_equal(ser, original.drop([1])) 150 tm.assert_series_equal(df2["A"], original) 151 assert return_value is None 152 153 def test_dropna_corner(self, float_frame): 154 # bad input 155 msg = "invalid how option: foo" 156 with pytest.raises(ValueError, match=msg): 157 float_frame.dropna(how="foo") 158 msg = "must specify how or thresh" 159 with pytest.raises(TypeError, match=msg): 160 float_frame.dropna(how=None) 161 # non-existent column - 8303 162 with pytest.raises(KeyError, match=r"^\['X'\]$"): 163 float_frame.dropna(subset=["A", "X"]) 164 165 def test_dropna_multiple_axes(self): 166 df = DataFrame( 167 [ 168 [1, np.nan, 2, 3], 169 [4, np.nan, 5, 6], 170 [np.nan, np.nan, np.nan, np.nan], 171 [7, np.nan, 8, 9], 172 ] 173 ) 174 175 # GH20987 176 with pytest.raises(TypeError, match="supplying multiple axes"): 177 df.dropna(how="all", axis=[0, 1]) 178 with pytest.raises(TypeError, match="supplying multiple axes"): 179 df.dropna(how="all", axis=(0, 1)) 180 181 inp = df.copy() 182 with pytest.raises(TypeError, match="supplying multiple axes"): 183 inp.dropna(how="all", axis=(0, 1), inplace=True) 184 185 def test_dropna_tz_aware_datetime(self): 186 # GH13407 187 df = DataFrame() 188 dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc()) 189 dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc()) 190 df["Time"] = [dt1] 191 result = df.dropna(axis=0) 192 expected = DataFrame({"Time": [dt1]}) 193 tm.assert_frame_equal(result, expected) 194 195 # Ex2 196 df = DataFrame({"Time": [dt1, None, np.nan, dt2]}) 197 result = df.dropna(axis=0) 198 expected = DataFrame([dt1, dt2], columns=["Time"], index=[0, 3]) 199 tm.assert_frame_equal(result, expected) 200 201 def test_dropna_categorical_interval_index(self): 202 # GH 25087 203 ii = pd.IntervalIndex.from_breaks([0, 2.78, 3.14, 6.28]) 204 ci = pd.CategoricalIndex(ii) 205 df = DataFrame({"A": list("abc")}, index=ci) 206 207 expected = df 208 result = df.dropna() 209 tm.assert_frame_equal(result, expected) 210