"""
Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
"""
import inspect

import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import DataFrame, Series
import pandas._testing as tm
from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray


class TestDatetimeLikeStatReductions:
    """Tests of ``mean`` on datetime64, period and timedelta64 containers."""

    @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
    def test_dt64_mean(self, tz_naive_fixture, box):
        """Check ``mean`` of datetime64 data, with and without a NaT entry."""
        tz = tz_naive_fixture

        dti = pd.date_range("2001-01-01", periods=11, tz=tz)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
        dtarr = dti._data

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
        assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)

        # dtarr[-2] will be the first date 2001-01-01
        dtarr[-2] = pd.NaT

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
        assert obj.mean(skipna=False) is pd.NaT

    @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
    def test_period_mean(self, box):
        """Check that ``mean`` of period data raises an "ambiguous" TypeError."""
        # GH#24757
        dti = pd.date_range("2001-01-01", periods=11)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])

        # use hourly frequency to avoid rounding errors in expected results
        # TODO: flesh this out with different frequencies
        parr = dti._data.to_period("H")
        obj = box(parr)
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

        # parr[-2] will be the first date 2001-01-01
        parr[-2] = pd.NaT

        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

    @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
    def test_td64_mean(self, box):
        """Check ``mean`` of timedelta64 data, with and without a NaT entry."""
        tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

        tdarr = tdi._data
        obj = box(tdarr)

        result = obj.mean()
        expected = np.array(tdarr).mean()
        assert result == expected

        # NOTE(review): ``obj`` is not rebuilt after this assignment, so the
        # assertions below presumably rely on ``box(tdarr)`` sharing data
        # with ``tdarr`` -- confirm.
        tdarr[0] = pd.NaT
        assert obj.mean(skipna=False) is pd.NaT

        result2 = obj.mean(skipna=True)
        assert result2 == tdi[1:].mean()

        # exact equality fails by 1 nanosecond
        assert result2.round("us") == (result * 11.0 / 10).round("us")


class TestSeriesStatReductions:
    """Tests of Series reductions, driven by the ``_check_stat_op`` helper."""

    # Note: the name TestSeriesStatReductions indicates these tests
    #  were moved from a series-specific test file, _not_ that these tests are
    #  intended long-term to be series-specific

    def _check_stat_op(
        self, name, alternate, string_series_, check_objects=False, check_allna=False
    ):
        """
        Run a common battery of checks against the Series reduction ``name``.

        Parameters
        ----------
        name : str
            Name of the reduction; looked up with ``getattr(Series, name)``.
        alternate : callable
            Reference implementation; its result on the non-NaN values is
            compared with the reduction's result.
        string_series_ : Series
            Input data.  It is modified in place: positions 5:15 are set
            to NaN.
        check_objects : bool, default False
            If True, also compare the reduction with ``alternate`` on a
            Series built from ``pd.bdate_range``.
        check_allna : bool, default False
            If True, also assert that reducing an all-NaN Series gives NaN.
        """
        with pd.option_context("use_bottleneck", False):
            f = getattr(Series, name)

            # add some NaNs
            # (np.nan rather than np.NaN: the latter alias was removed in
            # NumPy 2.0)
            string_series_[5:15] = np.nan

            # max, min, mean, median and std are exempt from this check;
            # every other reduction must raise TypeError on datetime64 data
            if name not in ["max", "min", "mean", "median", "std"]:
                ds = Series(pd.date_range("1/1/2001", periods=10))
                with pytest.raises(TypeError):
                    f(ds)

            # skipna or no
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            if check_allna:
                # the all-NaN series is only needed for this check
                allna = string_series_ * np.nan
                assert np.isnan(f(allna))

            # dtype=object with None, it works!
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(range(2 ** 40, 2 ** 40 + 1000))
            s = Series(items, dtype="int64")
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range("1/1/2000", periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ["sum", "min", "max"]:
                with pytest.raises(TypeError):
                    f(Series(list("abc")))

            # Invalid axis.
            with pytest.raises(ValueError):
                f(string_series_, axis=1)

            # Unimplemented numeric_only parameter.
            if "numeric_only" in inspect.getfullargspec(f).args:
                with pytest.raises(NotImplementedError, match=name):
                    f(string_series_, numeric_only=True)

    def test_sum(self):
        """Check ``Series.sum`` against ``np.sum``."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("sum", np.sum, string_series, check_allna=False)

    def test_mean(self):
        """Check ``Series.mean`` against ``np.mean``."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("mean", np.mean, string_series)

    def test_median(self):
        """Check ``Series.median`` against ``np.median``."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("median", np.median, string_series)

        # integer-valued input must agree with np.median as well
        int_ts = Series(np.ones(10, dtype=int), index=range(10))
        tm.assert_almost_equal(np.median(int_ts), int_ts.median())

    def test_prod(self):
        """Check ``Series.prod`` against ``np.prod``."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("prod", np.prod, string_series)

    def test_min(self):
        """Check ``Series.min`` against ``np.min``, including date data."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("min", np.min, string_series, check_objects=True)

    def test_max(self):
        """Check ``Series.max`` against ``np.max``, including date data."""
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("max", np.max, string_series, check_objects=True)

    def test_var_std(self):
        """Check ``Series.std`` / ``Series.var`` against the numpy versions."""
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1)
        self._check_stat_op("std", alt, string_series)

        alt = lambda x: np.var(x, ddof=1)
        self._check_stat_op("var", alt, string_series)

        result = datetime_series.std(ddof=4)
        expected = np.std(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        result = datetime_series.var(ddof=4)
        expected = np.var(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        # 1 - element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.var(ddof=1)
        assert pd.isna(result)

        result = s.std(ddof=1)
        assert pd.isna(result)

    def test_sem(self):
        """Check ``Series.sem`` against ``std / sqrt(n)`` computed with numpy."""
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
        self._check_stat_op("sem", alt, string_series)

        result = datetime_series.sem(ddof=4)
        expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
            len(datetime_series.values)
        )
        tm.assert_almost_equal(result, expected)

        # 1 - element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.sem(ddof=1)
        assert pd.isna(result)

    @td.skip_if_no_scipy
    def test_skew(self):
        """Check ``Series.skew`` against ``scipy.stats.skew(bias=False)``."""
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: skew(x, bias=False)
        self._check_stat_op("skew", alt, string_series)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.skew())
                assert np.isnan(df.skew()).all()
            else:
                assert 0 == s.skew()
                assert (df.skew() == 0).all()

    @td.skip_if_no_scipy
    def test_kurt(self):
        """Check ``Series.kurt`` against ``scipy.stats.kurtosis(bias=False)``."""
        from scipy.stats import kurtosis

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op("kurt", alt, string_series)

        # every row has level-0 label "bar", so the level=0 reduction must
        # agree with the plain reduction
        index = pd.MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
        )
        s = Series(np.random.randn(6), index=index)
        tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"])

        # test corner cases, kurt() returns NaN unless there's at least 4
        # values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.kurt())
                assert np.isnan(df.kurt()).all()
            else:
                assert 0 == s.kurt()
                assert (df.kurt() == 0).all()