1"""
2Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
3"""
4import inspect
5
6import numpy as np
7import pytest
8
9import pandas.util._test_decorators as td
10
11import pandas as pd
12from pandas import DataFrame, Series
13import pandas._testing as tm
14from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray
15
16
17class TestDatetimeLikeStatReductions:
18    @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
19    def test_dt64_mean(self, tz_naive_fixture, box):
20        tz = tz_naive_fixture
21
22        dti = pd.date_range("2001-01-01", periods=11, tz=tz)
23        # shuffle so that we are not just working with monotone-increasing
24        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
25        dtarr = dti._data
26
27        obj = box(dtarr)
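        # the 11 dates run 2001-01-01 through 2001-01-11, so their mean is the
        # middle date, 2001-01-06, with or without skipna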
        assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
        assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)

        # dtarr[-2] is 2001-01-03; replace it with NaT
        dtarr[-2] = pd.NaT

        obj = box(dtarr)
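        # with 2001-01-03 missing, the mean of the remaining 10 dates is
        # 6.3 days past 2000-12-31, i.e. 2001-01-06 07:12:00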
        assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
        assert obj.mean(skipna=False) is pd.NaT

    @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
    def test_period_mean(self, box):
        # GH#24757
        dti = pd.date_range("2001-01-01", periods=11)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])

        # use hourly frequency to avoid rounding errors in expected results
        #  TODO: flesh this out with different frequencies
        parr = dti._data.to_period("H")
        obj = box(parr)
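        # taking the mean of Periods is ambiguous (GH#24757), so mean() is
        # expected to raise whether or not missing values are present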
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

        # parr[-2] is the period 2001-01-03 00:00; replace it with NaT
        parr[-2] = pd.NaT

        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean()
        with pytest.raises(TypeError, match="ambiguous"):
            obj.mean(skipna=True)

    @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
    def test_td64_mean(self, box):
        tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")

        tdarr = tdi._data
        obj = box(tdarr)

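        # with no NaT present, the mean should agree with a plain numpy
        # timedelta64 mean of the same values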
        result = obj.mean()
        expected = np.array(tdarr).mean()
        assert result == expected

        tdarr[0] = pd.NaT
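        # box(tdarr) wraps tdarr without copying here, so the NaT inserted
        # above is visible through obj as well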
        assert obj.mean(skipna=False) is pd.NaT

        result2 = obj.mean(skipna=True)
        assert result2 == tdi[1:].mean()

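        # dropping the 0-day element leaves the same total spread over 10
        # instead of 11 values, so result2 should equal result * 11 / 10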
        # exact equality fails by 1 nanosecond
        assert result2.round("us") == (result * 11.0 / 10).round("us")


class TestSeriesStatReductions:
    # Note: the name TestSeriesStatReductions indicates these tests
    #  were moved from a series-specific test file, _not_ that these tests are
    #  intended long-term to be series-specific

    def _check_stat_op(
        self, name, alternate, string_series_, check_objects=False, check_allna=False
    ):
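        """
        Run the Series reduction ``name`` and compare it against ``alternate``,
        a numpy-based reference implementation, checking NaN handling, dtype
        corner cases, and invalid-argument error paths on ``string_series_``.
        """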

        with pd.option_context("use_bottleneck", False):
            f = getattr(Series, name)

            # add some NaNs
            string_series_[5:15] = np.nan

            # mean, median, std, min, and max are valid for dates;
            #  other reductions should raise TypeError
            if name not in ["max", "min", "mean", "median", "std"]:
                ds = Series(pd.date_range("1/1/2001", periods=10))
                with pytest.raises(TypeError):
                    f(ds)

            # skipna=True (default) vs skipna=False
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            allna = string_series_ * np.nan

            if check_allna:
                assert np.isnan(f(allna))

            # data containing None should not raise
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(range(2 ** 40, 2 ** 40 + 1000))
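            # the values sit near 2**40; comparing against the numpy reference
            # guards against precision loss on large int64 inputs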
            s = Series(items, dtype="int64")
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range("1/1/2000", periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ["sum", "min", "max"]:
                with pytest.raises(TypeError):
                    f(Series(list("abc")))

            # Invalid axis.
            with pytest.raises(ValueError):
                f(string_series_, axis=1)

            # Unimplemented numeric_only parameter.
            if "numeric_only" in inspect.getfullargspec(f).args:
                with pytest.raises(NotImplementedError, match=name):
                    f(string_series_, numeric_only=True)

    def test_sum(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("sum", np.sum, string_series, check_allna=False)

    def test_mean(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("mean", np.mean, string_series)

    def test_median(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("median", np.median, string_series)

        # integer input: the median should match numpy's
        int_ts = Series(np.ones(10, dtype=int), index=range(10))
        tm.assert_almost_equal(np.median(int_ts), int_ts.median())

    def test_prod(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("prod", np.prod, string_series)

    def test_min(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("min", np.min, string_series, check_objects=True)

    def test_max(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("max", np.max, string_series, check_objects=True)

    def test_var_std(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

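        # pandas std/var default to ddof=1 (sample statistics), while numpy
        # defaults to ddof=0, so the reference implementations pass ddof=1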
        alt = lambda x: np.std(x, ddof=1)
        self._check_stat_op("std", alt, string_series)

        alt = lambda x: np.var(x, ddof=1)
        self._check_stat_op("var", alt, string_series)

        result = datetime_series.std(ddof=4)
        expected = np.std(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        result = datetime_series.var(ddof=4)
        expected = np.var(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.var(ddof=1)
        assert pd.isna(result)

        result = s.std(ddof=1)
        assert pd.isna(result)

    def test_sem(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

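        # standard error of the mean: the sample std (ddof=1) divided by sqrt(n)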
        alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
        self._check_stat_op("sem", alt, string_series)

        result = datetime_series.sem(ddof=4)
        expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
            len(datetime_series.values)
        )
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.sem(ddof=1)
        assert pd.isna(result)

    @td.skip_if_no_scipy
    def test_skew(self):
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename("series")

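        # pandas skew is the bias-corrected sample skewness, which should
        # match scipy.stats.skew with bias=False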
        alt = lambda x: skew(x, bias=False)
        self._check_stat_op("skew", alt, string_series)

        # corner cases: skew() returns NaN unless there are at least 3 values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.skew())
                assert np.isnan(df.skew()).all()
            else:
                assert 0 == s.skew()
                assert (df.skew() == 0).all()

    @td.skip_if_no_scipy
    def test_kurt(self):
        from scipy.stats import kurtosis

        string_series = tm.makeStringSeries().rename("series")

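        # pandas kurt is bias-corrected excess kurtosis, which should match
        # scipy.stats.kurtosis with bias=False (and its default fisher=True)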
        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op("kurt", alt, string_series)

        index = pd.MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
        )
        s = Series(np.random.randn(6), index=index)
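        # level 0 of the index holds only "bar", so the level-0 grouped
        # kurtosis of that single group should equal the full-series kurtosis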
        tm.assert_almost_equal(s.kurt(), s.kurt(level=0)["bar"])

        # corner cases: kurt() returns NaN unless there are at least 4 values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.kurt())
                assert np.isnan(df.kurt()).all()
            else:
                assert 0 == s.kurt()
                assert (df.kurt() == 0).all()