1import re
2import sys
3
4import numpy as np
5import pytest
6
7from pandas.compat import PYPY
8
9from pandas import Categorical, Index, NaT, Series, date_range
10import pandas._testing as tm
11from pandas.api.types import is_scalar
12
13
14class TestCategoricalAnalytics:
15    @pytest.mark.parametrize("aggregation", ["min", "max"])
16    def test_min_max_not_ordered_raises(self, aggregation):
17        # unordered cats have no min/max
18        cat = Categorical(["a", "b", "c", "d"], ordered=False)
19        msg = f"Categorical is not ordered for operation {aggregation}"
20        agg_func = getattr(cat, aggregation)
21
22        with pytest.raises(TypeError, match=msg):
23            agg_func()
24
25    def test_min_max_ordered(self):
26        cat = Categorical(["a", "b", "c", "d"], ordered=True)
27        _min = cat.min()
28        _max = cat.max()
29        assert _min == "a"
30        assert _max == "d"
31
32        cat = Categorical(
33            ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
34        )
35        _min = cat.min()
36        _max = cat.max()
37        assert _min == "d"
38        assert _max == "a"
39
40    @pytest.mark.parametrize(
41        "categories,expected",
42        [
43            (list("ABC"), np.NaN),
44            ([1, 2, 3], np.NaN),
45            pytest.param(
46                Series(date_range("2020-01-01", periods=3), dtype="category"),
47                NaT,
48                marks=pytest.mark.xfail(
49                    reason="https://github.com/pandas-dev/pandas/issues/29962"
50                ),
51            ),
52        ],
53    )
54    @pytest.mark.parametrize("aggregation", ["min", "max"])
55    def test_min_max_ordered_empty(self, categories, expected, aggregation):
56        # GH 30227
57        cat = Categorical([], categories=categories, ordered=True)
58
59        agg_func = getattr(cat, aggregation)
60        result = agg_func()
61        assert result is expected
62
63    @pytest.mark.parametrize(
64        "values, categories",
65        [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
66    )
67    @pytest.mark.parametrize("skipna", [True, False])
68    @pytest.mark.parametrize("function", ["min", "max"])
69    def test_min_max_with_nan(self, values, categories, function, skipna):
70        # GH 25303
71        cat = Categorical(values, categories=categories, ordered=True)
72        result = getattr(cat, function)(skipna=skipna)
73
74        if skipna is False:
75            assert result is np.nan
76        else:
77            expected = categories[0] if function == "min" else categories[2]
78            assert result == expected
79
80    @pytest.mark.parametrize("function", ["min", "max"])
81    @pytest.mark.parametrize("skipna", [True, False])
82    def test_min_max_only_nan(self, function, skipna):
83        # https://github.com/pandas-dev/pandas/issues/33450
84        cat = Categorical([np.nan], categories=[1, 2], ordered=True)
85        result = getattr(cat, function)(skipna=skipna)
86        assert result is np.nan
87
88    @pytest.mark.parametrize("method", ["min", "max"])
89    def test_deprecate_numeric_only_min_max(self, method):
90        # GH 25303
91        cat = Categorical(
92            [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
93        )
94        with tm.assert_produces_warning(expected_warning=FutureWarning):
95            getattr(cat, method)(numeric_only=True)
96
97    @pytest.mark.parametrize("method", ["min", "max"])
98    def test_numpy_min_max_raises(self, method):
99        cat = Categorical(["a", "b", "c", "b"], ordered=False)
100        msg = (
101            f"Categorical is not ordered for operation {method}\n"
102            "you can use .as_ordered() to change the Categorical to an ordered one"
103        )
104        method = getattr(np, method)
105        with pytest.raises(TypeError, match=re.escape(msg)):
106            method(cat)
107
108    @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
109    @pytest.mark.parametrize("method", ["min", "max"])
110    def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
111        cat = Categorical(["a", "b", "c", "b"], ordered=True)
112        msg = (
113            f"the '{kwarg}' parameter is not supported in the pandas implementation "
114            f"of {method}"
115        )
116        if kwarg == "axis":
117            msg = r"`axis` must be fewer than the number of dimensions \(1\)"
118        kwargs = {kwarg: 42}
119        method = getattr(np, method)
120        with pytest.raises(ValueError, match=msg):
121            method(cat, **kwargs)
122
123    @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
124    def test_numpy_min_max_axis_equals_none(self, method, expected):
125        cat = Categorical(["a", "b", "c", "b"], ordered=True)
126        method = getattr(np, method)
127        result = method(cat, axis=None)
128        assert result == expected
129
130    @pytest.mark.parametrize(
131        "values,categories,exp_mode",
132        [
133            ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
134            ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
135            ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
136            ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
137            ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
138            ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
139        ],
140    )
141    def test_mode(self, values, categories, exp_mode):
142        s = Categorical(values, categories=categories, ordered=True)
143        res = s.mode()
144        exp = Categorical(exp_mode, categories=categories, ordered=True)
145        tm.assert_categorical_equal(res, exp)
146
147    def test_searchsorted(self, ordered):
148        # https://github.com/pandas-dev/pandas/issues/8420
149        # https://github.com/pandas-dev/pandas/issues/14522
150
151        cat = Categorical(
152            ["cheese", "milk", "apple", "bread", "bread"],
153            categories=["cheese", "milk", "apple", "bread"],
154            ordered=ordered,
155        )
156        ser = Series(cat)
157
158        # Searching for single item argument, side='left' (default)
159        res_cat = cat.searchsorted("apple")
160        assert res_cat == 2
161        assert is_scalar(res_cat)
162
163        res_ser = ser.searchsorted("apple")
164        assert res_ser == 2
165        assert is_scalar(res_ser)
166
167        # Searching for single item array, side='left' (default)
168        res_cat = cat.searchsorted(["bread"])
169        res_ser = ser.searchsorted(["bread"])
170        exp = np.array([3], dtype=np.intp)
171        tm.assert_numpy_array_equal(res_cat, exp)
172        tm.assert_numpy_array_equal(res_ser, exp)
173
174        # Searching for several items array, side='right'
175        res_cat = cat.searchsorted(["apple", "bread"], side="right")
176        res_ser = ser.searchsorted(["apple", "bread"], side="right")
177        exp = np.array([3, 5], dtype=np.intp)
178        tm.assert_numpy_array_equal(res_cat, exp)
179        tm.assert_numpy_array_equal(res_ser, exp)
180
181        # Searching for a single value that is not from the Categorical
182        with pytest.raises(KeyError, match="cucumber"):
183            cat.searchsorted("cucumber")
184        with pytest.raises(KeyError, match="cucumber"):
185            ser.searchsorted("cucumber")
186
187        # Searching for multiple values one of each is not from the Categorical
188        with pytest.raises(KeyError, match="cucumber"):
189            cat.searchsorted(["bread", "cucumber"])
190        with pytest.raises(KeyError, match="cucumber"):
191            ser.searchsorted(["bread", "cucumber"])
192
193    def test_unique(self):
194        # categories are reordered based on value when ordered=False
195        cat = Categorical(["a", "b"])
196        exp = Index(["a", "b"])
197        res = cat.unique()
198        tm.assert_index_equal(res.categories, exp)
199        tm.assert_categorical_equal(res, cat)
200
201        cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"])
202        res = cat.unique()
203        tm.assert_index_equal(res.categories, exp)
204        tm.assert_categorical_equal(res, Categorical(exp))
205
206        cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"])
207        exp = Index(["c", "a", "b"])
208        res = cat.unique()
209        tm.assert_index_equal(res.categories, exp)
210        exp_cat = Categorical(exp, categories=["c", "a", "b"])
211        tm.assert_categorical_equal(res, exp_cat)
212
213        # nan must be removed
214        cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"])
215        res = cat.unique()
216        exp = Index(["b", "a"])
217        tm.assert_index_equal(res.categories, exp)
218        exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"])
219        tm.assert_categorical_equal(res, exp_cat)
220
221    def test_unique_ordered(self):
222        # keep categories order when ordered=True
223        cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True)
224        res = cat.unique()
225        exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
226        tm.assert_categorical_equal(res, exp_cat)
227
228        cat = Categorical(
229            ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True
230        )
231        res = cat.unique()
232        exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True)
233        tm.assert_categorical_equal(res, exp_cat)
234
235        cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True)
236        res = cat.unique()
237        exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True)
238        tm.assert_categorical_equal(res, exp_cat)
239
240        cat = Categorical(
241            ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True
242        )
243        res = cat.unique()
244        exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True)
245        tm.assert_categorical_equal(res, exp_cat)
246
247    def test_unique_index_series(self):
248        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1])
249        # Categorical.unique sorts categories by appearance order
250        # if ordered=False
251        exp = Categorical([3, 1, 2], categories=[3, 1, 2])
252        tm.assert_categorical_equal(c.unique(), exp)
253
254        tm.assert_index_equal(Index(c).unique(), Index(exp))
255        tm.assert_categorical_equal(Series(c).unique(), exp)
256
257        c = Categorical([1, 1, 2, 2], categories=[3, 2, 1])
258        exp = Categorical([1, 2], categories=[1, 2])
259        tm.assert_categorical_equal(c.unique(), exp)
260        tm.assert_index_equal(Index(c).unique(), Index(exp))
261        tm.assert_categorical_equal(Series(c).unique(), exp)
262
263        c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True)
264        # Categorical.unique keeps categories order if ordered=True
265        exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True)
266        tm.assert_categorical_equal(c.unique(), exp)
267
268        tm.assert_index_equal(Index(c).unique(), Index(exp))
269        tm.assert_categorical_equal(Series(c).unique(), exp)
270
271    def test_shift(self):
272        # GH 9416
273        cat = Categorical(["a", "b", "c", "d", "a"])
274
275        # shift forward
276        sp1 = cat.shift(1)
277        xp1 = Categorical([np.nan, "a", "b", "c", "d"])
278        tm.assert_categorical_equal(sp1, xp1)
279        tm.assert_categorical_equal(cat[:-1], sp1[1:])
280
281        # shift back
282        sn2 = cat.shift(-2)
283        xp2 = Categorical(
284            ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
285        )
286        tm.assert_categorical_equal(sn2, xp2)
287        tm.assert_categorical_equal(cat[2:], sn2[:-2])
288
289        # shift by zero
290        tm.assert_categorical_equal(cat, cat.shift(0))
291
292    def test_nbytes(self):
293        cat = Categorical([1, 2, 3])
294        exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
295        assert cat.nbytes == exp
296
297    def test_memory_usage(self):
298        cat = Categorical([1, 2, 3])
299
300        # .categories is an index, so we include the hashtable
301        assert 0 < cat.nbytes <= cat.memory_usage()
302        assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
303
304        cat = Categorical(["foo", "foo", "bar"])
305        assert cat.memory_usage(deep=True) > cat.nbytes
306
307        if not PYPY:
308            # sys.getsizeof will call the .memory_usage with
309            # deep=True, and add on some GC overhead
310            diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
311            assert abs(diff) < 100
312
313    def test_map(self):
314        c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
315        result = c.map(lambda x: x.lower())
316        exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
317        tm.assert_categorical_equal(result, exp)
318
319        c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
320        result = c.map(lambda x: x.lower())
321        exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
322        tm.assert_categorical_equal(result, exp)
323
324        result = c.map(lambda x: 1)
325        # GH 12766: Return an index not an array
326        tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
327
328    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
329    def test_validate_inplace_raises(self, value):
330        cat = Categorical(["A", "B", "B", "C", "A"])
331        msg = (
332            'For argument "inplace" expected type bool, '
333            f"received type {type(value).__name__}"
334        )
335        with pytest.raises(ValueError, match=msg):
336            cat.set_ordered(value=True, inplace=value)
337
338        with pytest.raises(ValueError, match=msg):
339            cat.as_ordered(inplace=value)
340
341        with pytest.raises(ValueError, match=msg):
342            cat.as_unordered(inplace=value)
343
344        with pytest.raises(ValueError, match=msg):
345            cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value)
346
347        with pytest.raises(ValueError, match=msg):
348            cat.rename_categories(["X", "Y", "Z"], inplace=value)
349
350        with pytest.raises(ValueError, match=msg):
351            cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value)
352
353        with pytest.raises(ValueError, match=msg):
354            cat.add_categories(new_categories=["D", "E", "F"], inplace=value)
355
356        with pytest.raises(ValueError, match=msg):
357            cat.remove_categories(removals=["D", "E", "F"], inplace=value)
358
359        with pytest.raises(ValueError, match=msg):
360            with tm.assert_produces_warning(FutureWarning):
361                # issue #37643 inplace kwarg deprecated
362                cat.remove_unused_categories(inplace=value)
363
364        with pytest.raises(ValueError, match=msg):
365            cat.sort_values(inplace=value)
366