1import re
2
3import numpy as np
4import pytest
5
6from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
7import pandas._testing as tm
8from pandas.core.arrays.categorical import recode_for_categories
9from pandas.tests.arrays.categorical.common import TestCategorical
10
11
12class TestCategoricalAPI:
13    def test_ordered_api(self):
14        # GH 9347
15        cat1 = Categorical(list("acb"), ordered=False)
16        tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
17        assert not cat1.ordered
18
19        cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
20        tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
21        assert not cat2.ordered
22
23        cat3 = Categorical(list("acb"), ordered=True)
24        tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
25        assert cat3.ordered
26
27        cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
28        tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
29        assert cat4.ordered
30
31    def test_set_ordered(self):
32
33        cat = Categorical(["a", "b", "c", "a"], ordered=True)
34        cat2 = cat.as_unordered()
35        assert not cat2.ordered
36        cat2 = cat.as_ordered()
37        assert cat2.ordered
38        cat2.as_unordered(inplace=True)
39        assert not cat2.ordered
40        cat2.as_ordered(inplace=True)
41        assert cat2.ordered
42
43        assert cat2.set_ordered(True).ordered
44        assert not cat2.set_ordered(False).ordered
45        cat2.set_ordered(True, inplace=True)
46        assert cat2.ordered
47        cat2.set_ordered(False, inplace=True)
48        assert not cat2.ordered
49
50        # removed in 0.19.0
51        msg = "can't set attribute"
52        with pytest.raises(AttributeError, match=msg):
53            cat.ordered = True
54        with pytest.raises(AttributeError, match=msg):
55            cat.ordered = False
56
57    def test_rename_categories(self):
58        cat = Categorical(["a", "b", "c", "a"])
59
60        # inplace=False: the old one must not be changed
61        res = cat.rename_categories([1, 2, 3])
62        tm.assert_numpy_array_equal(
63            res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
64        )
65        tm.assert_index_equal(res.categories, Index([1, 2, 3]))
66
67        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
68        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
69
70        exp_cat = Index(["a", "b", "c"])
71        tm.assert_index_equal(cat.categories, exp_cat)
72
73        # GH18862 (let rename_categories take callables)
74        result = cat.rename_categories(lambda x: x.upper())
75        expected = Categorical(["A", "B", "C", "A"])
76        tm.assert_categorical_equal(result, expected)
77
78        # and now inplace
79        res = cat.rename_categories([1, 2, 3], inplace=True)
80        assert res is None
81        tm.assert_numpy_array_equal(
82            cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
83        )
84        tm.assert_index_equal(cat.categories, Index([1, 2, 3]))
85
86    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
87    def test_rename_categories_wrong_length_raises(self, new_categories):
88        cat = Categorical(["a", "b", "c", "a"])
89        msg = (
90            "new categories need to have the same number of items as the "
91            "old categories!"
92        )
93        with pytest.raises(ValueError, match=msg):
94            cat.rename_categories(new_categories)
95
96    def test_rename_categories_series(self):
97        # https://github.com/pandas-dev/pandas/issues/17981
98        c = Categorical(["a", "b"])
99        result = c.rename_categories(Series([0, 1], index=["a", "b"]))
100        expected = Categorical([0, 1])
101        tm.assert_categorical_equal(result, expected)
102
103    def test_rename_categories_dict(self):
104        # GH 17336
105        cat = Categorical(["a", "b", "c", "d"])
106        res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
107        expected = Index([4, 3, 2, 1])
108        tm.assert_index_equal(res.categories, expected)
109
110        # Test for inplace
111        res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True)
112        assert res is None
113        tm.assert_index_equal(cat.categories, expected)
114
115        # Test for dicts of smaller length
116        cat = Categorical(["a", "b", "c", "d"])
117        res = cat.rename_categories({"a": 1, "c": 3})
118
119        expected = Index([1, "b", 3, "d"])
120        tm.assert_index_equal(res.categories, expected)
121
122        # Test for dicts with bigger length
123        cat = Categorical(["a", "b", "c", "d"])
124        res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
125        expected = Index([1, 2, 3, 4])
126        tm.assert_index_equal(res.categories, expected)
127
128        # Test for dicts with no items from old categories
129        cat = Categorical(["a", "b", "c", "d"])
130        res = cat.rename_categories({"f": 1, "g": 3})
131
132        expected = Index(["a", "b", "c", "d"])
133        tm.assert_index_equal(res.categories, expected)
134
135    def test_reorder_categories(self):
136        cat = Categorical(["a", "b", "c", "a"], ordered=True)
137        old = cat.copy()
138        new = Categorical(
139            ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
140        )
141
142        # first inplace == False
143        res = cat.reorder_categories(["c", "b", "a"])
144        # cat must be the same as before
145        tm.assert_categorical_equal(cat, old)
146        # only res is changed
147        tm.assert_categorical_equal(res, new)
148
149        # inplace == True
150        res = cat.reorder_categories(["c", "b", "a"], inplace=True)
151        assert res is None
152        tm.assert_categorical_equal(cat, new)
153
154    @pytest.mark.parametrize(
155        "new_categories",
156        [
157            ["a"],  # not all "old" included in "new"
158            ["a", "b", "d"],  # still not all "old" in "new"
159            ["a", "b", "c", "d"],  # all "old" included in "new", but too long
160        ],
161    )
162    def test_reorder_categories_raises(self, new_categories):
163        cat = Categorical(["a", "b", "c", "a"], ordered=True)
164        msg = "items in new_categories are not the same as in old categories"
165        with pytest.raises(ValueError, match=msg):
166            cat.reorder_categories(new_categories)
167
168    def test_add_categories(self):
169        cat = Categorical(["a", "b", "c", "a"], ordered=True)
170        old = cat.copy()
171        new = Categorical(
172            ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
173        )
174
175        # first inplace == False
176        res = cat.add_categories("d")
177        tm.assert_categorical_equal(cat, old)
178        tm.assert_categorical_equal(res, new)
179
180        res = cat.add_categories(["d"])
181        tm.assert_categorical_equal(cat, old)
182        tm.assert_categorical_equal(res, new)
183
184        # inplace == True
185        res = cat.add_categories("d", inplace=True)
186        tm.assert_categorical_equal(cat, new)
187        assert res is None
188
189        # GH 9927
190        cat = Categorical(list("abc"), ordered=True)
191        expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
192        # test with Series, np.array, index, list
193        res = cat.add_categories(Series(["d", "e"]))
194        tm.assert_categorical_equal(res, expected)
195        res = cat.add_categories(np.array(["d", "e"]))
196        tm.assert_categorical_equal(res, expected)
197        res = cat.add_categories(Index(["d", "e"]))
198        tm.assert_categorical_equal(res, expected)
199        res = cat.add_categories(["d", "e"])
200        tm.assert_categorical_equal(res, expected)
201
202    def test_add_categories_existing_raises(self):
203        # new is in old categories
204        cat = Categorical(["a", "b", "c", "d"], ordered=True)
205        msg = re.escape("new categories must not include old categories: {'d'}")
206        with pytest.raises(ValueError, match=msg):
207            cat.add_categories(["d"])
208
209    def test_set_categories(self):
210        cat = Categorical(["a", "b", "c", "a"], ordered=True)
211        exp_categories = Index(["c", "b", "a"])
212        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
213
214        res = cat.set_categories(["c", "b", "a"], inplace=True)
215        tm.assert_index_equal(cat.categories, exp_categories)
216        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
217        assert res is None
218
219        res = cat.set_categories(["a", "b", "c"])
220        # cat must be the same as before
221        tm.assert_index_equal(cat.categories, exp_categories)
222        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
223        # only res is changed
224        exp_categories_back = Index(["a", "b", "c"])
225        tm.assert_index_equal(res.categories, exp_categories_back)
226        tm.assert_numpy_array_equal(res.__array__(), exp_values)
227
228        # not all "old" included in "new" -> all not included ones are now
229        # np.nan
230        cat = Categorical(["a", "b", "c", "a"], ordered=True)
231        res = cat.set_categories(["a"])
232        tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
233
234        # still not all "old" in "new"
235        res = cat.set_categories(["a", "b", "d"])
236        tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
237        tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
238
239        # all "old" included in "new"
240        cat = cat.set_categories(["a", "b", "c", "d"])
241        exp_categories = Index(["a", "b", "c", "d"])
242        tm.assert_index_equal(cat.categories, exp_categories)
243
244        # internals...
245        c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
246        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
247        tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
248
249        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
250        tm.assert_numpy_array_equal(np.asarray(c), exp)
251
252        # all "pointers" to '4' must be changed from 3 to 0,...
253        c = c.set_categories([4, 3, 2, 1])
254
255        # positions are changed
256        tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
257
258        # categories are now in new order
259        tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
260
261        # output is the same
262        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
263        tm.assert_numpy_array_equal(np.asarray(c), exp)
264        assert c.min() == 4
265        assert c.max() == 1
266
267        # set_categories should set the ordering if specified
268        c2 = c.set_categories([4, 3, 2, 1], ordered=False)
269        assert not c2.ordered
270
271        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
272
273        # set_categories should pass thru the ordering
274        c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
275        assert not c2.ordered
276
277        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
278
279    def test_to_dense_deprecated(self):
280        cat = Categorical(["a", "b", "c", "a"], ordered=True)
281
282        with tm.assert_produces_warning(FutureWarning):
283            cat.to_dense()
284
285    @pytest.mark.parametrize(
286        "values, categories, new_categories",
287        [
288            # No NaNs, same cats, same order
289            (["a", "b", "a"], ["a", "b"], ["a", "b"]),
290            # No NaNs, same cats, different order
291            (["a", "b", "a"], ["a", "b"], ["b", "a"]),
292            # Same, unsorted
293            (["b", "a", "a"], ["a", "b"], ["a", "b"]),
294            # No NaNs, same cats, different order
295            (["b", "a", "a"], ["a", "b"], ["b", "a"]),
296            # NaNs
297            (["a", "b", "c"], ["a", "b"], ["a", "b"]),
298            (["a", "b", "c"], ["a", "b"], ["b", "a"]),
299            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
300            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
301            # Introduce NaNs
302            (["a", "b", "c"], ["a", "b"], ["a"]),
303            (["a", "b", "c"], ["a", "b"], ["b"]),
304            (["b", "a", "c"], ["a", "b"], ["a"]),
305            (["b", "a", "c"], ["a", "b"], ["a"]),
306            # No overlap
307            (["a", "b", "c"], ["a", "b"], ["d", "e"]),
308        ],
309    )
310    @pytest.mark.parametrize("ordered", [True, False])
311    def test_set_categories_many(self, values, categories, new_categories, ordered):
312        c = Categorical(values, categories)
313        expected = Categorical(values, new_categories, ordered)
314        result = c.set_categories(new_categories, ordered=ordered)
315        tm.assert_categorical_equal(result, expected)
316
317    def test_set_categories_rename_less(self):
318        # GH 24675
319        cat = Categorical(["A", "B"])
320        result = cat.set_categories(["A"], rename=True)
321        expected = Categorical(["A", np.nan])
322        tm.assert_categorical_equal(result, expected)
323
324    def test_set_categories_private(self):
325        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
326        cat._set_categories(["a", "c", "d", "e"])
327        expected = Categorical(["a", "c", "d"], categories=list("acde"))
328        tm.assert_categorical_equal(cat, expected)
329
330        # fastpath
331        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
332        cat._set_categories(["a", "c", "d", "e"], fastpath=True)
333        expected = Categorical(["a", "c", "d"], categories=list("acde"))
334        tm.assert_categorical_equal(cat, expected)
335
336    def test_remove_categories(self):
337        cat = Categorical(["a", "b", "c", "a"], ordered=True)
338        old = cat.copy()
339        new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
340
341        # first inplace == False
342        res = cat.remove_categories("c")
343        tm.assert_categorical_equal(cat, old)
344        tm.assert_categorical_equal(res, new)
345
346        res = cat.remove_categories(["c"])
347        tm.assert_categorical_equal(cat, old)
348        tm.assert_categorical_equal(res, new)
349
350        # inplace == True
351        res = cat.remove_categories("c", inplace=True)
352        tm.assert_categorical_equal(cat, new)
353        assert res is None
354
355    @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
356    def test_remove_categories_raises(self, removals):
357        cat = Categorical(["a", "b", "a"])
358        message = re.escape("removals must all be in old categories: {'c'}")
359
360        with pytest.raises(ValueError, match=message):
361            cat.remove_categories(removals)
362
363    def test_remove_unused_categories(self):
364        c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
365        exp_categories_all = Index(["a", "b", "c", "d", "e"])
366        exp_categories_dropped = Index(["a", "b", "c", "d"])
367
368        tm.assert_index_equal(c.categories, exp_categories_all)
369
370        res = c.remove_unused_categories()
371        tm.assert_index_equal(res.categories, exp_categories_dropped)
372        tm.assert_index_equal(c.categories, exp_categories_all)
373
374        with tm.assert_produces_warning(FutureWarning):
375            # issue #37643 inplace kwarg deprecated
376            res = c.remove_unused_categories(inplace=True)
377
378        tm.assert_index_equal(c.categories, exp_categories_dropped)
379        assert res is None
380
381        # with NaN values (GH11599)
382        c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
383        res = c.remove_unused_categories()
384        tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
385        exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
386        tm.assert_numpy_array_equal(res.codes, exp_codes)
387        tm.assert_index_equal(c.categories, exp_categories_all)
388
389        val = ["F", np.nan, "D", "B", "D", "F", np.nan]
390        cat = Categorical(values=val, categories=list("ABCDEFG"))
391        out = cat.remove_unused_categories()
392        tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
393        exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
394        tm.assert_numpy_array_equal(out.codes, exp_codes)
395        assert out.tolist() == val
396
397        alpha = list("abcdefghijklmnopqrstuvwxyz")
398        val = np.random.choice(alpha[::2], 10000).astype("object")
399        val[np.random.choice(len(val), 100)] = np.nan
400
401        cat = Categorical(values=val, categories=alpha)
402        out = cat.remove_unused_categories()
403        assert out.tolist() == val.tolist()
404
405
class TestCategoricalAPIWithFactor(TestCategorical):
    """``describe`` and ``set_categories`` checks driven by ``self.factor``.

    ``self.factor`` is not defined in this class; presumably it is set up by
    the ``TestCategorical`` base class -- confirm against that helper.
    """

    @staticmethod
    def _describe_frame(index, counts, total):
        """Build the expected ``describe`` output: counts and counts / total."""
        return DataFrame(
            {"counts": counts, "freqs": [n / total for n in counts]}, index=index
        )

    def test_describe(self):
        """``describe`` reports counts and frequencies for each category."""
        factor = self.factor

        # string type
        described = factor.describe()
        assert factor.ordered
        str_index = CategoricalIndex(
            list("abc"), ordered=factor.ordered, name="categories"
        )
        tm.assert_frame_equal(
            described, self._describe_frame(str_index, [3, 2, 3], 8.0)
        )

        # check unused categories: the extra "d" shows up with a zero count
        widened = factor.copy()
        widened.set_categories(["a", "b", "c", "d"], inplace=True)
        widened_index = CategoricalIndex(
            ["a", "b", "c", "d"], name="categories", ordered=factor.ordered
        )
        tm.assert_frame_equal(
            widened.describe(),
            self._describe_frame(widened_index, [3, 2, 3, 0], 8.0),
        )

        # check an integer one
        int_cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        int_index = CategoricalIndex(
            [1, 2, 3], name="categories", ordered=int_cat.ordered
        )
        tm.assert_frame_equal(
            int_cat.describe(), self._describe_frame(int_index, [5, 3, 3], 11.0)
        )

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        nan_cat = Categorical([np.nan, 1, 2, 2])
        nan_index = CategoricalIndex(
            [1, 2, np.nan], categories=[1, 2], name="categories"
        )
        tm.assert_frame_equal(
            nan_cat.describe(), self._describe_frame(nan_index, [1, 2, 1], 4.0)
        )

    def test_set_categories_inplace(self):
        """``set_categories(..., inplace=True)`` updates the categories in place."""
        expanded = self.factor.copy()
        expanded.set_categories(list("abcd"), inplace=True)
        tm.assert_index_equal(expanded.categories, Index(list("abcd")))
459
460
class TestPrivateCategoricalAPI:
    """Tests for ``Categorical`` internals: ``codes`` and ``recode_for_categories``."""

    def test_codes_immutable(self):
        """``codes`` is a read-only view while the backing array stays writeable."""
        # Codes should be read only
        c = Categorical(["a", "b", "c", "a", np.nan])
        exp = np.array([0, 1, 2, 0, -1], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)

        # Assignments to codes should raise.
        # Python < 3.11 words the error "can't set attribute"; 3.11+ words it
        # "property ... has no setter". Accept either wording.
        msg = "can't set attribute|has no setter"
        with pytest.raises(AttributeError, match=msg):
            c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")

        # changes in the codes array should raise
        codes = c.codes

        with pytest.raises(ValueError, match="assignment destination is read-only"):
            codes[4] = 1

        # But even after getting the codes, the original array should still be
        # writeable!
        c[4] = "a"
        exp = np.array([0, 1, 2, 0, 0], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)
        c._codes[4] = 2
        exp = np.array([0, 1, 2, 0, 2], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)

    @pytest.mark.parametrize(
        "codes, old, new, expected",
        [
            ([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
            ([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
            ([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
            ([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
            ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
            ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
            ([-1, -1], [], ["a", "b"], [-1, -1]),
            ([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
        ],
    )
    def test_recode_to_categories(self, codes, old, new, expected):
        """Codes are remapped from ``old`` to ``new``; unmatched entries become -1."""
        codes = np.asanyarray(codes, dtype=np.int8)
        expected = np.asanyarray(expected, dtype=np.int8)
        old = Index(old)
        new = Index(new)
        result = recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)

    def test_recode_to_categories_large(self):
        """Recoding 1000 categories into reversed order yields int16 codes."""
        N = 1000
        codes = np.arange(N)
        old = Index(codes)
        # Reversed category order, so code i must map to N - 1 - i.
        expected = np.arange(N - 1, -1, -1, dtype=np.int16)
        new = Index(expected)
        result = recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)
521