import re

import numpy as np
import pytest

from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series
import pandas._testing as tm
from pandas.core.arrays.categorical import recode_for_categories
from pandas.tests.arrays.categorical.common import TestCategorical


class TestCategoricalAPI:
    """Tests for the public category-manipulation methods of ``Categorical``."""

    def test_ordered_api(self):
        """Categories and the ``ordered`` flag follow the constructor arguments."""
        # GH 9347
        cat1 = Categorical(list("acb"), ordered=False)
        tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
        assert not cat1.ordered

        cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
        tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
        assert not cat2.ordered

        cat3 = Categorical(list("acb"), ordered=True)
        tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
        assert cat3.ordered

        cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
        tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
        assert cat4.ordered

    def test_set_ordered(self):
        """``as_ordered``/``as_unordered``/``set_ordered`` toggle ``ordered``.

        Covers both the copying and the ``inplace=True`` forms, and checks that
        assigning to the ``ordered`` attribute directly raises.
        """
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        cat2 = cat.as_unordered()
        assert not cat2.ordered
        cat2 = cat.as_ordered()
        assert cat2.ordered
        cat2.as_unordered(inplace=True)
        assert not cat2.ordered
        cat2.as_ordered(inplace=True)
        assert cat2.ordered

        assert cat2.set_ordered(True).ordered
        assert not cat2.set_ordered(False).ordered
        cat2.set_ordered(True, inplace=True)
        assert cat2.ordered
        cat2.set_ordered(False, inplace=True)
        assert not cat2.ordered

        # removed in 0.19.0
        msg = "can't set attribute"
        with pytest.raises(AttributeError, match=msg):
            cat.ordered = True
        with pytest.raises(AttributeError, match=msg):
            cat.ordered = False

    def test_rename_categories(self):
        """``rename_categories`` accepts a list or a callable, copying or inplace."""
        cat = Categorical(["a", "b", "c", "a"])

        # inplace=False: the old one must not be changed
        res = cat.rename_categories([1, 2, 3])
        tm.assert_numpy_array_equal(
            res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
        )
        tm.assert_index_equal(res.categories, Index([1, 2, 3]))

        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        exp_cat = Index(["a", "b", "c"])
        tm.assert_index_equal(cat.categories, exp_cat)

        # GH18862 (let rename_categories take callables)
        result = cat.rename_categories(lambda x: x.upper())
        expected = Categorical(["A", "B", "C", "A"])
        tm.assert_categorical_equal(result, expected)

        # and now inplace
        res = cat.rename_categories([1, 2, 3], inplace=True)
        assert res is None
        tm.assert_numpy_array_equal(
            cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
        )
        tm.assert_index_equal(cat.categories, Index([1, 2, 3]))

    @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
    def test_rename_categories_wrong_length_raises(self, new_categories):
        """A list of new names that is too long or too short raises ``ValueError``."""
        cat = Categorical(["a", "b", "c", "a"])
        msg = (
            "new categories need to have the same number of items as the "
            "old categories!"
        )
        with pytest.raises(ValueError, match=msg):
            cat.rename_categories(new_categories)

    def test_rename_categories_series(self):
        """A ``Series`` indexed by the old categories works as a rename mapping."""
        # https://github.com/pandas-dev/pandas/issues/17981
        c = Categorical(["a", "b"])
        result = c.rename_categories(Series([0, 1], index=["a", "b"]))
        expected = Categorical([0, 1])
        tm.assert_categorical_equal(result, expected)

    def test_rename_categories_dict(self):
        """Dict renames: full, inplace, partial, oversized and disjoint mappings."""
        # GH 17336
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
        expected = Index([4, 3, 2, 1])
        tm.assert_index_equal(res.categories, expected)

        # Test for inplace
        res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1}, inplace=True)
        assert res is None
        tm.assert_index_equal(cat.categories, expected)

        # Test for dicts of smaller length
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 1, "c": 3})

        expected = Index([1, "b", 3, "d"])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with bigger length
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
        expected = Index([1, 2, 3, 4])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with no items from old categories
        cat = Categorical(["a", "b", "c", "d"])
        res = cat.rename_categories({"f": 1, "g": 3})

        expected = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(res.categories, expected)

    def test_reorder_categories(self):
        """``reorder_categories`` reorders on a copy or, with ``inplace``, in place."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(
            ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
        )

        # first inplace == False
        res = cat.reorder_categories(["c", "b", "a"])
        # cat must be the same as before
        tm.assert_categorical_equal(cat, old)
        # only res is changed
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.reorder_categories(["c", "b", "a"], inplace=True)
        assert res is None
        tm.assert_categorical_equal(cat, new)

    @pytest.mark.parametrize(
        "new_categories",
        [
            ["a"],  # not all "old" included in "new"
            ["a", "b", "d"],  # still not all "old" in "new"
            ["a", "b", "c", "d"],  # all "old" included in "new", but too long
        ],
    )
    def test_reorder_categories_raises(self, new_categories):
        """Reordering with a different set of categories raises ``ValueError``."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        msg = "items in new_categories are not the same as in old categories"
        with pytest.raises(ValueError, match=msg):
            cat.reorder_categories(new_categories)

    def test_add_categories(self):
        """``add_categories`` appends new categories from scalars and list-likes."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(
            ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
        )

        # first inplace == False
        res = cat.add_categories("d")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.add_categories(["d"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.add_categories("d", inplace=True)
        tm.assert_categorical_equal(cat, new)
        assert res is None

        # GH 9927
        cat = Categorical(list("abc"), ordered=True)
        expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
        # test with Series, np.array, index, list
        res = cat.add_categories(Series(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(np.array(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(Index(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(["d", "e"])
        tm.assert_categorical_equal(res, expected)

    def test_add_categories_existing_raises(self):
        """Adding a category that already exists raises ``ValueError``."""
        # new is in old categories
        cat = Categorical(["a", "b", "c", "d"], ordered=True)
        msg = re.escape("new categories must not include old categories: {'d'}")
        with pytest.raises(ValueError, match=msg):
            cat.add_categories(["d"])

    def test_set_categories(self):
        """``set_categories`` recodes values, maps dropped ones to -1, keeps order."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)

        res = cat.set_categories(["c", "b", "a"], inplace=True)
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        assert res is None

        res = cat.set_categories(["a", "b", "c"])
        # cat must be the same as before
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        # only res is changed
        exp_categories_back = Index(["a", "b", "c"])
        tm.assert_index_equal(res.categories, exp_categories_back)
        tm.assert_numpy_array_equal(res.__array__(), exp_values)

        # not all "old" included in "new" -> all not included ones are now
        # np.nan
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        res = cat.set_categories(["a"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))

        # still not all "old" in "new"
        res = cat.set_categories(["a", "b", "d"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
        tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))

        # all "old" included in "new"
        cat = cat.set_categories(["a", "b", "c", "d"])
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(cat.categories, exp_categories)

        # internals...
        c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
        tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))

        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(np.asarray(c), exp)

        # all "pointers" to '4' must be changed from 3 to 0,...
        c = c.set_categories([4, 3, 2, 1])

        # positions are changed
        tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))

        # categories are now in new order
        tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))

        # output is the same
        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(np.asarray(c), exp)
        # ordered by category position, so the first category (4) is the minimum
        assert c.min() == 4
        assert c.max() == 1

        # set_categories should set the ordering if specified
        c2 = c.set_categories([4, 3, 2, 1], ordered=False)
        assert not c2.ordered

        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))

        # set_categories should pass thru the ordering
        c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
        assert not c2.ordered

        tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))

    def test_to_dense_deprecated(self):
        """``to_dense`` emits a ``FutureWarning``."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)

        with tm.assert_produces_warning(FutureWarning):
            cat.to_dense()

    @pytest.mark.parametrize(
        "values, categories, new_categories",
        [
            # No NaNs, same cats, same order
            (["a", "b", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["a", "b", "a"], ["a", "b"], ["b", "a"]),
            # Same, unsorted
            (["b", "a", "a"], ["a", "b"], ["a", "b"]),
            # No NaNs, same cats, different order
            (["b", "a", "a"], ["a", "b"], ["b", "a"]),
            # NaNs
            (["a", "b", "c"], ["a", "b"], ["a", "b"]),
            (["a", "b", "c"], ["a", "b"], ["b", "a"]),
            (["b", "a", "c"], ["a", "b"], ["a", "b"]),
            # previously an exact duplicate of the case above; the reversed
            # order for unsorted values was never exercised
            (["b", "a", "c"], ["a", "b"], ["b", "a"]),
            # Introduce NaNs
            (["a", "b", "c"], ["a", "b"], ["a"]),
            (["a", "b", "c"], ["a", "b"], ["b"]),
            (["b", "a", "c"], ["a", "b"], ["a"]),
            # previously an exact duplicate of the case above; keeping only
            # "b" for unsorted values was never exercised
            (["b", "a", "c"], ["a", "b"], ["b"]),
            # No overlap
            (["a", "b", "c"], ["a", "b"], ["d", "e"]),
        ],
    )
    @pytest.mark.parametrize("ordered", [True, False])
    def test_set_categories_many(self, values, categories, new_categories, ordered):
        """``set_categories`` matches constructing directly with the new categories."""
        c = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = c.set_categories(new_categories, ordered=ordered)
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_rename_less(self):
        """``rename=True`` with fewer categories turns the dropped value into NaN."""
        # GH 24675
        cat = Categorical(["A", "B"])
        result = cat.set_categories(["A"], rename=True)
        expected = Categorical(["A", np.nan])
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_private(self):
        """``_set_categories`` swaps categories in place without recoding values."""
        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
        cat._set_categories(["a", "c", "d", "e"])
        expected = Categorical(["a", "c", "d"], categories=list("acde"))
        tm.assert_categorical_equal(cat, expected)

        # fastpath
        cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
        cat._set_categories(["a", "c", "d", "e"], fastpath=True)
        expected = Categorical(["a", "c", "d"], categories=list("acde"))
        tm.assert_categorical_equal(cat, expected)

    def test_remove_categories(self):
        """``remove_categories`` drops a category; its values become NaN."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)

        # first inplace == False
        res = cat.remove_categories("c")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.remove_categories(["c"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.remove_categories("c", inplace=True)
        tm.assert_categorical_equal(cat, new)
        assert res is None

    @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
    def test_remove_categories_raises(self, removals):
        """Removing a category that is not present raises ``ValueError``."""
        cat = Categorical(["a", "b", "a"])
        message = re.escape("removals must all be in old categories: {'c'}")

        with pytest.raises(ValueError, match=message):
            cat.remove_categories(removals)

    def test_remove_unused_categories(self):
        """``remove_unused_categories`` drops unused categories, preserving values."""
        c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
        exp_categories_all = Index(["a", "b", "c", "d", "e"])
        exp_categories_dropped = Index(["a", "b", "c", "d"])

        tm.assert_index_equal(c.categories, exp_categories_all)

        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories, exp_categories_dropped)
        tm.assert_index_equal(c.categories, exp_categories_all)

        with tm.assert_produces_warning(FutureWarning):
            # issue #37643 inplace kwarg deprecated
            res = c.remove_unused_categories(inplace=True)

        tm.assert_index_equal(c.categories, exp_categories_dropped)
        assert res is None

        # with NaN values (GH11599)
        c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
        exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(res.codes, exp_codes)
        tm.assert_index_equal(c.categories, exp_categories_all)

        val = ["F", np.nan, "D", "B", "D", "F", np.nan]
        cat = Categorical(values=val, categories=list("ABCDEFG"))
        out = cat.remove_unused_categories()
        tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
        exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(out.codes, exp_codes)
        assert out.tolist() == val

        # larger random input: only every other letter is used as a value, and
        # up to 100 positions are overwritten with NaN
        alpha = list("abcdefghijklmnopqrstuvwxyz")
        val = np.random.choice(alpha[::2], 10000).astype("object")
        val[np.random.choice(len(val), 100)] = np.nan

        cat = Categorical(values=val, categories=alpha)
        out = cat.remove_unused_categories()
        assert out.tolist() == val.tolist()


class TestCategoricalAPIWithFactor(TestCategorical):
    """API tests that operate on ``self.factor``.

    NOTE(review): ``self.factor`` is not defined in this file; presumably it is
    set up by the ``TestCategorical`` base class -- confirm there.
    """

    def test_describe(self):
        """``describe`` reports counts and freqs per category, including NaN."""
        # string type
        desc = self.factor.describe()
        assert self.factor.ordered
        exp_index = CategoricalIndex(
            ["a", "b", "c"], name="categories", ordered=self.factor.ordered
        )
        expected = DataFrame(
            {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index
        )
        tm.assert_frame_equal(desc, expected)

        # check unused categories
        cat = self.factor.copy()
        cat.set_categories(["a", "b", "c", "d"], inplace=True)
        desc = cat.describe()

        exp_index = CategoricalIndex(
            list("abcd"), ordered=self.factor.ordered, name="categories"
        )
        expected = DataFrame(
            {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # check an integer one
        cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        desc = cat.describe()
        exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories")
        expected = DataFrame(
            {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
            index=exp_index,
        )
        tm.assert_frame_equal(desc, expected)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        cat = Categorical([np.nan, 1, 2, 2])
        desc = cat.describe()
        expected = DataFrame(
            {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
            index=CategoricalIndex(
                [1, 2, np.nan], categories=[1, 2], name="categories"
            ),
        )
        tm.assert_frame_equal(desc, expected)

    def test_set_categories_inplace(self):
        """``set_categories(..., inplace=True)`` updates the categories in place."""
        cat = self.factor.copy()
        cat.set_categories(["a", "b", "c", "d"], inplace=True)
        tm.assert_index_equal(cat.categories, Index(["a", "b", "c", "d"]))


class TestPrivateCategoricalAPI:
    """Tests for ``Categorical`` codes and the ``recode_for_categories`` helper."""

    def test_codes_immutable(self):
        """``codes`` is read-only, while the categorical itself stays writeable."""
        # Codes should be read only
        c = Categorical(["a", "b", "c", "a", np.nan])
        exp = np.array([0, 1, 2, 0, -1], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)

        # Assignments to codes should raise
        with pytest.raises(AttributeError, match="can't set attribute"):
            c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")

        # changes in the codes array should raise
        codes = c.codes

        with pytest.raises(ValueError, match="assignment destination is read-only"):
            codes[4] = 1

        # But even after getting the codes, the original array should still be
        # writeable!
        c[4] = "a"
        exp = np.array([0, 1, 2, 0, 0], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)
        c._codes[4] = 2
        exp = np.array([0, 1, 2, 0, 2], dtype="int8")
        tm.assert_numpy_array_equal(c.codes, exp)

    @pytest.mark.parametrize(
        "codes, old, new, expected",
        [
            ([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
            ([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
            ([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
            ([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
            ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
            ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
            ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
            ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
            ([-1, -1], [], ["a", "b"], [-1, -1]),
            ([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
        ],
    )
    def test_recode_to_categories(self, codes, old, new, expected):
        """``recode_for_categories`` remaps codes from ``old`` to ``new``."""
        codes = np.asanyarray(codes, dtype=np.int8)
        expected = np.asanyarray(expected, dtype=np.int8)
        old = Index(old)
        new = Index(new)
        result = recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)

    def test_recode_to_categories_large(self):
        """Recoding 1000 categories to their reverse order yields int16 codes."""
        N = 1000
        codes = np.arange(N)
        old = Index(codes)
        expected = np.arange(N - 1, -1, -1, dtype=np.int16)
        new = Index(expected)
        result = recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)