1import re 2import sys 3 4import numpy as np 5import pytest 6 7from pandas.compat import PYPY 8 9from pandas import Categorical, Index, NaT, Series, date_range 10import pandas._testing as tm 11from pandas.api.types import is_scalar 12 13 14class TestCategoricalAnalytics: 15 @pytest.mark.parametrize("aggregation", ["min", "max"]) 16 def test_min_max_not_ordered_raises(self, aggregation): 17 # unordered cats have no min/max 18 cat = Categorical(["a", "b", "c", "d"], ordered=False) 19 msg = f"Categorical is not ordered for operation {aggregation}" 20 agg_func = getattr(cat, aggregation) 21 22 with pytest.raises(TypeError, match=msg): 23 agg_func() 24 25 def test_min_max_ordered(self): 26 cat = Categorical(["a", "b", "c", "d"], ordered=True) 27 _min = cat.min() 28 _max = cat.max() 29 assert _min == "a" 30 assert _max == "d" 31 32 cat = Categorical( 33 ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True 34 ) 35 _min = cat.min() 36 _max = cat.max() 37 assert _min == "d" 38 assert _max == "a" 39 40 @pytest.mark.parametrize( 41 "categories,expected", 42 [ 43 (list("ABC"), np.NaN), 44 ([1, 2, 3], np.NaN), 45 pytest.param( 46 Series(date_range("2020-01-01", periods=3), dtype="category"), 47 NaT, 48 marks=pytest.mark.xfail( 49 reason="https://github.com/pandas-dev/pandas/issues/29962" 50 ), 51 ), 52 ], 53 ) 54 @pytest.mark.parametrize("aggregation", ["min", "max"]) 55 def test_min_max_ordered_empty(self, categories, expected, aggregation): 56 # GH 30227 57 cat = Categorical([], categories=categories, ordered=True) 58 59 agg_func = getattr(cat, aggregation) 60 result = agg_func() 61 assert result is expected 62 63 @pytest.mark.parametrize( 64 "values, categories", 65 [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])], 66 ) 67 @pytest.mark.parametrize("skipna", [True, False]) 68 @pytest.mark.parametrize("function", ["min", "max"]) 69 def test_min_max_with_nan(self, values, categories, function, skipna): 70 # GH 25303 71 cat = Categorical(values, categories=categories, ordered=True) 72 result = getattr(cat, function)(skipna=skipna) 73 74 if skipna is False: 75 assert result is np.nan 76 else: 77 expected = categories[0] if function == "min" else categories[2] 78 assert result == expected 79 80 @pytest.mark.parametrize("function", ["min", "max"]) 81 @pytest.mark.parametrize("skipna", [True, False]) 82 def test_min_max_only_nan(self, function, skipna): 83 # https://github.com/pandas-dev/pandas/issues/33450 84 cat = Categorical([np.nan], categories=[1, 2], ordered=True) 85 result = getattr(cat, function)(skipna=skipna) 86 assert result is np.nan 87 88 @pytest.mark.parametrize("method", ["min", "max"]) 89 def test_deprecate_numeric_only_min_max(self, method): 90 # GH 25303 91 cat = Categorical( 92 [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True 93 ) 94 with tm.assert_produces_warning(expected_warning=FutureWarning): 95 getattr(cat, method)(numeric_only=True) 96 97 @pytest.mark.parametrize("method", ["min", "max"]) 98 def test_numpy_min_max_raises(self, method): 99 cat = Categorical(["a", "b", "c", "b"], ordered=False) 100 msg = ( 101 f"Categorical is not ordered for operation {method}\n" 102 "you can use .as_ordered() to change the Categorical to an ordered one" 103 ) 104 method = getattr(np, method) 105 with pytest.raises(TypeError, match=re.escape(msg)): 106 method(cat) 107 108 @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"]) 109 @pytest.mark.parametrize("method", ["min", "max"]) 110 def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg): 111 cat = Categorical(["a", "b", "c", "b"], ordered=True) 112 msg = ( 113 f"the '{kwarg}' parameter is not supported in the pandas implementation " 114 f"of {method}" 115 ) 116 if kwarg == "axis": 117 msg = r"`axis` must be fewer than the number of dimensions \(1\)" 118 kwargs = {kwarg: 42} 119 method = getattr(np, method) 120 with pytest.raises(ValueError, match=msg): 121 method(cat, **kwargs) 122 123 @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")]) 124 def test_numpy_min_max_axis_equals_none(self, method, expected): 125 cat = Categorical(["a", "b", "c", "b"], ordered=True) 126 method = getattr(np, method) 127 result = method(cat, axis=None) 128 assert result == expected 129 130 @pytest.mark.parametrize( 131 "values,categories,exp_mode", 132 [ 133 ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]), 134 ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]), 135 ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]), 136 ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]), 137 ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), 138 ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]), 139 ], 140 ) 141 def test_mode(self, values, categories, exp_mode): 142 s = Categorical(values, categories=categories, ordered=True) 143 res = s.mode() 144 exp = Categorical(exp_mode, categories=categories, ordered=True) 145 tm.assert_categorical_equal(res, exp) 146 147 def test_searchsorted(self, ordered): 148 # https://github.com/pandas-dev/pandas/issues/8420 149 # https://github.com/pandas-dev/pandas/issues/14522 150 151 cat = Categorical( 152 ["cheese", "milk", "apple", "bread", "bread"], 153 categories=["cheese", "milk", "apple", "bread"], 154 ordered=ordered, 155 ) 156 ser = Series(cat) 157 158 # Searching for single item argument, side='left' (default) 159 res_cat = cat.searchsorted("apple") 160 assert res_cat == 2 161 assert is_scalar(res_cat) 162 163 res_ser = ser.searchsorted("apple") 164 assert res_ser == 2 165 assert is_scalar(res_ser) 166 167 # Searching for single item array, side='left' (default) 168 res_cat = cat.searchsorted(["bread"]) 169 res_ser = ser.searchsorted(["bread"]) 170 exp = np.array([3], dtype=np.intp) 171 tm.assert_numpy_array_equal(res_cat, exp) 172 tm.assert_numpy_array_equal(res_ser, exp) 173 174 # Searching for several items array, side='right' 175 res_cat = cat.searchsorted(["apple", "bread"], side="right") 176 res_ser = ser.searchsorted(["apple", "bread"], side="right") 177 exp = np.array([3, 5], dtype=np.intp) 178 tm.assert_numpy_array_equal(res_cat, exp) 179 tm.assert_numpy_array_equal(res_ser, exp) 180 181 # Searching for a single value that is not from the Categorical 182 with pytest.raises(KeyError, match="cucumber"): 183 cat.searchsorted("cucumber") 184 with pytest.raises(KeyError, match="cucumber"): 185 ser.searchsorted("cucumber") 186 187 # Searching for multiple values one of each is not from the Categorical 188 with pytest.raises(KeyError, match="cucumber"): 189 cat.searchsorted(["bread", "cucumber"]) 190 with pytest.raises(KeyError, match="cucumber"): 191 ser.searchsorted(["bread", "cucumber"]) 192 193 def test_unique(self): 194 # categories are reordered based on value when ordered=False 195 cat = Categorical(["a", "b"]) 196 exp = Index(["a", "b"]) 197 res = cat.unique() 198 tm.assert_index_equal(res.categories, exp) 199 tm.assert_categorical_equal(res, cat) 200 201 cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) 202 res = cat.unique() 203 tm.assert_index_equal(res.categories, exp) 204 tm.assert_categorical_equal(res, Categorical(exp)) 205 206 cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) 207 exp = Index(["c", "a", "b"]) 208 res = cat.unique() 209 tm.assert_index_equal(res.categories, exp) 210 exp_cat = Categorical(exp, categories=["c", "a", "b"]) 211 tm.assert_categorical_equal(res, exp_cat) 212 213 # nan must be removed 214 cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) 215 res = cat.unique() 216 exp = Index(["b", "a"]) 217 tm.assert_index_equal(res.categories, exp) 218 exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) 219 tm.assert_categorical_equal(res, exp_cat) 220 221 def test_unique_ordered(self): 222 # keep categories order when ordered=True 223 cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) 224 res = cat.unique() 225 exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) 226 tm.assert_categorical_equal(res, exp_cat) 227 228 cat = Categorical( 229 ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True 230 ) 231 res = cat.unique() 232 exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) 233 tm.assert_categorical_equal(res, exp_cat) 234 235 cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) 236 res = cat.unique() 237 exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) 238 tm.assert_categorical_equal(res, exp_cat) 239 240 cat = Categorical( 241 ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True 242 ) 243 res = cat.unique() 244 exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) 245 tm.assert_categorical_equal(res, exp_cat) 246 247 def test_unique_index_series(self): 248 c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) 249 # Categorical.unique sorts categories by appearance order 250 # if ordered=False 251 exp = Categorical([3, 1, 2], categories=[3, 1, 2]) 252 tm.assert_categorical_equal(c.unique(), exp) 253 254 tm.assert_index_equal(Index(c).unique(), Index(exp)) 255 tm.assert_categorical_equal(Series(c).unique(), exp) 256 257 c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) 258 exp = Categorical([1, 2], categories=[1, 2]) 259 tm.assert_categorical_equal(c.unique(), exp) 260 tm.assert_index_equal(Index(c).unique(), Index(exp)) 261 tm.assert_categorical_equal(Series(c).unique(), exp) 262 263 c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) 264 # Categorical.unique keeps categories order if ordered=True 265 exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) 266 tm.assert_categorical_equal(c.unique(), exp) 267 268 tm.assert_index_equal(Index(c).unique(), Index(exp)) 269 tm.assert_categorical_equal(Series(c).unique(), exp) 270 271 def test_shift(self): 272 # GH 9416 273 cat = Categorical(["a", "b", "c", "d", "a"]) 274 275 # shift forward 276 sp1 = cat.shift(1) 277 xp1 = Categorical([np.nan, "a", "b", "c", "d"]) 278 tm.assert_categorical_equal(sp1, xp1) 279 tm.assert_categorical_equal(cat[:-1], sp1[1:]) 280 281 # shift back 282 sn2 = cat.shift(-2) 283 xp2 = Categorical( 284 ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"] 285 ) 286 tm.assert_categorical_equal(sn2, xp2) 287 tm.assert_categorical_equal(cat[2:], sn2[:-2]) 288 289 # shift by zero 290 tm.assert_categorical_equal(cat, cat.shift(0)) 291 292 def test_nbytes(self): 293 cat = Categorical([1, 2, 3]) 294 exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories 295 assert cat.nbytes == exp 296 297 def test_memory_usage(self): 298 cat = Categorical([1, 2, 3]) 299 300 # .categories is an index, so we include the hashtable 301 assert 0 < cat.nbytes <= cat.memory_usage() 302 assert 0 < cat.nbytes <= cat.memory_usage(deep=True) 303 304 cat = Categorical(["foo", "foo", "bar"]) 305 assert cat.memory_usage(deep=True) > cat.nbytes 306 307 if not PYPY: 308 # sys.getsizeof will call the .memory_usage with 309 # deep=True, and add on some GC overhead 310 diff = cat.memory_usage(deep=True) - sys.getsizeof(cat) 311 assert abs(diff) < 100 312 313 def test_map(self): 314 c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True) 315 result = c.map(lambda x: x.lower()) 316 exp = Categorical(list("ababc"), categories=list("cba"), ordered=True) 317 tm.assert_categorical_equal(result, exp) 318 319 c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False) 320 result = c.map(lambda x: x.lower()) 321 exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) 322 tm.assert_categorical_equal(result, exp) 323 324 result = c.map(lambda x: 1) 325 # GH 12766: Return an index not an array 326 tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) 327 328 @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) 329 def test_validate_inplace_raises(self, value): 330 cat = Categorical(["A", "B", "B", "C", "A"]) 331 msg = ( 332 'For argument "inplace" expected type bool, ' 333 f"received type {type(value).__name__}" 334 ) 335 with pytest.raises(ValueError, match=msg): 336 cat.set_ordered(value=True, inplace=value) 337 338 with pytest.raises(ValueError, match=msg): 339 cat.as_ordered(inplace=value) 340 341 with pytest.raises(ValueError, match=msg): 342 cat.as_unordered(inplace=value) 343 344 with pytest.raises(ValueError, match=msg): 345 cat.set_categories(["X", "Y", "Z"], rename=True, inplace=value) 346 347 with pytest.raises(ValueError, match=msg): 348 cat.rename_categories(["X", "Y", "Z"], inplace=value) 349 350 with pytest.raises(ValueError, match=msg): 351 cat.reorder_categories(["X", "Y", "Z"], ordered=True, inplace=value) 352 353 with pytest.raises(ValueError, match=msg): 354 cat.add_categories(new_categories=["D", "E", "F"], inplace=value) 355 356 with pytest.raises(ValueError, match=msg): 357 cat.remove_categories(removals=["D", "E", "F"], inplace=value) 358 359 with pytest.raises(ValueError, match=msg): 360 with tm.assert_produces_warning(FutureWarning): 361 # issue #37643 inplace kwarg deprecated 362 cat.remove_unused_categories(inplace=value) 363 364 with pytest.raises(ValueError, match=msg): 365 cat.sort_values(inplace=value) 366