"""Tests for ``Series.astype``."""
from datetime import datetime, timedelta

# NOTE(review): ``reload`` is no longer referenced below (it was only used by an
# unreachable Python 2 branch); the import is kept so no file-level name is lost.
from importlib import reload  # noqa: F401
import string
import sys

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT

from pandas import (
    NA,
    Categorical,
    CategoricalDtype,
    Index,
    Interval,
    NaT,
    Series,
    Timedelta,
    Timestamp,
    cut,
    date_range,
)
import pandas._testing as tm


class TestAstypeAPI:
    """Argument handling of ``Series.astype``: ``errors`` and dict-like dtypes."""

    def test_arg_for_errors_in_astype(self):
        """A non-string ``errors`` value raises; ``errors="raise"`` is accepted."""
        # see GH#14878
        ser = Series([1, 2, 3])

        msg = (
            r"Expected value of kwarg 'errors' to be one of \['raise', "
            r"'ignore'\]\. Supplied value is 'False'"
        )
        with pytest.raises(ValueError, match=msg):
            ser.astype(np.float64, errors=False)

        ser.astype(np.int8, errors="raise")

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """A dtype mapping is accepted only when its single key is the Series name."""
        # see GH#7271
        ser = Series(range(0, 10, 2), name="abc")

        dt1 = dtype_class({"abc": str})
        result = ser.astype(dt1)
        expected = Series(["0", "2", "4", "6", "8"], name="abc")
        tm.assert_series_equal(result, expected)

        dt2 = dtype_class({"abc": "float64"})
        result = ser.astype(dt2)
        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype="float64", name="abc")
        tm.assert_series_equal(result, expected)

        # An extra key, a key that is not the name, and an empty mapping all
        # raise the same KeyError.
        dt3 = dtype_class({"abc": str, "def": str})
        msg = (
            "Only the Series name can be used for the key in Series dtype "
            r"mappings\."
        )
        with pytest.raises(KeyError, match=msg):
            ser.astype(dt3)

        dt4 = dtype_class({0: str})
        with pytest.raises(KeyError, match=msg):
            ser.astype(dt4)

        # GH#16717
        # if dtypes provided is empty, it should error
        if dtype_class is Series:
            dt5 = dtype_class({}, dtype=object)
        else:
            dt5 = dtype_class({})

        with pytest.raises(KeyError, match=msg):
            ser.astype(dt5)


class TestAstype:
    """Casting a Series to numpy, datetime-like, string and bytes dtypes."""

    @pytest.mark.parametrize("dtype", np.typecodes["All"])
    def test_astype_empty_constructor_equality(self, dtype):
        """An empty Series built with ``dtype`` equals an empty Series cast to it."""
        # see GH#15524

        if dtype not in (
            "S",
            "V",  # poor support (if any) currently
            "M",
            "m",  # Generic timestamps raise a ValueError. Already tested.
        ):
            init_empty = Series([], dtype=dtype)
            with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
                as_type_empty = Series([]).astype(dtype)
            tm.assert_series_equal(init_empty, as_type_empty)

    @pytest.mark.parametrize("dtype", [str, np.str_])
    @pytest.mark.parametrize(
        "series",
        [
            Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]),
        ],
    )
    def test_astype_str_map(self, dtype, series):
        """Casting to a string dtype gives the same result as ``map(str)``."""
        # see GH#4405
        result = series.astype(dtype)
        expected = series.map(str)
        tm.assert_series_equal(result, expected)

    def test_astype_float_to_period(self):
        """A NaN float becomes ``NaT`` when cast to ``period[D]``."""
        result = Series([np.nan]).astype("period[D]")
        expected = Series([NaT], dtype="period[D]")
        tm.assert_series_equal(result, expected)

    def test_astype_no_pandas_dtype(self):
        """Casting to the dtype of ``ser.array`` returns an equal Series."""
        # https://github.com/pandas-dev/pandas/pull/24866
        ser = Series([1, 2], dtype="int64")
        # Don't have PandasDtype in the public API, so we use `.array.dtype`,
        # which is a PandasDtype.
        result = ser.astype(ser.array.dtype)
        tm.assert_series_equal(result, ser)

    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
    def test_astype_generic_timestamp_no_frequency(self, dtype, request):
        """Unit-less ``datetime64``/``timedelta64`` targets raise ValueError."""
        # see GH#15524, GH#15987
        data = [1]
        s = Series(data)

        if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
            mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
            request.node.add_marker(mark)

        msg = (
            fr"The '{dtype.__name__}' dtype has no unit\. "
            fr"Please pass in '{dtype.__name__}\[ns\]' instead."
        )
        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    def test_astype_dt64_to_str(self):
        """Naive midnight datetimes are rendered as date-only strings."""
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti = date_range("2012-01-01", periods=3)
        result = Series(dti).astype(str)
        expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object)
        tm.assert_series_equal(result, expected)

    def test_astype_dt64tz_to_str(self):
        """Tz-aware datetimes are rendered with time and UTC offset."""
        # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex
        dti_tz = date_range("2012-01-01", periods=3, tz="US/Eastern")
        result = Series(dti_tz).astype(str)
        expected = Series(
            [
                "2012-01-01 00:00:00-05:00",
                "2012-01-02 00:00:00-05:00",
                "2012-01-03 00:00:00-05:00",
            ],
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

    def test_astype_datetime(self):
        """datetime64 Series (all-NaT, single value, with a NaN) cast to object."""
        s = Series(iNaT, dtype="M8[ns]", index=range(5))

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0)])

        s = s.astype("O")
        assert s.dtype == np.object_

        s = Series([datetime(2001, 1, 2, 0, 0) for _ in range(3)])

        # Assigning NaN must not change the datetime64 dtype.
        s[1] = np.nan
        assert s.dtype == "M8[ns]"

        s = s.astype("O")
        assert s.dtype == np.object_

    def test_astype_datetime64tz(self):
        """Round trips between tz-aware datetime64, object and other timezones."""
        s = Series(date_range("20130101", periods=3, tz="US/Eastern"))

        # astype
        result = s.astype(object)
        expected = Series(s.astype(object), dtype=object)
        tm.assert_series_equal(result, expected)

        result = Series(s.values).dt.tz_localize("UTC").dt.tz_convert(s.dt.tz)
        tm.assert_series_equal(result, s)

        # astype - object, preserves on construction
        result = Series(s.astype(object))
        expected = s.astype(object)
        tm.assert_series_equal(result, expected)

        # astype - datetime64[ns, tz]
        result = Series(s.values).astype("datetime64[ns, US/Eastern]")
        tm.assert_series_equal(result, s)

        result = Series(s.values).astype(s.dtype)
        tm.assert_series_equal(result, s)

        result = s.astype("datetime64[ns, CET]")
        expected = Series(date_range("20130101 06:00:00", periods=3, tz="CET"))
        tm.assert_series_equal(result, expected)

    def test_astype_str_cast_dt64(self):
        """``Timestamp`` values cast to ``str`` for naive and tz-aware input."""
        # see GH#9757
        ts = Series([Timestamp("2010-01-04 00:00:00")])
        s = ts.astype(str)

        expected = Series(["2010-01-04"])
        tm.assert_series_equal(s, expected)

        ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
        s = ts.astype(str)

        expected = Series(["2010-01-04 00:00:00-05:00"])
        tm.assert_series_equal(s, expected)

    def test_astype_str_cast_td64(self):
        """A one-day ``Timedelta`` casts to the string ``"1 days"``."""
        # see GH#9757

        td = Series([Timedelta(1, unit="d")])
        ser = td.astype(str)

        expected = Series(["1 days"])
        tm.assert_series_equal(ser, expected)

    def test_dt64_series_astype_object(self):
        """Elements of a datetime64 Series cast to object are ``datetime``."""
        dt64ser = Series(date_range("20130101", periods=3))
        result = dt64ser.astype(object)
        assert isinstance(result.iloc[0], datetime)
        assert result.dtype == np.object_

    def test_td64_series_astype_object(self):
        """Elements of a timedelta64 Series cast to object are ``timedelta``."""
        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="timedelta64[ns]")
        result = tdser.astype(object)
        assert isinstance(result.iloc[0], timedelta)
        assert result.dtype == np.object_

    @pytest.mark.parametrize(
        "values",
        [
            Series(["x", "y", "z"], dtype="string"),
            Series(["x", "y", "z"], dtype="category"),
            Series(3 * [Timestamp("2020-01-01", tz="UTC")]),
            Series(3 * [Interval(0, 1)]),
        ],
    )
    @pytest.mark.parametrize("errors", ["raise", "ignore"])
    def test_astype_ignores_errors_for_extension_dtypes(self, values, errors):
        """``errors="ignore"`` returns the input; ``"raise"`` propagates the error."""
        # https://github.com/pandas-dev/pandas/issues/35471
        if errors == "ignore":
            expected = values
            result = values.astype(float, errors=errors)
            tm.assert_series_equal(result, expected)
        else:
            msg = "(Cannot cast)|(could not convert)"
            with pytest.raises((ValueError, TypeError), match=msg):
                values.astype(float, errors=errors)

    @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
    def test_astype_from_float_to_str(self, dtype):
        """``0.1`` casts to ``"0.1"`` for every tested float width."""
        # https://github.com/pandas-dev/pandas/issues/36451
        s = Series([0.1], dtype=dtype)
        result = s.astype(str)
        expected = Series(["0.1"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "value, string_value",
        [
            (None, "None"),
            (np.nan, "nan"),
            (NA, "<NA>"),
        ],
    )
    def test_astype_to_str_preserves_na(self, value, string_value):
        """Each missing-value marker is stringified to its own representation."""
        # https://github.com/pandas-dev/pandas/issues/36904
        s = Series(["a", "b", value], dtype=object)
        result = s.astype(str)
        expected = Series(["a", "b", string_value], dtype=object)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"])
    def test_astype(self, dtype):
        """The result has the requested dtype and keeps the Series name."""
        s = Series(np.random.randn(5), name="foo")
        as_typed = s.astype(dtype)

        assert as_typed.dtype == dtype
        assert as_typed.name == s.name

    @pytest.mark.parametrize("value", [np.nan, np.inf])
    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    def test_astype_cast_nan_inf_int(self, dtype, value):
        """NaN and inf cannot be cast to an integer dtype."""
        # gh-14265: check NaN and inf raise error when converting to int
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        s = Series([value])

        with pytest.raises(ValueError, match=msg):
            s.astype(dtype)

    @pytest.mark.parametrize("dtype", [int, np.int8, np.int64])
    def test_astype_cast_object_int_fail(self, dtype):
        """Non-numeric strings raise ValueError when cast to an integer dtype."""
        arr = Series(["car", "house", "tree", "1"])
        msg = r"invalid literal for int\(\) with base 10: 'car'"
        with pytest.raises(ValueError, match=msg):
            arr.astype(dtype)

    def test_astype_cast_object_int(self):
        """Numeric strings held as object cast cleanly to ``int``."""
        arr = Series(["1", "2", "3", "4"], dtype=object)
        result = arr.astype(int)

        tm.assert_series_equal(result, Series(np.arange(1, 5)))

    def test_astype_unicode(self):
        """``astype("unicode")`` matches ``map(str)``, including non-ASCII text."""
        # see GH#7758
        digits = string.digits
        test_series = [
            Series([digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]),
            Series(["データーサイエンス、お前はもう死んでいる"]),
        ]

        # The encoded-bytes case is only exercised when the interpreter's
        # default encoding is utf-8.
        if sys.getdefaultencoding() == "utf-8":
            test_series.append(Series(["野菜食べないとやばい".encode()]))

        for s in test_series:
            res = s.astype("unicode")
            expec = s.map(str)
            tm.assert_series_equal(res, expec)

    def test_astype_bytes(self):
        """Three-character strings cast to ``bytes`` give dtype ``S3``."""
        # GH#39474
        result = Series(["foo", "bar", "baz"]).astype(bytes)
        assert result.dtypes == np.dtype("S3")


class TestAstypeCategorical:
    """Casting a Series to and between categorical dtypes."""

    def test_astype_categorical_invalid_conversions(self):
        """Passing the ``Categorical`` class (not a dtype) raises TypeError."""
        # invalid conversion (these are NOT a dtype)
        cat = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
        ser = Series(np.random.randint(0, 10000, 100)).sort_values()
        ser = cut(ser, range(0, 10500, 500), right=False, labels=cat)

        msg = (
            "dtype '<class 'pandas.core.arrays.categorical.Categorical'>' "
            "not understood"
        )
        with pytest.raises(TypeError, match=msg):
            ser.astype(Categorical)
        with pytest.raises(TypeError, match=msg):
            ser.astype("object").astype(Categorical)

    def test_astype_categoricaldtype(self):
        """``CategoricalDtype`` targets honour ``ordered`` and extra categories."""
        s = Series(["a", "b", "a"])
        result = s.astype(CategoricalDtype(["a", "b"], ordered=True))
        expected = Series(Categorical(["a", "b", "a"], ordered=True))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b"], ordered=False))
        expected = Series(Categorical(["a", "b", "a"], ordered=False))
        tm.assert_series_equal(result, expected)

        result = s.astype(CategoricalDtype(["a", "b", "c"], ordered=False))
        expected = Series(
            Categorical(["a", "b", "a"], categories=["a", "b", "c"], ordered=False)
        )
        tm.assert_series_equal(result, expected)
        tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))

    @pytest.mark.parametrize("name", [None, "foo"])
    @pytest.mark.parametrize("dtype_ordered", [True, False])
    @pytest.mark.parametrize("series_ordered", [True, False])
    def test_astype_categorical_to_categorical(
        self, name, dtype_ordered, series_ordered
    ):
        """Categorical-to-categorical casts for each ``ordered`` combination."""
        # GH#10696, GH#18593
        s_data = list("abcaacbab")
        s_dtype = CategoricalDtype(list("bac"), ordered=series_ordered)
        s = Series(s_data, dtype=s_dtype, name=name)

        # unspecified categories
        dtype = CategoricalDtype(ordered=dtype_ordered)
        result = s.astype(dtype)
        exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
        expected = Series(s_data, name=name, dtype=exp_dtype)
        tm.assert_series_equal(result, expected)

        # different categories
        dtype = CategoricalDtype(list("adc"), dtype_ordered)
        result = s.astype(dtype)
        expected = Series(s_data, name=name, dtype=dtype)
        tm.assert_series_equal(result, expected)

        if dtype_ordered is False:
            # not specifying ordered, so only test once
            expected = s
            result = s.astype("category")
            tm.assert_series_equal(result, expected)

    def test_astype_bool_missing_to_categorical(self):
        """Booleans with a NaN (object dtype) cast to a boolean categorical."""
        # GH-19182
        s = Series([True, False, np.nan])
        assert s.dtypes == np.object_

        result = s.astype(CategoricalDtype(categories=[True, False]))
        expected = Series(Categorical([True, False, np.nan], categories=[True, False]))
        tm.assert_series_equal(result, expected)

    def test_astype_categories_raises(self):
        """``categories``/``ordered`` keywords to ``astype`` raise TypeError."""
        # deprecated GH#17636, removed in GH#27141
        s = Series(["a", "b", "a"])
        with pytest.raises(TypeError, match="got an unexpected"):
            s.astype("category", categories=["a", "b"], ordered=True)