1from datetime import timedelta 2from decimal import Decimal 3import re 4 5from dateutil.tz import tzlocal 6import numpy as np 7import pytest 8 9from pandas.compat import is_platform_windows 10import pandas.util._test_decorators as td 11 12import pandas as pd 13from pandas import ( 14 Categorical, 15 DataFrame, 16 Index, 17 MultiIndex, 18 Series, 19 Timestamp, 20 date_range, 21 isna, 22 notna, 23 to_datetime, 24 to_timedelta, 25) 26import pandas._testing as tm 27import pandas.core.algorithms as algorithms 28import pandas.core.nanops as nanops 29 30 31def assert_stat_op_calc( 32 opname, 33 alternative, 34 frame, 35 has_skipna=True, 36 check_dtype=True, 37 check_dates=False, 38 rtol=1e-5, 39 atol=1e-8, 40 skipna_alternative=None, 41): 42 """ 43 Check that operator opname works as advertised on frame 44 45 Parameters 46 ---------- 47 opname : string 48 Name of the operator to test on frame 49 alternative : function 50 Function that opname is tested against; i.e. "frame.opname()" should 51 equal "alternative(frame)". 52 frame : DataFrame 53 The object that the tests are executed on 54 has_skipna : bool, default True 55 Whether the method "opname" has the kwarg "skip_na" 56 check_dtype : bool, default True 57 Whether the dtypes of the result of "frame.opname()" and 58 "alternative(frame)" should be checked. 59 check_dates : bool, default false 60 Whether opname should be tested on a Datetime Series 61 rtol : float, default 1e-5 62 Relative tolerance. 63 atol : float, default 1e-8 64 Absolute tolerance. 
65 skipna_alternative : function, default None 66 NaN-safe version of alternative 67 """ 68 f = getattr(frame, opname) 69 70 if check_dates: 71 expected_warning = FutureWarning if opname in ["mean", "median"] else None 72 df = DataFrame({"b": date_range("1/1/2001", periods=2)}) 73 with tm.assert_produces_warning(expected_warning): 74 result = getattr(df, opname)() 75 assert isinstance(result, Series) 76 77 df["a"] = range(len(df)) 78 with tm.assert_produces_warning(expected_warning): 79 result = getattr(df, opname)() 80 assert isinstance(result, Series) 81 assert len(result) 82 83 if has_skipna: 84 85 def wrapper(x): 86 return alternative(x.values) 87 88 skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative) 89 result0 = f(axis=0, skipna=False) 90 result1 = f(axis=1, skipna=False) 91 tm.assert_series_equal( 92 result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol 93 ) 94 # HACK: win32 95 tm.assert_series_equal( 96 result1, 97 frame.apply(wrapper, axis=1), 98 check_dtype=False, 99 rtol=rtol, 100 atol=atol, 101 ) 102 else: 103 skipna_wrapper = alternative 104 105 result0 = f(axis=0) 106 result1 = f(axis=1) 107 tm.assert_series_equal( 108 result0, 109 frame.apply(skipna_wrapper), 110 check_dtype=check_dtype, 111 rtol=rtol, 112 atol=atol, 113 ) 114 115 if opname in ["sum", "prod"]: 116 expected = frame.apply(skipna_wrapper, axis=1) 117 tm.assert_series_equal( 118 result1, expected, check_dtype=False, rtol=rtol, atol=atol 119 ) 120 121 # check dtypes 122 if check_dtype: 123 lcd_dtype = frame.values.dtype 124 assert lcd_dtype == result0.dtype 125 assert lcd_dtype == result1.dtype 126 127 # bad axis 128 with pytest.raises(ValueError, match="No axis named 2"): 129 f(axis=2) 130 131 # all NA case 132 if has_skipna: 133 all_na = frame * np.NaN 134 r0 = getattr(all_na, opname)(axis=0) 135 r1 = getattr(all_na, opname)(axis=1) 136 if opname in ["sum", "prod"]: 137 unit = 1 if opname == "prod" else 0 # result for empty sum/prod 138 
expected = Series(unit, index=r0.index, dtype=r0.dtype) 139 tm.assert_series_equal(r0, expected) 140 expected = Series(unit, index=r1.index, dtype=r1.dtype) 141 tm.assert_series_equal(r1, expected) 142 143 144def assert_stat_op_api(opname, float_frame, float_string_frame, has_numeric_only=False): 145 """ 146 Check that API for operator opname works as advertised on frame 147 148 Parameters 149 ---------- 150 opname : string 151 Name of the operator to test on frame 152 float_frame : DataFrame 153 DataFrame with columns of type float 154 float_string_frame : DataFrame 155 DataFrame with both float and string columns 156 has_numeric_only : bool, default False 157 Whether the method "opname" has the kwarg "numeric_only" 158 """ 159 # make sure works on mixed-type frame 160 getattr(float_string_frame, opname)(axis=0) 161 getattr(float_string_frame, opname)(axis=1) 162 163 if has_numeric_only: 164 getattr(float_string_frame, opname)(axis=0, numeric_only=True) 165 getattr(float_string_frame, opname)(axis=1, numeric_only=True) 166 getattr(float_frame, opname)(axis=0, numeric_only=False) 167 getattr(float_frame, opname)(axis=1, numeric_only=False) 168 169 170def assert_bool_op_calc(opname, alternative, frame, has_skipna=True): 171 """ 172 Check that bool operator opname works as advertised on frame 173 174 Parameters 175 ---------- 176 opname : string 177 Name of the operator to test on frame 178 alternative : function 179 Function that opname is tested against; i.e. "frame.opname()" should 180 equal "alternative(frame)". 
181 frame : DataFrame 182 The object that the tests are executed on 183 has_skipna : bool, default True 184 Whether the method "opname" has the kwarg "skip_na" 185 """ 186 f = getattr(frame, opname) 187 188 if has_skipna: 189 190 def skipna_wrapper(x): 191 nona = x.dropna().values 192 return alternative(nona) 193 194 def wrapper(x): 195 return alternative(x.values) 196 197 result0 = f(axis=0, skipna=False) 198 result1 = f(axis=1, skipna=False) 199 200 tm.assert_series_equal(result0, frame.apply(wrapper)) 201 tm.assert_series_equal( 202 result1, frame.apply(wrapper, axis=1), check_dtype=False 203 ) # HACK: win32 204 else: 205 skipna_wrapper = alternative 206 wrapper = alternative 207 208 result0 = f(axis=0) 209 result1 = f(axis=1) 210 211 tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) 212 tm.assert_series_equal( 213 result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False 214 ) 215 216 # bad axis 217 with pytest.raises(ValueError, match="No axis named 2"): 218 f(axis=2) 219 220 # all NA case 221 if has_skipna: 222 all_na = frame * np.NaN 223 r0 = getattr(all_na, opname)(axis=0) 224 r1 = getattr(all_na, opname)(axis=1) 225 if opname == "any": 226 assert not r0.any() 227 assert not r1.any() 228 else: 229 assert r0.all() 230 assert r1.all() 231 232 233def assert_bool_op_api( 234 opname, bool_frame_with_na, float_string_frame, has_bool_only=False 235): 236 """ 237 Check that API for boolean operator opname works as advertised on frame 238 239 Parameters 240 ---------- 241 opname : string 242 Name of the operator to test on frame 243 float_frame : DataFrame 244 DataFrame with columns of type float 245 float_string_frame : DataFrame 246 DataFrame with both float and string columns 247 has_bool_only : bool, default False 248 Whether the method "opname" has the kwarg "bool_only" 249 """ 250 # make sure op works on mixed-type frame 251 mixed = float_string_frame 252 mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5 253 getattr(mixed, opname)(axis=0) 
254 getattr(mixed, opname)(axis=1) 255 256 if has_bool_only: 257 getattr(mixed, opname)(axis=0, bool_only=True) 258 getattr(mixed, opname)(axis=1, bool_only=True) 259 getattr(bool_frame_with_na, opname)(axis=0, bool_only=False) 260 getattr(bool_frame_with_na, opname)(axis=1, bool_only=False) 261 262 263class TestDataFrameAnalytics: 264 265 # --------------------------------------------------------------------- 266 # Reductions 267 268 def test_stat_op_api(self, float_frame, float_string_frame): 269 assert_stat_op_api( 270 "count", float_frame, float_string_frame, has_numeric_only=True 271 ) 272 assert_stat_op_api( 273 "sum", float_frame, float_string_frame, has_numeric_only=True 274 ) 275 276 assert_stat_op_api("nunique", float_frame, float_string_frame) 277 assert_stat_op_api("mean", float_frame, float_string_frame) 278 assert_stat_op_api("product", float_frame, float_string_frame) 279 assert_stat_op_api("median", float_frame, float_string_frame) 280 assert_stat_op_api("min", float_frame, float_string_frame) 281 assert_stat_op_api("max", float_frame, float_string_frame) 282 assert_stat_op_api("mad", float_frame, float_string_frame) 283 assert_stat_op_api("var", float_frame, float_string_frame) 284 assert_stat_op_api("std", float_frame, float_string_frame) 285 assert_stat_op_api("sem", float_frame, float_string_frame) 286 assert_stat_op_api("median", float_frame, float_string_frame) 287 288 try: 289 from scipy.stats import kurtosis, skew # noqa:F401 290 291 assert_stat_op_api("skew", float_frame, float_string_frame) 292 assert_stat_op_api("kurt", float_frame, float_string_frame) 293 except ImportError: 294 pass 295 296 def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame): 297 def count(s): 298 return notna(s).sum() 299 300 def nunique(s): 301 return len(algorithms.unique1d(s.dropna())) 302 303 def mad(x): 304 return np.abs(x - x.mean()).mean() 305 306 def var(x): 307 return np.var(x, ddof=1) 308 309 def std(x): 310 return np.std(x, ddof=1) 311 312 
def sem(x): 313 return np.std(x, ddof=1) / np.sqrt(len(x)) 314 315 def skewness(x): 316 from scipy.stats import skew # noqa:F811 317 318 if len(x) < 3: 319 return np.nan 320 return skew(x, bias=False) 321 322 def kurt(x): 323 from scipy.stats import kurtosis # noqa:F811 324 325 if len(x) < 4: 326 return np.nan 327 return kurtosis(x, bias=False) 328 329 assert_stat_op_calc( 330 "nunique", 331 nunique, 332 float_frame_with_na, 333 has_skipna=False, 334 check_dtype=False, 335 check_dates=True, 336 ) 337 338 # GH#32571 check_less_precise is needed on apparently-random 339 # py37-npdev builds and OSX-PY36-min_version builds 340 # mixed types (with upcasting happening) 341 assert_stat_op_calc( 342 "sum", 343 np.sum, 344 mixed_float_frame.astype("float32"), 345 check_dtype=False, 346 rtol=1e-3, 347 ) 348 349 assert_stat_op_calc( 350 "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum 351 ) 352 assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True) 353 assert_stat_op_calc( 354 "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod 355 ) 356 357 assert_stat_op_calc("mad", mad, float_frame_with_na) 358 assert_stat_op_calc("var", var, float_frame_with_na) 359 assert_stat_op_calc("std", std, float_frame_with_na) 360 assert_stat_op_calc("sem", sem, float_frame_with_na) 361 362 assert_stat_op_calc( 363 "count", 364 count, 365 float_frame_with_na, 366 has_skipna=False, 367 check_dtype=False, 368 check_dates=True, 369 ) 370 371 try: 372 from scipy import kurtosis, skew # noqa:F401 373 374 assert_stat_op_calc("skew", skewness, float_frame_with_na) 375 assert_stat_op_calc("kurt", kurt, float_frame_with_na) 376 except ImportError: 377 pass 378 379 # TODO: Ensure warning isn't emitted in the first place 380 @pytest.mark.filterwarnings("ignore:All-NaN:RuntimeWarning") 381 def test_median(self, float_frame_with_na, int_frame): 382 def wrapper(x): 383 if isna(x).any(): 384 return np.nan 385 return np.median(x) 386 387 
assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True) 388 assert_stat_op_calc( 389 "median", wrapper, int_frame, check_dtype=False, check_dates=True 390 ) 391 392 @pytest.mark.parametrize( 393 "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"] 394 ) 395 def test_stat_operators_attempt_obj_array(self, method): 396 # GH#676 397 data = { 398 "a": [ 399 -0.00049987540199591344, 400 -0.0016467257772919831, 401 0.00067695870775883013, 402 ], 403 "b": [-0, -0, 0.0], 404 "c": [ 405 0.00031111847529610595, 406 0.0014902627951905339, 407 -0.00094099200035979691, 408 ], 409 } 410 df1 = DataFrame(data, index=["foo", "bar", "baz"], dtype="O") 411 412 df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object) 413 414 for df in [df1, df2]: 415 assert df.values.dtype == np.object_ 416 result = getattr(df, method)(1) 417 expected = getattr(df.astype("f8"), method)(1) 418 419 if method in ["sum", "prod"]: 420 tm.assert_series_equal(result, expected) 421 422 @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"]) 423 def test_mixed_ops(self, op): 424 # GH#16116 425 df = DataFrame( 426 { 427 "int": [1, 2, 3, 4], 428 "float": [1.0, 2.0, 3.0, 4.0], 429 "str": ["a", "b", "c", "d"], 430 } 431 ) 432 433 result = getattr(df, op)() 434 assert len(result) == 2 435 436 with pd.option_context("use_bottleneck", False): 437 result = getattr(df, op)() 438 assert len(result) == 2 439 440 def test_reduce_mixed_frame(self): 441 # GH 6806 442 df = DataFrame( 443 { 444 "bool_data": [True, True, False, False, False], 445 "int_data": [10, 20, 30, 40, 50], 446 "string_data": ["a", "b", "c", "d", "e"], 447 } 448 ) 449 df.reindex(columns=["bool_data", "int_data", "string_data"]) 450 test = df.sum(axis=0) 451 tm.assert_numpy_array_equal( 452 test.values, np.array([2, 150, "abcde"], dtype=object) 453 ) 454 tm.assert_series_equal(test, df.T.sum(axis=1)) 455 456 def test_nunique(self): 457 df = DataFrame({"A": [1, 1, 1], "B": 
[1, 2, 3], "C": [1, np.nan, 3]}) 458 tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2})) 459 tm.assert_series_equal( 460 df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3}) 461 ) 462 tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2})) 463 tm.assert_series_equal( 464 df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2}) 465 ) 466 467 @pytest.mark.parametrize("tz", [None, "UTC"]) 468 def test_mean_mixed_datetime_numeric(self, tz): 469 # https://github.com/pandas-dev/pandas/issues/24752 470 df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2}) 471 with tm.assert_produces_warning(FutureWarning): 472 result = df.mean() 473 expected = Series([1.0], index=["A"]) 474 tm.assert_series_equal(result, expected) 475 476 @pytest.mark.parametrize("tz", [None, "UTC"]) 477 def test_mean_excludes_datetimes(self, tz): 478 # https://github.com/pandas-dev/pandas/issues/24752 479 # Our long-term desired behavior is unclear, but the behavior in 480 # 0.24.0rc1 was buggy. 481 df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2}) 482 with tm.assert_produces_warning(FutureWarning): 483 result = df.mean() 484 485 expected = Series(dtype=np.float64) 486 tm.assert_series_equal(result, expected) 487 488 def test_mean_mixed_string_decimal(self): 489 # GH 11670 490 # possible bug when calculating mean of DataFrame? 
491 492 d = [ 493 {"A": 2, "B": None, "C": Decimal("628.00")}, 494 {"A": 1, "B": None, "C": Decimal("383.00")}, 495 {"A": 3, "B": None, "C": Decimal("651.00")}, 496 {"A": 2, "B": None, "C": Decimal("575.00")}, 497 {"A": 4, "B": None, "C": Decimal("1114.00")}, 498 {"A": 1, "B": "TEST", "C": Decimal("241.00")}, 499 {"A": 2, "B": None, "C": Decimal("572.00")}, 500 {"A": 4, "B": None, "C": Decimal("609.00")}, 501 {"A": 3, "B": None, "C": Decimal("820.00")}, 502 {"A": 5, "B": None, "C": Decimal("1223.00")}, 503 ] 504 505 df = DataFrame(d) 506 507 result = df.mean() 508 expected = Series([2.7, 681.6], index=["A", "C"]) 509 tm.assert_series_equal(result, expected) 510 511 def test_var_std(self, datetime_frame): 512 result = datetime_frame.std(ddof=4) 513 expected = datetime_frame.apply(lambda x: x.std(ddof=4)) 514 tm.assert_almost_equal(result, expected) 515 516 result = datetime_frame.var(ddof=4) 517 expected = datetime_frame.apply(lambda x: x.var(ddof=4)) 518 tm.assert_almost_equal(result, expected) 519 520 arr = np.repeat(np.random.random((1, 1000)), 1000, 0) 521 result = nanops.nanvar(arr, axis=0) 522 assert not (result < 0).any() 523 524 with pd.option_context("use_bottleneck", False): 525 result = nanops.nanvar(arr, axis=0) 526 assert not (result < 0).any() 527 528 @pytest.mark.parametrize("meth", ["sem", "var", "std"]) 529 def test_numeric_only_flag(self, meth): 530 # GH 9201 531 df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) 532 # set one entry to a number in str format 533 df1.loc[0, "foo"] = "100" 534 535 df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"]) 536 # set one entry to a non-number str 537 df2.loc[0, "foo"] = "a" 538 539 result = getattr(df1, meth)(axis=1, numeric_only=True) 540 expected = getattr(df1[["bar", "baz"]], meth)(axis=1) 541 tm.assert_series_equal(expected, result) 542 543 result = getattr(df2, meth)(axis=1, numeric_only=True) 544 expected = getattr(df2[["bar", "baz"]], meth)(axis=1) 545 
tm.assert_series_equal(expected, result) 546 547 # df1 has all numbers, df2 has a letter inside 548 msg = r"unsupported operand type\(s\) for -: 'float' and 'str'" 549 with pytest.raises(TypeError, match=msg): 550 getattr(df1, meth)(axis=1, numeric_only=False) 551 msg = "could not convert string to float: 'a'" 552 with pytest.raises(TypeError, match=msg): 553 getattr(df2, meth)(axis=1, numeric_only=False) 554 555 def test_sem(self, datetime_frame): 556 result = datetime_frame.sem(ddof=4) 557 expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x))) 558 tm.assert_almost_equal(result, expected) 559 560 arr = np.repeat(np.random.random((1, 1000)), 1000, 0) 561 result = nanops.nansem(arr, axis=0) 562 assert not (result < 0).any() 563 564 with pd.option_context("use_bottleneck", False): 565 result = nanops.nansem(arr, axis=0) 566 assert not (result < 0).any() 567 568 @td.skip_if_no_scipy 569 def test_kurt(self): 570 index = MultiIndex( 571 levels=[["bar"], ["one", "two", "three"], [0, 1]], 572 codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]], 573 ) 574 df = DataFrame(np.random.randn(6, 3), index=index) 575 576 kurt = df.kurt() 577 kurt2 = df.kurt(level=0).xs("bar") 578 tm.assert_series_equal(kurt, kurt2, check_names=False) 579 assert kurt.name is None 580 assert kurt2.name == "bar" 581 582 @pytest.mark.parametrize( 583 "dropna, expected", 584 [ 585 ( 586 True, 587 { 588 "A": [12], 589 "B": [10.0], 590 "C": [1.0], 591 "D": ["a"], 592 "E": Categorical(["a"], categories=["a"]), 593 "F": to_datetime(["2000-1-2"]), 594 "G": to_timedelta(["1 days"]), 595 }, 596 ), 597 ( 598 False, 599 { 600 "A": [12], 601 "B": [10.0], 602 "C": [np.nan], 603 "D": np.array([np.nan], dtype=object), 604 "E": Categorical([np.nan], categories=["a"]), 605 "F": [pd.NaT], 606 "G": to_timedelta([pd.NaT]), 607 }, 608 ), 609 ( 610 True, 611 { 612 "H": [8, 9, np.nan, np.nan], 613 "I": [8, 9, np.nan, np.nan], 614 "J": [1, np.nan, np.nan, np.nan], 615 "K": 
Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]), 616 "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]), 617 "M": to_timedelta(["1 days", "nan", "nan", "nan"]), 618 "N": [0, 1, 2, 3], 619 }, 620 ), 621 ( 622 False, 623 { 624 "H": [8, 9, np.nan, np.nan], 625 "I": [8, 9, np.nan, np.nan], 626 "J": [1, np.nan, np.nan, np.nan], 627 "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]), 628 "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), 629 "M": to_timedelta(["nan", "1 days", "nan", "nan"]), 630 "N": [0, 1, 2, 3], 631 }, 632 ), 633 ], 634 ) 635 def test_mode_dropna(self, dropna, expected): 636 637 df = DataFrame( 638 { 639 "A": [12, 12, 19, 11], 640 "B": [10, 10, np.nan, 3], 641 "C": [1, np.nan, np.nan, np.nan], 642 "D": [np.nan, np.nan, "a", np.nan], 643 "E": Categorical([np.nan, np.nan, "a", np.nan]), 644 "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]), 645 "G": to_timedelta(["1 days", "nan", "nan", "nan"]), 646 "H": [8, 8, 9, 9], 647 "I": [9, 9, 8, 8], 648 "J": [1, 1, np.nan, np.nan], 649 "K": Categorical(["a", np.nan, "a", np.nan]), 650 "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]), 651 "M": to_timedelta(["1 days", "nan", "1 days", "nan"]), 652 "N": np.arange(4, dtype="int64"), 653 } 654 ) 655 656 result = df[sorted(expected.keys())].mode(dropna=dropna) 657 expected = DataFrame(expected) 658 tm.assert_frame_equal(result, expected) 659 660 def test_mode_sortwarning(self): 661 # Check for the warning that is raised when the mode 662 # results cannot be sorted 663 664 df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) 665 expected = DataFrame({"A": ["a", np.nan]}) 666 667 with tm.assert_produces_warning(UserWarning, check_stacklevel=False): 668 result = df.mode(dropna=False) 669 result = result.sort_values(by="A").reset_index(drop=True) 670 671 tm.assert_frame_equal(result, expected) 672 673 def test_operators_timedelta64(self): 674 df = DataFrame( 675 { 676 "A": date_range("2012-1-1", periods=3, freq="D"), 677 "B": 
date_range("2012-1-2", periods=3, freq="D"), 678 "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5), 679 } 680 ) 681 682 diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]}) 683 684 # min 685 result = diffs.min() 686 assert result[0] == diffs.loc[0, "A"] 687 assert result[1] == diffs.loc[0, "B"] 688 689 result = diffs.min(axis=1) 690 assert (result == diffs.loc[0, "B"]).all() 691 692 # max 693 result = diffs.max() 694 assert result[0] == diffs.loc[2, "A"] 695 assert result[1] == diffs.loc[2, "B"] 696 697 result = diffs.max(axis=1) 698 assert (result == diffs["A"]).all() 699 700 # abs 701 result = diffs.abs() 702 result2 = abs(diffs) 703 expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]}) 704 tm.assert_frame_equal(result, expected) 705 tm.assert_frame_equal(result2, expected) 706 707 # mixed frame 708 mixed = diffs.copy() 709 mixed["C"] = "foo" 710 mixed["D"] = 1 711 mixed["E"] = 1.0 712 mixed["F"] = Timestamp("20130101") 713 714 # results in an object array 715 result = mixed.min() 716 expected = Series( 717 [ 718 pd.Timedelta(timedelta(seconds=5 * 60 + 5)), 719 pd.Timedelta(timedelta(days=-1)), 720 "foo", 721 1, 722 1.0, 723 Timestamp("20130101"), 724 ], 725 index=mixed.columns, 726 ) 727 tm.assert_series_equal(result, expected) 728 729 # excludes numeric 730 result = mixed.min(axis=1) 731 expected = Series([1, 1, 1.0], index=[0, 1, 2]) 732 tm.assert_series_equal(result, expected) 733 734 # works when only those columns are selected 735 result = mixed[["A", "B"]].min(1) 736 expected = Series([timedelta(days=-1)] * 3) 737 tm.assert_series_equal(result, expected) 738 739 result = mixed[["A", "B"]].min() 740 expected = Series( 741 [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"] 742 ) 743 tm.assert_series_equal(result, expected) 744 745 # GH 3106 746 df = DataFrame( 747 { 748 "time": date_range("20130102", periods=5), 749 "time2": date_range("20130105", periods=5), 750 } 751 ) 752 df["off1"] = 
df["time2"] - df["time"] 753 assert df["off1"].dtype == "timedelta64[ns]" 754 755 df["off2"] = df["time"] - df["time2"] 756 df._consolidate_inplace() 757 assert df["off1"].dtype == "timedelta64[ns]" 758 assert df["off2"].dtype == "timedelta64[ns]" 759 760 def test_std_timedelta64_skipna_false(self): 761 # GH#37392 762 tdi = pd.timedelta_range("1 Day", periods=10) 763 df = DataFrame({"A": tdi, "B": tdi}) 764 df.iloc[-2, -1] = pd.NaT 765 766 result = df.std(skipna=False) 767 expected = Series( 768 [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]" 769 ) 770 tm.assert_series_equal(result, expected) 771 772 result = df.std(axis=1, skipna=False) 773 expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)]) 774 tm.assert_series_equal(result, expected) 775 776 def test_sum_corner(self): 777 empty_frame = DataFrame() 778 779 axis0 = empty_frame.sum(0) 780 axis1 = empty_frame.sum(1) 781 assert isinstance(axis0, Series) 782 assert isinstance(axis1, Series) 783 assert len(axis0) == 0 784 assert len(axis1) == 0 785 786 @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)]) 787 @pytest.mark.parametrize("numeric_only", [None, True, False]) 788 def test_sum_prod_nanops(self, method, unit, numeric_only): 789 idx = ["a", "b", "c"] 790 df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]}) 791 # The default 792 result = getattr(df, method)(numeric_only=numeric_only) 793 expected = Series([unit, unit, unit], index=idx, dtype="float64") 794 795 # min_count=1 796 result = getattr(df, method)(numeric_only=numeric_only, min_count=1) 797 expected = Series([unit, unit, np.nan], index=idx) 798 tm.assert_series_equal(result, expected) 799 800 # min_count=0 801 result = getattr(df, method)(numeric_only=numeric_only, min_count=0) 802 expected = Series([unit, unit, unit], index=idx, dtype="float64") 803 tm.assert_series_equal(result, expected) 804 805 result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1) 
806 expected = Series([unit, np.nan, np.nan], index=idx) 807 tm.assert_series_equal(result, expected) 808 809 # min_count > 1 810 df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) 811 result = getattr(df, method)(numeric_only=numeric_only, min_count=5) 812 expected = Series(result, index=["A", "B"]) 813 tm.assert_series_equal(result, expected) 814 815 result = getattr(df, method)(numeric_only=numeric_only, min_count=6) 816 expected = Series(result, index=["A", "B"]) 817 tm.assert_series_equal(result, expected) 818 819 def test_sum_nanops_timedelta(self): 820 # prod isn't defined on timedeltas 821 idx = ["a", "b", "c"] 822 df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]}) 823 824 df2 = df.apply(pd.to_timedelta) 825 826 # 0 by default 827 result = df2.sum() 828 expected = Series([0, 0, 0], dtype="m8[ns]", index=idx) 829 tm.assert_series_equal(result, expected) 830 831 # min_count=0 832 result = df2.sum(min_count=0) 833 tm.assert_series_equal(result, expected) 834 835 # min_count=1 836 result = df2.sum(min_count=1) 837 expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx) 838 tm.assert_series_equal(result, expected) 839 840 def test_sum_nanops_min_count(self): 841 # https://github.com/pandas-dev/pandas/issues/39738 842 df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) 843 result = df.sum(min_count=10) 844 expected = Series([np.nan, np.nan], index=["x", "y"]) 845 tm.assert_series_equal(result, expected) 846 847 def test_sum_object(self, float_frame): 848 values = float_frame.values.astype(int) 849 frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns) 850 deltas = frame * timedelta(1) 851 deltas.sum() 852 853 def test_sum_bool(self, float_frame): 854 # ensure this works, bug report 855 bools = np.isnan(float_frame) 856 bools.sum(1) 857 bools.sum(0) 858 859 def test_sum_mixed_datetime(self): 860 # GH#30886 861 df = DataFrame( 862 {"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]} 863 
).reindex([2, 3, 4]) 864 result = df.sum() 865 866 expected = Series({"B": 7.0}) 867 tm.assert_series_equal(result, expected) 868 869 def test_mean_corner(self, float_frame, float_string_frame): 870 # unit test when have object data 871 the_mean = float_string_frame.mean(axis=0) 872 the_sum = float_string_frame.sum(axis=0, numeric_only=True) 873 tm.assert_index_equal(the_sum.index, the_mean.index) 874 assert len(the_mean.index) < len(float_string_frame.columns) 875 876 # xs sum mixed type, just want to know it works... 877 the_mean = float_string_frame.mean(axis=1) 878 the_sum = float_string_frame.sum(axis=1, numeric_only=True) 879 tm.assert_index_equal(the_sum.index, the_mean.index) 880 881 # take mean of boolean column 882 float_frame["bool"] = float_frame["A"] > 0 883 means = float_frame.mean(0) 884 assert means["bool"] == float_frame["bool"].values.mean() 885 886 def test_mean_datetimelike(self): 887 # GH#24757 check that datetimelike are excluded by default, handled 888 # correctly with numeric_only=True 889 890 df = DataFrame( 891 { 892 "A": np.arange(3), 893 "B": pd.date_range("2016-01-01", periods=3), 894 "C": pd.timedelta_range("1D", periods=3), 895 "D": pd.period_range("2016", periods=3, freq="A"), 896 } 897 ) 898 result = df.mean(numeric_only=True) 899 expected = Series({"A": 1.0}) 900 tm.assert_series_equal(result, expected) 901 902 with tm.assert_produces_warning(FutureWarning): 903 # in the future datetime columns will be included 904 result = df.mean() 905 expected = Series({"A": 1.0, "C": df.loc[1, "C"]}) 906 tm.assert_series_equal(result, expected) 907 908 def test_mean_datetimelike_numeric_only_false(self): 909 df = DataFrame( 910 { 911 "A": np.arange(3), 912 "B": pd.date_range("2016-01-01", periods=3), 913 "C": pd.timedelta_range("1D", periods=3), 914 } 915 ) 916 917 # datetime(tz) and timedelta work 918 result = df.mean(numeric_only=False) 919 expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}) 920 
tm.assert_series_equal(result, expected) 921 922 # mean of period is not allowed 923 df["D"] = pd.period_range("2016", periods=3, freq="A") 924 925 with pytest.raises(TypeError, match="mean is not implemented for Period"): 926 df.mean(numeric_only=False) 927 928 def test_mean_extensionarray_numeric_only_true(self): 929 # https://github.com/pandas-dev/pandas/issues/33256 930 arr = np.random.randint(1000, size=(10, 5)) 931 df = DataFrame(arr, dtype="Int64") 932 result = df.mean(numeric_only=True) 933 expected = DataFrame(arr).mean() 934 tm.assert_series_equal(result, expected) 935 936 def test_stats_mixed_type(self, float_string_frame): 937 # don't blow up 938 float_string_frame.std(1) 939 float_string_frame.var(1) 940 float_string_frame.mean(1) 941 float_string_frame.skew(1) 942 943 def test_sum_bools(self): 944 df = DataFrame(index=range(1), columns=range(10)) 945 bools = isna(df) 946 assert bools.sum(axis=1)[0] == 10 947 948 # ---------------------------------------------------------------------- 949 # Index of max / min 950 951 def test_idxmin(self, float_frame, int_frame): 952 frame = float_frame 953 frame.iloc[5:10] = np.nan 954 frame.iloc[15:20, -2:] = np.nan 955 for skipna in [True, False]: 956 for axis in [0, 1]: 957 for df in [frame, int_frame]: 958 result = df.idxmin(axis=axis, skipna=skipna) 959 expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) 960 tm.assert_series_equal(result, expected) 961 962 msg = "No axis named 2 for object type DataFrame" 963 with pytest.raises(ValueError, match=msg): 964 frame.idxmin(axis=2) 965 966 def test_idxmax(self, float_frame, int_frame): 967 frame = float_frame 968 frame.iloc[5:10] = np.nan 969 frame.iloc[15:20, -2:] = np.nan 970 for skipna in [True, False]: 971 for axis in [0, 1]: 972 for df in [frame, int_frame]: 973 result = df.idxmax(axis=axis, skipna=skipna) 974 expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) 975 tm.assert_series_equal(result, expected) 976 977 msg = "No axis named 2 for object 
type DataFrame" 978 with pytest.raises(ValueError, match=msg): 979 frame.idxmax(axis=2) 980 981 def test_idxmax_mixed_dtype(self): 982 # don't cast to object, which would raise in nanops 983 dti = pd.date_range("2016-01-01", periods=3) 984 985 df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti}) 986 987 result = df.idxmax() 988 expected = Series([1, 0, 2], index=[1, 2, 3]) 989 tm.assert_series_equal(result, expected) 990 991 result = df.idxmin() 992 expected = Series([0, 2, 0], index=[1, 2, 3]) 993 tm.assert_series_equal(result, expected) 994 995 # ---------------------------------------------------------------------- 996 # Logical reductions 997 998 @pytest.mark.parametrize("opname", ["any", "all"]) 999 def test_any_all(self, opname, bool_frame_with_na, float_string_frame): 1000 assert_bool_op_calc( 1001 opname, getattr(np, opname), bool_frame_with_na, has_skipna=True 1002 ) 1003 assert_bool_op_api( 1004 opname, bool_frame_with_na, float_string_frame, has_bool_only=True 1005 ) 1006 1007 def test_any_all_extra(self): 1008 df = DataFrame( 1009 { 1010 "A": [True, False, False], 1011 "B": [True, True, False], 1012 "C": [True, True, True], 1013 }, 1014 index=["a", "b", "c"], 1015 ) 1016 result = df[["A", "B"]].any(1) 1017 expected = Series([True, True, False], index=["a", "b", "c"]) 1018 tm.assert_series_equal(result, expected) 1019 1020 result = df[["A", "B"]].any(1, bool_only=True) 1021 tm.assert_series_equal(result, expected) 1022 1023 result = df.all(1) 1024 expected = Series([True, False, False], index=["a", "b", "c"]) 1025 tm.assert_series_equal(result, expected) 1026 1027 result = df.all(1, bool_only=True) 1028 tm.assert_series_equal(result, expected) 1029 1030 # Axis is None 1031 result = df.all(axis=None).item() 1032 assert result is False 1033 1034 result = df.any(axis=None).item() 1035 assert result is True 1036 1037 result = df[["C"]].all(axis=None).item() 1038 assert result is True 1039 1040 def test_any_datetime(self): 1041 1042 # GH 23070 1043 
float_data = [1, np.nan, 3, np.nan] 1044 datetime_data = [ 1045 Timestamp("1960-02-15"), 1046 Timestamp("1960-02-16"), 1047 pd.NaT, 1048 pd.NaT, 1049 ] 1050 df = DataFrame({"A": float_data, "B": datetime_data}) 1051 1052 result = df.any(1) 1053 expected = Series([True, True, True, False]) 1054 tm.assert_series_equal(result, expected) 1055 1056 def test_any_all_bool_only(self): 1057 1058 # GH 25101 1059 df = DataFrame( 1060 {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} 1061 ) 1062 1063 result = df.all(bool_only=True) 1064 expected = Series(dtype=np.bool_) 1065 tm.assert_series_equal(result, expected) 1066 1067 df = DataFrame( 1068 { 1069 "col1": [1, 2, 3], 1070 "col2": [4, 5, 6], 1071 "col3": [None, None, None], 1072 "col4": [False, False, True], 1073 } 1074 ) 1075 1076 result = df.all(bool_only=True) 1077 expected = Series({"col4": False}) 1078 tm.assert_series_equal(result, expected) 1079 1080 @pytest.mark.parametrize( 1081 "func, data, expected", 1082 [ 1083 (np.any, {}, False), 1084 (np.all, {}, True), 1085 (np.any, {"A": []}, False), 1086 (np.all, {"A": []}, True), 1087 (np.any, {"A": [False, False]}, False), 1088 (np.all, {"A": [False, False]}, False), 1089 (np.any, {"A": [True, False]}, True), 1090 (np.all, {"A": [True, False]}, False), 1091 (np.any, {"A": [True, True]}, True), 1092 (np.all, {"A": [True, True]}, True), 1093 (np.any, {"A": [False], "B": [False]}, False), 1094 (np.all, {"A": [False], "B": [False]}, False), 1095 (np.any, {"A": [False, False], "B": [False, True]}, True), 1096 (np.all, {"A": [False, False], "B": [False, True]}, False), 1097 # other types 1098 (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False), 1099 (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True), 1100 (np.all, {"A": Series([0, 1], dtype=int)}, False), 1101 (np.any, {"A": Series([0, 1], dtype=int)}, True), 1102 pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False), 1103 pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, 
UTC]")}, False), 1104 pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True), 1105 pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True), 1106 pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True), 1107 pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), 1108 pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True), 1109 pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True), 1110 pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False), 1111 pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True), 1112 pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True), 1113 pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True), 1114 # np.all on Categorical raises, so the reduction drops the 1115 # column, so all is being done on an empty Series, so is True 1116 (np.all, {"A": Series([0, 1], dtype="category")}, True), 1117 (np.any, {"A": Series([0, 1], dtype="category")}, False), 1118 (np.all, {"A": Series([1, 2], dtype="category")}, True), 1119 (np.any, {"A": Series([1, 2], dtype="category")}, False), 1120 # Mix GH#21484 1121 pytest.param( 1122 np.all, 1123 { 1124 "A": Series([10, 20], dtype="M8[ns]"), 1125 "B": Series([10, 20], dtype="m8[ns]"), 1126 }, 1127 True, 1128 ), 1129 ], 1130 ) 1131 def test_any_all_np_func(self, func, data, expected): 1132 # GH 19976 1133 data = DataFrame(data) 1134 result = func(data) 1135 assert isinstance(result, np.bool_) 1136 assert result.item() is expected 1137 1138 # method version 1139 result = getattr(DataFrame(data), func.__name__)(axis=None) 1140 assert isinstance(result, np.bool_) 1141 assert result.item() is expected 1142 1143 def test_any_all_object(self): 1144 # GH 19976 1145 result = np.all(DataFrame(columns=["a", "b"])).item() 1146 assert result is True 1147 1148 result = np.any(DataFrame(columns=["a", "b"])).item() 1149 assert result is False 1150 1151 def test_any_all_object_bool_only(self): 1152 df = 
DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) 1153 df._consolidate_inplace() 1154 df["C"] = Series([True, True]) 1155 1156 # The underlying bug is in DataFrame._get_bool_data, so we check 1157 # that while we're here 1158 res = df._get_bool_data() 1159 expected = df[["B", "C"]] 1160 tm.assert_frame_equal(res, expected) 1161 1162 res = df.all(bool_only=True, axis=0) 1163 expected = Series([False, True], index=["B", "C"]) 1164 tm.assert_series_equal(res, expected) 1165 1166 # operating on a subset of columns should not produce a _larger_ Series 1167 res = df[["B", "C"]].all(bool_only=True, axis=0) 1168 tm.assert_series_equal(res, expected) 1169 1170 assert not df.all(bool_only=True, axis=None) 1171 1172 res = df.any(bool_only=True, axis=0) 1173 expected = Series([True, True], index=["B", "C"]) 1174 tm.assert_series_equal(res, expected) 1175 1176 # operating on a subset of columns should not produce a _larger_ Series 1177 res = df[["B", "C"]].any(bool_only=True, axis=0) 1178 tm.assert_series_equal(res, expected) 1179 1180 assert df.any(bool_only=True, axis=None) 1181 1182 @pytest.mark.parametrize("method", ["any", "all"]) 1183 def test_any_all_level_axis_none_raises(self, method): 1184 df = DataFrame( 1185 {"A": 1}, 1186 index=MultiIndex.from_product( 1187 [["A", "B"], ["a", "b"]], names=["out", "in"] 1188 ), 1189 ) 1190 xpr = "Must specify 'axis' when aggregating by level." 
1191 with pytest.raises(ValueError, match=xpr): 1192 getattr(df, method)(axis=None, level="out") 1193 1194 # --------------------------------------------------------------------- 1195 # Unsorted 1196 1197 def test_series_broadcasting(self): 1198 # smoke test for numpy warnings 1199 # GH 16378, GH 16306 1200 df = DataFrame([1.0, 1.0, 1.0]) 1201 df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]}) 1202 s = Series([1, 1, 1]) 1203 s_nan = Series([np.nan, np.nan, 1]) 1204 1205 with tm.assert_produces_warning(None): 1206 df_nan.clip(lower=s, axis=0) 1207 for op in ["lt", "le", "gt", "ge", "eq", "ne"]: 1208 getattr(df, op)(s_nan, axis=0) 1209 1210 1211class TestDataFrameReductions: 1212 def test_min_max_dt64_with_NaT(self): 1213 # Both NaT and Timestamp are in DataFrame. 1214 df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]}) 1215 1216 res = df.min() 1217 exp = Series([Timestamp("2012-05-01")], index=["foo"]) 1218 tm.assert_series_equal(res, exp) 1219 1220 res = df.max() 1221 exp = Series([Timestamp("2012-05-01")], index=["foo"]) 1222 tm.assert_series_equal(res, exp) 1223 1224 # GH12941, only NaTs are in DataFrame. 
def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture):
    """Row-wise min/max with skipna=False give NaT for a row containing NaT."""
    # GH#36907
    tz = tz_naive_fixture
    windows_tzlocal = isinstance(tz, tzlocal) and is_platform_windows()
    if windows_tzlocal:
        pytest.xfail(
            reason="GH#37659 OSError raised within tzlocal bc Windows "
            "chokes in times before 1970-01-01"
        )

    col_a = [
        Timestamp("2020-01-01 08:00:00", tz=tz),
        Timestamp("1920-02-01 09:00:00", tz=tz),
    ]
    col_b = [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT]
    frame = DataFrame({"a": col_a, "b": col_b})

    # In row 0 the smaller timestamp sits in "a" and the larger one in "b";
    # row 1 contains NaT, which is not skipped.
    for opname, column in (("min", "a"), ("max", "b")):
        outcome = getattr(frame, opname)(axis=1, skipna=False)
        wanted = Series([frame.loc[0, column], pd.NaT])
        assert wanted.dtype == frame["a"].dtype

        tm.assert_series_equal(outcome, wanted)
See GH:33704 for more 1270 # information 1271 df = DataFrame({"x": pd.to_datetime([])}) 1272 expected_dt_series = Series(pd.to_datetime([])) 1273 # check axis 0 1274 assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT) 1275 assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT) 1276 1277 # check axis 1 1278 tm.assert_series_equal(df.min(axis=1), expected_dt_series) 1279 tm.assert_series_equal(df.max(axis=1), expected_dt_series) 1280 1281 def test_min_max_dt64_api_consistency_empty_df(self): 1282 # check DataFrame/Series api consistency when calling min/max on an empty 1283 # DataFrame/Series. 1284 df = DataFrame({"x": []}) 1285 expected_float_series = Series([], dtype=float) 1286 # check axis 0 1287 assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min()) 1288 assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max()) 1289 # check axis 1 1290 tm.assert_series_equal(df.min(axis=1), expected_float_series) 1291 tm.assert_series_equal(df.min(axis=1), expected_float_series) 1292 1293 @pytest.mark.parametrize( 1294 "initial", 1295 ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"], # Non-UTC timezone 1296 ) 1297 @pytest.mark.parametrize("method", ["min", "max"]) 1298 def test_preserve_timezone(self, initial: str, method): 1299 # GH 28552 1300 initial_dt = pd.to_datetime(initial) 1301 expected = Series([initial_dt]) 1302 df = DataFrame([expected]) 1303 result = getattr(df, method)(axis=1) 1304 tm.assert_series_equal(result, expected) 1305 1306 def test_frame_any_all_with_level(self): 1307 df = DataFrame( 1308 {"data": [False, False, True, False, True, False, True]}, 1309 index=[ 1310 ["one", "one", "two", "one", "two", "two", "two"], 1311 [0, 1, 0, 2, 1, 2, 3], 1312 ], 1313 ) 1314 1315 result = df.any(level=0) 1316 ex = DataFrame({"data": [False, True]}, index=["one", "two"]) 1317 tm.assert_frame_equal(result, ex) 1318 1319 result = df.all(level=0) 1320 ex = DataFrame({"data": [False, 
False]}, index=["one", "two"]) 1321 tm.assert_frame_equal(result, ex) 1322 1323 def test_frame_any_with_timedelta(self): 1324 # GH#17667 1325 df = DataFrame( 1326 { 1327 "a": Series([0, 0]), 1328 "t": Series([pd.to_timedelta(0, "s"), pd.to_timedelta(1, "ms")]), 1329 } 1330 ) 1331 1332 result = df.any(axis=0) 1333 expected = Series(data=[False, True], index=["a", "t"]) 1334 tm.assert_series_equal(result, expected) 1335 1336 result = df.any(axis=1) 1337 expected = Series(data=[False, True]) 1338 tm.assert_series_equal(result, expected) 1339 1340 1341class TestNuisanceColumns: 1342 @pytest.mark.parametrize("method", ["any", "all"]) 1343 def test_any_all_categorical_dtype_nuisance_column(self, method): 1344 # GH#36076 DataFrame should match Series behavior 1345 ser = Series([0, 1], dtype="category", name="A") 1346 df = ser.to_frame() 1347 1348 # Double-check the Series behavior is to raise 1349 with pytest.raises(TypeError, match="does not implement reduction"): 1350 getattr(ser, method)() 1351 1352 with pytest.raises(TypeError, match="does not implement reduction"): 1353 getattr(np, method)(ser) 1354 1355 with pytest.raises(TypeError, match="does not implement reduction"): 1356 getattr(df, method)(bool_only=False) 1357 1358 # With bool_only=None, operating on this column raises and is ignored, 1359 # so we expect an empty result. 
def test_median_categorical_dtype_nuisance_column(self):
    """DataFrame.median treats a categorical column the way Series.median does."""
    # GH#21020 DataFrame.median should match Series.median
    no_reduction = "does not implement reduction"
    frame = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
    column = frame["A"]

    # Double-check the Series behavior is to raise
    with pytest.raises(TypeError, match=no_reduction):
        column.median()

    # The frame raises as well when told to keep non-numeric columns
    with pytest.raises(TypeError, match=no_reduction):
        frame.median(numeric_only=False)

    # With the default arguments the result comes back empty
    outcome = frame.median()
    tm.assert_series_equal(outcome, Series([], index=Index([]), dtype=np.float64))

    # same thing, but with an additional non-categorical column
    frame["B"] = frame["A"].astype(int)

    with pytest.raises(TypeError, match=no_reduction):
        frame.median(numeric_only=False)

    # Only the integer column shows up in the result
    outcome = frame.median()
    tm.assert_series_equal(outcome, Series([2.0], index=["B"]))

    # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
    #  of expected.values
def test_sum_timedelta64_skipna_false():
    """
    DataFrame.sum(skipna=False) on timedelta64 data propagates NaT.

    The frame is 4 x 2 with a single NaT in the last cell, so the last
    column (axis=0) and the last row (axis=1) must sum to NaT while every
    other sum is an ordinary Timedelta.
    """
    # GH#17235
    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    arr[-1, -1] = "Nat"

    df = DataFrame(arr)
    # Build the expected values with whatever timedelta64 resolution the frame
    # actually stores, so the comparison is about the reduction itself and not
    # about any unit conversion done by the constructor.
    td_dtype = df.dtypes.iloc[0]

    # Column 0 holds 0, 2, 4 and 6 seconds; column 1 ends in NaT.
    result = df.sum(skipna=False)
    expected = Series([pd.Timedelta(seconds=12), pd.NaT], dtype=td_dtype)
    tm.assert_series_equal(result, expected)

    # axis=0 is the default, so spelling it out gives the same answer
    result = df.sum(axis=0, skipna=False)
    tm.assert_series_equal(result, expected)

    # Rows are (0, 1), (2, 3), (4, 5) and (6, NaT) seconds.
    result = df.sum(axis=1, skipna=False)
    expected = Series(
        [
            pd.Timedelta(seconds=1),
            pd.Timedelta(seconds=5),
            pd.Timedelta(seconds=9),
            pd.NaT,
        ],
        dtype=td_dtype,
    )
    tm.assert_series_equal(result, expected)
def test_prod_sum_min_count_mixed_object():
    """prod/sum with min_count on a single column of mixed Python objects."""
    # https://github.com/pandas-dev/pandas/issues/41074
    reduce_kwargs = {"axis": 0, "min_count": 1, "numeric_only": False}
    frame = DataFrame([1, "a", True])

    # The product of the column comes back as the string "a" ...
    product = frame.prod(**reduce_kwargs)
    tm.assert_series_equal(product, Series(["a"]))

    # ... whereas summing the same column raises.
    pattern = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
    with pytest.raises(TypeError, match=pattern):
        frame.sum(**reduce_kwargs)