"""Core tests for pandas GroupBy behavior.

Covers basic aggregation/transform/apply semantics, ``as_index`` handling,
grouping by level/mapper/ndarray, nuisance-column omission, and assorted
GitHub-issue regressions (GH numbers noted inline).

NOTE(review): this module was reconstructed from a whitespace-mangled
extraction; formatting is restored, tokens are unchanged.
"""
from datetime import datetime
from decimal import Decimal
from io import StringIO

import numpy as np
import pytest

from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
    DataFrame,
    Grouper,
    Index,
    MultiIndex,
    Series,
    Timestamp,
    date_range,
    read_csv,
)
import pandas._testing as tm
from pandas.core.base import SpecificationError
import pandas.core.common as com


def test_repr():
    # GH18203
    result = repr(Grouper(key="A", level="B"))
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert result == expected


@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):

    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    for k, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    tm.assert_series_equal(agged, grouped.mean())
    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    value_grouped = data.groupby(data)
    tm.assert_series_equal(
        value_grouped.aggregate(np.mean), agged, check_index_type=False
    )

    # complex agg
    agged = grouped.aggregate([np.mean, np.std])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)


def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    key = mframe.index.codes[0]
    grouped = mframe.groupby(key)
    result = grouped.sum()

    expected = mframe.groupby(key.astype("O")).sum()
    tm.assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df["value"] = range(len(df))

    def max_value(group):
        return group.loc[group["value"].idxmax()]

    applied = df.groupby("A").apply(max_value)
    result = applied.dtypes
    expected = Series(
        [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
        index=["A", "B", "C", "D", "value"],
    )
    tm.assert_series_equal(result, expected)


def test_groupby_return_type():

    # GH2893, return a reduced type
    df1 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 2, "val2": 27},
            {"val1": 2, "val2": 12},
        ]
    )

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    with tm.assert_produces_warning(FutureWarning):
        result = df1.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    df2 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 1, "val2": 27},
            {"val1": 1, "val2": 12},
        ]
    )

    def func(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    with tm.assert_produces_warning(FutureWarning):
        result = df2.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"])
    with tm.assert_produces_warning(FutureWarning):
        result = df.groupby("X", squeeze=False).count()
    assert isinstance(result, DataFrame)


def test_inconsistent_return_type():
    # GH5592
    # inconsistent return type
    df = DataFrame(
        {
            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            "B": Series(np.arange(7), dtype="int64"),
            "C": date_range("20130101", periods=7),
        }
    )

    def f(grp):
        return grp.iloc[0]

    expected = df.groupby("A").first()[["B"]]
    result = df.groupby("A").apply(f)[["B"]]
    tm.assert_frame_equal(result, expected)

    def f(grp):
        if grp.name == "Tiger":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["B"]]
    e = expected.copy()
    e.loc["Tiger"] = np.nan
    tm.assert_frame_equal(result, e)

    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["B"]]
    e = expected.copy()
    e.loc["Pony"] = np.nan
    tm.assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["C"]]
    e = df.groupby("A").first()[["C"]]
    e.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, e)

    # scalar outputs
    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    result = df.groupby("A").apply(f)
    e = df.groupby("A").first()["C"].copy()
    e.loc["Pony"] = np.nan
    e.name = None
    tm.assert_series_equal(result, e)


def test_pass_args_kwargs(ts, tsframe):
    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(0.8)
    trans_expected = ts_grouped.transform(g)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame
    df_grouped = tsframe.groupby(lambda x: x.month)
    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
    apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
    expected = df_grouped.quantile(0.8)
    tm.assert_frame_equal(apply_result, expected, check_names=False)
    tm.assert_frame_equal(agg_result, expected)

    agg_result = df_grouped.agg(f, q=80)
    apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
    tm.assert_frame_equal(agg_result, expected)
    tm.assert_frame_equal(apply_result, expected, check_names=False)


def test_len():
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    assert len(grouped) == len(df)

    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
    expected = len({(x.year, x.month) for x in df.index})
    assert len(grouped) == expected

    # issue 11016
    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
    assert len(df.groupby("a")) == 0
    assert len(df.groupby("b")) == 3
    assert len(df.groupby(["a", "b"])) == 3


def test_basic_regression():
    # regression
    result = Series([1.0 * x for x in list(range(1, 10)) * 10])

    data = np.random.random(1100) * 10.0
    groupings = Series(data)

    grouped = result.groupby(groupings)
    grouped.mean()


@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=index,
    )

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)
    assert issubclass(agged.dtype.type, np.dtype(dtype).type)


def test_indices_concatenation_order():

    # GH 2808

    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
            res = DataFrame(columns=["a"], index=multiindex)
            return res
        else:
            y = y.set_index(["b", "c"])
            return y

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(["b", "c"])
            return y

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(
                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
            )
            res = DataFrame(columns=["a", "b"], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    # correct result
    result1 = df.groupby("a").apply(f1)
    result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f3)


def test_attr_wrapper(ts):
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    tm.assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    tm.assert_frame_equal(result, expected)

    # get attribute
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)
    tm.assert_series_equal(result, expected)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, "foo")


def test_frame_groupby(tsframe):
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby("weekday").aggregate(np.mean)
    tm.assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate
    transformed = grouped.transform(lambda x: x.mean())
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for k, v in groups.items():
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()


def test_frame_groupby_columns(tsframe):
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

    # iterate
    for k, v in grouped:
        assert len(v.columns) == 2


def test_frame_set_name_single(df):
    grouped = df.groupby("A")

    result = grouped.mean()
    assert result.index.name == "A"

    result = df.groupby("A", as_index=False).mean()
    assert result.index.name != "A"

    result = grouped.agg(np.mean)
    assert result.index.name == "A"

    result = grouped.agg({"C": np.mean, "D": np.std})
    assert result.index.name == "A"

    result = grouped["C"].mean()
    assert result.index.name == "A"
    result = grouped["C"].agg(np.mean)
    assert result.index.name == "A"
    result = grouped["C"].agg([np.mean, np.std])
    assert result.index.name == "A"

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": np.mean, "bar": np.std})


def test_multi_func(df):
    col1 = df["A"]
    col2 = df["B"]

    grouped = df.groupby([col1.get, col2.get])
    agged = grouped.mean()
    expected = df.groupby(["A", "B"]).mean()

    # TODO groupby get drops names
    tm.assert_frame_equal(
        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
    )

    # some "groups" with no data
    df = DataFrame(
        {
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    # only verify that it works for now
    grouped = df.groupby(["k1", "k2"])
    grouped.agg(np.sum)


def test_multi_key_multiple_functions(df):
    grouped = df.groupby(["A", "B"])["C"]

    agged = grouped.agg([np.mean, np.std])
    expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)})
    tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    expected = pd.concat(
        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
        keys=["D", "E", "F"],
        axis=1,
    )
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)


@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    data = df
    grouped = data.groupby(["A", "B"])

    result1 = op(grouped)

    keys = []
    values = []
    for n1, gp1 in data.groupby("A"):
        for n2, gp2 in gp1.groupby("B"):
            keys.append((n1, n2))
            values.append(op(gp2.loc[:, ["C", "D"]]))

    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
    expected = pd.concat(values, axis=1).T
    expected.index = mi

    # a little bit crude
    for col in ["C", "D"]:
        result_col = op(grouped[col])
        pivoted = result1[col]
        exp = expected[col]
        tm.assert_series_equal(result_col, exp)
        tm.assert_series_equal(pivoted, exp)

    # test single series works the same
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]

    tm.assert_series_equal(result, expected)


def test_as_index_select_column():
    # GH 5764
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    result = df.groupby("A", as_index=False)["B"].get_group(1)
    expected = Series([2, 4], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum())
    expected = Series(
        [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    )
    tm.assert_series_equal(result, expected)


def test_groupby_as_index_select_column_sum_empty_df():
    # GH 35246
    df = DataFrame(columns=["A", "B", "C"])
    left = df.groupby(by="A", as_index=False)["B"].sum()
    assert type(left) is DataFrame
    assert left.to_dict() == {"A": {}, "B": {}}


def test_groupby_as_index_agg(df):
    grouped = df.groupby("A", as_index=False)

    # single-key

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    grouped = df.groupby("A", as_index=True)

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"Q": np.sum})

    # multi-key

    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    expected3 = grouped["C"].sum()
    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
    result3 = grouped["C"].agg({"Q": np.sum})
    tm.assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"])
    ts = Series(np.random.randint(5, 10, 50), name="jim")

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        tm.assert_frame_equal(left, right)


def test_ops_not_as_index(reduction_func):
    # GH 10355, 21090
    # Using as_index=False should not modify grouped column

    if reduction_func in ("corrwith",):
        pytest.skip("Test not applicable")

    if reduction_func in ("nth", "ngroup"):
        pytest.skip("Skip until behavior is determined (GH #5755)")

    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size")
    expected = expected.reset_index()

    g = df.groupby("a", as_index=False)

    result = getattr(g, reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g.agg(reduction_func)
    tm.assert_frame_equal(result, expected)

    result = getattr(g["b"], reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g["b"].agg(reduction_func)
    tm.assert_frame_equal(result, expected)


def test_as_index_series_return_frame(df):
    grouped = df.groupby("A", as_index=False)
    grouped2 = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)

    result = grouped["C"].sum()
    expected = grouped.sum().loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].sum()
    expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)


def test_as_index_series_column_slice_raises(df):
    # GH15072
    grouped = df.groupby("A", as_index=False)
    msg = r"Column\(s\) C already selected"

    with pytest.raises(IndexError, match=msg):
        grouped["C"].__getitem__("D")


def test_groupby_as_index_cython(df):
    data = df

    # single-key
    grouped = data.groupby("A", as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A"]).mean()
    expected.insert(0, "A", expected.index)
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)

    # multi-key
    grouped = data.groupby(["A", "B"], as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A", "B"]).mean()

    arrays = list(zip(*expected.index.values))
    expected.insert(0, "A", arrays[0])
    expected.insert(1, "B", arrays[1])
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)


def test_groupby_as_index_series_scalar(df):
    grouped = df.groupby(["A", "B"], as_index=False)

    # GH #421

    result = grouped["C"].agg(len)
    expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
    tm.assert_frame_equal(result, expected)


def test_groupby_as_index_corner(df, ts):
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)


def test_groupby_multiple_key(df):
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    agged = grouped.sum()
    tm.assert_almost_equal(df.values, agged.values)

    grouped = df.T.groupby(
        [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
    )

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    tm.assert_almost_equal(df.T.values, agged.values)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_almost_equal(df.T.values, agged.values)


def test_groupby_multi_corner(df):
    # test that having an all-NA column doesn't mess you up
    df = df.copy()
    df["bad"] = np.nan
    agged = df.groupby(["A", "B"]).mean()

    expected = df.groupby(["A", "B"]).mean()
    expected["bad"] = np.nan

    tm.assert_frame_equal(agged, expected)


def test_omit_nuisance(df):
    grouped = df.groupby("A")

    result = grouped.mean()
    expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
    tm.assert_frame_equal(result, expected)

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    tm.assert_frame_equal(agged, exp)

    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    grouped = df.groupby("A")
    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_frame_equal(result, expected)

    # won't work with axis = 1
    grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    msg = "reduction operation 'sum' not allowed for this dtype"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))


def test_omit_nuisance_sem(df):
    # GH 38774 - sem should work with nuisance columns
    grouped = df.groupby("A")
    result = grouped.sem()
    expected = df.loc[:, ["A", "C", "D"]].groupby("A").sem()
    tm.assert_frame_equal(result, expected)


def test_omit_nuisance_python_multiple(three_group):
    grouped = three_group.groupby(["A", "B"])

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    tm.assert_frame_equal(agged, exp)


def test_empty_groups_corner(mframe):
    # handle empty groups
    df = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
        }
    )

    grouped = df.groupby(["k1", "k2"])
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped["A"].apply(np.mean)
    tm.assert_series_equal(agged["A"], agged_A)
    assert agged.index.name == "first"


def test_nonsense_func():
    df = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
    with pytest.raises(TypeError, match=msg):
        df.groupby(lambda x: x + "foo")


def test_wrap_aggregated_output_multindex(mframe):
    df = mframe.T
    df["baz", "two"] = "peekaboo"

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    agged = df.groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        if ser.name == ("foo", "one"):
            raise TypeError
        else:
            return ser.sum()

    agged2 = df.groupby(keys).aggregate(aggfun)
    assert len(agged2.columns) + 1 == len(df.columns)


def test_groupby_level_apply(mframe):

    result = mframe.groupby(level=0).count()
    assert result.index.name == "first"
    result = mframe.groupby(level=1).count()
    assert result.index.name == "second"

    result = mframe["A"].groupby(level=0).count()
    assert result.index.name == "first"


def test_groupby_level_mapper(mframe):
    deleveled = mframe.reset_index()

    mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
    mapper1 = {"one": 0, "two": 0, "three": 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]])
    mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name, expected1.index.name = "first", "second"

    tm.assert_frame_equal(result0, expected0)
    tm.assert_frame_equal(result1, expected1)


def test_groupby_level_nonmulti():
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])


def test_groupby_complex():
    # GH 12902
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)

    result = a.sum(level=0)
    tm.assert_series_equal(result, expected)


def test_groupby_series_indexed_differently():
    s1 = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    s2 = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    grouped = s1.groupby(s2)
    agged = grouped.mean()
    exp = s1.groupby(s2.reindex(s1.index).get).mean()
    tm.assert_series_equal(agged, exp)


def test_groupby_with_hier_columns():
    tuples = list(
        zip(
            *[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ]
        )
    )
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples(
        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
    )
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(["A", "B"]))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df["A", "foo"] = "bar"
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])


def test_grouping_ndarray(df):
    grouped = df.groupby(df["A"].values)

    result = grouped.sum()
    expected = df.groupby("A").sum()
    tm.assert_frame_equal(
        result, expected, check_names=False
    )  # Note: no names when grouping by value


def test_groupby_wrong_multi_labels():
    data = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    data = read_csv(StringIO(data), index_col=0)

    grouped = data.groupby(["foo", "bar", "baz", "spam"])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)


def test_groupby_series_with_name(df):
    result = df.groupby(df["A"]).mean()
    result2 = df.groupby(df["A"], as_index=False).mean()
    assert result.index.name == "A"
    assert "A" in result2

    result = df.groupby([df["A"], df["B"]]).mean()
    result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
    assert result.index.names == ("A", "B")
    assert "A" in result2
    assert "B" in result2


def test_seriesgroupby_name_attr(df):
    # GH 6265
    result = df.groupby("A")["C"]
    assert result.count().name == "C"
    assert result.mean().name == "C"

    testFunc = lambda x: np.sum(x) * 2
    assert result.agg(testFunc).name == "C"


def test_consistency_name():
    # GH 12363

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    expected = df.groupby(["A"]).B.count()
    result = df.B.groupby(df.A).count()
    tm.assert_series_equal(result, expected)


def test_groupby_name_propagation(df):
    # GH 6124
    def summarize(df, name=None):
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series. In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])

    metrics = df.groupby("A").apply(summarize)
    assert metrics.columns.name is None
    metrics = df.groupby("A").apply(summarize, "metrics")
    assert metrics.columns.name == "metrics"
    metrics = df.groupby("A").apply(summarize_random_name)
    assert metrics.columns.name is None


def test_groupby_nonstring_columns():
    df = DataFrame([np.arange(10) for x in range(10)])
    grouped = df.groupby(0)
    result = grouped.mean()
    expected = df.groupby(df[0]).mean()
    tm.assert_frame_equal(result, expected)


def test_groupby_mixed_type_columns():
    # GH 13432, unorderable types in py3
    df = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    result = df.groupby("A").first()
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").sum()
    tm.assert_frame_equal(result, expected)


# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    arr = np.empty((100, 100))
    arr.fill(np.nan)
    obj = Series(arr[:, 0])
    inds = np.tile(range(10), 10)

    result = obj.groupby(inds).agg(Series.median)
    assert result.isna().all()


def test_series_grouper_noncontig_index():
    index = Index(tm.rands_array(10, 100))

    values = Series(np.random.randn(50), index=index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    f = lambda x: len(set(map(id, x.index)))
    grouped.agg(f)


def test_convert_objects_leave_decimal_alone():

    s = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert len(x.values.base) > 0
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    result = grouped.agg(convert_fast)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)

    result = grouped.agg(convert_force_pure)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)


def test_groupby_dtype_inference_empty():
    # GH 6733
    df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert df["x"].dtype == np.float64

    result = df.groupby("x").first()
    exp_index = Index([], name="x", dtype=np.float64)
    expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
    tm.assert_frame_equal(result, expected, by_blocks=True)


def test_groupby_unit64_float_conversion():
    # GH: 30859 groupby converts unit64 to floats sometimes
    df = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
    result = df.groupby(["first", "second"])["value"].max()
    expected = Series(
        [16148277970000000000],
        MultiIndex.from_product([[1], [1]], names=["first", "second"]),
        name="value",
    )
    tm.assert_series_equal(result, expected)


def test_groupby_list_infer_array_like(df):
    result = df.groupby(list(df["A"])).mean()
    expected = df.groupby(df["A"]).mean()
    tm.assert_frame_equal(result, expected, check_names=False)

    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df["A"][:-1]))

    # pathological case of ambiguity
    df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)})

    result = df.groupby(["foo", "bar"]).mean()
    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]


def test_groupby_keys_same_size_as_index():
    # GH 11185
    freq = "s"
    index = pd.date_range(
        start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
    )
    df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
    result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
    expected = df.set_index([df.index, "metric"])

    tm.assert_frame_equal(result, expected)


def test_groupby_one_row():
    # GH 11741
    msg = r"^'Z'$"
    df1 = DataFrame(np.random.randn(1, 4), columns=list("ABCD"))
    with pytest.raises(KeyError, match=msg):
        df1.groupby("Z")
    df2 = DataFrame(np.random.randn(2, 4), columns=list("ABCD"))
    with pytest.raises(KeyError, match=msg):
        df2.groupby("Z")


def test_groupby_nat_exclude():
    # GH 6992
    # NOTE(review): this definition continues past the end of this chunk;
    # only the visible portion is reproduced here.
    df = DataFrame(
        {
            "values": np.random.randn(8),
            "dt": [
                np.nan,
                Timestamp("2013-01-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-01-01"),
            ],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    grouped = df.groupby("dt")

    expected = [Index([1, 7]), 
Index([3, 5])] 1276 keys = sorted(grouped.groups.keys()) 1277 assert len(keys) == 2 1278 for k, e in zip(keys, expected): 1279 # grouped.groups keys are np.datetime64 with system tz 1280 # not to be affected by tz, only compare values 1281 tm.assert_index_equal(grouped.groups[k], e) 1282 1283 # confirm obj is not filtered 1284 tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) 1285 assert grouped.ngroups == 2 1286 1287 expected = { 1288 Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp), 1289 Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp), 1290 } 1291 1292 for k in grouped.indices: 1293 tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) 1294 1295 tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) 1296 tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) 1297 1298 with pytest.raises(KeyError, match=r"^NaT$"): 1299 grouped.get_group(pd.NaT) 1300 1301 nan_df = DataFrame( 1302 {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} 1303 ) 1304 assert nan_df["nan"].dtype == "float64" 1305 assert nan_df["nat"].dtype == "datetime64[ns]" 1306 1307 for key in ["nan", "nat"]: 1308 grouped = nan_df.groupby(key) 1309 assert grouped.groups == {} 1310 assert grouped.ngroups == 0 1311 assert grouped.indices == {} 1312 with pytest.raises(KeyError, match=r"^nan$"): 1313 grouped.get_group(np.nan) 1314 with pytest.raises(KeyError, match=r"^NaT$"): 1315 grouped.get_group(pd.NaT) 1316 1317 1318def test_groupby_two_group_keys_all_nan(): 1319 # GH #36842: Grouping over two group keys shouldn't raise an error 1320 df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]}) 1321 result = df.groupby(["a", "b"]).indices 1322 assert result == {} 1323 1324 1325def test_groupby_2d_malformed(): 1326 d = DataFrame(index=range(2)) 1327 d["group"] = ["g1", "g2"] 1328 d["zeros"] = [0, 0] 1329 d["ones"] = [1, 1] 1330 d["label"] = ["l1", "l2"] 1331 tmp = 
d.groupby(["group"]).mean() 1332 res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) 1333 tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) 1334 tm.assert_numpy_array_equal(tmp.values, res_values) 1335 1336 1337def test_int32_overflow(): 1338 B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000))) 1339 A = np.arange(25000) 1340 df = DataFrame({"A": A, "B": B, "C": A, "D": B, "E": np.random.randn(25000)}) 1341 1342 left = df.groupby(["A", "B", "C", "D"]).sum() 1343 right = df.groupby(["D", "C", "B", "A"]).sum() 1344 assert len(left) == len(right) 1345 1346 1347def test_groupby_sort_multi(): 1348 df = DataFrame( 1349 { 1350 "a": ["foo", "bar", "baz"], 1351 "b": [3, 2, 1], 1352 "c": [0, 1, 2], 1353 "d": np.random.randn(3), 1354 } 1355 ) 1356 1357 tups = [tuple(row) for row in df[["a", "b", "c"]].values] 1358 tups = com.asarray_tuplesafe(tups) 1359 result = df.groupby(["a", "b", "c"], sort=True).sum() 1360 tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) 1361 1362 tups = [tuple(row) for row in df[["c", "a", "b"]].values] 1363 tups = com.asarray_tuplesafe(tups) 1364 result = df.groupby(["c", "a", "b"], sort=True).sum() 1365 tm.assert_numpy_array_equal(result.index.values, tups) 1366 1367 tups = [tuple(x) for x in df[["b", "c", "a"]].values] 1368 tups = com.asarray_tuplesafe(tups) 1369 result = df.groupby(["b", "c", "a"], sort=True).sum() 1370 tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) 1371 1372 df = DataFrame( 1373 {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)} 1374 ) 1375 grouped = df.groupby(["a", "b"])["d"] 1376 result = grouped.sum() 1377 1378 def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): 1379 tups = [tuple(row) for row in df[keys].values] 1380 tups = com.asarray_tuplesafe(tups) 1381 expected = f(df.groupby(tups)[field]) 1382 for k, v in expected.items(): 1383 assert result[k] == v 1384 1385 _check_groupby(df, result, ["a", "b"], "d") 1386 1387 
def test_dont_clobber_name_column():
    # A column literally named "name" must survive an identity apply.
    df = DataFrame(
        {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
    )

    result = df.groupby("key").apply(lambda x: x)
    tm.assert_frame_equal(result, df)


def test_skip_group_keys():
    # group_keys=False: apply() output is not re-indexed by the group keys,
    # so it matches a manual concat of the per-group results.

    tsf = tm.makeTimeDataFrame()

    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by="A")[:3])

    pieces = [group.sort_values(by="A")[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_frame_equal(result, expected)

    grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_series_equal(result, expected)


def test_no_nonsense_name(float_frame):
    # GH #995
    s = float_frame["C"].copy()
    s.name = None

    result = s.groupby(float_frame["A"]).agg(np.sum)
    assert result.name is None


def test_multifunc_sum_bug():
    # GH #1065
    x = DataFrame(np.arange(9).reshape(3, 3))
    x["test"] = 0
    x["fl"] = [1.3, 1.5, 1.6]

    grouped = x.groupby("test")
    # mixing "sum" and "size" in a dict agg must not clobber the float dtype
    result = grouped.agg({"fl": "sum", 2: "size"})
    assert result["fl"].dtype == np.float64


def test_handle_dict_return_value(df):
    # A dict returned from apply() is treated like an equivalent Series.
    def f(group):
        return {"max": group.max(), "min": group.min()}

    def g(group):
        return Series({"max": group.max(), "min": group.min()})

    result = df.groupby("A")["C"].apply(f)
    expected = df.groupby("A")["C"].apply(g)

    assert isinstance(result, Series)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper):
    # Every group passed to apply/aggregate/transform carries a .name.
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({"C": freduce, "D": freduce})
    grouped.transform(f)

    grouped["C"].apply(f)
    grouped["C"].aggregate(freduce)
    grouped["C"].aggregate([freduce, foo])
    grouped["C"].transform(f)


def test_group_name_available_in_inference_pass():
    # gh-15062
    df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})

    names = []

    def f(group):
        names.append(group.name)
        return group.copy()

    df.groupby("a", sort=False, group_keys=False).apply(f)

    # each group name is seen exactly once, in first-seen order
    expected_names = [0, 1, 2]
    assert names == expected_names


def test_no_dummy_key_names(df):
    # see gh-1291
    result = df.groupby(df["A"].values).sum()
    assert result.index.name is None

    result = df.groupby([df["A"].values, df["B"].values]).sum()
    assert result.index.names == (None, None)


def test_groupby_sort_multiindex_series():
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    index = MultiIndex(
        levels=[[1, 2], [1, 2]],
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=["a", "b"],
    )
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)
    index = MultiIndex(
        levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
    )
    mseries_result = Series([0, 2, 4], index=index)

    result = mseries.groupby(level=["a", "b"], sort=False).first()
    tm.assert_series_equal(result, mseries_result)
    result = mseries.groupby(level=["a", "b"], sort=True).first()
    tm.assert_series_equal(result, mseries_result.sort_index())


def test_groupby_reindex_inside_function():
    # An aggregating closure that indexes into the group must see the
    # group's own index, whether or not it touches the data first.

    periods = 1000
    ind = date_range(start="2012/1/1", freq="5min", periods=periods)
    df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)

    def agg_before(func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """

        def _func(data):
            d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna()
            if fix:
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({"high": agg_before(np.max)})
    closure_good = grouped.agg({"high": agg_before(np.max, True)})

    tm.assert_frame_equal(closure_bad, closure_good)


def test_groupby_multiindex_missing_pair():
    # GH9049
    df = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    )
    df = df.set_index(["group1", "group2"])
    df_grouped = df.groupby(level=["group1", "group2"], sort=True)

    res = df_grouped.agg("sum")
    idx = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
    )
    exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"])

    tm.assert_frame_equal(res, exp)


def test_groupby_multiindex_not_lexsorted():
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(
        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
    )
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index="a", columns=["b", "c"], values="d"
    )
    not_lexsorted_df = not_lexsorted_df.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby("a").mean()
    # grouping over non-lexsorted columns warns but must give the same answer
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby("a").mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame(
        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
    ).set_index(["x", "y"])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates)
            expected = df
            tm.assert_frame_equal(expected, result)

            result = (
                df.sort_index()
                .groupby(level=level, sort=sort)
                .apply(DataFrame.drop_duplicates)
            )
            expected = df.sort_index()
            tm.assert_frame_equal(expected, result)


def test_index_label_overlaps_location():
    # checking we don't have any label/location confusion in the
    # wake of GH5375
    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
    g = df.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    tm.assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    tm.assert_series_equal(actual, expected)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    g = df.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    tm.assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    tm.assert_series_equal(actual, expected)


def test_transform_doesnt_clobber_ints():
    # GH 7972
    n = 6
    x = np.arange(n)
    # same data with int vs float group keys must transform identically
    df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
    df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})

    gb = df.groupby("a")
    result = gb.transform("mean")

    gb2 = df2.groupby("a")
    expected = gb2.transform("mean")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651

    df = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    )

    # Try sorting on different types and with different group types

    df = df.sort_values(by=sort_column)
    g = df.groupby(group_column)

    def test_sort(x):
        tm.assert_frame_equal(x, x.sort_values(by=sort_column))

    g.apply(test_sort)


def test_pivot_table_values_key_error():
    # This test is designed to replicate the error in issue #14938
    df = DataFrame(
        {
            "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(),
            "thename": range(0, 20),
        }
    )

    df["year"] = df.set_index("eventDate").index.year
    df["month"] = df.set_index("eventDate").index.month

    with pytest.raises(KeyError, match="'badname'"):
        df.reset_index().pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )


def test_empty_dataframe_groupby():
    # GH8093
    df = DataFrame(columns=["A", "B", "C"])

    result = df.groupby("A").sum()
    expected = DataFrame(columns=["B", "C"], dtype=np.float64)
    expected.index.name = "A"

    tm.assert_frame_equal(result, expected)


def test_tuple_as_grouping():
    # https://github.com/pandas-dev/pandas/issues/18314
    df = DataFrame(
        {
            ("a", "b"): [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    with pytest.raises(KeyError, match=r"('a', 'b')"):
        df[["a", "b", "c"]].groupby(("a", "b"))

    result = df.groupby(("a", "b"))["c"].sum()
    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
    tm.assert_series_equal(result, expected)
def test_tuple_correct_keyerror():
    # https://github.com/pandas-dev/pandas/issues/18798
    df = DataFrame(1, index=range(3), columns=MultiIndex.from_product([[1, 2], [3, 4]]))
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        df.groupby((7, 8)).mean()


def test_groupby_agg_ohlc_non_first():
    # GH 21716
    df = DataFrame(
        [[1], [1]],
        columns=["foo"],
        index=pd.date_range("2018-01-01", periods=2, freq="D"),
    )

    expected = DataFrame(
        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
        columns=MultiIndex.from_tuples(
            (
                ("foo", "sum", "foo"),
                ("foo", "ohlc", "open"),
                ("foo", "ohlc", "high"),
                ("foo", "ohlc", "low"),
                ("foo", "ohlc", "close"),
            )
        ),
        index=pd.date_range("2018-01-01", periods=2, freq="D"),
    )

    # "ohlc" must work even when it is not the first aggregation in the list
    result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])

    tm.assert_frame_equal(result, expected)


def test_groupby_multiindex_nat():
    # GH 9236
    values = [
        (pd.NaT, "a"),
        (datetime(2012, 1, 2), "a"),
        (datetime(2012, 1, 2), "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    mi = MultiIndex.from_tuples(values, names=["date", None])
    ser = Series([3, 2, 2.5, 4], index=mi)

    # grouping on level 1 ignores the NaT in level 0
    result = ser.groupby(level=1).mean()
    expected = Series([3.0, 2.5], index=["a", "b"])
    tm.assert_series_equal(result, expected)


def test_groupby_empty_list_raises():
    # GH 5289
    values = zip(range(10), range(10))
    df = DataFrame(values, columns=["apple", "b"])
    msg = "Grouper and axis must be same length"
    with pytest.raises(ValueError, match=msg):
        df.groupby([[]])


def test_groupby_multiindex_series_keys_len_equal_group_axis():
    # GH 25704
    index_array = [["x", "x"], ["a", "b"], ["k", "k"]]
    index_names = ["first", "second", "third"]
    ri = MultiIndex.from_arrays(index_array, names=index_names)
    s = Series(data=[1, 2], index=ri)
    result = s.groupby(["first", "third"]).sum()

    index_array = [["x"], ["k"]]
    index_names = ["first", "third"]
    ei = MultiIndex.from_arrays(index_array, names=index_names)
    expected = Series([3], index=ei)

    tm.assert_series_equal(result, expected)


def test_groupby_groups_in_BaseGrouper():
    # GH 26326
    # Test if DataFrame grouped with a pandas.Grouper has correct groups
    mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)
    result = df.groupby([Grouper(level="alpha"), "beta"])
    expected = df.groupby(["alpha", "beta"])
    assert result.groups == expected.groups

    result = df.groupby(["beta", Grouper(level="alpha")])
    expected = df.groupby(["beta", "alpha"])
    assert result.groups == expected.groups


@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
    # GH 27614
    df = DataFrame(
        np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
    )
    df.index.name = "y"
    df.columns.name = "x"

    # axis=1 grouping must match transposing, grouping, transposing back
    results = df.groupby(group_name, axis=1).sum()
    expected = df.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)

    # test on MI column
    iterables = [["bar", "baz", "foo"], ["one", "two"]]
    mi = MultiIndex.from_product(iterables=iterables, names=["x", "x1"])
    df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi)
    results = df.groupby(group_name, axis=1).sum()
    expected = df.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)


@pytest.mark.parametrize(
    "op, expected",
    [
        (
            "shift",
            {
                "time": [
                    None,
                    None,
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    None,
                    None,
                ]
            },
        ),
        (
            "bfill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
        (
            "ffill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
    ],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
    # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
    tz = tz_naive_fixture
    data = {
        "id": ["A", "B", "A", "B", "A", "B"],
        "time": [
            Timestamp("2019-01-01 12:00:00"),
            Timestamp("2019-01-01 12:30:00"),
            None,
            None,
            Timestamp("2019-01-01 14:00:00"),
            Timestamp("2019-01-01 14:30:00"),
        ],
    }
    df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))

    grouped = df.groupby("id")
    result = getattr(grouped, op)()
    expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
    tm.assert_frame_equal(result, expected)


def test_groupby_only_none_group():
    # see GH21624
    # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
    df = DataFrame({"g": [None], "x": 1})
    actual = df.groupby("g")["x"].transform("sum")
    expected = Series([np.nan], name="x")

    tm.assert_series_equal(actual, expected)


def test_groupby_duplicate_index():
    # GH#29189 the groupby call here used to raise
    ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])
    gb = ser.groupby(level=0)

    result = gb.mean()
    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_bool_aggs_dup_column_labels(bool_agg_func):
    # 21668
    df = DataFrame([[True, True]], columns=["a", "a"])
    grp_by = df.groupby([0])
    result = getattr(grp_by, bool_agg_func)()

    # duplicate column labels must survive any()/all()
    expected = df
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))]
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_dup_labels_output_shape(groupby_func, idx):
    # Every groupby method must preserve the shape of a frame whose
    # columns carry duplicate labels.
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip("Not applicable")

    df = DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    # some methods need positional arguments to be callable at all
    args = []
    if groupby_func in {"fillna", "nth"}:
        args.append(0)
    elif groupby_func == "corrwith":
        args.append(df)
    elif groupby_func == "tshift":
        df.index = [Timestamp("today")]
        args.extend([1, "D"])

    result = getattr(grp_by, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)


def test_groupby_crash_on_nunique(axis):
    # Fix following 30253
    df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})

    axis_number = df._get_axis_number(axis)
    if not axis_number:
        df = df.T

    result = df.groupby(axis=axis_number, level=0).nunique()

    expected = DataFrame({"A": [1, 2], "D": [1, 1]})
    if not axis_number:
        expected = expected.T

    tm.assert_frame_equal(result, expected)


def test_groupby_list_level():
    # GH 9790
    expected = DataFrame(np.arange(0, 9).reshape(3, 3))
    # a one-element list of levels behaves like the scalar level; with a
    # unique index every group is a single row, so mean() is the identity
    result = expected.groupby(level=[0]).mean()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    # GH 1135
    df = DataFrame(np.random.randn(5, 1))
    df["a"] = df.index

    # .groups repr honors the display.max_seq_items option
    with pd.option_context("display.max_seq_items", max_seq_items):
        result = df.groupby("a").groups.__repr__()
        assert result == expected

        result = df.groupby(np.array(df.a)).groups.__repr__()
        assert result == expected


def test_group_on_two_row_multiindex_returns_one_tuple_key():
    # GH 18451
    df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
    df = df.set_index(["a", "b"])

    grp = df.groupby(["a", "b"])
    result = grp.indices
    expected = {(1, 2): np.array([0, 1], dtype=np.int64)}

    assert len(result) == 1
    key = (1, 2)
    assert (result[key] == expected[key]).all()


@pytest.mark.parametrize(
    "klass, attr, value",
    [
        (DataFrame, "level", "a"),
        (DataFrame, "as_index", False),
        (DataFrame, "sort", False),
        (DataFrame, "group_keys", False),
        (DataFrame, "squeeze", True),
        (DataFrame, "observed", True),
        (DataFrame, "dropna", False),
        pytest.param(
            Series,
            "axis",
            1,
            marks=pytest.mark.xfail(
                reason="GH 35443: Attribute currently not passed on to series"
            ),
        ),
        (Series, "level", "a"),
        (Series, "as_index", False),
        (Series, "sort", False),
        (Series, "group_keys", False),
        (Series, "squeeze", True),
        (Series, "observed", True),
        (Series, "dropna", False),
    ],
)
@pytest.mark.filterwarnings(
    "ignore:The `squeeze` parameter is deprecated:FutureWarning"
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
    # GH 9959 - When subsetting columns, don't drop attributes
    df = DataFrame({"a": [1], "b": [2], "c": [3]})
    if attr != "axis":
        df = df.set_index("a")

    expected = df.groupby("a", **{attr: value})
    result = expected[["b"]] if klass is DataFrame else expected["b"]
    assert getattr(result, attr) == getattr(expected, attr)


def test_subsetting_columns_axis_1():
    # GH 37725
    g = DataFrame({"A": [1], "B": [2], "C": [3]}).groupby([0, 0, 1], axis=1)
    match = "Cannot subset columns when using axis=1"
    with pytest.raises(ValueError, match=match):
        g[["A", "B"]].sum()


@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    # GH: 29764 groupby loses index sometimes
    expected = Index(["a"], name="idx")
    df = DataFrame([[1]], columns=expected)
    df_grouped = df.groupby([1])
    result = getattr(df_grouped, func)().columns
    tm.assert_index_equal(result, expected)


def test_groupby_duplicate_columns():
    # GH: 31735
    df = DataFrame(
        {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
    ).astype(object)
    df.columns = ["A", "B", "B"]
    result = df.groupby([0, 0, 0, 0]).min()
    expected = DataFrame([["e", "a", 1]], columns=["A", "B", "B"])
    tm.assert_frame_equal(result, expected)


def test_groupby_series_with_tuple_name():
    # GH 37755
    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=("a", "a"))
    ser.index.name = ("b", "b")
    result = ser.groupby(level=0).last()
    expected = Series([2, 4], index=[1, 2], name=("a", "a"))
    expected.index.name = ("b", "b")
    tm.assert_series_equal(result, expected)