1from datetime import datetime
2from decimal import Decimal
3from io import StringIO
4
5import numpy as np
6import pytest
7
8from pandas.errors import PerformanceWarning
9
10import pandas as pd
11from pandas import (
12    DataFrame,
13    Grouper,
14    Index,
15    MultiIndex,
16    Series,
17    Timestamp,
18    date_range,
19    read_csv,
20)
21import pandas._testing as tm
22from pandas.core.base import SpecificationError
23import pandas.core.common as com
24
25
def test_repr():
    # GH18203: Grouper repr shows key, level, axis and sort
    grouper = Grouper(key="A", level="B")
    assert repr(grouper) == "Grouper(key='A', level='B', axis=0, sort=False)"
31
32
@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):
    """Smoke-test the core Series.groupby API across several numeric dtypes."""

    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    # shuffle so group membership does not follow positional order
    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3)

    # three groups of three elements each
    for k, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    tm.assert_series_equal(agged, grouped.mean())
    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())

    # transform preserves the original index, matching an equivalent apply
    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    # grouping by the values themselves
    value_grouped = data.groupby(data)
    tm.assert_series_equal(
        value_grouped.aggregate(np.mean), agged, check_index_type=False
    )

    # complex agg
    agged = grouped.aggregate([np.mean, np.std])

    # renaming via a nested dict is disallowed
    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    # the aggregating function can see the group label via x.name
    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
80
81
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Grouping by an integer codes array matches grouping by its object cast."""
    codes = mframe.index.codes[0]
    result = mframe.groupby(codes).sum()
    expected = mframe.groupby(codes.astype("O")).sum()
    tm.assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df["value"] = range(len(df))

    def max_value(group):
        # row carrying the largest "value" within this group
        return group.loc[group["value"].idxmax()]

    result = df.groupby("A").apply(max_value).dtypes
    expected = Series(
        [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
        index=["A", "B", "C", "D", "value"],
    )
    tm.assert_series_equal(result, expected)
104
105
def test_groupby_return_type():
    """squeeze=True apply should reduce to a Series (GH2893, GH3596).

    Fix: the original defined the identical local ``func`` twice; it is now
    defined once and reused for both frames.
    """

    def func(dataf):
        # demean "val2" within each group
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    df1 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 2, "val2": 27},
            {"val1": 2, "val2": 12},
        ]
    )

    with tm.assert_produces_warning(FutureWarning):
        result = df1.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    df2 = DataFrame(
        [
            {"val1": 1, "val2": 20},
            {"val1": 1, "val2": 19},
            {"val1": 1, "val2": 27},
            {"val1": 1, "val2": 12},
        ]
    )

    with tm.assert_produces_warning(FutureWarning):
        result = df2.groupby("val1", squeeze=True).apply(func)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=["X", "Y"])
    with tm.assert_produces_warning(FutureWarning):
        result = df.groupby("X", squeeze=False).count()
    assert isinstance(result, DataFrame)
146
147
def test_inconsistent_return_type():
    # GH5592
    # inconsistent return type
    df = DataFrame(
        {
            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            "B": Series(np.arange(7), dtype="int64"),
            "C": date_range("20130101", periods=7),
        }
    )

    # baseline: apply returning each group's first row matches .first()
    def f(grp):
        return grp.iloc[0]

    expected = df.groupby("A").first()[["B"]]
    result = df.groupby("A").apply(f)[["B"]]
    tm.assert_frame_equal(result, expected)

    # returning None for one group leaves a NaN row for that group
    def f(grp):
        if grp.name == "Tiger":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["B"]]
    e = expected.copy()
    e.loc["Tiger"] = np.nan
    tm.assert_frame_equal(result, e)

    # same, dropping a different group
    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["B"]]
    e = expected.copy()
    e.loc["Pony"] = np.nan
    tm.assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f)[["C"]]
    e = df.groupby("A").first()[["C"]]
    # datetime column gets NaT for the skipped group
    e.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, e)

    # scalar outputs
    def f(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    result = df.groupby("A").apply(f)
    e = df.groupby("A").first()["C"].copy()
    e.loc["Pony"] = np.nan
    e.name = None
    tm.assert_series_equal(result, e)
208
209
def test_pass_args_kwargs(ts, tsframe):
    """Positional and keyword args flow through agg/apply/transform."""

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series: pass q positionally
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(0.8)
    trans_expected = ts_grouped.transform(g)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # same three paths, passing q by keyword this time
    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame
    df_grouped = tsframe.groupby(lambda x: x.month)
    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
    apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
    expected = df_grouped.quantile(0.8)
    tm.assert_frame_equal(apply_result, expected, check_names=False)
    tm.assert_frame_equal(agg_result, expected)

    agg_result = df_grouped.agg(f, q=80)
    apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
    tm.assert_frame_equal(agg_result, expected)
    tm.assert_frame_equal(apply_result, expected, check_names=False)
248
249
def test_len():
    """len(groupby) is the number of distinct keys; NaN keys form no group."""
    df = tm.makeTimeDataFrame()
    by_ymd = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    # every date is unique, so one group per row
    assert len(by_ymd) == len(df)

    by_ym = df.groupby([lambda x: x.year, lambda x: x.month])
    n_unique = len({(stamp.year, stamp.month) for stamp in df.index})
    assert len(by_ym) == n_unique

    # issue 11016: an all-NaN key yields zero groups
    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
    assert len(df.groupby("a")) == 0
    assert len(df.groupby("b")) == 3
    assert len(df.groupby(["a", "b"])) == 3
264
265
def test_basic_regression():
    # regression: grouping by a float-valued Series key must not blow up
    values = Series([float(v) for v in list(range(1, 10)) * 10])

    keys = Series(np.random.random(1100) * 10.0)

    values.groupby(keys).mean()
275
276
@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    """NaN labels are excluded from the grouping."""
    idx = Index(np.arange(10))
    values = Series(np.ones(10), idx, dtype=dtype)
    labels = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=idx,
    )

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def float_len(x):
        return float(len(x))

    agged = grouped.agg(float_len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)
    assert issubclass(agged.dtype.type, np.dtype(dtype).type)
306
307
def test_indices_concatenation_order():

    # GH 2808
    # f1/f2/f3 return differently-shaped results for empty groups; only f1,
    # whose empty result has the same MultiIndex shape as non-empty groups,
    # can be concatenated.

    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            # empty frame with the same ("b", "c") MultiIndex as non-empty results
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
            res = DataFrame(columns=["a"], index=multiindex)
            return res
        else:
            y = y.set_index(["b", "c"])
            return y

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            # flat empty frame: wrong number of index levels
            return DataFrame()
        else:
            y = y.set_index(["b", "c"])
            return y

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            # MultiIndex with different names and an extra column
            multiindex = MultiIndex(
                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
            )
            res = DataFrame(columns=["a", "b"], index=multiindex)
            return res
        else:
            return y

    # group a=1 in df (b=0 only) produces an empty result; df2's is a=3
    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    # correct result
    result1 = df.groupby("a").apply(f1)
    result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f3)
362
363
def test_attr_wrapper(ts):
    grouped = ts.groupby(lambda x: x.weekday())

    # the std method matches an explicit ddof=1 aggregation
    tm.assert_series_equal(grouped.std(), grouped.agg(lambda x: np.std(x, ddof=1)))

    # this is pretty cool
    result = grouped.describe()
    expected = DataFrame({name: gp.describe() for name, gp in grouped}).T
    tm.assert_frame_equal(result, expected)

    # attribute access is forwarded as an aggregation
    tm.assert_series_equal(grouped.dtype, grouped.agg(lambda x: x.dtype))

    # unknown attributes still raise
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, "foo")
386
387
def test_frame_groupby(tsframe):
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate: one row per weekday, all four columns kept
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string: grouping on a materialized weekday column gives the same result
    tscopy = tsframe.copy()
    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby("weekday").aggregate(np.mean)
    tm.assert_frame_equal(stragged, aggregated, check_names=False)

    # transform: result keeps the original row count and columns
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate: every row carries its group's mean
    transformed = grouped.transform(lambda x: x.mean())
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)

    # iterate: the group key is the weekday of every member row
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    # groups maps key -> index labels; indices maps key -> positional locations
    for k, v in groups.items():
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()
426
427
def test_frame_groupby_columns(tsframe):
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate: four columns collapse into two groups, row count unchanged
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform: grouping the transpose along axis=0 is equivalent
    demean = lambda x: x - x.mean()
    transposed = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(transposed.transform(demean).T, grouped.transform(demean))

    # iterate
    for _, piece in grouped:
        assert len(piece.columns) == 2
445
446
def test_frame_set_name_single(df):
    """Every aggregation path should carry the key name on the result index."""
    grouped = df.groupby("A")

    assert grouped.mean().index.name == "A"
    # with as_index=False the key is a column, not the index
    assert df.groupby("A", as_index=False).mean().index.name != "A"

    assert grouped.agg(np.mean).index.name == "A"
    assert grouped.agg({"C": np.mean, "D": np.std}).index.name == "A"

    assert grouped["C"].mean().index.name == "A"
    assert grouped["C"].agg(np.mean).index.name == "A"
    assert grouped["C"].agg([np.mean, np.std]).index.name == "A"

    # dict renaming on a selected column is disallowed
    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": np.mean, "bar": np.std})
472
473
def test_multi_func(df):
    # grouping by the columns' .get callables matches grouping by column name
    grouped = df.groupby([df["A"].get, df["B"].get])
    agged = grouped.mean()
    expected = df.groupby(["A", "B"]).mean()

    # TODO groupby get drops names
    tm.assert_frame_equal(
        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
    )

    # some "groups" with no data
    df = DataFrame(
        {
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    # only verify that it works for now
    df.groupby(["k1", "k2"]).agg(np.sum)
500
501
502def test_multi_key_multiple_functions(df):
503    grouped = df.groupby(["A", "B"])["C"]
504
505    agged = grouped.agg([np.mean, np.std])
506    expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)})
507    tm.assert_frame_equal(agged, expected)
508
509
def test_frame_multi_key_function_list():
    """A list of functions on a multi-key groupby yields hierarchical columns
    matching column-by-column aggregation."""
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    # build the expectation per column and glue along axis=1
    expected = pd.concat(
        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
        keys=["D", "E", "F"],
        axis=1,
    )
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)
569
570
@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Multi-key groupby matches nested single-key groupbys."""
    data = df
    grouped = data.groupby(["A", "B"])

    result1 = op(grouped)

    # build the expected result from nested single-key groupbys
    keys = []
    values = []
    for n1, gp1 in data.groupby("A"):
        for n2, gp2 in gp1.groupby("B"):
            keys.append((n1, n2))
            values.append(op(gp2.loc[:, ["C", "D"]]))

    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
    expected = pd.concat(values, axis=1).T
    expected.index = mi

    # a little bit crude
    for col in ["C", "D"]:
        result_col = op(grouped[col])
        pivoted = result1[col]
        exp = expected[col]
        tm.assert_series_equal(result_col, exp)
        tm.assert_series_equal(pivoted, exp)

    # test single series works the same
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]

    tm.assert_series_equal(result, expected)
602
603
def test_as_index_select_column():
    # GH 5764
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])

    result = df.groupby("A", as_index=False)["B"].get_group(1)
    tm.assert_series_equal(result, Series([2, 4], name="B"))

    result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum())
    expected = Series(
        [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    )
    tm.assert_series_equal(result, expected)
616
617
def test_groupby_as_index_select_column_sum_empty_df():
    # GH 35246: summing a selected column of an empty frame keeps both columns
    df = DataFrame(columns=["A", "B", "C"])
    result = df.groupby(by="A", as_index=False)["B"].sum()

    assert type(result) is DataFrame
    assert result.to_dict() == {"A": {}, "B": {}}
624
625
def test_groupby_as_index_agg(df):
    grouped = df.groupby("A", as_index=False)

    # single-key

    # generic agg matches the named method
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    # per-column function dict
    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    grouped = df.groupby("A", as_index=True)

    # renaming via a dict on a selected column is disallowed with as_index=True
    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"Q": np.sum})

    # multi-key

    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    # with as_index=False the dict rename on a selected column is allowed
    expected3 = grouped["C"].sum()
    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
    result3 = grouped["C"].agg({"Q": np.sum})
    tm.assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"])
    ts = Series(np.random.randint(5, 10, 50), name="jim")

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    # as_index=False result equals as_index=True result with the index reset
    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        tm.assert_frame_equal(left, right)
680
681
def test_ops_not_as_index(reduction_func):
    # GH 10355, 21090
    # Using as_index=False should not modify grouped column

    if reduction_func in ("corrwith",):
        pytest.skip("Test not applicable")

    if reduction_func in ("nth", "ngroup"):
        pytest.skip("Skip until behavior is determined (GH #5755)")

    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size")
    expected = expected.reset_index()

    g = df.groupby("a", as_index=False)

    # the named method and agg, on both the frame and the selected column
    results = (
        getattr(g, reduction_func)(),
        g.agg(reduction_func),
        getattr(g["b"], reduction_func)(),
        g["b"].agg(reduction_func),
    )
    for result in results:
        tm.assert_frame_equal(result, expected)
711
712
def test_as_index_series_return_frame(df):
    """Selecting a single column still returns a DataFrame when as_index=False."""
    grouped = df.groupby("A", as_index=False)
    grouped2 = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result = grouped2["C"].agg(np.sum)
    expected = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result = grouped["C"].sum()
    expected = grouped.sum().loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result = grouped2["C"].sum()
    expected = grouped2.sum().loc[:, ["A", "B", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)
736
737
def test_as_index_series_column_slice_raises(df):
    # GH15072: a second selection after a column selection is rejected
    grouped = df.groupby("A", as_index=False)

    msg = r"Column\(s\) C already selected"
    with pytest.raises(IndexError, match=msg):
        grouped["C"]["D"]
745
746
def test_groupby_as_index_cython(df):
    # single-key: as_index=False equals as_index=True with the key re-inserted
    result = df.groupby("A", as_index=False).mean()
    expected = df.groupby(["A"]).mean()
    expected.insert(0, "A", expected.index)
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)

    # multi-key: re-insert both levels as leading columns
    result = df.groupby(["A", "B"], as_index=False).mean()
    expected = df.groupby(["A", "B"]).mean()

    levels = list(zip(*expected.index.values))
    expected.insert(0, "A", levels[0])
    expected.insert(1, "B", levels[1])
    expected.index = np.arange(len(expected))
    tm.assert_frame_equal(result, expected)
768
769
def test_groupby_as_index_series_scalar(df):
    # GH #421: scalar agg on a selected column keeps the key columns
    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(len)
    expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
    tm.assert_frame_equal(result, expected)
778
779
def test_groupby_as_index_corner(df, ts):
    # as_index=False is a DataFrame-only, axis=0-only option
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
788
789
def test_groupby_multiple_key(df):
    """Grouping by several key functions on either axis reproduces the frame.

    Fix: the trailing agg/assert pair duplicated the one two lines above and
    has been removed.
    """
    # NOTE(review): the ``df`` fixture argument is immediately shadowed below;
    # it is kept only so the test's fixture signature stays unchanged.
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    agged = grouped.sum()
    # every (year, month, day) key is unique, so the "aggregate" is the frame
    tm.assert_almost_equal(df.values, agged.values)

    grouped = df.T.groupby(
        [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
    )

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    tm.assert_almost_equal(df.T.values, agged.values)
806
807
808def test_groupby_multi_corner(df):
809    # test that having an all-NA column doesn't mess you up
810    df = df.copy()
811    df["bad"] = np.nan
812    agged = df.groupby(["A", "B"]).mean()
813
814    expected = df.groupby(["A", "B"]).mean()
815    expected["bad"] = np.nan
816
817    tm.assert_frame_equal(agged, expected)
818
819
def test_omit_nuisance(df):
    grouped = df.groupby("A")

    # the non-numeric "B" column is silently dropped from the mean
    result = grouped.mean()
    expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
    tm.assert_frame_equal(result, expected)

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    tm.assert_frame_equal(agged, exp)

    # a datetime column is likewise treated as a nuisance for sum
    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    grouped = df.groupby("A")
    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_frame_equal(result, expected)

    # won't work with axis = 1
    grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    msg = "reduction operation 'sum' not allowed for this dtype"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))
843
844
def test_omit_nuisance_sem(df):
    # GH 38774 - sem should work with nuisance columns
    result = df.groupby("A").sem()
    expected = df.loc[:, ["A", "C", "D"]].groupby("A").sem()
    tm.assert_frame_equal(result, expected)
851
852
def test_omit_nuisance_python_multiple(three_group):
    # python-level aggregation drops non-numeric columns, just like mean()
    grouped = three_group.groupby(["A", "B"])
    tm.assert_frame_equal(grouped.agg(np.mean), grouped.mean())
859
860
def test_empty_groups_corner(mframe):
    # handle empty groups
    df = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
        }
    )

    grouped = df.groupby(["k1", "k2"])
    tm.assert_frame_equal(grouped.agg(np.mean), grouped.mean())

    # apply on a 2-row slice of the MultiIndex frame keeps the level name
    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped["A"].apply(np.mean)
    tm.assert_series_equal(agged["A"], agged_A)
    assert agged.index.name == "first"
883
884
def test_nonsense_func():
    # a mapper that raises on the index values propagates the error
    frame = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
    with pytest.raises(TypeError, match=msg):
        frame.groupby(lambda x: x + "foo")
890
891
def test_wrap_aggregated_output_multindex(mframe):
    df = mframe.T
    df["baz", "two"] = "peekaboo"

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    # hierarchical columns survive aggregation
    result = df.groupby(keys).agg(np.mean)
    assert isinstance(result.columns, MultiIndex)

    def aggfun(ser):
        # simulate one column the aggregation cannot handle
        if ser.name == ("foo", "one"):
            raise TypeError
        else:
            return ser.sum()

    # the offending column is dropped from the result
    result = df.groupby(keys).aggregate(aggfun)
    assert len(result.columns) + 1 == len(df.columns)
908
909
910def test_groupby_level_apply(mframe):
911
912    result = mframe.groupby(level=0).count()
913    assert result.index.name == "first"
914    result = mframe.groupby(level=1).count()
915    assert result.index.name == "second"
916
917    result = mframe["A"].groupby(level=0).count()
918    assert result.index.name == "first"
919
920
def test_groupby_level_mapper(mframe):
    deleveled = mframe.reset_index()

    mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
    mapper1 = {"one": 0, "two": 0, "three": 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    # applying the mapper by hand to the flattened labels must agree
    mapped_level0 = np.array([mapper0.get(x) for x in deleveled["first"]])
    mapped_level1 = np.array([mapper1.get(x) for x in deleveled["second"]])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name = "first"
    expected1.index.name = "second"

    tm.assert_frame_equal(result0, expected0)
    tm.assert_frame_equal(result1, expected1)
938
939
def test_groupby_level_nonmulti():
    # GH 1313, GH 13901
    # on a flat index, level 0 (or -1, or the singleton lists) is allowed
    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    # out-of-range level numbers are rejected
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    # an empty level list is rejected
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    # multiple levels are rejected on a flat index
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])
970
971
def test_groupby_complex():
    # GH 12902: complex values survive a grouped sum
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    tm.assert_series_equal(a.groupby(level=0).sum(), expected)
    tm.assert_series_equal(a.sum(level=0), expected)
982
983
def test_groupby_series_indexed_differently():
    s1 = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    s2 = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    # grouping by a differently-indexed Series aligns it to s1's index first
    result = s1.groupby(s2).mean()
    expected = s1.groupby(s2.reindex(s1.index).get).mean()
    tm.assert_series_equal(result, expected)
997
998
def test_groupby_with_hier_columns():
    """Hierarchical (MultiIndex) columns survive groupby reductions intact."""
    tuples = list(
        zip(
            *[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ]
        )
    )
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples(
        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
    )
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    # reductions along axis=0 keep the full column MultiIndex
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    # grouping along axis=1 keeps the row index untouched
    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    # axis=1 grouping collapses the columns to their first level only
    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(["A", "B"]))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df["A", "foo"] = "bar"
    # NOTE(review): relies on mean() silently dropping the non-numeric column;
    # later pandas raise for non-numeric data — confirm the target version.
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])
1035
1036
1037def test_grouping_ndarray(df):
1038    grouped = df.groupby(df["A"].values)
1039
1040    result = grouped.sum()
1041    expected = df.groupby("A").sum()
1042    tm.assert_frame_equal(
1043        result, expected, check_names=False
1044    )  # Note: no names when grouping by value
1045
1046
def test_groupby_wrong_multi_labels():
    """agg(np.mean) and .mean() agree when grouping by several label columns."""
    csv = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    frame = read_csv(StringIO(csv), index_col=0)
    grouped = frame.groupby(["foo", "bar", "baz", "spam"])

    tm.assert_frame_equal(grouped.agg(np.mean), grouped.mean())
1062
1063
1064def test_groupby_series_with_name(df):
1065    result = df.groupby(df["A"]).mean()
1066    result2 = df.groupby(df["A"], as_index=False).mean()
1067    assert result.index.name == "A"
1068    assert "A" in result2
1069
1070    result = df.groupby([df["A"], df["B"]]).mean()
1071    result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
1072    assert result.index.names == ("A", "B")
1073    assert "A" in result2
1074    assert "B" in result2
1075
1076
1077def test_seriesgroupby_name_attr(df):
1078    # GH 6265
1079    result = df.groupby("A")["C"]
1080    assert result.count().name == "C"
1081    assert result.mean().name == "C"
1082
1083    testFunc = lambda x: np.sum(x) * 2
1084    assert result.agg(testFunc).name == "C"
1085
1086
def test_consistency_name():
    """df.groupby(key).col and df.col.groupby(df.key) give the same result. (GH 12363)"""
    frame = DataFrame(
        {
            "A": "foo bar foo bar foo bar foo foo".split(),
            "B": "one one two two two two one two".split(),
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    result = frame.B.groupby(frame.A).count()
    expected = frame.groupby(["A"]).B.count()
    tm.assert_series_equal(result, expected)
1102
1103
1104def test_groupby_name_propagation(df):
1105    # GH 6124
1106    def summarize(df, name=None):
1107        return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)
1108
1109    def summarize_random_name(df):
1110        # Provide a different name for each Series.  In this case, groupby
1111        # should not attempt to propagate the Series name since they are
1112        # inconsistent.
1113        return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])
1114
1115    metrics = df.groupby("A").apply(summarize)
1116    assert metrics.columns.name is None
1117    metrics = df.groupby("A").apply(summarize, "metrics")
1118    assert metrics.columns.name == "metrics"
1119    metrics = df.groupby("A").apply(summarize_random_name)
1120    assert metrics.columns.name is None
1121
1122
def test_groupby_nonstring_columns():
    """An integer column label is a valid groupby key."""
    frame = DataFrame([np.arange(10) for _ in range(10)])
    tm.assert_frame_equal(frame.groupby(0).mean(), frame.groupby(frame[0]).mean())
1129
1130
def test_groupby_mixed_type_columns():
    """first()/sum() work when column labels mix strings and ints. (GH 13432)"""
    frame = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    grouped = frame.groupby("A")
    for reduction in ("first", "sum"):
        tm.assert_frame_equal(getattr(grouped, reduction)(), expected)
1141
1142
# TODO: Ensure warning isn't emitted in the first place
@pytest.mark.filterwarnings("ignore:Mean of:RuntimeWarning")
def test_cython_grouper_series_bug_noncontig():
    """Aggregating a non-contiguous all-NaN Series yields all-NaN output."""
    buf = np.full((100, 100), np.nan)
    ser = Series(buf[:, 0])  # a column slice is a non-contiguous view
    labels = np.tile(range(10), 10)

    result = ser.groupby(labels).agg(Series.median)
    assert result.isna().all()
1153
1154
def test_series_grouper_noncontig_index():
    """Grouping a Series built from a non-contiguous index slice must not crash."""
    # NOTE(review): tm.rands_array is a private testing helper removed in later
    # pandas — confirm availability for the target version.
    index = Index(tm.rands_array(10, 100))

    # every other label -> the Series' index is a non-contiguous view
    values = Series(np.random.randn(50), index=index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    f = lambda x: len(set(map(id, x.index)))
    grouped.agg(f)
1167
1168
def test_convert_objects_leave_decimal_alone():
    """Aggregations returning Decimal keep object dtype; no numeric coercion."""
    ser = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")
    grouped = ser.groupby(labels)

    def via_fast_path(group):
        return Decimal(str(group.mean()))

    def via_pure_python(group):
        # a non-empty .base means the values are a view on another array,
        # which forces the pure-python aggregation path
        assert len(group.values.base) > 0
        return Decimal(str(group.mean()))

    for agg_func in (via_fast_path, via_pure_python):
        result = grouped.agg(agg_func)
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)
1191
1192
def test_groupby_dtype_inference_empty():
    """Grouping an empty frame must not change column dtypes."""
    # GH 6733
    # NOTE(review): relies on an empty list inferring float64; later pandas
    # default empty Series to object dtype — confirm the target version.
    df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert df["x"].dtype == np.float64

    result = df.groupby("x").first()
    exp_index = Index([], name="x", dtype=np.float64)
    expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
    # by_blocks compares the underlying blocks, catching silent dtype changes
    tm.assert_frame_equal(result, expected, by_blocks=True)
1202
1203
def test_groupby_unit64_float_conversion():
    """max() keeps uint64 precision instead of a lossy float round-trip. (GH 30859)"""
    # the value exceeds int64 range, so the column is uint64
    frame = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})

    result = frame.groupby(["first", "second"])["value"].max()

    expected_index = MultiIndex.from_product([[1], [1]], names=["first", "second"])
    expected = Series([16148277970000000000], expected_index, name="value")
    tm.assert_series_equal(result, expected)
1214
1215
1216def test_groupby_list_infer_array_like(df):
1217    result = df.groupby(list(df["A"])).mean()
1218    expected = df.groupby(df["A"]).mean()
1219    tm.assert_frame_equal(result, expected, check_names=False)
1220
1221    with pytest.raises(KeyError, match=r"^'foo'$"):
1222        df.groupby(list(df["A"][:-1]))
1223
1224    # pathological case of ambiguity
1225    df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)})
1226
1227    result = df.groupby(["foo", "bar"]).mean()
1228    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
1229
1230
def test_groupby_keys_same_size_as_index():
    """A frequency Grouper plus a column key, where len(keys) == len(index)."""
    # GH 11185
    freq = "s"
    index = pd.date_range(
        start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
    )
    df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
    # every (second, metric) pair is unique, so mean() is an identity reshape
    result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
    # NOTE(review): the expected frame keeps the original dtypes while mean()
    # normally produces float64 — dtype agreement here is version sensitive;
    # later pandas add .astype(float) to this expectation. Confirm.
    expected = df.set_index([df.index, "metric"])

    tm.assert_frame_equal(result, expected)
1242
1243
def test_groupby_one_row():
    """A missing key raises KeyError for 1-row and multi-row frames alike. (GH 11741)"""
    msg = r"^'Z'$"
    for nrows in (1, 2):
        frame = DataFrame(np.random.randn(nrows, 4), columns=list("ABCD"))
        with pytest.raises(KeyError, match=msg):
            frame.groupby("Z")
1253
1254
def test_groupby_nat_exclude():
    """Rows whose group key is NaT/NaN are excluded from the groups entirely."""
    # GH 6992
    df = DataFrame(
        {
            "values": np.random.randn(8),
            "dt": [
                np.nan,
                Timestamp("2013-01-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-01-01"),
            ],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    grouped = df.groupby("dt")

    # only the two real timestamps form groups; the NaT rows are dropped
    expected = [Index([1, 7]), Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for k, e in zip(keys, expected):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[k], e)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected = {
        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
    }

    for k in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])

    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]])
    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]])

    # NaT is never a retrievable group key
    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    # columns that are entirely null produce zero groups
    nan_df = DataFrame(
        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
    )
    assert nan_df["nan"].dtype == "float64"
    assert nan_df["nat"].dtype == "datetime64[ns]"

    for key in ["nan", "nat"]:
        grouped = nan_df.groupby(key)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            grouped.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            grouped.get_group(pd.NaT)
1316
1317
def test_groupby_two_group_keys_all_nan():
    """All-NaN keys over two group columns drop every row without raising. (GH 36842)"""
    frame = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
    assert frame.groupby(["a", "b"]).indices == {}
1323
1324
1325def test_groupby_2d_malformed():
1326    d = DataFrame(index=range(2))
1327    d["group"] = ["g1", "g2"]
1328    d["zeros"] = [0, 0]
1329    d["ones"] = [1, 1]
1330    d["label"] = ["l1", "l2"]
1331    tmp = d.groupby(["group"]).mean()
1332    res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
1333    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
1334    tm.assert_numpy_array_equal(tmp.values, res_values)
1335
1336
def test_int32_overflow():
    """Group-key factorization must not overflow int32 on wide key spaces."""
    codes = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
    row_ids = np.arange(25000)
    frame = DataFrame(
        {"A": row_ids, "B": codes, "C": row_ids, "D": codes, "E": np.random.randn(25000)}
    )

    # the same grouping in both key orders must see the same group count
    forward = frame.groupby(["A", "B", "C", "D"]).sum()
    backward = frame.groupby(["D", "C", "B", "A"]).sum()
    assert len(forward) == len(backward)
1345
1346
def test_groupby_sort_multi():
    """sort=True orders the result index lexicographically by the key tuples."""
    df = DataFrame(
        {
            "a": ["foo", "bar", "baz"],
            "b": [3, 2, 1],
            "c": [0, 1, 2],
            "d": np.random.randn(3),
        }
    )

    # sorted (a, b, c) order: bar (row 1) < baz (row 2) < foo (row 0)
    tups = [tuple(row) for row in df[["a", "b", "c"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["a", "b", "c"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    # keyed by c first: 0 < 1 < 2 is already the row order
    tups = [tuple(row) for row in df[["c", "a", "b"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["c", "a", "b"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    # keyed by b first: 1 (row 2) < 2 (row 1) < 3 (row 0)
    tups = [tuple(x) for x in df[["b", "c", "a"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["b", "c", "a"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    df = DataFrame(
        {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)}
    )
    grouped = df.groupby(["a", "b"])["d"]
    result = grouped.sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # compare against grouping by explicitly materialized key tuples
        tups = [tuple(row) for row in df[keys].values]
        tups = com.asarray_tuplesafe(tups)
        expected = f(df.groupby(tups)[field])
        for k, v in expected.items():
            assert result[k] == v

    _check_groupby(df, result, ["a", "b"], "d")
1386
1387
1388def test_dont_clobber_name_column():
1389    df = DataFrame(
1390        {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
1391    )
1392
1393    result = df.groupby("key").apply(lambda x: x)
1394    tm.assert_frame_equal(result, df)
1395
1396
def test_skip_group_keys():
    """group_keys=False keeps apply() output on the original index, without
    prepending the group keys."""

    # NOTE(review): tm.makeTimeDataFrame is a private helper removed in later
    # pandas — confirm availability for the target version.
    tsf = tm.makeTimeDataFrame()

    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by="A")[:3])

    # expected: the per-group top-3 slices concatenated, no group-key level
    pieces = [group.sort_values(by="A")[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_frame_equal(result, expected)

    # same check for a SeriesGroupBy
    grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_series_equal(result, expected)
1416
1417
1418def test_no_nonsense_name(float_frame):
1419    # GH #995
1420    s = float_frame["C"].copy()
1421    s.name = None
1422
1423    result = s.groupby(float_frame["A"]).agg(np.sum)
1424    assert result.name is None
1425
1426
def test_multifunc_sum_bug():
    """A dict agg mixing string and positional column labels keeps float dtype.
    (GH #1065)"""
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame["test"] = 0
    frame["fl"] = [1.3, 1.5, 1.6]

    agged = frame.groupby("test").agg({"fl": "sum", 2: "size"})
    assert agged["fl"].dtype == np.float64
1436
1437
1438def test_handle_dict_return_value(df):
1439    def f(group):
1440        return {"max": group.max(), "min": group.min()}
1441
1442    def g(group):
1443        return Series({"max": group.max(), "min": group.min()})
1444
1445    result = df.groupby("A")["C"].apply(f)
1446    expected = df.groupby("A")["C"].apply(g)
1447
1448    assert isinstance(result, Series)
1449    tm.assert_series_equal(result, expected)
1450
1451
1452@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
1453def test_set_group_name(df, grouper):
1454    def f(group):
1455        assert group.name is not None
1456        return group
1457
1458    def freduce(group):
1459        assert group.name is not None
1460        return group.sum()
1461
1462    def foo(x):
1463        return freduce(x)
1464
1465    grouped = df.groupby(grouper)
1466
1467    # make sure all these work
1468    grouped.apply(f)
1469    grouped.aggregate(freduce)
1470    grouped.aggregate({"C": freduce, "D": freduce})
1471    grouped.transform(f)
1472
1473    grouped["C"].apply(f)
1474    grouped["C"].aggregate(freduce)
1475    grouped["C"].aggregate([freduce, foo])
1476    grouped["C"].transform(f)
1477
1478
def test_group_name_available_in_inference_pass():
    """group.name is set even during apply's dtype-inference pass. (gh-15062)"""
    frame = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})

    seen = []

    def record_name(group):
        seen.append(group.name)
        return group.copy()

    frame.groupby("a", sort=False, group_keys=False).apply(record_name)

    # each group visited exactly once, in order of appearance
    assert seen == [0, 1, 2]
1493
1494
1495def test_no_dummy_key_names(df):
1496    # see gh-1291
1497    result = df.groupby(df["A"].values).sum()
1498    assert result.index.name is None
1499
1500    result = df.groupby([df["A"].values, df["B"].values]).sum()
1501    assert result.index.names == (None, None)
1502
1503
def test_groupby_sort_multiindex_series():
    """The sort flag is honored when grouping a MultiIndex Series by levels.
    (GH 9444)"""
    mi = MultiIndex(
        levels=[[1, 2], [1, 2]],
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=["a", "b"],
    )
    ser = Series([0, 1, 2, 3, 4, 5], index=mi)

    expected_index = MultiIndex(
        levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
    )
    expected = Series([0, 2, 4], index=expected_index)

    # sort=False keeps first-seen order; sort=True sorts the key tuples
    tm.assert_series_equal(ser.groupby(level=["a", "b"], sort=False).first(), expected)
    tm.assert_series_equal(
        ser.groupby(level=["a", "b"], sort=True).first(), expected.sort_index()
    )
1523
1524
def test_groupby_reindex_inside_function():
    """An aggregation function may index into its group without corrupting
    the aggregation of other groups."""

    periods = 1000
    ind = date_range(start="2012/1/1", freq="5min", periods=periods)
    df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)

    def agg_before(func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """

        def _func(data):
            d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna()
            if fix:
                # NOTE(review): this label lookup appears to exist only to
                # touch the group's index before aggregating; both the fixed
                # and unfixed closures must produce identical results —
                # confirm the original issue this guards against.
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({"high": agg_before(np.max)})
    closure_good = grouped.agg({"high": agg_before(np.max, True)})

    tm.assert_frame_equal(closure_bad, closure_good)
1551
1552
def test_groupby_multiindex_missing_pair():
    """Only observed level combinations appear in the aggregated result. (GH9049)"""
    frame = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    ).set_index(["group1", "group2"])

    result = frame.groupby(level=["group1", "group2"], sort=True).agg("sum")

    # ("b", "d") never occurs, so it must not be present in the output
    expected_index = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
    )
    expected = DataFrame([[2], [1], [5]], index=expected_index, columns=["value"])
    tm.assert_frame_equal(result, expected)
1572
1573
def test_groupby_multiindex_not_lexsorted():
    """Grouping works (with a PerformanceWarning) on a non-lexsorted MultiIndex."""
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    # NOTE(review): MultiIndex.is_lexsorted was removed in later pandas
    # (made private) — confirm the target version.
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(
        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
    )
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index="a", columns=["b", "c"], values="d"
    )
    not_lexsorted_df = not_lexsorted_df.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby("a").mean()
    # the non-lexsorted path is slower and is expected to warn
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby("a").mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame(
        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
    ).set_index(["x", "y"])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            result = df.groupby(level=level, sort=sort).apply(DataFrame.drop_duplicates)
            expected = df
            tm.assert_frame_equal(expected, result)

            result = (
                df.sort_index()
                .groupby(level=level, sort=sort)
                .apply(DataFrame.drop_duplicates)
            )
            expected = df.sort_index()
            tm.assert_frame_equal(expected, result)
1622
1623
def test_index_label_overlaps_location():
    """filter() selects rows by position even when index labels collide with
    positions (wake of GH5375)."""

    def check(frame):
        grouped = frame.groupby(list("ababb"))
        tm.assert_frame_equal(
            grouped.filter(lambda x: len(x) > 2), frame.iloc[[1, 3, 4]]
        )

        col = frame[0]
        col_grouped = col.groupby(list("ababb"))
        tm.assert_series_equal(
            col_grouped.filter(lambda x: len(x) > 2), col.take([1, 3, 4])
        )

    frame = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
    check(frame)

    # ... and again, with a generic Index of floats
    frame.index = frame.index.astype(float)
    check(frame)
1651
1652
def test_transform_doesnt_clobber_ints():
    """transform('mean') gives identical output for int and float key columns.
    (GH 7972)"""
    n = 6
    x = np.arange(n)
    int_keyed = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
    float_keyed = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})

    result = int_keyed.groupby("a").transform("mean")
    expected = float_keyed.groupby("a").transform("mean")
    tm.assert_frame_equal(result, expected)
1666
1667
@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    """groupby must leave the pre-sorted row order within each group intact.
    (Issue #8588 and #9651)"""

    frame = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    ).sort_values(by=sort_column)

    def assert_group_is_sorted(group):
        # a re-sort must be a no-op if the order was preserved
        tm.assert_frame_equal(group, group.sort_values(by=sort_column))

    frame.groupby(group_column).apply(assert_group_is_sorted)
1698
1699
def test_pivot_table_values_key_error():
    """pivot_table raises KeyError for a nonexistent values column (issue #14938)."""
    frame = DataFrame(
        {
            "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(),
            "thename": range(0, 20),
        }
    )

    dates = frame.set_index("eventDate").index
    frame["year"] = dates.year
    frame["month"] = dates.month

    with pytest.raises(KeyError, match="'badname'"):
        frame.reset_index().pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )
1716
1717
def test_empty_dataframe_groupby():
    """Grouping a completely empty frame yields an empty result with the
    grouping column as the (named) index."""
    # GH8093
    df = DataFrame(columns=["A", "B", "C"])

    result = df.groupby("A").sum()
    # NOTE(review): assumes empty columns aggregate to float64; empty-dtype
    # defaults changed in later pandas — confirm the target version.
    expected = DataFrame(columns=["B", "C"], dtype=np.float64)
    expected.index.name = "A"

    tm.assert_frame_equal(result, expected)
1727
1728
def test_tuple_as_grouping():
    """A tuple key names a single (tuple-labeled) column, never several keys.
    (GH 18314)"""
    frame = DataFrame(
        {
            ("a", "b"): [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    # without the ("a", "b") column in the selection, the tuple is missing
    with pytest.raises(KeyError, match=r"('a', 'b')"):
        frame[["a", "b", "c"]].groupby(("a", "b"))

    result = frame.groupby(("a", "b"))["c"].sum()
    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
    tm.assert_series_equal(result, expected)
1746
1747
def test_tuple_correct_keyerror():
    """A missing tuple key reports the whole tuple in the KeyError. (GH 18798)"""
    frame = DataFrame(
        1, index=range(3), columns=MultiIndex.from_product([[1, 2], [3, 4]])
    )
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        frame.groupby((7, 8)).mean()
1753
1754
def test_groupby_agg_ohlc_non_first():
    """'ohlc' works in an agg list even when it is not the first entry. (GH 21716)"""
    idx = pd.date_range("2018-01-01", periods=2, freq="D")
    frame = DataFrame([[1], [1]], columns=["foo"], index=idx)

    expected_columns = MultiIndex.from_tuples(
        (
            ("foo", "sum", "foo"),
            ("foo", "ohlc", "open"),
            ("foo", "ohlc", "high"),
            ("foo", "ohlc", "low"),
            ("foo", "ohlc", "close"),
        )
    )
    expected = DataFrame(
        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]], columns=expected_columns, index=idx
    )

    result = frame.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])
    tm.assert_frame_equal(result, expected)
1780
1781
def test_groupby_multiindex_nat():
    """NaT entries in an unused MultiIndex level do not disturb grouping by
    another level. (GH 9236)"""
    values = [
        (pd.NaT, "a"),
        (datetime(2012, 1, 2), "a"),
        (datetime(2012, 1, 2), "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    mi = MultiIndex.from_tuples(values, names=["date", None])
    ser = Series([3, 2, 2.5, 4], index=mi)

    result = ser.groupby(level=1).mean()
    tm.assert_series_equal(result, Series([3.0, 2.5], index=["a", "b"]))
1796
1797
def test_groupby_empty_list_raises():
    """Grouping by [[]] raises a length-mismatch ValueError. (GH 5289)"""
    rows = list(zip(range(10), range(10)))
    frame = DataFrame(rows, columns=["apple", "b"])
    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        frame.groupby([[]])
1805
1806
def test_groupby_multiindex_series_keys_len_equal_group_axis():
    """Level-name keys resolve as index levels even when len(keys) happens to
    equal len(axis). (GH 25704)"""
    mi = MultiIndex.from_arrays(
        [["x", "x"], ["a", "b"], ["k", "k"]], names=["first", "second", "third"]
    )
    ser = Series(data=[1, 2], index=mi)

    result = ser.groupby(["first", "third"]).sum()

    expected_index = MultiIndex.from_arrays([["x"], ["k"]], names=["first", "third"])
    tm.assert_series_equal(result, Series([3], index=expected_index))
1821
1822
def test_groupby_groups_in_BaseGrouper():
    """pandas.Grouper(level=...) yields the same groups as the bare level name.
    (GH 26326)"""
    mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    frame = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)

    cases = (
        ([Grouper(level="alpha"), "beta"], ["alpha", "beta"]),
        (["beta", Grouper(level="alpha")], ["beta", "alpha"]),
    )
    for grouper_keys, name_keys in cases:
        assert frame.groupby(grouper_keys).groups == frame.groupby(name_keys).groups
1835
1836
@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
    """groupby(axis=1) matches transpose -> groupby -> transpose. (GH 27614)"""
    frame = DataFrame(
        np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
    )
    frame.index.name = "y"
    frame.columns.name = "x"

    tm.assert_frame_equal(
        frame.groupby(group_name, axis=1).sum(),
        frame.T.groupby(group_name).sum().T,
    )

    # same check with a MultiIndex on the columns
    mi = MultiIndex.from_product(
        iterables=[["bar", "baz", "foo"], ["one", "two"]], names=["x", "x1"]
    )
    frame = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi)
    tm.assert_frame_equal(
        frame.groupby(group_name, axis=1).sum(),
        frame.T.groupby(group_name).sum().T,
    )
1857
1858
@pytest.mark.parametrize(
    "op, expected",
    [
        (
            "shift",
            {
                "time": [
                    None,
                    None,
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    None,
                    None,
                ]
            },
        ),
        (
            "bfill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
        (
            "ffill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
    ],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
    """Timezone info must survive groupby shift/bfill/ffill."""
    # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
    tz = tz_naive_fixture
    data = {
        "id": ["A", "B", "A", "B", "A", "B"],
        "time": [
            Timestamp("2019-01-01 12:00:00"),
            Timestamp("2019-01-01 12:30:00"),
            None,
            None,
            Timestamp("2019-01-01 14:00:00"),
            Timestamp("2019-01-01 14:30:00"),
        ],
    }
    df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))

    grouped = df.groupby("id")
    # the filled/shifted column must carry the same tz-aware dtype as input
    result = getattr(grouped, op)()
    expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
    tm.assert_frame_equal(result, expected)
1923
1924
def test_groupby_only_none_group():
    # see GH21624
    # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
    frame = DataFrame({"g": [None], "x": 1})

    result = frame.groupby("g")["x"].transform("sum")

    # the all-None group is dropped, so the transform yields NaN
    tm.assert_series_equal(result, Series([np.nan], name="x"))
1933
1934
def test_groupby_duplicate_index():
    # GH#29189 the groupby call here used to raise
    values = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])

    result = values.groupby(level=0).mean()

    # duplicate index labels (4.0) collapse into a single averaged group
    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(result, expected)
1943
1944
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
def test_bool_aggs_dup_column_labels(bool_agg_func):
    # 21668
    df = DataFrame([[True, True]], columns=["a", "a"])

    # aggregating over a single group must keep both duplicate column labels
    result = getattr(df.groupby([0]), bool_agg_func)()
    tm.assert_frame_equal(result, df)
1954
1955
@pytest.mark.parametrize(
    "idx", [Index(["a", "a"]), MultiIndex.from_tuples((("a", "a"), ("a", "a")))]
)
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_dup_labels_output_shape(groupby_func, idx):
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip("Not applicable")

    df = DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    # some groupby methods need positional arguments to be callable
    if groupby_func in {"fillna", "nth"}:
        args = [0]
    elif groupby_func == "corrwith":
        args = [df]
    elif groupby_func == "tshift":
        # tshift requires a datetime-like index
        df.index = [Timestamp("today")]
        args = [1, "D"]
    else:
        args = []

    result = getattr(grp_by, groupby_func)(*args)

    # duplicate column labels must survive with their original shape
    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)
1980
1981
1982def test_groupby_crash_on_nunique(axis):
1983    # Fix following 30253
1984    df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
1985
1986    axis_number = df._get_axis_number(axis)
1987    if not axis_number:
1988        df = df.T
1989
1990    result = df.groupby(axis=axis_number, level=0).nunique()
1991
1992    expected = DataFrame({"A": [1, 2], "D": [1, 1]})
1993    if not axis_number:
1994        expected = expected.T
1995
1996    tm.assert_frame_equal(result, expected)
1997
1998
def test_groupby_list_level():
    # GH 9790
    # grouping by a list containing a single level should behave like
    # grouping by that level directly
    frame = DataFrame(np.arange(0, 9).reshape(3, 3))
    result = frame.groupby(level=[0]).mean()
    tm.assert_frame_equal(result, frame)
2004
2005
@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    # GH 1135
    df = DataFrame(np.random.randn(5, 1))
    df["a"] = df.index

    with pd.option_context("display.max_seq_items", max_seq_items):
        # grouping by label and by array must render the same truncated repr
        for grouper in ("a", np.array(df.a)):
            assert repr(df.groupby(grouper).groups) == expected
2024
2025
def test_group_on_two_row_multiindex_returns_one_tuple_key():
    # GH 18451
    df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}]).set_index(
        ["a", "b"]
    )

    indices = df.groupby(["a", "b"]).indices

    # both rows share the (1, 2) key, so there is exactly one group
    assert len(indices) == 1
    expected = np.array([0, 1], dtype=np.int64)
    assert (indices[(1, 2)] == expected).all()
2038
2039
@pytest.mark.parametrize(
    "klass, attr, value",
    [
        (DataFrame, "level", "a"),
        (DataFrame, "as_index", False),
        (DataFrame, "sort", False),
        (DataFrame, "group_keys", False),
        (DataFrame, "squeeze", True),
        (DataFrame, "observed", True),
        (DataFrame, "dropna", False),
        pytest.param(
            Series,
            "axis",
            1,
            marks=pytest.mark.xfail(
                reason="GH 35443: Attribute currently not passed on to series"
            ),
        ),
        (Series, "level", "a"),
        (Series, "as_index", False),
        (Series, "sort", False),
        (Series, "group_keys", False),
        (Series, "squeeze", True),
        (Series, "observed", True),
        (Series, "dropna", False),
    ],
)
@pytest.mark.filterwarnings(
    "ignore:The `squeeze` parameter is deprecated:FutureWarning"
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
    # GH 9959 - When subsetting columns, don't drop attributes
    frame = DataFrame({"a": [1], "b": [2], "c": [3]})
    # grouping by level "a" needs "a" in the index; the axis case does not
    if attr != "axis":
        frame = frame.set_index("a")

    gb = frame.groupby("a", **{attr: value})
    if klass is DataFrame:
        subset = gb[["b"]]
    else:
        subset = gb["b"]
    assert getattr(subset, attr) == getattr(gb, attr)
2079
2080
def test_subsetting_columns_axis_1():
    # GH 37725
    grouped = DataFrame({"A": [1], "B": [2], "C": [3]}).groupby([0, 0, 1], axis=1)

    # column subsetting is not supported when grouping along axis=1
    with pytest.raises(ValueError, match="Cannot subset columns when using axis=1"):
        grouped[["A", "B"]].sum()
2087
2088
@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    # GH: 29764 groupby loses index sometimes
    expected = Index(["a"], name="idx")
    grouped = DataFrame([[1]], columns=expected).groupby([1])

    # the named column index must survive the aggregation
    result = getattr(grouped, func)().columns
    tm.assert_index_equal(result, expected)
2097
2098
2099def test_groupby_duplicate_columns():
2100    # GH: 31735
2101    df = DataFrame(
2102        {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
2103    ).astype(object)
2104    df.columns = ["A", "B", "B"]
2105    result = df.groupby([0, 0, 0, 0]).min()
2106    expected = DataFrame([["e", "a", 1]], columns=["A", "B", "B"])
2107    tm.assert_frame_equal(result, expected)
2108
2109
def test_groupby_series_with_tuple_name():
    # GH 37755
    name = ("a", "a")
    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=name)
    ser.index.name = ("b", "b")

    result = ser.groupby(level=0).last()

    # tuple-valued series/index names must round-trip through groupby
    expected = Series([2, 4], index=[1, 2], name=name)
    expected.index.name = ("b", "b")
    tm.assert_series_equal(result, expected)
2118