import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


def test_error():
    """Check that ``DataFrame.explode`` raises ``ValueError`` on invalid input.

    Two cases are exercised: a column argument that is not a valid column
    specification, and a frame whose column labels are duplicated.
    """
    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )
    # A nested list is never a valid column specification. A flat list of
    # labels such as list("AA") is treated as a multi-column request by pandas
    # releases that can explode several columns at once (1.3+) and is rejected
    # there with a different message, so it cannot be used for this check.
    with pytest.raises(ValueError, match="column must be a scalar"):
        df.explode([list("AA")])

    # Duplicate column labels make the target column ambiguous.
    df.columns = list("AA")
    with pytest.raises(ValueError, match="columns must be unique"):
        df.explode("A")


def test_basic():
    """Explode a column mixing a list, NaN, an empty list and a tuple.

    The expected frame has one row per element, a single NaN row for both the
    NaN entry and the empty list, repeated index labels, and object dtype.
    """
    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )
    result = df.explode("A")
    expected = pd.DataFrame(
        {
            "A": pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
            ),
            "B": 1,
        }
    )
    tm.assert_frame_equal(result, expected)


def test_multi_index_rows():
    """Explode a column of a frame whose row index is a ``MultiIndex``.

    Each index tuple is expected to repeat once per produced row.
    """
    df = pd.DataFrame(
        {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
        index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),
    )

    result = df.explode("A")
    expected = pd.DataFrame(
        {
            "A": pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4],
                index=pd.MultiIndex.from_tuples(
                    [
                        ("a", 1),
                        ("a", 1),
                        ("a", 1),
                        ("a", 2),
                        ("b", 1),
                        ("b", 2),
                        ("b", 2),
                    ]
                ),
                dtype=object,
            ),
            "B": 1,
        }
    )
    tm.assert_frame_equal(result, expected)


def test_multi_index_columns():
    """Explode a column addressed by a tuple label (``("A", 1)``)."""
    df = pd.DataFrame(
        {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1}
    )

    result = df.explode(("A", 1))
    expected = pd.DataFrame(
        {
            ("A", 1): pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4],
                index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
                dtype=object,
            ),
            ("A", 2): 1,
        }
    )
    tm.assert_frame_equal(result, expected)


def test_usecase():
    """Cover the two reported use cases referenced below (gh-10511, gh-8517)."""
    # explode a single column
    # gh-10511
    df = pd.DataFrame(
        [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
    ).set_index("C")
    result = df.explode("B")

    expected = pd.DataFrame(
        {
            "A": [11, 11, 11, 11, 11, 22, 22, 22],
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10, 10, 10, 10, 10, 20, 20, 20],
        },
        columns=list("ABC"),
    ).set_index("C")

    tm.assert_frame_equal(result, expected)

    # gh-8517
    # split a string column into lists, then explode the lists into rows
    df = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=["dt", "name", "text"],
    )
    result = df.assign(text=df.text.str.split(" ")).explode("text")
    expected = pd.DataFrame(
        [
            ["2014-01-01", "Alice", "A"],
            ["2014-01-01", "Alice", "B"],
            ["2014-01-02", "Bob", "C"],
            ["2014-01-02", "Bob", "D"],
        ],
        columns=["dt", "name", "text"],
        index=[0, 0, 1, 1],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "input_dict, input_index, expected_dict, expected_index",
    [
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            [0, 0],
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            [0, 0, 0, 0],
        ),
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.Index([0, 0], name="my_index"),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.Index([0, 0, 0, 0], name="my_index"),
        ),
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
            ),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]],
                names=["my_first_index", "my_second_index"],
            ),
        ),
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
            ),
        ),
    ],
)
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
    """Explode ``col1`` of a frame whose index holds duplicate labels.

    Parameters
    ----------
    input_dict : dict
        Column data for the frame to explode.
    input_index : list, pd.Index or pd.MultiIndex
        Row index of the input frame; every case repeats the same label.
    expected_dict : dict
        Column data of the expected exploded frame.
    expected_index : list, pd.Index or pd.MultiIndex
        Row index of the expected frame, including any index names.
    """
    # GH 28005
    df = pd.DataFrame(input_dict, index=input_index)
    result = df.explode("col1")
    expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
    tm.assert_frame_equal(result, expected)


def test_ignore_index():
    """With ``ignore_index=True`` the result is labelled 0..n-1."""
    # GH 34932
    df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]})
    result = df.explode("values", ignore_index=True)
    expected = pd.DataFrame(
        {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]
    )
    tm.assert_frame_equal(result, expected)


def test_explode_sets():
    """Explode a column whose single cell is a ``set``."""
    # https://github.com/pandas-dev/pandas/issues/35614
    df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])
    # Set iteration order is arbitrary, so sort before comparing.
    result = df.explode(column="a").sort_values(by="a")
    expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
    tm.assert_frame_equal(result, expected)