import random

import numpy as np
import pytest

from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import Categorical, DataFrame, NaT, Timestamp, date_range
import pandas._testing as tm


class TestDataFrameSortValues:
    """Tests for ``DataFrame.sort_values`` called without a ``key`` function."""

    def test_sort_values(self):
        """Sort by one or several labels on either axis and reject bad arguments."""
        frame = DataFrame(
            [[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")
        )

        # by column (axis=0)
        sorted_df = frame.sort_values(by="A")
        indexer = frame["A"].argsort().values
        expected = frame.loc[frame.index[indexer]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        indexer = indexer[::-1]
        expected = frame.loc[frame.index[indexer]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by="A", ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

        # GH4839
        sorted_df = frame.sort_values(by=["A"], ascending=[False])
        tm.assert_frame_equal(sorted_df, expected)

        # multiple bys
        sorted_df = frame.sort_values(by=["B", "C"])
        expected = frame.loc[[2, 1, 3]]
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=["B", "C"], ascending=False)
        tm.assert_frame_equal(sorted_df, expected[::-1])

        sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False])
        tm.assert_frame_equal(sorted_df, expected)

        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            frame.sort_values(by=["A", "B"], axis=2, inplace=True)

        # by row (axis=1): GH#10806
        sorted_df = frame.sort_values(by=3, axis=1)
        expected = frame
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=3, axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 2], axis="columns")
        expected = frame.reindex(columns=["B", "A", "C"])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False])
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False)
        expected = frame.reindex(columns=["C", "B", "A"])
        tm.assert_frame_equal(sorted_df, expected)

        msg = r"Length of ascending \(5\) != length of by \(2\)"
        with pytest.raises(ValueError, match=msg):
            frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)

    def test_sort_values_inplace(self):
        """``inplace=True`` returns None and matches the non-inplace result."""
        frame = DataFrame(
            np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
        )

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(by="A", inplace=True)
        assert return_value is None
        expected = frame.sort_values(by="A")
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(by=1, axis=1, inplace=True)
        assert return_value is None
        expected = frame.sort_values(by=1, axis=1)
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True)
        assert return_value is None
        expected = frame.sort_values(by="A", ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(
            by=["A", "B"], ascending=False, inplace=True
        )
        assert return_value is None
        expected = frame.sort_values(by=["A", "B"], ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_multicolumn(self):
        """Multi-column sorts agree with the ordering given by ``np.lexsort``."""
        A = np.arange(5).repeat(20)
        B = np.tile(np.arange(5), 20)
        random.shuffle(A)
        random.shuffle(B)
        frame = DataFrame({"A": A, "B": B, "C": np.random.randn(100)})

        result = frame.sort_values(by=["A", "B"])
        indexer = np.lexsort((frame["B"], frame["A"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["A", "B"], ascending=False)
        indexer = np.lexsort(
            (frame["B"].rank(ascending=False), frame["A"].rank(ascending=False))
        )
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

        result = frame.sort_values(by=["B", "A"])
        indexer = np.lexsort((frame["A"], frame["B"]))
        expected = frame.take(indexer)
        tm.assert_frame_equal(result, expected)

    def test_sort_values_multicolumn_uint64(self):
        """Multi-column sort with a uint64 column holding very large values."""
        # GH#9918
        # uint64 multicolumn sort

        df = DataFrame(
            {
                "a": pd.Series([18446637057563306014, 1162265347240853609]),
                "b": pd.Series([1, 2]),
            }
        )
        df["a"] = df["a"].astype(np.uint64)
        result = df.sort_values(["a", "b"])

        expected = DataFrame(
            {
                "a": pd.Series([18446637057563306014, 1162265347240853609]),
                "b": pd.Series([1, 2]),
            },
            index=pd.Index([1, 0]),
        )

        tm.assert_frame_equal(result, expected)

    def test_sort_values_nan(self):
        """``na_position`` places NaN rows first or last for 1- and 2-key sorts."""
        # GH#3917
        df = DataFrame(
            {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
        )

        # sort one column only
        expected = DataFrame(
            {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
            index=[2, 0, 3, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A"], na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        expected = DataFrame(
            {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
            index=[2, 5, 4, 6, 1, 0, 3],
        )
        sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
        tm.assert_frame_equal(sorted_df, expected)

        expected = df.reindex(columns=["B", "A"])
        sorted_df = df.sort_values(by=1, axis=1, na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='last', order
        expected = DataFrame(
            {"A": [1, 1, 2, 4, 6, 8, np.nan], "B": [2, 9, np.nan, 5, 5, 4, 5]},
            index=[3, 0, 1, 6, 4, 5, 2],
        )
        sorted_df = df.sort_values(["A", "B"])
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='first', order
        expected = DataFrame(
            {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 2, 9, np.nan, 5, 5, 4]},
            index=[2, 3, 0, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A", "B"], na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='first', not order
        expected = DataFrame(
            {"A": [np.nan, 1, 1, 2, 4, 6, 8], "B": [5, 9, 2, np.nan, 5, 5, 4]},
            index=[2, 0, 3, 1, 6, 4, 5],
        )
        sorted_df = df.sort_values(["A", "B"], ascending=[1, 0], na_position="first")
        tm.assert_frame_equal(sorted_df, expected)

        # na_position='last', not order
        expected = DataFrame(
            {"A": [8, 6, 4, 2, 1, 1, np.nan], "B": [4, 5, 5, np.nan, 2, 9, 5]},
            index=[5, 4, 6, 1, 3, 0, 2],
        )
        sorted_df = df.sort_values(["A", "B"], ascending=[0, 1], na_position="last")
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_stable_descending_sort(self):
        """A descending mergesort leaves an already-ordered frame unchanged."""
        # GH#6399
        df = DataFrame(
            [[2, "first"], [2, "second"], [1, "a"], [1, "b"]],
            columns=["sort_col", "order"],
        )
        sorted_df = df.sort_values(by="sort_col", kind="mergesort", ascending=False)
        tm.assert_frame_equal(df, sorted_df)

    def test_sort_values_stable_descending_multicolumn_sort(self):
        """Two-key mergesort with NaNs first, for mixed and all-descending order."""
        df = DataFrame(
            {"A": [1, 2, np.nan, 1, 6, 8, 4], "B": [9, np.nan, 5, 2, 5, 4, 5]}
        )
        # test stable mergesort
        expected = DataFrame(
            {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 2, 9]},
            index=[2, 5, 4, 6, 1, 3, 0],
        )
        sorted_df = df.sort_values(
            ["A", "B"], ascending=[0, 1], na_position="first", kind="mergesort"
        )
        tm.assert_frame_equal(sorted_df, expected)

        expected = DataFrame(
            {"A": [np.nan, 8, 6, 4, 2, 1, 1], "B": [5, 4, 5, 5, np.nan, 9, 2]},
            index=[2, 5, 4, 6, 1, 0, 3],
        )
        sorted_df = df.sort_values(
            ["A", "B"], ascending=[0, 0], na_position="first", kind="mergesort"
        )
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_stable_categorial(self):
        """Mergesort on an already-sorted ordered Categorical is a no-op."""
        # GH#16793
        df = DataFrame({"x": Categorical(np.repeat([1, 2, 3, 4], 5), ordered=True)})
        expected = df.copy()
        sorted_df = df.sort_values("x", kind="mergesort")
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_datetimes(self):
        """Scalar ``by`` and single-element list ``by`` give the same result."""

        # GH#3461, argsort / lexsort differences for a datetime column
        df = DataFrame(
            ["a", "a", "a", "b", "c", "d", "e", "f", "g"],
            columns=["A"],
            index=date_range("20130101", periods=9),
        )
        dts = [
            Timestamp(x)
            for x in [
                "2004-02-11",
                "2004-01-21",
                "2004-01-26",
                "2005-09-20",
                "2010-10-04",
                "2009-05-12",
                "2008-11-12",
                "2010-09-28",
                "2010-09-28",
            ]
        ]
        df["B"] = dts[::2] + dts[1::2]
        df["C"] = 2.0
        df["A1"] = 3.0

        df1 = df.sort_values(by="A")
        df2 = df.sort_values(by=["A"])
        tm.assert_frame_equal(df1, df2)

        df1 = df.sort_values(by="B")
        df2 = df.sort_values(by=["B"])
        tm.assert_frame_equal(df1, df2)

        df1 = df.sort_values(by="B")

        # "C" is constant, so sorting by ["C", "B"] must match sorting by "B"
        df2 = df.sort_values(by=["C", "B"])
        tm.assert_frame_equal(df1, df2)

    def test_sort_values_frame_column_inplace_sort_exception(self, float_frame):
        """In-place sort of a frame column raises; sorting a copy succeeds."""
        s = float_frame["A"]
        with pytest.raises(ValueError, match="This Series is a view"):
            s.sort_values(inplace=True)

        cp = s.copy()
        cp.sort_values()  # it works!

    def test_sort_values_nat_values_in_int_column(self):
        """The int64 NaT sentinel is only treated as missing in datetime columns."""

        # GH#14922: "sorting with large float and multiple columns incorrect"

        # cause was that the int64 value NaT was considered as "na". Which is
        # only correct for datetime64 columns.

        int_values = (2, int(NaT))
        float_values = (2.0, -1.797693e308)

        df = DataFrame(
            {"int": int_values, "float": float_values}, columns=["int", "float"]
        )

        df_reversed = DataFrame(
            {"int": int_values[::-1], "float": float_values[::-1]},
            columns=["int", "float"],
            index=[1, 0],
        )

        # NaT is not a "na" for int64 columns, so na_position must not
        # influence the result:
        df_sorted = df.sort_values(["int", "float"], na_position="last")
        tm.assert_frame_equal(df_sorted, df_reversed)

        df_sorted = df.sort_values(["int", "float"], na_position="first")
        tm.assert_frame_equal(df_sorted, df_reversed)

        # reverse sorting order
        df_sorted = df.sort_values(["int", "float"], ascending=False)
        tm.assert_frame_equal(df_sorted, df)

        # and now check if NaT is still considered as "na" for datetime64
        # columns:
        df = DataFrame(
            {"datetime": [Timestamp("2016-01-01"), NaT], "float": float_values},
            columns=["datetime", "float"],
        )

        df_reversed = DataFrame(
            {"datetime": [NaT, Timestamp("2016-01-01")], "float": float_values[::-1]},
            columns=["datetime", "float"],
            index=[1, 0],
        )

        df_sorted = df.sort_values(["datetime", "float"], na_position="first")
        tm.assert_frame_equal(df_sorted, df_reversed)

        df_sorted = df.sort_values(["datetime", "float"], na_position="last")
        tm.assert_frame_equal(df_sorted, df)

        # Ascending should not affect the results.
        df_sorted = df.sort_values(["datetime", "float"], ascending=False)
        tm.assert_frame_equal(df_sorted, df)

    def test_sort_nat(self):
        """Two-key datetime sort places the NaT row last by default."""
        # GH 16836
        # NOTE(review): this body is identical to ``test_sort_values_nat``
        # further down; both are kept so existing test IDs stay valid.

        d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
        d2 = [
            Timestamp(x)
            for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
        ]
        df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])

        d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
        d4 = [
            Timestamp(x)
            for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
        ]
        expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
        sorted_df = df.sort_values(by=["a", "b"])
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_na_position_with_categories(self):
        """``na_position`` is honoured for an ordered Categorical column."""
        # GH#22556
        # Positioning missing value properly when column is Categorical.
        categories = ["A", "B", "C"]
        category_indices = [0, 2, 4]
        list_of_nans = [np.nan, np.nan]
        na_indices = [1, 3]
        na_position_first = "first"
        na_position_last = "last"
        column_name = "c"

        reversed_categories = sorted(categories, reverse=True)
        reversed_category_indices = sorted(category_indices, reverse=True)
        # Despite the name this is sorted ascending: the expected frames
        # below keep the NaN rows in their original relative order.
        reversed_na_indices = sorted(na_indices)

        df = DataFrame(
            {
                column_name: Categorical(
                    ["A", np.nan, "B", np.nan, "C"], categories=categories, ordered=True
                )
            }
        )
        # sort ascending with na first
        result = df.sort_values(
            by=column_name, ascending=True, na_position=na_position_first
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    list_of_nans + categories, categories=categories, ordered=True
                )
            },
            index=na_indices + category_indices,
        )

        tm.assert_frame_equal(result, expected)

        # sort ascending with na last
        result = df.sort_values(
            by=column_name, ascending=True, na_position=na_position_last
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    categories + list_of_nans, categories=categories, ordered=True
                )
            },
            index=category_indices + na_indices,
        )

        tm.assert_frame_equal(result, expected)

        # sort descending with na first
        result = df.sort_values(
            by=column_name, ascending=False, na_position=na_position_first
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    list_of_nans + reversed_categories,
                    categories=categories,
                    ordered=True,
                )
            },
            index=reversed_na_indices + reversed_category_indices,
        )

        tm.assert_frame_equal(result, expected)

        # sort descending with na last
        result = df.sort_values(
            by=column_name, ascending=False, na_position=na_position_last
        )
        expected = DataFrame(
            {
                column_name: Categorical(
                    reversed_categories + list_of_nans,
                    categories=categories,
                    ordered=True,
                )
            },
            index=reversed_category_indices + reversed_na_indices,
        )

        tm.assert_frame_equal(result, expected)

    def test_sort_values_nat(self):
        """Two-key datetime sort places the NaT row last by default."""

        # GH#16836

        d1 = [Timestamp(x) for x in ["2016-01-01", "2015-01-01", np.nan, "2016-01-01"]]
        d2 = [
            Timestamp(x)
            for x in ["2017-01-01", "2014-01-01", "2016-01-01", "2015-01-01"]
        ]
        df = DataFrame({"a": d1, "b": d2}, index=[0, 1, 2, 3])

        d3 = [Timestamp(x) for x in ["2015-01-01", "2016-01-01", "2016-01-01", np.nan]]
        d4 = [
            Timestamp(x)
            for x in ["2014-01-01", "2015-01-01", "2017-01-01", "2016-01-01"]
        ]
        expected = DataFrame({"a": d3, "b": d4}, index=[1, 3, 0, 2])
        sorted_df = df.sort_values(by=["a", "b"])
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_na_position_with_categories_raises(self):
        """An unknown ``na_position`` raises ValueError for a Categorical column."""
        df = DataFrame(
            {
                "c": Categorical(
                    ["A", np.nan, "B", np.nan, "C"],
                    categories=["A", "B", "C"],
                    ordered=True,
                )
            }
        )

        with pytest.raises(ValueError, match="invalid na_position: bad_position"):
            df.sort_values(by="c", ascending=False, na_position="bad_position")

    @pytest.mark.parametrize("inplace", [True, False])
    @pytest.mark.parametrize(
        "original_dict, sorted_dict, ignore_index, output_index",
        [
            ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]),
            ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]),
            (
                {"A": [1, 2, 3], "B": [2, 3, 4]},
                {"A": [3, 2, 1], "B": [4, 3, 2]},
                True,
                [0, 1, 2],
            ),
            (
                {"A": [1, 2, 3], "B": [2, 3, 4]},
                {"A": [3, 2, 1], "B": [4, 3, 2]},
                False,
                [2, 1, 0],
            ),
        ],
    )
    def test_sort_values_ignore_index(
        self, inplace, original_dict, sorted_dict, ignore_index, output_index
    ):
        """``ignore_index`` controls whether the result index is reset."""
        # GH 30114
        df = DataFrame(original_dict)
        expected = DataFrame(sorted_dict, index=output_index)
        kwargs = {"ignore_index": ignore_index, "inplace": inplace}

        if inplace:
            # Sort a copy so ``df`` can still be checked for non-mutation below.
            result_df = df.copy()
            result_df.sort_values("A", ascending=False, **kwargs)
        else:
            result_df = df.sort_values("A", ascending=False, **kwargs)

        tm.assert_frame_equal(result_df, expected)
        tm.assert_frame_equal(df, DataFrame(original_dict))

    def test_sort_values_nat_na_position_default(self):
        """With default ``na_position`` a NaT in the second key sorts last."""
        # GH 13230
        expected = DataFrame(
            {
                "A": [1, 2, 3, 4, 4],
                "date": pd.DatetimeIndex(
                    [
                        "2010-01-01 09:00:00",
                        "2010-01-01 09:00:01",
                        "2010-01-01 09:00:02",
                        "2010-01-01 09:00:03",
                        "NaT",
                    ]
                ),
            }
        )
        result = expected.sort_values(["A", "date"])
        tm.assert_frame_equal(result, expected)


class TestDataFrameSortKey:  # test key sorting (issue 27237)
    """Tests for ``DataFrame.sort_values`` called with a ``key`` function."""

    def test_sort_values_inplace_key(self, sort_by_key):
        """``inplace=True`` with ``key`` returns None and matches non-inplace."""
        frame = DataFrame(
            np.random.randn(4, 4), index=[1, 2, 3, 4], columns=["A", "B", "C", "D"]
        )

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key)
        assert return_value is None
        expected = frame.sort_values(by="A", key=sort_by_key)
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(
            by=1, axis=1, inplace=True, key=sort_by_key
        )
        assert return_value is None
        expected = frame.sort_values(by=1, axis=1, key=sort_by_key)
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        return_value = sorted_df.sort_values(
            by="A", ascending=False, inplace=True, key=sort_by_key
        )
        assert return_value is None
        expected = frame.sort_values(by="A", ascending=False, key=sort_by_key)
        tm.assert_frame_equal(sorted_df, expected)

        sorted_df = frame.copy()
        # Check the return value here as well, consistent with the cases above.
        return_value = sorted_df.sort_values(
            by=["A", "B"], ascending=False, inplace=True, key=sort_by_key
        )
        assert return_value is None
        expected = frame.sort_values(by=["A", "B"], ascending=False, key=sort_by_key)
        tm.assert_frame_equal(sorted_df, expected)

    def test_sort_values_key(self):
        """Order-preserving keys give the same result as sorting without a key."""
        df = DataFrame(np.array([0, 5, np.nan, 3, 2, np.nan]))

        result = df.sort_values(0)
        expected = df.iloc[[0, 4, 3, 1, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(0, key=lambda x: x + 5)
        expected = df.iloc[[0, 4, 3, 1, 2, 5]]
        tm.assert_frame_equal(result, expected)

        # Negating the key and sorting descending cancel each other out.
        result = df.sort_values(0, key=lambda x: -x, ascending=False)
        expected = df.iloc[[0, 4, 3, 1, 2, 5]]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_by_key(self):
        """A negating key is applied to every column named in ``by``."""
        df = DataFrame(
            {
                "a": np.array([0, 3, np.nan, 3, 2, np.nan]),
                "b": np.array([0, 2, np.nan, 5, 2, np.nan]),
            }
        )

        result = df.sort_values("a", key=lambda x: -x)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=lambda x: -x)
        expected = df.iloc[[3, 1, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=lambda x: -x, ascending=False)
        expected = df.iloc[[0, 4, 1, 3, 2, 5]]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_by_key_by_name(self):
        """The key function can branch on the name of the column it receives."""
        df = DataFrame(
            {
                "a": np.array([0, 3, np.nan, 3, 2, np.nan]),
                "b": np.array([0, 2, np.nan, 5, 2, np.nan]),
            }
        )

        def key(col):
            # Negate only column "a"; every other column passes through.
            if col.name == "a":
                return -col
            else:
                return col

        result = df.sort_values(by="a", key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a"], key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by="b", key=key)
        expected = df.iloc[[0, 1, 4, 3, 2, 5]]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(by=["a", "b"], key=key)
        expected = df.iloc[[1, 3, 4, 0, 2, 5]]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_string(self):
        """A lower-casing key makes string sorting case-insensitive."""
        df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))

        result = df.sort_values(1)
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values([0, 1], key=lambda col: col.str.lower())
        tm.assert_frame_equal(result, df)

        result = df.sort_values(
            [0, 1], key=lambda col: col.str.lower(), ascending=False
        )
        expected = df.sort_values(1, key=lambda col: col.str.lower(), ascending=False)
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_empty(self, sort_by_key):
        """Sorting an empty frame with a key does not raise."""
        df = DataFrame(np.array([]))

        df.sort_values(0, key=sort_by_key)
        df.sort_index(key=sort_by_key)

    def test_changes_length_raises(self):
        """A key that returns a shorter object raises ValueError."""
        df = DataFrame({"A": [1, 2, 3]})
        with pytest.raises(ValueError, match="change the shape"):
            df.sort_values("A", key=lambda x: x[:1])

    def test_sort_values_key_axes(self):
        """Keys are applied to the selected column when sorting rows."""
        df = DataFrame({0: ["Hello", "goodbye"], 1: [0, 1]})

        result = df.sort_values(0, key=lambda col: col.str.lower())
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(1, key=lambda col: -col)
        expected = df[::-1]
        tm.assert_frame_equal(result, expected)

    def test_sort_values_key_dict_axis(self):
        """Keys are applied to the selected row when sorting with ``axis=1``."""
        df = DataFrame({0: ["Hello", 0], 1: ["goodbye", 1]})

        result = df.sort_values(0, key=lambda col: col.str.lower(), axis=1)
        expected = df.loc[:, ::-1]
        tm.assert_frame_equal(result, expected)

        result = df.sort_values(1, key=lambda col: -col, axis=1)
        expected = df.loc[:, ::-1]
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("ordered", [True, False])
    def test_sort_values_key_casts_to_categorical(self, ordered):
        """A key returning a Categorical sorts by the category order."""
        # https://github.com/pandas-dev/pandas/issues/36383
        categories = ["c", "b", "a"]
        df = DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})

        def sorter(key):
            # Only column "y" is converted; "x" is returned unchanged.
            if key.name == "y":
                return pd.Series(
                    Categorical(key, categories=categories, ordered=ordered)
                )
            return key

        result = df.sort_values(by=["x", "y"], key=sorter)
        expected = DataFrame(
            {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0])
        )

        tm.assert_frame_equal(result, expected)


@pytest.fixture
def df_none():
    """Frame with a default index; "outer"/"inner" are still plain columns."""
    return DataFrame(
        {
            "outer": ["a", "a", "a", "b", "b", "b"],
            "inner": [1, 2, 2, 2, 1, 1],
            "A": np.arange(6, 0, -1),
            ("B", 5): ["one", "one", "two", "two", "one", "one"],
        }
    )


@pytest.fixture(params=[["outer"], ["outer", "inner"]])
def df_idx(request, df_none):
    """``df_none`` with one or two of its columns moved into the index."""
    levels = request.param
    return df_none.set_index(levels)


@pytest.fixture(
    params=[
        "inner",  # index level
        ["outer"],  # list of index level
        "A",  # column
        [("B", 5)],  # list of column
        ["inner", "outer"],  # two index levels
        [("B", 5), "outer"],  # index level and column
        ["A", ("B", 5)],  # Two columns
        # NOTE(review): originally labelled "two index levels and column",
        # but the value names no column and repeats the two-level case above.
        ["inner", "outer"],
    ]
)
def sort_names(request):
    """Label, or list of labels, passed as ``by`` to ``sort_values``."""
    return request.param


@pytest.fixture(params=[True, False])
def ascending(request):
    """Sort direction passed as ``ascending`` to ``sort_values``."""
    return request.param


class TestSortValuesLevelAsStr:
    """Tests for sorting by a mix of index-level names and column labels."""

    def test_sort_index_level_and_column_label(
        self, df_none, df_idx, sort_names, ascending
    ):
        """Sorting an indexed frame matches sorting first and indexing after."""
        # GH#14353

        # Get index levels from df_idx
        levels = df_idx.index.names

        # Compute expected by sorting on columns and then setting the index
        expected = df_none.sort_values(
            by=sort_names, ascending=ascending, axis=0
        ).set_index(levels)

        # Compute result by sorting on a mix of columns and index levels
        result = df_idx.sort_values(by=sort_names, ascending=ascending, axis=0)

        tm.assert_frame_equal(result, expected)

    def test_sort_column_level_and_index_label(
        self, df_none, df_idx, sort_names, ascending
    ):
        """Same comparison as above, on the transposed frame with ``axis=1``."""
        # GH#14353

        # Get levels from df_idx
        levels = df_idx.index.names

        # Compute expected by sorting on axis=0, setting index levels, and then
        # transposing. For some cases this will result in a frame with
        # multiple column levels
        expected = (
            df_none.sort_values(by=sort_names, ascending=ascending, axis=0)
            .set_index(levels)
            .T
        )

        # Compute result by transposing and sorting on axis=1.
        result = df_idx.T.sort_values(by=sort_names, ascending=ascending, axis=1)

        if len(levels) > 1:
            # Accessing multi-level columns that are not lexsorted raises a
            # performance warning
            with tm.assert_produces_warning(PerformanceWarning, check_stacklevel=False):
                tm.assert_frame_equal(result, expected)
        else:
            tm.assert_frame_equal(result, expected)