1""" 2SQL-style merge routines 3""" 4 5import copy 6import datetime 7from functools import partial 8import hashlib 9import string 10from typing import TYPE_CHECKING, Optional, Tuple, cast 11import warnings 12 13import numpy as np 14 15from pandas._libs import Timedelta, hashtable as libhashtable, join as libjoin, lib 16from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion 17from pandas.errors import MergeError 18from pandas.util._decorators import Appender, Substitution 19 20from pandas.core.dtypes.common import ( 21 ensure_float64, 22 ensure_int64, 23 ensure_object, 24 is_array_like, 25 is_bool, 26 is_bool_dtype, 27 is_categorical_dtype, 28 is_datetime64tz_dtype, 29 is_dtype_equal, 30 is_extension_array_dtype, 31 is_float_dtype, 32 is_integer, 33 is_integer_dtype, 34 is_list_like, 35 is_number, 36 is_numeric_dtype, 37 is_object_dtype, 38 needs_i8_conversion, 39) 40from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries 41from pandas.core.dtypes.missing import isna, na_value_for_dtype 42 43from pandas import Categorical, Index, MultiIndex 44from pandas.core import groupby 45import pandas.core.algorithms as algos 46import pandas.core.common as com 47from pandas.core.construction import extract_array 48from pandas.core.frame import _merge_doc 49from pandas.core.internals import concatenate_block_managers 50from pandas.core.sorting import is_int64_overflow_possible 51 52if TYPE_CHECKING: 53 from pandas import DataFrame 54 from pandas.core.arrays import DatetimeArray 55 56 57@Substitution("\nleft : DataFrame") 58@Appender(_merge_doc, indents=0) 59def merge( 60 left, 61 right, 62 how: str = "inner", 63 on=None, 64 left_on=None, 65 right_on=None, 66 left_index: bool = False, 67 right_index: bool = False, 68 sort: bool = False, 69 suffixes=("_x", "_y"), 70 copy: bool = True, 71 indicator: bool = False, 72 validate=None, 73) -> "DataFrame": 74 op = _MergeOperation( 75 left, 76 right, 77 how=how, 78 on=on, 79 left_on=left_on, 80 right_on=right_on, 81 left_index=left_index, 82 right_index=right_index, 83 sort=sort, 84 suffixes=suffixes, 85 copy=copy, 86 indicator=indicator, 87 validate=validate, 88 ) 89 return op.get_result() 90 91 92if __debug__: 93 merge.__doc__ = _merge_doc % "\nleft : DataFrame" 94 95 96def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_pieces): 97 """ 98 groupby & merge; we are always performing a left-by type operation 99 100 Parameters 101 ---------- 102 by: field to group 103 on: duplicates field 104 left: DataFrame 105 right: DataFrame 106 merge_pieces: function for merging 107 """ 108 pieces = [] 109 if not isinstance(by, (list, tuple)): 110 by = [by] 111 112 lby = left.groupby(by, sort=False) 113 rby: Optional[groupby.DataFrameGroupBy] = None 114 115 # if we can groupby the rhs 116 # then we can get vastly better perf 117 if all(item in right.columns for item in by): 118 rby = right.groupby(by, sort=False) 119 120 for key, lhs in lby: 121 122 if rby is None: 123 rhs = right 124 else: 125 try: 126 rhs = right.take(rby.indices[key]) 127 except KeyError: 128 # key doesn't exist in left 129 lcols = lhs.columns.tolist() 130 cols = lcols + [r for r in right.columns if r not in set(lcols)] 131 merged = lhs.reindex(columns=cols) 132 merged.index = range(len(merged)) 133 pieces.append(merged) 134 continue 135 136 merged = merge_pieces(lhs, rhs) 137 138 # make sure join keys are in the merged 139 # TODO, should merge_pieces do this? 
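        # ``key`` is the current group key (a scalar when grouping by a
        # single ``by`` column, a tuple otherwise); assigning it onto
        # ``merged[by]`` re-attaches the group key column(s) to this piece
        # so they are present in the concatenated result below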
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby


def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> "DataFrame":
    """
    Perform merge with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ...
) 227 >>> df1 228 key lvalue group 229 0 a 1 a 230 1 c 2 a 231 2 e 3 a 232 3 a 1 b 233 4 c 2 b 234 5 e 3 b 235 236 >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]}) 237 >>> df2 238 key rvalue 239 0 b 1 240 1 c 2 241 2 d 3 242 243 >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group") 244 key lvalue group rvalue 245 0 a 1 a NaN 246 1 b 1 a 1.0 247 2 c 2 a 2.0 248 3 d 2 a 3.0 249 4 e 3 a 3.0 250 5 a 1 b NaN 251 6 b 1 b 1.0 252 7 c 2 b 2.0 253 8 d 2 b 3.0 254 9 e 3 b 3.0 255 """ 256 257 def _merger(x, y): 258 # perform the ordered merge operation 259 op = _OrderedMerge( 260 x, 261 y, 262 on=on, 263 left_on=left_on, 264 right_on=right_on, 265 suffixes=suffixes, 266 fill_method=fill_method, 267 how=how, 268 ) 269 return op.get_result() 270 271 if left_by is not None and right_by is not None: 272 raise ValueError("Can only group either left or right frames") 273 elif left_by is not None: 274 if isinstance(left_by, str): 275 left_by = [left_by] 276 check = set(left_by).difference(left.columns) 277 if len(check) != 0: 278 raise KeyError(f"{check} not found in left columns") 279 result, _ = _groupby_and_merge( 280 left_by, on, left, right, lambda x, y: _merger(x, y) 281 ) 282 elif right_by is not None: 283 if isinstance(right_by, str): 284 right_by = [right_by] 285 check = set(right_by).difference(right.columns) 286 if len(check) != 0: 287 raise KeyError(f"{check} not found in right columns") 288 result, _ = _groupby_and_merge( 289 right_by, on, right, left, lambda x, y: _merger(y, x) 290 ) 291 else: 292 result = _merger(left, right) 293 return result 294 295 296def merge_asof( 297 left, 298 right, 299 on=None, 300 left_on=None, 301 right_on=None, 302 left_index: bool = False, 303 right_index: bool = False, 304 by=None, 305 left_by=None, 306 right_by=None, 307 suffixes=("_x", "_y"), 308 tolerance=None, 309 allow_exact_matches: bool = True, 310 direction: str = "backward", 311) -> "DataFrame": 312 """ 313 Perform an asof merge. 314 315 This is similar to a left-join except that we match on nearest 316 key rather than equal keys. Both DataFrames must be sorted by the key. 317 318 For each row in the left DataFrame: 319 320 - A "backward" search selects the last row in the right DataFrame whose 321 'on' key is less than or equal to the left's key. 322 323 - A "forward" search selects the first row in the right DataFrame whose 324 'on' key is greater than or equal to the left's key. 325 326 - A "nearest" search selects the row in the right DataFrame whose 'on' 327 key is closest in absolute distance to the left's key. 328 329 The default is "backward" and is compatible in versions below 0.20.0. 330 The direction parameter was added in version 0.20.0 and introduces 331 "forward" and "nearest". 332 333 Optionally match on equivalent keys with 'by' before searching with 'on'. 334 335 Parameters 336 ---------- 337 left : DataFrame 338 right : DataFrame 339 on : label 340 Field name to join on. Must be found in both DataFrames. 341 The data MUST be ordered. Furthermore this must be a numeric column, 342 such as datetimelike, integer, or float. On or left_on/right_on 343 must be given. 344 left_on : label 345 Field name to join on in left DataFrame. 346 right_on : label 347 Field name to join on in right DataFrame. 348 left_index : bool 349 Use the index of the left DataFrame as the join key. 350 right_index : bool 351 Use the index of the right DataFrame as the join key. 352 by : column name or list of column names 353 Match on these columns before performing merge operation. 
354 left_by : column name 355 Field names to match on in the left DataFrame. 356 right_by : column name 357 Field names to match on in the right DataFrame. 358 suffixes : 2-length sequence (tuple, list, ...) 359 Suffix to apply to overlapping column names in the left and right 360 side, respectively. 361 tolerance : int or Timedelta, optional, default None 362 Select asof tolerance within this range; must be compatible 363 with the merge index. 364 allow_exact_matches : bool, default True 365 366 - If True, allow matching with the same 'on' value 367 (i.e. less-than-or-equal-to / greater-than-or-equal-to) 368 - If False, don't match the same 'on' value 369 (i.e., strictly less-than / strictly greater-than). 370 371 direction : 'backward' (default), 'forward', or 'nearest' 372 Whether to search for prior, subsequent, or closest matches. 373 374 Returns 375 ------- 376 merged : DataFrame 377 378 See Also 379 -------- 380 merge : Merge with a database-style join. 381 merge_ordered : Merge with optional filling/interpolation. 382 383 Examples 384 -------- 385 >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) 386 >>> left 387 a left_val 388 0 1 a 389 1 5 b 390 2 10 c 391 392 >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) 393 >>> right 394 a right_val 395 0 1 1 396 1 2 2 397 2 3 3 398 3 6 6 399 4 7 7 400 401 >>> pd.merge_asof(left, right, on="a") 402 a left_val right_val 403 0 1 a 1 404 1 5 b 3 405 2 10 c 7 406 407 >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False) 408 a left_val right_val 409 0 1 a NaN 410 1 5 b 3.0 411 2 10 c 7.0 412 413 >>> pd.merge_asof(left, right, on="a", direction="forward") 414 a left_val right_val 415 0 1 a 1.0 416 1 5 b 6.0 417 2 10 c NaN 418 419 >>> pd.merge_asof(left, right, on="a", direction="nearest") 420 a left_val right_val 421 0 1 a 1 422 1 5 b 6 423 2 10 c 7 424 425 We can use indexed DataFrames as well. 426 427 >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) 428 >>> left 429 left_val 430 1 a 431 5 b 432 10 c 433 434 >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) 435 >>> right 436 right_val 437 1 1 438 2 2 439 3 3 440 6 6 441 7 7 442 443 >>> pd.merge_asof(left, right, left_index=True, right_index=True) 444 left_val right_val 445 1 a 1 446 5 b 3 447 10 c 7 448 449 Here is a real-world times-series example 450 451 >>> quotes = pd.DataFrame( 452 ... { 453 ... "time": [ 454 ... pd.Timestamp("2016-05-25 13:30:00.023"), 455 ... pd.Timestamp("2016-05-25 13:30:00.023"), 456 ... pd.Timestamp("2016-05-25 13:30:00.030"), 457 ... pd.Timestamp("2016-05-25 13:30:00.041"), 458 ... pd.Timestamp("2016-05-25 13:30:00.048"), 459 ... pd.Timestamp("2016-05-25 13:30:00.049"), 460 ... pd.Timestamp("2016-05-25 13:30:00.072"), 461 ... pd.Timestamp("2016-05-25 13:30:00.075") 462 ... ], 463 ... "ticker": [ 464 ... "GOOG", 465 ... "MSFT", 466 ... "MSFT", 467 ... "MSFT", 468 ... "GOOG", 469 ... "AAPL", 470 ... "GOOG", 471 ... "MSFT" 472 ... ], 473 ... "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], 474 ... "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03] 475 ... } 476 ... 
) 477 >>> quotes 478 time ticker bid ask 479 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 480 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 481 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 482 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 483 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 484 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 485 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 486 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 487 488 >>> trades = pd.DataFrame( 489 ... { 490 ... "time": [ 491 ... pd.Timestamp("2016-05-25 13:30:00.023"), 492 ... pd.Timestamp("2016-05-25 13:30:00.038"), 493 ... pd.Timestamp("2016-05-25 13:30:00.048"), 494 ... pd.Timestamp("2016-05-25 13:30:00.048"), 495 ... pd.Timestamp("2016-05-25 13:30:00.048") 496 ... ], 497 ... "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], 498 ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], 499 ... "quantity": [75, 155, 100, 100, 100] 500 ... } 501 ... ) 502 >>> trades 503 time ticker price quantity 504 0 2016-05-25 13:30:00.023 MSFT 51.95 75 505 1 2016-05-25 13:30:00.038 MSFT 51.95 155 506 2 2016-05-25 13:30:00.048 GOOG 720.77 100 507 3 2016-05-25 13:30:00.048 GOOG 720.92 100 508 4 2016-05-25 13:30:00.048 AAPL 98.00 100 509 510 By default we are taking the asof of the quotes 511 512 >>> pd.merge_asof(trades, quotes, on="time", by="ticker") 513 time ticker price quantity bid ask 514 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 515 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 516 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 517 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 518 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN 519 520 We only asof within 2ms between the quote time and the trade time 521 522 >>> pd.merge_asof( 523 ... trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms") 524 ... ) 525 time ticker price quantity bid ask 526 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 527 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN 528 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 529 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 530 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN 531 532 We only asof within 10ms between the quote time and the trade time 533 and we exclude exact matches on time. However *prior* data will 534 propagate forward 535 536 >>> pd.merge_asof( 537 ... trades, 538 ... quotes, 539 ... on="time", 540 ... by="ticker", 541 ... tolerance=pd.Timedelta("10ms"), 542 ... allow_exact_matches=False 543 ... ) 544 time ticker price quantity bid ask 545 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN 546 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 547 2 2016-05-25 13:30:00.048 GOOG 720.77 100 NaN NaN 548 3 2016-05-25 13:30:00.048 GOOG 720.92 100 NaN NaN 549 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN 550 """ 551 op = _AsOfMerge( 552 left, 553 right, 554 on=on, 555 left_on=left_on, 556 right_on=right_on, 557 left_index=left_index, 558 right_index=right_index, 559 by=by, 560 left_by=left_by, 561 right_by=right_by, 562 suffixes=suffixes, 563 how="asof", 564 tolerance=tolerance, 565 allow_exact_matches=allow_exact_matches, 566 direction=direction, 567 ) 568 return op.get_result() 569 570 571# TODO: transformations?? 
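
# Illustrative sketch (comment only, not executed): the public ``merge``
# entry point above simply builds a ``_MergeOperation`` and returns
# ``op.get_result()``. The frames below are made up for illustration:
#
#   >>> left = pd.DataFrame({"key": ["a", "b", "b"], "lval": [1, 2, 3]})
#   >>> right = pd.DataFrame({"key": ["b", "c"], "rval": [4, 5]})
#   >>> pd.merge(left, right, on="key", how="outer", indicator=True)
#
# which yields one "left_only" row for "a", two "both" rows for "b" and one
# "right_only" row for "c", with the match provenance recorded in the
# "_merge" column added by ``indicator=True``.
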
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"

    def __init__(
        self,
        left: FrameOrSeriesUnion,
        right: FrameOrSeriesUnion,
        how: str = "inner",
        on=None,
        left_on=None,
        right_on=None,
        axis=1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes=("_x", "_y"),
        copy: bool = True,
        indicator: bool = False,
        validate=None,
    ):
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis -> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)
        self.left_on = com.maybe_make_list(left_on)
        self.right_on = com.maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        self.indicator_name: Optional[str]
        if isinstance(self.indicator, str):
            self.indicator_name = self.indicator
        elif isinstance(self.indicator, bool):
            self.indicator_name = "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels can give an unintended "
                f"result ({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            warnings.warn(msg, UserWarning)

        self._validate_specification()

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
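        # Accepted spellings (checked in ``_validate`` below):
        # "one_to_one"/"1:1", "one_to_many"/"1:m", "many_to_one"/"m:1" and
        # "many_to_many"/"m:m"; anything else raises ValueError, while a
        # failed uniqueness check raises MergeError, e.g. with duplicate
        # keys on both sides of a hypothetical frame ``df_dup``:
        #   pd.merge(df_dup, df_dup, on="key", validate="one_to_one")
        #   -> MergeError: Merge keys are not unique in either left or right
        #      dataset; not a one-to-one merge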
677 if validate is not None: 678 self._validate(validate) 679 680 def get_result(self): 681 if self.indicator: 682 self.left, self.right = self._indicator_pre_merge(self.left, self.right) 683 684 join_index, left_indexer, right_indexer = self._get_join_info() 685 686 llabels, rlabels = _items_overlap_with_suffix( 687 self.left._info_axis, self.right._info_axis, self.suffixes 688 ) 689 690 lindexers = {1: left_indexer} if left_indexer is not None else {} 691 rindexers = {1: right_indexer} if right_indexer is not None else {} 692 693 result_data = concatenate_block_managers( 694 [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], 695 axes=[llabels.append(rlabels), join_index], 696 concat_axis=0, 697 copy=self.copy, 698 ) 699 700 typ = self.left._constructor 701 result = typ(result_data).__finalize__(self, method=self._merge_type) 702 703 if self.indicator: 704 result = self._indicator_post_merge(result) 705 706 self._maybe_add_join_keys(result, left_indexer, right_indexer) 707 708 self._maybe_restore_index_levels(result) 709 710 self._maybe_drop_cross_column(result, self._cross) 711 712 return result.__finalize__(self, method="merge") 713 714 def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]): 715 if cross_col is not None: 716 result.drop(columns=cross_col, inplace=True) 717 718 def _indicator_pre_merge( 719 self, left: "DataFrame", right: "DataFrame" 720 ) -> Tuple["DataFrame", "DataFrame"]: 721 722 columns = left.columns.union(right.columns) 723 724 for i in ["_left_indicator", "_right_indicator"]: 725 if i in columns: 726 raise ValueError( 727 "Cannot use `indicator=True` option when " 728 f"data contains a column named {i}" 729 ) 730 if self.indicator_name in columns: 731 raise ValueError( 732 "Cannot use name of an existing column for indicator column" 733 ) 734 735 left = left.copy() 736 right = right.copy() 737 738 left["_left_indicator"] = 1 739 left["_left_indicator"] = left["_left_indicator"].astype("int8") 740 741 right["_right_indicator"] = 2 742 right["_right_indicator"] = right["_right_indicator"].astype("int8") 743 744 return left, right 745 746 def _indicator_post_merge(self, result): 747 748 result["_left_indicator"] = result["_left_indicator"].fillna(0) 749 result["_right_indicator"] = result["_right_indicator"].fillna(0) 750 751 result[self.indicator_name] = Categorical( 752 (result["_left_indicator"] + result["_right_indicator"]), 753 categories=[1, 2, 3], 754 ) 755 result[self.indicator_name] = result[self.indicator_name].cat.rename_categories( 756 ["left_only", "right_only", "both"] 757 ) 758 759 result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1) 760 return result 761 762 def _maybe_restore_index_levels(self, result): 763 """ 764 Restore index levels specified as `on` parameters 765 766 Here we check for cases where `self.left_on` and `self.right_on` pairs 767 each reference an index level in their respective DataFrames. The 768 joined columns corresponding to these pairs are then restored to the 769 index of `result`. 770 771 **Note:** This method has side effects. 
It modifies `result` in-place 772 773 Parameters 774 ---------- 775 result: DataFrame 776 merge result 777 778 Returns 779 ------- 780 None 781 """ 782 names_to_restore = [] 783 for name, left_key, right_key in zip( 784 self.join_names, self.left_on, self.right_on 785 ): 786 if ( 787 self.orig_left._is_level_reference(left_key) 788 and self.orig_right._is_level_reference(right_key) 789 and name not in result.index.names 790 ): 791 792 names_to_restore.append(name) 793 794 if names_to_restore: 795 result.set_index(names_to_restore, inplace=True) 796 797 def _maybe_add_join_keys(self, result, left_indexer, right_indexer): 798 799 left_has_missing = None 800 right_has_missing = None 801 802 keys = zip(self.join_names, self.left_on, self.right_on) 803 for i, (name, lname, rname) in enumerate(keys): 804 if not _should_fill(lname, rname): 805 continue 806 807 take_left, take_right = None, None 808 809 if name in result: 810 811 if left_indexer is not None and right_indexer is not None: 812 if name in self.left: 813 814 if left_has_missing is None: 815 left_has_missing = (left_indexer == -1).any() 816 817 if left_has_missing: 818 take_right = self.right_join_keys[i] 819 820 if not is_dtype_equal( 821 result[name].dtype, self.left[name].dtype 822 ): 823 take_left = self.left[name]._values 824 825 elif name in self.right: 826 827 if right_has_missing is None: 828 right_has_missing = (right_indexer == -1).any() 829 830 if right_has_missing: 831 take_left = self.left_join_keys[i] 832 833 if not is_dtype_equal( 834 result[name].dtype, self.right[name].dtype 835 ): 836 take_right = self.right[name]._values 837 838 elif left_indexer is not None and is_array_like(self.left_join_keys[i]): 839 take_left = self.left_join_keys[i] 840 take_right = self.right_join_keys[i] 841 842 if take_left is not None or take_right is not None: 843 844 if take_left is None: 845 lvals = result[name]._values 846 else: 847 lfill = na_value_for_dtype(take_left.dtype) 848 lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill) 849 850 if take_right is None: 851 rvals = result[name]._values 852 else: 853 rfill = na_value_for_dtype(take_right.dtype) 854 rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill) 855 856 # if we have an all missing left_indexer 857 # make sure to just use the right values or vice-versa 858 mask_left = left_indexer == -1 859 mask_right = right_indexer == -1 860 if mask_left.all(): 861 key_col = rvals 862 elif right_indexer is not None and mask_right.all(): 863 key_col = lvals 864 else: 865 key_col = Index(lvals).where(~mask_left, rvals) 866 867 if result._is_label_reference(name): 868 result[name] = key_col 869 elif result._is_level_reference(name): 870 if isinstance(result.index, MultiIndex): 871 key_col.name = name 872 idx_list = [ 873 result.index.get_level_values(level_name) 874 if level_name != name 875 else key_col 876 for level_name in result.index.names 877 ] 878 879 result.set_index(idx_list, inplace=True) 880 else: 881 result.index = Index(key_col, name=name) 882 else: 883 result.insert(i, name or f"key_{i}", key_col) 884 885 def _get_join_indexers(self): 886 """ return the join indexers """ 887 return get_join_indexers( 888 self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how 889 ) 890 891 def _get_join_info(self): 892 left_ax = self.left.axes[self.axis] 893 right_ax = self.right.axes[self.axis] 894 895 if self.left_index and self.right_index and self.how != "asof": 896 join_index, left_indexer, right_indexer = left_ax.join( 897 right_ax, how=self.how, 
return_indexers=True, sort=self.sort 898 ) 899 elif self.right_index and self.how == "left": 900 join_index, left_indexer, right_indexer = _left_join_on_index( 901 left_ax, right_ax, self.left_join_keys, sort=self.sort 902 ) 903 904 elif self.left_index and self.how == "right": 905 join_index, right_indexer, left_indexer = _left_join_on_index( 906 right_ax, left_ax, self.right_join_keys, sort=self.sort 907 ) 908 else: 909 (left_indexer, right_indexer) = self._get_join_indexers() 910 911 if self.right_index: 912 if len(self.left) > 0: 913 join_index = self._create_join_index( 914 self.left.index, 915 self.right.index, 916 left_indexer, 917 right_indexer, 918 how="right", 919 ) 920 else: 921 join_index = self.right.index.take(right_indexer) 922 left_indexer = np.array([-1] * len(join_index)) 923 elif self.left_index: 924 if len(self.right) > 0: 925 join_index = self._create_join_index( 926 self.right.index, 927 self.left.index, 928 right_indexer, 929 left_indexer, 930 how="left", 931 ) 932 else: 933 join_index = self.left.index.take(left_indexer) 934 right_indexer = np.array([-1] * len(join_index)) 935 else: 936 join_index = Index(np.arange(len(left_indexer))) 937 938 if len(join_index) == 0: 939 join_index = join_index.astype(object) 940 return join_index, left_indexer, right_indexer 941 942 def _create_join_index( 943 self, 944 index: Index, 945 other_index: Index, 946 indexer, 947 other_indexer, 948 how: str = "left", 949 ): 950 """ 951 Create a join index by rearranging one index to match another 952 953 Parameters 954 ---------- 955 index: Index being rearranged 956 other_index: Index used to supply values not found in index 957 indexer: how to rearrange index 958 how: replacement is only necessary if indexer based on other_index 959 960 Returns 961 ------- 962 join_index 963 """ 964 if self.how in (how, "outer") and not isinstance(other_index, MultiIndex): 965 # if final index requires values in other_index but not target 966 # index, indexer may hold missing (-1) values, causing Index.take 967 # to take the final value in target index. So, we set the last 968 # element to be the desired fill value. We do not use allow_fill 969 # and fill_value because it throws a ValueError on integer indices 970 mask = indexer == -1 971 if np.any(mask): 972 fill_value = na_value_for_dtype(index.dtype, compat=False) 973 index = index.append(Index([fill_value])) 974 return index.take(indexer) 975 976 def _get_merge_keys(self): 977 """ 978 Note: has side effects (copy/delete key columns) 979 980 Parameters 981 ---------- 982 left 983 right 984 on 985 986 Returns 987 ------- 988 left_keys, right_keys 989 """ 990 left_keys = [] 991 right_keys = [] 992 # pandas\core\reshape\merge.py:966: error: Need type annotation for 993 # 'join_names' (hint: "join_names: List[<type>] = ...") 994 # [var-annotated] 995 join_names = [] # type: ignore[var-annotated] 996 right_drop = [] 997 left_drop = [] 998 999 left, right = self.left, self.right 1000 1001 is_lkey = lambda x: is_array_like(x) and len(x) == len(left) 1002 is_rkey = lambda x: is_array_like(x) and len(x) == len(right) 1003 1004 # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A 1005 # user could, for example, request 'left_index' and 'left_by'. In a 1006 # regular pd.merge(), users cannot specify both 'left_index' and 1007 # 'left_on'. (Instead, users have a MultiIndex). That means the 1008 # self.left_on in this function is always empty in a pd.merge(), but 1009 # a pd.merge_asof(left_index=True, left_by=...) 
will result in a 1010 # self.left_on array with a None in the middle of it. This requires 1011 # a work-around as designated in the code below. 1012 # See _validate_specification() for where this happens. 1013 1014 # ugh, spaghetti re #733 1015 if _any(self.left_on) and _any(self.right_on): 1016 for lk, rk in zip(self.left_on, self.right_on): 1017 if is_lkey(lk): 1018 left_keys.append(lk) 1019 if is_rkey(rk): 1020 right_keys.append(rk) 1021 join_names.append(None) # what to do? 1022 else: 1023 if rk is not None: 1024 right_keys.append(right._get_label_or_level_values(rk)) 1025 join_names.append(rk) 1026 else: 1027 # work-around for merge_asof(right_index=True) 1028 right_keys.append(right.index) 1029 join_names.append(right.index.name) 1030 else: 1031 if not is_rkey(rk): 1032 if rk is not None: 1033 right_keys.append(right._get_label_or_level_values(rk)) 1034 else: 1035 # work-around for merge_asof(right_index=True) 1036 right_keys.append(right.index) 1037 if lk is not None and lk == rk: 1038 # avoid key upcast in corner case (length-0) 1039 if len(left) > 0: 1040 right_drop.append(rk) 1041 else: 1042 left_drop.append(lk) 1043 else: 1044 right_keys.append(rk) 1045 if lk is not None: 1046 left_keys.append(left._get_label_or_level_values(lk)) 1047 join_names.append(lk) 1048 else: 1049 # work-around for merge_asof(left_index=True) 1050 left_keys.append(left.index) 1051 join_names.append(left.index.name) 1052 elif _any(self.left_on): 1053 for k in self.left_on: 1054 if is_lkey(k): 1055 left_keys.append(k) 1056 join_names.append(None) 1057 else: 1058 left_keys.append(left._get_label_or_level_values(k)) 1059 join_names.append(k) 1060 if isinstance(self.right.index, MultiIndex): 1061 right_keys = [ 1062 lev._values.take(lev_codes) 1063 for lev, lev_codes in zip( 1064 self.right.index.levels, self.right.index.codes 1065 ) 1066 ] 1067 else: 1068 right_keys = [self.right.index._values] 1069 elif _any(self.right_on): 1070 for k in self.right_on: 1071 if is_rkey(k): 1072 right_keys.append(k) 1073 join_names.append(None) 1074 else: 1075 right_keys.append(right._get_label_or_level_values(k)) 1076 join_names.append(k) 1077 if isinstance(self.left.index, MultiIndex): 1078 left_keys = [ 1079 lev._values.take(lev_codes) 1080 for lev, lev_codes in zip( 1081 self.left.index.levels, self.left.index.codes 1082 ) 1083 ] 1084 else: 1085 left_keys = [self.left.index._values] 1086 1087 if left_drop: 1088 self.left = self.left._drop_labels_or_levels(left_drop) 1089 1090 if right_drop: 1091 self.right = self.right._drop_labels_or_levels(right_drop) 1092 1093 return left_keys, right_keys, join_names 1094 1095 def _maybe_coerce_merge_keys(self): 1096 # we have valid merges but we may have to further 1097 # coerce these if they are originally incompatible types 1098 # 1099 # for example if these are categorical, but are not dtype_equal 1100 # or if we have object and integer dtypes 1101 1102 for lk, rk, name in zip( 1103 self.left_join_keys, self.right_join_keys, self.join_names 1104 ): 1105 if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): 1106 continue 1107 1108 lk_is_cat = is_categorical_dtype(lk.dtype) 1109 rk_is_cat = is_categorical_dtype(rk.dtype) 1110 lk_is_object = is_object_dtype(lk.dtype) 1111 rk_is_object = is_object_dtype(rk.dtype) 1112 1113 # if either left or right is a categorical 1114 # then the must match exactly in categories & ordered 1115 if lk_is_cat and rk_is_cat: 1116 if lk._categories_match_up_to_permutation(rk): 1117 continue 1118 1119 elif lk_is_cat or rk_is_cat: 1120 pass 1121 1122 
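            # identical dtypes (e.g. int64 vs int64) need no coercion at all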
elif is_dtype_equal(lk.dtype, rk.dtype): 1123 continue 1124 1125 msg = ( 1126 f"You are trying to merge on {lk.dtype} and " 1127 f"{rk.dtype} columns. If you wish to proceed you should use pd.concat" 1128 ) 1129 1130 # if we are numeric, then allow differing 1131 # kinds to proceed, eg. int64 and int8, int and float 1132 # further if we are object, but we infer to 1133 # the same, then proceed 1134 if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype): 1135 if lk.dtype.kind == rk.dtype.kind: 1136 continue 1137 1138 # check whether ints and floats 1139 elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype): 1140 if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all(): 1141 warnings.warn( 1142 "You are merging on int and float " 1143 "columns where the float values " 1144 "are not equal to their int representation", 1145 UserWarning, 1146 ) 1147 continue 1148 1149 elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype): 1150 if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all(): 1151 warnings.warn( 1152 "You are merging on int and float " 1153 "columns where the float values " 1154 "are not equal to their int representation", 1155 UserWarning, 1156 ) 1157 continue 1158 1159 # let's infer and see if we are ok 1160 elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype( 1161 rk, skipna=False 1162 ): 1163 continue 1164 1165 # Check if we are trying to merge on obviously 1166 # incompatible dtypes GH 9780, GH 15800 1167 1168 # bool values are coerced to object 1169 elif (lk_is_object and is_bool_dtype(rk.dtype)) or ( 1170 is_bool_dtype(lk.dtype) and rk_is_object 1171 ): 1172 pass 1173 1174 # object values are allowed to be merged 1175 elif (lk_is_object and is_numeric_dtype(rk.dtype)) or ( 1176 is_numeric_dtype(lk.dtype) and rk_is_object 1177 ): 1178 inferred_left = lib.infer_dtype(lk, skipna=False) 1179 inferred_right = lib.infer_dtype(rk, skipna=False) 1180 bool_types = ["integer", "mixed-integer", "boolean", "empty"] 1181 string_types = ["string", "unicode", "mixed", "bytes", "empty"] 1182 1183 # inferred bool 1184 if inferred_left in bool_types and inferred_right in bool_types: 1185 pass 1186 1187 # unless we are merging non-string-like with string-like 1188 elif ( 1189 inferred_left in string_types and inferred_right not in string_types 1190 ) or ( 1191 inferred_right in string_types and inferred_left not in string_types 1192 ): 1193 raise ValueError(msg) 1194 1195 # datetimelikes must match exactly 1196 elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype): 1197 raise ValueError(msg) 1198 elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype): 1199 raise ValueError(msg) 1200 elif is_datetime64tz_dtype(lk.dtype) and not is_datetime64tz_dtype( 1201 rk.dtype 1202 ): 1203 raise ValueError(msg) 1204 elif not is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype( 1205 rk.dtype 1206 ): 1207 raise ValueError(msg) 1208 1209 elif lk_is_object and rk_is_object: 1210 continue 1211 1212 # Houston, we have a problem! 1213 # let's coerce to object if the dtypes aren't 1214 # categorical, otherwise coerce to the category 1215 # dtype. If we coerced categories to object, 1216 # then we would lose type information on some 1217 # columns, and end up trying to merge 1218 # incompatible dtypes. See GH 16900. 
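            # concretely: a Categorical key is decategorized to its
            # categories' dtype (e.g. int64 categories -> int64), while an
            # incompatible non-categorical key is cast to object, as done in
            # the two assignments below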
1219 if name in self.left.columns: 1220 typ = lk.categories.dtype if lk_is_cat else object 1221 self.left = self.left.assign(**{name: self.left[name].astype(typ)}) 1222 if name in self.right.columns: 1223 typ = rk.categories.dtype if rk_is_cat else object 1224 self.right = self.right.assign(**{name: self.right[name].astype(typ)}) 1225 1226 def _create_cross_configuration( 1227 self, left, right 1228 ) -> Tuple["DataFrame", "DataFrame", str, str]: 1229 """ 1230 Creates the configuration to dispatch the cross operation to inner join, 1231 e.g. adding a join column and resetting parameters. Join column is added 1232 to a new object, no inplace modification 1233 1234 Parameters 1235 ---------- 1236 left: DataFrame 1237 right DataFrame 1238 1239 Returns 1240 ------- 1241 a tuple (left, right, how, cross_col) representing the adjusted 1242 DataFrames with cross_col, the merge operation set to inner and the column 1243 to join over. 1244 """ 1245 cross_col = f"_cross_{hashlib.md5().hexdigest()}" 1246 how = "inner" 1247 return ( 1248 left.assign(**{cross_col: 1}), 1249 right.assign(**{cross_col: 1}), 1250 how, 1251 cross_col, 1252 ) 1253 1254 def _validate_specification(self): 1255 if self.how == "cross": 1256 if ( 1257 self.left_index 1258 or self.right_index 1259 or self.right_on is not None 1260 or self.left_on is not None 1261 or self.on is not None 1262 ): 1263 raise MergeError( 1264 "Can not pass on, right_on, left_on or set right_index=True or " 1265 "left_index=True" 1266 ) 1267 return 1268 # Hm, any way to make this logic less complicated?? 1269 elif self.on is None and self.left_on is None and self.right_on is None: 1270 1271 if self.left_index and self.right_index: 1272 self.left_on, self.right_on = (), () 1273 elif self.left_index: 1274 raise MergeError("Must pass right_on or right_index=True") 1275 elif self.right_index: 1276 raise MergeError("Must pass left_on or left_index=True") 1277 else: 1278 # use the common columns 1279 left_cols = self.left.columns 1280 right_cols = self.right.columns 1281 common_cols = left_cols.intersection(right_cols) 1282 if len(common_cols) == 0: 1283 raise MergeError( 1284 "No common columns to perform merge on. " 1285 f"Merge options: left_on={self.left_on}, " 1286 f"right_on={self.right_on}, " 1287 f"left_index={self.left_index}, " 1288 f"right_index={self.right_index}" 1289 ) 1290 if ( 1291 not left_cols.join(common_cols, how="inner").is_unique 1292 or not right_cols.join(common_cols, how="inner").is_unique 1293 ): 1294 raise MergeError(f"Data columns not unique: {repr(common_cols)}") 1295 self.left_on = self.right_on = common_cols 1296 elif self.on is not None: 1297 if self.left_on is not None or self.right_on is not None: 1298 raise MergeError( 1299 'Can only pass argument "on" OR "left_on" ' 1300 'and "right_on", not a combination of both.' 1301 ) 1302 if self.left_index or self.right_index: 1303 raise MergeError( 1304 'Can only pass argument "on" OR "left_index" ' 1305 'and "right_index", not a combination of both.' 1306 ) 1307 self.left_on = self.right_on = self.on 1308 elif self.left_on is not None: 1309 if self.left_index: 1310 raise MergeError( 1311 'Can only pass argument "left_on" OR "left_index" not both.' 
1312 ) 1313 if not self.right_index and self.right_on is None: 1314 raise MergeError('Must pass "right_on" OR "right_index".') 1315 n = len(self.left_on) 1316 if self.right_index: 1317 if len(self.left_on) != self.right.index.nlevels: 1318 raise ValueError( 1319 "len(left_on) must equal the number " 1320 'of levels in the index of "right"' 1321 ) 1322 self.right_on = [None] * n 1323 elif self.right_on is not None: 1324 if self.right_index: 1325 raise MergeError( 1326 'Can only pass argument "right_on" OR "right_index" not both.' 1327 ) 1328 if not self.left_index and self.left_on is None: 1329 raise MergeError('Must pass "left_on" OR "left_index".') 1330 n = len(self.right_on) 1331 if self.left_index: 1332 if len(self.right_on) != self.left.index.nlevels: 1333 raise ValueError( 1334 "len(right_on) must equal the number " 1335 'of levels in the index of "left"' 1336 ) 1337 self.left_on = [None] * n 1338 if self.how != "cross" and len(self.right_on) != len(self.left_on): 1339 raise ValueError("len(right_on) must equal len(left_on)") 1340 1341 def _validate(self, validate: str): 1342 1343 # Check uniqueness of each 1344 if self.left_index: 1345 left_unique = self.orig_left.index.is_unique 1346 else: 1347 left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique 1348 1349 if self.right_index: 1350 right_unique = self.orig_right.index.is_unique 1351 else: 1352 right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique 1353 1354 # Check data integrity 1355 if validate in ["one_to_one", "1:1"]: 1356 if not left_unique and not right_unique: 1357 raise MergeError( 1358 "Merge keys are not unique in either left " 1359 "or right dataset; not a one-to-one merge" 1360 ) 1361 elif not left_unique: 1362 raise MergeError( 1363 "Merge keys are not unique in left dataset; not a one-to-one merge" 1364 ) 1365 elif not right_unique: 1366 raise MergeError( 1367 "Merge keys are not unique in right dataset; not a one-to-one merge" 1368 ) 1369 1370 elif validate in ["one_to_many", "1:m"]: 1371 if not left_unique: 1372 raise MergeError( 1373 "Merge keys are not unique in left dataset; not a one-to-many merge" 1374 ) 1375 1376 elif validate in ["many_to_one", "m:1"]: 1377 if not right_unique: 1378 raise MergeError( 1379 "Merge keys are not unique in right dataset; " 1380 "not a many-to-one merge" 1381 ) 1382 1383 elif validate in ["many_to_many", "m:m"]: 1384 pass 1385 1386 else: 1387 raise ValueError("Not a valid argument for validate") 1388 1389 1390def get_join_indexers( 1391 left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs 1392): 1393 """ 1394 1395 Parameters 1396 ---------- 1397 left_keys: ndarray, Index, Series 1398 right_keys: ndarray, Index, Series 1399 sort: bool, default False 1400 how: string {'inner', 'outer', 'left', 'right'}, default 'inner' 1401 1402 Returns 1403 ------- 1404 tuple of (left_indexer, right_indexer) 1405 indexers into the left_keys, right_keys 1406 1407 """ 1408 assert len(left_keys) == len( 1409 right_keys 1410 ), "left_key and right_keys must be the same length" 1411 1412 # get left & right join labels and num. of levels at each location 1413 mapped = ( 1414 _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how) 1415 for n in range(len(left_keys)) 1416 ) 1417 zipped = zip(*mapped) 1418 llab, rlab, shape = [list(x) for x in zipped] 1419 1420 # get flat i8 keys from label lists 1421 lkey, rkey = _get_join_keys(llab, rlab, shape, sort) 1422 1423 # factorize keys to a dense i8 space 1424 # `count` is the num. 
of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = copy.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    return join_func(lkey, rkey, count, **kwargs)


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index,
    lindexer,
    rindexer,
):
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array
        left indexer
    rindexer : intp array
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : intp array
        labels of combined multiindexes
    names : str array
        names of combined multiindexes

    """

    def _convert_to_multiindex(index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Ensure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # lindexer and rindexer hold the indexes where the join occurred
    # for left and right respectively.
If left/right is None then 1506 # the join occurred on all indices of left/right 1507 if lindexer is None: 1508 lindexer = range(left.size) 1509 1510 if rindexer is None: 1511 rindexer = range(right.size) 1512 1513 # Iterate through the levels that must be restored 1514 for dropped_level_name in dropped_level_names: 1515 if dropped_level_name in left.names: 1516 idx = left 1517 indexer = lindexer 1518 else: 1519 idx = right 1520 indexer = rindexer 1521 1522 # The index of the level name to be restored 1523 name_idx = idx.names.index(dropped_level_name) 1524 1525 restore_levels = idx.levels[name_idx] 1526 # Inject -1 in the codes list where a join was not possible 1527 # IOW indexer[i]=-1 1528 codes = idx.codes[name_idx] 1529 restore_codes = algos.take_nd(codes, indexer, fill_value=-1) 1530 1531 join_levels = join_levels + [restore_levels] 1532 join_codes = join_codes + [restore_codes] 1533 join_names = join_names + [dropped_level_name] 1534 1535 return join_levels, join_codes, join_names 1536 1537 1538class _OrderedMerge(_MergeOperation): 1539 _merge_type = "ordered_merge" 1540 1541 def __init__( 1542 self, 1543 left, 1544 right, 1545 on=None, 1546 left_on=None, 1547 right_on=None, 1548 left_index: bool = False, 1549 right_index: bool = False, 1550 axis=1, 1551 suffixes=("_x", "_y"), 1552 copy: bool = True, 1553 fill_method=None, 1554 how: str = "outer", 1555 ): 1556 1557 self.fill_method = fill_method 1558 _MergeOperation.__init__( 1559 self, 1560 left, 1561 right, 1562 on=on, 1563 left_on=left_on, 1564 left_index=left_index, 1565 right_index=right_index, 1566 right_on=right_on, 1567 axis=axis, 1568 how=how, 1569 suffixes=suffixes, 1570 sort=True, # factorize sorts 1571 ) 1572 1573 def get_result(self): 1574 join_index, left_indexer, right_indexer = self._get_join_info() 1575 1576 llabels, rlabels = _items_overlap_with_suffix( 1577 self.left._info_axis, self.right._info_axis, self.suffixes 1578 ) 1579 1580 if self.fill_method == "ffill": 1581 left_join_indexer = libjoin.ffill_indexer(left_indexer) 1582 right_join_indexer = libjoin.ffill_indexer(right_indexer) 1583 else: 1584 left_join_indexer = left_indexer 1585 right_join_indexer = right_indexer 1586 1587 lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} 1588 rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} 1589 1590 result_data = concatenate_block_managers( 1591 [(self.left._mgr, lindexers), (self.right._mgr, rindexers)], 1592 axes=[llabels.append(rlabels), join_index], 1593 concat_axis=0, 1594 copy=self.copy, 1595 ) 1596 1597 typ = self.left._constructor 1598 result = typ(result_data) 1599 1600 self._maybe_add_join_keys(result, left_indexer, right_indexer) 1601 1602 return result 1603 1604 1605def _asof_function(direction: str): 1606 name = f"asof_join_{direction}" 1607 return getattr(libjoin, name, None) 1608 1609 1610def _asof_by_function(direction: str): 1611 name = f"asof_join_{direction}_on_X_by_Y" 1612 return getattr(libjoin, name, None) 1613 1614 1615_type_casters = { 1616 "int64_t": ensure_int64, 1617 "double": ensure_float64, 1618 "object": ensure_object, 1619} 1620 1621 1622def _get_cython_type_upcast(dtype): 1623 """ Upcast a dtype to 'int64_t', 'double', or 'object' """ 1624 if is_integer_dtype(dtype): 1625 return "int64_t" 1626 elif is_float_dtype(dtype): 1627 return "double" 1628 else: 1629 return "object" 1630 1631 1632class _AsOfMerge(_OrderedMerge): 1633 _merge_type = "asof_merge" 1634 1635 def __init__( 1636 self, 1637 left, 1638 right, 1639 on=None, 1640 
left_on=None, 1641 right_on=None, 1642 left_index: bool = False, 1643 right_index: bool = False, 1644 by=None, 1645 left_by=None, 1646 right_by=None, 1647 axis=1, 1648 suffixes=("_x", "_y"), 1649 copy: bool = True, 1650 fill_method=None, 1651 how: str = "asof", 1652 tolerance=None, 1653 allow_exact_matches: bool = True, 1654 direction: str = "backward", 1655 ): 1656 1657 self.by = by 1658 self.left_by = left_by 1659 self.right_by = right_by 1660 self.tolerance = tolerance 1661 self.allow_exact_matches = allow_exact_matches 1662 self.direction = direction 1663 1664 _OrderedMerge.__init__( 1665 self, 1666 left, 1667 right, 1668 on=on, 1669 left_on=left_on, 1670 right_on=right_on, 1671 left_index=left_index, 1672 right_index=right_index, 1673 axis=axis, 1674 how=how, 1675 suffixes=suffixes, 1676 fill_method=fill_method, 1677 ) 1678 1679 def _validate_specification(self): 1680 super()._validate_specification() 1681 1682 # we only allow on to be a single item for on 1683 if len(self.left_on) != 1 and not self.left_index: 1684 raise MergeError("can only asof on a key for left") 1685 1686 if len(self.right_on) != 1 and not self.right_index: 1687 raise MergeError("can only asof on a key for right") 1688 1689 if self.left_index and isinstance(self.left.index, MultiIndex): 1690 raise MergeError("left can only have one index") 1691 1692 if self.right_index and isinstance(self.right.index, MultiIndex): 1693 raise MergeError("right can only have one index") 1694 1695 # set 'by' columns 1696 if self.by is not None: 1697 if self.left_by is not None or self.right_by is not None: 1698 raise MergeError("Can only pass by OR left_by and right_by") 1699 self.left_by = self.right_by = self.by 1700 if self.left_by is None and self.right_by is not None: 1701 raise MergeError("missing left_by") 1702 if self.left_by is not None and self.right_by is None: 1703 raise MergeError("missing right_by") 1704 1705 # add 'by' to our key-list so we can have it in the 1706 # output as a key 1707 if self.left_by is not None: 1708 if not is_list_like(self.left_by): 1709 self.left_by = [self.left_by] 1710 if not is_list_like(self.right_by): 1711 self.right_by = [self.right_by] 1712 1713 if len(self.left_by) != len(self.right_by): 1714 raise MergeError("left_by and right_by must be same length") 1715 1716 self.left_on = self.left_by + list(self.left_on) 1717 self.right_on = self.right_by + list(self.right_on) 1718 1719 # check 'direction' is valid 1720 if self.direction not in ["backward", "forward", "nearest"]: 1721 raise MergeError(f"direction invalid: {self.direction}") 1722 1723 def _get_merge_keys(self): 1724 1725 # note this function has side effects 1726 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys() 1727 1728 # validate index types are the same 1729 for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): 1730 if not is_dtype_equal(lk.dtype, rk.dtype): 1731 if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype): 1732 # The generic error message is confusing for categoricals. 1733 # 1734 # In this function, the join keys include both the original 1735 # ones of the merge_asof() call, and also the keys passed 1736 # to its by= argument. Unordered but equal categories 1737 # are not supported for the former, but will fail 1738 # later with a ValueError, so we don't *need* to check 1739 # for them here. 
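                    # e.g. an ordered vs. an unordered categorical key, or
                    # two categorical keys with different category sets:
                    # their dtypes compare unequal, so we raise here with the
                    # category-specific message below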
1740 msg = ( 1741 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 1742 f"{repr(rk.dtype)}, both sides category, but not equal ones" 1743 ) 1744 else: 1745 msg = ( 1746 f"incompatible merge keys [{i}] {repr(lk.dtype)} and " 1747 f"{repr(rk.dtype)}, must be the same type" 1748 ) 1749 raise MergeError(msg) 1750 1751 # validate tolerance; datetime.timedelta or Timedelta if we have a DTI 1752 if self.tolerance is not None: 1753 1754 if self.left_index: 1755 lt = self.left.index 1756 else: 1757 lt = left_join_keys[-1] 1758 1759 msg = ( 1760 f"incompatible tolerance {self.tolerance}, must be compat " 1761 f"with type {repr(lt.dtype)}" 1762 ) 1763 1764 if needs_i8_conversion(lt): 1765 if not isinstance(self.tolerance, datetime.timedelta): 1766 raise MergeError(msg) 1767 if self.tolerance < Timedelta(0): 1768 raise MergeError("tolerance must be positive") 1769 1770 elif is_integer_dtype(lt): 1771 if not is_integer(self.tolerance): 1772 raise MergeError(msg) 1773 if self.tolerance < 0: 1774 raise MergeError("tolerance must be positive") 1775 1776 elif is_float_dtype(lt): 1777 if not is_number(self.tolerance): 1778 raise MergeError(msg) 1779 if self.tolerance < 0: 1780 raise MergeError("tolerance must be positive") 1781 1782 else: 1783 raise MergeError("key must be integer, timestamp or float") 1784 1785 # validate allow_exact_matches 1786 if not is_bool(self.allow_exact_matches): 1787 msg = ( 1788 "allow_exact_matches must be boolean, " 1789 f"passed {self.allow_exact_matches}" 1790 ) 1791 raise MergeError(msg) 1792 1793 return left_join_keys, right_join_keys, join_names 1794 1795 def _get_join_indexers(self): 1796 """ return the join indexers """ 1797 1798 def flip(xs) -> np.ndarray: 1799 """ unlike np.transpose, this returns an array of tuples """ 1800 xs = [ 1801 x 1802 if not is_extension_array_dtype(x) 1803 else extract_array(x)._values_for_argsort() 1804 for x in xs 1805 ] 1806 labels = list(string.ascii_lowercase[: len(xs)]) 1807 dtypes = [x.dtype for x in xs] 1808 labeled_dtypes = list(zip(labels, dtypes)) 1809 return np.array(list(zip(*xs)), labeled_dtypes) 1810 1811 # values to compare 1812 left_values = ( 1813 self.left.index._values if self.left_index else self.left_join_keys[-1] 1814 ) 1815 right_values = ( 1816 self.right.index._values if self.right_index else self.right_join_keys[-1] 1817 ) 1818 tolerance = self.tolerance 1819 1820 # we require sortedness and non-null values in the join keys 1821 if not Index(left_values).is_monotonic: 1822 side = "left" 1823 if isna(left_values).any(): 1824 raise ValueError(f"Merge keys contain null values on {side} side") 1825 else: 1826 raise ValueError(f"{side} keys must be sorted") 1827 1828 if not Index(right_values).is_monotonic: 1829 side = "right" 1830 if isna(right_values).any(): 1831 raise ValueError(f"Merge keys contain null values on {side} side") 1832 else: 1833 raise ValueError(f"{side} keys must be sorted") 1834 1835 # initial type conversion as needed 1836 if needs_i8_conversion(left_values): 1837 left_values = left_values.view("i8") 1838 right_values = right_values.view("i8") 1839 if tolerance is not None: 1840 tolerance = Timedelta(tolerance) 1841 tolerance = tolerance.value 1842 1843 # a "by" parameter requires special handling 1844 if self.left_by is not None: 1845 # remove 'on' parameter from values if one existed 1846 if self.left_index and self.right_index: 1847 left_by_values = self.left_join_keys 1848 right_by_values = self.right_join_keys 1849 else: 1850 left_by_values = self.left_join_keys[0:-1] 1851 right_by_values 
= self.right_join_keys[0:-1] 1852 1853 # get tuple representation of values if more than one 1854 if len(left_by_values) == 1: 1855 left_by_values = left_by_values[0] 1856 right_by_values = right_by_values[0] 1857 else: 1858 left_by_values = flip(left_by_values) 1859 right_by_values = flip(right_by_values) 1860 1861 # upcast 'by' parameter because HashTable is limited 1862 by_type = _get_cython_type_upcast(left_by_values.dtype) 1863 by_type_caster = _type_casters[by_type] 1864 left_by_values = by_type_caster(left_by_values) 1865 right_by_values = by_type_caster(right_by_values) 1866 1867 # choose appropriate function by type 1868 func = _asof_by_function(self.direction) 1869 return func( 1870 left_values, 1871 right_values, 1872 left_by_values, 1873 right_by_values, 1874 self.allow_exact_matches, 1875 tolerance, 1876 ) 1877 else: 1878 # choose appropriate function by type 1879 func = _asof_function(self.direction) 1880 return func(left_values, right_values, self.allow_exact_matches, tolerance) 1881 1882 1883def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool): 1884 1885 # left & right join labels and num. of levels at each location 1886 mapped = ( 1887 _factorize_keys(index.levels[n], join_keys[n], sort=sort) 1888 for n in range(index.nlevels) 1889 ) 1890 zipped = zip(*mapped) 1891 rcodes, lcodes, shape = [list(x) for x in zipped] 1892 if sort: 1893 rcodes = list(map(np.take, rcodes, index.codes)) 1894 else: 1895 i8copy = lambda a: a.astype("i8", subok=False, copy=True) 1896 rcodes = list(map(i8copy, index.codes)) 1897 1898 # fix right labels if there were any nulls 1899 for i in range(len(join_keys)): 1900 mask = index.codes[i] == -1 1901 if mask.any(): 1902 # check if there already was any nulls at this location 1903 # if there was, it is factorized to `shape[i] - 1` 1904 a = join_keys[i][lcodes[i] == shape[i] - 1] 1905 if a.size == 0 or not a[0] != a[0]: 1906 shape[i] += 1 1907 1908 rcodes[i][mask] = shape[i] - 1 1909 1910 # get flat i8 join keys 1911 lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort) 1912 1913 # factorize keys to a dense i8 space 1914 lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) 1915 1916 return libjoin.left_outer_join(lkey, rkey, count, sort=sort) 1917 1918 1919def _get_single_indexer(join_key, index, sort: bool = False): 1920 left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) 1921 1922 left_indexer, right_indexer = libjoin.left_outer_join( 1923 ensure_int64(left_key), ensure_int64(right_key), count, sort=sort 1924 ) 1925 1926 return left_indexer, right_indexer 1927 1928 1929def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): 1930 if len(join_keys) > 1: 1931 if not ( 1932 isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels 1933 ): 1934 raise AssertionError( 1935 "If more than one join key is given then " 1936 "'right_ax' must be a MultiIndex and the " 1937 "number of join keys must be the number of levels in right_ax" 1938 ) 1939 1940 left_indexer, right_indexer = _get_multiindex_indexer( 1941 join_keys, right_ax, sort=sort 1942 ) 1943 else: 1944 jkey = join_keys[0] 1945 1946 left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort) 1947 1948 if sort or len(left_ax) != len(left_indexer): 1949 # if asked to sort or there are 1-to-many matches 1950 join_index = left_ax.take(left_indexer) 1951 return join_index, left_indexer, right_indexer 1952 1953 # left frame preserves order & length of its index 1954 return left_ax, None, 


def _factorize_keys(
    lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> Tuple[np.ndarray, np.ndarray, int]:
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    array
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    array
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
    # Some pre-processing for non-ndarray lk / rk
    lk = extract_array(lk, extract_numpy=True)
    rk = extract_array(rk, extract_numpy=True)

    if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype):
        # Extract the ndarray (UTC-localized) values
        # Note: we don't need the dtypes to match, as these can still be compared
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray

    elif (
        is_categorical_dtype(lk.dtype)
        and is_categorical_dtype(rk.dtype)
        and is_dtype_equal(lk.dtype, rk.dtype)
    ):
        assert isinstance(lk, Categorical)
        assert isinstance(rk, Categorical)
        # Cast rk to encoding so we can compare codes with lk
        rk = lk._encode_with_my_categories(rk)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk.codes)

    elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        lk, _ = lk._values_for_factorize()
        rk, _ = rk._values_for_factorize()

    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        # and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk))
        rk = ensure_int64(np.asarray(rk))

    elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk, dtype=np.int64))
        rk = ensure_int64(np.asarray(rk, dtype=np.int64))

    else:
        klass = libhashtable.Factorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    if how == "right":
        return rlab, llab, count
    return llab, rlab, count
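# Missing keys get an explicit trailing code: the factorizer marks them -1, and
# the NA-group block above remaps -1 to ``count`` (bumping ``count``), so nulls
# on the two sides can still be matched against each other. A sketch with
# illustrative values (private helper, output shown for orientation only):
#
#   >>> lk = np.array([1.0, np.nan, 3.0])
#   >>> rk = np.array([np.nan, 1.0])
#   >>> _factorize_keys(lk, rk)
#   (array([0, 2, 1]), array([2, 0]), 3)
#
# 1.0 -> 0 and 3.0 -> 1 after sorting, NaN takes the extra code 2, and count is
# bumped from 2 to 3.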


def _sort_labels(uniques: np.ndarray, left, right):

    llength = len(left)
    labels = np.concatenate([left, right])

    _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
    new_labels = ensure_int64(new_labels)
    new_left, new_right = new_labels[:llength], new_labels[llength:]

    return new_left, new_right


def _get_join_keys(llab, rlab, shape, sort: bool):

    # how many levels can be done without overflow
    nlev = next(
        lev
        for lev in range(len(shape), 0, -1)
        if not is_int64_overflow_possible(shape[:lev])
    )

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = stride * llab[0].astype("i8", subok=False, copy=False)
    rkey = stride * rlab[0].astype("i8", subok=False, copy=False)

    for i in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
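# _get_join_keys packs per-level label arrays into a single int64 key per row
# with a mixed-radix encoding: level 0 is scaled by prod(shape[1:]), and each
# later level divides the stride down. A small worked sketch (private helper,
# illustrative values):
#
#   >>> llab = [np.array([0, 1, 1], dtype="i8"), np.array([0, 0, 2], dtype="i8")]
#   >>> rlab = [np.array([1], dtype="i8"), np.array([2], dtype="i8")]
#   >>> shape = [2, 3]
#   >>> _get_join_keys(llab, rlab, shape, sort=False)
#   (array([0, 3, 5]), array([5]))
#
# Each key is ``label0 * 3 + label1``, so equal multi-level keys collapse to
# equal scalars; when prod(shape) could overflow int64, the partial keys are
# factorized to a dense range and the remaining levels are folded in
# recursively.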


def _should_fill(lname, rname) -> bool:
    if not isinstance(lname, str) or not isinstance(rname, str):
        return True
    return lname == rname


def _any(x) -> bool:
    return x is not None and com.any_not_none(*x)


def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
    if isinstance(obj, ABCDataFrame):
        return obj
    elif isinstance(obj, ABCSeries):
        if obj.name is None:
            raise ValueError("Cannot merge a Series without a name")
        else:
            return obj.to_frame()
    else:
        raise TypeError(
            f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
        )


def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]):
    """
    Suffixes type validation.

    If two indices overlap, add suffixes to overlapping entries.

    If the corresponding suffix is an empty string, the entry is simply
    converted to string.
    """
    if not is_list_like(suffixes, allow_sets=False):
        warnings.warn(
            f"Passing 'suffixes' as a {type(suffixes)} is not supported and may give "
            "unexpected results. Provide 'suffixes' as a tuple instead. In the "
            "future a 'TypeError' will be raised.",
            FutureWarning,
            stacklevel=4,
        )

    to_rename = left.intersection(right)
    if len(to_rename) == 0:
        return left, right

    lsuffix, rsuffix = suffixes

    if not lsuffix and not rsuffix:
        raise ValueError(f"columns overlap but no suffix specified: {to_rename}")

    def renamer(x, suffix):
        """
        Rename the left and right indices.

        If there is overlap and the suffix is not None, add the suffix;
        otherwise, leave the name as-is.

        Parameters
        ----------
        x : original column name
        suffix : str or None

        Returns
        -------
        x : renamed column name
        """
        if x in to_rename and suffix is not None:
            return f"{x}{suffix}"
        return x

    lrenamer = partial(renamer, suffix=lsuffix)
    rrenamer = partial(renamer, suffix=rsuffix)

    return (left._transform_index(lrenamer), right._transform_index(rrenamer))
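# Suffix handling as it surfaces in the public API (illustrative, not part of
# the original module):
#
#   >>> left = pd.DataFrame({"key": [1, 2], "val": [10, 20]})
#   >>> right = pd.DataFrame({"key": [1, 2], "val": [30, 40]})
#   >>> pd.merge(left, right, on="key").columns.tolist()
#   ['key', 'val_x', 'val_y']
#   >>> pd.merge(left, right, on="key", suffixes=("_l", None)).columns.tolist()
#   ['key', 'val_l', 'val']
#
# A None suffix leaves that side's overlapping name untouched; if both suffixes
# are falsy, the ValueError above is raised, and a non-list-like ``suffixes``
# currently only warns but is slated to raise a TypeError in a future version.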