1"""
2SQL-style merge routines
3"""
4
5import copy
6import datetime
7from functools import partial
8import hashlib
9import string
10from typing import TYPE_CHECKING, Optional, Tuple, cast
11import warnings
12
13import numpy as np
14
15from pandas._libs import Timedelta, hashtable as libhashtable, join as libjoin, lib
16from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion
17from pandas.errors import MergeError
18from pandas.util._decorators import Appender, Substitution
19
20from pandas.core.dtypes.common import (
21    ensure_float64,
22    ensure_int64,
23    ensure_object,
24    is_array_like,
25    is_bool,
26    is_bool_dtype,
27    is_categorical_dtype,
28    is_datetime64tz_dtype,
29    is_dtype_equal,
30    is_extension_array_dtype,
31    is_float_dtype,
32    is_integer,
33    is_integer_dtype,
34    is_list_like,
35    is_number,
36    is_numeric_dtype,
37    is_object_dtype,
38    needs_i8_conversion,
39)
40from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
41from pandas.core.dtypes.missing import isna, na_value_for_dtype
42
43from pandas import Categorical, Index, MultiIndex
44from pandas.core import groupby
45import pandas.core.algorithms as algos
46import pandas.core.common as com
47from pandas.core.construction import extract_array
48from pandas.core.frame import _merge_doc
49from pandas.core.internals import concatenate_block_managers
50from pandas.core.sorting import is_int64_overflow_possible
51
52if TYPE_CHECKING:
53    from pandas import DataFrame
54    from pandas.core.arrays import DatetimeArray
55
56
57@Substitution("\nleft : DataFrame")
58@Appender(_merge_doc, indents=0)
59def merge(
60    left,
61    right,
62    how: str = "inner",
63    on=None,
64    left_on=None,
65    right_on=None,
66    left_index: bool = False,
67    right_index: bool = False,
68    sort: bool = False,
69    suffixes=("_x", "_y"),
70    copy: bool = True,
71    indicator: bool = False,
72    validate=None,
73) -> "DataFrame":
74    op = _MergeOperation(
75        left,
76        right,
77        how=how,
78        on=on,
79        left_on=left_on,
80        right_on=right_on,
81        left_index=left_index,
82        right_index=right_index,
83        sort=sort,
84        suffixes=suffixes,
85        copy=copy,
86        indicator=indicator,
87        validate=validate,
88    )
89    return op.get_result()
90
91
92if __debug__:
93    merge.__doc__ = _merge_doc % "\nleft : DataFrame"
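
# Illustrative usage sketch (assumes pandas imported as pd; not part of this
# module's code path):
#
#     df1 = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
#     df2 = pd.DataFrame({"key": ["b", "c"], "y": [3, 4]})
#     pd.merge(df1, df2, on="key", how="inner")   # -> one row, key == "b"
#
# since an inner join keeps only the intersection of the key sets.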


def _groupby_and_merge(by, on, left: "DataFrame", right: "DataFrame", merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    on: duplicates field
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: Optional[groupby.DataFrameGroupBy] = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby:

        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in right
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
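
# Illustrative sketch of the dispatch above (hypothetical names): for
# merge_ordered(df1, df2, left_by="group"), _groupby_and_merge conceptually
# evaluates
#
#     pieces = [merge_pieces(lhs, rhs) for key, lhs in left.groupby("group")]
#     result = concat(pieces, ignore_index=True)
#
# where rhs is either the whole right frame, or just the rows of right whose
# "group" value matches key when right also carries the group columns.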


def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> "DataFrame":
    """
    Perform merge with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.

        .. versionchanged:: 0.25.0
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y):
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    elif left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(
            left_by, on, left, right, lambda x, y: _merger(x, y)
        )
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, on, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result


def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> "DataFrame":
    """
    Perform an asof merge.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward", which was the only behavior available in
    versions below 0.20.0. The direction parameter was added in version
    0.20.0 and introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    merged : DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example.

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...                "GOOG",
    ...                "MSFT",
    ...                "MSFT",
    ...                "MSFT",
    ...                "GOOG",
    ...                "AAPL",
    ...                "GOOG",
    ...                "MSFT"
    ...            ],
    ...            "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...            "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...        {
    ...            "time": [
    ...                pd.Timestamp("2016-05-25 13:30:00.023"),
    ...                pd.Timestamp("2016-05-25 13:30:00.038"),
    ...                pd.Timestamp("2016-05-25 13:30:00.048"),
    ...                pd.Timestamp("2016-05-25 13:30:00.048"),
    ...                pd.Timestamp("2016-05-25 13:30:00.048")
    ...            ],
    ...            "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...            "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...            "quantity": [75, 155, 100, 100, 100]
    ...        }
    ...    )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75     NaN     NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100     NaN     NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100     NaN     NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()


# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"

    def __init__(
        self,
        left: FrameOrSeriesUnion,
        right: FrameOrSeriesUnion,
        how: str = "inner",
        on=None,
        left_on=None,
        right_on=None,
        axis=1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes=("_x", "_y"),
        copy: bool = True,
        indicator: bool = False,
        validate=None,
    ):
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis -> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)
        self.left_on = com.maybe_make_list(left_on)
        self.right_on = com.maybe_make_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        self.indicator_name: Optional[str]
        if isinstance(self.indicator, str):
            self.indicator_name = self.indicator
        elif isinstance(self.indicator, bool):
            self.indicator_name = "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # warn user when merging between different levels
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "merging between different levels can give an unintended "
                f"result ({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            warnings.warn(msg, UserWarning)

        self._validate_specification()

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)

    def get_result(self):
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        lindexers = {1: left_indexer} if left_indexer is not None else {}
        rindexers = {1: right_indexer} if right_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        typ = self.left._constructor
        result = typ(result_data).__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(self, result: "DataFrame", cross_col: Optional[str]):
        if cross_col is not None:
            result.drop(columns=cross_col, inplace=True)

    def _indicator_pre_merge(
        self, left: "DataFrame", right: "DataFrame"
    ) -> Tuple["DataFrame", "DataFrame"]:

        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self.indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right

    def _indicator_post_merge(self, result):

        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self.indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self.indicator_name] = result[self.indicator_name].cat.rename_categories(
            ["left_only", "right_only", "both"]
        )

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result
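
    # Worked example of the indicator encoding above (illustrative): a row
    # present only in the left frame has _left_indicator == 1 and a missing
    # _right_indicator filled to 0, so 1 + 0 == 1 -> "left_only"; a
    # right-only row gives 0 + 2 == 2 -> "right_only"; a matched row gives
    # 1 + 2 == 3 -> "both". rename_categories then maps the codes [1, 2, 3]
    # to those labels.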

    def _maybe_restore_index_levels(self, result):
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                self.orig_left._is_level_reference(left_key)
                and self.orig_right._is_level_reference(right_key)
                and name not in result.index.names
            ):

                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):

        left_has_missing = None
        right_has_missing = None

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:

                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:

                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:

                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None and is_array_like(self.left_join_keys[i]):
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:

                if take_left is None:
                    lvals = result[name]._values
                else:
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_1d(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    rfill = na_value_for_dtype(take_right.dtype)
                    rvals = algos.take_1d(take_right, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                mask_right = right_indexer == -1
                if mask_left.all():
                    key_col = rvals
                elif right_indexer is not None and mask_right.all():
                    key_col = lvals
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)

                if result._is_label_reference(name):
                    result[name] = key_col
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self):
        """Return the join indexers."""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )

    def _get_join_info(self):
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )
        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        right_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
                    left_indexer = np.array([-1] * len(join_index))
            elif self.left_index:
                if len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        left_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
                    right_indexer = np.array([-1] * len(join_index))
            else:
                join_index = Index(np.arange(len(left_indexer)))

        if len(join_index) == 0:
            join_index = join_index.astype(object)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer,
        other_indexer,
        how: str = "left",
    ):
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index: Index being rearranged
        other_index: Index used to supply values not found in index
        indexer: how to rearrange index
        other_indexer: how to rearrange other_index (currently unused)
        how: replacement is only necessary if indexer based on other_index

        Returns
        -------
        join_index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
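            #
            # Illustrative example (assumed values): with index = Index([10, 20])
            # and indexer = [1, -1], a bare index.take(indexer) would map -1 to
            # the last element and return [20, 20]; appending the fill value
            # first makes take(indexer) return [20, nan] instead.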
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)

    def _get_merge_keys(self):
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys
        """
        left_keys = []
        right_keys = []
        # pandas\core\reshape\merge.py:966: error: Need type annotation for
        # 'join_names' (hint: "join_names: List[<type>] = ...")
        # [var-annotated]
        join_names = []  # type: ignore[var-annotated]
        right_drop = []
        left_drop = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_specification() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    left_keys.append(lk)
                    if is_rkey(rk):
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:
                            # avoid key upcast in corner case (length-0)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        right_keys.append(rk)
                    if lk is not None:
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names

    def _maybe_coerce_merge_keys(self):
        # we have valid merges but we may have to further
        # coerce these if they are originally incompatible types
        #
        # for example if these are categorical, but are not dtype_equal
        # or if we have object and integer dtypes
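        #
        # e.g. (illustrative) merging an int64 key [1, 2] against a float64
        # key [1.0, 2.5] is allowed but warns, because 2.5 has no exact
        # integer representation, while merging a datetime64 key against an
        # int64 key raises.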

        for lk, rk, name in zip(
            self.left_join_keys, self.right_join_keys, self.join_names
        ):
            if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
                continue

            lk_is_cat = is_categorical_dtype(lk.dtype)
            rk_is_cat = is_categorical_dtype(rk.dtype)
            lk_is_object = is_object_dtype(lk.dtype)
            rk_is_object = is_object_dtype(rk.dtype)

            # if either left or right is a categorical
            # then they must match exactly in categories & ordered
            if lk_is_cat and rk_is_cat:
                if lk._categories_match_up_to_permutation(rk):
                    continue

            elif lk_is_cat or rk_is_cat:
                pass

            elif is_dtype_equal(lk.dtype, rk.dtype):
                continue

            msg = (
                f"You are trying to merge on {lk.dtype} and "
                f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
            )

            # if we are numeric, then allow differing
            # kinds to proceed, eg. int64 and int8, int and float
            # further if we are object, but we infer to
            # the same, then proceed
            if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
                if lk.dtype.kind == rk.dtype.kind:
                    continue

                # check whether ints and floats
                elif is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
                    if not (lk == lk.astype(rk.dtype))[~np.isnan(lk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation",
                            UserWarning,
                        )
                    continue

                elif is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                    if not (rk == rk.astype(lk.dtype))[~np.isnan(rk)].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation",
                            UserWarning,
                        )
                    continue

                # let's infer and see if we are ok
                elif lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                    rk, skipna=False
                ):
                    continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types and inferred_right not in string_types
                ) or (
                    inferred_right in string_types and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif is_datetime64tz_dtype(lk.dtype) and not is_datetime64tz_dtype(
                rk.dtype
            ):
                raise ValueError(msg)
            elif not is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(
                rk.dtype
            ):
                raise ValueError(msg)

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = lk.categories.dtype if lk_is_cat else object
                self.left = self.left.assign(**{name: self.left[name].astype(typ)})
            if name in self.right.columns:
                typ = rk.categories.dtype if rk_is_cat else object
                self.right = self.right.assign(**{name: self.right[name].astype(typ)})

    def _create_cross_configuration(
        self, left, right
    ) -> Tuple["DataFrame", "DataFrame", str, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left: DataFrame
        right: DataFrame

        Returns
        -------
            a tuple (left, right, how, cross_col) representing the adjusted
            DataFrames with cross_col, the merge operation set to inner and the column
            to join over.
        """
        cross_col = f"_cross_{hashlib.md5().hexdigest()}"
        how = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
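
    # Illustrative note on the cross configuration above: a 3-row left frame
    # and a 2-row right frame each gain the same constant-valued helper
    # column, so the subsequent inner join matches every left row with every
    # right row (3 * 2 == 6 rows); _maybe_drop_cross_column() removes the
    # helper column from the result.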

    def _validate_specification(self):
        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or self.right_on is not None
                or self.left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
            return
        # Hm, any way to make this logic less complicated??
        elif self.on is None and self.left_on is None and self.right_on is None:

            if self.left_index and self.right_index:
                self.left_on, self.right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={self.left_on}, "
                        f"right_on={self.right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                self.left_on = self.right_on = common_cols
        elif self.on is not None:
            if self.left_on is not None or self.right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            self.left_on = self.right_on = self.on
        elif self.left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and self.right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(self.left_on)
            if self.right_index:
                if len(self.left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                self.right_on = [None] * n
        elif self.right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and self.left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(self.right_on)
            if self.left_index:
                if len(self.right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                self.left_on = [None] * n
        if self.how != "cross" and len(self.right_on) != len(self.left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

    def _validate(self, validate: str):

        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            elif not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            elif not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError("Not a valid argument for validate")


def get_join_indexers(
    left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs
):
    """
    Return the indexers that map left_keys and right_keys into their join.

    Parameters
    ----------
    left_keys: ndarray, Index, Series
    right_keys: ndarray, Index, Series
    sort: bool, default False
    how: string {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    tuple of (left_indexer, right_indexer)
        indexers into the left_keys, right_keys

    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = [list(x) for x in zipped]

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, shape, sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)

    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)
    # preserve left frame order if how == 'left' and sort == False
    kwargs = copy.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    return join_func(lkey, rkey, count, **kwargs)
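
# Illustrative example (assumed inputs): for an inner join with
# left_keys = [np.array([1, 2, 2])] and right_keys = [np.array([2, 3])],
# get_join_indexers pairs left rows 1 and 2 with right row 0, i.e. it
# returns left_indexer == [1, 2] and right_indexer == [0, 0]; the unmatched
# keys 1 and 3 do not appear.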


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index,
    lindexer,
    rindexer,
):
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions
    of left and right, where a join was feasible.

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : MultiIndex
        the index of the join between the
        common levels of left and right
    lindexer : intp array
        left indexer
    rindexer : intp array
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : intp array
        labels of combined multiindexes
    names : str array
        names of combined multiindexes

    """

    def _convert_to_multiindex(index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Ensure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # lindexer and rindexer hold the indexes where the join occurred
    # for left and right respectively. If lindexer/rindexer is None then
    # the join occurred on all indices of left/right
    if lindexer is None:
        lindexer = range(left.size)

    if rindexer is None:
        rindexer = range(right.size)

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)

        restore_levels = idx.levels[name_idx]
        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        join_levels = join_levels + [restore_levels]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
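    """
    Ordered variant of _MergeOperation: always factorizes with sort=True and
    optionally forward-fills unmatched rows via fill_method='ffill'.
    """
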
    _merge_type = "ordered_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "outer",
    ):

        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self):
        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if self.fill_method == "ffill":
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        lindexers = {1: left_join_indexer} if left_join_indexer is not None else {}
        rindexers = {1: right_join_indexer} if right_join_indexer is not None else {}

        result_data = concatenate_block_managers(
            [(self.left._mgr, lindexers), (self.right._mgr, rindexers)],
            axes=[llabels.append(rlabels), join_index],
            concat_axis=0,
            copy=self.copy,
        )

        typ = self.left._constructor
        result = typ(result_data)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        return result


def _asof_function(direction: str):
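    """Look up the libjoin asof-join routine for `direction` (None if absent)."""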
    name = f"asof_join_{direction}"
    return getattr(libjoin, name, None)


def _asof_by_function(direction: str):
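    """Look up the libjoin asof-join-by routine for `direction` (None if absent)."""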
    name = f"asof_join_{direction}_on_X_by_Y"
    return getattr(libjoin, name, None)


_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}


def _get_cython_type_upcast(dtype):
    """
    Upcast a dtype to 'int64_t', 'double', or 'object'.
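
    Illustrative doctests, mirroring the branches below:

    >>> _get_cython_type_upcast(np.dtype("int32"))
    'int64_t'
    >>> _get_cython_type_upcast(np.dtype("float32"))
    'double'
    >>> _get_cython_type_upcast(np.dtype("object"))
    'object'
    """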
    if is_integer_dtype(dtype):
        return "int64_t"
    elif is_float_dtype(dtype):
        return "double"
    else:
        return "object"


class _AsOfMerge(_OrderedMerge):
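    """
    Ordered merge that matches on the nearest key rather than on equal keys:
    `direction` is 'backward', 'forward' or 'nearest', optionally bounded by
    `tolerance` and restricted to groups given by `by`/`left_by`/`right_by`.
    """
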
    _merge_type = "asof_merge"

    def __init__(
        self,
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis=1,
        suffixes=("_x", "_y"),
        copy: bool = True,
        fill_method=None,
        how: str = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ):

        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_specification(self):
        super()._validate_specification()

        # we only allow 'on' to be a single key on each side
        if len(self.left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(self.right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            self.left_on = self.left_by + list(self.left_on)
            self.right_on = self.right_by + list(self.right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(f"direction invalid: {self.direction}")

    def _get_merge_keys(self):

        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, both sides category, but not equal ones"
                    )
                else:
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, must be the same type"
                    )
                raise MergeError(msg)

        # validate tolerance; must be datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:

            if self.left_index:
                lt = self.left.index
            else:
                lt = left_join_keys[-1]

            msg = (
                f"incompatible tolerance {self.tolerance}, must be compat "
                f"with type {repr(lt.dtype)}"
            )

            if needs_i8_conversion(lt):
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = (
                "allow_exact_matches must be boolean, "
                f"passed {self.allow_exact_matches}"
            )
            raise MergeError(msg)

        return left_join_keys, right_join_keys, join_names

    def _get_join_indexers(self):
        """ Return the join indexers. """

        def flip(xs) -> np.ndarray:
            """ Unlike np.transpose, this returns an array of tuples. """
            xs = [
                x
                if not is_extension_array_dtype(x)
                else extract_array(x)._values_for_argsort()
                for x in xs
            ]
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index._values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index._values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        if not Index(left_values).is_monotonic:
            side = "left"
            if isna(left_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        if not Index(right_values).is_monotonic:
            side = "right"
            if isna(right_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            else:
                raise ValueError(f"{side} keys must be sorted")

        # initial type conversion as needed
        if needs_i8_conversion(left_values):
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")
            if tolerance is not None:
                tolerance = Timedelta(tolerance)
                tolerance = tolerance.value

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                left_by_values = left_by_values[0]
                right_by_values = right_by_values[0]
            else:
                left_by_values = flip(left_by_values)
                right_by_values = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(left_by_values.dtype)
            by_type_caster = _type_casters[by_type]
            left_by_values = by_type_caster(left_by_values)
            right_by_values = by_type_caster(right_by_values)

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_function(self.direction)
            return func(left_values, right_values, self.allow_exact_matches, tolerance)


def _get_multiindex_indexer(join_keys, index: MultiIndex, sort: bool):
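    """
    Compute (left_indexer, right_indexer) for joining flat join_keys against
    the levels of a MultiIndex via a left outer join.
    """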

    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = [list(x) for x in zipped]
    if sort:
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i in range(len(join_keys)):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there were already any nulls at this location;
            # if there were, they are factorized to `shape[i] - 1`
            # (note: `x != x` is True only when x is NaN)
            a = join_keys[i][lcodes[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, shape, sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)


def _get_single_indexer(join_key, index, sort: bool = False):
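    """
    Left-outer-join a single join_key against an index; returns the pair
    (left_indexer, right_indexer).
    """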
    left_key, right_key, count = _factorize_keys(join_key, index, sort=sort)

    left_indexer, right_indexer = libjoin.left_outer_join(
        ensure_int64(left_key), ensure_int64(right_key), count, sort=sort
    )

    return left_indexer, right_indexer


def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False):
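    """
    Left-join `left_ax` to `right_ax` on the given join_keys.

    Returns (join_index, left_indexer, right_indexer); left_indexer is None
    in the fast path where the left axis is preserved as-is.
    """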
    if len(join_keys) > 1:
        if not (
            isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
        ):
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        jkey = join_keys[0]

        left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)

    if sort or len(left_ax) != len(left_indexer):
        # if asked to sort or there are 1-to-many matches
        join_index = left_ax.take(left_indexer)
        return join_index, left_indexer, right_indexer

    # left frame preserves order & length of its index
    return left_ax, None, right_indexer


def _factorize_keys(
    lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner"
) -> Tuple[np.ndarray, np.ndarray, int]:
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    array
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    array
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
    # Some pre-processing for non-ndarray lk / rk
    lk = extract_array(lk, extract_numpy=True)
    rk = extract_array(rk, extract_numpy=True)

    if is_datetime64tz_dtype(lk.dtype) and is_datetime64tz_dtype(rk.dtype):
        # Extract the ndarray (UTC-localized) values
        # Note: we don't need the dtypes to match, as these can still be compared
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray

    elif (
        is_categorical_dtype(lk.dtype)
        and is_categorical_dtype(rk.dtype)
        and is_dtype_equal(lk.dtype, rk.dtype)
    ):
        assert isinstance(lk, Categorical)
        assert isinstance(rk, Categorical)
        # Cast rk to encoding so we can compare codes with lk
        rk = lk._encode_with_my_categories(rk)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk.codes)

    elif is_extension_array_dtype(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        lk, _ = lk._values_for_factorize()
        rk, _ = rk._values_for_factorize()

    if is_integer_dtype(lk.dtype) and is_integer_dtype(rk.dtype):
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        #  and rk is datetime-dtype
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk))
        rk = ensure_int64(np.asarray(rk))

    elif needs_i8_conversion(lk.dtype) and is_dtype_equal(lk.dtype, rk.dtype):
        # GH#23917 TODO: Needs tests for non-matching dtypes
        klass = libhashtable.Int64Factorizer
        lk = ensure_int64(np.asarray(lk, dtype=np.int64))
        rk = ensure_int64(np.asarray(rk, dtype=np.int64))

    else:
        klass = libhashtable.Factorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)

    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    if how == "right":
        return rlab, llab, count
    return llab, rlab, count


def _sort_labels(uniques: np.ndarray, left, right):
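    """
    Remap left/right label arrays to refer to `uniques` in sorted order.

    A small sketch: with uniques ['b', 'a'], old label 0 ('b') becomes 1 and
    old label 1 ('a') becomes 0 once the uniques are sorted.
    """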

    llength = len(left)
    labels = np.concatenate([left, right])

    _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
    new_labels = ensure_int64(new_labels)
    new_left, new_right = new_labels[:llength], new_labels[llength:]

    return new_left, new_right


def _get_join_keys(llab, rlab, shape, sort: bool):
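    """
    Combine per-level label arrays into a single flat int64 key per row.

    A sketch of the encoding: with shape == [2, 3] the flat key of a row is
    ``llab[0] * 3 + llab[1]``, a mixed-radix encoding of the level codes;
    level combinations that could overflow int64 are densified through
    _factorize_keys and folded in recursively.
    """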

    # how many levels can be done without overflow
    nlev = next(
        lev
        for lev in range(len(shape), 0, -1)
        if not is_int64_overflow_possible(shape[:lev])
    )

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = stride * llab[0].astype("i8", subok=False, copy=False)
    rkey = stride * rlab[0].astype("i8", subok=False, copy=False)

    for i in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = [count] + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)


def _should_fill(lname, rname) -> bool:
    if not isinstance(lname, str) or not isinstance(rname, str):
        return True
    return lname == rname


def _any(x) -> bool:
    return x is not None and com.any_not_none(*x)


def _validate_operand(obj: FrameOrSeries) -> "DataFrame":
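    """
    Coerce a merge operand to a DataFrame; a Series must be named.

    A minimal sketch (assuming ``import pandas as pd``):

    >>> _validate_operand(pd.Series([1, 2], name="a")).columns.tolist()
    ['a']
    """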
    if isinstance(obj, ABCDataFrame):
        return obj
    elif isinstance(obj, ABCSeries):
        if obj.name is None:
            raise ValueError("Cannot merge a Series without a name")
        else:
            return obj.to_frame()
    else:
        raise TypeError(
            f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
        )


def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]):
    """
    Suffixes type validation.

    If two indices overlap, add suffixes to overlapping entries.

    If the corresponding suffix is empty, the entry is simply converted to a string.

    """
    if not is_list_like(suffixes, allow_sets=False):
        warnings.warn(
            f"Passing 'suffixes' as a {type(suffixes)} is not supported and may give "
            "unexpected results. Provide 'suffixes' as a tuple instead. In the "
            "future a 'TypeError' will be raised.",
            FutureWarning,
            stacklevel=4,
        )

    to_rename = left.intersection(right)
    if len(to_rename) == 0:
        return left, right

    lsuffix, rsuffix = suffixes

    if not lsuffix and not rsuffix:
        raise ValueError(f"columns overlap but no suffix specified: {to_rename}")

    def renamer(x, suffix):
        """
        Rename the left and right indices.

        If there is overlap and suffix is not None, add the
        suffix; otherwise, leave it as-is.

        Parameters
        ----------
        x : original column name
        suffix : str or None

        Returns
        -------
        x : renamed column name
        """
        if x in to_rename and suffix is not None:
            return f"{x}{suffix}"
        return x

    lrenamer = partial(renamer, suffix=lsuffix)
    rrenamer = partial(renamer, suffix=rsuffix)

    return (left._transform_index(lrenamer), right._transform_index(rrenamer))