1# coding: utf8
2# copies.py - copy detection for Mercurial
3#
4# Copyright 2008 Olivia Mackall <olivia@selenic.com>
5#
6# This software may be used and distributed according to the terms of the
7# GNU General Public License version 2 or any later version.
8
9from __future__ import absolute_import
10
11import collections
12import os
13
14from .i18n import _
15from .node import nullrev
16
17from . import (
18    match as matchmod,
19    pathutil,
20    policy,
21    pycompat,
22    util,
23)
24
25
26from .utils import stringutil
27
28from .revlogutils import (
29    flagutil,
30    sidedata as sidedatamod,
31)
32
33rustmod = policy.importrust("copy_tracing")
34
35
36def _filter(src, dst, t):
37    """filters out invalid copies after chaining"""
38
39    # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid')
40    # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases
41    # in the following table (not including trivial cases). For example, case 6
42    # is where a file existed in 'src' and remained under that name in 'mid' and
43    # then was renamed between 'mid' and 'dst'.
44    #
45    # case src mid dst result
46    #   1   x   y   -    -
47    #   2   x   y   y   x->y
48    #   3   x   y   x    -
49    #   4   x   y   z   x->z
50    #   5   -   x   y    -
51    #   6   x   x   y   x->y
52    #
53    # _chain() takes care of chaining the copies in 'a' and 'b', but it
54    # cannot tell the difference between cases 1 and 2, between 3 and 4, or
55    # between 5 and 6, so it includes all cases in its result.
56    # Cases 1, 3, and 5 are then removed by _filter().
57
58    for k, v in list(t.items()):
59        if k == v:  # case 3
60            del t[k]
61        elif v not in src:  # case 5
62            # remove copies from files that didn't exist
63            del t[k]
64        elif k not in dst:  # case 1
65            # remove copies to files that were then removed
66            del t[k]
67
68
69def _chain(prefix, suffix):
70    """chain two sets of copies 'prefix' and 'suffix'"""
71    result = prefix.copy()
72    for key, value in pycompat.iteritems(suffix):
73        result[key] = prefix.get(value, value)
74    return result
75
76
77def _tracefile(fctx, am, basemf):
78    """return file context that is the ancestor of fctx present in ancestor
79    manifest am
80
81    Note: we used to try and stop after a given limit, however checking if that
82    limit is reached turned out to be very expensive. we are better off
83    disabling that feature."""
84
85    for f in fctx.ancestors():
86        path = f.path()
87        if am.get(path, None) == f.filenode():
88            return path
89        if basemf and basemf.get(path, None) == f.filenode():
90            return path
91
92
93def _dirstatecopies(repo, match=None):
94    ds = repo.dirstate
95    c = ds.copies().copy()
96    for k in list(c):
97        if not ds.get_entry(k).tracked or (match and not match(k)):
98            del c[k]
99    return c
100
101
102def _computeforwardmissing(a, b, match=None):
103    """Computes which files are in b but not a.
104    This is its own function so extensions can easily wrap this call to see what
105    files _forwardcopies is about to process.
106    """
107    ma = a.manifest()
108    mb = b.manifest()
109    return mb.filesnotin(ma, match=match)
110
111
112def usechangesetcentricalgo(repo):
113    """Checks if we should use changeset-centric copy algorithms"""
114    if repo.filecopiesmode == b'changeset-sidedata':
115        return True
116    readfrom = repo.ui.config(b'experimental', b'copies.read-from')
117    changesetsource = (b'changeset-only', b'compatibility')
118    return readfrom in changesetsource
119
120
121def _committedforwardcopies(a, b, base, match):
122    """Like _forwardcopies(), but b.rev() cannot be None (working copy)"""
123    # files might have to be traced back to the fctx parent of the last
124    # one-side-only changeset, but not further back than that
125    repo = a._repo
126
127    if usechangesetcentricalgo(repo):
128        return _changesetforwardcopies(a, b, match)
129
130    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
131    dbg = repo.ui.debug
132    if debug:
133        dbg(b'debug.copies:    looking into rename from %s to %s\n' % (a, b))
134    am = a.manifest()
135    basemf = None if base is None else base.manifest()
136
137    # find where new files came from
138    # we currently don't try to find where old files went, too expensive
139    # this means we can miss a case like 'hg rm b; hg cp a b'
140    cm = {}
141
142    # Computing the forward missing is quite expensive on large manifests, since
143    # it compares the entire manifests. We can optimize it in the common use
144    # case of computing what copies are in a commit versus its parent (like
145    # during a rebase or histedit). Note, we exclude merge commits from this
146    # optimization, since the ctx.files() for a merge commit is not correct for
147    # this comparison.
148    forwardmissingmatch = match
149    if b.p1() == a and b.p2().rev() == nullrev:
150        filesmatcher = matchmod.exact(b.files())
151        forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher)
152    if repo.ui.configbool(b'devel', b'copy-tracing.trace-all-files'):
153        missing = list(b.walk(match))
154        # _computeforwardmissing(a, b, match=forwardmissingmatch)
155        if debug:
156            dbg(b'debug.copies:      searching all files: %d\n' % len(missing))
157    else:
158        missing = _computeforwardmissing(a, b, match=forwardmissingmatch)
159        if debug:
160            dbg(
161                b'debug.copies:      missing files to search: %d\n'
162                % len(missing)
163            )
164
165    ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True)
166
167    for f in sorted(missing):
168        if debug:
169            dbg(b'debug.copies:        tracing file: %s\n' % f)
170        fctx = b[f]
171        fctx._ancestrycontext = ancestrycontext
172
173        if debug:
174            start = util.timer()
175        opath = _tracefile(fctx, am, basemf)
176        if opath:
177            if debug:
178                dbg(b'debug.copies:          rename of: %s\n' % opath)
179            cm[f] = opath
180        if debug:
181            dbg(
182                b'debug.copies:          time: %f seconds\n'
183                % (util.timer() - start)
184            )
185    return cm
186
187
188def _revinfo_getter(repo, match):
189    """returns a function that returns the following data given a <rev>"
190
191    * p1: revision number of first parent
192    * p2: revision number of first parent
193    * changes: a ChangingFiles object
194    """
195    cl = repo.changelog
196    parents = cl.parentrevs
197    flags = cl.flags
198
199    HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO
200
201    changelogrevision = cl.changelogrevision
202
203    if rustmod is not None:
204
205        def revinfo(rev):
206            p1, p2 = parents(rev)
207            if flags(rev) & HASCOPIESINFO:
208                raw = changelogrevision(rev)._sidedata.get(sidedatamod.SD_FILES)
209            else:
210                raw = None
211            return (p1, p2, raw)
212
213    else:
214
215        def revinfo(rev):
216            p1, p2 = parents(rev)
217            if flags(rev) & HASCOPIESINFO:
218                changes = changelogrevision(rev).changes
219            else:
220                changes = None
221            return (p1, p2, changes)
222
223    return revinfo
224
225
226def cached_is_ancestor(is_ancestor):
227    """return a cached version of is_ancestor"""
228    cache = {}
229
230    def _is_ancestor(anc, desc):
231        if anc > desc:
232            return False
233        elif anc == desc:
234            return True
235        key = (anc, desc)
236        ret = cache.get(key)
237        if ret is None:
238            ret = cache[key] = is_ancestor(anc, desc)
239        return ret
240
241    return _is_ancestor
242
243
244def _changesetforwardcopies(a, b, match):
245    if a.rev() in (nullrev, b.rev()):
246        return {}
247
248    repo = a.repo().unfiltered()
249    children = {}
250
251    cl = repo.changelog
252    isancestor = cl.isancestorrev
253
254    # To track rename from "A" to B, we need to gather all parent → children
255    # edges that are contains in `::B` but not in `::A`.
256    #
257    #
258    # To do so, we need to gather all revisions exclusive¹ to "B" (ie¹: `::b -
259    # ::a`) and also all the "roots point", ie the parents of the exclusive set
260    # that belong to ::a. These are exactly all the revisions needed to express
261    # the parent → children we need to combine.
262    #
263    # [1] actually, we need to gather all the edges within `(::a)::b`, ie:
264    # excluding paths that leads to roots that are not ancestors of `a`. We
265    # keep this out of the explanation because it is hard enough without this special case..
266
267    parents = cl._uncheckedparentrevs
268    graph_roots = (nullrev, nullrev)
269
270    ancestors = cl.ancestors([a.rev()], inclusive=True)
271    revs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()])
272    roots = set()
273    has_graph_roots = False
274    multi_thread = repo.ui.configbool(b'devel', b'copy-tracing.multi-thread')
275
276    # iterate over `only(B, A)`
277    for r in revs:
278        ps = parents(r)
279        if ps == graph_roots:
280            has_graph_roots = True
281        else:
282            p1, p2 = ps
283
284            # find all the "root points" (see larger comment above)
285            if p1 != nullrev and p1 in ancestors:
286                roots.add(p1)
287            if p2 != nullrev and p2 in ancestors:
288                roots.add(p2)
289    if not roots:
290        # no common revision to track copies from
291        return {}
292    if has_graph_roots:
293        # this deal with the special case mentionned in the [1] footnotes. We
294        # must filter out revisions that leads to non-common graphroots.
295        roots = list(roots)
296        m = min(roots)
297        h = [b.rev()]
298        roots_to_head = cl.reachableroots(m, h, roots, includepath=True)
299        roots_to_head = set(roots_to_head)
300        revs = [r for r in revs if r in roots_to_head]
301
302    if repo.filecopiesmode == b'changeset-sidedata':
303        # When using side-data, we will process the edges "from" the children.
304        # We iterate over the childre, gathering previous collected data for
305        # the parents. Do know when the parents data is no longer necessary, we
306        # keep a counter of how many children each revision has.
307        #
308        # An interresting property of `children_count` is that it only contains
309        # revision that will be relevant for a edge of the graph. So if a
310        # children has parent not in `children_count`, that edges should not be
311        # processed.
312        children_count = dict((r, 0) for r in roots)
313        for r in revs:
314            for p in cl.parentrevs(r):
315                if p == nullrev:
316                    continue
317                children_count[r] = 0
318                if p in children_count:
319                    children_count[p] += 1
320        revinfo = _revinfo_getter(repo, match)
321        with repo.changelog.reading():
322            return _combine_changeset_copies(
323                revs,
324                children_count,
325                b.rev(),
326                revinfo,
327                match,
328                isancestor,
329                multi_thread,
330            )
331    else:
332        # When not using side-data, we will process the edges "from" the parent.
333        # so we need a full mapping of the parent -> children relation.
334        children = dict((r, []) for r in roots)
335        for r in revs:
336            for p in cl.parentrevs(r):
337                if p == nullrev:
338                    continue
339                children[r] = []
340                if p in children:
341                    children[p].append(r)
342        x = revs.pop()
343        assert x == b.rev()
344        revs.extend(roots)
345        revs.sort()
346
347        revinfo = _revinfo_getter_extra(repo)
348        return _combine_changeset_copies_extra(
349            revs, children, b.rev(), revinfo, match, isancestor
350        )
351
352
353def _combine_changeset_copies(
354    revs, children_count, targetrev, revinfo, match, isancestor, multi_thread
355):
356    """combine the copies information for each item of iterrevs
357
358    revs: sorted iterable of revision to visit
359    children_count: a {parent: <number-of-relevant-children>} mapping.
360    targetrev: the final copies destination revision (not in iterrevs)
361    revinfo(rev): a function that return (p1, p2, p1copies, p2copies, removed)
362    match: a matcher
363
364    It returns the aggregated copies information for `targetrev`.
365    """
366
367    alwaysmatch = match.always()
368
369    if rustmod is not None:
370        final_copies = rustmod.combine_changeset_copies(
371            list(revs), children_count, targetrev, revinfo, multi_thread
372        )
373    else:
374        isancestor = cached_is_ancestor(isancestor)
375
376        all_copies = {}
377        # iterate over all the "children" side of copy tracing "edge"
378        for current_rev in revs:
379            p1, p2, changes = revinfo(current_rev)
380            current_copies = None
381            # iterate over all parents to chain the existing data with the
382            # data from the parent → child edge.
383            for parent, parent_rev in ((1, p1), (2, p2)):
384                if parent_rev == nullrev:
385                    continue
386                remaining_children = children_count.get(parent_rev)
387                if remaining_children is None:
388                    continue
389                remaining_children -= 1
390                children_count[parent_rev] = remaining_children
391                if remaining_children:
392                    copies = all_copies.get(parent_rev, None)
393                else:
394                    copies = all_copies.pop(parent_rev, None)
395
396                if copies is None:
397                    # this is a root
398                    newcopies = copies = {}
399                elif remaining_children:
400                    newcopies = copies.copy()
401                else:
402                    newcopies = copies
403                # chain the data in the edge with the existing data
404                if changes is not None:
405                    childcopies = {}
406                    if parent == 1:
407                        childcopies = changes.copied_from_p1
408                    elif parent == 2:
409                        childcopies = changes.copied_from_p2
410
411                    if childcopies:
412                        newcopies = copies.copy()
413                        for dest, source in pycompat.iteritems(childcopies):
414                            prev = copies.get(source)
415                            if prev is not None and prev[1] is not None:
416                                source = prev[1]
417                            newcopies[dest] = (current_rev, source)
418                        assert newcopies is not copies
419                    if changes.removed:
420                        for f in changes.removed:
421                            if f in newcopies:
422                                if newcopies is copies:
423                                    # copy on write to avoid affecting potential other
424                                    # branches.  when there are no other branches, this
425                                    # could be avoided.
426                                    newcopies = copies.copy()
427                                newcopies[f] = (current_rev, None)
428                # check potential need to combine the data from another parent (for
429                # that child). See comment below for details.
430                if current_copies is None:
431                    current_copies = newcopies
432                else:
433                    # we are the second parent to work on c, we need to merge our
434                    # work with the other.
435                    #
436                    # In case of conflict, parent 1 take precedence over parent 2.
437                    # This is an arbitrary choice made anew when implementing
438                    # changeset based copies. It was made without regards with
439                    # potential filelog related behavior.
440                    assert parent == 2
441                    current_copies = _merge_copies_dict(
442                        newcopies,
443                        current_copies,
444                        isancestor,
445                        changes,
446                        current_rev,
447                    )
448            all_copies[current_rev] = current_copies
449
450        # filter out internal details and return a {dest: source mapping}
451        final_copies = {}
452        for dest, (tt, source) in all_copies[targetrev].items():
453            if source is not None:
454                final_copies[dest] = source
455    if not alwaysmatch:
456        for filename in list(final_copies.keys()):
457            if not match(filename):
458                del final_copies[filename]
459    return final_copies
460
461
462# constant to decide which side to pick with _merge_copies_dict
463PICK_MINOR = 0
464PICK_MAJOR = 1
465PICK_EITHER = 2
466
467
468def _merge_copies_dict(minor, major, isancestor, changes, current_merge):
469    """merge two copies-mapping together, minor and major
470
471    In case of conflict, value from "major" will be picked.
472
473    - `isancestors(low_rev, high_rev)`: callable return True if `low_rev` is an
474                                        ancestors of `high_rev`,
475
476    - `ismerged(path)`: callable return True if `path` have been merged in the
477                        current revision,
478
479    return the resulting dict (in practice, the "minor" object, updated)
480    """
481    for dest, value in major.items():
482        other = minor.get(dest)
483        if other is None:
484            minor[dest] = value
485        else:
486            pick, overwrite = _compare_values(
487                changes, isancestor, dest, other, value
488            )
489            if overwrite:
490                if pick == PICK_MAJOR:
491                    minor[dest] = (current_merge, value[1])
492                else:
493                    minor[dest] = (current_merge, other[1])
494            elif pick == PICK_MAJOR:
495                minor[dest] = value
496    return minor
497
498
499def _compare_values(changes, isancestor, dest, minor, major):
500    """compare two value within a _merge_copies_dict loop iteration
501
502    return (pick, overwrite).
503
504    - pick is one of PICK_MINOR, PICK_MAJOR or PICK_EITHER
505    - overwrite is True if pick is a return of an ambiguity that needs resolution.
506    """
507    major_tt, major_value = major
508    minor_tt, minor_value = minor
509
510    if major_tt == minor_tt:
511        # if it comes from the same revision it must be the same value
512        assert major_value == minor_value
513        return PICK_EITHER, False
514    elif (
515        changes is not None
516        and minor_value is not None
517        and major_value is None
518        and dest in changes.salvaged
519    ):
520        # In this case, a deletion was reverted, the "alive" value overwrite
521        # the deleted one.
522        return PICK_MINOR, True
523    elif (
524        changes is not None
525        and major_value is not None
526        and minor_value is None
527        and dest in changes.salvaged
528    ):
529        # In this case, a deletion was reverted, the "alive" value overwrite
530        # the deleted one.
531        return PICK_MAJOR, True
532    elif isancestor(minor_tt, major_tt):
533        if changes is not None and dest in changes.merged:
534            # change to dest happened on the branch without copy-source change,
535            # so both source are valid and "major" wins.
536            return PICK_MAJOR, True
537        else:
538            return PICK_MAJOR, False
539    elif isancestor(major_tt, minor_tt):
540        if changes is not None and dest in changes.merged:
541            # change to dest happened on the branch without copy-source change,
542            # so both source are valid and "major" wins.
543            return PICK_MAJOR, True
544        else:
545            return PICK_MINOR, False
546    elif minor_value is None:
547        # in case of conflict, the "alive" side wins.
548        return PICK_MAJOR, True
549    elif major_value is None:
550        # in case of conflict, the "alive" side wins.
551        return PICK_MINOR, True
552    else:
553        # in case of conflict where both side are alive, major wins.
554        return PICK_MAJOR, True
555
556
557def _revinfo_getter_extra(repo):
558    """return a function that return multiple data given a <rev>"i
559
560    * p1: revision number of first parent
561    * p2: revision number of first parent
562    * p1copies: mapping of copies from p1
563    * p2copies: mapping of copies from p2
564    * removed: a list of removed files
565    * ismerged: a callback to know if file was merged in that revision
566    """
567    cl = repo.changelog
568    parents = cl.parentrevs
569
570    def get_ismerged(rev):
571        ctx = repo[rev]
572
573        def ismerged(path):
574            if path not in ctx.files():
575                return False
576            fctx = ctx[path]
577            parents = fctx._filelog.parents(fctx._filenode)
578            nb_parents = 0
579            for n in parents:
580                if n != repo.nullid:
581                    nb_parents += 1
582            return nb_parents >= 2
583
584        return ismerged
585
586    def revinfo(rev):
587        p1, p2 = parents(rev)
588        ctx = repo[rev]
589        p1copies, p2copies = ctx._copies
590        removed = ctx.filesremoved()
591        return p1, p2, p1copies, p2copies, removed, get_ismerged(rev)
592
593    return revinfo
594
595
596def _combine_changeset_copies_extra(
597    revs, children, targetrev, revinfo, match, isancestor
598):
599    """version of `_combine_changeset_copies` that works with the Google
600    specific "extra" based storage for copy information"""
601    all_copies = {}
602    alwaysmatch = match.always()
603    for r in revs:
604        copies = all_copies.pop(r, None)
605        if copies is None:
606            # this is a root
607            copies = {}
608        for i, c in enumerate(children[r]):
609            p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c)
610            if r == p1:
611                parent = 1
612                childcopies = p1copies
613            else:
614                assert r == p2
615                parent = 2
616                childcopies = p2copies
617            if not alwaysmatch:
618                childcopies = {
619                    dst: src for dst, src in childcopies.items() if match(dst)
620                }
621            newcopies = copies
622            if childcopies:
623                newcopies = copies.copy()
624                for dest, source in pycompat.iteritems(childcopies):
625                    prev = copies.get(source)
626                    if prev is not None and prev[1] is not None:
627                        source = prev[1]
628                    newcopies[dest] = (c, source)
629                assert newcopies is not copies
630            for f in removed:
631                if f in newcopies:
632                    if newcopies is copies:
633                        # copy on write to avoid affecting potential other
634                        # branches.  when there are no other branches, this
635                        # could be avoided.
636                        newcopies = copies.copy()
637                    newcopies[f] = (c, None)
638            othercopies = all_copies.get(c)
639            if othercopies is None:
640                all_copies[c] = newcopies
641            else:
642                # we are the second parent to work on c, we need to merge our
643                # work with the other.
644                #
645                # In case of conflict, parent 1 take precedence over parent 2.
646                # This is an arbitrary choice made anew when implementing
647                # changeset based copies. It was made without regards with
648                # potential filelog related behavior.
649                if parent == 1:
650                    _merge_copies_dict_extra(
651                        othercopies, newcopies, isancestor, ismerged
652                    )
653                else:
654                    _merge_copies_dict_extra(
655                        newcopies, othercopies, isancestor, ismerged
656                    )
657                    all_copies[c] = newcopies
658
659    final_copies = {}
660    for dest, (tt, source) in all_copies[targetrev].items():
661        if source is not None:
662            final_copies[dest] = source
663    return final_copies
664
665
666def _merge_copies_dict_extra(minor, major, isancestor, ismerged):
667    """version of `_merge_copies_dict` that works with the Google
668    specific "extra" based storage for copy information"""
669    for dest, value in major.items():
670        other = minor.get(dest)
671        if other is None:
672            minor[dest] = value
673        else:
674            new_tt = value[0]
675            other_tt = other[0]
676            if value[1] == other[1]:
677                continue
678            # content from "major" wins, unless it is older
679            # than the branch point or there is a merge
680            if (
681                new_tt == other_tt
682                or not isancestor(new_tt, other_tt)
683                or ismerged(dest)
684            ):
685                minor[dest] = value
686
687
688def _forwardcopies(a, b, base=None, match=None):
689    """find {dst@b: src@a} copy mapping where a is an ancestor of b"""
690
691    if base is None:
692        base = a
693    match = a.repo().narrowmatch(match)
694    # check for working copy
695    if b.rev() is None:
696        cm = _committedforwardcopies(a, b.p1(), base, match)
697        # combine copies from dirstate if necessary
698        copies = _chain(cm, _dirstatecopies(b._repo, match))
699    else:
700        copies = _committedforwardcopies(a, b, base, match)
701    return copies
702
703
704def _backwardrenames(a, b, match):
705    """find renames from a to b"""
706    if a._repo.ui.config(b'experimental', b'copytrace') == b'off':
707        return {}
708
709    # We don't want to pass in "match" here, since that would filter
710    # the destination by it. Since we're reversing the copies, we want
711    # to filter the source instead.
712    copies = _forwardcopies(b, a)
713    return _reverse_renames(copies, a, match)
714
715
716def _reverse_renames(copies, dst, match):
717    """given copies to context 'dst', finds renames from that context"""
718    # Even though we're not taking copies into account, 1:n rename situations
719    # can still exist (e.g. hg cp a b; hg mv a c). In those cases we
720    # arbitrarily pick one of the renames.
721    r = {}
722    for k, v in sorted(pycompat.iteritems(copies)):
723        if match and not match(v):
724            continue
725        # remove copies
726        if v in dst:
727            continue
728        r[v] = k
729    return r
730
731
732def pathcopies(x, y, match=None):
733    """find {dst@y: src@x} copy mapping for directed compare"""
734    repo = x._repo
735    debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies')
736    if debug:
737        repo.ui.debug(
738            b'debug.copies: searching copies from %s to %s\n' % (x, y)
739        )
740    if x == y or not x or not y:
741        return {}
742    if y.rev() is None and x == y.p1():
743        if debug:
744            repo.ui.debug(b'debug.copies: search mode: dirstate\n')
745        # short-circuit to avoid issues with merge states
746        return _dirstatecopies(repo, match)
747    a = y.ancestor(x)
748    if a == x:
749        if debug:
750            repo.ui.debug(b'debug.copies: search mode: forward\n')
751        copies = _forwardcopies(x, y, match=match)
752    elif a == y:
753        if debug:
754            repo.ui.debug(b'debug.copies: search mode: backward\n')
755        copies = _backwardrenames(x, y, match=match)
756    else:
757        if debug:
758            repo.ui.debug(b'debug.copies: search mode: combined\n')
759        base = None
760        if a.rev() != nullrev:
761            base = x
762        x_copies = _forwardcopies(a, x)
763        y_copies = _forwardcopies(a, y, base, match=match)
764        same_keys = set(x_copies) & set(y_copies)
765        for k in same_keys:
766            if x_copies.get(k) == y_copies.get(k):
767                del x_copies[k]
768                del y_copies[k]
769        x_backward_renames = _reverse_renames(x_copies, x, match)
770        copies = _chain(
771            x_backward_renames,
772            y_copies,
773        )
774    _filter(x, y, copies)
775    return copies
776
777
778def mergecopies(repo, c1, c2, base):
779    """
780    Finds moves and copies between context c1 and c2 that are relevant for
781    merging. 'base' will be used as the merge base.
782
783    Copytracing is used in commands like rebase, merge, unshelve, etc to merge
784    files that were moved/ copied in one merge parent and modified in another.
785    For example:
786
787    o          ---> 4 another commit
788    |
789    |   o      ---> 3 commit that modifies a.txt
790    |  /
791    o /        ---> 2 commit that moves a.txt to b.txt
792    |/
793    o          ---> 1 merge base
794
795    If we try to rebase revision 3 on revision 4, since there is no a.txt in
796    revision 4, and if user have copytrace disabled, we prints the following
797    message:
798
799    ```other changed <file> which local deleted```
800
801    Returns a tuple where:
802
803    "branch_copies" an instance of branch_copies.
804
805    "diverge" is a mapping of source name -> list of destination names
806    for divergent renames.
807
808    This function calls different copytracing algorithms based on config.
809    """
810    # avoid silly behavior for update from empty dir
811    if not c1 or not c2 or c1 == c2:
812        return branch_copies(), branch_copies(), {}
813
814    narrowmatch = c1.repo().narrowmatch()
815
816    # avoid silly behavior for parent -> working dir
817    if c2.node() is None and c1.node() == repo.dirstate.p1():
818        return (
819            branch_copies(_dirstatecopies(repo, narrowmatch)),
820            branch_copies(),
821            {},
822        )
823
824    copytracing = repo.ui.config(b'experimental', b'copytrace')
825    if stringutil.parsebool(copytracing) is False:
826        # stringutil.parsebool() returns None when it is unable to parse the
827        # value, so we should rely on making sure copytracing is on such cases
828        return branch_copies(), branch_copies(), {}
829
830    if usechangesetcentricalgo(repo):
831        # The heuristics don't make sense when we need changeset-centric algos
832        return _fullcopytracing(repo, c1, c2, base)
833
834    # Copy trace disabling is explicitly below the node == p1 logic above
835    # because the logic above is required for a simple copy to be kept across a
836    # rebase.
837    if copytracing == b'heuristics':
838        # Do full copytracing if only non-public revisions are involved as
839        # that will be fast enough and will also cover the copies which could
840        # be missed by heuristics
841        if _isfullcopytraceable(repo, c1, base):
842            return _fullcopytracing(repo, c1, c2, base)
843        return _heuristicscopytracing(repo, c1, c2, base)
844    else:
845        return _fullcopytracing(repo, c1, c2, base)
846
847
848def _isfullcopytraceable(repo, c1, base):
849    """Checks that if base, source and destination are all no-public branches,
850    if yes let's use the full copytrace algorithm for increased capabilities
851    since it will be fast enough.
852
853    `experimental.copytrace.sourcecommitlimit` can be used to set a limit for
854    number of changesets from c1 to base such that if number of changesets are
855    more than the limit, full copytracing algorithm won't be used.
856    """
857    if c1.rev() is None:
858        c1 = c1.p1()
859    if c1.mutable() and base.mutable():
860        sourcecommitlimit = repo.ui.configint(
861            b'experimental', b'copytrace.sourcecommitlimit'
862        )
863        commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev()))
864        return commits < sourcecommitlimit
865    return False
866
867
868def _checksinglesidecopies(
869    src, dsts1, m1, m2, mb, c2, base, copy, renamedelete
870):
871    if src not in m2:
872        # deleted on side 2
873        if src not in m1:
874            # renamed on side 1, deleted on side 2
875            renamedelete[src] = dsts1
876    elif src not in mb:
877        # Work around the "short-circuit to avoid issues with merge states"
878        # thing in pathcopies(): pathcopies(x, y) can return a copy where the
879        # destination doesn't exist in y.
880        pass
881    elif mb[src] != m2[src] and not _related(c2[src], base[src]):
882        return
883    elif mb[src] != m2[src] or mb.flags(src) != m2.flags(src):
884        # modified on side 2
885        for dst in dsts1:
886            copy[dst] = src
887
888
889class branch_copies(object):
890    """Information about copies made on one side of a merge/graft.
891
892    "copy" is a mapping from destination name -> source name,
893    where source is in c1 and destination is in c2 or vice-versa.
894
895    "movewithdir" is a mapping from source name -> destination name,
896    where the file at source present in one context but not the other
897    needs to be moved to destination by the merge process, because the
898    other context moved the directory it is in.
899
900    "renamedelete" is a mapping of source name -> list of destination
901    names for files deleted in c1 that were renamed in c2 or vice-versa.
902
903    "dirmove" is a mapping of detected source dir -> destination dir renames.
904    This is needed for handling changes to new files previously grafted into
905    renamed directories.
906    """
907
908    def __init__(
909        self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
910    ):
911        self.copy = {} if copy is None else copy
912        self.renamedelete = {} if renamedelete is None else renamedelete
913        self.dirmove = {} if dirmove is None else dirmove
914        self.movewithdir = {} if movewithdir is None else movewithdir
915
916    def __repr__(self):
917        return '<branch_copies\n  copy=%r\n  renamedelete=%r\n  dirmove=%r\n  movewithdir=%r\n>' % (
918            self.copy,
919            self.renamedelete,
920            self.dirmove,
921            self.movewithdir,
922        )
923
924
925def _fullcopytracing(repo, c1, c2, base):
926    """The full copytracing algorithm which finds all the new files that were
927    added from merge base up to the top commit and for each file it checks if
928    this file was copied from another file.
929
930    This is pretty slow when a lot of changesets are involved but will track all
931    the copies.
932    """
933    m1 = c1.manifest()
934    m2 = c2.manifest()
935    mb = base.manifest()
936
937    copies1 = pathcopies(base, c1)
938    copies2 = pathcopies(base, c2)
939
940    if not (copies1 or copies2):
941        return branch_copies(), branch_copies(), {}
942
943    inversecopies1 = {}
944    inversecopies2 = {}
945    for dst, src in copies1.items():
946        inversecopies1.setdefault(src, []).append(dst)
947    for dst, src in copies2.items():
948        inversecopies2.setdefault(src, []).append(dst)
949
950    copy1 = {}
951    copy2 = {}
952    diverge = {}
953    renamedelete1 = {}
954    renamedelete2 = {}
955    allsources = set(inversecopies1) | set(inversecopies2)
956    for src in allsources:
957        dsts1 = inversecopies1.get(src)
958        dsts2 = inversecopies2.get(src)
959        if dsts1 and dsts2:
960            # copied/renamed on both sides
961            if src not in m1 and src not in m2:
962                # renamed on both sides
963                dsts1 = set(dsts1)
964                dsts2 = set(dsts2)
965                # If there's some overlap in the rename destinations, we
966                # consider it not divergent. For example, if side 1 copies 'a'
967                # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
968                # and 'd' and deletes 'a'.
969                if dsts1 & dsts2:
970                    for dst in dsts1 & dsts2:
971                        copy1[dst] = src
972                        copy2[dst] = src
973                else:
974                    diverge[src] = sorted(dsts1 | dsts2)
975            elif src in m1 and src in m2:
976                # copied on both sides
977                dsts1 = set(dsts1)
978                dsts2 = set(dsts2)
979                for dst in dsts1 & dsts2:
980                    copy1[dst] = src
981                    copy2[dst] = src
982            # TODO: Handle cases where it was renamed on one side and copied
983            # on the other side
984        elif dsts1:
985            # copied/renamed only on side 1
986            _checksinglesidecopies(
987                src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
988            )
989        elif dsts2:
990            # copied/renamed only on side 2
991            _checksinglesidecopies(
992                src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
993            )
994
995    # find interesting file sets from manifests
996    cache = []
997
998    def _get_addedfiles(idx):
999        if not cache:
1000            addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
1001            addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
1002            u1 = sorted(addedinm1 - addedinm2)
1003            u2 = sorted(addedinm2 - addedinm1)
1004            cache.extend((u1, u2))
1005        return cache[idx]
1006
1007    u1fn = lambda: _get_addedfiles(0)
1008    u2fn = lambda: _get_addedfiles(1)
1009    if repo.ui.debugflag:
1010        u1 = u1fn()
1011        u2 = u2fn()
1012
1013        header = b"  unmatched files in %s"
1014        if u1:
1015            repo.ui.debug(
1016                b"%s:\n   %s\n" % (header % b'local', b"\n   ".join(u1))
1017            )
1018        if u2:
1019            repo.ui.debug(
1020                b"%s:\n   %s\n" % (header % b'other', b"\n   ".join(u2))
1021            )
1022
1023        renamedeleteset = set()
1024        divergeset = set()
1025        for dsts in diverge.values():
1026            divergeset.update(dsts)
1027        for dsts in renamedelete1.values():
1028            renamedeleteset.update(dsts)
1029        for dsts in renamedelete2.values():
1030            renamedeleteset.update(dsts)
1031
1032        repo.ui.debug(
1033            b"  all copies found (* = to merge, ! = divergent, "
1034            b"% = renamed and deleted):\n"
1035        )
1036        for side, copies in ((b"local", copies1), (b"remote", copies2)):
1037            if not copies:
1038                continue
1039            repo.ui.debug(b"   on %s side:\n" % side)
1040            for f in sorted(copies):
1041                note = b""
1042                if f in copy1 or f in copy2:
1043                    note += b"*"
1044                if f in divergeset:
1045                    note += b"!"
1046                if f in renamedeleteset:
1047                    note += b"%"
1048                repo.ui.debug(
1049                    b"    src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
1050                )
1051        del renamedeleteset
1052        del divergeset
1053
1054    repo.ui.debug(b"  checking for directory renames\n")
1055
1056    dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2fn)
1057    dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1fn)
1058
1059    branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
1060    branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)
1061
1062    return branch_copies1, branch_copies2, diverge
1063
1064
1065def _dir_renames(repo, ctx, copy, fullcopy, addedfilesfn):
1066    """Finds moved directories and files that should move with them.
1067
1068    ctx: the context for one of the sides
1069    copy: files copied on the same side (as ctx)
1070    fullcopy: files copied on the same side (as ctx), including those that
1071              merge.manifestmerge() won't care about
1072    addedfilesfn: function returning added files on the other side (compared to
1073                  ctx)
1074    """
1075    # generate a directory move map
1076    invalid = set()
1077    dirmove = {}
1078
1079    # examine each file copy for a potential directory move, which is
1080    # when all the files in a directory are moved to a new directory
1081    for dst, src in pycompat.iteritems(fullcopy):
1082        dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
1083        if dsrc in invalid:
1084            # already seen to be uninteresting
1085            continue
1086        elif ctx.hasdir(dsrc) and ctx.hasdir(ddst):
1087            # directory wasn't entirely moved locally
1088            invalid.add(dsrc)
1089        elif dsrc in dirmove and dirmove[dsrc] != ddst:
1090            # files from the same directory moved to two different places
1091            invalid.add(dsrc)
1092        else:
1093            # looks good so far
1094            dirmove[dsrc] = ddst
1095
1096    for i in invalid:
1097        if i in dirmove:
1098            del dirmove[i]
1099    del invalid
1100
1101    if not dirmove:
1102        return {}, {}
1103
1104    dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}
1105
1106    for d in dirmove:
1107        repo.ui.debug(
1108            b"   discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
1109        )
1110
1111    # Sort the directories in reverse order, so we find children first
1112    # For example, if dir1/ was renamed to dir2/, and dir1/subdir1/
1113    # was renamed to dir2/subdir2/, we want to move dir1/subdir1/file
1114    # to dir2/subdir2/file (not dir2/subdir1/file)
1115    dirmove_children_first = sorted(dirmove, reverse=True)
1116
1117    movewithdir = {}
1118    # check unaccounted nonoverlapping files against directory moves
1119    for f in addedfilesfn():
1120        if f not in fullcopy:
1121            for d in dirmove_children_first:
1122                if f.startswith(d):
1123                    # new file added in a directory that was moved, move it
1124                    df = dirmove[d] + f[len(d) :]
1125                    if df not in copy:
1126                        movewithdir[f] = df
1127                        repo.ui.debug(
1128                            b"   pending file src: '%s' -> dst: '%s'\n"
1129                            % (f, df)
1130                        )
1131                    break
1132
1133    return dirmove, movewithdir
1134
1135
1136def _heuristicscopytracing(repo, c1, c2, base):
1137    """Fast copytracing using filename heuristics
1138
1139    Assumes that moves or renames are of following two types:
1140
1141    1) Inside a directory only (same directory name but different filenames)
1142    2) Move from one directory to another
1143                    (same filenames but different directory names)
1144
1145    Works only when there are no merge commits in the "source branch".
1146    Source branch is commits from base up to c2 not including base.
1147
1148    If merge is involved it fallbacks to _fullcopytracing().
1149
1150    Can be used by setting the following config:
1151
1152        [experimental]
1153        copytrace = heuristics
1154
1155    In some cases the copy/move candidates found by heuristics can be very large
1156    in number and that will make the algorithm slow. The number of possible
1157    candidates to check can be limited by using the config
1158    `experimental.copytrace.movecandidateslimit` which defaults to 100.
1159    """
1160
1161    if c1.rev() is None:
1162        c1 = c1.p1()
1163    if c2.rev() is None:
1164        c2 = c2.p1()
1165
1166    changedfiles = set()
1167    m1 = c1.manifest()
1168    if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
1169        # If base is not in c2 branch, we switch to fullcopytracing
1170        repo.ui.debug(
1171            b"switching to full copytracing as base is not "
1172            b"an ancestor of c2\n"
1173        )
1174        return _fullcopytracing(repo, c1, c2, base)
1175
1176    ctx = c2
1177    while ctx != base:
1178        if len(ctx.parents()) == 2:
1179            # To keep things simple let's not handle merges
1180            repo.ui.debug(b"switching to full copytracing because of merges\n")
1181            return _fullcopytracing(repo, c1, c2, base)
1182        changedfiles.update(ctx.files())
1183        ctx = ctx.p1()
1184
1185    copies2 = {}
1186    cp = _forwardcopies(base, c2)
1187    for dst, src in pycompat.iteritems(cp):
1188        if src in m1:
1189            copies2[dst] = src
1190
1191    # file is missing if it isn't present in the destination, but is present in
1192    # the base and present in the source.
1193    # Presence in the base is important to exclude added files, presence in the
1194    # source is important to exclude removed files.
1195    filt = lambda f: f not in m1 and f in base and f in c2
1196    missingfiles = [f for f in changedfiles if filt(f)]
1197
1198    copies1 = {}
1199    if missingfiles:
1200        basenametofilename = collections.defaultdict(list)
1201        dirnametofilename = collections.defaultdict(list)
1202
1203        for f in m1.filesnotin(base.manifest()):
1204            basename = os.path.basename(f)
1205            dirname = os.path.dirname(f)
1206            basenametofilename[basename].append(f)
1207            dirnametofilename[dirname].append(f)
1208
1209        for f in missingfiles:
1210            basename = os.path.basename(f)
1211            dirname = os.path.dirname(f)
1212            samebasename = basenametofilename[basename]
1213            samedirname = dirnametofilename[dirname]
1214            movecandidates = samebasename + samedirname
1215            # f is guaranteed to be present in c2, that's why
1216            # c2.filectx(f) won't fail
1217            f2 = c2.filectx(f)
1218            # we can have a lot of candidates which can slow down the heuristics
1219            # config value to limit the number of candidates moves to check
1220            maxcandidates = repo.ui.configint(
1221                b'experimental', b'copytrace.movecandidateslimit'
1222            )
1223
1224            if len(movecandidates) > maxcandidates:
1225                repo.ui.status(
1226                    _(
1227                        b"skipping copytracing for '%s', more "
1228                        b"candidates than the limit: %d\n"
1229                    )
1230                    % (f, len(movecandidates))
1231                )
1232                continue
1233
1234            for candidate in movecandidates:
1235                f1 = c1.filectx(candidate)
1236                if _related(f1, f2):
1237                    # if there are a few related copies then we'll merge
1238                    # changes into all of them. This matches the behaviour
1239                    # of upstream copytracing
1240                    copies1[candidate] = f
1241
1242    return branch_copies(copies1), branch_copies(copies2), {}
1243
1244
1245def _related(f1, f2):
1246    """return True if f1 and f2 filectx have a common ancestor
1247
1248    Walk back to common ancestor to see if the two files originate
1249    from the same file. Since workingfilectx's rev() is None it messes
1250    up the integer comparison logic, hence the pre-step check for
1251    None (f1 and f2 can only be workingfilectx's initially).
1252    """
1253
1254    if f1 == f2:
1255        return True  # a match
1256
1257    g1, g2 = f1.ancestors(), f2.ancestors()
1258    try:
1259        f1r, f2r = f1.linkrev(), f2.linkrev()
1260
1261        if f1r is None:
1262            f1 = next(g1)
1263        if f2r is None:
1264            f2 = next(g2)
1265
1266        while True:
1267            f1r, f2r = f1.linkrev(), f2.linkrev()
1268            if f1r > f2r:
1269                f1 = next(g1)
1270            elif f2r > f1r:
1271                f2 = next(g2)
1272            else:  # f1 and f2 point to files in the same linkrev
1273                return f1 == f2  # true if they point to the same file
1274    except StopIteration:
1275        return False
1276
1277
1278def graftcopies(wctx, ctx, base):
1279    """reproduce copies between base and ctx in the wctx
1280
1281    Unlike mergecopies(), this function will only consider copies between base
1282    and ctx; it will ignore copies between base and wctx. Also unlike
1283    mergecopies(), this function will apply copies to the working copy (instead
1284    of just returning information about the copies). That makes it cheaper
1285    (especially in the common case of base==ctx.p1()) and useful also when
1286    experimental.copytrace=off.
1287
1288    merge.update() will have already marked most copies, but it will only
1289    mark copies if it thinks the source files are related (see
1290    merge._related()). It will also not mark copies if the file wasn't modified
1291    on the local side. This function adds the copies that were "missed"
1292    by merge.update().
1293    """
1294    new_copies = pathcopies(base, ctx)
1295    parent = wctx.p1()
1296    _filter(parent, wctx, new_copies)
1297    # Extra filtering to drop copy information for files that existed before
1298    # the graft. This is to handle the case of grafting a rename onto a commit
1299    # that already has the rename. Otherwise the presence of copy information
1300    # would result in the creation of an empty commit where we would prefer to
1301    # not create one.
1302    for dest, __ in list(new_copies.items()):
1303        if dest in parent:
1304            del new_copies[dest]
1305    for dst, src in pycompat.iteritems(new_copies):
1306        wctx[dst].markcopied(src)
1307