1# coding: utf8 2# copies.py - copy detection for Mercurial 3# 4# Copyright 2008 Olivia Mackall <olivia@selenic.com> 5# 6# This software may be used and distributed according to the terms of the 7# GNU General Public License version 2 or any later version. 8 9from __future__ import absolute_import 10 11import collections 12import os 13 14from .i18n import _ 15from .node import nullrev 16 17from . import ( 18 match as matchmod, 19 pathutil, 20 policy, 21 pycompat, 22 util, 23) 24 25 26from .utils import stringutil 27 28from .revlogutils import ( 29 flagutil, 30 sidedata as sidedatamod, 31) 32 33rustmod = policy.importrust("copy_tracing") 34 35 36def _filter(src, dst, t): 37 """filters out invalid copies after chaining""" 38 39 # When _chain()'ing copies in 'a' (from 'src' via some other commit 'mid') 40 # with copies in 'b' (from 'mid' to 'dst'), we can get the different cases 41 # in the following table (not including trivial cases). For example, case 6 42 # is where a file existed in 'src' and remained under that name in 'mid' and 43 # then was renamed between 'mid' and 'dst'. 44 # 45 # case src mid dst result 46 # 1 x y - - 47 # 2 x y y x->y 48 # 3 x y x - 49 # 4 x y z x->z 50 # 5 - x y - 51 # 6 x x y x->y 52 # 53 # _chain() takes care of chaining the copies in 'a' and 'b', but it 54 # cannot tell the difference between cases 1 and 2, between 3 and 4, or 55 # between 5 and 6, so it includes all cases in its result. 56 # Cases 1, 3, and 5 are then removed by _filter(). 
57 58 for k, v in list(t.items()): 59 if k == v: # case 3 60 del t[k] 61 elif v not in src: # case 5 62 # remove copies from files that didn't exist 63 del t[k] 64 elif k not in dst: # case 1 65 # remove copies to files that were then removed 66 del t[k] 67 68 69def _chain(prefix, suffix): 70 """chain two sets of copies 'prefix' and 'suffix'""" 71 result = prefix.copy() 72 for key, value in pycompat.iteritems(suffix): 73 result[key] = prefix.get(value, value) 74 return result 75 76 77def _tracefile(fctx, am, basemf): 78 """return file context that is the ancestor of fctx present in ancestor 79 manifest am 80 81 Note: we used to try and stop after a given limit, however checking if that 82 limit is reached turned out to be very expensive. we are better off 83 disabling that feature.""" 84 85 for f in fctx.ancestors(): 86 path = f.path() 87 if am.get(path, None) == f.filenode(): 88 return path 89 if basemf and basemf.get(path, None) == f.filenode(): 90 return path 91 92 93def _dirstatecopies(repo, match=None): 94 ds = repo.dirstate 95 c = ds.copies().copy() 96 for k in list(c): 97 if not ds.get_entry(k).tracked or (match and not match(k)): 98 del c[k] 99 return c 100 101 102def _computeforwardmissing(a, b, match=None): 103 """Computes which files are in b but not a. 104 This is its own function so extensions can easily wrap this call to see what 105 files _forwardcopies is about to process. 
106 """ 107 ma = a.manifest() 108 mb = b.manifest() 109 return mb.filesnotin(ma, match=match) 110 111 112def usechangesetcentricalgo(repo): 113 """Checks if we should use changeset-centric copy algorithms""" 114 if repo.filecopiesmode == b'changeset-sidedata': 115 return True 116 readfrom = repo.ui.config(b'experimental', b'copies.read-from') 117 changesetsource = (b'changeset-only', b'compatibility') 118 return readfrom in changesetsource 119 120 121def _committedforwardcopies(a, b, base, match): 122 """Like _forwardcopies(), but b.rev() cannot be None (working copy)""" 123 # files might have to be traced back to the fctx parent of the last 124 # one-side-only changeset, but not further back than that 125 repo = a._repo 126 127 if usechangesetcentricalgo(repo): 128 return _changesetforwardcopies(a, b, match) 129 130 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies') 131 dbg = repo.ui.debug 132 if debug: 133 dbg(b'debug.copies: looking into rename from %s to %s\n' % (a, b)) 134 am = a.manifest() 135 basemf = None if base is None else base.manifest() 136 137 # find where new files came from 138 # we currently don't try to find where old files went, too expensive 139 # this means we can miss a case like 'hg rm b; hg cp a b' 140 cm = {} 141 142 # Computing the forward missing is quite expensive on large manifests, since 143 # it compares the entire manifests. We can optimize it in the common use 144 # case of computing what copies are in a commit versus its parent (like 145 # during a rebase or histedit). Note, we exclude merge commits from this 146 # optimization, since the ctx.files() for a merge commit is not correct for 147 # this comparison. 
148 forwardmissingmatch = match 149 if b.p1() == a and b.p2().rev() == nullrev: 150 filesmatcher = matchmod.exact(b.files()) 151 forwardmissingmatch = matchmod.intersectmatchers(match, filesmatcher) 152 if repo.ui.configbool(b'devel', b'copy-tracing.trace-all-files'): 153 missing = list(b.walk(match)) 154 # _computeforwardmissing(a, b, match=forwardmissingmatch) 155 if debug: 156 dbg(b'debug.copies: searching all files: %d\n' % len(missing)) 157 else: 158 missing = _computeforwardmissing(a, b, match=forwardmissingmatch) 159 if debug: 160 dbg( 161 b'debug.copies: missing files to search: %d\n' 162 % len(missing) 163 ) 164 165 ancestrycontext = a._repo.changelog.ancestors([b.rev()], inclusive=True) 166 167 for f in sorted(missing): 168 if debug: 169 dbg(b'debug.copies: tracing file: %s\n' % f) 170 fctx = b[f] 171 fctx._ancestrycontext = ancestrycontext 172 173 if debug: 174 start = util.timer() 175 opath = _tracefile(fctx, am, basemf) 176 if opath: 177 if debug: 178 dbg(b'debug.copies: rename of: %s\n' % opath) 179 cm[f] = opath 180 if debug: 181 dbg( 182 b'debug.copies: time: %f seconds\n' 183 % (util.timer() - start) 184 ) 185 return cm 186 187 188def _revinfo_getter(repo, match): 189 """returns a function that returns the following data given a <rev>" 190 191 * p1: revision number of first parent 192 * p2: revision number of first parent 193 * changes: a ChangingFiles object 194 """ 195 cl = repo.changelog 196 parents = cl.parentrevs 197 flags = cl.flags 198 199 HASCOPIESINFO = flagutil.REVIDX_HASCOPIESINFO 200 201 changelogrevision = cl.changelogrevision 202 203 if rustmod is not None: 204 205 def revinfo(rev): 206 p1, p2 = parents(rev) 207 if flags(rev) & HASCOPIESINFO: 208 raw = changelogrevision(rev)._sidedata.get(sidedatamod.SD_FILES) 209 else: 210 raw = None 211 return (p1, p2, raw) 212 213 else: 214 215 def revinfo(rev): 216 p1, p2 = parents(rev) 217 if flags(rev) & HASCOPIESINFO: 218 changes = changelogrevision(rev).changes 219 else: 220 changes = None 221 
return (p1, p2, changes) 222 223 return revinfo 224 225 226def cached_is_ancestor(is_ancestor): 227 """return a cached version of is_ancestor""" 228 cache = {} 229 230 def _is_ancestor(anc, desc): 231 if anc > desc: 232 return False 233 elif anc == desc: 234 return True 235 key = (anc, desc) 236 ret = cache.get(key) 237 if ret is None: 238 ret = cache[key] = is_ancestor(anc, desc) 239 return ret 240 241 return _is_ancestor 242 243 244def _changesetforwardcopies(a, b, match): 245 if a.rev() in (nullrev, b.rev()): 246 return {} 247 248 repo = a.repo().unfiltered() 249 children = {} 250 251 cl = repo.changelog 252 isancestor = cl.isancestorrev 253 254 # To track rename from "A" to B, we need to gather all parent → children 255 # edges that are contains in `::B` but not in `::A`. 256 # 257 # 258 # To do so, we need to gather all revisions exclusive¹ to "B" (ie¹: `::b - 259 # ::a`) and also all the "roots point", ie the parents of the exclusive set 260 # that belong to ::a. These are exactly all the revisions needed to express 261 # the parent → children we need to combine. 262 # 263 # [1] actually, we need to gather all the edges within `(::a)::b`, ie: 264 # excluding paths that leads to roots that are not ancestors of `a`. We 265 # keep this out of the explanation because it is hard enough without this special case.. 
266 267 parents = cl._uncheckedparentrevs 268 graph_roots = (nullrev, nullrev) 269 270 ancestors = cl.ancestors([a.rev()], inclusive=True) 271 revs = cl.findmissingrevs(common=[a.rev()], heads=[b.rev()]) 272 roots = set() 273 has_graph_roots = False 274 multi_thread = repo.ui.configbool(b'devel', b'copy-tracing.multi-thread') 275 276 # iterate over `only(B, A)` 277 for r in revs: 278 ps = parents(r) 279 if ps == graph_roots: 280 has_graph_roots = True 281 else: 282 p1, p2 = ps 283 284 # find all the "root points" (see larger comment above) 285 if p1 != nullrev and p1 in ancestors: 286 roots.add(p1) 287 if p2 != nullrev and p2 in ancestors: 288 roots.add(p2) 289 if not roots: 290 # no common revision to track copies from 291 return {} 292 if has_graph_roots: 293 # this deal with the special case mentionned in the [1] footnotes. We 294 # must filter out revisions that leads to non-common graphroots. 295 roots = list(roots) 296 m = min(roots) 297 h = [b.rev()] 298 roots_to_head = cl.reachableroots(m, h, roots, includepath=True) 299 roots_to_head = set(roots_to_head) 300 revs = [r for r in revs if r in roots_to_head] 301 302 if repo.filecopiesmode == b'changeset-sidedata': 303 # When using side-data, we will process the edges "from" the children. 304 # We iterate over the childre, gathering previous collected data for 305 # the parents. Do know when the parents data is no longer necessary, we 306 # keep a counter of how many children each revision has. 307 # 308 # An interresting property of `children_count` is that it only contains 309 # revision that will be relevant for a edge of the graph. So if a 310 # children has parent not in `children_count`, that edges should not be 311 # processed. 
312 children_count = dict((r, 0) for r in roots) 313 for r in revs: 314 for p in cl.parentrevs(r): 315 if p == nullrev: 316 continue 317 children_count[r] = 0 318 if p in children_count: 319 children_count[p] += 1 320 revinfo = _revinfo_getter(repo, match) 321 with repo.changelog.reading(): 322 return _combine_changeset_copies( 323 revs, 324 children_count, 325 b.rev(), 326 revinfo, 327 match, 328 isancestor, 329 multi_thread, 330 ) 331 else: 332 # When not using side-data, we will process the edges "from" the parent. 333 # so we need a full mapping of the parent -> children relation. 334 children = dict((r, []) for r in roots) 335 for r in revs: 336 for p in cl.parentrevs(r): 337 if p == nullrev: 338 continue 339 children[r] = [] 340 if p in children: 341 children[p].append(r) 342 x = revs.pop() 343 assert x == b.rev() 344 revs.extend(roots) 345 revs.sort() 346 347 revinfo = _revinfo_getter_extra(repo) 348 return _combine_changeset_copies_extra( 349 revs, children, b.rev(), revinfo, match, isancestor 350 ) 351 352 353def _combine_changeset_copies( 354 revs, children_count, targetrev, revinfo, match, isancestor, multi_thread 355): 356 """combine the copies information for each item of iterrevs 357 358 revs: sorted iterable of revision to visit 359 children_count: a {parent: <number-of-relevant-children>} mapping. 360 targetrev: the final copies destination revision (not in iterrevs) 361 revinfo(rev): a function that return (p1, p2, p1copies, p2copies, removed) 362 match: a matcher 363 364 It returns the aggregated copies information for `targetrev`. 
365 """ 366 367 alwaysmatch = match.always() 368 369 if rustmod is not None: 370 final_copies = rustmod.combine_changeset_copies( 371 list(revs), children_count, targetrev, revinfo, multi_thread 372 ) 373 else: 374 isancestor = cached_is_ancestor(isancestor) 375 376 all_copies = {} 377 # iterate over all the "children" side of copy tracing "edge" 378 for current_rev in revs: 379 p1, p2, changes = revinfo(current_rev) 380 current_copies = None 381 # iterate over all parents to chain the existing data with the 382 # data from the parent → child edge. 383 for parent, parent_rev in ((1, p1), (2, p2)): 384 if parent_rev == nullrev: 385 continue 386 remaining_children = children_count.get(parent_rev) 387 if remaining_children is None: 388 continue 389 remaining_children -= 1 390 children_count[parent_rev] = remaining_children 391 if remaining_children: 392 copies = all_copies.get(parent_rev, None) 393 else: 394 copies = all_copies.pop(parent_rev, None) 395 396 if copies is None: 397 # this is a root 398 newcopies = copies = {} 399 elif remaining_children: 400 newcopies = copies.copy() 401 else: 402 newcopies = copies 403 # chain the data in the edge with the existing data 404 if changes is not None: 405 childcopies = {} 406 if parent == 1: 407 childcopies = changes.copied_from_p1 408 elif parent == 2: 409 childcopies = changes.copied_from_p2 410 411 if childcopies: 412 newcopies = copies.copy() 413 for dest, source in pycompat.iteritems(childcopies): 414 prev = copies.get(source) 415 if prev is not None and prev[1] is not None: 416 source = prev[1] 417 newcopies[dest] = (current_rev, source) 418 assert newcopies is not copies 419 if changes.removed: 420 for f in changes.removed: 421 if f in newcopies: 422 if newcopies is copies: 423 # copy on write to avoid affecting potential other 424 # branches. when there are no other branches, this 425 # could be avoided. 
426 newcopies = copies.copy() 427 newcopies[f] = (current_rev, None) 428 # check potential need to combine the data from another parent (for 429 # that child). See comment below for details. 430 if current_copies is None: 431 current_copies = newcopies 432 else: 433 # we are the second parent to work on c, we need to merge our 434 # work with the other. 435 # 436 # In case of conflict, parent 1 take precedence over parent 2. 437 # This is an arbitrary choice made anew when implementing 438 # changeset based copies. It was made without regards with 439 # potential filelog related behavior. 440 assert parent == 2 441 current_copies = _merge_copies_dict( 442 newcopies, 443 current_copies, 444 isancestor, 445 changes, 446 current_rev, 447 ) 448 all_copies[current_rev] = current_copies 449 450 # filter out internal details and return a {dest: source mapping} 451 final_copies = {} 452 for dest, (tt, source) in all_copies[targetrev].items(): 453 if source is not None: 454 final_copies[dest] = source 455 if not alwaysmatch: 456 for filename in list(final_copies.keys()): 457 if not match(filename): 458 del final_copies[filename] 459 return final_copies 460 461 462# constant to decide which side to pick with _merge_copies_dict 463PICK_MINOR = 0 464PICK_MAJOR = 1 465PICK_EITHER = 2 466 467 468def _merge_copies_dict(minor, major, isancestor, changes, current_merge): 469 """merge two copies-mapping together, minor and major 470 471 In case of conflict, value from "major" will be picked. 
472 473 - `isancestors(low_rev, high_rev)`: callable return True if `low_rev` is an 474 ancestors of `high_rev`, 475 476 - `ismerged(path)`: callable return True if `path` have been merged in the 477 current revision, 478 479 return the resulting dict (in practice, the "minor" object, updated) 480 """ 481 for dest, value in major.items(): 482 other = minor.get(dest) 483 if other is None: 484 minor[dest] = value 485 else: 486 pick, overwrite = _compare_values( 487 changes, isancestor, dest, other, value 488 ) 489 if overwrite: 490 if pick == PICK_MAJOR: 491 minor[dest] = (current_merge, value[1]) 492 else: 493 minor[dest] = (current_merge, other[1]) 494 elif pick == PICK_MAJOR: 495 minor[dest] = value 496 return minor 497 498 499def _compare_values(changes, isancestor, dest, minor, major): 500 """compare two value within a _merge_copies_dict loop iteration 501 502 return (pick, overwrite). 503 504 - pick is one of PICK_MINOR, PICK_MAJOR or PICK_EITHER 505 - overwrite is True if pick is a return of an ambiguity that needs resolution. 506 """ 507 major_tt, major_value = major 508 minor_tt, minor_value = minor 509 510 if major_tt == minor_tt: 511 # if it comes from the same revision it must be the same value 512 assert major_value == minor_value 513 return PICK_EITHER, False 514 elif ( 515 changes is not None 516 and minor_value is not None 517 and major_value is None 518 and dest in changes.salvaged 519 ): 520 # In this case, a deletion was reverted, the "alive" value overwrite 521 # the deleted one. 522 return PICK_MINOR, True 523 elif ( 524 changes is not None 525 and major_value is not None 526 and minor_value is None 527 and dest in changes.salvaged 528 ): 529 # In this case, a deletion was reverted, the "alive" value overwrite 530 # the deleted one. 
531 return PICK_MAJOR, True 532 elif isancestor(minor_tt, major_tt): 533 if changes is not None and dest in changes.merged: 534 # change to dest happened on the branch without copy-source change, 535 # so both source are valid and "major" wins. 536 return PICK_MAJOR, True 537 else: 538 return PICK_MAJOR, False 539 elif isancestor(major_tt, minor_tt): 540 if changes is not None and dest in changes.merged: 541 # change to dest happened on the branch without copy-source change, 542 # so both source are valid and "major" wins. 543 return PICK_MAJOR, True 544 else: 545 return PICK_MINOR, False 546 elif minor_value is None: 547 # in case of conflict, the "alive" side wins. 548 return PICK_MAJOR, True 549 elif major_value is None: 550 # in case of conflict, the "alive" side wins. 551 return PICK_MINOR, True 552 else: 553 # in case of conflict where both side are alive, major wins. 554 return PICK_MAJOR, True 555 556 557def _revinfo_getter_extra(repo): 558 """return a function that return multiple data given a <rev>"i 559 560 * p1: revision number of first parent 561 * p2: revision number of first parent 562 * p1copies: mapping of copies from p1 563 * p2copies: mapping of copies from p2 564 * removed: a list of removed files 565 * ismerged: a callback to know if file was merged in that revision 566 """ 567 cl = repo.changelog 568 parents = cl.parentrevs 569 570 def get_ismerged(rev): 571 ctx = repo[rev] 572 573 def ismerged(path): 574 if path not in ctx.files(): 575 return False 576 fctx = ctx[path] 577 parents = fctx._filelog.parents(fctx._filenode) 578 nb_parents = 0 579 for n in parents: 580 if n != repo.nullid: 581 nb_parents += 1 582 return nb_parents >= 2 583 584 return ismerged 585 586 def revinfo(rev): 587 p1, p2 = parents(rev) 588 ctx = repo[rev] 589 p1copies, p2copies = ctx._copies 590 removed = ctx.filesremoved() 591 return p1, p2, p1copies, p2copies, removed, get_ismerged(rev) 592 593 return revinfo 594 595 596def _combine_changeset_copies_extra( 597 revs, 
children, targetrev, revinfo, match, isancestor 598): 599 """version of `_combine_changeset_copies` that works with the Google 600 specific "extra" based storage for copy information""" 601 all_copies = {} 602 alwaysmatch = match.always() 603 for r in revs: 604 copies = all_copies.pop(r, None) 605 if copies is None: 606 # this is a root 607 copies = {} 608 for i, c in enumerate(children[r]): 609 p1, p2, p1copies, p2copies, removed, ismerged = revinfo(c) 610 if r == p1: 611 parent = 1 612 childcopies = p1copies 613 else: 614 assert r == p2 615 parent = 2 616 childcopies = p2copies 617 if not alwaysmatch: 618 childcopies = { 619 dst: src for dst, src in childcopies.items() if match(dst) 620 } 621 newcopies = copies 622 if childcopies: 623 newcopies = copies.copy() 624 for dest, source in pycompat.iteritems(childcopies): 625 prev = copies.get(source) 626 if prev is not None and prev[1] is not None: 627 source = prev[1] 628 newcopies[dest] = (c, source) 629 assert newcopies is not copies 630 for f in removed: 631 if f in newcopies: 632 if newcopies is copies: 633 # copy on write to avoid affecting potential other 634 # branches. when there are no other branches, this 635 # could be avoided. 636 newcopies = copies.copy() 637 newcopies[f] = (c, None) 638 othercopies = all_copies.get(c) 639 if othercopies is None: 640 all_copies[c] = newcopies 641 else: 642 # we are the second parent to work on c, we need to merge our 643 # work with the other. 644 # 645 # In case of conflict, parent 1 take precedence over parent 2. 646 # This is an arbitrary choice made anew when implementing 647 # changeset based copies. It was made without regards with 648 # potential filelog related behavior. 
649 if parent == 1: 650 _merge_copies_dict_extra( 651 othercopies, newcopies, isancestor, ismerged 652 ) 653 else: 654 _merge_copies_dict_extra( 655 newcopies, othercopies, isancestor, ismerged 656 ) 657 all_copies[c] = newcopies 658 659 final_copies = {} 660 for dest, (tt, source) in all_copies[targetrev].items(): 661 if source is not None: 662 final_copies[dest] = source 663 return final_copies 664 665 666def _merge_copies_dict_extra(minor, major, isancestor, ismerged): 667 """version of `_merge_copies_dict` that works with the Google 668 specific "extra" based storage for copy information""" 669 for dest, value in major.items(): 670 other = minor.get(dest) 671 if other is None: 672 minor[dest] = value 673 else: 674 new_tt = value[0] 675 other_tt = other[0] 676 if value[1] == other[1]: 677 continue 678 # content from "major" wins, unless it is older 679 # than the branch point or there is a merge 680 if ( 681 new_tt == other_tt 682 or not isancestor(new_tt, other_tt) 683 or ismerged(dest) 684 ): 685 minor[dest] = value 686 687 688def _forwardcopies(a, b, base=None, match=None): 689 """find {dst@b: src@a} copy mapping where a is an ancestor of b""" 690 691 if base is None: 692 base = a 693 match = a.repo().narrowmatch(match) 694 # check for working copy 695 if b.rev() is None: 696 cm = _committedforwardcopies(a, b.p1(), base, match) 697 # combine copies from dirstate if necessary 698 copies = _chain(cm, _dirstatecopies(b._repo, match)) 699 else: 700 copies = _committedforwardcopies(a, b, base, match) 701 return copies 702 703 704def _backwardrenames(a, b, match): 705 """find renames from a to b""" 706 if a._repo.ui.config(b'experimental', b'copytrace') == b'off': 707 return {} 708 709 # We don't want to pass in "match" here, since that would filter 710 # the destination by it. Since we're reversing the copies, we want 711 # to filter the source instead. 
712 copies = _forwardcopies(b, a) 713 return _reverse_renames(copies, a, match) 714 715 716def _reverse_renames(copies, dst, match): 717 """given copies to context 'dst', finds renames from that context""" 718 # Even though we're not taking copies into account, 1:n rename situations 719 # can still exist (e.g. hg cp a b; hg mv a c). In those cases we 720 # arbitrarily pick one of the renames. 721 r = {} 722 for k, v in sorted(pycompat.iteritems(copies)): 723 if match and not match(v): 724 continue 725 # remove copies 726 if v in dst: 727 continue 728 r[v] = k 729 return r 730 731 732def pathcopies(x, y, match=None): 733 """find {dst@y: src@x} copy mapping for directed compare""" 734 repo = x._repo 735 debug = repo.ui.debugflag and repo.ui.configbool(b'devel', b'debug.copies') 736 if debug: 737 repo.ui.debug( 738 b'debug.copies: searching copies from %s to %s\n' % (x, y) 739 ) 740 if x == y or not x or not y: 741 return {} 742 if y.rev() is None and x == y.p1(): 743 if debug: 744 repo.ui.debug(b'debug.copies: search mode: dirstate\n') 745 # short-circuit to avoid issues with merge states 746 return _dirstatecopies(repo, match) 747 a = y.ancestor(x) 748 if a == x: 749 if debug: 750 repo.ui.debug(b'debug.copies: search mode: forward\n') 751 copies = _forwardcopies(x, y, match=match) 752 elif a == y: 753 if debug: 754 repo.ui.debug(b'debug.copies: search mode: backward\n') 755 copies = _backwardrenames(x, y, match=match) 756 else: 757 if debug: 758 repo.ui.debug(b'debug.copies: search mode: combined\n') 759 base = None 760 if a.rev() != nullrev: 761 base = x 762 x_copies = _forwardcopies(a, x) 763 y_copies = _forwardcopies(a, y, base, match=match) 764 same_keys = set(x_copies) & set(y_copies) 765 for k in same_keys: 766 if x_copies.get(k) == y_copies.get(k): 767 del x_copies[k] 768 del y_copies[k] 769 x_backward_renames = _reverse_renames(x_copies, x, match) 770 copies = _chain( 771 x_backward_renames, 772 y_copies, 773 ) 774 _filter(x, y, copies) 775 return copies 776 
777 778def mergecopies(repo, c1, c2, base): 779 """ 780 Finds moves and copies between context c1 and c2 that are relevant for 781 merging. 'base' will be used as the merge base. 782 783 Copytracing is used in commands like rebase, merge, unshelve, etc to merge 784 files that were moved/ copied in one merge parent and modified in another. 785 For example: 786 787 o ---> 4 another commit 788 | 789 | o ---> 3 commit that modifies a.txt 790 | / 791 o / ---> 2 commit that moves a.txt to b.txt 792 |/ 793 o ---> 1 merge base 794 795 If we try to rebase revision 3 on revision 4, since there is no a.txt in 796 revision 4, and if user have copytrace disabled, we prints the following 797 message: 798 799 ```other changed <file> which local deleted``` 800 801 Returns a tuple where: 802 803 "branch_copies" an instance of branch_copies. 804 805 "diverge" is a mapping of source name -> list of destination names 806 for divergent renames. 807 808 This function calls different copytracing algorithms based on config. 
809 """ 810 # avoid silly behavior for update from empty dir 811 if not c1 or not c2 or c1 == c2: 812 return branch_copies(), branch_copies(), {} 813 814 narrowmatch = c1.repo().narrowmatch() 815 816 # avoid silly behavior for parent -> working dir 817 if c2.node() is None and c1.node() == repo.dirstate.p1(): 818 return ( 819 branch_copies(_dirstatecopies(repo, narrowmatch)), 820 branch_copies(), 821 {}, 822 ) 823 824 copytracing = repo.ui.config(b'experimental', b'copytrace') 825 if stringutil.parsebool(copytracing) is False: 826 # stringutil.parsebool() returns None when it is unable to parse the 827 # value, so we should rely on making sure copytracing is on such cases 828 return branch_copies(), branch_copies(), {} 829 830 if usechangesetcentricalgo(repo): 831 # The heuristics don't make sense when we need changeset-centric algos 832 return _fullcopytracing(repo, c1, c2, base) 833 834 # Copy trace disabling is explicitly below the node == p1 logic above 835 # because the logic above is required for a simple copy to be kept across a 836 # rebase. 837 if copytracing == b'heuristics': 838 # Do full copytracing if only non-public revisions are involved as 839 # that will be fast enough and will also cover the copies which could 840 # be missed by heuristics 841 if _isfullcopytraceable(repo, c1, base): 842 return _fullcopytracing(repo, c1, c2, base) 843 return _heuristicscopytracing(repo, c1, c2, base) 844 else: 845 return _fullcopytracing(repo, c1, c2, base) 846 847 848def _isfullcopytraceable(repo, c1, base): 849 """Checks that if base, source and destination are all no-public branches, 850 if yes let's use the full copytrace algorithm for increased capabilities 851 since it will be fast enough. 852 853 `experimental.copytrace.sourcecommitlimit` can be used to set a limit for 854 number of changesets from c1 to base such that if number of changesets are 855 more than the limit, full copytracing algorithm won't be used. 
856 """ 857 if c1.rev() is None: 858 c1 = c1.p1() 859 if c1.mutable() and base.mutable(): 860 sourcecommitlimit = repo.ui.configint( 861 b'experimental', b'copytrace.sourcecommitlimit' 862 ) 863 commits = len(repo.revs(b'%d::%d', base.rev(), c1.rev())) 864 return commits < sourcecommitlimit 865 return False 866 867 868def _checksinglesidecopies( 869 src, dsts1, m1, m2, mb, c2, base, copy, renamedelete 870): 871 if src not in m2: 872 # deleted on side 2 873 if src not in m1: 874 # renamed on side 1, deleted on side 2 875 renamedelete[src] = dsts1 876 elif src not in mb: 877 # Work around the "short-circuit to avoid issues with merge states" 878 # thing in pathcopies(): pathcopies(x, y) can return a copy where the 879 # destination doesn't exist in y. 880 pass 881 elif mb[src] != m2[src] and not _related(c2[src], base[src]): 882 return 883 elif mb[src] != m2[src] or mb.flags(src) != m2.flags(src): 884 # modified on side 2 885 for dst in dsts1: 886 copy[dst] = src 887 888 889class branch_copies(object): 890 """Information about copies made on one side of a merge/graft. 891 892 "copy" is a mapping from destination name -> source name, 893 where source is in c1 and destination is in c2 or vice-versa. 894 895 "movewithdir" is a mapping from source name -> destination name, 896 where the file at source present in one context but not the other 897 needs to be moved to destination by the merge process, because the 898 other context moved the directory it is in. 899 900 "renamedelete" is a mapping of source name -> list of destination 901 names for files deleted in c1 that were renamed in c2 or vice-versa. 902 903 "dirmove" is a mapping of detected source dir -> destination dir renames. 904 This is needed for handling changes to new files previously grafted into 905 renamed directories. 
    """

    def __init__(
        self, copy=None, renamedelete=None, dirmove=None, movewithdir=None
    ):
        # copy: mapping of destination path -> source path
        self.copy = {} if copy is None else copy
        # renamedelete: source path -> list of destinations, for files
        # renamed on one side and deleted on the other
        self.renamedelete = {} if renamedelete is None else renamedelete
        # dirmove: moved directory (with trailing slash) -> new directory
        self.dirmove = {} if dirmove is None else dirmove
        # movewithdir: file path -> new path, for files that should follow a
        # directory move
        self.movewithdir = {} if movewithdir is None else movewithdir

    def __repr__(self):
        # Multi-line repr so the four mappings stay readable in debug output.
        return '<branch_copies\n copy=%r\n renamedelete=%r\n dirmove=%r\n movewithdir=%r\n>' % (
            self.copy,
            self.renamedelete,
            self.dirmove,
            self.movewithdir,
        )


def _fullcopytracing(repo, c1, c2, base):
    """The full copytracing algorithm which finds all the new files that were
    added from merge base up to the top commit and for each file it checks if
    this file was copied from another file.

    This is pretty slow when a lot of changesets are involved but will track all
    the copies.

    Returns a ``(branch_copies1, branch_copies2, diverge)`` tuple, where the
    first two elements describe copies relative to c1 and c2 respectively,
    and ``diverge`` maps a source file to the sorted list of divergent rename
    destinations (renamed to different names on both sides).
    """
    m1 = c1.manifest()
    m2 = c2.manifest()
    mb = base.manifest()

    # Copies from the merge base to each of the two heads being merged.
    copies1 = pathcopies(base, c1)
    copies2 = pathcopies(base, c2)

    if not (copies1 or copies2):
        return branch_copies(), branch_copies(), {}

    # Invert dst->src into src->[dsts] so copies of the same source made on
    # both sides can be compared.
    inversecopies1 = {}
    inversecopies2 = {}
    for dst, src in copies1.items():
        inversecopies1.setdefault(src, []).append(dst)
    for dst, src in copies2.items():
        inversecopies2.setdefault(src, []).append(dst)

    copy1 = {}
    copy2 = {}
    diverge = {}
    renamedelete1 = {}
    renamedelete2 = {}
    allsources = set(inversecopies1) | set(inversecopies2)
    for src in allsources:
        dsts1 = inversecopies1.get(src)
        dsts2 = inversecopies2.get(src)
        if dsts1 and dsts2:
            # copied/renamed on both sides
            if src not in m1 and src not in m2:
                # renamed on both sides (source gone from both manifests)
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                # If there's some overlap in the rename destinations, we
                # consider it not divergent. For example, if side 1 copies 'a'
                # to 'b' and 'c' and deletes 'a', and side 2 copies 'a' to 'c'
                # and 'd' and deletes 'a'.
                if dsts1 & dsts2:
                    # only the common destinations are recorded as copies
                    for dst in dsts1 & dsts2:
                        copy1[dst] = src
                        copy2[dst] = src
                else:
                    diverge[src] = sorted(dsts1 | dsts2)
            elif src in m1 and src in m2:
                # copied on both sides (source still exists in both manifests)
                dsts1 = set(dsts1)
                dsts2 = set(dsts2)
                for dst in dsts1 & dsts2:
                    copy1[dst] = src
                    copy2[dst] = src
            # TODO: Handle cases where it was renamed on one side and copied
            # on the other side
        elif dsts1:
            # copied/renamed only on side 1
            _checksinglesidecopies(
                src, dsts1, m1, m2, mb, c2, base, copy1, renamedelete1
            )
        elif dsts2:
            # copied/renamed only on side 2
            _checksinglesidecopies(
                src, dsts2, m2, m1, mb, c1, base, copy2, renamedelete2
            )

    # find interesting file sets from manifests
    cache = []

    def _get_addedfiles(idx):
        # Lazily compute (and memoize in ``cache``) the files added on each
        # side; idx 0 is the files unique to m1, idx 1 those unique to m2.
        if not cache:
            addedinm1 = m1.filesnotin(mb, repo.narrowmatch())
            addedinm2 = m2.filesnotin(mb, repo.narrowmatch())
            u1 = sorted(addedinm1 - addedinm2)
            u2 = sorted(addedinm2 - addedinm1)
            cache.extend((u1, u2))
        return cache[idx]

    u1fn = lambda: _get_addedfiles(0)
    u2fn = lambda: _get_addedfiles(1)
    if repo.ui.debugflag:
        # Debug-only reporting of unmatched files and all discovered copies;
        # the computation above is unaffected by this branch.
        u1 = u1fn()
        u2 = u2fn()

        header = b" unmatched files in %s"
        if u1:
            repo.ui.debug(
                b"%s:\n %s\n" % (header % b'local', b"\n ".join(u1))
            )
        if u2:
            repo.ui.debug(
                b"%s:\n %s\n" % (header % b'other', b"\n ".join(u2))
            )

        renamedeleteset = set()
        divergeset = set()
        for dsts in diverge.values():
            divergeset.update(dsts)
        for dsts in renamedelete1.values():
            renamedeleteset.update(dsts)
        for dsts in renamedelete2.values():
            renamedeleteset.update(dsts)

        repo.ui.debug(
            b" all copies found (* = to merge, ! = divergent, "
            b"% = renamed and deleted):\n"
        )
        for side, copies in ((b"local", copies1), (b"remote", copies2)):
            if not copies:
                continue
            repo.ui.debug(b" on %s side:\n" % side)
            for f in sorted(copies):
                note = b""
                if f in copy1 or f in copy2:
                    note += b"*"
                if f in divergeset:
                    note += b"!"
                if f in renamedeleteset:
                    note += b"%"
                repo.ui.debug(
                    b" src: '%s' -> dst: '%s' %s\n" % (copies[f], f, note)
                )
        del renamedeleteset
        del divergeset

        repo.ui.debug(b" checking for directory renames\n")

    # Note the crossed operands: files added on one side are checked against
    # directory moves detected on the other side.
    dirmove1, movewithdir2 = _dir_renames(repo, c1, copy1, copies1, u2fn)
    dirmove2, movewithdir1 = _dir_renames(repo, c2, copy2, copies2, u1fn)

    branch_copies1 = branch_copies(copy1, renamedelete1, dirmove1, movewithdir1)
    branch_copies2 = branch_copies(copy2, renamedelete2, dirmove2, movewithdir2)

    return branch_copies1, branch_copies2, diverge


def _dir_renames(repo, ctx, copy, fullcopy, addedfilesfn):
    """Finds moved directories and files that should move with them.

    ctx: the context for one of the sides
    copy: files copied on the same side (as ctx)
    fullcopy: files copied on the same side (as ctx), including those that
    merge.manifestmerge() won't care about
    addedfilesfn: function returning added files on the other side (compared to
    ctx)

    Returns a ``(dirmove, movewithdir)`` pair: ``dirmove`` maps a moved
    directory (with trailing slash) to its new location, and ``movewithdir``
    maps files added on the other side to the path they should move to.
    """
    # generate a directory move map
    invalid = set()
    dirmove = {}

    # examine each file copy for a potential directory move, which is
    # when all the files in a directory are moved to a new directory
    for dst, src in pycompat.iteritems(fullcopy):
        dsrc, ddst = pathutil.dirname(src), pathutil.dirname(dst)
        if dsrc in invalid:
            # already seen to be uninteresting
            continue
        elif ctx.hasdir(dsrc) and ctx.hasdir(ddst):
            # directory wasn't entirely moved locally
            invalid.add(dsrc)
        elif dsrc in dirmove and dirmove[dsrc] != ddst:
            # files from the same directory moved to two different places
            invalid.add(dsrc)
        else:
            # looks good so far
            dirmove[dsrc] = ddst

    for i in invalid:
        if i in dirmove:
            del dirmove[i]
    del invalid

    if not dirmove:
        return {}, {}

    # Normalize to trailing-slash form so prefix matching below cannot match
    # a sibling directory that merely shares a name prefix.
    dirmove = {k + b"/": v + b"/" for k, v in pycompat.iteritems(dirmove)}

    for d in dirmove:
        repo.ui.debug(
            b" discovered dir src: '%s' -> dst: '%s'\n" % (d, dirmove[d])
        )

    # Sort the directories in reverse order, so we find children first
    # For example, if dir1/ was renamed to dir2/, and dir1/subdir1/
    # was renamed to dir2/subdir2/, we want to move dir1/subdir1/file
    # to dir2/subdir2/file (not dir2/subdir1/file)
    dirmove_children_first = sorted(dirmove, reverse=True)

    movewithdir = {}
    # check unaccounted nonoverlapping files against directory moves
    for f in addedfilesfn():
        if f not in fullcopy:
            for d in dirmove_children_first:
                if f.startswith(d):
                    # new file added in a directory that was moved, move it
                    df = dirmove[d] + f[len(d) :]
                    if df not in copy:
                        movewithdir[f] = df
                        repo.ui.debug(
                            b" pending file src: '%s' -> dst: '%s'\n"
                            % (f, df)
                        )
                    break

    return dirmove, movewithdir


def _heuristicscopytracing(repo, c1, c2, base):
    """Fast copytracing using filename heuristics

    Assumes that moves or renames are of following two types:

    1) Inside a directory only (same directory name but different filenames)
    2) Move from one directory to another
       (same filenames but different directory names)

    Works only when there are no merge commits in the "source branch".
    Source branch is commits from base up to c2 not including base.

    If merge is involved it fallbacks to _fullcopytracing().

    Can be used by setting the following config:

        [experimental]
        copytrace = heuristics

    In some cases the copy/move candidates found by heuristics can be very large
    in number and that will make the algorithm slow. The number of possible
    candidates to check can be limited by using the config
    `experimental.copytrace.movecandidateslimit` which defaults to 100.
    """

    # Working contexts have rev() of None; substitute their first parent so
    # the revision-based checks below work.
    if c1.rev() is None:
        c1 = c1.p1()
    if c2.rev() is None:
        c2 = c2.p1()

    changedfiles = set()
    m1 = c1.manifest()
    if not repo.revs(b'%d::%d', base.rev(), c2.rev()):
        # If base is not in c2 branch, we switch to fullcopytracing
        repo.ui.debug(
            b"switching to full copytracing as base is not "
            b"an ancestor of c2\n"
        )
        return _fullcopytracing(repo, c1, c2, base)

    # Collect every file touched between base and c2, bailing out to the full
    # algorithm if we hit a merge commit on the way.
    ctx = c2
    while ctx != base:
        if len(ctx.parents()) == 2:
            # To keep things simple let's not handle merges
            repo.ui.debug(b"switching to full copytracing because of merges\n")
            return _fullcopytracing(repo, c1, c2, base)
        changedfiles.update(ctx.files())
        ctx = ctx.p1()

    # Copies made between base and c2 whose source still exists in c1.
    copies2 = {}
    cp = _forwardcopies(base, c2)
    for dst, src in pycompat.iteritems(cp):
        if src in m1:
            copies2[dst] = src

    # file is missing if it isn't present in the destination, but is present in
    # the base and present in the source.
    # Presence in the base is important to exclude added files, presence in the
    # source is important to exclude removed files.
    filt = lambda f: f not in m1 and f in base and f in c2
    missingfiles = [f for f in changedfiles if filt(f)]

    copies1 = {}
    if missingfiles:
        # Index the files added in c1 (relative to base) by basename and by
        # dirname, matching the two rename shapes this heuristic handles.
        basenametofilename = collections.defaultdict(list)
        dirnametofilename = collections.defaultdict(list)

        for f in m1.filesnotin(base.manifest()):
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            basenametofilename[basename].append(f)
            dirnametofilename[dirname].append(f)

        for f in missingfiles:
            basename = os.path.basename(f)
            dirname = os.path.dirname(f)
            samebasename = basenametofilename[basename]
            samedirname = dirnametofilename[dirname]
            movecandidates = samebasename + samedirname
            # f is guaranteed to be present in c2, that's why
            # c2.filectx(f) won't fail
            f2 = c2.filectx(f)
            # we can have a lot of candidates which can slow down the heuristics
            # config value to limit the number of candidates moves to check
            maxcandidates = repo.ui.configint(
                b'experimental', b'copytrace.movecandidateslimit'
            )

            if len(movecandidates) > maxcandidates:
                repo.ui.status(
                    _(
                        b"skipping copytracing for '%s', more "
                        b"candidates than the limit: %d\n"
                    )
                    % (f, len(movecandidates))
                )
                continue

            for candidate in movecandidates:
                f1 = c1.filectx(candidate)
                if _related(f1, f2):
                    # if there are a few related copies then we'll merge
                    # changes into all of them. This matches the behaviour
                    # of upstream copytracing
                    copies1[candidate] = f

    return branch_copies(copies1), branch_copies(copies2), {}


def _related(f1, f2):
    """return True if f1 and f2 filectx have a common ancestor

    Walk back to common ancestor to see if the two files originate
    from the same file. Since workingfilectx's rev() is None it messes
    up the integer comparison logic, hence the pre-step check for
    None (f1 and f2 can only be workingfilectx's initially).
    """

    if f1 == f2:
        return True  # a match

    g1, g2 = f1.ancestors(), f2.ancestors()
    try:
        f1r, f2r = f1.linkrev(), f2.linkrev()

        # Step past the working-copy contexts (linkrev None) before comparing
        # integer linkrevs.
        if f1r is None:
            f1 = next(g1)
        if f2r is None:
            f2 = next(g2)

        # Walk whichever side is at the higher linkrev back through its
        # ancestors until both sides land on the same linkrev; running out of
        # ancestors (StopIteration) means the files are unrelated.
        while True:
            f1r, f2r = f1.linkrev(), f2.linkrev()
            if f1r > f2r:
                f1 = next(g1)
            elif f2r > f1r:
                f2 = next(g2)
            else:  # f1 and f2 point to files in the same linkrev
                return f1 == f2  # true if they point to the same file
    except StopIteration:
        return False


def graftcopies(wctx, ctx, base):
    """reproduce copies between base and ctx in the wctx

    Unlike mergecopies(), this function will only consider copies between base
    and ctx; it will ignore copies between base and wctx. Also unlike
    mergecopies(), this function will apply copies to the working copy (instead
    of just returning information about the copies). That makes it cheaper
    (especially in the common case of base==ctx.p1()) and useful also when
    experimental.copytrace=off.

    merge.update() will have already marked most copies, but it will only
    mark copies if it thinks the source files are related (see
    merge._related()). It will also not mark copies if the file wasn't modified
    on the local side. This function adds the copies that were "missed"
    by merge.update().
    """
    new_copies = pathcopies(base, ctx)
    parent = wctx.p1()
    # Drop chained-copy artifacts (see _filter()'s case table).
    _filter(parent, wctx, new_copies)
    # Extra filtering to drop copy information for files that existed before
    # the graft. This is to handle the case of grafting a rename onto a commit
    # that already has the rename. Otherwise the presence of copy information
    # would result in the creation of an empty commit where we would prefer to
    # not create one.
    for dest, __ in list(new_copies.items()):
        if dest in parent:
            del new_copies[dest]
    for dst, src in pycompat.iteritems(new_copies):
        wctx[dst].markcopied(src)