1# coding: utf-8
2# metadata.py -- code related to various metadata computation and access.
3#
4# Copyright 2019 Google, Inc <martinvonz@google.com>
5# Copyright 2020 Pierre-Yves David <pierre-yves.david@octobus.net>
6#
7# This software may be used and distributed according to the terms of the
8# GNU General Public License version 2 or any later version.
9from __future__ import absolute_import, print_function
10
11import multiprocessing
12import struct
13
14from .node import nullrev
15from . import (
16    error,
17    util,
18)
19
20from .revlogutils import (
21    flagutil as sidedataflag,
22    sidedata as sidedatamod,
23)
24
25
class ChangingFiles(object):
    """A class recording the changes made to files by a changeset

    Actions performed on files are gathered into 5 sets:

    - added:    files actively added in the changeset.
    - merged:   files whose history got merged
    - removed:  files removed in the revision
    - salvaged: files that might have been deleted by a merge but were not
    - touched:  files affected by the changeset

    and copies information is held by 2 mappings

    - copied_from_p1: {"<new-name>": "<source-name-in-p1>"} mapping for copies
    - copied_from_p2: {"<new-name>": "<source-name-in-p2>"} mapping for copies

    See their inline help for details.
    """

    def __init__(
        self,
        touched=None,
        added=None,
        removed=None,
        merged=None,
        salvaged=None,
        p1_copies=None,
        p2_copies=None,
    ):
        # Each argument may be any iterable of filenames (or any iterable of
        # (dest, source) pairs for the copies), or None for "empty".
        self._added = set(() if added is None else added)
        self._merged = set(() if merged is None else merged)
        self._removed = set(() if removed is None else removed)
        self._touched = set(() if touched is None else touched)
        self._salvaged = set(() if salvaged is None else salvaged)
        # added, merged and removed files are, by definition, also touched
        self._touched.update(self._added)
        self._touched.update(self._merged)
        self._touched.update(self._removed)
        self._p1_copies = dict(() if p1_copies is None else p1_copies)
        self._p2_copies = dict(() if p2_copies is None else p2_copies)

    def __eq__(self, other):
        # NOTE(review): comparing against an object without these attributes
        # raises AttributeError instead of returning NotImplemented.
        return (
            self.added == other.added
            and self.merged == other.merged
            and self.removed == other.removed
            and self.salvaged == other.salvaged
            and self.touched == other.touched
            and self.copied_from_p1 == other.copied_from_p1
            and self.copied_from_p2 == other.copied_from_p2
        )

    @property
    def has_copies_info(self):
        # True when this changeset carries information relevant to the
        # copy-tracing algorithms; plain `added`/`touched` entries alone do
        # not count.
        return bool(
            self.removed
            or self.merged
            or self.salvaged
            or self.copied_from_p1
            or self.copied_from_p2
        )

    @util.propertycache
    def added(self):
        """files actively added in the changeset

        Any file present in that revision that was absent in all the changeset's
        parents.

        In case of merge, this means a file absent in one of the parents but
        existing in the other will *not* be contained in this set. (They were
        added by an ancestor)
        """
        return frozenset(self._added)

    def mark_added(self, filename):
        # invalidate the cached `added` frozenset so it gets recomputed
        if 'added' in vars(self):
            del self.added
        self._added.add(filename)
        self.mark_touched(filename)

    def update_added(self, filenames):
        """mark every filename in `filenames` as added"""
        for f in filenames:
            self.mark_added(f)

    @util.propertycache
    def merged(self):
        """files actively merged during a merge

        Any modified files which had modification on both sides that needed merging.

        In this case a new filenode was created and it has two parents.
        """
        return frozenset(self._merged)

    def mark_merged(self, filename):
        # invalidate the cached `merged` frozenset so it gets recomputed
        if 'merged' in vars(self):
            del self.merged
        self._merged.add(filename)
        self.mark_touched(filename)

    def update_merged(self, filenames):
        """mark every filename in `filenames` as merged"""
        for f in filenames:
            self.mark_merged(f)

    @util.propertycache
    def removed(self):
        """files actively removed by the changeset

        In case of merge this will only contain the set of files removing "new"
        content. For any file absent in the current changeset:

        a) If the file exists in both parents, it is clearly "actively" removed
        by this changeset.

        b) If a file exists in only one parent and in none of the common
        ancestors, then the file was newly added in one of the merged branches
        and then got "actively" removed.

        c) If a file exists in only one parent and at least one of the common
        ancestors using the same filenode, then the file was unchanged on one
        side and deleted on the other side. The merge "passively" propagated
        that deletion, but didn't "actively" remove the file. In this case the
        file is *not* included in the `removed` set.

        d) If a file exists in only one parent and at least one of the common
        ancestors using a different filenode, then the file was changed on one
        side and removed on the other side. The merge process "actively"
        decided to drop the new change and delete the file. Unlike in the
        previous case, (c), the file is included in the `removed` set.

        Summary table for merge:

        case | exists in parents | exists in gca || removed
         (a) |       both        |     *         ||   yes
         (b) |       one         |     none      ||   yes
         (c) |       one         | same filenode ||   no
         (d) |       one         |  new filenode ||   yes
        """
        return frozenset(self._removed)

    def mark_removed(self, filename):
        # invalidate the cached `removed` frozenset so it gets recomputed
        if 'removed' in vars(self):
            del self.removed
        self._removed.add(filename)
        self.mark_touched(filename)

    def update_removed(self, filenames):
        """mark every filename in `filenames` as removed"""
        for f in filenames:
            self.mark_removed(f)

    @util.propertycache
    def salvaged(self):
        """files that might have been deleted by a merge, but still exist.

        During a merge, the manifest merging might select some files for
        removal, or for a removed/changed conflict. If at commit time the file
        still exists, its removal was "reverted" and the file is "salvaged"
        """
        return frozenset(self._salvaged)

    def mark_salvaged(self, filename):
        # invalidate the cached `salvaged` frozenset so it gets recomputed
        if "salvaged" in vars(self):
            del self.salvaged
        self._salvaged.add(filename)
        self.mark_touched(filename)

    def update_salvaged(self, filenames):
        """mark every filename in `filenames` as salvaged"""
        for f in filenames:
            self.mark_salvaged(f)

    @util.propertycache
    def touched(self):
        """files either actively modified, added or removed"""
        return frozenset(self._touched)

    def mark_touched(self, filename):
        # invalidate the cached `touched` frozenset so it gets recomputed
        if 'touched' in vars(self):
            del self.touched
        self._touched.add(filename)

    def update_touched(self, filenames):
        """mark every filename in `filenames` as touched"""
        for f in filenames:
            self.mark_touched(f)

    @util.propertycache
    def copied_from_p1(self):
        """{"<new-name>": "<source-name-in-p1>"} mapping of copies from p1"""
        return self._p1_copies.copy()

    def mark_copied_from_p1(self, source, dest):
        # invalidate the cached copy of the mapping so it gets recomputed
        if 'copied_from_p1' in vars(self):
            del self.copied_from_p1
        self._p1_copies[dest] = source

    def update_copies_from_p1(self, copies):
        """record every {dest: source} pair in `copies` as copied from p1"""
        for dest, source in copies.items():
            self.mark_copied_from_p1(source, dest)

    @util.propertycache
    def copied_from_p2(self):
        """{"<new-name>": "<source-name-in-p2>"} mapping of copies from p2"""
        return self._p2_copies.copy()

    def mark_copied_from_p2(self, source, dest):
        # invalidate the cached copy of the mapping so it gets recomputed
        if 'copied_from_p2' in vars(self):
            del self.copied_from_p2
        self._p2_copies[dest] = source

    def update_copies_from_p2(self, copies):
        """record every {dest: source} pair in `copies` as copied from p2"""
        for dest, source in copies.items():
            self.mark_copied_from_p2(source, dest)
235
236
def compute_all_files_changes(ctx):
    """compute the files changed by a revision"""
    p1 = ctx.p1()
    p2 = ctx.p2()
    p1_null = p1.rev() == nullrev
    p2_null = p2.rev() == nullrev

    if p1_null and p2_null:
        # no parent at all: a root changeset
        return _process_root(ctx)
    if not p1_null and p2_null:
        # the usual linear case
        return _process_linear(p1, ctx)
    if p1_null:
        # In the wild, one can encounter changesets where p1 is null but p2
        # is not; treat them as linear against p2.
        return _process_linear(p1, ctx, parent=2)
    if p1.rev() == p2.rev():
        # In the wild, one can encounter such "non-merges" with identical
        # parents; treat them as linear too.
        return _process_linear(p1, ctx)
    # a genuine merge
    return _process_merge(p1, p2, ctx)
253
254
def _process_root(ctx):
    """compute the appropriate changed files for a changeset with no parents"""
    # A root changeset has nothing before it, so every file it carries was
    # added by it.
    md = ChangingFiles()
    md.update_added(ctx.manifest())
    return md
263
264
def _process_linear(parent_ctx, children_ctx, parent=1):
    """compute the appropriate changed files for a changeset with a single parent"""
    md = ChangingFiles()
    old_manifest = parent_ctx.manifest()
    new_manifest = children_ctx.manifest()

    rename_candidates = []

    for filename, (old, new) in old_manifest.diff(new_manifest).items():
        if new[0] is None:
            # no filenode on the "new" side: the file was removed
            md.mark_removed(filename)
            continue
        # the file exists in the child, so it may carry rename information
        rename_candidates.append(filename)
        if old[0] is None:
            # no filenode on the "old" side: the file was added
            md.mark_added(filename)
        else:
            # a filenode on both sides: the content was modified
            md.mark_touched(filename)

    if parent == 1:
        record_copy = md.mark_copied_from_p1
    elif parent == 2:
        record_copy = md.mark_copied_from_p2
    else:
        assert False, "bad parent value %d" % parent

    for filename in rename_candidates:
        rename_info = children_ctx[filename].renamed()
        if rename_info:
            source, srcnode = rename_info
            record_copy(source, filename)

    return md
300
301
def _process_merge(p1_ctx, p2_ctx, ctx):
    """compute the appropriate changed files for a changeset with two parents

    This is a more advanced case. The information we need to record is
    summarised in the following table:

    ┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┐
    │ diff ╲  diff │       ø      │ (Some, None) │ (None, Some) │ (Some, Some) │
    │  p2   ╲  p1  │              │              │              │              │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │              │��  No Changes │��  No Changes │              │
    │  ø           │��  No Changes │      OR      │     OR       │��  No Changes │
    │              │              │��  Deleted[1] │��  Salvaged[2]│     [3]      │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │��  No Changes │              │              │              │
    │ (Some, None) │      OR      │��  Deleted    │       ø      │      ø       │
    │              │��  Deleted[1] │              │              │              │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │��  No Changes │              │              │   �� Touched  │
    │ (None, Some) │     OR       │      ø       │��   Added     │OR �� Salvaged │
    │              │��  Salvaged[2]│              │   (copied?)  │   (copied?)  │
    ├──────────────┼──────────────┼──────────────┼──────────────┼──────────────┤
    │              │              │              │   �� Touched  │   �� Merged   │
    │ (Some, Some) │��  No Changes │      ø       │OR �� Salvaged │OR �� Touched  │
    │              │     [3]      │              │   (copied?)  │   (copied?)  │
    └──────────────┴──────────────┴──────────────┴──────────────┴──────────────┘

    Special case [1]:

      The situation is:
        - parent-A:     file exists,
        - parent-B:     no file,
        - working-copy: no file.

      Detecting a "deletion" will depend on the presence of actual change on
      the "parent-A" branch:

      Subcase �� or �� : if the state of the file in "parent-A" is unchanged
      compared to the merge ancestors, then parent-A branch left the file
      untouched while parent-B deleted it. We simply apply the change from
      the "parent-B" branch; the file was automatically dropped.
      The result is:
          - file is not recorded as touched by the merge.

      Subcase �� or �� : otherwise, the changes from the parent-A branch were
      explicitly dropped and the file was "deleted again". From a user
      perspective, the message about "locally changed" while "remotely
      deleted" (or the other way around) was issued and the user chose to
      delete the file.
      The result:
          - file is recorded as touched by the merge.


    Special case [2]:

      The situation is:
        - parent-A:     no file,
        - parent-B:     file,
        - working-copy: file (same content as parent-B).

      There are three subcases depending on the ancestors contents:

      - A) the file is missing in all ancestors,
      - B) at least one ancestor has the file with filenode ≠ from parent-B,
      - C) all ancestors use the same filenode as parent-B,

      Subcase (A) is the simpler, nothing happened on parent-A side while
      parent-B added it.

        The result:
            - the file is not marked as touched by the merge.

      Subcase (B) is the counter part of "Special case [1]", the file was
        modified on parent-B side, while parent-A side deleted it. However this
        time, the conflict was solved by keeping the file (and its
        modification). We consider the file as "salvaged".

        The result:
            - the file is marked as "salvaged" by the merge.

      Subcase (C) is a subtle variation of the case above. In this case, the
        file is unchanged on the parent-B side and actively removed on the
        parent-A side. So the merge machinery correctly decides it should be
        removed. However, the file was explicitly restored to its parent-B
        content before the merge was committed. The file is marked
        as salvaged too. From the merge result perspective, this is similar to
        Subcase (B), however from the merge resolution perspective they differ
        since in (C), there was some conflict without an obvious solution to
        the merge (That got reversed)

    Special case [3]:

      The situation is:
        - parent-A:     file,
        - parent-B:     file (different filenode as parent-A),
        - working-copy: file (same filenode as parent-B).

      This case is in theory much simpler: for this to happen, it means the
      filenode in parent-A is purely replacing the one in parent-B (either a
      descendant, or a full new file history, see changeset). So the merge
      introduces no changes, and the file is not affected by the merge...

      However, in the wild it is possible to find commits where the above is
      not true. For example some repositories have commits where the *new*
      node is an ancestor of the node in parent-A, or where parent-A and
      parent-B are two branches of the same file history, yet no
      merge-filenode was created (while the "merge" should have led to a
      "modification").

      Detecting such cases (and not recording the file as modified) would be a
      nice bonus. However we do not do any of this yet.
    """

    repo = ctx.repo()
    md = ChangingFiles()

    m = ctx.manifest()
    p1m = p1_ctx.manifest()
    p2m = p2_ctx.manifest()
    diff_p1 = p1m.diff(m)
    diff_p2 = p2m.diff(m)

    # manifests of the "greatest common ancestors" of the two parents
    # NOTE(review): `repo` from above could be reused here.
    cahs = ctx.repo().changelog.commonancestorsheads(
        p1_ctx.node(), p2_ctx.node()
    )
    if not cahs:
        cahs = [nullrev]
    mas = [ctx.repo()[r].manifest() for r in cahs]

    copy_candidates = []

    # Dealing with case �� happens automatically.  Since there are no entries
    # in diff_p1 nor diff_p2, we won't iterate on it ever.

    # Iteration over diff_p1 content will deal with all cases, but the ones in
    # the first column of the table.
    for filename, d1 in diff_p1.items():

        d2 = diff_p2.pop(filename, None)

        if d2 is None:
            # this deal with the first line of the table.
            _process_other_unchanged(md, mas, filename, d1)
        else:

            if d1[0][0] is None and d2[0][0] is None:
                # case �� — both sides added the file.
                md.mark_added(filename)
                copy_candidates.append(filename)
            elif d1[1][0] is None and d2[1][0] is None:
                # case �� — both sides deleted the file.
                md.mark_removed(filename)
            elif d1[1][0] is not None and d2[1][0] is not None:
                if d1[0][0] is None or d2[0][0] is None:
                    if any(_find(ma, filename) is not None for ma in mas):
                        # case �� or ��
                        md.mark_salvaged(filename)
                    else:
                        # case �� �� : touched
                        md.mark_touched(filename)
                else:
                    fctx = repo.filectx(filename, fileid=d1[1][0])
                    if fctx.p2().rev() == nullrev:
                        # case ��
                        # lets assume we can trust the file history. If the
                        # filenode is not a merge, the file was not merged.
                        md.mark_touched(filename)
                    else:
                        # case ��
                        md.mark_merged(filename)
                copy_candidates.append(filename)
            else:
                # Impossible case, the post-merge file status cannot be None on
                # one side and Something on the other side.
                assert False, "unreachable"

    # Iteration over remaining diff_p2 content deals with the first column of
    # the table.
    for filename, d2 in diff_p2.items():
        _process_other_unchanged(md, mas, filename, d2)

    # resolve copy information for every file the merge result touched
    for filename in copy_candidates:
        copy_info = ctx[filename].renamed()
        if copy_info:
            source, srcnode = copy_info
            # only record the copy if the stated source matches a parent
            if source in p1_ctx and p1_ctx[source].filenode() == srcnode:
                md.mark_copied_from_p1(source, filename)
            elif source in p2_ctx and p2_ctx[source].filenode() == srcnode:
                md.mark_copied_from_p2(source, filename)
    return md
490
491
492def _find(manifest, filename):
493    """return the associate filenode or None"""
494    if filename not in manifest:
495        return None
496    return manifest.find(filename)[0]
497
498
def _process_other_unchanged(md, mas, filename, diff):
    """record the status of a file changed on only one side of a merge

    `diff` is the manifest diff entry for `filename` on the side that changed
    it; `mas` are the manifests of the merge ancestors.
    """
    old_node = diff[0][0]
    new_node = diff[1][0]

    if old_node is not None and new_node is None:
        # The file disappeared from the merge result.  This is an "active"
        # removal only if some ancestor disagrees with the changing side.
        if any(_find(ma, filename) != old_node for ma in mas):
            md.mark_removed(filename)
        # otherwise, no change needs to be recorded
    elif old_node is None and new_node is not None:
        # The file only exists on one side.  If any ancestor knew about it,
        # its deletion was reverted and the file was salvaged.
        if any(_find(ma, filename) is not None for ma in mas):
            md.mark_salvaged(filename)
        # otherwise, this is a simple merge without intervention
    elif old_node is not None and new_node is not None:
        # Simple merge without intervention.
        #
        # In the buggy case where old_node is not an ancestor of new_node,
        # there should have been a new filenode created, recording this as
        # "modified". We do not deal with them yet.
        pass
    else:
        # The diff algorithm should not return an entry if the file is
        # missing on both sides.
        assert False, "unreachable"
524
525
def _missing_from_all_ancestors(mas, filename):
    """True if no manifest in `mas` contains `filename`"""
    for ma in mas:
        if _find(ma, filename) is not None:
            return False
    return True
528
529
def computechangesetfilesadded(ctx):
    """return the list of files added in a changeset"""
    # a file is "added" when no parent of the changeset contains it
    parents = ctx.parents()
    return [f for f in ctx.files() if not any(f in p for p in parents)]
537
538
def get_removal_filter(ctx, x=None):
    """return a function to detect files "wrongly" detected as `removed`

    When a file is removed relative to p1 in a merge, this
    function determines whether the absence is due to a
    deletion from a parent, or whether the merge commit
    itself deletes the file. We decide this by doing a
    simplified three way merge of the manifest entry for
    the file. There are two ways we decide the merge
    itself didn't delete a file:
    - neither parent (nor the merge) contain the file
    - exactly one parent contains the file, and that
      parent has the same filelog entry as the merge
      ancestor (or all of them if there are two). In other
      words, that parent left the file unchanged while the
      other one deleted it.
    One way to think about this is that deleting a file is
    similar to emptying it, so the list of changed files
    should be similar either way. The computation
    described above is not done directly in _filecommit
    when creating the list of changed files, however
    it does something very similar by comparing filelog
    nodes.
    """

    # `x` optionally pre-supplies (p1, p2, m1, m2) to avoid recomputation
    if x is not None:
        p1, p2, m1, m2 = x
    else:
        p1 = ctx.p1()
        p2 = ctx.p2()
        m1 = p1.manifest()
        m2 = p2.manifest()

    # cachefunc: the merge-ancestor manifests are computed lazily, at most
    # once, and only if `deletionfromparent` actually needs them
    @util.cachefunc
    def mas():
        p1n = p1.node()
        p2n = p2.node()
        cahs = ctx.repo().changelog.commonancestorsheads(p1n, p2n)
        if not cahs:
            cahs = [nullrev]
        return [ctx.repo()[r].manifest() for r in cahs]

    def deletionfromparent(f):
        # True when the absence of `f` comes from a parent, i.e. the merge
        # itself did not "actively" delete it (see function docstring)
        if f in m1:
            return f not in m2 and all(
                f in ma and ma.find(f) == m1.find(f) for ma in mas()
            )
        elif f in m2:
            return all(f in ma and ma.find(f) == m2.find(f) for ma in mas())
        else:
            # absent from both parents and the merge: not a merge deletion
            return True

    return deletionfromparent
592
593
def computechangesetfilesremoved(ctx):
    """return the list of files removed in a changeset"""
    # files listed as changed but absent from the changeset are candidates
    removed = [f for f in ctx.files() if f not in ctx]
    if removed:
        # drop removals that were merely inherited from a parent
        inherited = get_removal_filter(ctx)
        removed = [f for f in removed if not inherited(f)]
    return removed
604
605
def computechangesetfilesmerged(ctx):
    """return the list of files merged in a changeset"""
    merged = []
    # only a changeset with two parents can merge files
    if len(ctx.parents()) < 2:
        return merged
    for f in ctx.files():
        if f not in ctx:
            continue
        fctx = ctx[f]
        # a filenode with a second parent marks an actual file-level merge
        filenode_parents = fctx._filelog.parents(fctx._filenode)
        if filenode_parents[1] != ctx.repo().nullid:
            merged.append(f)
    return merged
618
619
def computechangesetcopies(ctx):
    """return the copies data for a changeset

    The copies data are returned as a pair of dictionnary (p1copies, p2copies).

    Each dictionnary are in the form: `{newname: oldname}`
    """
    p1copies = {}
    p2copies = {}
    p1 = ctx.p1()
    p2 = ctx.p2()
    narrowmatch = ctx._repo.narrowmatch()
    for dst in ctx.files():
        if not narrowmatch(dst):
            continue
        if dst not in ctx:
            continue
        rename_info = ctx[dst].renamed()
        if not rename_info:
            continue
        src, srcnode = rename_info
        # attribute the copy to whichever parent actually holds the source
        if src in p1 and p1[src].filenode() == srcnode:
            p1copies[dst] = src
        elif src in p2 and p2[src].filenode() == srcnode:
            p2copies[dst] = src
    return p1copies, p2copies
644
645
def encodecopies(files, copies):
    """encode a {dest: source} copy mapping against a list of filenames

    Each entry becomes `<index-of-dest>\\0<source>`; entries are joined with
    newlines. Raises ProgrammingError if some copy destination is missing
    from `files`.
    """
    items = [
        b'%d\0%s' % (index, copies[dst])
        for index, dst in enumerate(files)
        if dst in copies
    ]
    if len(items) != len(copies):
        raise error.ProgrammingError(
            b'some copy targets missing from file list'
        )
    return b"\n".join(items)
656
657
def decodecopies(files, data):
    """decode a copy mapping produced by `encodecopies`

    Returns a {dest: source} dict, or None when the data cannot be parsed.
    """
    copies = {}
    if not data:
        return copies
    try:
        for line in data.split(b'\n'):
            strindex, src = line.split(b'\0')
            copies[files[int(strindex)]] = src
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "p1copies") and
        # used different syntax for the value.
        return None
    return copies
673
674
def encodefileindices(files, subset):
    """encode `subset` as newline-separated decimal indices into `files`"""
    members = set(subset)
    return b'\n'.join(
        b'%d' % index for index, f in enumerate(files) if f in members
    )
682
683
def decodefileindices(files, data):
    """decode a file subset produced by `encodefileindices`

    Returns the list of selected filenames, or None when the data cannot be
    parsed or an index is out of range.
    """
    subset = []
    if not data:
        return subset
    try:
        for strindex in data.split(b'\n'):
            index = int(strindex)
            if not (0 <= index < len(files)):
                return None
            subset.append(files[index])
    except (ValueError, IndexError):
        # Perhaps someone had chosen the same key name (e.g. "added") and
        # used different syntax for the value.
        return None
    return subset
699
700
# see mercurial/helptext/internals/revlogs.txt for details about the format

# The three high bits of the flag byte encode the action performed on the
# file; the masked value is compared for equality against the *_FLAG values.
ACTION_MASK = int("111" "00", 2)
# note: untouched file used as copy source will have `000` for this mask.
ADDED_FLAG = int("001" "00", 2)
MERGED_FLAG = int("010" "00", 2)
REMOVED_FLAG = int("011" "00", 2)
SALVAGED_FLAG = int("100" "00", 2)
TOUCHED_FLAG = int("101" "00", 2)

# The two low bits record which parent (if any) the file was copied from.
COPIED_MASK = int("11", 2)
COPIED_FROM_P1_FLAG = int("10", 2)
COPIED_FROM_P2_FLAG = int("11", 2)

# structure is <flag><filename-end><copy-source>
INDEX_HEADER = struct.Struct(">L")
INDEX_ENTRY = struct.Struct(">bLL")
718
719
def encode_files_sidedata(files):
    """encode a `ChangingFiles` instance as a files sidedata mapping

    The blob is `<header><index entries><concatenated filenames>`; each index
    entry records a flag byte, the end offset of its filename and the index
    of its copy source (0 when there is none, disambiguated by the flag).
    """
    # copy sources may be otherwise-untouched files, so index them too
    names = set(files.touched)
    names.update(files.copied_from_p1.values())
    names.update(files.copied_from_p2.values())
    names = sorted(names)
    index_of = {name: idx for (idx, name) in enumerate(names)}
    # files without a copy source point at index 0; the COPIED_* flag bits
    # tell the decoder whether that index is meaningful
    index_of[None] = 0

    pieces = [INDEX_HEADER.pack(len(names))]

    name_end = 0
    for name in names:
        # cumulative end offset of this filename in the concatenated area
        name_end += len(name)
        flag = 0
        if name in files.added:
            flag |= ADDED_FLAG
        elif name in files.merged:
            flag |= MERGED_FLAG
        elif name in files.removed:
            flag |= REMOVED_FLAG
        elif name in files.salvaged:
            flag |= SALVAGED_FLAG
        elif name in files.touched:
            flag |= TOUCHED_FLAG

        source = None
        if name in files.copied_from_p1:
            flag |= COPIED_FROM_P1_FLAG
            source = files.copied_from_p1.get(name)
        elif name in files.copied_from_p2:
            flag |= COPIED_FROM_P2_FLAG
            source = files.copied_from_p2.get(name)
        pieces.append(INDEX_ENTRY.pack(flag, name_end, index_of[source]))
    pieces.extend(names)
    return {sidedatamod.SD_FILES: b''.join(pieces)}
757
758
def decode_files_sidedata(sidedata):
    """rebuild a `ChangingFiles` from a files sidedata mapping

    Returns an empty `ChangingFiles` when the sidedata carries no files
    entry. Inverse of `encode_files_sidedata`.
    """
    md = ChangingFiles()
    raw = sidedata.get(sidedatamod.SD_FILES)

    if raw is None:
        return md

    copies = []
    all_files = []

    assert len(raw) >= INDEX_HEADER.size
    total_files = INDEX_HEADER.unpack_from(raw, 0)[0]

    offset = INDEX_HEADER.size
    # the concatenated filenames start right after the fixed-size index
    file_offset_base = offset + (INDEX_ENTRY.size * total_files)
    file_offset_last = file_offset_base

    assert len(raw) >= file_offset_base

    for idx in range(total_files):
        flag, file_end, copy_idx = INDEX_ENTRY.unpack_from(raw, offset)
        # `file_end` is stored relative to the start of the filename area
        file_end += file_offset_base
        filename = raw[file_offset_last:file_end]
        filesize = file_end - file_offset_last
        assert len(filename) == filesize
        offset += INDEX_ENTRY.size
        file_offset_last = file_end
        all_files.append(filename)
        if flag & ACTION_MASK == ADDED_FLAG:
            md.mark_added(filename)
        elif flag & ACTION_MASK == MERGED_FLAG:
            md.mark_merged(filename)
        elif flag & ACTION_MASK == REMOVED_FLAG:
            md.mark_removed(filename)
        elif flag & ACTION_MASK == SALVAGED_FLAG:
            md.mark_salvaged(filename)
        elif flag & ACTION_MASK == TOUCHED_FLAG:
            md.mark_touched(filename)

        copied = None
        if flag & COPIED_MASK == COPIED_FROM_P1_FLAG:
            copied = md.mark_copied_from_p1
        elif flag & COPIED_MASK == COPIED_FROM_P2_FLAG:
            copied = md.mark_copied_from_p2

        if copied is not None:
            # copy sources are stored as indexes; they can only be resolved
            # once every filename has been read, so defer them
            copies.append((copied, filename, copy_idx))

    for copied, filename, copy_idx in copies:
        copied(all_files[copy_idx], filename)

    return md
811
812
def _getsidedata(srcrepo, rev):
    """return (<files-sidedata-map>, <has-copies-info>) for `rev` of `srcrepo`"""
    changes = compute_all_files_changes(srcrepo[rev])
    return encode_files_sidedata(changes), changes.has_copies_info
817
818
def copies_sidedata_computer(repo, revlog, rev, existing_sidedata):
    """sidedata computer recomputing the files sidedata for `rev`

    Returns the sidedata map and the (flags-to-add, flags-to-remove) pair.
    """
    sidedata, has_copies_info = _getsidedata(repo, rev)
    if has_copies_info:
        flags_to_add = sidedataflag.REVIDX_HASCOPIESINFO
    else:
        flags_to_add = 0
    return sidedata, (flags_to_add, 0)
823
824
def _sidedata_worker(srcrepo, revs_queue, sidedata_queue, tokens):
    """The function used by workers precomputing sidedata

    It reads an input queue containing revision numbers.
    It writes to an output queue containing (rev, <sidedata-map>) pairs.

    The `None` input value is used as a stop signal.

    The `tokens` semaphore is used to avoid having too many unprocessed
    entries. A worker needs to acquire one token before fetching a task.
    They will be released by the consumer of the produced data.
    """
    while True:
        tokens.acquire()
        rev = revs_queue.get()
        if rev is None:
            # processing of `None` is completed, release the token.
            tokens.release()
            return
        sidedata_queue.put((rev, _getsidedata(srcrepo, rev)))
846
847
# number of precomputed results each worker may have pending at once; used to
# size the token semaphore in `_get_worker_sidedata_adder`
BUFF_PER_WORKER = 50
849
850
def _get_worker_sidedata_adder(srcrepo, destrepo):
    """The parallel version of the sidedata computation

    This code spawns a pool of workers that precompute a buffer of sidedata
    before we actually need them"""
    # avoid circular import copies -> scmutil -> worker -> copies
    from . import worker

    nbworkers = worker._numworkers(srcrepo.ui)

    # bound the number of precomputed-but-unconsumed results
    tokens = multiprocessing.BoundedSemaphore(nbworkers * BUFF_PER_WORKER)
    revsq = multiprocessing.Queue()
    sidedataq = multiprocessing.Queue()

    assert srcrepo.filtername is None
    # queue all tasks beforehand, revision numbers are small and it makes
    # synchronisation simpler
    #
    # Since the computation for each node can be quite expensive, the overhead
    # of using a single queue is not relevant. In practice, most computations
    # are fast but some are very expensive and dominate all the other smaller
    # costs.
    for r in srcrepo.changelog.revs():
        revsq.put(r)
    # queue the "no more tasks" markers, one per worker
    for i in range(nbworkers):
        revsq.put(None)

    allworkers = []
    for i in range(nbworkers):
        args = (srcrepo, revsq, sidedataq, tokens)
        w = multiprocessing.Process(target=_sidedata_worker, args=args)
        allworkers.append(w)
        w.start()

    # dictionary to store results for revisions other than the one we are
    # looking for. For example, if we need the sidedata map for 42, and 43 is
    # received, we shelve 43 for later use.
    staging = {}

    def sidedata_companion(repo, revlog, rev, old_sidedata):
        # Is the data previously shelved ?
        data = staging.pop(rev, None)
        if data is None:
            # look at the queued results until we find the one we are looking
            # for (shelve the other ones)
            r, data = sidedataq.get()
            while r != rev:
                staging[r] = data
                r, data = sidedataq.get()
        # let a worker fetch its next task
        tokens.release()
        sidedata, has_copies_info = data
        new_flag = 0
        if has_copies_info:
            new_flag = sidedataflag.REVIDX_HASCOPIESINFO
        return sidedata, (new_flag, 0)

    return sidedata_companion
909