1# pvec.py - probabilistic vector clocks for Mercurial
2#
3# Copyright 2012 Olivia Mackall <olivia@selenic.com>
4#
5# This software may be used and distributed according to the terms of the
6# GNU General Public License version 2 or any later version.
7
8'''
9A "pvec" is a changeset property based on the theory of vector clocks
10that can be compared to discover relatedness without consulting a
11graph. This can be useful for tasks like determining how a
12disconnected patch relates to a repository.
13
14Currently a pvec consist of 448 bits, of which 24 are 'depth' and the
15remainder are a bit vector. It is represented as a 70-character base85
16string.
17
18Construction:
19
20- a root changeset has a depth of 0 and a bit vector based on its hash
21- a normal commit has a changeset where depth is increased by one and
22  one bit vector bit is flipped based on its hash
23- a merge changeset pvec is constructed by copying changes from one pvec into
24  the other to balance its depth
25
26Properties:
27
28- for linear changes, difference in depth is always <= hamming distance
29- otherwise, changes are probably divergent
30- when hamming distance is < 200, we can reliably detect when pvecs are near
31
32Issues:
33
34- hamming distance ceases to work over distances of ~ 200
35- detecting divergence is less accurate when the common ancestor is very close
36  to either revision or total distance is high
37- this could probably be improved by modeling the relation between
38  delta and hdist
39
40Uses:
41
42- a patch pvec can be used to locate the nearest available common ancestor for
43  resolving conflicts
44- ordering of patches can be established without a DAG
45- two head pvecs can be compared to determine whether push/pull/merge is needed
46  and approximately how many changesets are involved
47- can be used to find a heuristic divergence measure between changesets on
48  different branches
49'''
50
51from __future__ import absolute_import
52
53from .node import nullrev
54from . import (
55    pycompat,
56    util,
57)
58
59_size = 448  # 70 chars b85-encoded
60_bytes = _size // 8
61_depthbits = 24
62_depthbytes = _depthbits // 8
63_vecbytes = _bytes - _depthbytes
64_vecbits = _vecbytes * 8
65_radius = (_vecbits - 30) // 2  # high probability vectors are related
66
67
68def _bin(bs):
69    '''convert a bytestring to a long'''
70    v = 0
71    for b in bs:
72        v = v * 256 + ord(b)
73    return v
74
75
76def _str(v, l):
77    # type: (int, int) -> bytes
78    bs = b""
79    for p in pycompat.xrange(l):
80        bs = pycompat.bytechr(v & 255) + bs
81        v >>= 8
82    return bs
83
84
85def _split(b):
86    '''depth and bitvec'''
87    return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])
88
89
90def _join(depth, bitvec):
91    return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)
92
93
94def _hweight(x):
95    c = 0
96    while x:
97        if x & 1:
98            c += 1
99        x >>= 1
100    return c
101
102
103_htab = [_hweight(x) for x in pycompat.xrange(256)]
104
105
106def _hamming(a, b):
107    '''find the hamming distance between two longs'''
108    d = a ^ b
109    c = 0
110    while d:
111        c += _htab[d & 0xFF]
112        d >>= 8
113    return c
114
115
116def _mergevec(x, y, c):
117    # Ideally, this function would be x ^ y ^ ancestor, but finding
118    # ancestors is a nuisance. So instead we find the minimal number
119    # of changes to balance the depth and hamming distance
120
121    d1, v1 = x
122    d2, v2 = y
123    if d1 < d2:
124        d1, d2, v1, v2 = d2, d1, v2, v1
125
126    hdist = _hamming(v1, v2)
127    ddist = d1 - d2
128    v = v1
129    m = v1 ^ v2  # mask of different bits
130    i = 1
131
132    if hdist > ddist:
133        # if delta = 10 and hdist = 100, then we need to go up 55 steps
134        # to the ancestor and down 45
135        changes = (hdist - ddist + 1) // 2
136    else:
137        # must make at least one change
138        changes = 1
139    depth = d1 + changes
140
141    # copy changes from v2
142    if m:
143        while changes:
144            if m & i:
145                v ^= i
146                changes -= 1
147            i <<= 1
148    else:
149        v = _flipbit(v, c)
150
151    return depth, v
152
153
154def _flipbit(v, node):
155    # converting bit strings to longs is slow
156    bit = (hash(node) & 0xFFFFFFFF) % _vecbits
157    return v ^ (1 << bit)
158
159
160def ctxpvec(ctx):
161    '''construct a pvec for ctx while filling in the cache'''
162    r = ctx.repo()
163    if not util.safehasattr(r, "_pveccache"):
164        r._pveccache = {}
165    pvc = r._pveccache
166    if ctx.rev() not in pvc:
167        cl = r.changelog
168        for n in pycompat.xrange(ctx.rev() + 1):
169            if n not in pvc:
170                node = cl.node(n)
171                p1, p2 = cl.parentrevs(n)
172                if p1 == nullrev:
173                    # start with a 'random' vector at root
174                    pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
175                elif p2 == nullrev:
176                    d, v = pvc[p1]
177                    pvc[n] = (d + 1, _flipbit(v, node))
178                else:
179                    pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
180    bs = _join(*pvc[ctx.rev()])
181    return pvec(util.b85encode(bs))
182
183
184class pvec(object):
185    def __init__(self, hashorctx):
186        if isinstance(hashorctx, bytes):
187            self._bs = hashorctx
188            self._depth, self._vec = _split(util.b85decode(hashorctx))
189        else:
190            self._vec = ctxpvec(hashorctx)
191
192    def __str__(self):
193        return self._bs
194
195    def __eq__(self, b):
196        return self._vec == b._vec and self._depth == b._depth
197
198    def __lt__(self, b):
199        delta = b._depth - self._depth
200        if delta < 0:
201            return False  # always correct
202        if _hamming(self._vec, b._vec) > delta:
203            return False
204        return True
205
206    def __gt__(self, b):
207        return b < self
208
209    def __or__(self, b):
210        delta = abs(b._depth - self._depth)
211        if _hamming(self._vec, b._vec) <= delta:
212            return False
213        return True
214
215    def __sub__(self, b):
216        if self | b:
217            raise ValueError(b"concurrent pvecs")
218        return self._depth - b._depth
219
220    def distance(self, b):
221        d = abs(b._depth - self._depth)
222        h = _hamming(self._vec, b._vec)
223        return max(d, h)
224
225    def near(self, b):
226        dist = abs(b.depth - self._depth)
227        if dist > _radius or _hamming(self._vec, b._vec) > _radius:
228            return False
229