# pvec.py - probabilistic vector clocks for Mercurial
#
# Copyright 2012 Olivia Mackall <olivia@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

'''
A "pvec" is a changeset property based on the theory of vector clocks
that can be compared to discover relatedness without consulting a
graph. This can be useful for tasks like determining how a
disconnected patch relates to a repository.

Currently a pvec consists of 448 bits, of which 24 are 'depth' and the
remainder are a bit vector. It is represented as a 70-character base85
string.

Construction:

- a root changeset has a depth of 0 and a bit vector based on its hash
- a normal commit has a pvec where the depth is increased by one and
  one bit vector bit is flipped based on its hash
- a merge changeset pvec is constructed by copying changes from one pvec into
  the other to balance its depth

Properties:

- for linear changes, the difference in depth is always <= the hamming distance
- otherwise, changes are probably divergent
- when the hamming distance is < 200, we can reliably detect when pvecs are
  near

Issues:

- hamming distance ceases to work over distances of ~ 200
- detecting divergence is less accurate when the common ancestor is very close
  to either revision or the total distance is high
- this could probably be improved by modeling the relation between
  delta and hdist

Uses:

- a patch pvec can be used to locate the nearest available common ancestor for
  resolving conflicts
- ordering of patches can be established without a DAG
- two head pvecs can be compared to determine whether push/pull/merge is needed
  and approximately how many changesets are involved
- can be used to find a heuristic divergence measure between changesets on
  different branches
'''

from __future__ import absolute_import

from .node import nullrev
from . import (
    pycompat,
    util,
)

_size = 448  # 70 chars b85-encoded
_bytes = _size // 8
_depthbits = 24
_depthbytes = _depthbits // 8
_vecbytes = _bytes - _depthbytes
_vecbits = _vecbytes * 8
_radius = (_vecbits - 30) // 2  # high probability vectors are related


def _bin(bs):
    # type: (bytes) -> int
    '''convert a big-endian bytestring to a long'''
    v = 0
    # iterate via bytearray: yields ints on both Python 2 and 3
    # (iterating a py3 bytes object yields ints, so ord() would raise
    # TypeError)
    for b in bytearray(bs):
        v = v * 256 + b
    return v


def _str(v, l):
    # type: (int, int) -> bytes
    '''convert a long to a big-endian bytestring of length l'''
    bs = b""
    for _ in pycompat.xrange(l):
        bs = pycompat.bytechr(v & 255) + bs
        v >>= 8
    return bs


def _split(b):
    # type: (bytes) -> (int, int)
    '''split a pvec bytestring into its (depth, bitvec) components'''
    return _bin(b[:_depthbytes]), _bin(b[_depthbytes:])


def _join(depth, bitvec):
    # type: (int, int) -> bytes
    '''pack a (depth, bitvec) pair back into a pvec bytestring'''
    return _str(depth, _depthbytes) + _str(bitvec, _vecbytes)


def _hweight(x):
    # type: (int) -> int
    '''count the set bits (Hamming weight) of x'''
    c = 0
    while x:
        if x & 1:
            c += 1
        x >>= 1
    return c


# per-byte popcount table so _hamming can work a byte at a time
_htab = [_hweight(x) for x in pycompat.xrange(256)]


def _hamming(a, b):
    # type: (int, int) -> int
    '''find the hamming distance between two longs'''
    d = a ^ b
    c = 0
    while d:
        c += _htab[d & 0xFF]
        d >>= 8
    return c


def _mergevec(x, y, c):
    '''merge two (depth, vec) pvec tuples for a merge changeset with hash c

    Ideally, this function would be x ^ y ^ ancestor, but finding
    ancestors is a nuisance. So instead we find the minimal number
    of changes to balance the depth and hamming distance.
    '''
    d1, v1 = x
    d2, v2 = y
    if d1 < d2:
        # make x the deeper side
        d1, d2, v1, v2 = d2, d1, v2, v1

    hdist = _hamming(v1, v2)
    ddist = d1 - d2
    v = v1
    m = v1 ^ v2  # mask of different bits
    i = 1

    if hdist > ddist:
        # if delta = 10 and hdist = 100, then we need to go up 55 steps
        # to the ancestor and down 45
        changes = (hdist - ddist + 1) // 2
    else:
        # must make at least one change
        changes = 1
    depth = d1 + changes

    # copy changes from v2
    if m:
        while changes:
            if m & i:
                v ^= i
                changes -= 1
            i <<= 1
    else:
        # vectors are identical: flip a bit derived from the merge's hash
        v = _flipbit(v, c)

    return depth, v


def _flipbit(v, node):
    # type: (int, bytes) -> int
    '''flip one bit of vector v, chosen by hashing node'''
    # converting bit strings to longs is slow
    bit = (hash(node) & 0xFFFFFFFF) % _vecbits
    return v ^ (1 << bit)


def ctxpvec(ctx):
    '''construct a pvec for ctx while filling in the cache'''
    r = ctx.repo()
    if not util.safehasattr(r, "_pveccache"):
        r._pveccache = {}
    pvc = r._pveccache
    if ctx.rev() not in pvc:
        cl = r.changelog
        # fill the cache bottom-up so every parent is computed first
        for n in pycompat.xrange(ctx.rev() + 1):
            if n not in pvc:
                node = cl.node(n)
                p1, p2 = cl.parentrevs(n)
                if p1 == nullrev:
                    # start with a 'random' vector at root
                    pvc[n] = (0, _bin((node * 3)[:_vecbytes]))
                elif p2 == nullrev:
                    d, v = pvc[p1]
                    pvc[n] = (d + 1, _flipbit(v, node))
                else:
                    pvc[n] = _mergevec(pvc[p1], pvc[p2], node)
    bs = _join(*pvc[ctx.rev()])
    return pvec(util.b85encode(bs))


class pvec(object):
    '''a pvec: a (depth, bitvec) pair decoded from/encoded as base85 bytes

    Constructed either from a base85-encoded bytestring or from a
    changectx (in which case the pvec is computed via ctxpvec).
    '''

    def __init__(self, hashorctx):
        if isinstance(hashorctx, bytes):
            self._bs = hashorctx
            self._depth, self._vec = _split(util.b85decode(hashorctx))
        else:
            # ctxpvec() returns a fully constructed pvec; copy its fields.
            # (Previously this stored the pvec object itself in _vec and
            # left _bs/_depth unset, breaking every comparison method.)
            other = ctxpvec(hashorctx)
            self._bs = other._bs
            self._depth = other._depth
            self._vec = other._vec

    def __str__(self):
        # NOTE(review): _bs is bytes; on Python 3 __str__ must return str,
        # so this looks py2-only — confirm before relying on it
        return self._bs

    def __eq__(self, b):
        return self._vec == b._vec and self._depth == b._depth

    def __lt__(self, b):
        '''true if self is probably an ancestor of b'''
        delta = b._depth - self._depth
        if delta < 0:
            return False  # always correct
        if _hamming(self._vec, b._vec) > delta:
            return False
        return True

    def __gt__(self, b):
        return b < self

    def __or__(self, b):
        '''true if self and b are probably divergent (concurrent)'''
        delta = abs(b._depth - self._depth)
        if _hamming(self._vec, b._vec) <= delta:
            return False
        return True

    def __sub__(self, b):
        '''depth difference; raises ValueError for divergent pvecs'''
        if self | b:
            raise ValueError(b"concurrent pvecs")
        return self._depth - b._depth

    def distance(self, b):
        '''a conservative estimate of the number of changesets between
        self and b'''
        d = abs(b._depth - self._depth)
        h = _hamming(self._vec, b._vec)
        return max(d, h)

    def near(self, b):
        '''true if self and b are probably within _radius changes of
        each other'''
        # b._depth, not the nonexistent b.depth (would raise
        # AttributeError)
        dist = abs(b._depth - self._depth)
        if dist > _radius or _hamming(self._vec, b._vec) > _radius:
            return False
        # previously fell off the end (returning None) in the near case
        return True