1# linelog - efficient cache for annotate data 2# 3# Copyright 2018 Google LLC. 4# 5# This software may be used and distributed according to the terms of the 6# GNU General Public License version 2 or any later version. 7"""linelog is an efficient cache for annotate data inspired by SCCS Weaves. 8 9SCCS Weaves are an implementation of 10https://en.wikipedia.org/wiki/Interleaved_deltas. See 11mercurial/helptext/internals/linelog.txt for an exploration of SCCS weaves 12and how linelog works in detail. 13 14Here's a hacker's summary: a linelog is a program which is executed in 15the context of a revision. Executing the program emits information 16about lines, including the revision that introduced them and the line 17number in the file at the introducing revision. When an insertion or 18deletion is performed on the file, a jump instruction is used to patch 19in a new body of annotate information. 20""" 21from __future__ import absolute_import, print_function 22 23import abc 24import struct 25 26from .thirdparty import attr 27from . import pycompat 28 29_llentry = struct.Struct(b'>II') 30 31 32class LineLogError(Exception): 33 """Error raised when something bad happens internally in linelog.""" 34 35 36@attr.s 37class lineinfo(object): 38 # Introducing revision of this line. 39 rev = attr.ib() 40 # Line number for this line in its introducing revision. 41 linenum = attr.ib() 42 # Private. Offset in the linelog program of this line. Used internally. 43 _offset = attr.ib() 44 45 46@attr.s 47class annotateresult(object): 48 rev = attr.ib() 49 lines = attr.ib() 50 _eof = attr.ib() 51 52 def __iter__(self): 53 return iter(self.lines) 54 55 56class _llinstruction(object): # pytype: disable=ignored-metaclass 57 58 __metaclass__ = abc.ABCMeta 59 60 @abc.abstractmethod 61 def __init__(self, op1, op2): 62 pass 63 64 @abc.abstractmethod 65 def __str__(self): 66 pass 67 68 def __repr__(self): 69 return str(self) 70 71 @abc.abstractmethod 72 def __eq__(self, other): 73 pass 74 75 @abc.abstractmethod 76 def encode(self): 77 """Encode this instruction to the binary linelog format.""" 78 79 @abc.abstractmethod 80 def execute(self, rev, pc, emit): 81 """Execute this instruction. 82 83 Args: 84 rev: The revision we're annotating. 85 pc: The current offset in the linelog program. 86 emit: A function that accepts a single lineinfo object. 87 88 Returns: 89 The new value of pc. Returns None if exeuction should stop 90 (that is, we've found the end of the file.) 91 """ 92 93 94class _jge(_llinstruction): 95 """If the current rev is greater than or equal to op1, jump to op2.""" 96 97 def __init__(self, op1, op2): 98 self._cmprev = op1 99 self._target = op2 100 101 def __str__(self): 102 return 'JGE %d %d' % (self._cmprev, self._target) 103 104 def __eq__(self, other): 105 return ( 106 type(self) == type(other) 107 and self._cmprev == other._cmprev 108 and self._target == other._target 109 ) 110 111 def encode(self): 112 return _llentry.pack(self._cmprev << 2, self._target) 113 114 def execute(self, rev, pc, emit): 115 if rev >= self._cmprev: 116 return self._target 117 return pc + 1 118 119 120class _jump(_llinstruction): 121 """Unconditional jumps are expressed as a JGE with op1 set to 0.""" 122 123 def __init__(self, op1, op2): 124 if op1 != 0: 125 raise LineLogError(b"malformed JUMP, op1 must be 0, got %d" % op1) 126 self._target = op2 127 128 def __str__(self): 129 return 'JUMP %d' % (self._target) 130 131 def __eq__(self, other): 132 return type(self) == type(other) and self._target == other._target 133 134 def encode(self): 135 return _llentry.pack(0, self._target) 136 137 def execute(self, rev, pc, emit): 138 return self._target 139 140 141class _eof(_llinstruction): 142 """EOF is expressed as a JGE that always jumps to 0.""" 143 144 def __init__(self, op1, op2): 145 if op1 != 0: 146 raise LineLogError(b"malformed EOF, op1 must be 0, got %d" % op1) 147 if op2 != 0: 148 raise LineLogError(b"malformed EOF, op2 must be 0, got %d" % op2) 149 150 def __str__(self): 151 return r'EOF' 152 153 def __eq__(self, other): 154 return type(self) == type(other) 155 156 def encode(self): 157 return _llentry.pack(0, 0) 158 159 def execute(self, rev, pc, emit): 160 return None 161 162 163class _jl(_llinstruction): 164 """If the current rev is less than op1, jump to op2.""" 165 166 def __init__(self, op1, op2): 167 self._cmprev = op1 168 self._target = op2 169 170 def __str__(self): 171 return 'JL %d %d' % (self._cmprev, self._target) 172 173 def __eq__(self, other): 174 return ( 175 type(self) == type(other) 176 and self._cmprev == other._cmprev 177 and self._target == other._target 178 ) 179 180 def encode(self): 181 return _llentry.pack(1 | (self._cmprev << 2), self._target) 182 183 def execute(self, rev, pc, emit): 184 if rev < self._cmprev: 185 return self._target 186 return pc + 1 187 188 189class _line(_llinstruction): 190 """Emit a line.""" 191 192 def __init__(self, op1, op2): 193 # This line was introduced by this revision number. 194 self._rev = op1 195 # This line had the specified line number in the introducing revision. 196 self._origlineno = op2 197 198 def __str__(self): 199 return 'LINE %d %d' % (self._rev, self._origlineno) 200 201 def __eq__(self, other): 202 return ( 203 type(self) == type(other) 204 and self._rev == other._rev 205 and self._origlineno == other._origlineno 206 ) 207 208 def encode(self): 209 return _llentry.pack(2 | (self._rev << 2), self._origlineno) 210 211 def execute(self, rev, pc, emit): 212 emit(lineinfo(self._rev, self._origlineno, pc)) 213 return pc + 1 214 215 216def _decodeone(data, offset): 217 """Decode a single linelog instruction from an offset in a buffer.""" 218 try: 219 op1, op2 = _llentry.unpack_from(data, offset) 220 except struct.error as e: 221 raise LineLogError(b'reading an instruction failed: %r' % e) 222 opcode = op1 & 0b11 223 op1 = op1 >> 2 224 if opcode == 0: 225 if op1 == 0: 226 if op2 == 0: 227 return _eof(op1, op2) 228 return _jump(op1, op2) 229 return _jge(op1, op2) 230 elif opcode == 1: 231 return _jl(op1, op2) 232 elif opcode == 2: 233 return _line(op1, op2) 234 raise NotImplementedError(b'Unimplemented opcode %r' % opcode) 235 236 237class linelog(object): 238 """Efficient cache for per-line history information.""" 239 240 def __init__(self, program=None, maxrev=0): 241 if program is None: 242 # We pad the program with an extra leading EOF so that our 243 # offsets will match the C code exactly. This means we can 244 # interoperate with the C code. 245 program = [_eof(0, 0), _eof(0, 0)] 246 self._program = program 247 self._lastannotate = None 248 self._maxrev = maxrev 249 250 def __eq__(self, other): 251 return ( 252 type(self) == type(other) 253 and self._program == other._program 254 and self._maxrev == other._maxrev 255 ) 256 257 def __repr__(self): 258 return '<linelog at %s: maxrev=%d size=%d>' % ( 259 hex(id(self)), 260 self._maxrev, 261 len(self._program), 262 ) 263 264 def debugstr(self): 265 fmt = '%%%dd %%s' % len(str(len(self._program))) 266 return pycompat.sysstr(b'\n').join( 267 fmt % (idx, i) for idx, i in enumerate(self._program[1:], 1) 268 ) 269 270 @classmethod 271 def fromdata(cls, buf): 272 if len(buf) % _llentry.size != 0: 273 raise LineLogError( 274 b"invalid linelog buffer size %d (must be a multiple of %d)" 275 % (len(buf), _llentry.size) 276 ) 277 expected = len(buf) / _llentry.size 278 fakejge = _decodeone(buf, 0) 279 if isinstance(fakejge, _jump): 280 maxrev = 0 281 elif isinstance(fakejge, (_jge, _jl)): 282 maxrev = fakejge._cmprev 283 else: 284 raise LineLogError( 285 'Expected one of _jump, _jge, or _jl. Got %s.' 286 % type(fakejge).__name__ 287 ) 288 assert isinstance(fakejge, (_jump, _jge, _jl)) # help pytype 289 numentries = fakejge._target 290 if expected != numentries: 291 raise LineLogError( 292 b"corrupt linelog data: claimed" 293 b" %d entries but given data for %d entries" 294 % (expected, numentries) 295 ) 296 instructions = [_eof(0, 0)] 297 for offset in pycompat.xrange(1, numentries): 298 instructions.append(_decodeone(buf, offset * _llentry.size)) 299 return cls(instructions, maxrev=maxrev) 300 301 def encode(self): 302 hdr = _jge(self._maxrev, len(self._program)).encode() 303 return hdr + b''.join(i.encode() for i in self._program[1:]) 304 305 def clear(self): 306 self._program = [] 307 self._maxrev = 0 308 self._lastannotate = None 309 310 def replacelines_vec(self, rev, a1, a2, blines): 311 return self.replacelines( 312 rev, a1, a2, 0, len(blines), _internal_blines=blines 313 ) 314 315 def replacelines(self, rev, a1, a2, b1, b2, _internal_blines=None): 316 """Replace lines [a1, a2) with lines [b1, b2).""" 317 if self._lastannotate: 318 # TODO(augie): make replacelines() accept a revision at 319 # which we're editing as well as a revision to mark 320 # responsible for the edits. In hg-experimental it's 321 # stateful like this, so we're doing the same thing to 322 # retain compatibility with absorb until that's imported. 323 ar = self._lastannotate 324 else: 325 ar = self.annotate(rev) 326 # ar = self.annotate(self._maxrev) 327 if a1 > len(ar.lines): 328 raise LineLogError( 329 b'%d contains %d lines, tried to access line %d' 330 % (rev, len(ar.lines), a1) 331 ) 332 elif a1 == len(ar.lines): 333 # Simulated EOF instruction since we're at EOF, which 334 # doesn't have a "real" line. 335 a1inst = _eof(0, 0) 336 a1info = lineinfo(0, 0, ar._eof) 337 else: 338 a1info = ar.lines[a1] 339 a1inst = self._program[a1info._offset] 340 programlen = self._program.__len__ 341 oldproglen = programlen() 342 appendinst = self._program.append 343 344 # insert 345 blineinfos = [] 346 bappend = blineinfos.append 347 if b1 < b2: 348 # Determine the jump target for the JGE at the start of 349 # the new block. 350 tgt = oldproglen + (b2 - b1 + 1) 351 # Jump to skip the insert if we're at an older revision. 352 appendinst(_jl(rev, tgt)) 353 for linenum in pycompat.xrange(b1, b2): 354 if _internal_blines is None: 355 bappend(lineinfo(rev, linenum, programlen())) 356 appendinst(_line(rev, linenum)) 357 else: 358 newrev, newlinenum = _internal_blines[linenum] 359 bappend(lineinfo(newrev, newlinenum, programlen())) 360 appendinst(_line(newrev, newlinenum)) 361 # delete 362 if a1 < a2: 363 if a2 > len(ar.lines): 364 raise LineLogError( 365 b'%d contains %d lines, tried to access line %d' 366 % (rev, len(ar.lines), a2) 367 ) 368 elif a2 == len(ar.lines): 369 endaddr = ar._eof 370 else: 371 endaddr = ar.lines[a2]._offset 372 if a2 > 0 and rev < self._maxrev: 373 # If we're here, we're deleting a chunk of an old 374 # commit, so we need to be careful and not touch 375 # invisible lines between a2-1 and a2 (IOW, lines that 376 # are added later). 377 endaddr = ar.lines[a2 - 1]._offset + 1 378 appendinst(_jge(rev, endaddr)) 379 # copy instruction from a1 380 a1instpc = programlen() 381 appendinst(a1inst) 382 # if a1inst isn't a jump or EOF, then we need to add an unconditional 383 # jump back into the program here. 384 if not isinstance(a1inst, (_jump, _eof)): 385 appendinst(_jump(0, a1info._offset + 1)) 386 # Patch instruction at a1, which makes our patch live. 387 self._program[a1info._offset] = _jump(0, oldproglen) 388 389 # Update self._lastannotate in place. This serves as a cache to avoid 390 # expensive "self.annotate" in this function, when "replacelines" is 391 # used continuously. 392 if len(self._lastannotate.lines) > a1: 393 self._lastannotate.lines[a1]._offset = a1instpc 394 else: 395 assert isinstance(a1inst, _eof) 396 self._lastannotate._eof = a1instpc 397 self._lastannotate.lines[a1:a2] = blineinfos 398 self._lastannotate.rev = max(self._lastannotate.rev, rev) 399 400 if rev > self._maxrev: 401 self._maxrev = rev 402 403 def annotate(self, rev): 404 pc = 1 405 lines = [] 406 executed = 0 407 # Sanity check: if instructions executed exceeds len(program), we 408 # hit an infinite loop in the linelog program somehow and we 409 # should stop. 410 while pc is not None and executed < len(self._program): 411 inst = self._program[pc] 412 lastpc = pc 413 pc = inst.execute(rev, pc, lines.append) 414 executed += 1 415 if pc is not None: 416 raise LineLogError( 417 r'Probably hit an infinite loop in linelog. Program:\n' 418 + self.debugstr() 419 ) 420 ar = annotateresult(rev, lines, lastpc) 421 self._lastannotate = ar 422 return ar 423 424 @property 425 def maxrev(self): 426 return self._maxrev 427 428 # Stateful methods which depend on the value of the last 429 # annotation run. This API is for compatiblity with the original 430 # linelog, and we should probably consider refactoring it. 431 @property 432 def annotateresult(self): 433 """Return the last annotation result. C linelog code exposed this.""" 434 return [(l.rev, l.linenum) for l in self._lastannotate.lines] 435 436 def getoffset(self, line): 437 return self._lastannotate.lines[line]._offset 438 439 def getalllines(self, start=0, end=0): 440 """Get all lines that ever occurred in [start, end). 441 442 Passing start == end == 0 means "all lines ever". 443 444 This works in terms of *internal* program offsets, not line numbers. 445 """ 446 pc = start or 1 447 lines = [] 448 # only take as many steps as there are instructions in the 449 # program - if we don't find an EOF or our stop-line before 450 # then, something is badly broken. 451 for step in pycompat.xrange(len(self._program)): 452 inst = self._program[pc] 453 nextpc = pc + 1 454 if isinstance(inst, _jump): 455 nextpc = inst._target 456 elif isinstance(inst, _eof): 457 return lines 458 elif isinstance(inst, (_jl, _jge)): 459 pass 460 elif isinstance(inst, _line): 461 lines.append((inst._rev, inst._origlineno)) 462 else: 463 raise LineLogError(b"Illegal instruction %r" % inst) 464 if nextpc == end: 465 return lines 466 pc = nextpc 467 raise LineLogError(b"Failed to perform getalllines") 468