1# linelog - efficient cache for annotate data
2#
3# Copyright 2018 Google LLC.
4#
5# This software may be used and distributed according to the terms of the
6# GNU General Public License version 2 or any later version.
7"""linelog is an efficient cache for annotate data inspired by SCCS Weaves.
8
9SCCS Weaves are an implementation of
10https://en.wikipedia.org/wiki/Interleaved_deltas. See
11mercurial/helptext/internals/linelog.txt for an exploration of SCCS weaves
12and how linelog works in detail.
13
14Here's a hacker's summary: a linelog is a program which is executed in
15the context of a revision. Executing the program emits information
16about lines, including the revision that introduced them and the line
17number in the file at the introducing revision. When an insertion or
18deletion is performed on the file, a jump instruction is used to patch
19in a new body of annotate information.
20"""
21from __future__ import absolute_import, print_function
22
23import abc
24import struct
25
26from .thirdparty import attr
27from . import pycompat
28
29_llentry = struct.Struct(b'>II')
30
31
32class LineLogError(Exception):
33    """Error raised when something bad happens internally in linelog."""
34
35
36@attr.s
37class lineinfo(object):
38    # Introducing revision of this line.
39    rev = attr.ib()
40    # Line number for this line in its introducing revision.
41    linenum = attr.ib()
42    # Private. Offset in the linelog program of this line. Used internally.
43    _offset = attr.ib()
44
45
46@attr.s
47class annotateresult(object):
48    rev = attr.ib()
49    lines = attr.ib()
50    _eof = attr.ib()
51
52    def __iter__(self):
53        return iter(self.lines)
54
55
56class _llinstruction(object):  # pytype: disable=ignored-metaclass
57
58    __metaclass__ = abc.ABCMeta
59
60    @abc.abstractmethod
61    def __init__(self, op1, op2):
62        pass
63
64    @abc.abstractmethod
65    def __str__(self):
66        pass
67
68    def __repr__(self):
69        return str(self)
70
71    @abc.abstractmethod
72    def __eq__(self, other):
73        pass
74
75    @abc.abstractmethod
76    def encode(self):
77        """Encode this instruction to the binary linelog format."""
78
79    @abc.abstractmethod
80    def execute(self, rev, pc, emit):
81        """Execute this instruction.
82
83        Args:
84          rev: The revision we're annotating.
85          pc: The current offset in the linelog program.
86          emit: A function that accepts a single lineinfo object.
87
88        Returns:
89          The new value of pc. Returns None if exeuction should stop
90          (that is, we've found the end of the file.)
91        """
92
93
94class _jge(_llinstruction):
95    """If the current rev is greater than or equal to op1, jump to op2."""
96
97    def __init__(self, op1, op2):
98        self._cmprev = op1
99        self._target = op2
100
101    def __str__(self):
102        return 'JGE %d %d' % (self._cmprev, self._target)
103
104    def __eq__(self, other):
105        return (
106            type(self) == type(other)
107            and self._cmprev == other._cmprev
108            and self._target == other._target
109        )
110
111    def encode(self):
112        return _llentry.pack(self._cmprev << 2, self._target)
113
114    def execute(self, rev, pc, emit):
115        if rev >= self._cmprev:
116            return self._target
117        return pc + 1
118
119
120class _jump(_llinstruction):
121    """Unconditional jumps are expressed as a JGE with op1 set to 0."""
122
123    def __init__(self, op1, op2):
124        if op1 != 0:
125            raise LineLogError(b"malformed JUMP, op1 must be 0, got %d" % op1)
126        self._target = op2
127
128    def __str__(self):
129        return 'JUMP %d' % (self._target)
130
131    def __eq__(self, other):
132        return type(self) == type(other) and self._target == other._target
133
134    def encode(self):
135        return _llentry.pack(0, self._target)
136
137    def execute(self, rev, pc, emit):
138        return self._target
139
140
141class _eof(_llinstruction):
142    """EOF is expressed as a JGE that always jumps to 0."""
143
144    def __init__(self, op1, op2):
145        if op1 != 0:
146            raise LineLogError(b"malformed EOF, op1 must be 0, got %d" % op1)
147        if op2 != 0:
148            raise LineLogError(b"malformed EOF, op2 must be 0, got %d" % op2)
149
150    def __str__(self):
151        return r'EOF'
152
153    def __eq__(self, other):
154        return type(self) == type(other)
155
156    def encode(self):
157        return _llentry.pack(0, 0)
158
159    def execute(self, rev, pc, emit):
160        return None
161
162
163class _jl(_llinstruction):
164    """If the current rev is less than op1, jump to op2."""
165
166    def __init__(self, op1, op2):
167        self._cmprev = op1
168        self._target = op2
169
170    def __str__(self):
171        return 'JL %d %d' % (self._cmprev, self._target)
172
173    def __eq__(self, other):
174        return (
175            type(self) == type(other)
176            and self._cmprev == other._cmprev
177            and self._target == other._target
178        )
179
180    def encode(self):
181        return _llentry.pack(1 | (self._cmprev << 2), self._target)
182
183    def execute(self, rev, pc, emit):
184        if rev < self._cmprev:
185            return self._target
186        return pc + 1
187
188
189class _line(_llinstruction):
190    """Emit a line."""
191
192    def __init__(self, op1, op2):
193        # This line was introduced by this revision number.
194        self._rev = op1
195        # This line had the specified line number in the introducing revision.
196        self._origlineno = op2
197
198    def __str__(self):
199        return 'LINE %d %d' % (self._rev, self._origlineno)
200
201    def __eq__(self, other):
202        return (
203            type(self) == type(other)
204            and self._rev == other._rev
205            and self._origlineno == other._origlineno
206        )
207
208    def encode(self):
209        return _llentry.pack(2 | (self._rev << 2), self._origlineno)
210
211    def execute(self, rev, pc, emit):
212        emit(lineinfo(self._rev, self._origlineno, pc))
213        return pc + 1
214
215
216def _decodeone(data, offset):
217    """Decode a single linelog instruction from an offset in a buffer."""
218    try:
219        op1, op2 = _llentry.unpack_from(data, offset)
220    except struct.error as e:
221        raise LineLogError(b'reading an instruction failed: %r' % e)
222    opcode = op1 & 0b11
223    op1 = op1 >> 2
224    if opcode == 0:
225        if op1 == 0:
226            if op2 == 0:
227                return _eof(op1, op2)
228            return _jump(op1, op2)
229        return _jge(op1, op2)
230    elif opcode == 1:
231        return _jl(op1, op2)
232    elif opcode == 2:
233        return _line(op1, op2)
234    raise NotImplementedError(b'Unimplemented opcode %r' % opcode)
235
236
237class linelog(object):
238    """Efficient cache for per-line history information."""
239
240    def __init__(self, program=None, maxrev=0):
241        if program is None:
242            # We pad the program with an extra leading EOF so that our
243            # offsets will match the C code exactly. This means we can
244            # interoperate with the C code.
245            program = [_eof(0, 0), _eof(0, 0)]
246        self._program = program
247        self._lastannotate = None
248        self._maxrev = maxrev
249
250    def __eq__(self, other):
251        return (
252            type(self) == type(other)
253            and self._program == other._program
254            and self._maxrev == other._maxrev
255        )
256
257    def __repr__(self):
258        return '<linelog at %s: maxrev=%d size=%d>' % (
259            hex(id(self)),
260            self._maxrev,
261            len(self._program),
262        )
263
264    def debugstr(self):
265        fmt = '%%%dd %%s' % len(str(len(self._program)))
266        return pycompat.sysstr(b'\n').join(
267            fmt % (idx, i) for idx, i in enumerate(self._program[1:], 1)
268        )
269
270    @classmethod
271    def fromdata(cls, buf):
272        if len(buf) % _llentry.size != 0:
273            raise LineLogError(
274                b"invalid linelog buffer size %d (must be a multiple of %d)"
275                % (len(buf), _llentry.size)
276            )
277        expected = len(buf) / _llentry.size
278        fakejge = _decodeone(buf, 0)
279        if isinstance(fakejge, _jump):
280            maxrev = 0
281        elif isinstance(fakejge, (_jge, _jl)):
282            maxrev = fakejge._cmprev
283        else:
284            raise LineLogError(
285                'Expected one of _jump, _jge, or _jl. Got %s.'
286                % type(fakejge).__name__
287            )
288        assert isinstance(fakejge, (_jump, _jge, _jl))  # help pytype
289        numentries = fakejge._target
290        if expected != numentries:
291            raise LineLogError(
292                b"corrupt linelog data: claimed"
293                b" %d entries but given data for %d entries"
294                % (expected, numentries)
295            )
296        instructions = [_eof(0, 0)]
297        for offset in pycompat.xrange(1, numentries):
298            instructions.append(_decodeone(buf, offset * _llentry.size))
299        return cls(instructions, maxrev=maxrev)
300
301    def encode(self):
302        hdr = _jge(self._maxrev, len(self._program)).encode()
303        return hdr + b''.join(i.encode() for i in self._program[1:])
304
305    def clear(self):
306        self._program = []
307        self._maxrev = 0
308        self._lastannotate = None
309
310    def replacelines_vec(self, rev, a1, a2, blines):
311        return self.replacelines(
312            rev, a1, a2, 0, len(blines), _internal_blines=blines
313        )
314
315    def replacelines(self, rev, a1, a2, b1, b2, _internal_blines=None):
316        """Replace lines [a1, a2) with lines [b1, b2)."""
317        if self._lastannotate:
318            # TODO(augie): make replacelines() accept a revision at
319            # which we're editing as well as a revision to mark
320            # responsible for the edits. In hg-experimental it's
321            # stateful like this, so we're doing the same thing to
322            # retain compatibility with absorb until that's imported.
323            ar = self._lastannotate
324        else:
325            ar = self.annotate(rev)
326            #        ar = self.annotate(self._maxrev)
327        if a1 > len(ar.lines):
328            raise LineLogError(
329                b'%d contains %d lines, tried to access line %d'
330                % (rev, len(ar.lines), a1)
331            )
332        elif a1 == len(ar.lines):
333            # Simulated EOF instruction since we're at EOF, which
334            # doesn't have a "real" line.
335            a1inst = _eof(0, 0)
336            a1info = lineinfo(0, 0, ar._eof)
337        else:
338            a1info = ar.lines[a1]
339            a1inst = self._program[a1info._offset]
340        programlen = self._program.__len__
341        oldproglen = programlen()
342        appendinst = self._program.append
343
344        # insert
345        blineinfos = []
346        bappend = blineinfos.append
347        if b1 < b2:
348            # Determine the jump target for the JGE at the start of
349            # the new block.
350            tgt = oldproglen + (b2 - b1 + 1)
351            # Jump to skip the insert if we're at an older revision.
352            appendinst(_jl(rev, tgt))
353            for linenum in pycompat.xrange(b1, b2):
354                if _internal_blines is None:
355                    bappend(lineinfo(rev, linenum, programlen()))
356                    appendinst(_line(rev, linenum))
357                else:
358                    newrev, newlinenum = _internal_blines[linenum]
359                    bappend(lineinfo(newrev, newlinenum, programlen()))
360                    appendinst(_line(newrev, newlinenum))
361        # delete
362        if a1 < a2:
363            if a2 > len(ar.lines):
364                raise LineLogError(
365                    b'%d contains %d lines, tried to access line %d'
366                    % (rev, len(ar.lines), a2)
367                )
368            elif a2 == len(ar.lines):
369                endaddr = ar._eof
370            else:
371                endaddr = ar.lines[a2]._offset
372            if a2 > 0 and rev < self._maxrev:
373                # If we're here, we're deleting a chunk of an old
374                # commit, so we need to be careful and not touch
375                # invisible lines between a2-1 and a2 (IOW, lines that
376                # are added later).
377                endaddr = ar.lines[a2 - 1]._offset + 1
378            appendinst(_jge(rev, endaddr))
379        # copy instruction from a1
380        a1instpc = programlen()
381        appendinst(a1inst)
382        # if a1inst isn't a jump or EOF, then we need to add an unconditional
383        # jump back into the program here.
384        if not isinstance(a1inst, (_jump, _eof)):
385            appendinst(_jump(0, a1info._offset + 1))
386        # Patch instruction at a1, which makes our patch live.
387        self._program[a1info._offset] = _jump(0, oldproglen)
388
389        # Update self._lastannotate in place. This serves as a cache to avoid
390        # expensive "self.annotate" in this function, when "replacelines" is
391        # used continuously.
392        if len(self._lastannotate.lines) > a1:
393            self._lastannotate.lines[a1]._offset = a1instpc
394        else:
395            assert isinstance(a1inst, _eof)
396            self._lastannotate._eof = a1instpc
397        self._lastannotate.lines[a1:a2] = blineinfos
398        self._lastannotate.rev = max(self._lastannotate.rev, rev)
399
400        if rev > self._maxrev:
401            self._maxrev = rev
402
403    def annotate(self, rev):
404        pc = 1
405        lines = []
406        executed = 0
407        # Sanity check: if instructions executed exceeds len(program), we
408        # hit an infinite loop in the linelog program somehow and we
409        # should stop.
410        while pc is not None and executed < len(self._program):
411            inst = self._program[pc]
412            lastpc = pc
413            pc = inst.execute(rev, pc, lines.append)
414            executed += 1
415        if pc is not None:
416            raise LineLogError(
417                r'Probably hit an infinite loop in linelog. Program:\n'
418                + self.debugstr()
419            )
420        ar = annotateresult(rev, lines, lastpc)
421        self._lastannotate = ar
422        return ar
423
424    @property
425    def maxrev(self):
426        return self._maxrev
427
428    # Stateful methods which depend on the value of the last
429    # annotation run. This API is for compatiblity with the original
430    # linelog, and we should probably consider refactoring it.
431    @property
432    def annotateresult(self):
433        """Return the last annotation result. C linelog code exposed this."""
434        return [(l.rev, l.linenum) for l in self._lastannotate.lines]
435
436    def getoffset(self, line):
437        return self._lastannotate.lines[line]._offset
438
439    def getalllines(self, start=0, end=0):
440        """Get all lines that ever occurred in [start, end).
441
442        Passing start == end == 0 means "all lines ever".
443
444        This works in terms of *internal* program offsets, not line numbers.
445        """
446        pc = start or 1
447        lines = []
448        # only take as many steps as there are instructions in the
449        # program - if we don't find an EOF or our stop-line before
450        # then, something is badly broken.
451        for step in pycompat.xrange(len(self._program)):
452            inst = self._program[pc]
453            nextpc = pc + 1
454            if isinstance(inst, _jump):
455                nextpc = inst._target
456            elif isinstance(inst, _eof):
457                return lines
458            elif isinstance(inst, (_jl, _jge)):
459                pass
460            elif isinstance(inst, _line):
461                lines.append((inst._rev, inst._origlineno))
462            else:
463                raise LineLogError(b"Illegal instruction %r" % inst)
464            if nextpc == end:
465                return lines
466            pc = nextpc
467        raise LineLogError(b"Failed to perform getalllines")
468