# storageutil.py - Storage functionality agnostic of backend implementation.
#
# Copyright 2018 Gregory Szorc <gregory.szorc@gmail.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import re
import struct

from ..i18n import _
from ..node import (
    bin,
    nullrev,
    sha1nodeconstants,
)
from .. import (
    dagop,
    error,
    mdiff,
    pycompat,
)
from ..interfaces import repository
from ..revlogutils import sidedata as sidedatamod
from ..utils import hashutil

_nullhash = hashutil.sha1(sha1nodeconstants.nullid)

# revision data contains extra metadata not part of the official digest
# Only used in changegroup >= v4.
CG_FLAG_SIDEDATA = 1


def hashrevisionsha1(text, p1, p2):
    """Compute the SHA-1 for revision data and its parents.

    This hash combines both the current file contents and its history
    in a manner that makes it easy to distinguish nodes with the same
    content in the revision graph.
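
    For non-null parents this is roughly equivalent to the following
    sketch (``p1`` and ``p2`` are 20-byte binary nodes; the names ``lo``
    and ``hi`` are illustrative)::

        lo, hi = (p1, p2) if p1 < p2 else (p2, p1)
        digest = hashutil.sha1(lo + hi + text).digest()

    When ``p2`` is null, the digest collapses to
    ``sha1(nullid + p1 + text)``, which the fast path below computes by
    copying a precomputed hash of ``nullid``.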
42    """
    # As of now, if one of the parent nodes is null, it is p2
    if p2 == sha1nodeconstants.nullid:
        # deep copy of a hash is faster than creating one
        s = _nullhash.copy()
        s.update(p1)
    else:
        # none of the parent nodes are nullid
        if p1 < p2:
            a = p1
            b = p2
        else:
            a = p2
            b = p1
        s = hashutil.sha1(a)
        s.update(b)
    s.update(text)
    return s.digest()


METADATA_RE = re.compile(b'\x01\n')


def parsemeta(text):
    """Parse metadata header from revision data.

    Returns a 2-tuple of (metadata, offset), where both can be None if there
    is no metadata.
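
    A minimal sketch of the expected round trip with ``packmeta`` (values
    are illustrative)::

        text = packmeta({b'copy': b'foo'}, b'data')
        meta, offset = parsemeta(text)
        # meta == {b'copy': b'foo'} and text[offset:] == b'data'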
70    """
71    # text can be buffer, so we can't use .startswith or .index
72    if text[:2] != b'\x01\n':
73        return None, None
74    s = METADATA_RE.search(text, 2).start()
75    mtext = text[2:s]
76    meta = {}
77    for l in mtext.splitlines():
78        k, v = l.split(b': ', 1)
79        meta[k] = v
80    return meta, s + 2
81
82
83def packmeta(meta, text):
84    """Add metadata to fulltext to produce revision text."""
85    keys = sorted(meta)
86    metatext = b''.join(b'%s: %s\n' % (k, meta[k]) for k in keys)
87    return b'\x01\n%s\x01\n%s' % (metatext, text)
88
89
90def iscensoredtext(text):
91    meta = parsemeta(text)[0]
92    return meta and b'censored' in meta
93
94
95def filtermetadata(text):
96    """Extract just the revision data from source text.
97
    Returns ``text`` unless it has a metadata header, in which case we return
    a new buffer without the metadata.
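
    For example (inputs are illustrative and built with ``packmeta`` to keep
    the marker bytes out of this docstring)::

        filtermetadata(packmeta({b'copy': b'foo'}, b'data'))  # -> b'data'
        filtermetadata(b'data')  # no metadata header -> returned unchanged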
100    """
101    if not text.startswith(b'\x01\n'):
102        return text
103
104    offset = text.index(b'\x01\n', 2)
105    return text[offset + 2 :]
106
107
108def filerevisioncopied(store, node):
109    """Resolve file revision copy metadata.
110
111    Returns ``False`` if the file has no copy metadata. Otherwise a
112    2-tuple of the source filename and node.
113    """
114    if store.parents(node)[0] != sha1nodeconstants.nullid:
115        return False
116
117    meta = parsemeta(store.revision(node))[0]
118
119    # copy and copyrev occur in pairs. In rare cases due to old bugs,
120    # one can occur without the other. So ensure both are present to flag
121    # as a copy.
122    if meta and b'copy' in meta and b'copyrev' in meta:
123        return meta[b'copy'], bin(meta[b'copyrev'])
124
125    return False
126
127
128def filedataequivalent(store, node, filedata):
129    """Determines whether file data is equivalent to a stored node.
130
131    Returns True if the passed file data would hash to the same value
132    as a stored revision and False otherwise.
133
134    When a stored revision is censored, filedata must be empty to have
135    equivalence.
136
137    When a stored revision has copy metadata, it is ignored as part
138    of the compare.
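
    A sketch of the equivalence check (mirroring the code below): data that
    itself begins with the metadata marker is first escaped with an empty
    metadata header, then hashed against the stored parents::

        revisiontext = packmeta({}, filedata)   # prepends an empty header
        hashrevisionsha1(revisiontext, *store.parents(node)) == node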
139    """
140
141    if filedata.startswith(b'\x01\n'):
142        revisiontext = b'\x01\n\x01\n' + filedata
143    else:
144        revisiontext = filedata
145
146    p1, p2 = store.parents(node)
147
148    computednode = hashrevisionsha1(revisiontext, p1, p2)
149
150    if computednode == node:
151        return True
152
153    # Censored files compare against the empty file.
154    if store.iscensored(store.rev(node)):
155        return filedata == b''
156
157    # Renaming a file produces a different hash, even if the data
158    # remains unchanged. Check if that's the case.
159    if store.renamed(node):
160        return store.read(node) == filedata
161
162    return False
163
164
165def iterrevs(storelen, start=0, stop=None):
166    """Iterate over revision numbers in a store."""
167    step = 1
168
169    if stop is not None:
170        if start > stop:
171            step = -1
172        stop += step
173        if stop > storelen:
174            stop = storelen
175    else:
176        stop = storelen
177
178    return pycompat.xrange(start, stop, step)
179
180
181def fileidlookup(store, fileid, identifier):
182    """Resolve the file node for a value.
183
184    ``store`` is an object implementing the ``ifileindex`` interface.
185
186    ``fileid`` can be:
187
188    * A 20 or 32 byte binary node.
189    * An integer revision number
190    * A 40 or 64 byte hex node.
    * Bytes that can be parsed as an integer representing a revision number.

    ``identifier`` is used to populate ``error.LookupError`` with an identifier
    for the store.

    Raises ``error.LookupError`` on failure.
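
    For example (illustrative; assumes the store has a revision 0, and
    ``binnode``/``hexnode`` stand for binary and hex node values)::

        fileidlookup(store, 0, b'path/to/file')        # binary node of rev 0
        fileidlookup(store, b'0', b'path/to/file')     # same, parsed from bytes
        fileidlookup(store, binnode, b'path/to/file')  # returns binnode as-is
        fileidlookup(store, hexnode, b'path/to/file')  # hex -> binary node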
197    """
198    if isinstance(fileid, int):
199        try:
200            return store.node(fileid)
201        except IndexError:
202            raise error.LookupError(
203                b'%d' % fileid, identifier, _(b'no match found')
204            )
205
206    if len(fileid) in (20, 32):
207        try:
208            store.rev(fileid)
209            return fileid
210        except error.LookupError:
211            pass
212
213    if len(fileid) in (40, 64):
214        try:
215            rawnode = bin(fileid)
216            store.rev(rawnode)
217            return rawnode
218        except TypeError:
219            pass
220
221    try:
222        rev = int(fileid)
223
224        if b'%d' % rev != fileid:
225            raise ValueError
226
227        try:
228            return store.node(rev)
229        except (IndexError, TypeError):
230            pass
231    except (ValueError, OverflowError):
232        pass
233
234    raise error.LookupError(fileid, identifier, _(b'no match found'))
235
236
237def resolvestripinfo(minlinkrev, tiprev, headrevs, linkrevfn, parentrevsfn):
238    """Resolve information needed to strip revisions.
239
240    Finds the minimum revision number that must be stripped in order to
241    strip ``minlinkrev``.
242
243    Returns a 2-tuple of the minimum revision number to do that and a set
244    of all revision numbers that have linkrevs that would be broken
245    by that strip.
246
247    ``tiprev`` is the current tip-most revision. It is ``len(store) - 1``.
248    ``headrevs`` is an iterable of head revisions.
249    ``linkrevfn`` is a callable that receives a revision and returns a linked
250    revision.
251    ``parentrevsfn`` is a callable that receives a revision number and returns
252    an iterable of its parent revision numbers.
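
    A sketch of a typical call for a revlog-like store (illustrative; the
    callables simply defer to the store)::

        strippoint, brokenrevs = resolvestripinfo(
            minlink,
            len(store) - 1,
            store.headrevs(),
            store.linkrev,
            store.parentrevs,
        )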
253    """
254    brokenrevs = set()
255    strippoint = tiprev + 1
256
257    heads = {}
258    futurelargelinkrevs = set()
259    for head in headrevs:
260        headlinkrev = linkrevfn(head)
261        heads[head] = headlinkrev
262        if headlinkrev >= minlinkrev:
263            futurelargelinkrevs.add(headlinkrev)
264
265    # This algorithm involves walking down the rev graph, starting at the
266    # heads. Since the revs are topologically sorted according to linkrev,
267    # once all head linkrevs are below the minlink, we know there are
268    # no more revs that could have a linkrev greater than minlink.
269    # So we can stop walking.
270    while futurelargelinkrevs:
271        strippoint -= 1
272        linkrev = heads.pop(strippoint)
273
274        if linkrev < minlinkrev:
275            brokenrevs.add(strippoint)
276        else:
277            futurelargelinkrevs.remove(linkrev)
278
279        for p in parentrevsfn(strippoint):
280            if p != nullrev:
281                plinkrev = linkrevfn(p)
282                heads[p] = plinkrev
283                if plinkrev >= minlinkrev:
284                    futurelargelinkrevs.add(plinkrev)
285
286    return strippoint, brokenrevs
287
288
289def emitrevisions(
290    store,
291    nodes,
292    nodesorder,
293    resultcls,
294    deltaparentfn=None,
295    candeltafn=None,
296    rawsizefn=None,
297    revdifffn=None,
298    flagsfn=None,
299    deltamode=repository.CG_DELTAMODE_STD,
300    revisiondata=False,
301    assumehaveparentrevisions=False,
302    sidedata_helpers=None,
303):
304    """Generic implementation of ifiledata.emitrevisions().
305
306    Emitting revision data is subtly complex. This function attempts to
307    encapsulate all the logic for doing so in a backend-agnostic way.
308
309    ``store``
310       Object conforming to ``ifilestorage`` interface.
311
312    ``nodes``
313       List of revision nodes whose data to emit.
314
315    ``resultcls``
316       A type implementing the ``irevisiondelta`` interface that will be
317       constructed and returned.
318
319    ``deltaparentfn`` (optional)
320       Callable receiving a revision number and returning the revision number
321       of a revision that the internal delta is stored against. This delta
322       will be preferred over computing a new arbitrary delta.
323
324       If not defined, a delta will always be computed from raw revision
325       data.
326
327    ``candeltafn`` (optional)
328       Callable receiving a pair of revision numbers that returns a bool
329       indicating whether a delta between them can be produced.
330
331       If not defined, it is assumed that any two revisions can delta with
332       each other.
333
334    ``rawsizefn`` (optional)
       Callable receiving a revision number and returning the length of
       ``store.rawdata(rev)``.

       If not defined, ``len(store.rawdata(rev))`` will be called.

    ``revdifffn`` (optional)
       Callable receiving a pair of revision numbers that returns a delta
       between them.

       If not defined, a delta will be computed by invoking mdiff code
       on ``store.revision()`` results.

       Defining this function allows a precomputed or stored delta to be
       used without having to compute one.

    ``flagsfn`` (optional)
       Callable receiving a revision number and returning the integer flags
       value for it. If not defined, the flags value will be 0.

    ``deltamode``
       Constraint on the delta to be sent:
       * CG_DELTAMODE_STD  - normal mode, try to reuse storage deltas,
       * CG_DELTAMODE_PREV - only delta against "prev",
       * CG_DELTAMODE_P1   - only delta against p1,
       * CG_DELTAMODE_FULL - only issue full snapshots.

       ``CG_DELTAMODE_FULL`` sends fulltext revisions instead of deltas,
       if allowed.

    ``nodesorder``
       Ordering of the emitted revisions: ``b'nodes'`` (the order of
       ``nodes``), ``b'linear'``, or storage order (the default).

    ``revisiondata``
       Whether to emit revision data (a fulltext or delta) with each result.

    ``assumehaveparentrevisions``
       Whether the receiver can be assumed to already have parent revisions,
       making them acceptable delta bases.
    ``sidedata_helpers`` (optional)
        If not None, means that sidedata should be included.
        See ``revlogutils.sidedata.get_sidedata_helpers``.
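
    A minimal sketch of how a caller might drive this function
    (illustrative; ``resultcls`` is the backend's ``irevisiondelta``
    implementation and ``process`` stands for the caller's handling)::

        for rdelta in emitrevisions(
            store, nodes, b'storage', resultcls, revisiondata=True
        ):
            # each result carries node, parents, delta base, and either a
            # fulltext (``revision``) or a delta against ``basenode``
            process(rdelta)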
368    """
369
370    fnode = store.node
371    frev = store.rev
372
373    if nodesorder == b'nodes':
374        revs = [frev(n) for n in nodes]
375    elif nodesorder == b'linear':
376        revs = {frev(n) for n in nodes}
377        revs = dagop.linearize(revs, store.parentrevs)
378    else:  # storage and default
379        revs = sorted(frev(n) for n in nodes)
380
381    prevrev = None
382
383    if deltamode == repository.CG_DELTAMODE_PREV or assumehaveparentrevisions:
384        prevrev = store.parentrevs(revs[0])[0]
385
386    # Set of revs available to delta against.
387    available = set()
388
389    for rev in revs:
390        if rev == nullrev:
391            continue
392
393        node = fnode(rev)
394        p1rev, p2rev = store.parentrevs(rev)
395
396        if deltaparentfn:
397            deltaparentrev = deltaparentfn(rev)
398        else:
399            deltaparentrev = nullrev
400
401        # Forced delta against previous mode.
402        if deltamode == repository.CG_DELTAMODE_PREV:
403            baserev = prevrev
404
405        # We're instructed to send fulltext. Honor that.
406        elif deltamode == repository.CG_DELTAMODE_FULL:
407            baserev = nullrev
408        # We're instructed to use p1. Honor that
409        elif deltamode == repository.CG_DELTAMODE_P1:
410            baserev = p1rev
411
412        # There is a delta in storage. We try to use that because it
413        # amounts to effectively copying data from storage and is
414        # therefore the fastest.
415        elif deltaparentrev != nullrev:
416            # Base revision was already emitted in this group. We can
417            # always safely use the delta.
418            if deltaparentrev in available:
419                baserev = deltaparentrev
420
421            # Base revision is a parent that hasn't been emitted already.
422            # Use it if we can assume the receiver has the parent revision.
423            elif assumehaveparentrevisions and deltaparentrev in (p1rev, p2rev):
424                baserev = deltaparentrev
425
426            # No guarantee the receiver has the delta parent. Send delta
427            # against last revision (if possible), which in the common case
428            # should be similar enough to this revision that the delta is
429            # reasonable.
430            elif prevrev is not None:
431                baserev = prevrev
432            else:
433                baserev = nullrev
434
435        # Storage has a fulltext revision.
436
437        # Let's use the previous revision, which is as good a guess as any.
438        # There is definitely room to improve this logic.
439        elif prevrev is not None:
440            baserev = prevrev
441        else:
442            baserev = nullrev
443
444        # But we can't actually use our chosen delta base for whatever
445        # reason. Reset to fulltext.
446        if baserev != nullrev and (candeltafn and not candeltafn(baserev, rev)):
447            baserev = nullrev

        revision = None
        delta = None
        baserevisionsize = None

        if revisiondata:
            if store.iscensored(baserev) or store.iscensored(rev):
                try:
                    revision = store.rawdata(node)
                except error.CensoredNodeError as e:
                    revision = e.tombstone

                if baserev != nullrev:
                    if rawsizefn:
                        baserevisionsize = rawsizefn(baserev)
                    else:
                        baserevisionsize = len(store.rawdata(baserev))

            elif (
                baserev == nullrev and deltamode != repository.CG_DELTAMODE_PREV
            ):
                revision = store.rawdata(node)
                available.add(rev)
            else:
                if revdifffn:
                    delta = revdifffn(baserev, rev)
                else:
                    delta = mdiff.textdiff(
                        store.rawdata(baserev), store.rawdata(rev)
                    )

                available.add(rev)

        serialized_sidedata = None
        sidedata_flags = (0, 0)
        if sidedata_helpers:
            try:
                old_sidedata = store.sidedata(rev)
            except error.CensoredNodeError:
                # skip any potential sidedata of the censored revision
                sidedata = {}
            else:
                sidedata, sidedata_flags = sidedatamod.run_sidedata_helpers(
                    store=store,
                    sidedata_helpers=sidedata_helpers,
                    sidedata=old_sidedata,
                    rev=rev,
                )
            if sidedata:
                serialized_sidedata = sidedatamod.serialize_sidedata(sidedata)

        flags = flagsfn(rev) if flagsfn else 0
        protocol_flags = 0
        if serialized_sidedata:
            # Advertise that sidedata exists to the other side
            protocol_flags |= CG_FLAG_SIDEDATA
            # Computers and removers can return flags to add and/or remove
            flags = flags | sidedata_flags[0] & ~sidedata_flags[1]

        yield resultcls(
            node=node,
            p1node=fnode(p1rev),
            p2node=fnode(p2rev),
            basenode=fnode(baserev),
            flags=flags,
            baserevisionsize=baserevisionsize,
            revision=revision,
            delta=delta,
            sidedata=serialized_sidedata,
            protocol_flags=protocol_flags,
        )

        prevrev = rev


def deltaiscensored(delta, baserev, baselenfn):
    """Determine if a delta represents censored revision data.

    ``baserev`` is the base revision this delta is encoded against.
    ``baselenfn`` is a callable receiving a revision number that resolves the
    length of the revision fulltext.

    Returns a bool indicating if the result of the delta represents a censored
    revision.
    """
    # Fragile heuristic: unless new file meta keys are added alphabetically
    # preceding "censored", all censored revisions are prefixed by
    # "\1\ncensored:". A delta producing such a censored revision must be a
    # full-replacement delta, so we inspect the first and only patch in the
    # delta for this prefix.
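    # (The ">lll" header of a binary diff hunk is (start, end, newlength);
    # ``mdiff.replacediffheader(oldlen, newlen)`` packs the full-replacement
    # case of (0, oldlen, newlen).)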
    hlen = struct.calcsize(b">lll")
    if len(delta) <= hlen:
        return False

    oldlen = baselenfn(baserev)
    newlen = len(delta) - hlen
    if delta[:hlen] != mdiff.replacediffheader(oldlen, newlen):
        return False

    add = b"\1\ncensored:"
    addlen = len(add)
    return newlen >= addlen and delta[hlen : hlen + addlen] == add