# Code dedicated to a cache around the obsolescence property
2#
3# This module content aims at being upstreamed.
4#
5# Copyright 2017 Pierre-Yves David <pierre-yves.david@ens-lyon.org>
6#
7# This software may be used and distributed according to the terms of the
8# GNU General Public License version 2 or any later version.
9
10import errno
11import hashlib
12import struct
13
14from mercurial import (
15    localrepo,
16    obsolete,
17    phases,
18    node,
19    util,
20)
21
22from mercurial.utils.stringutil import forcebytestr
23
24from . import (
25    compat,
26    exthelper,
27)
28
29eh = exthelper.exthelper()
30
31obsstorefilecache = localrepo.localrepository.obsstore
32
# obsstore is a filecache so we have to do some special dancing
@eh.wrapfunction(obsstorefilecache, 'func')
def obsstorewithcache(orig, repo):
    """wrap the obsstore filecache constructor to augment the obsstore

    The obsstore returned by the original constructor gets:
    - an 'obscache' attribute holding an obscache instance (see the obscache
      class below), and
    - its class swapped for a dynamically-created subclass that adds a
      'cachekey' method, used by caches to detect obsstore changes (growth
      or strip) between runs.
    """
    obsstore = orig(repo)
    obsstore.obscache = obscache(repo.unfiltered())

    class cachekeyobsstore(obsstore.__class__):

        # number of trailing bytes of the obsstore file hashed into the key
        _obskeysize = 200

        def cachekey(self, index=None):
            """return (current-length, cachekey)

            'current-length': is the current length of the obsstore storage file,
            'cachekey' is the sha1 hash of the last 200 bytes ending at 'index'.

            if 'index' is unspecified, current obsstore length is used.
            The cachekey will be set to the null id if the obsstore is empty.

            If the index specified is higher than the current obsstore file
            length, cachekey will be set to None."""
            # default values, used when the obsstore file does not exist
            obsstoresize = 0
            keydata = b''
            # try to get actual data from the obsstore
            try:
                with self.svfs(b'obsstore') as obsfile:
                    # seek to the end to learn the file size
                    obsfile.seek(0, 2)
                    obsstoresize = obsfile.tell()
                    if index is None:
                        index = obsstoresize
                    elif obsstoresize < index:
                        # the caller's index is past the end of the file:
                        # the obsstore shrank (strip), signal it with None
                        return obsstoresize, None
                    actualsize = min(index, self._obskeysize)
                    if actualsize:
                        obsfile.seek(index - actualsize, 0)
                        keydata = obsfile.read(actualsize)
            except (OSError, IOError) as e:
                # a missing obsstore file simply means an empty key
                if e.errno != errno.ENOENT:
                    raise
            if keydata:
                key = hashlib.sha1(keydata).digest()
            else:
                # reusing an existing "empty" value make it easier to define a
                # default cachekey for 'no data'.
                key = node.nullid
            return obsstoresize, key

    obsstore.__class__ = cachekeyobsstore

    return obsstore
84
def markersfrom(obsstore, byteoffset, firstmarker):
    """return the markers from `firstmarker` to the end of the obsstore

    `byteoffset` is the file offset of marker number `firstmarker`; it is
    only needed when the markers have to be re-read from disk.
    """
    if not firstmarker:
        # starting from the beginning: the full marker list is the answer
        return list(obsstore)
    if r'_all' in vars(obsstore):
        # markers already parsed and held in memory; slice them directly
        return obsstore._all[firstmarker:]
    # markers not loaded yet: read the raw file and decode from the offset
    rawdata = obsstore.svfs.tryread(b'obsstore')
    return obsolete._readmarkers(rawdata, byteoffset)[1]
94
95
class dualsourcecache(object):
    """An abstract class for caches that need both changelog and obsstore

    This class handles the tracking of changelog and obsstore updates. It
    provides data to perform incremental updates (see the '_updatefrom'
    method for details). This class can also detect stripping of the
    changelog or the obsstore and reset the cache in that case (see the
    'clear' method for details).
    """

    # default key used for an empty cache
    #
    # The cache key covering the changesets and obsmarkers content
    #
    # The cache key parts are:
    # - tip-rev,
    # - tip-node,
    # - obsstore-length (nb markers),
    # - obsstore-file-size (in bytes),
    # - obsstore "cache key"
    emptykey = (node.nullrev, node.nullid, 0, 0, node.nullid)
    _cachename = None # used for error message

    def __init__(self):
        super(dualsourcecache, self).__init__()
        # cache key tuple (see 'emptykey' above), or None when the cache
        # content has not been loaded from disk yet
        self._cachekey = None

    def _updatefrom(self, repo, revs, obsmarkers):
        """override this method to update your cache data incrementally

        revs:       list of new revisions in the changelog
        obsmarkers: list of new obsmarkers in the obsstore
        """
        raise NotImplementedError

    def clear(self, reset=False):
        """invalidate the cache content

        if 'reset' is passed, we detected a strip and the cache will have to be
        recomputed.
        """
        # /!\ IMPORTANT /!\
        # You must override this method to actually drop your cached data,
        # then call this base implementation to reset the cache key.
        if reset:
            self._cachekey = self.emptykey
        else:
            self._cachekey = None

    def load(self, repo):
        """Load data from disk

        Do not forget to restore the "cachekey" attribute while doing so.
        """
        raise NotImplementedError

    # Useful public function (no need to override them)

    def uptodate(self, repo):
        """return True if the cache content is up to date, False otherwise

        This method can be used to detect if the cache is lagging behind new
        data in either changelog or obsstore.
        """
        if self._cachekey is None:
            self.load(repo)
        # status is (tiprev, obssize, obskey), or None on key mismatch (strip)
        status = self._checkkey(repo.changelog, repo.obsstore)
        return (status is not None
                and status[0] == self._cachekey[0] # tiprev
                and status[1] == self._cachekey[3]) # obssize

    def update(self, repo):
        """update the cache with new repository data

        The update will be incremental when possible"""
        repo = repo.unfiltered()
        # If we do not have any data, try loading from disk
        if self._cachekey is None:
            self.load(repo)

        assert repo.filtername is None
        cl = repo.changelog

        upgrade = self._upgradeneeded(repo)
        if upgrade is None:
            return

        reset, revs, obsmarkers, obskeypair = upgrade
        if reset or self._cachekey is None:
            repo.ui.log(b'evoext-cache', b'strip detected, %s cache reset\n' % self._cachename)
            self.clear(reset=True)

        starttime = util.timer()
        # materialize the iterables so we can both process and count them
        revs = list(revs)
        obsmarkers = list(obsmarkers)
        self._updatefrom(repo, revs, obsmarkers)
        duration = util.timer() - starttime
        repo.ui.log(b'evoext-cache', b'updated %s in %.4f seconds (%dr, %do)\n',
                    self._cachename, duration, len(revs), len(obsmarkers))

        # update the key from the new data
        key = list(self._cachekey)
        if revs:
            key[0] = cl.tiprev()
            key[1] = cl.node(key[0])
        if obsmarkers:
            key[2] += len(obsmarkers)
            key[3], key[4] = obskeypair
        self._cachekey = tuple(key)

    # from here, there are internal function only

    def _checkkey(self, changelog, obsstore):
        """validate the stored cache key against the current repository state

        Return None if the key is missing or invalidated (changelog or
        obsstore strip), otherwise a (tiprev, obssize, obskey) tuple
        describing the current state.
        """
        key = self._cachekey
        if key is None:
            return None

        ### Is the cache valid ?
        keytiprev, keytipnode, keyobslength, keyobssize, keyobskey = key
        # check for changelog strip
        tiprev = changelog.tiprev()
        if (tiprev < keytiprev
                or changelog.node(keytiprev) != keytipnode):
            return None
        # check for obsstore strip
        obssize, obskey = obsstore.cachekey(index=keyobssize)
        if obskey != keyobskey:
            return None
        if obssize != keyobssize:
            # we want to return the obskey for the new size
            __, obskey = obsstore.cachekey(index=obssize)
        return tiprev, obssize, obskey

    def _upgradeneeded(self, repo):
        """compute the data needed to bring the cache up to date

        Return None when the cache is already up to date. Otherwise return a
        (reset, revs, obsmarkers, (obssize, obskey)) tuple:

        'reset': True when the old cache content must be discarded (strip
                 detected or no previous key),

        'revs': iterable of revisions not yet covered by the cache,

        'obsmarkers': iterable of markers not yet covered by the cache,

        '(obssize, obskey)': the obsstore cache key matching the returned
                             markers, to be stored as the new key.
        """

        # We need to ensure we use the same changelog and obsstore through the
        # processing. Otherwise some invalidation could update the object and their
        # content after we computed the cache key.
        cl = repo.changelog
        obsstore = repo.obsstore
        key = self._cachekey

        reset = False

        status = self._checkkey(cl, obsstore)
        if status is None:
            # key invalid (strip or cold cache): restart from scratch
            reset = True
            key = self.emptykey
            obssize, obskey = obsstore.cachekey()
            tiprev = cl.tiprev()
        else:
            tiprev, obssize, obskey = status

        keytiprev, keytipnode, keyobslength, keyobssize, keyobskey = key

        if not reset and keytiprev == tiprev and keyobssize == obssize:
            return None # nothing to upgrade

        ### cache is valid, is there anything to update

        # any new changesets ?
        revs = ()
        if keytiprev < tiprev:
            revs = list(cl.revs(start=keytiprev + 1, stop=tiprev))

        # any new markers
        markers = ()
        if keyobssize < obssize:
            # XXX There is a small race chance here. The obsstore might have
            # moved forward between the time we computed the cache key and we
            # access the data. To fix this we need some "up to" argument when
            # fetching the markers here. Otherwise we might return more markers
            # than covered by the cache key.
            #
            # In practice the cache is only updated after each transaction within a
            # lock. So we should be fine. We could enforce this with a new repository
            # requirement (or fix the race, that is not too hard).
            markers = markersfrom(obsstore, keyobssize, keyobslength)

        return reset, revs, markers, (obssize, obskey)
285
class obscache(dualsourcecache):
    """cache of "is this revision used as precursor of some obsmarkers" data

    This is not directly holding the "is this revision obsolete" information,
    because phases data gets into play here. However, it allows to compute the
    "obsolescence" set without reading the obsstore content.

    Implementation note #1:

      The obsstore is implementing only half of the transaction logic it
      should. It properly records the starting point of the obsstore to allow
      clean rollback. However it still writes to the obsstore file directly
      during the transaction. Instead it should be keeping data in memory and
      write to a '.pending' file to make the data available for hooks.

      This cache is not going further than what the obsstore is doing, so it
      does not have any '.pending' logic. When the obsstore gains proper
      '.pending' support, adding it to this cache should not be too hard. As
      the flag always moves from 0 to 1, we could have a second '.pending'
      cache file to be read. If the flag is set in any of them, the value is
      1. For the same reason, updating the file in place should be possible.

    Implementation note #2:

      Instead of having a large final update run, we could update this cache at
      the level adding a new changeset or a new obsmarkers. More on this in the
      'update code'.

    Implementation note #3:

        Storage-wise, we could have a "start rev" to avoid storing useless
        zero. That would be especially useful for the '.pending' overlay.
    """

    # on-disk cache file name (bump the suffix on format change)
    _filepath = b'evoext-obscache-00'
    # struct format of the on-disk header, mirroring the cache key tuple:
    # tiprev (int64), tipnode (20 bytes), obslength, obssize (uint64 each),
    # obskey (20 bytes)
    _headerformat = b'>q20sQQ20s'

    _cachename = b'evo-ext-obscache' # used for error message

    def __init__(self, repo):
        super(obscache, self).__init__()
        # key matching the data currently on disk; used to skip no-op saves
        self._ondiskkey = None

    @util.propertycache
    def get(self):
        """final signature: obscache.get(rev)

        return True if "rev" is used as precursor for any obsmarkers

        IMPORTANT: make sure the cache has been updated to match the repository
        content before using it

        We use a property cache to skip the attribute resolution overhead in
        hot loops."""
        return self._data.__getitem__

    def _setdata(self, data):
        """set a new bytearray data, invalidating the 'get' shortcut if needed"""
        self._data = data
        if r'get' in vars(self):
            # drop the cached bound __getitem__ so it rebinds to the new data
            del self.get

    def clear(self, reset=False):
        """invalidate the cache content"""
        super(obscache, self).clear(reset=reset)
        self._setdata(bytearray())

    def _updatefrom(self, repo, revs, obsmarkers):
        # incremental update: flag new revisions against existing markers,
        # then flag existing revisions against new markers
        if revs:
            self._updaterevs(repo, revs)
        if obsmarkers:
            self._updatemarkers(repo, obsmarkers)

    def _updaterevs(self, repo, revs):
        """update the cache with new revisions

        Newly added changesets might be affected by obsolescence markers
        we already have locally. So we need to have some global
        knowledge about the markers to handle that question.

        Right now this requires parsing all markers in the obsstore. We could
        imagine using various optimisations (eg: another cache, network
        exchange, etc).

        A possible approach to this is to build a set of all nodes used as
        precursors in `obsstore._obscandidate`. If markers are not loaded yet,
        we could initialize it by doing a quick scan through the obsstore data
        and filling a (pre-sized) set. Doing so would be much faster than
        parsing all the obsmarkers since we would access less data, not create
        any object beside the nodes and not have to decode any complex data.

        For now we stick to the simpler approach of paying the
        performance cost on new changesets.
        """
        # one zero byte per new revision, appended to the existing data
        new_entries = bytearray(len(revs))
        if not self._data:
            self._setdata(new_entries)
        else:
            self._data.extend(new_entries)
        data = self._data
        if repo.obsstore:
            node = repo.changelog.node
            succs = repo.obsstore.successors
            for r in revs:
                if node(r) in succs:
                    data[r] = 1

    def _updatemarkers(self, repo, obsmarkers):
        """update the cache with new markers"""
        getrev = compat.getgetrev(repo.changelog)
        for m in obsmarkers:
            # m[0] is the precursor node; it may be unknown locally
            r = getrev(m[0])
            if r is not None:
                self._data[r] = 1

    def save(self, repo):
        """save the data to disk"""

        # XXX it happens that the obsstore is (buggily) always up to date on disk
        if self._cachekey is None or self._cachekey == self._ondiskkey:
            return

        try:
            # atomictemp: readers never see a half-written cache file
            cachefile = repo.cachevfs(self._filepath, b'w', atomictemp=True)
            headerdata = struct.pack(self._headerformat, *self._cachekey)
            cachefile.write(headerdata)
            cachefile.write(self._data)
            cachefile.close()
            self._ondiskkey = self._cachekey
            # NOTE(review): if write() raises, cachefile is never closed and
            # the temp file may linger — consider a try/finally; confirm
            # whether atomictemp vfs cleans up on gc.
        except (IOError, OSError) as exc:
            # best effort: a failed cache write must not abort the operation
            repo.ui.log(b'obscache', b'could not write update %s\n' % forcebytestr(exc))
            repo.ui.debug(b'obscache: could not write update %s\n' % forcebytestr(exc))

    def load(self, repo):
        """load data from disk"""
        assert repo.filtername is None

        data = repo.cachevfs.tryread(self._filepath)
        if not data:
            # no (readable) cache file: start from the empty key
            self._cachekey = self.emptykey
            self._setdata(bytearray())
        else:
            # file layout: fixed-size header (the cache key) then one byte
            # per revision
            headersize = struct.calcsize(self._headerformat)
            self._cachekey = struct.unpack(self._headerformat, data[:headersize])
            self._setdata(bytearray(data[headersize:]))
        self._ondiskkey = self._cachekey
432
def _computeobsoleteset(orig, repo):
    """the set of obsolete revisions

    Use the obscache when it is current; otherwise fall back to the original
    (slower) obsstore-based computation.
    """
    repo = repo.unfiltered()
    notpublic = repo._phasecache.getrevset(repo, (phases.draft, phases.secret))
    if not notpublic:
        # public changesets are never obsolete; nothing to scan
        return set()
    obscache = repo.obsstore.obscache
    # Since we warm the cache at the end of every transaction, the cache
    # should be up to date. However a non-enabled client might have touched
    # the repository.
    #
    # Updating the cache without a lock is sloppy, so we fallback to the
    # old method and rely on the fact the next transaction will write the
    # cache down anyway.
    #
    # With the current implementation updating the cache will requires to
    # load the obsstore anyway. Once loaded, hitting the obsstore directly
    # will be about as fast...
    if not obscache.uptodate(repo):
        if repo.currenttransaction() is None:
            repo.ui.log(b'evoext-cache',
                        b'obscache is out of date, '
                        b'falling back to slower obsstore version\n')
            repo.ui.debug(b'obscache is out of date\n')
            return orig(repo)
        # A transaction is open: it is worthwhile to update and use the
        # cache; the lock prevents races and the cache will be written on
        # disk when the transaction closes.
        obscache.update(repo)
    isobs = obscache.get
    return set(rev for rev in notpublic if isobs(rev))
468
@eh.uisetup
def cachefuncs(ui):
    """swap the 'obsolete' revset cache function for the obscache variant"""
    previous = obsolete.cachefuncs[b'obsolete']

    def wrapped(repo):
        # delegate to the cache-aware computation, keeping the original
        # implementation available as a fallback
        return _computeobsoleteset(previous, repo)

    obsolete.cachefuncs[b'obsolete'] = wrapped
474
@eh.reposetup
def setupcache(ui, repo):
    """install obscache maintenance hooks on the repository class"""

    class obscacherepo(repo.__class__):

        @localrepo.unfilteredmethod
        def destroyed(self):
            # a strip just happened: drop the in-memory cache data; it will
            # be reloaded/recomputed on the next access
            if r'obsstore' in vars(self):
                self.obsstore.obscache.clear()
            super(obscacherepo, self).destroyed()

        @localrepo.unfilteredmethod
        def updatecaches(self, tr=None, **kwargs):
            super(obscacherepo, self).updatecaches(tr, **kwargs)
            # keep the obscache warm and persisted after each cache refresh
            # (e.g. at transaction close)
            self.obsstore.obscache.update(self)
            self.obsstore.obscache.save(self)

    repo.__class__ = obscacherepo
493