1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2002, 2014 Oracle and/or its affiliates.  All rights reserved.
5  *
6  */
7 
8 package com.sleepycat.je.rep.vlsn;
9 
10 import static com.sleepycat.je.utilint.VLSN.NULL_VLSN;
11 
12 import java.util.ArrayList;
13 import java.util.NoSuchElementException;
14 import java.util.SortedMap;
15 import java.util.TreeMap;
16 
17 import com.sleepycat.bind.tuple.LongBinding;
18 import com.sleepycat.je.Cursor;
19 import com.sleepycat.je.CursorConfig;
20 import com.sleepycat.je.DatabaseEntry;
21 import com.sleepycat.je.DatabaseException;
22 import com.sleepycat.je.DbInternal;
23 import com.sleepycat.je.EnvironmentFailureException;
24 import com.sleepycat.je.LockMode;
25 import com.sleepycat.je.OperationStatus;
26 import com.sleepycat.je.dbi.DatabaseImpl;
27 import com.sleepycat.je.dbi.EnvironmentImpl;
28 import com.sleepycat.je.rep.vlsn.VLSNRange.VLSNRangeBinding;
29 import com.sleepycat.je.txn.BasicLocker;
30 import com.sleepycat.je.txn.Locker;
31 import com.sleepycat.je.txn.Txn;
32 import com.sleepycat.je.utilint.DbLsn;
33 import com.sleepycat.je.utilint.LongStat;
34 import com.sleepycat.je.utilint.StatGroup;
35 import com.sleepycat.je.utilint.VLSN;
36 
37 /**
38  * See @link{VLSNIndex} for an overview of the mapping system. The VLSNTracker
39  * packages the VLSNRange and the cached, in-memory VLSNBuckets.
40  *
41  * The tracker has a notion of the "currentBucket", which is the one receiving
42  * updates. All other cached buckets are finished and are awaiting a write to
43  * the database. Those finished buckets will only be updated in special
44  * circumstances, such as log cleaning or replication stream truncation, when
45  * we can assume that there will no readers viewing the buckets.
46  */
47 class VLSNTracker {
48     private final EnvironmentImpl envImpl;
49 
50     /* The first mapping that is is the tracker cache.  */
51     private VLSN firstTrackedVLSN = NULL_VLSN;
52 
53     /* The last VLSN that is stored on disk. */
54     private VLSN lastOnDiskVLSN = NULL_VLSN;
55 
56     /*
57      * A cache of buckets that are not on disk. The map key is the bucket's
58      * first VLSN.
59      */
60     SortedMap<Long, VLSNBucket> bucketCache;
61 
62     /*
63      * The range should always be updated through assignment to a new
64      * VLSNRange, to ensure that the values stay consistent. The current bucket
65      * mutex must be taken in order to update this value, so that the current
66      * bucket is updated before the range is updated.  When reading range
67      * fields, the caller must be sure to get a reference to a single range
68      * instance, so that there's no danger of getting inconsistent values.
69      */
70     protected volatile VLSNRange range;
71     private boolean rangeTruncated;
72 
73     /*
74      * In the future, we may want to vary stride, maxMappings and maxDistance
75      * dynamically, in reaction to efficient the mappings look.
76      */
77     private final int stride;
78     private final int maxMappings;
79     private final int maxDistance;
80 
81     private final LongStat nBucketsCreated;
82 
83     /*
84      * Create an VLSNTracker, with the range initialized from the mapping db.
85      */
VLSNTracker(EnvironmentImpl envImpl, DatabaseImpl mappingDbImpl, int stride, int maxMappings, int maxDistance, StatGroup statistics)86     VLSNTracker(EnvironmentImpl envImpl,
87                 DatabaseImpl mappingDbImpl,
88                 int stride,
89                 int maxMappings,
90                 int maxDistance,
91                 StatGroup statistics)
92         throws DatabaseException {
93 
94         this.stride = stride;
95         this.maxMappings = maxMappings;
96         this.maxDistance = maxDistance;
97         this.envImpl = envImpl;
98         nBucketsCreated =
99             new LongStat(statistics, VLSNIndexStatDefinition.N_BUCKETS_CREATED);
100 
101         bucketCache = new TreeMap<Long, VLSNBucket>();
102 
103         /* Read the current range information. */
104         DatabaseEntry key = new DatabaseEntry();
105         DatabaseEntry data = new DatabaseEntry();
106         LongBinding.longToEntry(VLSNRange.RANGE_KEY, key);
107 
108         Cursor cursor = null;
109         Locker locker = null;
110         try {
111             locker = BasicLocker.createBasicLocker(envImpl);
112             cursor = DbInternal.makeCursor(mappingDbImpl,
113                                            locker,
114                                            CursorConfig.DEFAULT);
115             DbInternal.getCursorImpl(cursor).setAllowEviction(false);
116 
117             OperationStatus status = cursor.getSearchKey(key, data,
118                                                          LockMode.DEFAULT);
119             if (status == OperationStatus.SUCCESS) {
120                 /* initialize the range from the database. */
121                 VLSNRangeBinding rangeBinding = new VLSNRangeBinding();
122                 range = rangeBinding.entryToObject(data);
123                 lastOnDiskVLSN = range.getLast();
124             } else if (status == OperationStatus.NOTFOUND) {
125                 /* No mappings exist before. */
126                 range = VLSNRange.EMPTY;
127             } else {
128                 throw EnvironmentFailureException.unexpectedState
129                     ("VLSNTracker init: status=" + status);
130             }
131         } finally {
132             if (cursor != null) {
133                 cursor.close();
134             }
135 
136             if (locker != null) {
137                 locker.operationEnd(true);
138             }
139         }
140     }
141 
142     /*
143      * Create an empty VLSNTracker. Used during recovery.
144      */
VLSNTracker(EnvironmentImpl envImpl, int stride, int maxMappings, int maxDistance)145     VLSNTracker(EnvironmentImpl envImpl,
146                 int stride,
147                 int maxMappings,
148                 int maxDistance) {
149 
150         this.envImpl = envImpl;
151         this.stride = stride;
152         this.maxMappings = maxMappings;
153         this.maxDistance = maxDistance;
154 
155         /* Set up a temporary stat group for use during recovery */
156         StatGroup statistics =
157             new StatGroup(VLSNIndexStatDefinition.GROUP_NAME,
158                           VLSNIndexStatDefinition.GROUP_DESC);
159         nBucketsCreated =
160             new LongStat(statistics,
161                          VLSNIndexStatDefinition.N_BUCKETS_CREATED);
162 
163         initEmpty();
164     }
165 
initEmpty()166     void initEmpty() {
167         bucketCache = new TreeMap<Long, VLSNBucket>();
168         range = VLSNRange.EMPTY;
169     }
170 
171     /**
172      * Return a bucket for reading a mapping for this VLSN. If vlsn is >
173      * lastOnDisk, a bucket is guaranteed to be returned.
174      */
getGTEBucket(VLSN vlsn)175     synchronized VLSNBucket getGTEBucket(VLSN vlsn) {
176 
177         if (lastOnDiskVLSN.compareTo(vlsn) >= 0) {
178             /* The bucket is in the database, not the cache. */
179             return null;
180         }
181 
182         Long pivot = vlsn.getSequence() + 1;
183         SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot);
184         VLSNBucket prevBucket = null;
185         if (head.size() > 0) {
186             prevBucket = head.get(head.lastKey());
187             if (prevBucket.owns(vlsn)) {
188                 return prevBucket;
189             }
190         }
191 
192         /*
193          * If the key is not in the headMap, we must return the next bucket
194          * with mappings that follow on.
195          */
196         SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot);
197         if (tail.size() > 0) {
198             VLSNBucket bucket = tail.get(tail.firstKey());
199             assert (bucket.owns(vlsn) || bucket.follows(vlsn)) :
200                "VLSN " + vlsn + " got wrong bucket " + bucket;
201             return bucket;
202         }
203 
204         throw EnvironmentFailureException.unexpectedState
205             (envImpl, "VLSN " + vlsn + " should be held within this tracker. " +
206              this + " prevBucket=" + prevBucket);
207     }
208 
209     /**
210      * Get the bucket which holds a mapping for this VLSN. If there is no such
211      * bucket, get the one directly preceding it. If this VLSN is >=
212      * firstTrackedVLSN, then we should be able to guarantee that a bucket is
213      * returned.
214      */
getLTEBucket(VLSN vlsn)215     synchronized VLSNBucket getLTEBucket(VLSN vlsn) {
216 
217         if (firstTrackedVLSN.equals(NULL_VLSN) ||
218             (firstTrackedVLSN.compareTo(vlsn) > 0)) {
219             return null;
220         }
221 
222         Long pivot = vlsn.getSequence() + 1;
223         SortedMap<Long, VLSNBucket> head = bucketCache.headMap(pivot);
224         if (head.size() > 0) {
225             return head.get(head.lastKey());
226         }
227 
228         /*
229          * We shouldn't get here. Get the tail purely for creating a debugging
230          * message.
231          */
232         SortedMap<Long, VLSNBucket> tail = bucketCache.tailMap(pivot);
233         VLSNBucket nextBucket = null;
234         if (tail.size() > 0) {
235             nextBucket = tail.get(tail.firstKey());
236         }
237         throw EnvironmentFailureException.unexpectedState
238             (envImpl, "VLSN " + vlsn + " should be held within this tracker. " +
239              this + " nextBucket=" + nextBucket);
240     }
241 
242     /**
243      * Record a new VLSN->LSN mapping. We guarantee that the first and last
244      * VLSNs in the range have a mapping. If a VLSN comes out of order, we will
245      * discard it, and will not update the range.
246      */
track(VLSN vlsn, long lsn, byte entryTypeNum)247     synchronized void track(VLSN vlsn, long lsn, byte entryTypeNum) {
248 
249         VLSNBucket currentBucket = null;
250 
251         if (vlsn.compareTo(lastOnDiskVLSN) < 0) {
252 
253             /*
254              * This VLSN is a laggard. It belongs to a bucket that has already
255              * gone to disk. Since on disk buckets can't be modified, throw
256              * this mapping away. Do be sure to update the range in case
257              * lastSync or lastTxnEnd should be updated as a result of this
258              * mapping.
259              */
260             updateRange(vlsn, entryTypeNum);
261             return;
262         }
263 
264         if (bucketCache.size() == 0) {
265             /* Nothing in the tracker, add a new bucket. */
266             currentBucket = new VLSNBucket(DbLsn.getFileNumber(lsn), stride,
267                                            maxMappings, maxDistance, vlsn);
268             nBucketsCreated.increment();
269 
270             bucketCache.put(currentBucket.getFirst().getSequence(),
271                             currentBucket);
272         } else {
273             /* Find the last bucket. Only the last bucket is updateable. */
274             currentBucket =  bucketCache.get(bucketCache.lastKey());
275         }
276 
277         /*
278          * This VLSN is a laggard that was preceded by an earlier mapping which
279          * came unseasonably early. This VLSN can't fit into the current
280          * bucket, and since we only want to update the last bucket, we'll
281          * throw this mapping away.
282          */
283         if (currentBucket.follows(vlsn)) {
284             updateRange(vlsn, entryTypeNum);
285             return;
286         }
287 
288         if (!currentBucket.put(vlsn, lsn)) {
289 
290             /*
291              * Couldn't put the mapping in this bucket. Close it and move to a
292              * new one.
293              */
294             currentBucket =
295                 new VLSNBucket(DbLsn.getFileNumber(lsn), stride,
296                                maxMappings, maxDistance, vlsn);
297             nBucketsCreated.increment();
298             bucketCache.put(currentBucket.getFirst().getSequence(),
299                             currentBucket);
300             if (!currentBucket.put(vlsn, lsn)) {
301                 throw EnvironmentFailureException.unexpectedState
302                     (envImpl, "Couldn't put VLSN " + vlsn + " into " +
303                      currentBucket);
304             }
305         }
306 
307         updateRange(vlsn, entryTypeNum);
308 
309         /*
310          * Only update the firstTrackedVLSN if this mapping is really resident
311          * in a bucket. Don't update it if it's an out-of-order, skipped
312          * mapping.
313          */
314         if (firstTrackedVLSN == NULL_VLSN) {
315             firstTrackedVLSN = vlsn;
316         }
317     }
318 
319     /**
320      * Update the range with a newly arrived VLSN.
321      */
updateRange(VLSN vlsn, byte entryTypeNum)322     private void updateRange(VLSN vlsn, byte entryTypeNum) {
323 
324         /*
325          * Get a reference to the volatile range field so that we always see
326          * consistent values.
327          */
328         VLSNRange currentRange = range;
329         range = currentRange.getUpdateForNewMapping(vlsn, entryTypeNum);
330     }
331 
332     /**
333      * Flush the tracker cache to disk.
334      * Ideally, we'd allow concurrent VLSN put() calls while a flush to database
335      * is happening. To do that, we need a more sophisticated synchronization
336      * scheme that defines the immutable vs. mutable portion of the tracker
337      * cache. See SR [#17689]
338      */
flushToDatabase(DatabaseImpl mappingDbImpl, Txn txn)339     synchronized void flushToDatabase(DatabaseImpl mappingDbImpl, Txn txn) {
340 
341 	VLSNRange flushRange = range;
342 
343         if (bucketCache.size() == 0) {
344             /*
345              * No mappings to write out, but it's possible that the range
346              * changed due to a truncation, in which case we must update the
347              * database. RangeTruncated is used to reduce the chance that
348              * we are doing unnecessary writes of the range record to the db.
349              */
350             if (rangeTruncated) {
351                 lastOnDiskVLSN = flushRange.writeToDatabase(envImpl,
352                 	                                    mappingDbImpl, txn);
353                 rangeTruncated = false;
354             }
355             return;
356         }
357 
358         /*
359          * Save information about the portion of the cache that we are trying
360          * to flush. Close off the last bucket, so the flush portion becomes
361          * immutable. In the past, this was in a synchronization block.
362          * This isn't strictly needed right now, since the method is currently
363          * fully synchronized, but is a good practice, and will be
364          * necessary if future changes make it possible to have concurrent
365          * puts.
366          */
367         VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey());
368         lastBucket.close();
369 
370         /*
371          * Write the flush portion to disk. Ideally, this portion would be done
372          * without synchronization on the tracker, to permit concurrent reads
373          * and writes to the tracker. Note that because there is no
374          * synchronization, the buckets must be flushed to disk in vlsn order,
375          * so that the vlsn index always looks consistent. There should be no
376          * gaps in the bucket range. This assumption permits concurrent access
377          * by VLSNIndex.getGTEBucketFromDatabase.
378          */
379         VLSN currentLastVLSN = lastOnDiskVLSN;
380         for (Long key : bucketCache.keySet()) {
381 
382             VLSNBucket target = bucketCache.get(key);
383 
384             /* Do some sanity checking before we write this bucket to disk. */
385             validateBeforeWrite(target,
386                                 flushRange,
387                                 currentLastVLSN,
388                                 target==lastBucket);
389             target.writeToDatabase(envImpl, mappingDbImpl, txn);
390             currentLastVLSN = target.getLast();
391         }
392 
393         lastOnDiskVLSN = flushRange.writeToDatabase(envImpl,
394         	                                    mappingDbImpl, txn);
395         rangeTruncated = false;
396 
397         /*
398          * Update the tracker to remove the parts that have been written to
399          * disk. Update firstTrackedVLSN, lastOnDiskVLSN, and clear the bucket
400          * cache.
401          *
402          * In the current implementation, bucketCache is guaranteed not to
403          * change, so we can just clear the cache. If we had written the
404          * buckets out without synchronization, the cache might have gained
405          * mappings, and we wouuld have to be careful to set the
406          * firstTrackedVLSN to be the first VLSN of the remaining buckets, and
407          * to set lastTrackedVLSN to be the last VLSN in flushRange.
408          *
409          * We would also have to explicitly call SortedMap.remove() to remove
410          * the written buckets. Do not use TreeMap.tailSet, which would
411          * inadvertently leave us with the semantics of a SubMap, which has
412          * mandated key ranges. With rollbacks, we may end up adding buckets
413          * that are "repeats" in the future.
414          */
415         bucketCache.clear();
416         firstTrackedVLSN = NULL_VLSN;
417     }
418 
419     /**
420      * Do some sanity checking before the write, to help prevent any corruption.
421      */
validateBeforeWrite(VLSNBucket target, VLSNRange flushRange, VLSN currentLastVLSN, boolean isLastBucket)422     private void validateBeforeWrite(VLSNBucket target,
423                                      VLSNRange flushRange,
424                                      VLSN currentLastVLSN,
425                                      boolean isLastBucket) {
426 
427         if (!(currentLastVLSN.equals(NULL_VLSN)) &&
428             (target.getFirst().compareTo(currentLastVLSN) <= 0)) {
429             throw EnvironmentFailureException.unexpectedState
430                 (envImpl, "target bucket overlaps previous bucket. " +
431                  "currentLastVLSN= " + currentLastVLSN + " target=" +
432                  target);
433         }
434 
435         if (target.getLast().compareTo(flushRange.getLast()) > 0) {
436             throw EnvironmentFailureException.unexpectedState
437                 (envImpl, "target bucket exceeds flush range. " +
438                  "range= " + flushRange + " target=" +
439                  target);
440         }
441 
442         if (isLastBucket) {
443             if (target.getLast().compareTo(flushRange.getLast()) != 0) {
444             throw EnvironmentFailureException.unexpectedState
445                 (envImpl, "end of last bucket should match end of range. " +
446                  "range= " + flushRange + " target=" + target);
447             }
448         }
449     }
450 
truncateFromHead(VLSN deleteEnd, long deleteFileNum)451     synchronized void truncateFromHead(VLSN deleteEnd, long deleteFileNum) {
452         /*
453          * Update range first, to prevent others from attempting to read the
454          * truncated portions. Update the firstTrackedLSN last, after the
455          * buckets are removed; getLTEBucket relies on the firstTrackedLSN
456          * value.
457          */
458         VLSNRange oldRange = range;
459         range = oldRange.shortenFromHead(deleteEnd);
460 
461         /*
462          * Ensure that the range is written to the db even if the resulting
463          * number of buckets is 0.
464          */
465         rangeTruncated = true;
466 
467         /*
468          * afterDelete may not == range.getFirst, becayse range.getFirst may
469          * be NULL_VLSN if the entire vlsn index has been deleted.
470          */
471         VLSN afterDelete = deleteEnd.getNext();
472 
473         /**
474          * Check if the cached buckets needs to be modified. Suppose vlsns 1-8
475          * are on disk, and vlsns 12->40 are in the tracker cache, with an out
476          * of order gap from 9 - 11.
477          *
478          * case 1: deleteEnd <= 8, Do not need to truncate anything in the
479          *  bucket cacke, but we may have to make a ghost bucket to fill the
480          *  gap.
481          * case 2: deleteEnd == 11, we don't need to modify the cache or
482          *         create a ghost bucket.
483          * case 3: deleteEnd >8 and < 11 is in the gap, we need to create a
484          *         ghost bucket to hold the beginning of the range..
485          * case 4: deleteEnd >= 12, we need to modify the bucket cache.
486          */
487         if (!lastOnDiskVLSN.equals(VLSN.NULL_VLSN) &&
488             (lastOnDiskVLSN.compareTo(deleteEnd) >= 0)) {
489 
490             /*
491              * case 1:  the buckets in the tracker cache are unaffected, all
492              * the pertinent mappings are on disk.
493              */
494             if (lastOnDiskVLSN.equals(deleteEnd)) {
495                 if (firstTrackedVLSN.compareTo(afterDelete) > 0) {
496                     checkForGhostBucket(bucketCache, deleteFileNum);
497                 }
498                 lastOnDiskVLSN = NULL_VLSN;
499             }
500             return;
501         }
502 
503         assert(!firstTrackedVLSN.equals(NULL_VLSN));
504 
505         if (firstTrackedVLSN.equals(afterDelete)) {
506             /* case 2: The tracker is lined up perfectly with the new range. */
507             lastOnDiskVLSN = NULL_VLSN;
508             return;
509         }
510 
511         if (firstTrackedVLSN.compareTo(afterDelete) > 0) {
512             /*
513              * case 3: we have to make a ghost bucket.
514              */
515             checkForGhostBucket(bucketCache, deleteFileNum);
516             lastOnDiskVLSN = NULL_VLSN;
517             return;
518         }
519 
520         /*
521          * case 4: we have to prune the buckets.
522          */
523         VLSNBucket owningBucket = getLTEBucket(deleteEnd);
524         Long retainBucketKey = owningBucket.getFirst().getNext().getSequence();
525         SortedMap<Long, VLSNBucket> tail;
526         try {
527             /*
528              * We need to chop off part of the bucket cache. Find the portion
529              * to retain.
530              */
531             tail = bucketCache.tailMap(retainBucketKey);
532             checkForGhostBucket(tail, deleteFileNum);
533             bucketCache = tail;
534         } catch (NoSuchElementException e) {
535             firstTrackedVLSN = NULL_VLSN;
536             bucketCache = new TreeMap<Long, VLSNBucket>();
537         }
538 
539         lastOnDiskVLSN = NULL_VLSN;
540     }
541 
checkForGhostBucket(SortedMap<Long, VLSNBucket> buckets, long deleteFileNum)542     private void checkForGhostBucket(SortedMap<Long, VLSNBucket> buckets,
543                                      long deleteFileNum) {
544         /*
545          * The range has just been truncated. If the first bucket's first vlsn
546          * != the first in range, insert a GhostBucket.
547          */
548         Long firstKey = buckets.firstKey();
549         VLSNBucket firstBucket = buckets.get(firstKey);
550         VLSN firstRangeVLSN = range.getFirst();
551         VLSN firstBucketVLSN = firstBucket.getFirst();
552         if (!firstBucketVLSN.equals(firstRangeVLSN)) {
553             long nextFile =
554                 envImpl.getFileManager().getFollowingFileNum(deleteFileNum,
555                                                    true /* forward */);
556             long lastPossibleLsn = firstBucket.getLsn(firstBucketVLSN);
557             VLSNBucket placeholder = new GhostBucket(firstRangeVLSN,
558                                                      DbLsn.makeLsn(nextFile, 0),
559                                                      lastPossibleLsn);
560             buckets.put(firstRangeVLSN.getSequence(), placeholder);
561         }
562     }
563 
564     /**
565      * Remove the mappings for VLSNs >= deleteStart.  This should only be used
566      * at syncup, or recovery. We can assume no new mappings are coming in, but
567      * the VLSNIndex may be read during this time; checkpoints continue to
568      * happen during rollbacks. Since syncup is always at a sync-able log
569      * entry, and since we don't roll back past a commit, we can assume that
570      * the lastSync can be changed to deleteStart.getPrev(), and lastTxnEnd is
571      * unchanged.  The VLSNRange itself is updated before the tracker and
572      * on-disk buckets are pruned.
573      *
574      * The VLSNIndex guarantees that the beginning and end vlsns in the range
575      * have mappings so LTE and GTE search operations work. When truncating the
576      * mappings in the tracker, check to see if the new end vlsn has a mapping,
577      * since that may not happen if the buckets are cut in the middle, or have
578      * a gap due to out of order mapping. For example, suppose mapping for vlsn
579      * 20 came before the mappings for vlsn 17, 18 and 19. The buckets would
580      * have this gap:
581      *     Bucket A: vlsns 10-16
582      *     Bucket B: vlsns 20-25
583      *     Bucket C: vlsns 26 - 30
584      * If the new end of vlsn range is 11-14, 17, 18, 19, 21 - 24, or 27-29,
585      * we have to end the vlsn tracker with a new mapping of
586      * (vlsn (deleteStart-1) ->  prevLsn) to cap off the buckets.
587      *
588      * Cases:
589      * End of range is in the gap -- either 17, 18, 19
590      * 1. on disk buckets exist, but don't encompass the gap: bucket A on disk
591      * 2. all buckets are in the tracker cache: bucket A in tracker
592      * 3. on disk buckets encompass the gap: bucket A and B on disk.
593      */
truncateFromTail(VLSN deleteStart, long prevLsn)594     synchronized void truncateFromTail(VLSN deleteStart, long prevLsn) {
595 
596         /*
597          * Update the range first, it will define the range that the vlsnIndex
598          * covers. Then adjust any mappings held in the bucket cache. Don't
599          * update the lastOnDiskVLSN marker, which says what is on disk. Since
600          * the on-disk portion may also get truncated, the caller will update
601          * the lastOnDiskVLSN field when it inspects the database.
602          */
603         VLSNRange oldRange = range;
604         range = oldRange.shortenFromEnd(deleteStart);
605 
606         /*
607          * Ensure that the range is written to the db even if the resulting
608          * number of buckets is 0.
609          */
610         rangeTruncated = true;
611 
612         if (firstTrackedVLSN.equals(NULL_VLSN)) {
613             /* No mappings in this tracker */
614             return;
615         }
616 
617         if (firstTrackedVLSN.compareTo(deleteStart) >= 0) {
618 
619             /*
620              * Everything in this tracker should be removed. In addition, the
621              * caller may be removing some items from the database.
622              */
623             bucketCache.clear();
624             firstTrackedVLSN = NULL_VLSN;
625             return;
626         }
627 
628         /*
629          * We need to do some pruning of the bucket in the cache. Find the
630          * buckets in the bucketCache that have mappings >= deleteStart, and
631          * remove their mappings. First establish the headset of buckets that
632          * should be preserved.
633          */
634         VLSNBucket targetBucket = getGTEBucket(deleteStart);
635         Long targetKey = targetBucket.getFirst().getSequence();
636 
637         /* The newCache has buckets that should be preserved. */
638         SortedMap<Long, VLSNBucket> newCache =
639             new TreeMap<Long, VLSNBucket>(bucketCache.headMap(targetKey));
640 
641         /*
642          * Prune any mappings >= deleteStart out of targetBucket. If it has any
643          * mappings left, add it to newCache.
644          */
645         targetBucket.removeFromTail(deleteStart, prevLsn);
646         if (!targetBucket.empty()) {
647             newCache.put(targetBucket.getFirst().getSequence(),
648                          targetBucket);
649         }
650 
651         bucketCache = newCache;
652 
653         /*
654          * Now all truncated mappings have been removed from the index. Check
655          * that the end point of the vlsn range is represented in this
656          * tracker. Since vlsn mappings can  come out of order, it's
657          * possible that the buckets have a gap, and that truncation will
658          * cleave the bucket cache just at a point where a mapping is
659          * missing. For example, vlsn mappings come in this order:
660          *           vlsn 16, vlsn 18, vlsn 17
661          * causing the buckets to have this gap:
662          *           bucket N: vlsn 10 - 16
663          *           bucket N+1: vlsn 18 - 20
664          * and that the vlsn range is truncated with a deleteStart of vlsn 18.
665          * The new end range becomes 17, but the buckets do not have a
666          * mapping for vlsn 17. If so, create a mapping now.
667          *
668          * If we are at this point, we know that the tracker should contain
669          * a mapping for the last VLSN in the range. Fix it now, so the
670          * datastructure is as tidy as possible.
671          */
672         boolean needEndMapping = false;
673         if (bucketCache.isEmpty()) {
674             needEndMapping = true;
675         } else {
676             VLSNBucket lastBucket = bucketCache.get(bucketCache.lastKey());
677             needEndMapping =
678                     (lastBucket.getLast().compareTo(range.getLast()) < 0);
679         }
680 
681         if (needEndMapping) {
682             addEndMapping(range.getLast(), prevLsn);
683         }
684     }
685 
686     /**
687      * Called by the VLSNIndex to see if we have to add a bucket in the tracker
688      * after pruning the on-disk storage. The get{LTE,GTE}Bucket methods
689      * assume that there are mappings for the first and last VLSN in the range.
690      * But since out of order vlsn mappings can cause non-contiguous buckets,
691      * the pruning may have lost the end mapping. [#23491]
692      */
ensureRangeEndIsMapped(VLSN lastVLSN, long lastLsn)693     synchronized void ensureRangeEndIsMapped(VLSN lastVLSN, long lastLsn) {
694 
695         /**
696          * if lastOnDiskVLSN < lastVLSN < firstTrackedVLSN or
697          *    lastOnDiskVLSN < lastVLSN and firstTrackedVLSN is null
698          * then we need to add a mapping for lastVLSN->lastLsn. Otherwise a
699          * mapping already exists.
700          */
701         if (lastOnDiskVLSN.compareTo(lastVLSN) < 0) {
702 
703             /*
704              * The on-disk vlsn mappings aren't sufficient to cover the
705              * lastVLSN. There needs to be a mapping in the tracker for the
706              * end point of the range.
707              */
708             if (!firstTrackedVLSN.equals(NULL_VLSN)) {
709 
710                 /*
711                  * There are mappings in the tracker, so we can assume
712                  * that they cover the end point, because truncateFromTail()
713                  * should have adjusted for that. But assert to be sure!
714                  */
715                 if (lastVLSN.compareTo(firstTrackedVLSN) < 0) {
716                     throw EnvironmentFailureException.unexpectedState
717                         (envImpl,
718                          "Expected this tracker to cover vlsn " + lastVLSN +
719                          " after truncateFromTail. LastOnDiskVLSN= " +
720                          lastOnDiskVLSN + " tracker=" + this);
721                 }
722 
723                 return;
724             }
725 
726             /*
727              * There are no mappings in the tracker. The on disk mappings were
728              * pruned, and a gap was created between what was on disk and what
729              * is in the tracker cache. Add a mapping.
730              */
731             addEndMapping(lastVLSN, lastLsn);
732         }
733     }
734 
735     /**
736      * Add a bucket holding a lastVLSN -> lastLsn mapping
737      */
addEndMapping(VLSN lastVLSN, long lastLsn)738     private void addEndMapping(VLSN lastVLSN, long lastLsn) {
739         /* Assert that it's in the range */
740         assert (lastVLSN.compareTo(range.getLast()) == 0) :
741             "lastVLSN=" + lastVLSN + " lastLsn = " + lastLsn +
742             " range=" + range;
743 
744         VLSNBucket addBucket =
745             new VLSNBucket(DbLsn.getFileNumber(lastLsn), stride,
746                            maxMappings, maxDistance, lastVLSN);
747         addBucket.put(lastVLSN, lastLsn);
748         nBucketsCreated.increment();
749         bucketCache.put(addBucket.getFirst().getSequence(), addBucket);
750         if (firstTrackedVLSN.equals(NULL_VLSN)) {
751             firstTrackedVLSN = lastVLSN;
752         }
753     }
754 
755     /**
756      * Attempt to replace the mappings in this vlsnIndex for
757      * deleteStart->lastVLSN with those from the recovery mapper.
758      * Since we do not supply a prevLsn argument, we may not be able to "cap"
759      * the truncated vlsn bucket the same what that we can in
760      * truncateFromTail. Because of that, we may need to remove mappings that
761      * are < deleteStart.
762      *
763      * For example, suppose a bucket holds mappings
764      *   10 -> 101
765      *   15 -> 201
766      *   18 -> 301
767      * If deleteStart == 17, we will have to delete all the way to vlsn 15.
768      *
769      * The maintenance of VLSNRange.lastSync and lastTxnEnd are simplified
770      * because of assumptions we can make because we are about to append
771      * mappings found by the recovery tracker that begin at deleteStart. If
772      * lastSync and lastTxnEnd are <= deleteStart, we know that they will
773      * either stay the same, or be replaced by lastSync and lastTxnEnd from the
774      * recoveryTracker. Even we delete mappings that are < deleteStart, we know
775      * that we did not roll back past an abort or commit, so that we do not
776      * have to updated the lastSync or lastTxnEnd value.
777      */
merge(VLSN prunedLastOnDiskVLSN, VLSNRecoveryTracker recoveryTracker)778     void merge(VLSN prunedLastOnDiskVLSN, VLSNRecoveryTracker recoveryTracker) {
779 
780         VLSNRange oldRange = range;
781         range = oldRange.merge(recoveryTracker.range);
782         VLSN recoveryFirst = recoveryTracker.getRange().getFirst();
783 
784         lastOnDiskVLSN = prunedLastOnDiskVLSN;
785 
786         /*
787          * Find the buckets in the bucketCache that have mappings >=
788          * recoveryFirst, and remove their mappings. First establish the
789          * headset of buckets that should be preserved. At this point, we
790          * have already pruned the database, so the bucket set may not
791          * be fully contiguous -- we may have pruned away mappings that
792          * would normally be in the database.
793          */
794         if (bucketCache.size() == 0) {
795             /* Just take all the recovery tracker's mappings */
796             bucketCache = recoveryTracker.bucketCache;
797         } else {
798             VLSNBucket targetBucket = getGTEBucket(recoveryFirst);
799             Long targetKey = targetBucket.getFirst().getSequence();
800 
801             /* The newCache holds buckets that should be preserved. */
802             SortedMap<Long, VLSNBucket> newCache =
803                 new TreeMap<Long, VLSNBucket>(bucketCache.headMap(targetKey));
804 
805             /*
806              * Prune any mappings >= recoveryFirst out of targetBucket. If it
807              * has any mappings left, add it to newCache.
808              */
809             targetBucket.removeFromTail(recoveryFirst, DbLsn.NULL_LSN);
810             if (!targetBucket.empty()) {
811                 newCache.put(targetBucket.getFirst().getSequence(),
812                              targetBucket);
813             }
814 
815             newCache.putAll(recoveryTracker.bucketCache);
816             bucketCache = newCache;
817         }
818 
819         if (bucketCache.size() > 0) {
820             VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey());
821             firstTrackedVLSN = firstBucket.getFirst();
822         }
823     }
824 
append(VLSNRecoveryTracker recoveryTracker)825     void append(VLSNRecoveryTracker recoveryTracker) {
826 
827         /*
828          * This method assume that there is no overlap between this tracker
829          * and the recovery tracker. Everything in this tracker should precede
830          * the recovery tracker.
831          */
832         if (!range.getLast().isNull()) {
833             if (range.getLast().compareTo
834                 (recoveryTracker.getFirstTracked()) >= 0) {
835 
836                 throw EnvironmentFailureException.unexpectedState
837                     (envImpl,
838                      "Expected this tracker to precede recovery tracker. " +
839                      "This tracker= " + this + " recoveryTracker = " +
840                      recoveryTracker);
841             }
842         }
843 
844         bucketCache.putAll(recoveryTracker.bucketCache);
845         VLSNRange currentRange = range;
846         range = currentRange.getUpdate(recoveryTracker.getRange());
847         if (bucketCache.size() > 0) {
848             VLSNBucket firstBucket = bucketCache.get(bucketCache.firstKey());
849             firstTrackedVLSN = firstBucket.getFirst();
850         }
851     }
852 
getRange()853     VLSNRange getRange() {
854         return range;
855     }
856 
857     @Override
toString()858     public String toString() {
859         StringBuilder sb = new StringBuilder();
860         sb.append(range);
861         sb.append(" firstTracked=").append(firstTrackedVLSN);
862         sb.append(" lastOnDiskVLSN=").append(lastOnDiskVLSN);
863 
864         for (VLSNBucket bucket: bucketCache.values()) {
865             sb.append("\n");
866             sb.append(bucket);
867         }
868         return sb.toString();
869     }
870 
871     /**
872      * For unit test support.
873      */
verify(boolean verbose)874     synchronized boolean verify(boolean verbose) {
875 
876         if (!range.verify(verbose)) {
877             return false;
878         }
879 
880         /* Check that all the buckets are in order. */
881         ArrayList<VLSN> firstVLSN = new ArrayList<VLSN>();
882         ArrayList<VLSN> lastVLSN = new ArrayList<VLSN>();
883 
884         for (VLSNBucket b : bucketCache.values()) {
885             firstVLSN.add(b.getFirst());
886             lastVLSN.add(b.getLast());
887         }
888 
889         if (!verifyBucketBoundaries(firstVLSN, lastVLSN)) {
890             return false;
891         }
892 
893         if (firstVLSN.size() > 0) {
894             if (!firstVLSN.get(0).equals(firstTrackedVLSN)) {
895                 if (verbose) {
896                     System.err.println("firstBucketVLSN = " +
897                                        firstVLSN.get(0) + " should equal " +
898                                        firstTrackedVLSN);
899                 }
900                 return false;
901             }
902 
903             VLSN lastBucketVLSN = lastVLSN.get(lastVLSN.size() -1);
904             if (!lastBucketVLSN.equals(range.getLast())) {
905                 if (verbose) {
906                     System.err.println("lastBucketVLSN = " + lastBucketVLSN +
907                                        " should equal " + range.getLast());
908                 }
909                 return false;
910             }
911         }
912 
913         return true;
914     }
915 
916     /*
917      * Used to check bucket boundaries for both the in-memory flush list and
918      * the on-disk database buckets. Buckets may not be adjacent, but must be
919      * in order.
920      */
verifyBucketBoundaries(ArrayList<VLSN> firstVLSN, ArrayList<VLSN> lastVLSN)921     static boolean verifyBucketBoundaries(ArrayList<VLSN> firstVLSN,
922                                           ArrayList<VLSN> lastVLSN) {
923         for (int i = 1; i < firstVLSN.size(); i++) {
924             VLSN first = firstVLSN.get(i);
925             VLSN prevLast = lastVLSN.get(i-1);
926             if (first.compareTo(prevLast.getNext()) < 0) {
927                 System.out.println("Boundary problem: bucket " + i +
928                                    " first " + first +
929                                    " follows bucket.last " + prevLast);
930                 return false;
931             }
932         }
933         return true;
934     }
935 
getFirstTracked()936     VLSN getFirstTracked() {
937         return firstTrackedVLSN;
938     }
939 
getLastOnDisk()940     VLSN getLastOnDisk() {
941         return lastOnDiskVLSN;
942     }
943 
setLastOnDiskVLSN(VLSN lastOnDisk)944     void setLastOnDiskVLSN(VLSN lastOnDisk) {
945         lastOnDiskVLSN = lastOnDisk;
946     }
947 
948     /*
949      * For unit test support only. Can only be called when replication stream
950      * is quiescent.
951      */
isFlushedToDisk()952     boolean isFlushedToDisk() {
953         return lastOnDiskVLSN.equals(range.getLast());
954     }
955 }
956