1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2002, 2014 Oracle and/or its affiliates.  All rights reserved.
5  *
6  */
7 
8 package com.sleepycat.je.rep.vlsn;
9 
10 import java.io.PrintStream;
11 import java.nio.ByteBuffer;
12 import java.util.ArrayList;
13 import java.util.List;
14 
15 import com.sleepycat.bind.tuple.LongBinding;
16 import com.sleepycat.bind.tuple.TupleBinding;
17 import com.sleepycat.bind.tuple.TupleInput;
18 import com.sleepycat.bind.tuple.TupleOutput;
19 import com.sleepycat.je.Cursor;
20 import com.sleepycat.je.CursorConfig;
21 import com.sleepycat.je.DatabaseEntry;
22 import com.sleepycat.je.DbInternal;
23 import com.sleepycat.je.EnvironmentFailureException;
24 import com.sleepycat.je.OperationStatus;
25 import com.sleepycat.je.config.EnvironmentParams;
26 import com.sleepycat.je.dbi.DatabaseImpl;
27 import com.sleepycat.je.dbi.EnvironmentImpl;
28 import com.sleepycat.je.log.FileReader;
29 import com.sleepycat.je.txn.Txn;
30 import com.sleepycat.je.utilint.DbLsn;
31 import com.sleepycat.je.utilint.VLSN;
32 
33 /**
34  * A VLSNBucket instance represents a set of VLSN->LSN mappings. Buckets are
35  * usually not updated, except at times when the replication stream may have
36  * been reduced in size, by log cleaning or syncup. The VLSNBuckets in the
37  * VLSNIndex's VLSNTracker are written to disk and are persistent. There are
38  * also VLSNBuckets in the temporary recovery-time tracker that are used for
39  * collecting mappings found in the log during recovery.
40  *
41  * VLSNBuckets only hold mappings from a single log file. A single log file
42  * may be mapped by multiple VLSNBuckets though.
43  *
44  * As a tradeoff in space vs time, a VLSNBucket only stores a sparse set of
45  * mappings and the caller must use a VLSNReader to scan the log file and
46  * find any log entries not mapped directly by the bucket. In addition,
47  * the VLSN is not actually stored. Only the offset portion of the LSN is
48  * stored, and the VLSN is intuited by a stride field. Each VLSNBucket
49  * only maps a single file, though a single file may be mapped by several
50  * VLSNBuckets.
51  *
52  * For example, suppose a node had these VLSN->LSN mappings:
53  *
54  * VLSN            LSN (file/offset)
55  * 9               10/100
56  * 10              10/110
57  * 11              10/120
58  * 12              10/130
59  * 13              10/140
60  * 14              11/100
61  * 15              11/120
62  *
63  * The mappings in file 10 could be represented by a VLSNBucket with
64  * a stride of 4. That means the bucket would hold the mappings for
65  *  9              10/100,
66  * 13              10/140
67  *
68  * And since the target log file number and the stride is known, the mappings
69  * can be represented in by the offset alone in this array: {100, 140}, rather
70  * than storing the whole lsn.
71  *
72  * Each bucket can also provide the mapping for the first and last VLSN it
73  * covers, even if the lastVLSN is not divisible by the stride. This is done to
74  * support forward and backward scanning. From the example above, the completed
75  * bucket can provide 9->10/100, 13->10/140, 15 -> 10/160 even though 15 is not
76  * a stride's worth away from 13.
77  *
78  * Because registering a VLSN->LSN mapping is done outside the log write latch,
79  * any inserts into the VLSNBucket may not be in order. However, when any
80  * VLSN is registered, we can assume that all VLSNs < that value do exist in
81  * the log. It's just an accident of timing that they haven't yet been
82  * registered. Note that out of order inserts into the buckets can create holes
83  * in the bucket's offset array, or cause the array to be shorter than
84  * anticipated.
85  *
86  * For example, if the insertion order into the bucket is vlsns 9, 15, we'll
87  * actually only keep an offset array of size 1. We have to be able to handle
88  * holes in the bucket, and can't count on filling them in when the lagging
89  * vlsn arrives, because it is possible that a reading thread will access the
90  * bucket before the laggard inserter arrives, or that the bucket might be
91  * flushed to disk, and become immutable.
92  */
93 public class VLSNBucket {
94 
95     /* On-disk version. */
96     private static final int VERSION = 1;
97 
98     /* File number for target file. */
99     private final long fileNumber;
100 
101     /* Interval between VLSN values that are mapped. */
102     private final int stride;
103 
104     protected VLSN firstVLSN = VLSN.NULL_VLSN;
105     protected VLSN lastVLSN = VLSN.NULL_VLSN;
106     private long lastLsn = DbLsn.NULL_LSN;
107 
108     /*
109      * The file offsets are really unsigned ints. The calls to put() are
110      * implemented to let us assume that the list is fully populated.  A
111      * subclass of truncateableList has been used in order to provide access to
112      * the ArrayList.removeFromRange method.
113      */
114     private TruncateableList<Integer> fileOffsets;
115 
116     /*
117      * The max number of offsets and maxDistance help guide when to close the
118      * bucket and start a new one. Not persistent.
119      */
120     private int maxMappings;
121     private int maxDistance;
122 
123     private static final int NO_OFFSET = 0;
124 
125     /* True if there are changes to the bucket that are not on disk. */
126     boolean dirty;
127 
128     /*
129      * True if the VLSNBucket will not accept any more modifications; used to
130      * safeguard the bucket while the index is being written to disk.
131      */
132     private boolean closed = false;
133 
VLSNBucket(long fileNumber, int stride, int maxMappings, int maxDistance, VLSN firstVLSN)134     VLSNBucket(long fileNumber,
135                int stride,
136                int maxMappings,
137                int maxDistance,
138                VLSN firstVLSN) {
139         this.fileNumber = fileNumber;
140         this.stride = stride;
141         this.maxMappings = maxMappings;
142         this.maxDistance = maxDistance;
143 
144         /*
145          * The VLSNs in the bucket are initialized to indicate what range
146          * should be covered by this bucket. But there may not be any offsets
147          * recorded either in the lastLsn or the fileOffset.
148          */
149         this.firstVLSN = firstVLSN;
150         this.lastVLSN = firstVLSN;
151 
152         /*
153          * Initialize the file offsets with a -1 value to correspond to
154          * firstVLSN.
155          */
156         fileOffsets = new TruncateableList<Integer>();
157         fileOffsets.add(0, NO_OFFSET);
158     }
159 
160     /* For reading from disk. */
VLSNBucket(TupleInput ti)161     private VLSNBucket(TupleInput ti) {
162         fileNumber = ti.readPackedLong();
163         stride = ti.readPackedInt();
164         firstVLSN = new VLSN(ti.readPackedLong());
165         lastVLSN = new VLSN(ti.readPackedLong());
166         lastLsn = ti.readPackedLong();
167         int size = ti.readPackedInt();
168         fileOffsets = new TruncateableList<Integer>(size);
169         for (int i = 0; i < size; i++) {
170             fileOffsets.add(i, DbLsn.getFileOffsetAsInt(ti.readUnsignedInt()));
171         }
172     }
173 
174     /**
175      * Record the LSN location for this VLSN.
176      *
177      * One key issue is that puts() are not synchronized, and the VLSNs may
178      * arrive out of order. If an out of order VLSN does arrive, we can still
179      * assume that the earlier VLSNs have been successfully logged. If a VLSN
180      * arrives that is divisible by the stride, and should be recorded in the
181      * fileOffsets, but is not the next VLSN that should be recorded, we'll pad
182      * out the fileOffsets list with placeholders.
183      *
184      * For example, suppose the stride is 3, and the first VLSN is 2. Then this
185      * bucket should record VLSN 2, 5, 8, ... etc.  If VLSN 8 arrives before
186      * VLSN 5, VLSN 8 will be recorded, and VLSN 5 will have an offset
187      * placeholder of NO_OFFSET. It is a non-issue if VLSNs 3, 4, 6, 7 arrive
188      * out of order, because they would not have been recorded anyway. This
189      * should not happen often, because the stride should be fairly large, and
190      * the calls to put() should be close together. If the insertion order is
191      * vlsn 2, 5, 9, then the file offsets array will be a little short, and
192      * will only have 2 elements, instead of 3.
193      *
194      * We follow this policy because we must always have a valid begin and end
195      * point for the range. We must handle placeholders in all cases, and can't
196      * count of later vlsn inserts, because a bucket can become immutable at
197      * any time if it is flushed to disk.
198      *
199      * @return false if this bucket will not accept this VLSN. Generally, a
200      * refusal might happen because the bucket was full or the mapping is too
201      * large a distance away from the previous mapping. In that case, the
202      * tracker will start another bucket.
203      */
put(VLSN vlsn, long lsn)204     synchronized boolean put(VLSN vlsn, long lsn) {
205 
206         if (closed) {
207             return false;
208         }
209 
210         if (!belongs(vlsn, lsn)) {
211             return false;
212         }
213 
214         /*
215          * Add it to the fileOffset if it's on a stride boundary and is the
216          * next mapping in the fileOffset list.
217          */
218         if (isModulo(vlsn)) {
219             int index = getIndex(vlsn);
220             int listLen = fileOffsets.size();
221             if (index < listLen) {
222                 fileOffsets.set(index, DbLsn.getFileOffsetAsInt(lsn));
223             } else if (index == listLen) {
224                 fileOffsets.add(DbLsn.getFileOffsetAsInt(lsn));
225             } else {
226                 for (int i = listLen; i < index; i++) {
227                     fileOffsets.add(NO_OFFSET);
228                 }
229                 fileOffsets.add(DbLsn.getFileOffsetAsInt(lsn));
230             }
231             dirty = true;
232         }
233 
234         /* If the lastLsn is less, or not initialized, set it to this VLSN. */
235         if ((lastVLSN.compareTo(vlsn) < 0) ||
236             (lastLsn == DbLsn.NULL_LSN)) {
237             lastVLSN = vlsn;
238             lastLsn = lsn;
239             dirty = true;
240         }
241 
242         return true;
243     }
244 
245     /*
246      * Return true if this VLSN is on a stride boundary.  Assumes
247      * !firstVLSN.isNull()
248      */
isModulo(VLSN vlsn)249     private boolean isModulo(VLSN vlsn) {
250         return (((vlsn.getSequence() - firstVLSN.getSequence()) % stride) ==
251                 0);
252     }
253 
getIndex(VLSN vlsn)254     private int getIndex(VLSN vlsn) {
255         assert isModulo(vlsn) : "Don't call getIndex on non-modulo VLSN " +
256             vlsn + " bucket=" + this;
257 
258         return (int) ((vlsn.getSequence() - firstVLSN.getSequence()) / stride);
259     }
260 
261     /**
262      * @return true if this VLSN->LSN mapping should go into this bucket.
263      */
belongs(VLSN vlsn, long lsn)264     private boolean belongs(VLSN vlsn, long lsn) {
265         assert vlsn.compareTo(firstVLSN) >= 0 :
266             "firstVLSN = " + firstVLSN + " should not be greater than " + vlsn;
267 
268         if (DbLsn.getFileNumber(lsn) != fileNumber) {
269             /* Mappings must be for same file. */
270             return false;
271         }
272 
273         if (emptyInternal()) {
274             return true;
275         }
276 
277         /*
278          * Some other thread beat us to the put() call and inserted a later
279          * mapping, so we know for sure that we fit in this bucket
280          */
281         if (lastVLSN.compareTo(vlsn) > 0) {
282             return true;
283         }
284 
285         boolean onStrideBoundary = isModulo(vlsn);
286         if (onStrideBoundary && (fileOffsets.size() >= maxMappings)) {
287             /* Too full. */
288             return false;
289         }
290 
291         /*
292          * Will this VLSN be next one recorded in the fileOffsets? If so,
293          * calculate the scan distance.
294          */
295         if ((onStrideBoundary && (getIndex(vlsn) == fileOffsets.size())) ||
296             lastVLSN.compareTo(vlsn) < 0) {
297             /* This VLSN is going in at the tail of the bucket. */
298             int lastOffset = fileOffsets.get(fileOffsets.size() - 1);
299             if ((DbLsn.getFileOffset(lsn) -
300                  DbLsn.convertIntFileOffsetToLong(lastOffset)) >
301                 maxDistance) {
302                 /* The scan distance is exceeded. */
303                 return false;
304             }
305         }
306 
307         return true;
308     }
309 
310     /**
311      * @return true if this bucket contains this mapping.
312      */
owns(VLSN vlsn)313     synchronized boolean owns(VLSN vlsn) {
314         if (vlsn.equals(VLSN.NULL_VLSN)) {
315             return false;
316         } else if (firstVLSN.equals(VLSN.NULL_VLSN)) {
317             return false;
318         } else {
319             return (firstVLSN.compareTo(vlsn) <= 0) &&
320                 (lastVLSN.compareTo(vlsn) >= 0);
321         }
322     }
323 
getFirst()324     synchronized VLSN getFirst() {
325         return firstVLSN;
326     }
327 
getLast()328     synchronized VLSN getLast() {
329         return lastVLSN;
330     }
331 
332     /**
333      * Return a file number that is less or equal to the first lsn mapped
334      * by this bucket. In standard VLSNBuckets, only one file is covered, so
335      * there is only one possible value. In GhostBuckets, multiple files could
336      * be covered.
337      * @return
338      */
getLTEFileNumber()339     long getLTEFileNumber() {
340         return fileNumber;
341     }
342 
343     /*
344      * Similar to getLTEFileNumber, for this implementation there's only one
345      * possible file.
346      */
getGTEFileNumber()347     long getGTEFileNumber() {
348         return fileNumber;
349     }
350 
empty()351     synchronized boolean empty() {
352         return emptyInternal();
353     }
354 
emptyInternal()355     private boolean emptyInternal() {
356         return (firstVLSN.equals(lastVLSN) &&
357                 (lastLsn == DbLsn.NULL_LSN));
358     }
359 
follows(VLSN vlsn)360     boolean follows(VLSN vlsn) {
361         return (firstVLSN.compareTo(vlsn) > 0);
362     }
363 
precedes(VLSN vlsn)364     boolean precedes(VLSN vlsn) {
365         return (!lastVLSN.equals(VLSN.NULL_VLSN) &&
366                 (lastVLSN.compareTo(vlsn) < 0));
367     }
368 
369     /**
370      * Returns the mapping whose VLSN is >= the VLSN parameter. For example, if
371      * the bucket holds mappings for vlsn 10, 13, 16,
372      *
373      *  - the greater than or equal mapping for VLSN 10 is 10/lsn
374      *  - the greater than or equal mapping for VLSN 11 is 13/lsn
375      *  - the greater than or equal mapping for VLSN 13 is 13/lsn
376      *
377      * File offsets may be null in the middle of the file offsets array because
378      * of out of order mappings. This method must return a non-null lsn, and
379      * must account for null offsets.
380      *
381      * @return the mapping whose VLSN is >= the VLSN parameter. Will never
382      * return NULL_LSN, because the VLSNRange begin and end point are always
383      * mapped.
384      */
getGTELsn(VLSN vlsn)385     public synchronized long getGTELsn(VLSN vlsn) {
386 
387         if (lastVLSN.equals(vlsn)) {
388             return lastLsn;
389         }
390 
391         int index;
392         if (firstVLSN.compareTo(vlsn) >= 0) {
393 
394             /*
395              * It's possible for vlsn to be < the firstVLSN if vlsn
396              * falls between two buckets. For example, if the buckets are:
397              *    bucketA = vlsn 10-> 20
398              *    bucketB = vlsn 22->30
399              * then vlsn 21 will fall between two buckets, and will get bucketB
400              */
401             index = 0;
402         } else {
403             index = getGTEIndex(vlsn);
404         }
405 
406         /*
407          * This should never happen. Throw this exception to make debugging
408          * info available.
409          */
410         if (index < 0) {
411             throw EnvironmentFailureException.unexpectedState
412                 ("index=" + index +
413                  " vlsn=" + vlsn +
414                  " bucket=" + this);
415         }
416 
417         if (index >= fileOffsets.size()) {
418             return lastLsn;
419         }
420         int useIndex = findPopulatedIndex(index, true /* forward */);
421         int offset = fileOffsets.get(useIndex);
422         return offset == NO_OFFSET ?
423             lastLsn : DbLsn.makeLsn(fileNumber, offset);
424     }
425 
426     /**
427      * Return the index for the mapping >= this VLSN. Note that this is just
428      * a stride calculation, and a non-existent file offset index might be
429      * returned.
430      */
getGTEIndex(VLSN vlsn)431     private int getGTEIndex(VLSN vlsn) {
432         long diff = vlsn.getSequence() - firstVLSN.getSequence();
433         return (int) ((diff + (stride - 1)) / stride);
434     }
435 
436     /**
437      * We'd like to return the mapping at startIndex for the get{LTE, GTE}
438      * Mapping methods, but the offsets may not be populated if put() calls
439      * have come out of order. Search for the next populated offset.
440      */
findPopulatedIndex(int startIndex, boolean forward)441     private int findPopulatedIndex(int startIndex, boolean forward) {
442         if (forward) {
443             for (int i = startIndex; i < fileOffsets.size(); i++) {
444                 if (fileOffsets.get(i) != NO_OFFSET) {
445                     return i;
446                 }
447             }
448         } else {
449             for (int i = startIndex; i >= 0; i--) {
450                 if (fileOffsets.get(i) != NO_OFFSET) {
451                     return i;
452                 }
453             }
454         }
455         return startIndex;
456     }
457 
458     /**
459      * Returns the lsn whose VLSN is <= the VLSN parameter. For example, if
460      * the bucket holds mappings for vlsn 10, 13, 16,
461      *
462      *  - the less than or equal mapping for VLSN 10 is 10/lsn
463      *  - the less than or equal mapping for VLSN 11 is 10/lsn
464      *  - the less than or equal mapping for VLSN 13 is 13/lsn
465      *
466      * File offsets may be null in the middle of the file offsets array because
467      * of out of order mappings. This method must return a non-null lsn, and
468      * must account for null offsets.
469      *
470      * @return the lsn whose VLSN is <= the VLSN parameter. Will never return
471      * NULL_LSN, because the VLSNRange begin and end points are always mapped.
472      */
getLTELsn(VLSN vlsn)473     synchronized long getLTELsn(VLSN vlsn) {
474 
475         /*
476          * It's possible for vlsn to be greater than lastVLSN if vlsn falls
477          * between two buckets.
478          * For example, if the buckets are:
479          *    bucketA = vlsn 10-> 20
480          *    bucketB = vlsn 22->30
481          * then vlsn 21 will fall between two buckets, and will get bucketA
482          */
483         if (lastVLSN.compareTo(vlsn) <= 0) {
484             return lastLsn;
485         }
486 
487         long diff = vlsn.getSequence() - firstVLSN.getSequence();
488 
489         /*
490          * Make sure that the file offset array isn't unexpectedly short due to
491          * out of order inserts.
492          */
493         int index = (int)(diff / stride);
494         if (index >= fileOffsets.size()) {
495             index = fileOffsets.size() - 1;
496         }
497 
498         int useIndex = findPopulatedIndex(index, false /* forward */);
499         int offset = fileOffsets.get(useIndex);
500 
501         assert offset != NO_OFFSET : "bucket should always have a non-null " +
502             "first offset. vlsn= " + vlsn + " bucket=" + this;
503 
504         return (DbLsn.makeLsn(fileNumber, offset));
505     }
506 
507     /**
508      * @return the lsn whose VLSN is == the VLSN parameter or DbLsn.NULL_LSN if
509      * there is no mapping. Note that because of out of order puts, there may
510      * be missing mappings that appear later on.
511      */
getLsn(VLSN vlsn)512     public synchronized long getLsn(VLSN vlsn) {
513         assert owns(vlsn) : "vlsn=" + vlsn + " " + this;
514 
515         if (lastVLSN.equals(vlsn)) {
516             return lastLsn;
517         }
518 
519         if (!isModulo(vlsn)) {
520             return DbLsn.NULL_LSN;
521         }
522 
523         int index = getIndex(vlsn);
524         if (index >= fileOffsets.size()) {
525             return DbLsn.NULL_LSN;
526         }
527 
528         int offset = fileOffsets.get(index);
529         if (offset == NO_OFFSET) {
530             return DbLsn.NULL_LSN;
531         }
532 
533         return DbLsn.makeLsn(fileNumber, offset);
534     }
535 
getLastLsn()536     synchronized long getLastLsn() {
537         return lastLsn;
538     }
539 
540     /**
541      * Remove the mappings from this bucket that are for VLSNs <=
542      * lastDuplicate.  If this results in a broken stride interval, package all
543      * those mappings into their own bucket and return it as a remainder
544      * bucket.
545      *
546      * For example, suppose this bucket has a stride of 5 and maps VLSN 10-23.
547      * Then it has mappings for 10, 15, 20, 23.
548      *
549      * If we need to remove mappings <= 16, we'll end up without a bucket that
550      * serves as a home base for vlsns 17,18,19. Those will be spun out into
551      * their own bucket, and this bucket will be adjusted to start at VLSN 20.
552      * This bucket should end up with
553      *
554      *  - firstVLSN = 20
555      *  - fileOffset is an array of size 1, for the LSN for VLSN 20
556      *  - lastVLSN = 23
557      *  - lastLsn = the same as before
558      *
559      * The spun-off bucket should be:
560      *  - firstVLSN = 17
561      *  - fileOffset is an array of size 1, for the LSN for VLSN 17
562      *  - lastVLSN = 19
563      *  - lastLsn = lsn for 19
564      *
565      * @return the newly created bucket that holds mappings from a broken
566      * stride interval, or null if there was no need to create such a bucket.
567      */
removeFromHead(EnvironmentImpl envImpl, VLSN lastDuplicate)568     VLSNBucket removeFromHead(EnvironmentImpl envImpl, VLSN lastDuplicate) {
569 
570         if (empty()) {
571             return null;
572         }
573 
574         /*
575          * No overlap, this bucket owns mappngs that follow the duplicate
576          * range.
577          */
578         if (lastDuplicate.compareTo(firstVLSN) < 0) {
579             return null;
580         }
581 
582         /*
583          * This whole bucket is to be deleted, all its mappings are <= the
584          * lastDuplicate.
585          */
586         if (lastVLSN.compareTo(lastDuplicate) <= 0) {
587             fileOffsets = null;
588             firstVLSN = VLSN.NULL_VLSN;
589             lastVLSN = VLSN.NULL_VLSN;
590             lastLsn = DbLsn.NULL_LSN;
591             return null;
592         }
593 
594         VLSN indexVLSN = firstVLSN;
595         int newFirstIndex = -1;
596 
597         /*
598          * Find the mappings that still belong. Using the example above, we
599          * should find that we can delete fileOffset[0] and fileOffset[1] and
600          * preserve fileOffset[2]
601          */
602         for (int i = 0; i < fileOffsets.size(); i++) {
603             if ((indexVLSN.compareTo(lastDuplicate) > 0) &&
604                 (fileOffsets.get(i) != NO_OFFSET)) {
605                 newFirstIndex = i;
606                 break;
607             }
608             indexVLSN = new VLSN(indexVLSN.getSequence() + stride);
609         }
610 
611         VLSNBucket remainder = null;
612         int lastOffset;
613         if (newFirstIndex == -1) {
614 
615             /*
616              * None of the VLSNs represented by the strided file offsets are
617              * needed anymore. This bucket consists solely of the last
618              * VLSN->LSN pair.
619              */
620             lastOffset = fileOffsets.get(fileOffsets.size() - 1);
621             fileOffsets = new TruncateableList<Integer>();
622             fileOffsets.add(DbLsn.getFileOffsetAsInt(lastLsn));
623             firstVLSN = lastVLSN;
624         } else {
625             /* Move the still-valid mappings to a new list. */
626             assert (newFirstIndex > 0);
627             lastOffset = fileOffsets.get(newFirstIndex - 1);
628             TruncateableList<Integer> newFileOffsets =
629                 new TruncateableList<Integer>
630                 (fileOffsets.subList(newFirstIndex, fileOffsets.size()));
631             fileOffsets = newFileOffsets;
632             firstVLSN =  new VLSN((newFirstIndex * stride) +
633                                   firstVLSN.getSequence());
634         }
635 
636         if (!firstVLSN.equals(lastDuplicate.getNext())) {
637 
638             /*
639              * If lastDuplicate was not on the same stride boundary as our old
640              * bucket, we may have a broken bucket of mappings to preserve.
641              * Using our example numbers above, we still need to make sure
642              * there's a bucket that matches VLSNs 17, 18 19.
643              */
644             long scanStart =  DbLsn.makeLsn(fileNumber, lastOffset);
645             remainder = scanForNewBucket(envImpl,
646                                          lastDuplicate.getNext(),
647                                          firstVLSN.getPrev(),
648                                          scanStart);
649         }
650 
651         dirty = true;
652         return remainder;
653     }
654 
655     /**
656      * Scan the log fle for VLSN->LSN mappings for creating a new bucket.
657      */
scanForNewBucket(EnvironmentImpl envImpl, VLSN first, VLSN last, long startLsn)658     private VLSNBucket scanForNewBucket(EnvironmentImpl envImpl,
659                                         VLSN first,
660                                         VLSN last,
661                                         long startLsn) {
662 
663         VLSNBucket newBucket = new VLSNBucket(fileNumber, stride,
664                                               maxMappings, maxDistance,
665                                               first);
666         int readBufferSize =
667             envImpl.getConfigManager().getInt
668             (EnvironmentParams.LOG_ITERATOR_MAX_SIZE);
669 
670         NewBucketReader scanner =
671             new NewBucketReader(newBucket, envImpl, readBufferSize, first,
672                                 last, startLsn);
673 
674         while (!scanner.isDone() && (scanner.readNextEntry())) {
675         }
676 
677         assert scanner.isDone();
678 
679         return newBucket;
680     }
681 
682     /**
683      * Remove the mappings from this bucket that are for VLSNs >=
684      * startOfDelete.  Unlike removing from the head, we need not worry about
685      * breaking a bucket stride interval.
686      *
687      * If prevLsn is NULL_VLSN, we don't have a good value to cap the bucket.
688      * Instead, we'll have to delete the bucket back to whatever was the next
689      * available lsn. For example, suppose the bucket has these mappings.  This
690      * strange bucket (stride 25 is missing) is possible if vlsn 26 arrived
691      * early, out of order.
692      *
693      *  in fileOffset: 10 -> 101
694      *  in fileOffset: 15 -> no offset
695      *  in fileOffset: 20 -> 201
696      *  lastVLSN->lastnLsn mapping  26 -> 250
697      *
698      * If we have a prevLsn and the startOfDelete is 17, then we can create
699      * a new mapping
700      *  in fileOffset: 10 -> 101
701      *  in fileOffset: 15 -> no offset
702      *  lastVLSN->lastnLsn mapping  17 -> 190
703      *
704      * If we don't have a prevLsn, then we know that we have to cut the bucket
705      * back to the largest known mapping, losing many mappings along the way.
706      *  in fileOffset: 10 -> 101
707      *  lastVLSN->lastnLsn mapping  10 -> 101
708      *
709      * If we are deleting in the vlsn area between the last stride and the
710      * last offset, (i.e. vlsn 23 is the startOfDelete) the with and without
711      * prevLSn cases would look like this:
712      *
713      * (there is a prevLsn, and 23 is startDelete. No need to truncate
714      * anything)
715      *  in fileOffset: 10 -> 101
716      *  in fileOffset: 15 -> no offset
717      *  in fileOffset: 20 -> 201
718      *  lastVLSN->lastnLsn mapping  23 -> prevLsn
719      *
720      * (there is no prevLsn, and 23 is startDelete)
721      *  in fileOffset: 10 -> 101
722      *  in fileOffset: 15 -> no offset
723      *  in fileOffset: 20 -> 201
724      *  lastVLSN->lastnLsn mapping  20 -> 201
725      *
726      * @param startOfDelete is the VLSN that begins the range to delete,
727      *        inclusive
728      * @param prevLsn is the lsn of startOfDelete.getPrev(). We'll be using it
729      * to cap off the end of the bucket, by assigning it to the lastLsn field.
730      */
removeFromTail(VLSN startOfDelete, long prevLsn)731     void removeFromTail(VLSN startOfDelete, long prevLsn) {
732 
733         if (empty()) {
734             return;
735         }
736 
737         if (lastVLSN.compareTo(startOfDelete) < 0) {
738             return;
739         }
740 
741         /* Delete all the mappings. */
742         if (firstVLSN.compareTo(startOfDelete) >= 0) {
743             lastVLSN = firstVLSN;
744             lastLsn = DbLsn.NULL_LSN;
745             fileOffsets.clear();
746             return;
747         }
748 
749         /* Delete some of the mappings. */
750         int deleteIndex = getGTEIndex(startOfDelete);
751 
752         /*
753          * This should never happen, because the startOfDelete should be a vlsn
754          * that is >= the first vlsn and we handled the case where
755          * startOfDelete == firstVLSN already.)  Throw this exception to make
756          * debugging info available.
757          */
758         if (deleteIndex <= 0) {
759             throw EnvironmentFailureException.unexpectedState
760                 ("deleteIndex=" + deleteIndex +
761                  " startOfDelete=" + startOfDelete +
762                  " bucket=" + this);
763         }
764 
765         /* See if there are any fileoffsets to prune off. */
766         if (deleteIndex < fileOffsets.size()) {
767 
768             /*
769              * The startOfDeleteVLSN is a value between the firstVLSN and
770              * the last file offset.
771              */
772             if (prevLsn == DbLsn.NULL_LSN) {
773                 int lastPopulatedIndex =
774                     findPopulatedIndex(deleteIndex-1, false);
775                 if (lastPopulatedIndex != (deleteIndex -1)) {
776                     deleteIndex = lastPopulatedIndex + 1;
777                 }
778             }
779             fileOffsets.truncate(deleteIndex);
780         } else {
781             /*
782              * The startOfDelete vlsn is somewhere between the last file offset
783              * and the lastVLSN.
784              */
785             if (prevLsn == DbLsn.NULL_LSN) {
786                 int lastIndex = fileOffsets.size() - 1;
787                 int lastPopulatedIndex = findPopulatedIndex(lastIndex, false);
788                 if (lastPopulatedIndex < lastIndex) {
789                     fileOffsets.truncate(lastPopulatedIndex);
790                 }
791             }
792         }
793 
794         /* Now set the lastVLSN -> lastLSN mapping. */
795         if (prevLsn == DbLsn.NULL_LSN) {
796             lastVLSN = new VLSN(((fileOffsets.size()-1) * stride) +
797                                 firstVLSN.getSequence());
798             Integer lastOffset = fileOffsets.get(fileOffsets.size() - 1);
799             assert lastOffset != null;
800             lastLsn = DbLsn.makeLsn(fileNumber,  lastOffset);
801         } else {
802             lastVLSN = startOfDelete.getPrev();
803             lastLsn = prevLsn;
804         }
805         dirty = true;
806     }
807 
808     /* For unit tests */
getNumOffsets()809     int getNumOffsets() {
810         return fileOffsets.size();
811     }
812 
close()813     void close() {
814         closed = true;
815     }
816 
817     /**
818      * Write this bucket to the mapping database.
819      */
writeToDatabase(EnvironmentImpl envImpl, DatabaseImpl bucketDbImpl, Txn txn)820     void writeToDatabase(EnvironmentImpl envImpl,
821                          DatabaseImpl bucketDbImpl,
822                          Txn txn) {
823 
824         if (!dirty) {
825             return;
826         }
827 
828         Cursor c = null;
829         try {
830             c = DbInternal.makeCursor(bucketDbImpl,
831                                       txn,
832                                       CursorConfig.DEFAULT);
833             writeToDatabase(envImpl, c);
834         } finally {
835             if (c != null) {
836                 c.close();
837             }
838         }
839     }
840 
841     /**
842      * Write this bucket to the mapping database using a cursor.  Note that
843      * this method must disable critical eviction. Critical eviction makes the
844      * calling thread search for a target IN node to evict. That target IN node
845      * may or may not be in the internal VLSN db.
846      *
847      * For example, when a new, replicated LN is inserted or modified, a
848      * new VLSN is allocated. To do so, the app thread that is executing the
849      * operation
850      *  A1. Takes a BIN latch on a BIN in a replicated db
851      *  A2. Takes the VLSNINdex mutex
852      *
853      * Anyone calling writeDatabase() has to take these steps:
854      *  B1. Take the VLSNIndex mutex
855      *  B2. Get a BIN latch for a BIN in the internal vlsn db.
856      *
857      * This difference in locking hierarchy could cause a deadlock except for
858      * the fact that A1 and B2 are guaranteed to be in different databases.  If
859      * writeDatabase() also did critical eviction, it would have a step where
860      * it tried to get a BIN latch on a replicated db, and we'd have a
861      * deadlock. [#18475]
862      */
writeToDatabase(EnvironmentImpl envImpl, Cursor cursor)863     void writeToDatabase(EnvironmentImpl envImpl, Cursor cursor) {
864 
865         if (!dirty) {
866             return;
867         }
868 
869         DatabaseEntry key = new DatabaseEntry();
870         DatabaseEntry data = new DatabaseEntry();
871 
872         LongBinding.longToEntry(firstVLSN.getSequence(), key);
873         VLSNBucketBinding bucketBinding = new VLSNBucketBinding();
874         bucketBinding.objectToEntry(this, data);
875 
876         DbInternal.getCursorImpl(cursor).setAllowEviction(false);
877         OperationStatus status = cursor.put(key, data);
878 
879         if (status != OperationStatus.SUCCESS) {
880             throw EnvironmentFailureException.unexpectedState
881                  (envImpl, "Unable to write VLSNBucket for file " +
882                  fileNumber + " status=" + status);
883         }
884         dirty = false;
885     }
886 
887     /**
888      * Instantiate this from the database. Assumes that this bucket will not be
889      * used for insertion in the future.
890      */
readFromDatabase(DatabaseEntry data)891     public static VLSNBucket readFromDatabase(DatabaseEntry data) {
892 
893         VLSNBucketBinding mapperBinding = new VLSNBucketBinding();
894         VLSNBucket bucket = mapperBinding.entryToObject(data);
895         return bucket;
896     }
897 
fillDataEntry(DatabaseEntry data)898     void fillDataEntry(DatabaseEntry data) {
899         VLSNBucketBinding binding = new VLSNBucketBinding();
900         binding.objectToEntry(this, data);
901     }
902 
903     /**
904      * @see java.lang.Object#toString()
905      */
906     @Override
toString()907     public String toString() {
908         return String.format("<VLSNBucket fileNum=%d(0x%x) numOffsets=%d " +
909                              "stride=%d firstVLSN=%s lastVLSN=%s lastLsn=%s/>",
910                              fileNumber, fileNumber,
911                              (fileOffsets == null) ? 0 : fileOffsets.size(),
912                              stride, firstVLSN, lastVLSN,
913                              DbLsn.getNoFormatString(lastLsn));
914     }
915 
916     /**
917      * For debugging and tracing.
918      */
dump(PrintStream out)919     public void dump(PrintStream out) {
920         if (fileOffsets == null) {
921             return;
922         }
923 
924         long vlsnVal = firstVLSN.getSequence();
925         int newlineCounter = 0;
926         for (Integer offset : fileOffsets) {
927             out.printf(" [%d 0x%x]", vlsnVal,
928                        DbLsn.convertIntFileOffsetToLong(offset));
929 
930             vlsnVal += stride;
931             if (++newlineCounter > 6) {
932                 out.println("\n");
933                 newlineCounter = 0;
934             }
935         }
936 
937         out.printf("\n---------Last: VLSN=%s LSN=%s", lastVLSN,
938                    DbLsn.getNoFormatString(lastLsn));
939     }
940 
isGhost()941     boolean isGhost() {
942         return false;
943     }
944 
writeToTupleOutput(TupleOutput to)945     void writeToTupleOutput(TupleOutput to) {
946 
947         to.writePackedLong(fileNumber);
948         to.writePackedInt(stride);
949         to.writePackedLong(firstVLSN.getSequence());
950         to.writePackedLong(lastVLSN.getSequence());
951         to.writePackedLong(lastLsn);
952         to.writePackedInt(fileOffsets.size());
953         for (Integer offset: fileOffsets) {
954             to.writeUnsignedInt(DbLsn.convertIntFileOffsetToLong(offset));
955         }
956     }
957 
958     /**
959      * Marshals a VLSNBucket to a byte buffer to store in the database.
960      * Doesn't persist the file number, because that's the key of the database.
961      * A number of the fields are transient and are also not stored.
962      */
963     private static class VLSNBucketBinding extends TupleBinding<VLSNBucket> {
964 
965         @Override
entryToObject(TupleInput ti)966         public VLSNBucket entryToObject(TupleInput ti) {
967 
968             int onDiskVersion = ti.readPackedInt();
969             if (onDiskVersion != VLSNBucket.VERSION) {
970                 throw EnvironmentFailureException.unexpectedState
971                     ("Don't expect version diff on_disk=" + onDiskVersion +
972                      " source=" + VLSNBucket.VERSION);
973             }
974             boolean isGhost = ti.readBoolean();
975             VLSNBucket bucket = null;
976             if (isGhost) {
977                 bucket = GhostBucket.makeNewInstance(ti);
978             } else {
979                 bucket = new VLSNBucket(ti);
980             }
981             return bucket;
982         }
983 
984         @Override
objectToEntry(VLSNBucket bucket, TupleOutput to)985         public void objectToEntry(VLSNBucket bucket, TupleOutput to) {
986             to.writePackedInt(VLSNBucket.VERSION);
987             to.writeBoolean(bucket.isGhost());
988             bucket.writeToTupleOutput(to);
989         }
990     }
991 
992     /**
993      * Scan a specific section of log and generate a new VLSNBucket for
994      * this section.
995      */
996     private static class NewBucketReader extends FileReader {
997         private final VLSNBucket remainderBucket;
998         private boolean done = false;
999         private final VLSN first;
1000         private final VLSN last;
1001 
NewBucketReader(VLSNBucket remainderBucket, EnvironmentImpl envImpl, int readBufferSize, VLSN first, VLSN last, long startLsn)1002         public NewBucketReader(VLSNBucket remainderBucket,
1003                                EnvironmentImpl envImpl,
1004                                int readBufferSize,
1005                                VLSN first,
1006                                VLSN last,
1007                                long startLsn) {
1008             super(envImpl,
1009                   readBufferSize,
1010                   true,            // forward
1011                   startLsn,
1012                   null,            // singleFileNumber
1013                   DbLsn.NULL_LSN,  // endOfFileLsn
1014                   DbLsn.NULL_LSN); // finishLsn
1015 
1016             this.remainderBucket = remainderBucket;
1017             this.first = first;
1018             this.last = last;
1019         }
1020 
1021         /**
1022          * Return true if this entry is replicated and its VLSN >= the
1023          * firstVLSN and the entry is not invisible. These entries will
1024          * be used to bring the VLSNIndex up to speed.
1025          */
1026         @Override
isTargetEntry()1027         protected boolean isTargetEntry()  {
1028             return (!currentEntryHeader.isInvisible() &&
1029                      entryIsReplicated() &&
1030                     (currentEntryHeader.getVLSN().compareTo(first) >= 0));
1031         }
1032 
1033         @Override
processEntry(ByteBuffer entryBuffer)1034         protected boolean processEntry(ByteBuffer entryBuffer) {
1035             if (currentEntryHeader.getVLSN().compareTo(last) > 0) {
1036                 done = true;
1037             } else {
1038                 remainderBucket.put(currentEntryHeader.getVLSN(),
1039                                     getLastLsn());
1040             }
1041 
1042             entryBuffer.position(entryBuffer.position() +
1043                                  currentEntryHeader.getItemSize());
1044             return true;
1045         }
1046 
isDone()1047         boolean isDone() {
1048             return done;
1049         }
1050     }
1051 
1052     @SuppressWarnings("serial")
1053     private static class TruncateableList<T> extends ArrayList<T> {
1054 
TruncateableList()1055         TruncateableList() {
1056             super();
1057         }
1058 
TruncateableList(int capacity)1059         TruncateableList(int capacity) {
1060             super(capacity);
1061         }
1062 
TruncateableList(List<T> list)1063         TruncateableList(List<T> list) {
1064             super(list);
1065         }
1066 
truncate(int fromIndex)1067         void truncate(int fromIndex) {
1068             removeRange(fromIndex, size());
1069         }
1070     }
1071 }
1072